qubes-linux-kernel/patches.xen/xen3-patch-2.6.25

From: kernel.org
Subject: 2.6.25
Patch-mainline: 2.6.25

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches.py

--- head-2010-04-29.orig/arch/x86/Kconfig	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/Kconfig	2010-03-24 15:10:37.000000000 +0100
@@ -40,7 +40,7 @@ config X86
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
 	select HAVE_SYSCALL_TRACEPOINTS
-	select HAVE_KVM
+	select HAVE_KVM if !XEN
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_GENERIC_DMA_COHERENT if X86_32
@@ -240,14 +240,12 @@ config X86_TRAMPOLINE
 	default y

 config X86_NO_TSS
-	bool
+	def_bool y
 	depends on XEN
-	default y

 config X86_NO_IDT
-	bool
+	def_bool y
 	depends on XEN
-	default y

 config X86_32_LAZY_GS
 	def_bool y
@@ -920,9 +918,8 @@ config X86_VISWS_APIC
 	depends on X86_32 && X86_VISWS

 config X86_XEN_GENAPIC
-	bool
+	def_bool y
 	depends on X86_64_XEN
-	default y

 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
 	bool "Reroute for broken boot IRQs"
@@ -1371,7 +1368,7 @@ config ARCH_PROC_KCORE_TEXT

 config ARCH_SPARSEMEM_DEFAULT
 	def_bool y
-	depends on X86_64
+	depends on X86_64 && !X86_64_XEN

 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
@@ -2073,10 +2070,10 @@ config PCI_MMCONFIG
 	depends on X86_64 && PCI && ACPI

 config XEN_PCIDEV_FRONTEND
-	bool "Xen PCI Frontend" if X86_64
+	def_bool y
+	prompt "Xen PCI Frontend" if X86_64
 	depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
  	select HOTPLUG
-	default y
 	help
 	  The PCI device frontend driver allows the kernel to import arbitrary
 	  PCI devices from a PCI backend to support PCI driver domains.
@@ -2084,7 +2081,6 @@ config XEN_PCIDEV_FRONTEND
 config XEN_PCIDEV_FE_DEBUG
 	bool "Xen PCI Frontend Debugging"
 	depends on XEN_PCIDEV_FRONTEND
-	default n
 	help
 	  Enables some debug statements within the PCI Frontend.

--- head-2010-04-29.orig/arch/x86/Kconfig.debug	2010-03-24 15:02:14.000000000 +0100
+++ head-2010-04-29/arch/x86/Kconfig.debug	2010-03-24 15:10:37.000000000 +0100
@@ -273,6 +273,7 @@ config DEBUG_BOOT_PARAMS
 	bool "Debug boot parameters"
 	depends on DEBUG_KERNEL
 	depends on DEBUG_FS
+	depends on !XEN
 	---help---
 	  This option will cause struct boot_params to be exported via debugfs.

--- head-2010-04-29.orig/arch/x86/ia32/ia32entry-xen.S	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/ia32/ia32entry-xen.S	2010-03-24 15:10:37.000000000 +0100
@@ -12,7 +12,6 @@
 #include <asm/ia32_unistd.h>
 #include <asm/thread_info.h>
 #include <asm/segment.h>
-#include <asm/vsyscall32.h>
 #include <asm/irqflags.h>
 #include <linux/linkage.h>

@@ -99,10 +98,11 @@ ENTRY(ia32_sysenter_target)
 	CFI_RESTORE	rcx
  	movl	%ebp,%ebp		/* zero extension */
 	movl	%eax,%eax
+	movl	48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
 	movl	$__USER32_DS,40(%rsp)
 	movq	%rbp,32(%rsp)
 	movl	$__USER32_CS,16(%rsp)
-	movl	$VSYSCALL32_SYSEXIT,8(%rsp)
+	movq	%r10,8(%rsp)
 	movq	%rax,(%rsp)
 	cld
 	SAVE_ARGS 0,0,1
@@ -582,8 +582,8 @@ ia32_sys_call_table:
 	.quad compat_sys_futex		/* 240 */
 	.quad compat_sys_sched_setaffinity
 	.quad compat_sys_sched_getaffinity
-	.quad sys32_set_thread_area
-	.quad sys32_get_thread_area
+	.quad sys_set_thread_area
+	.quad sys_get_thread_area
 	.quad compat_sys_io_setup	/* 245 */
 	.quad sys_io_destroy
 	.quad compat_sys_io_getevents
@@ -661,7 +661,9 @@ ia32_sys_call_table:
 	.quad sys_epoll_pwait
 	.quad compat_sys_utimensat	/* 320 */
 	.quad compat_sys_signalfd
-	.quad compat_sys_timerfd
+	.quad sys_timerfd_create
 	.quad sys_eventfd
 	.quad sys32_fallocate
+	.quad compat_sys_timerfd_settime	/* 325 */
+	.quad compat_sys_timerfd_gettime
 ia32_syscall_end:
--- head-2010-04-29.orig/arch/x86/kernel/Makefile	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/Makefile	2010-03-24 15:10:37.000000000 +0100
@@ -134,11 +134,10 @@ ifeq ($(CONFIG_X86_64),y)
 	obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
 	obj-y				+= vsmp_64.o

+	obj-$(CONFIG_XEN)		+= nmi_64.o
 	time_64-$(CONFIG_XEN)		+= time_32.o
 	pci-dma_64-$(CONFIG_XEN)	+= pci-dma_32.o
 endif

 disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
 	smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
-disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o
-%/head_64.o %/head_64.s: asflags-$(CONFIG_XEN) :=
--- head-2010-04-29.orig/arch/x86/kernel/acpi/boot.c	2010-04-15 09:52:23.000000000 +0200
+++ head-2010-04-29/arch/x86/kernel/acpi/boot.c	2010-04-15 09:56:18.000000000 +0200
@@ -115,6 +115,11 @@ char *__init __acpi_map_table(unsigned l
 	if (!phys || !size)
 		return NULL;

+#ifdef CONFIG_XEN
+	if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT))
+		return isa_bus_to_virt(phys);
+#endif
+
 	return early_ioremap(phys, size);
 }
 void __init __acpi_unmap_table(char *map, unsigned long size)
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2010-04-29/arch/x86/kernel/acpi/sleep-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -0,0 +1,95 @@
+/*
+ * sleep.c - x86-specific ACPI sleep support.
+ *
+ *  Copyright (C) 2001-2003 Patrick Mochel
+ *  Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
+ */
+
+#include <linux/acpi.h>
+#include <linux/bootmem.h>
+#include <linux/dmi.h>
+#include <linux/cpumask.h>
+
+#include <asm/smp.h>
+
+#ifndef CONFIG_ACPI_PV_SLEEP
+/* address in low memory of the wakeup routine. */
+unsigned long acpi_wakeup_address = 0;
+unsigned long acpi_realmode_flags;
+extern char wakeup_start, wakeup_end;
+
+extern unsigned long acpi_copy_wakeup_routine(unsigned long);
+#endif
+
+/**
+ * acpi_save_state_mem - save kernel state
+ *
+ * Create an identity mapped page table and copy the wakeup routine to
+ * low memory.
+ */
+int acpi_save_state_mem(void)
+{
+#ifndef CONFIG_ACPI_PV_SLEEP
+	if (!acpi_wakeup_address) {
+		printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
+		return -ENOMEM;
+	}
+	memcpy((void *)acpi_wakeup_address, &wakeup_start,
+	       &wakeup_end - &wakeup_start);
+	acpi_copy_wakeup_routine(acpi_wakeup_address);
+#endif
+
+	return 0;
+}
+
+/*
+ * acpi_restore_state_mem - undo effects of acpi_save_state_mem
+ */
+void acpi_restore_state_mem(void)
+{
+}
+
+
+/**
+ * acpi_reserve_bootmem - do _very_ early ACPI initialisation
+ *
+ * We allocate two pages from the first 1MB of memory for the wakeup
+ * routine for when we come back from a sleep state. The
+ * runtime allocator allows specification of <16MB pages, but not
+ * <1MB pages.
+ */
+void __init acpi_reserve_bootmem(void)
+{
+#ifndef CONFIG_ACPI_PV_SLEEP
+	if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
+		printk(KERN_ERR
+		       "ACPI: Wakeup code way too big, S3 disabled.\n");
+		return;
+	}
+
+	acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
+	if (!acpi_wakeup_address)
+		printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
+#endif
+}
+
+
+#ifndef CONFIG_ACPI_PV_SLEEP
+static int __init acpi_sleep_setup(char *str)
+{
+	while ((str != NULL) && (*str != '\0')) {
+		if (strncmp(str, "s3_bios", 7) == 0)
+			acpi_realmode_flags |= 1;
+		if (strncmp(str, "s3_mode", 7) == 0)
+			acpi_realmode_flags |= 2;
+		if (strncmp(str, "s3_beep", 7) == 0)
+			acpi_realmode_flags |= 4;
+		str = strchr(str, ',');
+		if (str != NULL)
+			str += strspn(str, ", \t");
+	}
+	return 1;
+}
+
+__setup("acpi_sleep=", acpi_sleep_setup);
+#endif /* CONFIG_ACPI_PV_SLEEP */
--- head-2010-04-29.orig/arch/x86/kernel/acpi/sleep_32-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,117 +0,0 @@
-/*
- * sleep.c - x86-specific ACPI sleep support.
- *
- *  Copyright (C) 2001-2003 Patrick Mochel
- *  Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
- */
-
-#include <linux/acpi.h>
-#include <linux/bootmem.h>
-#include <linux/dmi.h>
-#include <linux/cpumask.h>
-
-#include <asm/smp.h>
-
-#ifndef CONFIG_ACPI_PV_SLEEP
-/* address in low memory of the wakeup routine. */
-unsigned long acpi_wakeup_address = 0;
-unsigned long acpi_realmode_flags;
-extern char wakeup_start, wakeup_end;
-
-extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
-#endif
-
-/**
- * acpi_save_state_mem - save kernel state
- *
- * Create an identity mapped page table and copy the wakeup routine to
- * low memory.
- */
-int acpi_save_state_mem(void)
-{
-#ifndef CONFIG_ACPI_PV_SLEEP
-	if (!acpi_wakeup_address)
-		return 1;
-	memcpy((void *)acpi_wakeup_address, &wakeup_start,
-	       &wakeup_end - &wakeup_start);
-	acpi_copy_wakeup_routine(acpi_wakeup_address);
-#endif
-	return 0;
-}
-
-/*
- * acpi_restore_state - undo effects of acpi_save_state_mem
- */
-void acpi_restore_state_mem(void)
-{
-}
-
-/**
- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
- *
- * We allocate a page from the first 1MB of memory for the wakeup
- * routine for when we come back from a sleep state. The
- * runtime allocator allows specification of <16MB pages, but not
- * <1MB pages.
- */
-void __init acpi_reserve_bootmem(void)
-{
-#ifndef CONFIG_ACPI_PV_SLEEP
-	if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
-		printk(KERN_ERR
-		       "ACPI: Wakeup code way too big, S3 disabled.\n");
-		return;
-	}
-
-	acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
-	if (!acpi_wakeup_address)
-		printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
-#endif
-}
-
-#ifndef CONFIG_ACPI_PV_SLEEP
-static int __init acpi_sleep_setup(char *str)
-{
-	while ((str != NULL) && (*str != '\0')) {
-		if (strncmp(str, "s3_bios", 7) == 0)
-			acpi_realmode_flags |= 1;
-		if (strncmp(str, "s3_mode", 7) == 0)
-			acpi_realmode_flags |= 2;
-		if (strncmp(str, "s3_beep", 7) == 0)
-			acpi_realmode_flags |= 4;
-		str = strchr(str, ',');
-		if (str != NULL)
-			str += strspn(str, ", \t");
-	}
-	return 1;
-}
-
-__setup("acpi_sleep=", acpi_sleep_setup);
-
-/* Ouch, we want to delete this. We already have better version in userspace, in
-   s2ram from suspend.sf.net project */
-static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
-{
-	acpi_realmode_flags |= 2;
-	return 0;
-}
-
-static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
-	{			/* Reset video mode after returning from ACPI S3 sleep */
-	 .callback = reset_videomode_after_s3,
-	 .ident = "Toshiba Satellite 4030cdt",
-	 .matches = {
-		     DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
-		     },
-	 },
-	{}
-};
-
-static int __init acpisleep_dmi_init(void)
-{
-	dmi_check_system(acpisleep_dmi_table);
-	return 0;
-}
-
-core_initcall(acpisleep_dmi_init);
-#endif /* CONFIG_ACPI_PV_SLEEP */
--- head-2010-04-29.orig/arch/x86/kernel/acpi/sleep_64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,125 +0,0 @@
-/*
- *  acpi.c - Architecture-Specific Low-Level ACPI Support
- *
- *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- *  Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
- *  Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
- *  Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
- *  Copyright (C) 2003 Pavel Machek, SuSE Labs
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/bootmem.h>
-#include <linux/acpi.h>
-#include <linux/cpumask.h>
-
-#include <asm/mpspec.h>
-#include <asm/io.h>
-#include <asm/apic.h>
-#include <asm/apicdef.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/tlbflush.h>
-
-/* --------------------------------------------------------------------------
-                              Low-Level Sleep Support
-   -------------------------------------------------------------------------- */
-
-#ifndef CONFIG_ACPI_PV_SLEEP
-/* address in low memory of the wakeup routine. */
-unsigned long acpi_wakeup_address = 0;
-unsigned long acpi_realmode_flags;
-extern char wakeup_start, wakeup_end;
-
-extern unsigned long acpi_copy_wakeup_routine(unsigned long);
-#endif
-
-/**
- * acpi_save_state_mem - save kernel state
- *
- * Create an identity mapped page table and copy the wakeup routine to
- * low memory.
- */
-int acpi_save_state_mem(void)
-{
-#ifndef CONFIG_ACPI_PV_SLEEP
-	memcpy((void *)acpi_wakeup_address, &wakeup_start,
-	       &wakeup_end - &wakeup_start);
-	acpi_copy_wakeup_routine(acpi_wakeup_address);
-#endif
-	return 0;
-}
-
-/*
- * acpi_restore_state
- */
-void acpi_restore_state_mem(void)
-{
-}
-
-/**
- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
- *
- * We allocate a page in low memory for the wakeup
- * routine for when we come back from a sleep state. The
- * runtime allocator allows specification of <16M pages, but not
- * <1M pages.
- */
-void __init acpi_reserve_bootmem(void)
-{
-#ifndef CONFIG_ACPI_PV_SLEEP
-	acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
-	if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
-		printk(KERN_CRIT
-		       "ACPI: Wakeup code way too big, will crash on attempt"
-		       " to suspend\n");
-#endif
-}
-
-#ifndef CONFIG_ACPI_PV_SLEEP
-static int __init acpi_sleep_setup(char *str)
-{
-	while ((str != NULL) && (*str != '\0')) {
-		if (strncmp(str, "s3_bios", 7) == 0)
-			acpi_realmode_flags |= 1;
-		if (strncmp(str, "s3_mode", 7) == 0)
-			acpi_realmode_flags |= 2;
-		if (strncmp(str, "s3_beep", 7) == 0)
-			acpi_realmode_flags |= 4;
-		str = strchr(str, ',');
-		if (str != NULL)
-			str += strspn(str, ", \t");
-	}
-
-	return 1;
-}
-
-__setup("acpi_sleep=", acpi_sleep_setup);
-#endif				/* CONFIG_ACPI_PV_SLEEP */
-
--- head-2010-04-29.orig/arch/x86/kernel/apic/apic-xen.c	2010-03-24 15:09:15.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/apic/apic-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -86,7 +86,7 @@ int setup_profiling_timer(unsigned int m
  * This initializes the IO-APIC and APIC hardware if this is
  * a UP kernel.
  */
-int __init APIC_init_uniprocessor (void)
+int __init APIC_init_uniprocessor(void)
 {
 #ifdef CONFIG_X86_IO_APIC
 	if (smp_found_config)
--- head-2010-04-29.orig/arch/x86/kernel/asm-offsets_32.c	2010-03-24 15:09:22.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/asm-offsets_32.c	2010-03-24 15:10:37.000000000 +0100
@@ -24,8 +24,10 @@
 #include <xen/interface/xen.h>
 #endif

+#ifdef CONFIG_LGUEST_GUEST
 #include <linux/lguest.h>
 #include "../../../drivers/lguest/lg.h"
+#endif

 /* workaround for a warning with -Wmissing-prototypes */
 void foo(void);
--- head-2010-04-29.orig/arch/x86/kernel/cpu/common-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/cpu/common-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -27,45 +27,50 @@
 #include "cpu.h"

 DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
-	[GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
-	[GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
-	[GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
-	[GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
+	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
+	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
+	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
+	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
 #ifndef CONFIG_XEN
 	/*
 	 * Segments used for calling PnP BIOS have byte granularity.
 	 * They code segments and data segments have fixed 64k limits,
 	 * the transfer segment sizes are set at run time.
 	 */
-	[GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
-	[GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
-	[GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
-	[GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
-	[GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
+	/* 32-bit code */
+	[GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+	/* 16-bit code */
+	[GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+	/* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+	/* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+	/* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
 	/*
 	 * The APM segments have byte granularity and their bases
 	 * are set at run time.  All have 64k limits.
 	 */
-	[GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
+	/* 32-bit code */
+	[GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
 	/* 16-bit code */
-	[GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
-	[GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
+	[GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+	/* data */
+	[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },

-	[GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
+	[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
 #endif
-	[GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
+	[GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
 } };
 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);

+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
 static int cachesize_override __cpuinitdata = -1;
-static int disable_x86_fxsr __cpuinitdata;
 static int disable_x86_serial_nr __cpuinitdata = 1;
-static int disable_x86_sep __cpuinitdata;

 struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};

-extern int disable_pse;
-
 static void __cpuinit default_init(struct cpuinfo_x86 * c)
 {
 	/* Not much we can do here... */
@@ -214,16 +219,8 @@ static void __cpuinit get_cpu_vendor(str

 static int __init x86_fxsr_setup(char * s)
 {
-	/* Tell all the other CPUs to not use it... */
-	disable_x86_fxsr = 1;
-
-	/*
-	 * ... and clear the bits early in the boot_cpu_data
-	 * so that the bootup process doesn't try to do this
-	 * either.
-	 */
-	clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
-	clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
+	setup_clear_cpu_cap(X86_FEATURE_FXSR);
+	setup_clear_cpu_cap(X86_FEATURE_XMM);
 	return 1;
 }
 __setup("nofxsr", x86_fxsr_setup);
@@ -231,7 +228,7 @@ __setup("nofxsr", x86_fxsr_setup);

 static int __init x86_sep_setup(char * s)
 {
-	disable_x86_sep = 1;
+	setup_clear_cpu_cap(X86_FEATURE_SEP);
 	return 1;
 }
 __setup("nosep", x86_sep_setup);
@@ -268,10 +265,10 @@ static int __cpuinit have_cpuid_p(void)
 void __init cpu_detect(struct cpuinfo_x86 *c)
 {
 	/* Get vendor name */
-	cpuid(0x00000000, &c->cpuid_level,
-	      (int *)&c->x86_vendor_id[0],
-	      (int *)&c->x86_vendor_id[8],
-	      (int *)&c->x86_vendor_id[4]);
+	cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+	      (unsigned int *)&c->x86_vendor_id[0],
+	      (unsigned int *)&c->x86_vendor_id[8],
+	      (unsigned int *)&c->x86_vendor_id[4]);

 	c->x86 = 4;
 	if (c->cpuid_level >= 0x00000001) {
@@ -284,9 +281,38 @@ void __init cpu_detect(struct cpuinfo_x8
 		if (c->x86 >= 0x6)
 			c->x86_model += ((tfms >> 16) & 0xF) << 4;
 		c->x86_mask = tfms & 15;
-		if (cap0 & (1<<19))
+		if (cap0 & (1<<19)) {
 			c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
+			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+		}
+	}
+}
+static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
+{
+	u32 tfms, xlvl;
+	unsigned int ebx;
+
+	memset(&c->x86_capability, 0, sizeof c->x86_capability);
+	if (have_cpuid_p()) {
+		/* Intel-defined flags: level 0x00000001 */
+		if (c->cpuid_level >= 0x00000001) {
+			u32 capability, excap;
+			cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
+			c->x86_capability[0] = capability;
+			c->x86_capability[4] = excap;
+		}
+
+		/* AMD-defined flags: level 0x80000001 */
+		xlvl = cpuid_eax(0x80000000);
+		if ((xlvl & 0xffff0000) == 0x80000000) {
+			if (xlvl >= 0x80000001) {
+				c->x86_capability[1] = cpuid_edx(0x80000001);
+				c->x86_capability[6] = cpuid_ecx(0x80000001);
+			}
+		}
+
 	}
+
 }

 /* Do minimum CPU detection early.
@@ -300,6 +326,7 @@ static void __init early_cpu_detect(void
 	struct cpuinfo_x86 *c = &boot_cpu_data;

 	c->x86_cache_alignment = 32;
+	c->x86_clflush_size = 32;

 	if (!have_cpuid_p())
 		return;
@@ -307,19 +334,30 @@ static void __init early_cpu_detect(void
 	cpu_detect(c);

 	get_cpu_vendor(c, 1);
+
+	switch (c->x86_vendor) {
+	case X86_VENDOR_AMD:
+		early_init_amd(c);
+		break;
+	case X86_VENDOR_INTEL:
+		early_init_intel(c);
+		break;
+	}
+
+	early_get_cap(c);
 }

 static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 {
 	u32 tfms, xlvl;
-	int ebx;
+	unsigned int ebx;

 	if (have_cpuid_p()) {
 		/* Get vendor name */
-		cpuid(0x00000000, &c->cpuid_level,
-		      (int *)&c->x86_vendor_id[0],
-		      (int *)&c->x86_vendor_id[8],
-		      (int *)&c->x86_vendor_id[4]);
+		cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+		      (unsigned int *)&c->x86_vendor_id[0],
+		      (unsigned int *)&c->x86_vendor_id[8],
+		      (unsigned int *)&c->x86_vendor_id[4]);

 		get_cpu_vendor(c, 0);
 		/* Initialize the standard set of capabilities */
@@ -366,8 +404,6 @@ static void __cpuinit generic_identify(s
 		init_scattered_cpuid_features(c);
 	}

-	early_intel_workaround(c);
-
 #ifdef CONFIG_X86_HT
 	c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
 #endif
@@ -401,7 +437,7 @@ __setup("serialnumber", x86_serial_nr_se
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
-static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 {
 	int i;

@@ -427,20 +463,9 @@ static void __cpuinit identify_cpu(struc

 	generic_identify(c);

-	printk(KERN_DEBUG "CPU: After generic identify, caps:");
-	for (i = 0; i < NCAPINTS; i++)
-		printk(" %08lx", c->x86_capability[i]);
-	printk("\n");
-
-	if (this_cpu->c_identify) {
+	if (this_cpu->c_identify)
 		this_cpu->c_identify(c);

-		printk(KERN_DEBUG "CPU: After vendor identify, caps:");
-		for (i = 0; i < NCAPINTS; i++)
-			printk(" %08lx", c->x86_capability[i]);
-		printk("\n");
-	}
-
 	/*
 	 * Vendor-specific initialization.  In this section we
 	 * canonicalize the feature flags, meaning if there are
@@ -462,23 +487,6 @@ static void __cpuinit identify_cpu(struc
 	 * we do "generic changes."
 	 */

-	/* TSC disabled? */
-	if ( tsc_disable )
-		clear_bit(X86_FEATURE_TSC, c->x86_capability);
-
-	/* FXSR disabled? */
-	if (disable_x86_fxsr) {
-		clear_bit(X86_FEATURE_FXSR, c->x86_capability);
-		clear_bit(X86_FEATURE_XMM, c->x86_capability);
-	}
-
-	/* SEP disabled? */
-	if (disable_x86_sep)
-		clear_bit(X86_FEATURE_SEP, c->x86_capability);
-
-	if (disable_pse)
-		clear_bit(X86_FEATURE_PSE, c->x86_capability);
-
 	/* If the model name is still unset, do table lookup. */
 	if ( !c->x86_model_id[0] ) {
 		char *p;
@@ -491,13 +499,6 @@ static void __cpuinit identify_cpu(struc
 				c->x86, c->x86_model);
 	}

-	/* Now the feature flags better reflect actual CPU features! */
-
-	printk(KERN_DEBUG "CPU: After all inits, caps:");
-	for (i = 0; i < NCAPINTS; i++)
-		printk(" %08lx", c->x86_capability[i]);
-	printk("\n");
-
 	/*
 	 * On SMP, boot_cpu_data holds the common feature set between
 	 * all CPUs; so make sure that we indicate which features are
@@ -510,8 +511,14 @@ static void __cpuinit identify_cpu(struc
 			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
 	}

+	/* Clear all flags overridden by options */
+	for (i = 0; i < NCAPINTS; i++)
+		c->x86_capability[i] &= ~cleared_cpu_caps[i];
+
 	/* Init Machine Check Exception if available. */
 	mcheck_init(c);
+
+	select_idle_routine(c);
 }

 void __init identify_boot_cpu(void)
@@ -519,7 +526,6 @@ void __init identify_boot_cpu(void)
 	identify_cpu(&boot_cpu_data);
 	sysenter_setup();
 	enable_sep_cpu();
-	mtrr_bp_init();
 }

 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -576,6 +582,13 @@ void __cpuinit detect_ht(struct cpuinfo_
 }
 #endif

+static __init int setup_noclflush(char *arg)
+{
+	setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+	return 1;
+}
+__setup("noclflush", setup_noclflush);
+
 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 {
 	char *vendor = NULL;
@@ -599,6 +612,17 @@ void __cpuinit print_cpu_info(struct cpu
 		printk("\n");
 }

+static __init int setup_disablecpuid(char *arg)
+{
+	int bit;
+	if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+		setup_clear_cpu_cap(bit);
+	else
+		return 0;
+	return 1;
+}
+__setup("clearcpuid=", setup_disablecpuid);
+
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;

 /* This is hacky. :)
@@ -608,16 +632,6 @@ cpumask_t cpu_initialized __cpuinitdata
  * They will insert themselves into the cpu_devs structure.
  * Then, when cpu_init() is called, we can just iterate over that array.
  */
-
-extern int intel_cpu_init(void);
-extern int cyrix_init_cpu(void);
-extern int nsc_init_cpu(void);
-extern int amd_init_cpu(void);
-extern int centaur_init_cpu(void);
-extern int transmeta_init_cpu(void);
-extern int nexgen_init_cpu(void);
-extern int umc_init_cpu(void);
-
 void __init early_cpu_init(void)
 {
 	intel_cpu_init();
@@ -629,21 +643,13 @@ void __init early_cpu_init(void)
 	nexgen_init_cpu();
 	umc_init_cpu();
 	early_cpu_detect();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	/* pse is not compatible with on-the-fly unmapping,
-	 * disable it even if the cpus claim to support it.
-	 */
-	clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
-	disable_pse = 1;
-#endif
 }

 /* Make sure %fs is initialized properly in idle threads */
-struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
+struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
 {
 	memset(regs, 0, sizeof(struct pt_regs));
-	regs->xfs = __KERNEL_PERCPU;
+	regs->fs = __KERNEL_PERCPU;
 	return regs;
 }

@@ -651,7 +657,7 @@ struct pt_regs * __devinit idle_regs(str
  * it's on the real one. */
 void switch_to_new_gdt(void)
 {
-	struct Xgt_desc_struct gdt_descr;
+	struct desc_ptr gdt_descr;
 	unsigned long va, frames[16];
 	int f;

@@ -694,12 +700,6 @@ void __cpuinit cpu_init(void)

 	if (cpu_has_vme || cpu_has_de)
 		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-	if (tsc_disable && cpu_has_tsc) {
-		printk(KERN_NOTICE "Disabling TSC...\n");
-		/**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
-		clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
-		set_in_cr4(X86_CR4_TSD);
-	}

 	switch_to_new_gdt();

@@ -712,7 +712,7 @@ void __cpuinit cpu_init(void)
 		BUG();
 	enter_lazy_tlb(&init_mm, curr);

-	load_esp0(t, thread);
+	load_sp0(t, thread);

 	load_LDT(&init_mm.context);

--- head-2010-04-29.orig/arch/x86/kernel/cpu/mtrr/main-xen.c	2010-03-24 15:09:22.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/cpu/mtrr/main-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = {

 struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
 unsigned int num_var_ranges;
-unsigned int *usage_table;
+unsigned int mtrr_usage_table[MAX_VAR_RANGES];

 static void __init set_num_var_ranges(void)
 {
@@ -52,17 +52,12 @@ static void __init init_table(void)
 	int i, max;

 	max = num_var_ranges;
-	if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
-	    == NULL) {
-		printk(KERN_ERR "mtrr: could not allocate\n");
-		return;
-	}
 	for (i = 0; i < max; i++)
-		usage_table[i] = 0;
+		mtrr_usage_table[i] = 0;
 }

 int mtrr_add_page(unsigned long base, unsigned long size,
-		  unsigned int type, char increment)
+		  unsigned int type, bool increment)
 {
 	int error;
 	struct xen_platform_op op;
@@ -81,7 +76,7 @@ int mtrr_add_page(unsigned long base, un
 	}

 	if (increment)
-		++usage_table[op.u.add_memtype.reg];
+		++mtrr_usage_table[op.u.add_memtype.reg];

 	mutex_unlock(&mtrr_mutex);

@@ -103,7 +98,7 @@ static int mtrr_check(unsigned long base

 int
 mtrr_add(unsigned long base, unsigned long size, unsigned int type,
-	 char increment)
+	 bool increment)
 {
 	if (mtrr_check(base, size))
 		return -EINVAL;
@@ -136,11 +131,11 @@ int mtrr_del_page(int reg, unsigned long
 			goto out;
 		}
 	}
-	if (usage_table[reg] < 1) {
+	if (mtrr_usage_table[reg] < 1) {
 		printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
 		goto out;
 	}
-	if (--usage_table[reg] < 1) {
+	if (--mtrr_usage_table[reg] < 1) {
 		op.cmd = XENPF_del_memtype;
 		op.u.del_memtype.handle = 0;
 		op.u.del_memtype.reg    = reg;
--- head-2010-04-29.orig/arch/x86/kernel/e820_32-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/e820_32-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -7,7 +7,6 @@
 #include <linux/kexec.h>
 #include <linux/module.h>
 #include <linux/mm.h>
-#include <linux/efi.h>
 #include <linux/pfn.h>
 #include <linux/uaccess.h>
 #include <linux/suspend.h>
@@ -18,11 +17,6 @@
 #include <asm/setup.h>
 #include <xen/interface/memory.h>

-#ifdef CONFIG_EFI
-int efi_enabled = 0;
-EXPORT_SYMBOL(efi_enabled);
-#endif
-
 struct e820map e820;
 struct change_member {
 	struct e820entry *pbios; /* pointer to original bios entry */
@@ -38,26 +32,6 @@ unsigned long pci_mem_start = 0x10000000
 EXPORT_SYMBOL(pci_mem_start);
 #endif
 extern int user_defined_memmap;
-struct resource data_resource = {
-	.name	= "Kernel data",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-struct resource code_resource = {
-	.name	= "Kernel code",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-struct resource bss_resource = {
-	.name	= "Kernel bss",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
-};

 static struct resource system_rom_resource = {
 	.name	= "System ROM",
@@ -112,60 +86,6 @@ static struct resource video_rom_resourc
 	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
 };

-static struct resource video_ram_resource = {
-	.name	= "Video RAM area",
-	.start	= 0xa0000,
-	.end	= 0xbffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource standard_io_resources[] = { {
-	.name	= "dma1",
-	.start	= 0x0000,
-	.end	= 0x001f,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "pic1",
-	.start	= 0x0020,
-	.end	= 0x0021,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name   = "timer0",
-	.start	= 0x0040,
-	.end    = 0x0043,
-	.flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name   = "timer1",
-	.start  = 0x0050,
-	.end    = 0x0053,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "keyboard",
-	.start	= 0x0060,
-	.end	= 0x006f,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "dma page reg",
-	.start	= 0x0080,
-	.end	= 0x008f,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "pic2",
-	.start	= 0x00a0,
-	.end	= 0x00a1,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "dma2",
-	.start	= 0x00c0,
-	.end	= 0x00df,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-	.name	= "fpu",
-	.start	= 0x00f0,
-	.end	= 0x00ff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
-} };
-
 #define ROMSIGNATURE 0xaa55

 static int __init romsignature(const unsigned char *rom)
@@ -272,10 +192,9 @@ static struct e820map machine_e820;
  * Request address space for all standard RAM and ROM resources
  * and also for regions reported as reserved by the e820.
  */
-static void __init
-legacy_init_iomem_resources(struct resource *code_resource,
-			    struct resource *data_resource,
-			    struct resource *bss_resource)
+void __init init_iomem_resources(struct resource *code_resource,
+		struct resource *data_resource,
+		struct resource *bss_resource)
 {
 	int i;

@@ -324,39 +243,6 @@ legacy_init_iomem_resources(struct resou

 #undef e820

-/*
- * Request address space for all standard resources
- *
- * This is called just before pcibios_init(), which is also a
- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
- */
-static int __init request_standard_resources(void)
-{
-	int i;
-
-	/* Nothing to do if not running in dom0. */
-	if (!is_initial_xendomain())
-		return 0;
-
-	printk("Setting up standard PCI resources\n");
-	if (efi_enabled)
-		efi_initialize_iomem_resources(&code_resource,
-				&data_resource, &bss_resource);
-	else
-		legacy_init_iomem_resources(&code_resource,
-				&data_resource, &bss_resource);
-
-	/* EFI systems may still have VGA */
-	request_resource(&iomem_resource, &video_ram_resource);
-
-	/* request I/O space for devices used on all i[345]86 PCs */
-	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
-		request_resource(&ioport_resource, &standard_io_resources[i]);
-	return 0;
-}
-
-subsys_initcall(request_standard_resources);
-
 #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
 /**
  * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
@@ -393,19 +279,17 @@ void __init add_memory_region(unsigned l
 {
 	int x;

-	if (!efi_enabled) {
-       		x = e820.nr_map;
+	x = e820.nr_map;

-		if (x == E820MAX) {
-		    printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
-		    return;
-		}
-
-		e820.map[x].addr = start;
-		e820.map[x].size = size;
-		e820.map[x].type = type;
-		e820.nr_map++;
+	if (x == E820MAX) {
+		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+		return;
 	}
+
+	e820.map[x].addr = start;
+	e820.map[x].size = size;
+	e820.map[x].type = type;
+	e820.nr_map++;
 } /* add_memory_region */

 /*
@@ -642,29 +526,6 @@ int __init copy_e820_map(struct e820entr
 }

 /*
- * Callback for efi_memory_walk.
- */
-static int __init
-efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
-{
-	unsigned long *max_pfn = arg, pfn;
-
-	if (start < end) {
-		pfn = PFN_UP(end -1);
-		if (pfn > *max_pfn)
-			*max_pfn = pfn;
-	}
-	return 0;
-}
-
-static int __init
-efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
-{
-	memory_present(0, PFN_UP(start), PFN_DOWN(end));
-	return 0;
-}
-
-/*
  * Find the highest page frame number we have available
  */
 void __init find_max_pfn(void)
@@ -672,11 +533,6 @@ void __init find_max_pfn(void)
 	int i;

 	max_pfn = 0;
-	if (efi_enabled) {
-		efi_memmap_walk(efi_find_max_pfn, &max_pfn);
-		efi_memmap_walk(efi_memory_present_wrapper, NULL);
-		return;
-	}

 	for (i = 0; i < e820.nr_map; i++) {
 		unsigned long start, end;
@@ -694,34 +550,12 @@ void __init find_max_pfn(void)
 }

 /*
- * Free all available memory for boot time allocation.  Used
- * as a callback function by efi_memory_walk()
- */
-
-static int __init
-free_available_memory(unsigned long start, unsigned long end, void *arg)
-{
-	/* check max_low_pfn */
-	if (start >= (max_low_pfn << PAGE_SHIFT))
-		return 0;
-	if (end >= (max_low_pfn << PAGE_SHIFT))
-		end = max_low_pfn << PAGE_SHIFT;
-	if (start < end)
-		free_bootmem(start, end - start);
-
-	return 0;
-}
-/*
  * Register fully available low RAM pages with the bootmem allocator.
  */
 void __init register_bootmem_low_pages(unsigned long max_low_pfn)
 {
 	int i;

-	if (efi_enabled) {
-		efi_memmap_walk(free_available_memory, NULL);
-		return;
-	}
 	for (i = 0; i < e820.nr_map; i++) {
 		unsigned long curr_pfn, last_pfn, size;
 		/*
@@ -842,56 +676,12 @@ void __init print_memory_map(char *who)
 	}
 }

-static __init __always_inline void efi_limit_regions(unsigned long long size)
-{
-	unsigned long long current_addr = 0;
-	efi_memory_desc_t *md, *next_md;
-	void *p, *p1;
-	int i, j;
-
-	j = 0;
-	p1 = memmap.map;
-	for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
-		md = p;
-		next_md = p1;
-		current_addr = md->phys_addr +
-			PFN_PHYS(md->num_pages);
-		if (is_available_memory(md)) {
-			if (md->phys_addr >= size) continue;
-			memcpy(next_md, md, memmap.desc_size);
-			if (current_addr >= size) {
-				next_md->num_pages -=
-					PFN_UP(current_addr-size);
-			}
-			p1 += memmap.desc_size;
-			next_md = p1;
-			j++;
-		} else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
-			   EFI_MEMORY_RUNTIME) {
-			/* In order to make runtime services
-			 * available we have to include runtime
-			 * memory regions in memory map */
-			memcpy(next_md, md, memmap.desc_size);
-			p1 += memmap.desc_size;
-			next_md = p1;
-			j++;
-		}
-	}
-	memmap.nr_map = j;
-	memmap.map_end = memmap.map +
-		(memmap.nr_map * memmap.desc_size);
-}
-
 void __init limit_regions(unsigned long long size)
 {
 	unsigned long long current_addr = 0;
 	int i;

 	print_memory_map("limit_regions start");
-	if (efi_enabled) {
-		efi_limit_regions(size);
-		return;
-	}
 	for (i = 0; i < e820.nr_map; i++) {
 		current_addr = e820.map[i].addr + e820.map[i].size;
 		if (current_addr < size)
@@ -1043,3 +833,44 @@ static int __init parse_memmap(char *arg
 	return 0;
 }
 early_param("memmap", parse_memmap);
+
+#ifndef CONFIG_XEN
+void __init update_memory_range(u64 start, u64 size, unsigned old_type,
+				unsigned new_type)
+{
+	int i;
+
+	BUG_ON(old_type == new_type);
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		u64 final_start, final_end;
+		if (ei->type != old_type)
+			continue;
+		/* entry lies entirely within [start, start + size)? */
+		if (ei->addr >= start && ei->addr + ei->size <= start + size) {
+			ei->type = new_type;
+			continue;
+		}
+		/* partially covered: append the overlap as a new region */
+		final_start = max(start, ei->addr);
+		final_end = min(start + size, ei->addr + ei->size);
+		if (final_start >= final_end)
+			continue;
+		add_memory_region(final_start, final_end - final_start,
+					 new_type);
+	}
+}
+
+void __init update_e820(void)
+{
+	u8 nr_map;
+
+	nr_map = e820.nr_map;
+	if (sanitize_e820_map(e820.map, &nr_map))
+		return;
+	e820.nr_map = nr_map;
+	printk(KERN_INFO "modified physical RAM map:\n");
+	print_memory_map("modified");
+}
+#endif
--- head-2010-04-29.orig/arch/x86/kernel/e820_64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/e820_64-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -1,4 +1,4 @@
-/*
+/*
  * Handle the memory map.
  * The functions here do the job until bootmem takes over.
  *
@@ -26,6 +26,7 @@
 #include <asm/proto.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
+#include <asm/kdebug.h>
 #include <xen/interface/memory.h>

 struct e820map e820 __initdata;
@@ -33,98 +34,105 @@ struct e820map e820 __initdata;
 struct e820map machine_e820;
 #endif

-/*
+/*
  * PFN of last memory page.
  */
-unsigned long end_pfn;
-EXPORT_SYMBOL(end_pfn);
+unsigned long end_pfn;

 #ifndef CONFIG_XEN
-/*
+/*
  * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
  * The direct mapping extends to end_pfn_map, so that we can directly access
  * apertures, ACPI and other tables without having to play with fixmaps.
- */
-unsigned long end_pfn_map;
+ */
+unsigned long end_pfn_map;
 #endif

-/*
+/*
  * Last pfn which the user wants to use.
  */
 static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;

-extern struct resource code_resource, data_resource, bss_resource;
-
-/* Check for some hardcoded bad areas that early boot is not allowed to touch */
-static inline int bad_addr(unsigned long *addrp, unsigned long size)
-{
-	unsigned long addr = *addrp, last = addr + size;
+/*
+ * Early reserved memory areas.
+ */
+#define MAX_EARLY_RES 20

+struct early_res {
+	unsigned long start, end;
+	char name[16];
+};
+static struct early_res early_res[MAX_EARLY_RES] __initdata = {
 #ifndef CONFIG_XEN
-	/* various gunk below that needed for SMP startup */
-	if (addr < 0x8000) {
-		*addrp = PAGE_ALIGN(0x8000);
-		return 1;
-	}
-
-	/* direct mapping tables of the kernel */
-	if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
-		*addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
-		return 1;
-	}
-
-	/* initrd */
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-		unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-		unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-		unsigned long ramdisk_end   = ramdisk_image+ramdisk_size;
-
-		if (last >= ramdisk_image && addr < ramdisk_end) {
-			*addrp = PAGE_ALIGN(ramdisk_end);
-			return 1;
-		}
-	}
+	{ 0, PAGE_SIZE, "BIOS data page" },			/* BIOS data page */
+#ifdef CONFIG_SMP
+	{ SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
 #endif
-	/* kernel code */
-	if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
-		*addrp = PAGE_ALIGN(__pa_symbol(&_end));
-		return 1;
-	}
+#endif
+	{}
+};

-	if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
-		*addrp = PAGE_ALIGN(ebda_addr + ebda_size);
-		return 1;
+void __init reserve_early(unsigned long start, unsigned long end, char *name)
+{
+	int i;
+	struct early_res *r;
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		r = &early_res[i];
+		if (end > r->start && start < r->end)
+			panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
+			      start, end - 1, name?name:"", r->start, r->end - 1, r->name);
 	}
+	if (i >= MAX_EARLY_RES)
+		panic("Too many early reservations");
+	r = &early_res[i];
+	r->start = start;
+	r->end = end;
+	if (name)
+		strncpy(r->name, name, sizeof(r->name) - 1);
+}

-#ifdef CONFIG_NUMA
-	/* NUMA memory to node map */
-	if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
-		*addrp = nodemap_addr + nodemap_size;
-		return 1;
+void __init early_res_to_bootmem(void)
+{
+	int i;
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		struct early_res *r = &early_res[i];
+		printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
+			r->start, r->end - 1, r->name);
+		reserve_bootmem_generic(r->start, r->end - r->start);
 	}
-#endif
-	/* XXX ramdisk image here? */
-#else
-	if (last < (table_end<<PAGE_SHIFT)) {
-		*addrp = table_end << PAGE_SHIFT;
-		return 1;
+}
+
+/* Check for already reserved areas */
+static inline int bad_addr(unsigned long *addrp, unsigned long size)
+{
+	int i;
+	unsigned long addr = *addrp, last;
+	int changed = 0;
+again:
+	last = addr + size;
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		struct early_res *r = &early_res[i];
+		if (last >= r->start && addr < r->end) {
+			*addrp = addr = r->end;
+			changed = 1;
+			goto again;
+		}
 	}
-#endif
-	return 0;
-}
+	return changed;
+}

 /*
  * This function checks if any part of the range <start,end> is mapped
  * with type.
  */
-int e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
-{
+int
+e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
+{
 	int i;

 #ifndef CONFIG_XEN
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
 #else
 	if (!is_initial_xendomain())
 		return 0;
@@ -132,12 +140,12 @@ int e820_any_mapped(unsigned long start,
 		const struct e820entry *ei = &machine_e820.map[i];
 #endif

-		if (type && ei->type != type)
+		if (type && ei->type != type)
 			continue;
 		if (ei->addr >= end || ei->addr + ei->size <= start)
-			continue;
-		return 1;
-	}
+			continue;
+		return 1;
+	}
 	return 0;
 }
 EXPORT_SYMBOL_GPL(e820_any_mapped);
@@ -148,7 +156,8 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
  * Note: this function only works correct if the e820 table is sorted and
  * not-overlapping, which is the case
  */
-int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
+int __init e820_all_mapped(unsigned long start, unsigned long end,
+			   unsigned type)
 {
 	int i;

@@ -173,65 +182,77 @@ int __init e820_all_mapped(unsigned long
 		 */
 		if (ei->addr <= start)
 			start = ei->addr + ei->size;
-		/* if start is now at or beyond end, we're done, full coverage */
+		/*
+		 * if start is now at or beyond end, we're done, full
+		 * coverage
+		 */
 		if (start >= end)
-			return 1; /* we're done */
+			return 1;
 	}
 	return 0;
 }

-/*
- * Find a free area in a specific range.
- */
-unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
-{
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		unsigned long addr = ei->addr, last;
-		if (ei->type != E820_RAM)
-			continue;
-		if (addr < start)
+/*
+ * Find a free area with specified alignment in a specific range.
+ */
+unsigned long __init find_e820_area(unsigned long start, unsigned long end,
+				    unsigned size, unsigned long align)
+{
+	int i;
+	unsigned long mask = ~(align - 1);
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		unsigned long addr = ei->addr, last;
+
+		if (ei->type != E820_RAM)
+			continue;
+		if (addr < start)
 			addr = start;
-		if (addr > ei->addr + ei->size)
-			continue;
+		if (addr > ei->addr + ei->size)
+			continue;
 		while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
 			;
-		last = PAGE_ALIGN(addr) + size;
+		addr = (addr + align - 1) & mask;
+		last = addr + size;
 		if (last > ei->addr + ei->size)
 			continue;
-		if (last > end)
+		if (last > end)
 			continue;
-		return addr;
-	}
-	return -1UL;
-}
+		return addr;
+	}
+	return -1UL;
+}

 /*
  * Find the highest page frame number we have available
  */
 unsigned long __init e820_end_of_ram(void)
 {
-	unsigned long end_pfn = 0;
+	unsigned long end_pfn;
+
 	end_pfn = find_max_pfn_with_active_regions();
-
-	if (end_pfn > end_pfn_map)
+
+	if (end_pfn > end_pfn_map)
 		end_pfn_map = end_pfn;
 	if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
 		end_pfn_map = MAXMEM>>PAGE_SHIFT;
 	if (end_pfn > end_user_pfn)
 		end_pfn = end_user_pfn;
-	if (end_pfn > end_pfn_map)
-		end_pfn = end_pfn_map;
+	if (end_pfn > end_pfn_map)
+		end_pfn = end_pfn_map;

-	printk("end_pfn_map = %lu\n", end_pfn_map);
-	return end_pfn;
+	printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
+	return end_pfn;
 }

 /*
  * Mark e820 reserved areas as busy for the resource manager.
  */
-void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
+void __init e820_reserve_resources(struct e820entry *e820, int nr_map,
+				   struct resource *code_resource,
+				   struct resource *data_resource,
+				   struct resource *bss_resource)
 {
 	int i;
 	for (i = 0; i < nr_map; i++) {
@@ -249,14 +270,14 @@ void __init e820_reserve_resources(struc
 		request_resource(&iomem_resource, res);
 		if (e820[i].type == E820_RAM) {
 			/*
-			 *  We don't know which RAM region contains kernel data,
-			 *  so we try it repeatedly and let the resource manager
-			 *  test it.
+			 * We don't know which RAM region contains kernel data,
+			 * so we try it repeatedly and let the resource manager
+			 * test it.
 			 */
 #ifndef CONFIG_XEN
-			request_resource(res, &code_resource);
-			request_resource(res, &data_resource);
-			request_resource(res, &bss_resource);
+			request_resource(res, code_resource);
+			request_resource(res, data_resource);
+			request_resource(res, bss_resource);
 #endif
 #ifdef CONFIG_KEXEC
 			if (crashk_res.start != crashk_res.end)
@@ -368,9 +389,9 @@ e820_register_active_regions(int nid, un
 #endif
 }

-/*
+/*
  * Add a memory region to the kernel e820 map.
- */
+ */
 void __init add_memory_region(unsigned long start, unsigned long size, int type)
 {
 	int x = e820.nr_map;
@@ -395,9 +416,7 @@ unsigned long __init e820_hole_size(unsi
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long end_pfn = end >> PAGE_SHIFT;
-	unsigned long ei_startpfn;
-	unsigned long ei_endpfn;
-	unsigned long ram = 0;
+	unsigned long ei_startpfn, ei_endpfn, ram = 0;
 	int i;

 	for (i = 0; i < e820.nr_map; i++) {
@@ -409,28 +428,31 @@ unsigned long __init e820_hole_size(unsi
 	return end - start - (ram << PAGE_SHIFT);
 }

-void __init e820_print_map(char *who)
+static void __init e820_print_map(char *who)
 {
 	int i;

 	for (i = 0; i < e820.nr_map; i++) {
 		printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
-			(unsigned long long) e820.map[i].addr,
-			(unsigned long long) (e820.map[i].addr + e820.map[i].size));
+		       (unsigned long long) e820.map[i].addr,
+		       (unsigned long long)
+		       (e820.map[i].addr + e820.map[i].size));
 		switch (e820.map[i].type) {
-		case E820_RAM:	printk("(usable)\n");
-				break;
+		case E820_RAM:
+			printk(KERN_CONT "(usable)\n");
+			break;
 		case E820_RESERVED:
-				printk("(reserved)\n");
-				break;
+			printk(KERN_CONT "(reserved)\n");
+			break;
 		case E820_ACPI:
-				printk("(ACPI data)\n");
-				break;
+			printk(KERN_CONT "(ACPI data)\n");
+			break;
 		case E820_NVS:
-				printk("(ACPI NVS)\n");
-				break;
-		default:	printk("type %u\n", e820.map[i].type);
-				break;
+			printk(KERN_CONT "(ACPI NVS)\n");
+			break;
+		default:
+			printk(KERN_CONT "type %u\n", e820.map[i].type);
+			break;
 		}
 	}
 }
@@ -438,11 +460,11 @@ void __init e820_print_map(char *who)
 /*
  * Sanitize the BIOS e820 map.
  *
- * Some e820 responses include overlapping entries.  The following
+ * Some e820 responses include overlapping entries. The following
  * replaces the original e820 map with a new one, removing overlaps.
  *
  */
-static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
+static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
 {
 	struct change_member {
 		struct e820entry *pbios; /* pointer to original bios entry */
@@ -462,7 +484,8 @@ static int __init sanitize_e820_map(stru
 	int i;

 	/*
-		Visually we're performing the following (1,2,3,4 = memory types)...
+		Visually we're performing the following
+		(1,2,3,4 = memory types)...

 		Sample memory map (w/overlaps):
 		   ____22__________________
@@ -504,22 +527,23 @@ static int __init sanitize_e820_map(stru
 	old_nr = *pnr_map;

 	/* bail out if we find any unreasonable addresses in bios map */
-	for (i=0; i<old_nr; i++)
+	for (i = 0; i < old_nr; i++)
 		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
 			return -1;

 	/* create pointers for initial change-point information (for sorting) */
-	for (i=0; i < 2*old_nr; i++)
+	for (i = 0; i < 2 * old_nr; i++)
 		change_point[i] = &change_point_list[i];

 	/* record all known change-points (starting and ending addresses),
 	   omitting those that are for empty memory regions */
 	chgidx = 0;
-	for (i=0; i < old_nr; i++)	{
+	for (i = 0; i < old_nr; i++)	{
 		if (biosmap[i].size != 0) {
 			change_point[chgidx]->addr = biosmap[i].addr;
 			change_point[chgidx++]->pbios = &biosmap[i];
-			change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
+			change_point[chgidx]->addr = biosmap[i].addr +
+				biosmap[i].size;
 			change_point[chgidx++]->pbios = &biosmap[i];
 		}
 	}
@@ -529,75 +553,106 @@ static int __init sanitize_e820_map(stru
 	still_changing = 1;
 	while (still_changing)	{
 		still_changing = 0;
-		for (i=1; i < chg_nr; i++)  {
-			/* if <current_addr> > <last_addr>, swap */
-			/* or, if current=<start_addr> & last=<end_addr>, swap */
-			if ((change_point[i]->addr < change_point[i-1]->addr) ||
-				((change_point[i]->addr == change_point[i-1]->addr) &&
-				 (change_point[i]->addr == change_point[i]->pbios->addr) &&
-				 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
-			   )
-			{
+		for (i = 1; i < chg_nr; i++)  {
+			unsigned long long curaddr, lastaddr;
+			unsigned long long curpbaddr, lastpbaddr;
+
+			curaddr = change_point[i]->addr;
+			lastaddr = change_point[i - 1]->addr;
+			curpbaddr = change_point[i]->pbios->addr;
+			lastpbaddr = change_point[i - 1]->pbios->addr;
+
+			/*
+			 * swap entries, when:
+			 *
+			 * curaddr < lastaddr or
+			 * curaddr == lastaddr and curaddr == curpbaddr and
+			 * lastaddr != lastpbaddr
+			 */
+			if (curaddr < lastaddr ||
+			    (curaddr == lastaddr && curaddr == curpbaddr &&
+			     lastaddr != lastpbaddr)) {
 				change_tmp = change_point[i];
 				change_point[i] = change_point[i-1];
 				change_point[i-1] = change_tmp;
-				still_changing=1;
+				still_changing = 1;
 			}
 		}
 	}

 	/* create a new bios memory map, removing overlaps */
-	overlap_entries=0;	 /* number of entries in the overlap table */
-	new_bios_entry=0;	 /* index for creating new bios map entries */
+	overlap_entries = 0;	 /* number of entries in the overlap table */
+	new_bios_entry = 0;	 /* index for creating new bios map entries */
 	last_type = 0;		 /* start with undefined memory type */
 	last_addr = 0;		 /* start with 0 as last starting address */
+
 	/* loop through change-points, determining affect on the new bios map */
-	for (chgidx=0; chgidx < chg_nr; chgidx++)
-	{
+	for (chgidx = 0; chgidx < chg_nr; chgidx++) {
 		/* keep track of all overlapping bios entries */
-		if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
-		{
-			/* add map entry to overlap list (> 1 entry implies an overlap) */
-			overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
-		}
-		else
-		{
-			/* remove entry from list (order independent, so swap with last) */
-			for (i=0; i<overlap_entries; i++)
-			{
-				if (overlap_list[i] == change_point[chgidx]->pbios)
-					overlap_list[i] = overlap_list[overlap_entries-1];
+		if (change_point[chgidx]->addr ==
+		    change_point[chgidx]->pbios->addr) {
+			/*
+			 * add map entry to overlap list (> 1 entry
+			 * implies an overlap)
+			 */
+			overlap_list[overlap_entries++] =
+				change_point[chgidx]->pbios;
+		} else {
+			/*
+			 * remove entry from list (order independent,
+			 * so swap with last)
+			 */
+			for (i = 0; i < overlap_entries; i++) {
+				if (overlap_list[i] ==
+				    change_point[chgidx]->pbios)
+					overlap_list[i] =
+						overlap_list[overlap_entries-1];
 			}
 			overlap_entries--;
 		}
-		/* if there are overlapping entries, decide which "type" to use */
-		/* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
+		/*
+		 * if there are overlapping entries, decide which
+		 * "type" to use (larger value takes precedence --
+		 * 1=usable, 2,3,4,4+=unusable)
+		 */
 		current_type = 0;
-		for (i=0; i<overlap_entries; i++)
+		for (i = 0; i < overlap_entries; i++)
 			if (overlap_list[i]->type > current_type)
 				current_type = overlap_list[i]->type;
-		/* continue building up new bios map based on this information */
+		/*
+		 * continue building up new bios map based on this
+		 * information
+		 */
 		if (current_type != last_type)	{
 			if (last_type != 0)	 {
 				new_bios[new_bios_entry].size =
 					change_point[chgidx]->addr - last_addr;
-				/* move forward only if the new size was non-zero */
+				/*
+				 * move forward only if the new size
+				 * was non-zero
+				 */
 				if (new_bios[new_bios_entry].size != 0)
+					/*
+					 * no more space left for new
+					 * bios entries ?
+					 */
 					if (++new_bios_entry >= E820MAX)
-						break; 	/* no more space left for new bios entries */
+						break;
 			}
 			if (current_type != 0)	{
-				new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
+				new_bios[new_bios_entry].addr =
+					change_point[chgidx]->addr;
 				new_bios[new_bios_entry].type = current_type;
-				last_addr=change_point[chgidx]->addr;
+				last_addr = change_point[chgidx]->addr;
 			}
 			last_type = current_type;
 		}
 	}
-	new_nr = new_bios_entry;   /* retain count for new bios entries */
+	/* retain count for new bios entries */
+	new_nr = new_bios_entry;

 	/* copy new bios mapping into original location */
-	memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
+	memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
 	*pnr_map = new_nr;

 	return 0;
@@ -612,7 +667,7 @@ static int __init sanitize_e820_map(stru
  * will have given us a memory map that we can use to properly
  * set up memory.  If we aren't, we'll fake a memory map.
  */
-static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
+static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
 {
 #ifndef CONFIG_XEN
 	/* Only one memory region (or negative)? Ignore it */
@@ -633,7 +688,7 @@ static int __init copy_e820_map(struct e
 			return -1;

 		add_memory_region(start, size, type);
-	} while (biosmap++,--nr_map);
+	} while (biosmap++, --nr_map);

 #ifdef CONFIG_XEN
 	if (is_initial_xendomain()) {
@@ -652,15 +707,17 @@ static int __init copy_e820_map(struct e
 	return 0;
 }

-void early_panic(char *msg)
+static void early_panic(char *msg)
 {
 	early_printk(msg);
 	panic(msg);
 }

-#ifndef CONFIG_XEN
-void __init setup_memory_region(void)
+/* We're not void only for x86 32-bit compat */
+char * __init machine_specific_memory_setup(void)
 {
+#ifndef CONFIG_XEN
+	char *who = "BIOS-e820";
 	/*
 	 * Try to copy the BIOS-supplied E820-map.
 	 *
@@ -670,14 +727,8 @@ void __init setup_memory_region(void)
 	sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
 	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
 		early_panic("Cannot find a valid memory map");
-	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-	e820_print_map("BIOS-e820");
-}
-
 #else  /* CONFIG_XEN */
-
-void __init setup_memory_region(void)
-{
+	char *who = "Xen";
 	int rc;
 	struct xen_memory_map memmap;
 	/*
@@ -705,11 +756,13 @@ void __init setup_memory_region(void)

 	if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
 		early_panic("Cannot find a valid memory map");
-
+#endif
 	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-	e820_print_map("Xen");
+	e820_print_map(who);
+
+	/* In case someone cares... */
+	return who;
 }
-#endif

 static int __init parse_memopt(char *p)
 {
@@ -720,7 +773,7 @@ static int __init parse_memopt(char *p)
 	if (!p)
 		return -EINVAL;
 	end_user_pfn = memparse(p, &p);
-	end_user_pfn >>= PAGE_SHIFT;
+	end_user_pfn >>= PAGE_SHIFT;

 	end = end_user_pfn<<PAGE_SHIFT;
 	i = e820.nr_map-1;
@@ -738,7 +791,7 @@ static int __init parse_memopt(char *p)
 	}

 	return 0;
-}
+}
 early_param("mem", parse_memopt);

 static int userdef __initdata;
@@ -750,9 +803,9 @@ static int __init parse_memmap_opt(char

 	if (!strcmp(p, "exactmap")) {
 #ifdef CONFIG_CRASH_DUMP
-		/* If we are doing a crash dump, we
-		 * still need to know the real mem
-		 * size before original memory map is
+		/*
+		 * If we are doing a crash dump, we still need to know
+		 * the real mem size before original memory map is
 		 * reset.
 		 */
 		e820_register_active_regions(0, 0, -1UL);
@@ -769,6 +822,8 @@ static int __init parse_memmap_opt(char
 	mem_size = memparse(p, &p);
 	if (p == oldp)
 		return -EINVAL;
+
+	userdef = 1;
 	if (*p == '@') {
 		start_at = memparse(p+1, &p);
 		add_memory_region(start_at, mem_size, E820_RAM);
@@ -788,11 +843,58 @@ early_param("memmap", parse_memmap_opt);
 void __init finish_e820_parsing(void)
 {
 	if (userdef) {
+		char nr = e820.nr_map;
+
+		if (sanitize_e820_map(e820.map, &nr) < 0)
+			early_panic("Invalid user supplied memory map");
+		e820.nr_map = nr;
+
 		printk(KERN_INFO "user-defined physical RAM map:\n");
 		e820_print_map("user");
 	}
 }

+#ifndef CONFIG_XEN
+void __init update_memory_range(u64 start, u64 size, unsigned old_type,
+				unsigned new_type)
+{
+	int i;
+
+	BUG_ON(old_type == new_type);
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		u64 final_start, final_end;
+		if (ei->type != old_type)
+			continue;
+		/* entry lies entirely within [start, start + size)? */
+		if (ei->addr >= start && ei->addr + ei->size <= start + size) {
+			ei->type = new_type;
+			continue;
+		}
+		/* partially covered: append the overlap as a new region */
+		final_start = max(start, ei->addr);
+		final_end = min(start + size, ei->addr + ei->size);
+		if (final_start >= final_end)
+			continue;
+		add_memory_region(final_start, final_end - final_start,
+					 new_type);
+	}
+}
+
+void __init update_e820(void)
+{
+	u8 nr_map;
+
+	nr_map = e820.nr_map;
+	if (sanitize_e820_map(e820.map, &nr_map))
+		return;
+	e820.nr_map = nr_map;
+	printk(KERN_INFO "modified physical RAM map:\n");
+	e820_print_map("modified");
+}
+#endif
+
 unsigned long pci_mem_start = 0xaeedbabe;
 EXPORT_SYMBOL(pci_mem_start);

@@ -836,8 +938,10 @@ __init void e820_setup_gap(struct e820en

 	if (!found) {
 		gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
-		printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
-		       KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
+		printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
+		       "address range\n"
+		       KERN_ERR "PCI: Unassigned devices with 32bit resource "
+		       "registers may break!\n");
 	}

 	/*
@@ -850,8 +954,9 @@ __init void e820_setup_gap(struct e820en
 	/* Fun with two's complement */
 	pci_mem_start = (gapstart + round) & -round;

-	printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
-		pci_mem_start, gapstart, gapsize);
+	printk(KERN_INFO
+	       "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
+	       pci_mem_start, gapstart, gapsize);
 }

 int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
--- head-2010-04-29.orig/arch/x86/kernel/early_printk-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/early_printk-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -222,7 +222,7 @@ static struct console simnow_console = {
 };

 /* Direct interface for emergencies */
-struct console *early_console = &early_vga_console;
+static struct console *early_console = &early_vga_console;
 static int early_console_initialized = 0;

 void early_printk(const char *fmt, ...)
--- head-2010-04-29.orig/arch/x86/kernel/entry_32-xen.S	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/entry_32-xen.S	2010-03-24 15:10:37.000000000 +0100
@@ -59,7 +59,7 @@
  * for paravirtualization.  The following will never clobber any registers:
  *   INTERRUPT_RETURN (aka. "iret")
  *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *   ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
  *
  * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
  * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
@@ -282,16 +282,21 @@ END(resume_kernel)
 #endif
 	CFI_ENDPROC

+	.macro test_tif ti_reg		# system call tracing in operation / emulation
+	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
+	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
+	.endm
+
 /* SYSENTER_RETURN points to after the "sysenter" instruction in
    the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */

 	# sysenter call handler stub
-ENTRY(sysenter_entry)
+ENTRY(ia32_sysenter_target)
 	CFI_STARTPROC simple
 	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA esp, 0
 	CFI_REGISTER esp, ebp
-	movl SYSENTER_stack_esp0(%esp),%esp
+	movl SYSENTER_stack_sp0(%esp),%esp
 sysenter_past_esp:
 	/*
 	 * No need to follow this irqs on/off section: the syscall
@@ -334,9 +339,7 @@ sysenter_past_esp:
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
-
-	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	test_tif %ebp
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
@@ -354,7 +357,7 @@ sysenter_past_esp:
 	xorl %ebp,%ebp
 	TRACE_IRQS_ON
 1:	mov  PT_FS(%esp), %fs
-	ENABLE_INTERRUPTS_SYSEXIT
+	ENABLE_INTERRUPTS_SYSCALL_RET
 	CFI_ENDPROC
 .pushsection .fixup,"ax"
 2:	movl $0,PT_FS(%esp)
@@ -363,10 +366,10 @@ sysenter_past_esp:
 	.align 4
 	.long 1b,2b
 .popsection
-ENDPROC(sysenter_entry)
+ENDPROC(ia32_sysenter_target)

 	# pv sysenter call handler stub
-ENTRY(sysenter_entry_pv)
+ENTRY(ia32pv_sysenter_target)
 	RING0_INT_FRAME
 	movl $__USER_DS,16(%esp)
 	movl %ebp,12(%esp)
@@ -389,7 +392,7 @@ ENTRY(sysenter_entry_pv)
 .previous
 	/* fall through */
 	CFI_ENDPROC
-ENDPROC(sysenter_entry_pv)
+ENDPROC(ia32pv_sysenter_target)

 	# system call handler stub
 ENTRY(system_call)
@@ -398,9 +401,7 @@ ENTRY(system_call)
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
-					# system call tracing in operation / emulation
-	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	test_tif %ebp
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
@@ -452,7 +453,8 @@ restore_nocheck_notrace:
 	RESTORE_REGS
 	addl $4, %esp			# skip orig_eax/error_code
 	CFI_ADJUST_CFA_OFFSET -4
-1:	INTERRUPT_RETURN
+irq_return:
+	INTERRUPT_RETURN
 .section .fixup,"ax"
 iret_exc:
 	pushl $0			# no error code
@@ -461,7 +463,7 @@ iret_exc:
 .previous
 .section __ex_table,"a"
 	.align 4
-	.long 1b,iret_exc
+	.long irq_return,iret_exc
 .previous

 	CFI_RESTORE_STATE
@@ -657,7 +659,7 @@ END(syscall_badsys)
  * Build the entry stubs and pointer table with
  * some assembler magic.
  */
-.data
+.section .rodata,"a"
 ENTRY(interrupt)
 .text

@@ -963,7 +965,7 @@ END(device_not_available)
  * that sets up the real kernel stack. Check here, since we can't
  * allow the wrong stack to be used.
  *
- * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
+ * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have
  * already pushed 3 words if it hits on the sysenter instruction:
  * eflags, cs and eip.
  *
@@ -975,7 +977,7 @@ END(device_not_available)
 	cmpw $__KERNEL_CS,4(%esp);		\
 	jne ok;					\
 label:						\
-	movl SYSENTER_stack_esp0+offset(%esp),%esp;	\
+	movl SYSENTER_stack_sp0+offset(%esp),%esp;	\
 	CFI_DEF_CFA esp, 0;			\
 	CFI_UNDEFINED eip;			\
 	pushfl;					\
@@ -990,7 +992,7 @@ label:						\
 KPROBE_ENTRY(debug)
 	RING0_INT_FRAME
 #ifndef CONFIG_XEN
-	cmpl $sysenter_entry,(%esp)
+	cmpl $ia32_sysenter_target,(%esp)
 	jne debug_stack_correct
 	FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
 debug_stack_correct:
@@ -1023,7 +1025,7 @@ KPROBE_ENTRY(nmi)
 	popl %eax
 	CFI_ADJUST_CFA_OFFSET -4
 	je nmi_espfix_stack
-	cmpl $sysenter_entry,(%esp)
+	cmpl $ia32_sysenter_target,(%esp)
 	je nmi_stack_fixup
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
@@ -1036,7 +1038,7 @@ KPROBE_ENTRY(nmi)
 	popl %eax
 	CFI_ADJUST_CFA_OFFSET -4
 	jae nmi_stack_correct
-	cmpl $sysenter_entry,12(%esp)
+	cmpl $ia32_sysenter_target,12(%esp)
 	je nmi_debug_stack_check
 nmi_stack_correct:
 	/* We have a RING0_INT_FRAME here */
@@ -1089,12 +1091,8 @@ nmi_espfix_stack:
 	RESTORE_REGS
 	lss 12+4(%esp), %esp		# back to espfix stack
 	CFI_ADJUST_CFA_OFFSET -24
-1:	INTERRUPT_RETURN
+	jmp irq_return
 	CFI_ENDPROC
-.section __ex_table,"a"
-	.align 4
-	.long 1b,iret_exc
-.previous
 #else
 KPROBE_ENTRY(nmi)
 	RING0_INT_FRAME
@@ -1112,17 +1110,17 @@ KPROBE_END(nmi)

 #ifdef CONFIG_PARAVIRT
 ENTRY(native_iret)
-1:	iret
+	iret
 .section __ex_table,"a"
 	.align 4
-	.long 1b,iret_exc
+	.long native_iret, iret_exc
 .previous
 END(native_iret)

-ENTRY(native_irq_enable_sysexit)
+ENTRY(native_irq_enable_syscall_ret)
 	sti
 	sysexit
-END(native_irq_enable_sysexit)
+END(native_irq_enable_syscall_ret)
 #endif

 KPROBE_ENTRY(int3)
@@ -1271,7 +1269,144 @@ ENTRY(kernel_thread_helper)
 	CFI_ENDPROC
 ENDPROC(kernel_thread_helper)

+#include <asm/alternative-asm.h>
+
+	# pv syscall call handler stub
+ENTRY(ia32pv_cstar_target)
+	RING0_INT_FRAME
+	movl $__USER_DS,16(%esp)
+	movl %ebp,%ecx			# stash user EBP; restored from PT_ECX on exit
+	movl $__USER_CS,4(%esp)
+	movl 12(%esp),%ebp		# user stack pointer, dereferenced below for arg 6
+	pushl %eax			# save orig_eax
+	CFI_ADJUST_CFA_OFFSET 4
+/*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
+ */
+	cmpl $__PAGE_OFFSET-4,%ebp
+	CFI_REMEMBER_STATE
+	ja cstar_fault			# pointer above __PAGE_OFFSET-4: fail with -EFAULT
+1:	movl (%ebp),%ebp
+.section __ex_table,"a"
+	.align 4
+	.long 1b,cstar_fault
+.previous
+	SAVE_ALL
+	GET_THREAD_INFO(%ebp)
+	test_tif %ebp
+	jnz cstar_trace_entry
+	cmpl $nr_syscalls,%eax
+	jae cstar_badsys
+.Lcstar_call:
+	btl %eax,cstar_special		# bit set for sigreturn/rt_sigreturn
+	jc .Lcstar_special		# those use syscall_call with EBP restored first
+	call *cstar_call_table(,%eax,4)
+	movl %eax,PT_EAX(%esp)		# store the return value
+.Lcstar_exit:
+	movl PT_ECX(%esp),%ecx
+	movl %ecx,PT_EBP(%esp)		# put user EBP back in place
+	jmp syscall_exit
+.Lcstar_special:
+	movl PT_ECX(%esp),%ecx
+	movl %ecx,PT_EBP(%esp)		# put user EBP back in place
+	jmp syscall_call
+cstar_set_tif:				# cstar_call_table slot for fork/clone/vfork
+	movl $cstar_clear_tif,(%esp)	# replace return address
+	LOCK_PREFIX
+	orl $_TIF_CSTAR,TI_flags(%ebp)
+	jmp *sys_call_table(,%eax,4)	# tail-jump to the real handler
+cstar_clear_tif:
+	movl %eax,PT_EAX(%esp)		# store the return value
+	LOCK_PREFIX
+	andl $~_TIF_CSTAR,TI_flags(%ebp)
+	jmp .Lcstar_exit
+cstar_trace_entry:
+	movl $-ENOSYS,PT_EAX(%esp)
+	cmpl $nr_syscalls,%eax
+	jae 1f
+	btl %eax,cstar_special
+	jc .Lcstar_trace_special
+1:	movl %esp,%eax
+	xorl %edx,%edx
+	LOCK_PREFIX
+	orl $_TIF_CSTAR,TI_flags(%ebp)	# flag is set only around do_syscall_trace
+	call do_syscall_trace
+	LOCK_PREFIX
+	andl $~_TIF_CSTAR,TI_flags(%ebp)
+	testl %eax,%eax
+	jne .Lcstar_resume		# ret != 0 -> running under PTRACE_SYSEMU,
+					# so must skip actual syscall
+	movl PT_ORIG_EAX(%esp),%eax
+	cmpl $nr_syscalls,%eax
+	jb .Lcstar_call
+	jmp .Lcstar_exit
+.Lcstar_trace_special:
+	movl PT_ECX(%esp),%ecx
+	movl %esp,%eax
+	xorl %edx,%edx
+	movl %ecx,PT_EBP(%esp)		# put user EBP back in place
+	call do_syscall_trace
+	testl %eax,%eax
+	jne resume_userspace		# ret != 0 -> running under PTRACE_SYSEMU,
+					# so must skip actual syscall
+	movl PT_ORIG_EAX(%esp),%eax
+	cmpl $nr_syscalls,%eax
+	jb syscall_call
+	jmp syscall_exit
+cstar_badsys:
+	movl $-ENOSYS,PT_EAX(%esp)
+.Lcstar_resume:
+	movl PT_ECX(%esp),%ecx
+	movl %ecx,PT_EBP(%esp)		# put user EBP back in place
+	jmp resume_userspace
+	CFI_RESTORE_STATE
+cstar_fault:
+	movl $-EFAULT,%eax
+	SAVE_ALL
+	GET_THREAD_INFO(%ebp)
+	jmp .Lcstar_resume
+	CFI_ENDPROC
+ENDPROC(ia32pv_cstar_target)
+
+ENTRY(cstar_ret_from_fork)
+	CFI_STARTPROC
+	movl PT_ECX(%esp),%ecx
+	GET_THREAD_INFO(%ebp)
+	movl %ecx,PT_EBP(%esp)		# put user EBP back in place
+	LOCK_PREFIX
+	andl $~_TIF_CSTAR,TI_flags(%ebp)	# clear _TIF_CSTAR (cf. cstar_set_tif)
+	jmp ret_from_fork		# continue on the regular fork return path
+	CFI_ENDPROC
+END(cstar_ret_from_fork)
+
 .section .rodata,"a"
 #include "syscall_table_32.S"

 syscall_table_size=(.-sys_call_table)
+
+#include <asm/unistd.h>
+cstar_special:			/* one bit per syscall nr; set => use syscall_call */
+nr=0
+mask=0
+.rept nr_syscalls+31		/* +31 so the final partial word is emitted too */
+ .irp n, __NR_sigreturn, __NR_rt_sigreturn
+  .if nr == \n
+   mask = mask | (1 << (\n & 31))
+  .endif
+ .endr
+ nr = nr + 1
+ .if (nr & 31) == 0
+  .long mask
+  mask = 0
+ .endif
+.endr
+#define	sys_call_table cstar_call_table
+#define	sys_fork cstar_set_tif
+#define	sys_clone cstar_set_tif
+#define	sys_vfork cstar_set_tif
+#include "syscall_table_32.S"	/* 2nd copy: cstar_call_table via the renames above */
+#undef	sys_call_table
+#undef	sys_fork
+#undef	sys_clone
+#undef	sys_vfork
--- head-2010-04-29.orig/arch/x86/kernel/entry_64-xen.S	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/entry_64-xen.S	2010-03-24 15:10:37.000000000 +0100
@@ -54,11 +54,9 @@
 #include <asm/page.h>
 #include <asm/irqflags.h>
 #include <asm/errno.h>
-#include <xen/interface/arch-x86_64.h>
+#include <xen/interface/xen.h>
 #include <xen/interface/features.h>

-#include "xen_entry_64.S"
-
 	.code64

 #ifndef CONFIG_PREEMPT
@@ -277,7 +275,7 @@ ret_from_sys_call:
 sysret_check:
 	LOCKDEP_SYS_EXIT
 	GET_THREAD_INFO(%rcx)
-        XEN_BLOCK_EVENTS(%rsi)
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	movl threadinfo_flags(%rcx),%edx
 	andl %edi,%edx
@@ -287,7 +285,7 @@ sysret_check:
 	 * sysretq will re-enable interrupts:
 	 */
 	TRACE_IRQS_ON
-        XEN_UNBLOCK_EVENTS(%rsi)
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	RESTORE_ARGS 0,8,0
         HYPERVISOR_IRET VGCF_IN_SYSCALL

@@ -298,7 +296,7 @@ sysret_careful:
 	bt $TIF_NEED_RESCHED,%edx
 	jnc sysret_signal
 	TRACE_IRQS_ON
-	XEN_UNBLOCK_EVENTS(%rsi)
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq %rdi
 	CFI_ADJUST_CFA_OFFSET 8
 	call schedule
@@ -309,9 +307,8 @@ sysret_careful:
 	/* Handle a signal */
 sysret_signal:
 	TRACE_IRQS_ON
-/*	sti */
-        XEN_UNBLOCK_EVENTS(%rsi)
-	testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+	ENABLE_INTERRUPTS(CLBR_NONE)
+	testl $_TIF_DO_NOTIFY_MASK,%edx
 	jz    1f

 	/* Really a signal */
@@ -323,7 +320,7 @@ sysret_signal:
 1:	movl $_TIF_NEED_RESCHED,%edi
 	/* Use IRET because user could have changed frame. This
 	   works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
-	XEN_BLOCK_EVENTS(%rsi)
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check

@@ -355,7 +352,7 @@ tracesys:
  */
 	.globl int_ret_from_sys_call
 int_ret_from_sys_call:
-        XEN_BLOCK_EVENTS(%rsi)
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	testb $3,CS-ARGOFFSET(%rsp)
         jnz 1f
@@ -381,22 +378,20 @@ int_careful:
 	bt $TIF_NEED_RESCHED,%edx
 	jnc  int_very_careful
 	TRACE_IRQS_ON
-/*	sti */
-        XEN_UNBLOCK_EVENTS(%rsi)
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq %rdi
 	CFI_ADJUST_CFA_OFFSET 8
 	call schedule
 	popq %rdi
 	CFI_ADJUST_CFA_OFFSET -8
-	XEN_BLOCK_EVENTS(%rsi)
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check

 	/* handle signals and tracing -- both require a full stack frame */
 int_very_careful:
 	TRACE_IRQS_ON
-/*	sti */
-        XEN_UNBLOCK_EVENTS(%rsi)
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_REST
 	/* Check for syscall exit trace */
 	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
@@ -411,7 +406,7 @@ int_very_careful:
 	jmp int_restore_rest

 int_signal:
-	testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+	testl $_TIF_DO_NOTIFY_MASK,%edx
 	jz 1f
 	movq %rsp,%rdi		# &ptregs -> arg1
 	xorl %esi,%esi		# oldset -> arg2
@@ -419,7 +414,7 @@ int_signal:
 1:	movl $_TIF_NEED_RESCHED,%edi
 int_restore_rest:
 	RESTORE_REST
-	XEN_BLOCK_EVENTS(%rsi)
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check
 	CFI_ENDPROC
@@ -474,6 +469,7 @@ ENTRY(stub_execve)
 	CFI_REGISTER rip, r11
 	SAVE_REST
 	FIXUP_TOP_OF_STACK %r11
+	movq %rsp, %rcx
 	call sys_execve
 	RESTORE_TOP_OF_STACK %r11
 	movq %rax,RAX(%rsp)
@@ -526,11 +522,10 @@ retint_check:
 retint_restore_args:	/* return to kernel space */
 	movl EFLAGS-REST_SKIP(%rsp), %eax
 	shr $9, %eax			# EAX[0] == IRET_EFLAGS.IF
-	XEN_GET_VCPU_INFO(%rsi)
+	GET_VCPU_INFO
 	andb evtchn_upcall_mask(%rsi),%al
 	andb $1,%al			# EAX[0] == IRET_EFLAGS.IF & event_mask
 	jnz restore_all_enable_events	#        != 0 => enable event delivery
-	XEN_PUT_VCPU_INFO(%rsi)

 	RESTORE_ARGS 0,8,0
 	HYPERVISOR_IRET 0
@@ -541,31 +536,29 @@ retint_careful:
 	bt    $TIF_NEED_RESCHED,%edx
 	jnc   retint_signal
 	TRACE_IRQS_ON
-	XEN_UNBLOCK_EVENTS(%rsi)
-/*	sti */
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq %rdi
 	CFI_ADJUST_CFA_OFFSET	8
 	call  schedule
 	popq %rdi
 	CFI_ADJUST_CFA_OFFSET	-8
 	GET_THREAD_INFO(%rcx)
-	XEN_BLOCK_EVENTS(%rsi)
-/*	cli */
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp retint_check

 retint_signal:
-	testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+	testl $_TIF_DO_NOTIFY_MASK,%edx
 	jz    retint_restore_args
 	TRACE_IRQS_ON
-        XEN_UNBLOCK_EVENTS(%rsi)
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_REST
 	movq $-1,ORIG_RAX(%rsp)
 	xorl %esi,%esi		# oldset
 	movq %rsp,%rdi		# &pt_regs
 	call do_notify_resume
 	RESTORE_REST
-        XEN_BLOCK_EVENTS(%rsi)
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	movl $_TIF_NEED_RESCHED,%edi
 	GET_THREAD_INFO(%rcx)
@@ -702,7 +695,7 @@ END(spurious_interrupt)
 	rdmsr
 	testl %edx,%edx
 	js    1f
-	swapgs
+	SWAPGS
 	xorl  %ebx,%ebx
 1:
 #endif
@@ -719,8 +712,7 @@ END(spurious_interrupt)
 	.if \ist
 	addq	$EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
 	.endif
-/*	cli */
-	XEN_BLOCK_EVENTS(%rsi)
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	.if \irqtrace
 	TRACE_IRQS_OFF
 	.endif
@@ -749,10 +741,10 @@ paranoid_swapgs\trace:
 	.if \trace
 	TRACE_IRQS_IRETQ 0
 	.endif
-	swapgs
+	SWAPGS_UNSAFE_STACK
 paranoid_restore\trace:
 	RESTORE_ALL 8
-	iretq
+	jmp irq_return
 paranoid_userspace\trace:
 	GET_THREAD_INFO(%rcx)
 	movl threadinfo_flags(%rcx),%ebx
@@ -767,11 +759,11 @@ paranoid_userspace\trace:
 	.if \trace
 	TRACE_IRQS_ON
 	.endif
-	sti
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	xorl %esi,%esi 			/* arg2: oldset */
 	movq %rsp,%rdi 			/* arg1: &pt_regs */
 	call do_notify_resume
-	cli
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	.if \trace
 	TRACE_IRQS_OFF
 	.endif
@@ -780,9 +772,9 @@ paranoid_schedule\trace:
 	.if \trace
 	TRACE_IRQS_ON
 	.endif
-	sti
+	ENABLE_INTERRUPTS(CLBR_ANY)
 	call schedule
-	cli
+	DISABLE_INTERRUPTS(CLBR_ANY)
 	.if \trace
 	TRACE_IRQS_OFF
 	.endif
@@ -846,8 +838,7 @@ error_call_handler:
 	call *%rax
 error_exit:
 	RESTORE_REST
-/*	cli */
-	XEN_BLOCK_EVENTS(%rsi)
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
 	testb $3,CS-ARGOFFSET(%rsp)
@@ -875,7 +866,7 @@ error_kernelspace:
 	   iret run with kernel gs again, so don't set the user space flag.
 	   B stepping K8s sometimes report an truncated RIP for IRET
 	   exceptions returning to compat mode. Check for these here too. */
-	leaq iret_label(%rip),%rbp
+	leaq irq_return(%rip),%rbp
 	cmpq %rbp,RIP(%rsp)
 	je   error_swapgs
 	movl %ebp,%ebp	/* zero extend */
@@ -930,19 +921,17 @@ END(do_hypervisor_callback)
 restore_all_enable_events:
 	CFI_DEFAULT_STACK adj=1
 	TRACE_IRQS_ON
-	XEN_UNBLOCK_EVENTS(%rsi)        # %rsi is already set up...
+	__ENABLE_INTERRUPTS

 scrit:	/**** START OF CRITICAL REGION ****/
-	XEN_TEST_PENDING(%rsi)
+	__TEST_PENDING
 	CFI_REMEMBER_STATE
 	jnz  14f			# process more events if necessary...
-	XEN_PUT_VCPU_INFO(%rsi)
         RESTORE_ARGS 0,8,0
         HYPERVISOR_IRET 0

 	CFI_RESTORE_STATE
-14:	XEN_LOCKED_BLOCK_EVENTS(%rsi)
-	XEN_PUT_VCPU_INFO(%rsi)
+14:	__DISABLE_INTERRUPTS
 	SAVE_REST
         movq %rsp,%rdi                  # set the argument again
 	jmp  11b
@@ -1086,15 +1075,16 @@ ENDPROC(child_rip)
  *	rdi: name, rsi: argv, rdx: envp
  *
  * We want to fallback into:
- *	extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
+ *	extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
  *
  * do_sys_execve asm fallback arguments:
- *	rdi: name, rsi: argv, rdx: envp, fake frame on the stack
+ *	rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
  */
 ENTRY(kernel_execve)
 	CFI_STARTPROC
 	FAKE_STACK_FRAME $0
 	SAVE_ALL
+	movq %rsp,%rcx
 	call sys_execve
 	movq %rax, RAX(%rsp)
 	RESTORE_REST
@@ -1144,7 +1134,7 @@ do_nmi_callback:
 	call do_nmi
 	orl  $NMI_MASK,EFLAGS(%rsp)
 	RESTORE_REST
-	XEN_BLOCK_EVENTS(%rsi)
+	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
 	jmp  retint_restore_args
--- head-2010-04-29.orig/arch/x86/kernel/fixup.c	2008-01-28 12:24:18.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/fixup.c	2010-03-24 15:10:37.000000000 +0100
@@ -36,7 +36,7 @@

 #define DP(_f, _args...) printk(KERN_ALERT "  " _f "\n" , ## _args )

-fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
+void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
 {
 	static unsigned long printed = 0;
 	char info[100];
--- head-2010-04-29.orig/arch/x86/kernel/head64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/head64-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -16,6 +16,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/percpu.h>
+#include <linux/start_kernel.h>
 #include <linux/module.h>

 #include <asm/processor.h>
@@ -26,6 +27,8 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/e820.h>

 unsigned long start_pfn;

@@ -34,7 +37,7 @@ static void __init zap_identity_mappings
 {
 	pgd_t *pgd = pgd_offset_k(0UL);
 	pgd_clear(pgd);
-	__flush_tlb();
+	__flush_tlb_all();
 }

 /* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -72,6 +75,37 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
 unsigned int machine_to_phys_order;
 EXPORT_SYMBOL(machine_to_phys_order);

+#define EBDA_ADDR_POINTER 0x40E
+
+static __init void reserve_ebda(void)
+{
+#ifndef CONFIG_XEN	/* the whole body is compiled out under Xen */
+	unsigned ebda_addr, ebda_size;
+
+	/*
+	 * there is a real-mode segmented pointer pointing to the
+	 * 4K EBDA area at 0x40E
+	 */
+	ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
+	ebda_addr <<= 4;	/* segment value -> byte address */
+
+	if (!ebda_addr)
+		return;
+
+	ebda_size = *(unsigned short *)__va(ebda_addr);
+
+	/* Round EBDA up to pages */
+	if (ebda_size == 0)
+		ebda_size = 1;
+	ebda_size <<= 10;	/* x1024; the size word is presumably in KiB */
+	ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
+	if (ebda_size > 64*1024)	/* never reserve more than 64 KiB */
+		ebda_size = 64*1024;
+
+	reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
+#endif
+}
+
 void __init x86_64_start_kernel(char * real_mode_data)
 {
 	struct xen_machphys_mapping mapping;
@@ -103,8 +137,16 @@ void __init x86_64_start_kernel(char * r
 	/* Make NULL pointers segfault */
 	zap_identity_mappings();

-	for (i = 0; i < IDT_ENTRIES; i++)
+	/* Cleanup the over mapped high alias */
+	cleanup_highmap();
+
+	for (i = 0; i < IDT_ENTRIES; i++) {
+#ifdef CONFIG_EARLY_PRINTK
+		set_intr_gate(i, &early_idt_handlers[i]);
+#else
 		set_intr_gate(i, early_idt_handler);
+#endif
+	}
 	load_idt((const struct desc_ptr *)&idt_descr);
 #endif

@@ -115,8 +157,19 @@ void __init x86_64_start_kernel(char * r

 	pda_init(0);
 	copy_bootdata(__va(real_mode_data));
-#ifdef CONFIG_SMP
-	cpu_set(0, cpu_online_map);
-#endif
+
+	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+
+	reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
+		      start_pfn << PAGE_SHIFT, "Xen provided");
+
+	reserve_ebda();
+
+	/*
+	 * At this point everything still needed from the boot loader
+	 * or BIOS or kernel text should be early reserved or marked not
+	 * RAM in e820. All other memory is free game.
+	 */
+
 	start_kernel();
 }
--- head-2010-04-29.orig/arch/x86/kernel/head_32-xen.S	2010-03-24 15:09:22.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/head_32-xen.S	2010-03-24 15:10:37.000000000 +0100
@@ -3,6 +3,7 @@
 .text
 #include <linux/elfnote.h>
 #include <linux/threads.h>
+#include <linux/init.h>
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/page.h>
@@ -88,7 +89,7 @@ ENTRY(_stext)
  */
 .section ".bss.page_aligned","wa"
 	.align PAGE_SIZE_asm
-ENTRY(swapper_pg_pmd)
+ENTRY(swapper_pg_fixmap)
 	.fill 1024,4,0
 ENTRY(empty_zero_page)
 	.fill 4096,1,0
--- head-2010-04-29.orig/arch/x86/kernel/io_apic_32-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/io_apic_32-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -35,6 +35,7 @@
 #include <linux/htirq.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
+#include <linux/jiffies.h>	/* time_after() */

 #include <asm/io.h>
 #include <asm/smp.h>
@@ -48,8 +49,6 @@
 #include <mach_apic.h>
 #include <mach_apicdef.h>

-#include "io_ports.h"
-
 #ifdef CONFIG_XEN
 #include <xen/interface/xen.h>
 #include <xen/interface/physdev.h>
@@ -400,7 +399,7 @@ static void set_ioapic_affinity_irq(unsi
 # include <asm/processor.h>	/* kernel_thread() */
 # include <linux/kernel_stat.h>	/* kstat */
 # include <linux/slab.h>		/* kmalloc() */
-# include <linux/timer.h>	/* time_after() */
+# include <linux/timer.h>

 #define IRQBALANCE_CHECK_ARCH -999
 #define MAX_BALANCED_IRQ_INTERVAL	(5*HZ)
@@ -777,7 +776,7 @@ late_initcall(balanced_irq_init);
 #endif

 #ifndef CONFIG_SMP
-void fastcall send_IPI_self(int vector)
+void send_IPI_self(int vector)
 {
 #ifndef CONFIG_XEN
 	unsigned int cfg;
@@ -1959,7 +1958,7 @@ static int __init timer_irq_works(void)
 	 * might have cached one ExtINT interrupt.  Finally, at
 	 * least one tick may be lost due to delays.
 	 */
-	if (jiffies - t1 > 4)
+	if (time_after(jiffies, t1 + 4))
 		return 1;

 	return 0;
@@ -2142,7 +2141,7 @@ static struct irq_chip lapic_chip __read
 	.eoi		= ack_apic,
 };

-static void setup_nmi (void)
+static void __init setup_nmi(void)
 {
 	/*
  	 * Dirty trick to enable the NMI watchdog ...
@@ -2155,7 +2154,7 @@ static void setup_nmi (void)
 	 */
 	apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");

-	on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
+	enable_NMI_through_LVT0();

 	apic_printk(APIC_VERBOSE, " done.\n");
 }
@@ -2479,7 +2478,7 @@ static int ioapic_resume(struct sys_devi
 }

 static struct sysdev_class ioapic_sysdev_class = {
-	set_kset_name("ioapic"),
+	.name = "ioapic",
 	.suspend = ioapic_suspend,
 	.resume = ioapic_resume,
 };
--- head-2010-04-29.orig/arch/x86/kernel/io_apic_64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/io_apic_64-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -32,9 +32,11 @@
 #include <linux/msi.h>
 #include <linux/htirq.h>
 #include <linux/dmar.h>
+#include <linux/jiffies.h>
 #ifdef CONFIG_ACPI
 #include <acpi/acpi_bus.h>
 #endif
+#include <linux/bootmem.h>

 #include <asm/idle.h>
 #include <asm/io.h>
@@ -1064,7 +1066,7 @@ void __apicdebuginit print_local_APIC(vo
 	v = apic_read(APIC_LVR);
 	printk(KERN_INFO "... APIC VERSION: %08x\n", v);
 	ver = GET_APIC_VERSION(v);
-	maxlvt = get_maxlvt();
+	maxlvt = lapic_get_maxlvt();

 	v = apic_read(APIC_TASKPRI);
 	printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
@@ -1165,7 +1167,7 @@ void __apicdebuginit print_PIC(void)
 }
 #endif /* !CONFIG_XEN */

-static void __init enable_IO_APIC(void)
+void __init enable_IO_APIC(void)
 {
 	union IO_APIC_reg_01 reg_01;
 #ifndef CONFIG_XEN
@@ -1299,7 +1301,7 @@ static int __init timer_irq_works(void)
 	 */

 	/* jiffies wrap? */
-	if (jiffies - t1 > 4)
+	if (time_after(jiffies, t1 + 4))
 		return 1;
 	return 0;
 }
@@ -1412,7 +1414,7 @@ static void irq_complete_move(unsigned i
 	if (likely(!cfg->move_in_progress))
 		return;

-	vector = ~get_irq_regs()->orig_rax;
+	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
 		cpumask_t cleanup_mask;
@@ -1439,7 +1441,7 @@ static void ack_apic_level(unsigned int
 	int do_unmask_irq = 0;

 	irq_complete_move(irq);
-#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
 	/* If we are moving the irq we need to mask it */
 	if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
 		do_unmask_irq = 1;
@@ -1570,7 +1572,7 @@ static struct hw_interrupt_type lapic_ir
 	.end = end_lapic_irq,
 };

-static void setup_nmi (void)
+static void __init setup_nmi(void)
 {
 	/*
  	 * Dirty trick to enable the NMI watchdog ...
@@ -1583,7 +1585,7 @@ static void setup_nmi (void)
 	 */
 	printk(KERN_INFO "activating NMI Watchdog ...");

-	enable_NMI_through_LVT0(NULL);
+	enable_NMI_through_LVT0();

 	printk(" done.\n");
 }
@@ -1659,7 +1661,7 @@ static inline void unlock_ExtINT_logic(v
  *
  * FIXME: really need to revamp this for modern platforms only.
  */
-static inline void check_timer(void)
+static inline void __init check_timer(void)
 {
 	struct irq_cfg *cfg = irq_cfg + 0;
 	int apic1, pin1, apic2, pin2;
@@ -1863,7 +1865,7 @@ static int ioapic_resume(struct sys_devi
 }

 static struct sysdev_class ioapic_sysdev_class = {
-	set_kset_name("ioapic"),
+	.name = "ioapic",
 	.suspend = ioapic_suspend,
 	.resume = ioapic_resume,
 };
@@ -2303,5 +2305,93 @@ void __init setup_ioapic_dest(void)
 	}
 }
 #endif
-#endif /* !CONFIG_XEN */

+#define IOAPIC_RESOURCE_NAME_SIZE 11	/* room for "IOAPIC nnn" + NUL */
+
+static struct resource *ioapic_resources;
+
+static struct resource * __init ioapic_setup_resources(void)
+{
+	unsigned long n;
+	struct resource *res;
+	char *mem;
+	int i;
+
+	if (nr_ioapics <= 0)
+		return NULL;	/* ioapic_resources is left unassigned */
+
+	n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+	n *= nr_ioapics;
+
+	mem = alloc_bootmem(n);
+	res = (void *)mem;	/* resource array first, names after it */
+
+	if (mem != NULL) {
+		memset(mem, 0, n);
+		mem += sizeof(struct resource) * nr_ioapics;
+
+		for (i = 0; i < nr_ioapics; i++) {
+			res[i].name = mem;
+			res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+			sprintf(mem,  "IOAPIC %u", i);
+			mem += IOAPIC_RESOURCE_NAME_SIZE;
+		}
+	}
+
+	ioapic_resources = res;	/* read by ioapic_insert_resources() */
+
+	return res;
+}
+
+void __init ioapic_init_mappings(void)
+{
+	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+	struct resource *ioapic_res;
+	int i;
+
+	ioapic_res = ioapic_setup_resources();
+	for (i = 0; i < nr_ioapics; i++) {
+		if (smp_found_config) {
+			ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+		} else {	/* !smp_found_config: map a fresh page instead */
+			ioapic_phys = (unsigned long)
+				alloc_bootmem_pages(PAGE_SIZE);
+			ioapic_phys = __pa(ioapic_phys);
+		}
+		set_fixmap_nocache(idx, ioapic_phys);
+		apic_printk(APIC_VERBOSE,
+			    "mapped IOAPIC to %016lx (%016lx)\n",
+			    __fix_to_virt(idx), ioapic_phys);
+		idx++;		/* next fixmap index */
+
+		if (ioapic_res != NULL) {
+			ioapic_res->start = ioapic_phys;
+			ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+			ioapic_res++;	/* one resource per IO-APIC */
+		}
+	}
+}
+
+static int __init ioapic_insert_resources(void)
+{
+	int i;
+	struct resource *r = ioapic_resources;
+
+	if (!r) {	/* NOTE(review): also taken when nr_ioapics <= 0 */
+		printk(KERN_ERR
+		       "IO APIC resources could be not be allocated.\n");
+		return -1;
+	}
+
+	for (i = 0; i < nr_ioapics; i++) {
+		insert_resource(&iomem_resource, r);	/* result is ignored */
+		r++;
+	}
+
+	return 0;
+}
+
+/* Insert the IO APIC resources after PCI initialization has occurred to handle
+ * IO APICs that are mapped in on a BAR in PCI space. */
+late_initcall(ioapic_insert_resources);
+#endif /* !CONFIG_XEN */
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2010-04-29/arch/x86/kernel/ioport-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -0,0 +1,112 @@
+/*
+ * This contains the io-permission bitmap code - written by obz, with changes
+ * by Linus. 32/64 bits code unification by Miguel Botón.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/syscalls.h>
+#include <xen/interface/physdev.h>
+
+/* Set EXTENT bits starting at BASE in BITMAP to NEW_VALUE (zero clears). */
+static void set_bitmap(unsigned long *bitmap, unsigned int base,
+		       unsigned int extent, int new_value)
+{
+	unsigned int i;
+
+	for (i = base; i < base + extent; i++) {
+		if (new_value)
+			__set_bit(i, bitmap);
+		else
+			__clear_bit(i, bitmap);
+	}
+}
+
+/*
+ * Change the current task's io permission bitmap (turn_on clears the bits).
+ */
+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+	struct thread_struct * t = &current->thread;
+	struct physdev_set_iobitmap set_iobitmap;
+
+	if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+		return -EINVAL;
+	if (turn_on && !capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	/*
+	 * If it's the first ioperm() call in this thread's lifetime, set the
+	 * IO bitmap up and pass it to Xen. ioperm() is much less timing
+	 * critical than clone(), this is why we delay this operation until now:
+	 */
+	if (!t->io_bitmap_ptr) {
+		unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+
+		if (!bitmap)
+			return -ENOMEM;
+
+		memset(bitmap, 0xff, IO_BITMAP_BYTES);	/* start all-ones */
+		t->io_bitmap_ptr = bitmap;
+		set_thread_flag(TIF_IO_BITMAP);
+
+		set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
+		set_iobitmap.nr_ports = IO_BITMAP_BITS;
+		WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
+					      &set_iobitmap));
+	}
+
+	set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+
+	return 0;
+}
+
+/*
+ * sys_iopl has to be used when you want to access the IO ports
+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ */
+static int do_iopl(unsigned int level, struct thread_struct *t)
+{
+	unsigned int old = t->iopl >> 12;	/* t->iopl holds level << 12 */
+
+	if (level > 3)
+		return -EINVAL;
+	/* Trying to gain more privileges? */
+	if (level > old) {
+		if (!capable(CAP_SYS_RAWIO))
+			return -EPERM;
+	}
+
+	return 0;	/* validation only; the caller stores LEVEL */
+}
+
+#ifdef CONFIG_X86_32	/* 32-bit: LEVEL is read from the saved bx slot */
+asmlinkage long sys_iopl(unsigned long regsp)
+{
+	struct pt_regs *regs = (struct pt_regs *)&regsp;
+	unsigned int level = regs->bx;
+#else			/* 64-bit: LEVEL is an ordinary argument */
+asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
+{
+#endif
+	struct thread_struct *t = &current->thread;
+	int rc;
+
+	rc = do_iopl(level, t);
+	if (rc < 0)
+		goto out;	/* rejected: t->iopl is left untouched */
+
+	t->iopl = level << 12;	/* stored pre-shifted, see do_iopl() */
+	set_iopl_mask(t->iopl);
+out:
+	return rc;
+}
--- head-2010-04-29.orig/arch/x86/kernel/ioport_32-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,121 +0,0 @@
-/*
- * This contains the io-permission bitmap code - written by obz, with changes
- * by Linus.
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/ioport.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/thread_info.h>
-#include <linux/syscalls.h>
-#include <xen/interface/physdev.h>
-
-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
-static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
-{
-	unsigned long mask;
-	unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
-	unsigned int low_index = base & (BITS_PER_LONG-1);
-	int length = low_index + extent;
-
-	if (low_index != 0) {
-		mask = (~0UL << low_index);
-		if (length < BITS_PER_LONG)
-			mask &= ~(~0UL << length);
-		if (new_value)
-			*bitmap_base++ |= mask;
-		else
-			*bitmap_base++ &= ~mask;
-		length -= BITS_PER_LONG;
-	}
-
-	mask = (new_value ? ~0UL : 0UL);
-	while (length >= BITS_PER_LONG) {
-		*bitmap_base++ = mask;
-		length -= BITS_PER_LONG;
-	}
-
-	if (length > 0) {
-		mask = ~(~0UL << length);
-		if (new_value)
-			*bitmap_base++ |= mask;
-		else
-			*bitmap_base++ &= ~mask;
-	}
-}
-
-
-/*
- * this changes the io permissions bitmap in the current task.
- */
-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
-{
-	struct thread_struct * t = &current->thread;
-	unsigned long *bitmap;
-	struct physdev_set_iobitmap set_iobitmap;
-
-	if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
-		return -EINVAL;
-	if (turn_on && !capable(CAP_SYS_RAWIO))
-		return -EPERM;
-
-	/*
-	 * If it's the first ioperm() call in this thread's lifetime, set the
-	 * IO bitmap up. ioperm() is much less timing critical than clone(),
-	 * this is why we delay this operation until now:
-	 */
-	if (!t->io_bitmap_ptr) {
-		bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
-		if (!bitmap)
-			return -ENOMEM;
-
-		memset(bitmap, 0xff, IO_BITMAP_BYTES);
-		t->io_bitmap_ptr = bitmap;
-		set_thread_flag(TIF_IO_BITMAP);
-
-		set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
-		set_iobitmap.nr_ports = IO_BITMAP_BITS;
-		WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
-					      &set_iobitmap));
-	}
-
-	set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
-
-	return 0;
-}
-
-/*
- * sys_iopl has to be used when you want to access the IO ports
- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
- * you'd need 8kB of bitmaps/process, which is a bit excessive.
- *
- * Here we just change the eflags value on the stack: we allow
- * only the super-user to do it. This depends on the stack-layout
- * on system-call entry - see also fork() and the signal handling
- * code.
- */
-
-asmlinkage long sys_iopl(unsigned long unused)
-{
-	volatile struct pt_regs * regs = (struct pt_regs *) &unused;
-	unsigned int level = regs->ebx;
-	struct thread_struct *t = &current->thread;
-	unsigned int old = (t->iopl >> 12) & 3;
-
-	if (level > 3)
-		return -EINVAL;
-	/* Trying to gain more privileges? */
-	if (level > old) {
-		if (!capable(CAP_SYS_RAWIO))
-			return -EPERM;
-	}
-	t->iopl = level << 12;
-	set_iopl_mask(t->iopl);
-	return 0;
-}
--- head-2010-04-29.orig/arch/x86/kernel/ioport_64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,99 +0,0 @@
-/*
- * This contains the io-permission bitmap code - written by obz, with changes
- * by Linus.
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/ioport.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/thread_info.h>
-#include <linux/syscalls.h>
-#include <xen/interface/physdev.h>
-
-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
-static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
-{
-	int i;
-
-	if (new_value)
-		for (i = base; i < base + extent; i++)
-			__set_bit(i, bitmap);
-	else
-		for (i = base; i < base + extent; i++)
-			clear_bit(i, bitmap);
-}
-
-/*
- * this changes the io permissions bitmap in the current task.
- */
-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
-{
-	struct thread_struct * t = &current->thread;
-	unsigned long *bitmap;
-	struct physdev_set_iobitmap set_iobitmap;
-
-	if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
-		return -EINVAL;
-	if (turn_on && !capable(CAP_SYS_RAWIO))
-		return -EPERM;
-
-	/*
-	 * If it's the first ioperm() call in this thread's lifetime, set the
-	 * IO bitmap up. ioperm() is much less timing critical than clone(),
-	 * this is why we delay this operation until now:
-	 */
-	if (!t->io_bitmap_ptr) {
-		bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
-		if (!bitmap)
-			return -ENOMEM;
-
-		memset(bitmap, 0xff, IO_BITMAP_BYTES);
-		t->io_bitmap_ptr = bitmap;
-		set_thread_flag(TIF_IO_BITMAP);
-
-		set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
-		set_iobitmap.nr_ports = IO_BITMAP_BITS;
-		WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
-					      &set_iobitmap));
-	}
-
-	set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
-
-	return 0;
-}
-
-/*
- * sys_iopl has to be used when you want to access the IO ports
- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
- * you'd need 8kB of bitmaps/process, which is a bit excessive.
- *
- */
-
-asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
-{
-	unsigned int old_iopl = current->thread.iopl;
-	struct physdev_set_iopl set_iopl;
-
-	if (new_iopl > 3)
-		return -EINVAL;
-
-	/* Need "raw I/O" privileges for direct port access. */
-	if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
-		return -EPERM;
-
-	/* Change our version of the privilege levels. */
-	current->thread.iopl = new_iopl;
-
-	/* Force the change at ring 0. */
-	set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
-	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
-
-	return 0;
-}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2010-04-29/arch/x86/kernel/ldt-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -0,0 +1,272 @@
+/*
+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2002 Andi Kleen
+ *
+ * This handles calls from both 32bit and 64bit mode.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/vmalloc.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+
+#ifdef CONFIG_SMP /* smp_call_function() callback: reload active mm's LDT */
+static void flush_ldt(void *null)
+{
+	if (current->active_mm)
+		load_LDT(&current->active_mm->context);
+}
+#endif /* CONFIG_SMP */
+
+static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+{
+	void *oldldt, *newldt;
+	int oldsize;
+	/* Grow-only: a table that already holds mincount entries is kept. */
+	if (mincount <= pc->size)
+		return 0;
+	oldsize = pc->size;
+	mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
+			(~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
+	if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
+		newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+	else
+		newldt = (void *)__get_free_page(GFP_KERNEL);
+
+	if (!newldt)
+		return -ENOMEM;
+	/* Carry over the old entries and zero the newly added tail. */
+	if (oldsize)
+		memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
+	oldldt = pc->ldt;
+	memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
+	       (mincount - oldsize) * LDT_ENTRY_SIZE);
+
+#ifdef CONFIG_X86_64
+	/* CHECKME: Do we really need this ? */
+	wmb();
+#endif
+	pc->ldt = newldt;
+	wmb();
+	pc->size = mincount;
+	wmb();
+	/* When reloading, make the new pages read-only before load_LDT(). */
+	if (reload) {
+#ifdef CONFIG_SMP
+		cpumask_t mask;
+
+		preempt_disable();
+#endif
+		make_pages_readonly(newldt,
+				    (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
+				    XENFEAT_writable_descriptor_tables);
+		load_LDT(pc);
+#ifdef CONFIG_SMP
+		mask = cpumask_of_cpu(smp_processor_id());
+		if (!cpus_equal(current->mm->cpu_vm_mask, mask))
+			smp_call_function(flush_ldt, NULL, 1, 1);
+		preempt_enable();
+#endif
+	}
+	if (oldsize) {	/* release the table that was just replaced */
+		make_pages_writable(oldldt,
+				    (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
+				    XENFEAT_writable_descriptor_tables);
+		if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
+			vfree(oldldt);
+		else
+			put_page(virt_to_page(oldldt));
+	}
+	return 0;
+}
+
+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+{
+	/* Size new's table for old's entries; reload argument 0: no reload. */
+	int rc = alloc_ldt(new, old->size, 0);
+
+	if (rc < 0)
+		return rc;
+	memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
+	make_pages_readonly(new->ldt, new->size * LDT_ENTRY_SIZE / PAGE_SIZE,
+			    XENFEAT_writable_descriptor_tables);
+	return 0;
+}
+
+/*
+ * Start the new mm from a zeroed context, then inherit the vdso and any LDT
+ * of current->mm.  Descriptors are loaded later, in switch_mm(), as needed.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+	struct mm_struct *cur_mm = current->mm;
+	int rc;
+
+	memset(&mm->context, 0, sizeof(mm->context));
+	mutex_init(&mm->context.lock);
+	if (!cur_mm)
+		return 0;
+	mm->context.vdso = cur_mm->context.vdso;
+	if (cur_mm->context.size <= 0)
+		return 0;
+	mutex_lock(&cur_mm->context.lock);
+	rc = copy_ldt(&mm->context, &cur_mm->context);
+	mutex_unlock(&cur_mm->context.lock);
+	return rc;
+}
+
+/*
+ * Free the mm's LDT, if any.  No need to lock the MM as we are the last user.
+ * 64bit: Don't touch the LDT register - we're already in the next thread.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+	mm_context_t *ctx = &mm->context;
+
+	if (!ctx->size)
+		return;
+	/* CHECKME: Can this ever happen ? */
+	if (mm == current->active_mm)
+		clear_LDT();
+	make_pages_writable(ctx->ldt, (ctx->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+			    XENFEAT_writable_descriptor_tables);
+	if (ctx->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+		vfree(ctx->ldt);
+	else
+		put_page(virt_to_page(ctx->ldt));
+	ctx->size = 0;
+}
+
+/*
+ * Copy up to bytecount bytes of the current mm's LDT to user space.
+ * Returns the (clamped) byte count, 0 if there is no LDT, or -EFAULT.
+ */
+static int read_ldt(void __user *ptr, unsigned long bytecount)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long copied, failed;
+
+	if (!mm->context.size)
+		return 0;
+
+	/* Never hand back more than a full-sized LDT. */
+	if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
+		bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
+
+	/* Copy out as much of the table as exists, under the context lock. */
+	mutex_lock(&mm->context.lock);
+	copied = mm->context.size * LDT_ENTRY_SIZE;
+	if (copied > bytecount)
+		copied = bytecount;
+	failed = copy_to_user(ptr, mm->context.ldt, copied);
+	mutex_unlock(&mm->context.lock);
+	if (failed)
+		return -EFAULT;
+
+	/* Zero-fill whatever was asked for beyond the end of the table. */
+	if (copied != bytecount &&
+	    clear_user(ptr + copied, bytecount - copied) != 0)
+		return -EFAULT;
+
+	return bytecount;
+}
+
+static int read_default_ldt(void __user *ptr, unsigned long bytecount)
+{
+	/* Zeroes user memory. CHECKME: Can we use _one_ random number ? */
+#ifdef CONFIG_X86_32
+	unsigned long limit = 5 * sizeof(struct desc_struct);
+#else
+	unsigned long limit = 128;
+#endif
+	if (limit > bytecount)
+		limit = bytecount;
+	if (clear_user(ptr, limit))
+		return -EFAULT;
+	return limit;
+}
+
+static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
+{
+	struct mm_struct *mm = current->mm;
+	struct desc_struct ldt;
+	int error;
+	struct user_desc ldt_info;
+	/* Installs the single LDT entry described by the user_desc at ptr. */
+	error = -EINVAL;
+	if (bytecount != sizeof(ldt_info))
+		goto out;
+	error = -EFAULT;
+	if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
+		goto out;
+	/* Sanity-check the requested entry before taking the lock. */
+	error = -EINVAL;
+	if (ldt_info.entry_number >= LDT_ENTRIES)
+		goto out;
+	if (ldt_info.contents == 3) {
+		if (oldmode)
+			goto out;
+		if (ldt_info.seg_not_present == 0)
+			goto out;
+	}
+	/* Grow and reload the LDT if the entry lies beyond its end. */
+	mutex_lock(&mm->context.lock);
+	if (ldt_info.entry_number >= mm->context.size) {
+		error = alloc_ldt(&current->mm->context,
+				  ldt_info.entry_number + 1, 1);
+		if (error < 0)
+			goto out_unlock;
+	}
+
+	/* Allow LDTs to be cleared by the user. */
+	if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+		if (oldmode || LDT_empty(&ldt_info)) {
+			memset(&ldt, 0, sizeof(ldt));
+			goto install;
+		}
+	}
+
+	fill_ldt(&ldt, &ldt_info);
+	if (oldmode)
+		ldt.avl = 0;
+
+	/* Install the new entry ...  */
+install:
+	error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
+
+out_unlock:
+	mutex_unlock(&mm->context.lock);
+out:
+	return error;
+}
+
+asmlinkage int sys_modify_ldt(int func, void __user *ptr,
+			      unsigned long bytecount)
+{
+	/*
+	 * func selects the operation: 0 reads the LDT, 2 reads the default
+	 * LDT, 1 and 0x11 write one entry with oldmode set resp. clear.
+	 * Any other value yields -ENOSYS.
+	 */
+	switch (func) {
+	case 0:
+		return read_ldt(ptr, bytecount);
+	case 1:
+		return write_ldt(ptr, bytecount, 1);
+	case 2:
+		return read_default_ldt(ptr, bytecount);
+	case 0x11:
+		return write_ldt(ptr, bytecount, 0);
+	default:
+		return -ENOSYS;
+	}
+}
--- head-2010-04-29.orig/arch/x86/kernel/ldt_32-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,265 +0,0 @@
-/*
- * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/ldt.h>
-#include <asm/desc.h>
-#include <asm/mmu_context.h>
-
-#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
-static void flush_ldt(void *null)
-{
-	if (current->active_mm)
-		load_LDT(&current->active_mm->context);
-}
-#endif
-
-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
-{
-	void *oldldt;
-	void *newldt;
-	int oldsize;
-
-	if (mincount <= pc->size)
-		return 0;
-	oldsize = pc->size;
-	mincount = (mincount+511)&(~511);
-	if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
-		newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
-	else
-		newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
-
-	if (!newldt)
-		return -ENOMEM;
-
-	if (oldsize)
-		memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
-	oldldt = pc->ldt;
-	memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
-	pc->ldt = newldt;
-	wmb();
-	pc->size = mincount;
-	wmb();
-
-	if (reload) {
-#ifdef CONFIG_SMP
-		cpumask_t mask;
-		preempt_disable();
-#endif
-		make_pages_readonly(
-			pc->ldt,
-			(pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
-			XENFEAT_writable_descriptor_tables);
-		load_LDT(pc);
-#ifdef CONFIG_SMP
-		mask = cpumask_of_cpu(smp_processor_id());
-		if (!cpus_equal(current->mm->cpu_vm_mask, mask))
-			smp_call_function(flush_ldt, NULL, 1, 1);
-		preempt_enable();
-#endif
-	}
-	if (oldsize) {
-		make_pages_writable(
-			oldldt,
-			(oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
-			XENFEAT_writable_descriptor_tables);
-		if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
-			vfree(oldldt);
-		else
-			kfree(oldldt);
-	}
-	return 0;
-}
-
-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
-{
-	int err = alloc_ldt(new, old->size, 0);
-	if (err < 0)
-		return err;
-	memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
-	make_pages_readonly(
-		new->ldt,
-		(new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
-		XENFEAT_writable_descriptor_tables);
-	return 0;
-}
-
-/*
- * we do not have to muck with descriptors here, that is
- * done in switch_mm() as needed.
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-	struct mm_struct * old_mm;
-	int retval = 0;
-
-	mutex_init(&mm->context.lock);
-	mm->context.size = 0;
-	mm->context.has_foreign_mappings = 0;
-	old_mm = current->mm;
-	if (old_mm && old_mm->context.size > 0) {
-		mutex_lock(&old_mm->context.lock);
-		retval = copy_ldt(&mm->context, &old_mm->context);
-		mutex_unlock(&old_mm->context.lock);
-	}
-	return retval;
-}
-
-/*
- * No need to lock the MM as we are the last user
- */
-void destroy_context(struct mm_struct *mm)
-{
-	if (mm->context.size) {
-		if (mm == current->active_mm)
-			clear_LDT();
-		make_pages_writable(
-			mm->context.ldt,
-			(mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
-			XENFEAT_writable_descriptor_tables);
-		if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
-			vfree(mm->context.ldt);
-		else
-			kfree(mm->context.ldt);
-		mm->context.size = 0;
-	}
-}
-
-static int read_ldt(void __user * ptr, unsigned long bytecount)
-{
-	int err;
-	unsigned long size;
-	struct mm_struct * mm = current->mm;
-
-	if (!mm->context.size)
-		return 0;
-	if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
-		bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
-
-	mutex_lock(&mm->context.lock);
-	size = mm->context.size*LDT_ENTRY_SIZE;
-	if (size > bytecount)
-		size = bytecount;
-
-	err = 0;
-	if (copy_to_user(ptr, mm->context.ldt, size))
-		err = -EFAULT;
-	mutex_unlock(&mm->context.lock);
-	if (err < 0)
-		goto error_return;
-	if (size != bytecount) {
-		/* zero-fill the rest */
-		if (clear_user(ptr+size, bytecount-size) != 0) {
-			err = -EFAULT;
-			goto error_return;
-		}
-	}
-	return bytecount;
-error_return:
-	return err;
-}
-
-static int read_default_ldt(void __user * ptr, unsigned long bytecount)
-{
-	int err;
-	unsigned long size;
-
-	err = 0;
-	size = 5*sizeof(struct desc_struct);
-	if (size > bytecount)
-		size = bytecount;
-
-	err = size;
-	if (clear_user(ptr, size))
-		err = -EFAULT;
-
-	return err;
-}
-
-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
-{
-	struct mm_struct * mm = current->mm;
-	__u32 entry_1, entry_2;
-	int error;
-	struct user_desc ldt_info;
-
-	error = -EINVAL;
-	if (bytecount != sizeof(ldt_info))
-		goto out;
-	error = -EFAULT;
-	if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
-		goto out;
-
-	error = -EINVAL;
-	if (ldt_info.entry_number >= LDT_ENTRIES)
-		goto out;
-	if (ldt_info.contents == 3) {
-		if (oldmode)
-			goto out;
-		if (ldt_info.seg_not_present == 0)
-			goto out;
-	}
-
-	mutex_lock(&mm->context.lock);
-	if (ldt_info.entry_number >= mm->context.size) {
-		error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
-		if (error < 0)
-			goto out_unlock;
-	}
-
-   	/* Allow LDTs to be cleared by the user. */
-   	if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
-		if (oldmode || LDT_empty(&ldt_info)) {
-			entry_1 = 0;
-			entry_2 = 0;
-			goto install;
-		}
-	}
-
-	entry_1 = LDT_entry_a(&ldt_info);
-	entry_2 = LDT_entry_b(&ldt_info);
-	if (oldmode)
-		entry_2 &= ~(1 << 20);
-
-	/* Install the new entry ...  */
-install:
-	error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
-				entry_1, entry_2);
-
-out_unlock:
-	mutex_unlock(&mm->context.lock);
-out:
-	return error;
-}
-
-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
-{
-	int ret = -ENOSYS;
-
-	switch (func) {
-	case 0:
-		ret = read_ldt(ptr, bytecount);
-		break;
-	case 1:
-		ret = write_ldt(ptr, bytecount, 1);
-		break;
-	case 2:
-		ret = read_default_ldt(ptr, bytecount);
-		break;
-	case 0x11:
-		ret = write_ldt(ptr, bytecount, 0);
-		break;
-	}
-	return ret;
-}
--- head-2010-04-29.orig/arch/x86/kernel/ldt_64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,271 +0,0 @@
-/*
- * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2002 Andi Kleen
- *
- * This handles calls from both 32bit and 64bit mode.
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/ldt.h>
-#include <asm/desc.h>
-#include <asm/proto.h>
-#include <asm/pgalloc.h>
-
-#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
-static void flush_ldt(void *null)
-{
-	if (current->active_mm)
-               load_LDT(&current->active_mm->context);
-}
-#endif
-
-static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
-{
-	void *oldldt;
-	void *newldt;
-	unsigned oldsize;
-
-	if (mincount <= (unsigned)pc->size)
-		return 0;
-	oldsize = pc->size;
-	mincount = (mincount+511)&(~511);
-	if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
-		newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
-	else
-		newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
-
-	if (!newldt)
-		return -ENOMEM;
-
-	if (oldsize)
-		memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
-	oldldt = pc->ldt;
-	memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
-	wmb();
-	pc->ldt = newldt;
-	wmb();
-	pc->size = mincount;
-	wmb();
-	if (reload) {
-#ifdef CONFIG_SMP
-		cpumask_t mask;
-
-		preempt_disable();
-#endif
-		make_pages_readonly(
-			pc->ldt,
-			(pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
-			XENFEAT_writable_descriptor_tables);
-		load_LDT(pc);
-#ifdef CONFIG_SMP
-		mask = cpumask_of_cpu(smp_processor_id());
-		if (!cpus_equal(current->mm->cpu_vm_mask, mask))
-			smp_call_function(flush_ldt, NULL, 1, 1);
-		preempt_enable();
-#endif
-	}
-	if (oldsize) {
-		make_pages_writable(
-			oldldt,
-			(oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
-			XENFEAT_writable_descriptor_tables);
-		if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
-			vfree(oldldt);
-		else
-			kfree(oldldt);
-	}
-	return 0;
-}
-
-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
-{
-	int err = alloc_ldt(new, old->size, 0);
-	if (err < 0)
-		return err;
-	memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
-	make_pages_readonly(
-		new->ldt,
-		(new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
-		XENFEAT_writable_descriptor_tables);
-	return 0;
-}
-
-/*
- * we do not have to muck with descriptors here, that is
- * done in switch_mm() as needed.
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-	struct mm_struct * old_mm;
-	int retval = 0;
-
-	memset(&mm->context, 0, sizeof(mm->context));
-	mutex_init(&mm->context.lock);
-	old_mm = current->mm;
-	if (old_mm)
-		mm->context.vdso = old_mm->context.vdso;
-	if (old_mm && old_mm->context.size > 0) {
-		mutex_lock(&old_mm->context.lock);
-		retval = copy_ldt(&mm->context, &old_mm->context);
-		mutex_unlock(&old_mm->context.lock);
-	}
-	return retval;
-}
-
-/*
- *
- * Don't touch the LDT register - we're already in the next thread.
- */
-void destroy_context(struct mm_struct *mm)
-{
-	if (mm->context.size) {
-		if (mm == current->active_mm)
-			clear_LDT();
-		make_pages_writable(
-			mm->context.ldt,
-			(mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
-			XENFEAT_writable_descriptor_tables);
-		if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
-			vfree(mm->context.ldt);
-		else
-			kfree(mm->context.ldt);
-		mm->context.size = 0;
-	}
-}
-
-static int read_ldt(void __user * ptr, unsigned long bytecount)
-{
-	int err;
-	unsigned long size;
-	struct mm_struct * mm = current->mm;
-
-	if (!mm->context.size)
-		return 0;
-	if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
-		bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
-
-	mutex_lock(&mm->context.lock);
-	size = mm->context.size*LDT_ENTRY_SIZE;
-	if (size > bytecount)
-		size = bytecount;
-
-	err = 0;
-	if (copy_to_user(ptr, mm->context.ldt, size))
-		err = -EFAULT;
-	mutex_unlock(&mm->context.lock);
-	if (err < 0)
-		goto error_return;
-	if (size != bytecount) {
-		/* zero-fill the rest */
-		if (clear_user(ptr+size, bytecount-size) != 0) {
-			err = -EFAULT;
-			goto error_return;
-		}
-	}
-	return bytecount;
-error_return:
-	return err;
-}
-
-static int read_default_ldt(void __user * ptr, unsigned long bytecount)
-{
-	/* Arbitrary number */
-	/* x86-64 default LDT is all zeros */
-	if (bytecount > 128)
-		bytecount = 128;
-	if (clear_user(ptr, bytecount))
-		return -EFAULT;
-	return bytecount;
-}
-
-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
-{
-	struct task_struct *me = current;
-	struct mm_struct * mm = me->mm;
-	__u32 entry_1, entry_2, *lp;
-	unsigned long mach_lp;
-	int error;
-	struct user_desc ldt_info;
-
-	error = -EINVAL;
-
-	if (bytecount != sizeof(ldt_info))
-		goto out;
-	error = -EFAULT;
-	if (copy_from_user(&ldt_info, ptr, bytecount))
-		goto out;
-
-	error = -EINVAL;
-	if (ldt_info.entry_number >= LDT_ENTRIES)
-		goto out;
-	if (ldt_info.contents == 3) {
-		if (oldmode)
-			goto out;
-		if (ldt_info.seg_not_present == 0)
-			goto out;
-	}
-
-	mutex_lock(&mm->context.lock);
-	if (ldt_info.entry_number >= (unsigned)mm->context.size) {
-		error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
-		if (error < 0)
-			goto out_unlock;
-	}
-
-	lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
- 	mach_lp = arbitrary_virt_to_machine(lp);
-
-   	/* Allow LDTs to be cleared by the user. */
-   	if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
-		if (oldmode || LDT_empty(&ldt_info)) {
-			entry_1 = 0;
-			entry_2 = 0;
-			goto install;
-		}
-	}
-
-	entry_1 = LDT_entry_a(&ldt_info);
-	entry_2 = LDT_entry_b(&ldt_info);
-	if (oldmode)
-		entry_2 &= ~(1 << 20);
-
-	/* Install the new entry ...  */
-install:
-	error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
-
-out_unlock:
-	mutex_unlock(&mm->context.lock);
-out:
-	return error;
-}
-
-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
-{
-	int ret = -ENOSYS;
-
-	switch (func) {
-	case 0:
-		ret = read_ldt(ptr, bytecount);
-		break;
-	case 1:
-		ret = write_ldt(ptr, bytecount, 1);
-		break;
-	case 2:
-		ret = read_default_ldt(ptr, bytecount);
-		break;
-	case 0x11:
-		ret = write_ldt(ptr, bytecount, 0);
-		break;
-	}
-	return ret;
-}
--- head-2010-04-29.orig/arch/x86/kernel/machine_kexec_64.c	2010-04-15 09:44:51.000000000 +0200
+++ head-2010-04-29/arch/x86/kernel/machine_kexec_64.c	2010-04-15 09:56:14.000000000 +0200
@@ -407,7 +407,9 @@ void machine_kexec(struct kimage *image)

 void arch_crash_save_vmcoreinfo(void)
 {
+#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */
 	VMCOREINFO_SYMBOL(phys_base);
+#endif
 	VMCOREINFO_SYMBOL(init_level4_pgt);

 #ifdef CONFIG_NUMA
--- head-2010-04-29.orig/arch/x86/kernel/microcode-xen.c	2010-03-24 15:09:22.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/microcode-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -167,7 +167,7 @@ static int request_microcode(void)
 	}

 	op.cmd = XENPF_microcode_update;
-	set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data);
+	set_xen_guest_handle(op.u.microcode.data, firmware->data);
 	op.u.microcode.length = firmware->size;
 	error = HYPERVISOR_platform_op(&op);

--- head-2010-04-29.orig/arch/x86/kernel/mpparse_32-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/mpparse_32-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0;
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_physical_apicid = -1U;
 /* Internal processor count */
-unsigned int __cpuinitdata num_processors;
+unsigned int num_processors;

 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map;
@@ -265,7 +265,7 @@ static void __init MP_ioapic_info (struc
 	if (!(m->mpc_flags & MPC_APIC_USABLE))
 		return;

-	printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
+	printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
 		m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
 	if (nr_ioapics >= MAX_IO_APICS) {
 		printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
@@ -412,9 +412,9 @@ static int __init smp_read_mpc(struct mp

 	mps_oem_check(mpc, oem, str);

-	printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
+	printk("APIC at: 0x%X\n", mpc->mpc_lapic);

-	/*
+	/*
 	 * Save the local APIC address (it might be non-default) -- but only
 	 * if we're not using ACPI.
 	 */
@@ -728,7 +728,7 @@ static int __init smp_scan_config (unsig
 	unsigned long *bp = isa_bus_to_virt(base);
 	struct intel_mp_floating *mpf;

-	Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
+	printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
 	if (sizeof(*mpf) != 16)
 		printk("Error: MPF size\n");

@@ -742,9 +742,10 @@ static int __init smp_scan_config (unsig

 			smp_found_config = 1;
 #ifndef CONFIG_XEN
-			printk(KERN_INFO "found SMP MP-table at %08lx\n",
-						virt_to_phys(mpf));
-			reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
+			printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
+				mpf, virt_to_phys(mpf));
+			reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
+					BOOTMEM_DEFAULT);
 			if (mpf->mpf_physptr) {
 				/*
 				 * We cannot access to MPC table to compute
@@ -759,11 +760,12 @@ static int __init smp_scan_config (unsig
 				unsigned long end = max_low_pfn * PAGE_SIZE;
 				if (mpf->mpf_physptr + size > end)
 					size = end - mpf->mpf_physptr;
-				reserve_bootmem(mpf->mpf_physptr, size);
+				reserve_bootmem(mpf->mpf_physptr, size,
+						BOOTMEM_DEFAULT);
 			}
 #else
-			printk(KERN_INFO "found SMP MP-table at %08lx\n",
-				((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
+			printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
+				mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
 #endif

 			mpf_found = mpf;
@@ -940,14 +942,14 @@ void __init mp_register_ioapic(u8 id, u3
 	 */
 	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
 	mp_ioapic_routing[idx].gsi_base = gsi_base;
-	mp_ioapic_routing[idx].gsi_end = gsi_base +
+	mp_ioapic_routing[idx].gsi_end = gsi_base +
 		io_apic_get_redir_entries(idx);

-	printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
-		"GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
-		mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
-		mp_ioapic_routing[idx].gsi_base,
-		mp_ioapic_routing[idx].gsi_end);
+	printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+	       "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
+	       mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+	       mp_ioapic_routing[idx].gsi_base,
+	       mp_ioapic_routing[idx].gsi_end);
 }

 void __init
@@ -1063,15 +1065,16 @@ void __init mp_config_acpi_legacy_irqs (
 }

 #define MAX_GSI_NUM	4096
+#define IRQ_COMPRESSION_START	64

 int mp_register_gsi(u32 gsi, int triggering, int polarity)
 {
 	int ioapic = -1;
 	int ioapic_pin = 0;
 	int idx, bit = 0;
-	static int pci_irq = 16;
+	static int pci_irq = IRQ_COMPRESSION_START;
 	/*
-	 * Mapping between Global System Interrups, which
+	 * Mapping between Global System Interrupts, which
 	 * represent all possible interrupts, and IRQs
 	 * assigned to actual devices.
 	 */
@@ -1108,12 +1111,16 @@ int mp_register_gsi(u32 gsi, int trigger
 	if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
 		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
 			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-		return gsi_to_irq[gsi];
+		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
 	}

 	mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);

-	if (triggering == ACPI_LEVEL_SENSITIVE) {
+	/*
+	 * For GSI >= 64, use IRQ compression
+	 */
+	if ((gsi >= IRQ_COMPRESSION_START)
+		&& (triggering == ACPI_LEVEL_SENSITIVE)) {
 		/*
 		 * For PCI devices assign IRQs in order, avoiding gaps
 		 * due to unused I/O APIC pins.
--- head-2010-04-29.orig/arch/x86/kernel/mpparse_64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/mpparse_64-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -60,14 +60,20 @@ unsigned int boot_cpu_id = -1U;
 EXPORT_SYMBOL(boot_cpu_id);

 /* Internal processor count */
-unsigned int num_processors __cpuinitdata = 0;
+unsigned int num_processors;

 unsigned disabled_cpus __cpuinitdata;

 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;

-u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+#ifndef CONFIG_XEN
+u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
+				= { [0 ... NR_CPUS-1] = BAD_APICID };
+void *x86_bios_cpu_apicid_early_ptr;
+#endif
+DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
+EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);


 /*
@@ -119,24 +125,22 @@ static void __cpuinit MP_processor_info(
 	physid_set(m->mpc_apicid, phys_cpu_present_map);
  	if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
  		/*
- 		 * bios_cpu_apicid is required to have processors listed
+		 * x86_bios_cpu_apicid is required to have processors listed
  		 * in same order as logical cpu numbers. Hence the first
  		 * entry is BSP, and so on.
  		 */
 		cpu = 0;
  	}
-	bios_cpu_apicid[cpu] = m->mpc_apicid;
-	/*
-	 * We get called early in the the start_kernel initialization
-	 * process when the per_cpu data area is not yet setup, so we
-	 * use a static array that is removed after the per_cpu data
-	 * area is created.
-	 */
-	if (x86_cpu_to_apicid_ptr) {
-		u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
-		x86_cpu_to_apicid[cpu] = m->mpc_apicid;
+	/* are we being called early in kernel startup? */
+	if (x86_cpu_to_apicid_early_ptr) {
+		u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
+		u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+
+		cpu_to_apicid[cpu] = m->mpc_apicid;
+		bios_cpu_apicid[cpu] = m->mpc_apicid;
 	} else {
 		per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
+		per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
 	}

 	cpu_set(cpu, cpu_possible_map);
--- head-2010-04-29.orig/arch/x86/kernel/pci-dma-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/pci-dma-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -431,3 +431,31 @@ dma_sync_single_for_device(struct device
 		swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
 }
 EXPORT_SYMBOL(dma_sync_single_for_device);
+
+/*
+ * dma_sync_sg_for_cpu - hand the scatterlist sync to swiotlb when swiotlb
+ * is set, then flush the write buffers in either case.
+ */
+void
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+		    enum dma_data_direction direction)
+{
+	if (swiotlb)
+		swiotlb_sync_sg_for_cpu(dev, sg, nelems, direction);
+	flush_write_buffers();
+}
+EXPORT_SYMBOL(dma_sync_sg_for_cpu);
+
+/*
+ * dma_sync_sg_for_device - hand the scatterlist sync to swiotlb when swiotlb
+ * is set, then flush the write buffers in either case.
+ */
+void
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
+		       enum dma_data_direction direction)
+{
+	if (swiotlb)
+		swiotlb_sync_sg_for_device(dev, sg, nelems, direction);
+	flush_write_buffers();
+}
+EXPORT_SYMBOL(dma_sync_sg_for_device);
--- head-2010-04-29.orig/arch/x86/kernel/process_32-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/process_32-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -23,7 +23,6 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/user.h>
-#include <linux/a.out.h>
 #include <linux/interrupt.h>
 #include <linux/utsname.h>
 #include <linux/delay.h>
@@ -59,8 +58,10 @@

 #include <asm/tlbflush.h>
 #include <asm/cpu.h>
+#include <asm/kdebug.h>

 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");

 static int hlt_counter;

@@ -78,7 +79,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
  */
 unsigned long thread_saved_pc(struct task_struct *tsk)
 {
-	return ((unsigned long *)tsk->thread.esp)[3];
+	return ((unsigned long *)tsk->thread.sp)[3];
 }

 /*
@@ -86,7 +87,6 @@ unsigned long thread_saved_pc(struct tas
  */
 void (*pm_idle)(void);
 EXPORT_SYMBOL(pm_idle);
-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

 void disable_hlt(void)
 {
@@ -107,7 +107,7 @@ EXPORT_SYMBOL(enable_hlt);
  * to poll the ->work.need_resched flag instead of waiting for the
  * cross-CPU IPI to arrive. Use this option with caution.
  */
-static void poll_idle (void)
+static void poll_idle(void)
 {
 	cpu_relax();
 }
@@ -122,10 +122,19 @@ static void xen_idle(void)
 	smp_mb();

 	local_irq_disable();
-	if (!need_resched())
+	if (!need_resched()) {
+		ktime_t t0, t1;
+		u64 t0n, t1n;
+
+		t0 = ktime_get();
+		t0n = ktime_to_ns(t0);
 		safe_halt();	/* enables interrupts racelessly */
-	else
-		local_irq_enable();
+		local_irq_disable();
+		t1 = ktime_get();
+		t1n = ktime_to_ns(t1);
+		sched_clock_idle_wakeup_event(t1n - t0n);
+	}
+	local_irq_enable();
 	current_thread_info()->status |= TS_POLLING;
 }
 #ifdef CONFIG_APM_MODULE
@@ -168,13 +177,13 @@ void cpu_idle(void)
 		while (!need_resched()) {
 			void (*idle)(void);

-			if (__get_cpu_var(cpu_idle_state))
-				__get_cpu_var(cpu_idle_state) = 0;
-
 			check_pgt_cache();
 			rmb();
 			idle = xen_idle; /* no alternatives */

+			if (rcu_pending(cpu))
+				rcu_check_callbacks(cpu, 0);
+
 			if (cpu_is_offline(cpu))
 				play_dead();

@@ -192,40 +201,19 @@ static void do_nothing(void *unused)
 {
 }

+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
+ * handler on SMP systems.
+ *
+ * Caller must have changed pm_idle to the new value before the call. Old
+ * pm_idle value will not be used by any CPU after the return of this function.
+ */
 void cpu_idle_wait(void)
 {
-	unsigned int cpu, this_cpu = get_cpu();
-	cpumask_t map, tmp = current->cpus_allowed;
-
-	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
-	put_cpu();
-
-	cpus_clear(map);
-	for_each_online_cpu(cpu) {
-		per_cpu(cpu_idle_state, cpu) = 1;
-		cpu_set(cpu, map);
-	}
-
-	__get_cpu_var(cpu_idle_state) = 0;
-
-	wmb();
-	do {
-		ssleep(1);
-		for_each_online_cpu(cpu) {
-			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
-				cpu_clear(cpu, map);
-		}
-		cpus_and(map, map, cpu_online_map);
-		/*
-		 * We waited 1 sec, if a CPU still did not call idle
-		 * it may be because it is in idle and not waking up
-		 * because it has nothing to do.
-		 * Give all the remaining CPUS a kick.
-		 */
-		smp_call_function_mask(map, do_nothing, 0, 0);
-	} while (!cpus_empty(map));
-
-	set_cpus_allowed(current, tmp);
+	smp_mb();
+	/* kick all the CPUs so that they exit out of pm_idle */
+	smp_call_function(do_nothing, NULL, 0, 1);
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);

@@ -251,15 +239,15 @@ void __show_registers(struct pt_regs *re
 {
 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
 	unsigned long d0, d1, d2, d3, d6, d7;
-	unsigned long esp;
+	unsigned long sp;
 	unsigned short ss, gs;

 	if (user_mode_vm(regs)) {
-		esp = regs->esp;
-		ss = regs->xss & 0xffff;
+		sp = regs->sp;
+		ss = regs->ss & 0xffff;
 		savesegment(gs, gs);
 	} else {
-		esp = (unsigned long) (&regs->esp);
+		sp = (unsigned long) (&regs->sp);
 		savesegment(ss, ss);
 		savesegment(gs, gs);
 	}
@@ -272,17 +260,17 @@ void __show_registers(struct pt_regs *re
 			init_utsname()->version);

 	printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
-			0xffff & regs->xcs, regs->eip, regs->eflags,
+			0xffff & regs->cs, regs->ip, regs->flags,
 			smp_processor_id());
-	print_symbol("EIP is at %s\n", regs->eip);
+	print_symbol("EIP is at %s\n", regs->ip);

 	printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
-		regs->eax, regs->ebx, regs->ecx, regs->edx);
+		regs->ax, regs->bx, regs->cx, regs->dx);
 	printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
-		regs->esi, regs->edi, regs->ebp, esp);
+		regs->si, regs->di, regs->bp, sp);
 	printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
-	       regs->xds & 0xffff, regs->xes & 0xffff,
-	       regs->xfs & 0xffff, gs, ss);
+	       regs->ds & 0xffff, regs->es & 0xffff,
+	       regs->fs & 0xffff, gs, ss);

 	if (!all)
 		return;
@@ -310,12 +298,12 @@ void __show_registers(struct pt_regs *re
 void show_regs(struct pt_regs *regs)
 {
 	__show_registers(regs, 1);
-	show_trace(NULL, regs, &regs->esp);
+	show_trace(NULL, regs, &regs->sp, regs->bp);
 }

 /*
- * This gets run with %ebx containing the
- * function to call, and %edx containing
+ * This gets run with %bx containing the
+ * function to call, and %dx containing
  * the "args".
  */
 extern void kernel_thread_helper(void);
@@ -329,16 +317,16 @@ int kernel_thread(int (*fn)(void *), voi

 	memset(&regs, 0, sizeof(regs));

-	regs.ebx = (unsigned long) fn;
-	regs.edx = (unsigned long) arg;
+	regs.bx = (unsigned long) fn;
+	regs.dx = (unsigned long) arg;

-	regs.xds = __USER_DS;
-	regs.xes = __USER_DS;
-	regs.xfs = __KERNEL_PERCPU;
-	regs.orig_eax = -1;
-	regs.eip = (unsigned long) kernel_thread_helper;
-	regs.xcs = __KERNEL_CS | get_kernel_rpl();
-	regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
+	regs.ds = __USER_DS;
+	regs.es = __USER_DS;
+	regs.fs = __KERNEL_PERCPU;
+	regs.orig_ax = -1;
+	regs.ip = (unsigned long) kernel_thread_helper;
+	regs.cs = __KERNEL_CS | get_kernel_rpl();
+	regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;

 	/* Ok, create the new process.. */
 	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
@@ -368,7 +356,12 @@ void flush_thread(void)
 {
 	struct task_struct *tsk = current;

-	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
+	tsk->thread.debugreg0 = 0;
+	tsk->thread.debugreg1 = 0;
+	tsk->thread.debugreg2 = 0;
+	tsk->thread.debugreg3 = 0;
+	tsk->thread.debugreg6 = 0;
+	tsk->thread.debugreg7 = 0;
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	clear_tsk_thread_flag(tsk, TIF_DEBUG);
 	/*
@@ -393,7 +386,7 @@ void prepare_to_copy(struct task_struct
 	unlazy_fpu(tsk);
 }

-int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
+int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 	unsigned long unused,
 	struct task_struct * p, struct pt_regs * regs)
 {
@@ -403,17 +396,19 @@ int copy_thread(int nr, unsigned long cl

 	childregs = task_pt_regs(p);
 	*childregs = *regs;
-	childregs->eax = 0;
-	childregs->esp = esp;
+	childregs->ax = 0;
+	childregs->sp = sp;

-	p->thread.esp = (unsigned long) childregs;
-	p->thread.esp0 = (unsigned long) (childregs+1);
+	p->thread.sp = (unsigned long) childregs;
+	p->thread.sp0 = (unsigned long) (childregs+1);

-	p->thread.eip = (unsigned long) ret_from_fork;
+	p->thread.ip = (unsigned long) ret_from_fork;

-	savesegment(gs,p->thread.gs);
+	savesegment(gs, p->thread.gs);

 	tsk = current;
+	if (test_tsk_thread_flag(tsk, TIF_CSTAR))
+		p->thread.ip = (unsigned long) cstar_ret_from_fork;
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
 						IO_BITMAP_BYTES, GFP_KERNEL);
@@ -424,34 +419,17 @@ int copy_thread(int nr, unsigned long cl
 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
 	}

+	err = 0;
+
 	/*
 	 * Set a new TLS for the child thread?
 	 */
-	if (clone_flags & CLONE_SETTLS) {
-		struct desc_struct *desc;
-		struct user_desc info;
-		int idx;
-
-		err = -EFAULT;
-		if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
-			goto out;
-		err = -EINVAL;
-		if (LDT_empty(&info))
-			goto out;
-
-		idx = info.entry_number;
-		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-			goto out;
-
-		desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
-		desc->a = LDT_entry_a(&info);
-		desc->b = LDT_entry_b(&info);
-	}
+	if (clone_flags & CLONE_SETTLS)
+		err = do_set_thread_area(p, -1,
+			(struct user_desc __user *)childregs->si, 0);

 	p->thread.iopl = current->thread.iopl;

-	err = 0;
- out:
 	if (err && p->thread.io_bitmap_ptr) {
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
@@ -459,67 +437,8 @@ int copy_thread(int nr, unsigned long cl
 	return err;
 }

-/*
- * fill in the user structure for a core dump..
- */
-void dump_thread(struct pt_regs * regs, struct user * dump)
-{
-	int i;
-
-/* changed the size calculations - should hopefully work better. lbt */
-	dump->magic = CMAGIC;
-	dump->start_code = 0;
-	dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
-	dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
-	dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
-	dump->u_dsize -= dump->u_tsize;
-	dump->u_ssize = 0;
-	for (i = 0; i < 8; i++)
-		dump->u_debugreg[i] = current->thread.debugreg[i];
-
-	if (dump->start_stack < TASK_SIZE)
-		dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
-
-	dump->regs.ebx = regs->ebx;
-	dump->regs.ecx = regs->ecx;
-	dump->regs.edx = regs->edx;
-	dump->regs.esi = regs->esi;
-	dump->regs.edi = regs->edi;
-	dump->regs.ebp = regs->ebp;
-	dump->regs.eax = regs->eax;
-	dump->regs.ds = regs->xds;
-	dump->regs.es = regs->xes;
-	dump->regs.fs = regs->xfs;
-	savesegment(gs,dump->regs.gs);
-	dump->regs.orig_eax = regs->orig_eax;
-	dump->regs.eip = regs->eip;
-	dump->regs.cs = regs->xcs;
-	dump->regs.eflags = regs->eflags;
-	dump->regs.esp = regs->esp;
-	dump->regs.ss = regs->xss;
-
-	dump->u_fpvalid = dump_fpu (regs, &dump->i387);
-}
-EXPORT_SYMBOL(dump_thread);
-
-/*
- * Capture the user space registers if the task is not running (in user space)
- */
-int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
-{
-	struct pt_regs ptregs = *task_pt_regs(tsk);
-	ptregs.xcs &= 0xffff;
-	ptregs.xds &= 0xffff;
-	ptregs.xes &= 0xffff;
-	ptregs.xss &= 0xffff;
-
-	elf_core_copy_regs(regs, &ptregs);
-
-	return 1;
-}
-
 #ifdef CONFIG_SECCOMP
-void hard_disable_TSC(void)
+static void hard_disable_TSC(void)
 {
 	write_cr4(read_cr4() | X86_CR4_TSD);
 }
@@ -534,7 +453,7 @@ void disable_TSC(void)
 		hard_disable_TSC();
 	preempt_enable();
 }
-void hard_enable_TSC(void)
+static void hard_enable_TSC(void)
 {
 	write_cr4(read_cr4() & ~X86_CR4_TSD);
 }
@@ -543,18 +462,32 @@ void hard_enable_TSC(void)
 static noinline void
 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
 {
-	struct thread_struct *next;
+	struct thread_struct *prev, *next;
+	unsigned long debugctl;

+	prev = &prev_p->thread;
 	next = &next_p->thread;

+	debugctl = prev->debugctlmsr;
+	if (next->ds_area_msr != prev->ds_area_msr) {
+		/* we clear debugctl to make sure DS
+		 * is not in use when we change it */
+		debugctl = 0;
+		wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
+		wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
+	}
+
+	if (next->debugctlmsr != debugctl)
+		wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
+
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg[0], 0);
-		set_debugreg(next->debugreg[1], 1);
-		set_debugreg(next->debugreg[2], 2);
-		set_debugreg(next->debugreg[3], 3);
+		set_debugreg(next->debugreg0, 0);
+		set_debugreg(next->debugreg1, 1);
+		set_debugreg(next->debugreg2, 2);
+		set_debugreg(next->debugreg3, 3);
 		/* no 4 and 5 */
-		set_debugreg(next->debugreg[6], 6);
-		set_debugreg(next->debugreg[7], 7);
+		set_debugreg(next->debugreg6, 6);
+		set_debugreg(next->debugreg7, 7);
 	}

 #ifdef CONFIG_SECCOMP
@@ -567,6 +500,14 @@ __switch_to_xtra(struct task_struct *pre
 			hard_enable_TSC();
 	}
 #endif
+
+#ifdef X86_BTS
+	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
+		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
+
+	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
+		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
+#endif
 }

 /*
@@ -592,11 +533,11 @@ __switch_to_xtra(struct task_struct *pre
  * More important, however, is the fact that this allows us much
  * more flexibility.
  *
- * The return value (in %eax) will be the "prev" task after
+ * The return value (in %ax) will be the "prev" task after
  * the task-switch, and shows up in ret_from_fork in entry.S,
  * for example.
  */
-struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 	struct thread_struct *prev = &prev_p->thread,
 				 *next = &next_p->thread;
@@ -632,12 +573,12 @@ struct task_struct fastcall * __switch_t
 #endif

 	/*
-	 * Reload esp0.
-	 * This is load_esp0(tss, next) with a multicall.
+	 * Reload sp0.
+	 * This is load_sp0(tss, next) with a multicall.
 	 */
 	mcl->op      = __HYPERVISOR_stack_switch;
 	mcl->args[0] = __KERNEL_DS;
-	mcl->args[1] = next->esp0;
+	mcl->args[1] = next->sp0;
 	mcl++;

 	/*
@@ -734,7 +675,7 @@ struct task_struct fastcall * __switch_t

 asmlinkage int sys_fork(struct pt_regs regs)
 {
-	return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
+	return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
 }

 asmlinkage int sys_clone(struct pt_regs regs)
@@ -743,12 +684,12 @@ asmlinkage int sys_clone(struct pt_regs
 	unsigned long newsp;
 	int __user *parent_tidptr, *child_tidptr;

-	clone_flags = regs.ebx;
-	newsp = regs.ecx;
-	parent_tidptr = (int __user *)regs.edx;
-	child_tidptr = (int __user *)regs.edi;
+	clone_flags = regs.bx;
+	newsp = regs.cx;
+	parent_tidptr = (int __user *)regs.dx;
+	child_tidptr = (int __user *)regs.di;
 	if (!newsp)
-		newsp = regs.esp;
+		newsp = regs.sp;
 	return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
 }

@@ -764,7 +705,7 @@ asmlinkage int sys_clone(struct pt_regs
  */
 asmlinkage int sys_vfork(struct pt_regs regs)
 {
-	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
+	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
 }

 /*
@@ -775,18 +716,15 @@ asmlinkage int sys_execve(struct pt_regs
 	int error;
 	char * filename;

-	filename = getname((char __user *) regs.ebx);
+	filename = getname((char __user *) regs.bx);
 	error = PTR_ERR(filename);
 	if (IS_ERR(filename))
 		goto out;
 	error = do_execve(filename,
-			(char __user * __user *) regs.ecx,
-			(char __user * __user *) regs.edx,
+			(char __user * __user *) regs.cx,
+			(char __user * __user *) regs.dx,
 			&regs);
 	if (error == 0) {
-		task_lock(current);
-		current->ptrace &= ~PT_DTRACE;
-		task_unlock(current);
 		/* Make sure we don't return using sysenter.. */
 		set_thread_flag(TIF_IRET);
 	}
@@ -800,145 +738,37 @@ out:

 unsigned long get_wchan(struct task_struct *p)
 {
-	unsigned long ebp, esp, eip;
+	unsigned long bp, sp, ip;
 	unsigned long stack_page;
 	int count = 0;
 	if (!p || p == current || p->state == TASK_RUNNING)
 		return 0;
 	stack_page = (unsigned long)task_stack_page(p);
-	esp = p->thread.esp;
-	if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
+	sp = p->thread.sp;
+	if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
 		return 0;
-	/* include/asm-i386/system.h:switch_to() pushes ebp last. */
-	ebp = *(unsigned long *) esp;
+	/* include/asm-i386/system.h:switch_to() pushes bp last. */
+	bp = *(unsigned long *) sp;
 	do {
-		if (ebp < stack_page || ebp > top_ebp+stack_page)
+		if (bp < stack_page || bp > top_ebp+stack_page)
 			return 0;
-		eip = *(unsigned long *) (ebp+4);
-		if (!in_sched_functions(eip))
-			return eip;
-		ebp = *(unsigned long *) ebp;
+		ip = *(unsigned long *) (bp+4);
+		if (!in_sched_functions(ip))
+			return ip;
+		bp = *(unsigned long *) bp;
 	} while (count++ < 16);
 	return 0;
 }

-/*
- * sys_alloc_thread_area: get a yet unused TLS descriptor index.
- */
-static int get_free_idx(void)
-{
-	struct thread_struct *t = &current->thread;
-	int idx;
-
-	for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
-		if (desc_empty(t->tls_array + idx))
-			return idx + GDT_ENTRY_TLS_MIN;
-	return -ESRCH;
-}
-
-/*
- * Set a given TLS descriptor:
- */
-asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
-{
-	struct thread_struct *t = &current->thread;
-	struct user_desc info;
-	struct desc_struct *desc;
-	int cpu, idx;
-
-	if (copy_from_user(&info, u_info, sizeof(info)))
-		return -EFAULT;
-	idx = info.entry_number;
-
-	/*
-	 * index -1 means the kernel should try to find and
-	 * allocate an empty descriptor:
-	 */
-	if (idx == -1) {
-		idx = get_free_idx();
-		if (idx < 0)
-			return idx;
-		if (put_user(idx, &u_info->entry_number))
-			return -EFAULT;
-	}
-
-	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-		return -EINVAL;
-
-	desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
-
-	/*
-	 * We must not get preempted while modifying the TLS.
-	 */
-	cpu = get_cpu();
-
-	if (LDT_empty(&info)) {
-		desc->a = 0;
-		desc->b = 0;
-	} else {
-		desc->a = LDT_entry_a(&info);
-		desc->b = LDT_entry_b(&info);
-	}
-	load_TLS(t, cpu);
-
-	put_cpu();
-
-	return 0;
-}
-
-/*
- * Get the current Thread-Local Storage area:
- */
-
-#define GET_BASE(desc) ( \
-	(((desc)->a >> 16) & 0x0000ffff) | \
-	(((desc)->b << 16) & 0x00ff0000) | \
-	( (desc)->b        & 0xff000000)   )
-
-#define GET_LIMIT(desc) ( \
-	((desc)->a & 0x0ffff) | \
-	 ((desc)->b & 0xf0000) )
-
-#define GET_32BIT(desc)		(((desc)->b >> 22) & 1)
-#define GET_CONTENTS(desc)	(((desc)->b >> 10) & 3)
-#define GET_WRITABLE(desc)	(((desc)->b >>  9) & 1)
-#define GET_LIMIT_PAGES(desc)	(((desc)->b >> 23) & 1)
-#define GET_PRESENT(desc)	(((desc)->b >> 15) & 1)
-#define GET_USEABLE(desc)	(((desc)->b >> 20) & 1)
-
-asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
-{
-	struct user_desc info;
-	struct desc_struct *desc;
-	int idx;
-
-	if (get_user(idx, &u_info->entry_number))
-		return -EFAULT;
-	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-		return -EINVAL;
-
-	memset(&info, 0, sizeof(info));
-
-	desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
-
-	info.entry_number = idx;
-	info.base_addr = GET_BASE(desc);
-	info.limit = GET_LIMIT(desc);
-	info.seg_32bit = GET_32BIT(desc);
-	info.contents = GET_CONTENTS(desc);
-	info.read_exec_only = !GET_WRITABLE(desc);
-	info.limit_in_pages = GET_LIMIT_PAGES(desc);
-	info.seg_not_present = !GET_PRESENT(desc);
-	info.useable = GET_USEABLE(desc);
-
-	if (copy_to_user(u_info, &info, sizeof(info)))
-		return -EFAULT;
-	return 0;
-}
-
 unsigned long arch_align_stack(unsigned long sp)
 {
 	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
 		sp -= get_random_int() % 8192;
 	return sp & ~0xf;
 }
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+	unsigned long range_end = mm->brk + 0x02000000;	/* brk + 32 MiB */
+	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
--- head-2010-04-29.orig/arch/x86/kernel/process_64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/process_64-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -3,7 +3,7 @@
  *
  *  Pentium III FXSR, SSE support
  *	Gareth Hughes <gareth@valinux.com>, May 2000
- *
+ *
  *  X86-64 port
  *	Andi Kleen.
  *
@@ -22,19 +22,18 @@
 #include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
-#include <linux/fs.h>
 #include <linux/elfcore.h>
 #include <linux/smp.h>
 #include <linux/slab.h>
 #include <linux/user.h>
-#include <linux/module.h>
-#include <linux/a.out.h>
 #include <linux/interrupt.h>
+#include <linux/utsname.h>
 #include <linux/delay.h>
+#include <linux/module.h>
 #include <linux/ptrace.h>
-#include <linux/utsname.h>
 #include <linux/random.h>
 #include <linux/notifier.h>
 #include <linux/kprobes.h>
@@ -73,7 +72,6 @@ EXPORT_SYMBOL(boot_option_idle_override)
  */
 void (*pm_idle)(void);
 EXPORT_SYMBOL(pm_idle);
-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

 static ATOMIC_NOTIFIER_HEAD(idle_notifier);

@@ -81,13 +79,6 @@ void idle_notifier_register(struct notif
 {
 	atomic_notifier_chain_register(&idle_notifier, n);
 }
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
-	atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL(idle_notifier_unregister);

 void enter_idle(void)
 {
@@ -116,7 +107,7 @@ void exit_idle(void)
  * to poll the ->need_resched flag instead of waiting for the
  * cross-CPU IPI to arrive. Use this option with caution.
  */
-static void poll_idle (void)
+static void poll_idle(void)
 {
 	local_irq_enable();
 	cpu_relax();
@@ -131,10 +122,19 @@ static void xen_idle(void)
 	 */
 	smp_mb();
 	local_irq_disable();
-	if (!need_resched())
-		safe_halt();
-	else
-		local_irq_enable();
+	if (!need_resched()) {
+		ktime_t t0, t1;
+		u64 t0n, t1n;
+
+		t0 = ktime_get();
+		t0n = ktime_to_ns(t0);
+		safe_halt();	/* enables interrupts racelessly */
+		local_irq_disable();
+		t1 = ktime_get();
+		t1n = ktime_to_ns(t1);
+		sched_clock_idle_wakeup_event(t1n - t0n);
+	}
+	local_irq_enable();
 	current_thread_info()->status |= TS_POLLING;
 }

@@ -161,19 +161,15 @@ static inline void play_dead(void)
  * low exit latency (ie sit in a loop waiting for
  * somebody to say that they'd like to reschedule)
  */
-void cpu_idle (void)
+void cpu_idle(void)
 {
 	current_thread_info()->status |= TS_POLLING;
 	/* endless idle loop with no priority at all */
 	while (1) {
+		tick_nohz_stop_sched_tick();
 		while (!need_resched()) {
 			void (*idle)(void);

-			if (__get_cpu_var(cpu_idle_state))
-				__get_cpu_var(cpu_idle_state) = 0;
-
-			tick_nohz_stop_sched_tick();
-
 			rmb();
 			idle = xen_idle; /* no alternatives */
 			if (cpu_is_offline(smp_processor_id()))
@@ -203,49 +199,27 @@ static void do_nothing(void *unused)
 {
 }

+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
+ * handler on SMP systems.
+ *
+ * Caller must have changed pm_idle to the new value before the call. Old
+ * pm_idle value will not be used by any CPU after the return of this function.
+ */
 void cpu_idle_wait(void)
 {
-	unsigned int cpu, this_cpu = get_cpu();
-	cpumask_t map, tmp = current->cpus_allowed;
-
-	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
-	put_cpu();
-
- 	cpus_clear(map);
-	for_each_online_cpu(cpu) {
-		per_cpu(cpu_idle_state, cpu) = 1;
-		cpu_set(cpu, map);
-	}
-
-	__get_cpu_var(cpu_idle_state) = 0;
-
-	wmb();
-	do {
-		ssleep(1);
-		for_each_online_cpu(cpu) {
-			if (cpu_isset(cpu, map) &&
-					!per_cpu(cpu_idle_state, cpu))
-				cpu_clear(cpu, map);
-		}
-		cpus_and(map, map, cpu_online_map);
-		/*
-		 * We waited 1 sec, if a CPU still did not call idle
-		 * it may be because it is in idle and not waking up
-		 * because it has nothing to do.
-		 * Give all the remaining CPUS a kick.
-		 */
-		smp_call_function_mask(map, do_nothing, 0, 0);
-	} while (!cpus_empty(map));
-
-	set_cpus_allowed(current, tmp);
+	smp_mb();
+	/* kick all the CPUs so that they exit out of pm_idle */
+	smp_call_function(do_nothing, NULL, 0, 1);
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);

-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 {
 }

-static int __init idle_setup (char *str)
+static int __init idle_setup(char *str)
 {
 	if (!strcmp(str, "poll")) {
 		printk("using polling idle threads.\n");
@@ -260,13 +234,13 @@ static int __init idle_setup (char *str)
 }
 early_param("idle", idle_setup);

-/* Prints also some state that isn't saved in the pt_regs */
+/* Prints also some state that isn't saved in the pt_regs */
 void __show_regs(struct pt_regs * regs)
 {
 	unsigned long fs, gs, shadowgs;
 	unsigned long d0, d1, d2, d3, d6, d7;
-	unsigned int fsindex,gsindex;
-	unsigned int ds,cs,es;
+	unsigned int fsindex, gsindex;
+	unsigned int ds, cs, es;

 	printk("\n");
 	print_modules();
@@ -275,16 +249,16 @@ void __show_regs(struct pt_regs * regs)
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
-	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
-	printk_address(regs->rip);
-	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
-		regs->eflags);
+	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
+	printk_address(regs->ip, 1);
+	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
+		regs->flags);
 	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
-	       regs->rax, regs->rbx, regs->rcx);
+	       regs->ax, regs->bx, regs->cx);
 	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
-	       regs->rdx, regs->rsi, regs->rdi);
+	       regs->dx, regs->si, regs->di);
 	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
-	       regs->rbp, regs->r8, regs->r9);
+	       regs->bp, regs->r8, regs->r9);
 	printk("R10: %016lx R11: %016lx R12: %016lx\n",
 	       regs->r10, regs->r11, regs->r12);
 	printk("R13: %016lx R14: %016lx R15: %016lx\n",
@@ -318,7 +292,7 @@ void show_regs(struct pt_regs *regs)
 {
 	printk("CPU %d:", smp_processor_id());
 	__show_regs(regs);
-	show_trace(NULL, regs, (void *)(regs + 1));
+	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
 }

 /*
@@ -329,7 +303,7 @@ void exit_thread(void)
 	struct task_struct *me = current;
 	struct thread_struct *t = &me->thread;

-	if (me->thread.io_bitmap_ptr) {
+	if (me->thread.io_bitmap_ptr) {
 #ifndef CONFIG_X86_NO_TSS
 		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
 #endif
@@ -382,7 +356,7 @@ void flush_thread(void)
 	tsk->thread.debugreg3 = 0;
 	tsk->thread.debugreg6 = 0;
 	tsk->thread.debugreg7 = 0;
-	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	/*
 	 * Forget coprocessor state..
 	 */
@@ -405,26 +379,21 @@ void release_thread(struct task_struct *

 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
 {
-	struct user_desc ud = {
+	struct user_desc ud = {
 		.base_addr = addr,
 		.limit = 0xfffff,
 		.seg_32bit = 1,
 		.limit_in_pages = 1,
 		.useable = 1,
 	};
-	struct n_desc_struct *desc = (void *)t->thread.tls_array;
+	struct desc_struct *desc = t->thread.tls_array;
 	desc += tls;
-	desc->a = LDT_entry_a(&ud);
-	desc->b = LDT_entry_b(&ud);
+	fill_ldt(desc, &ud);
 }

 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
 {
-	struct desc_struct *desc = (void *)t->thread.tls_array;
-	desc += tls;
-	return desc->base0 |
-		(((u32)desc->base1) << 16) |
-		(((u32)desc->base2) << 24);
+	return get_desc_base(&t->thread.tls_array[tls]);
 }

 /*
@@ -436,7 +405,7 @@ void prepare_to_copy(struct task_struct
 	unlazy_fpu(tsk);
 }

-int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
+int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 		unsigned long unused,
 	struct task_struct * p, struct pt_regs * regs)
 {
@@ -448,14 +417,13 @@ int copy_thread(int nr, unsigned long cl
 			(THREAD_SIZE + task_stack_page(p))) - 1;
 	*childregs = *regs;

-	childregs->rax = 0;
-	childregs->rsp = rsp;
-	if (rsp == ~0UL)
-		childregs->rsp = (unsigned long)childregs;
-
-	p->thread.rsp = (unsigned long) childregs;
-	p->thread.rsp0 = (unsigned long) (childregs+1);
-	p->thread.userrsp = me->thread.userrsp;
+	childregs->ax = 0;
+	childregs->sp = sp;
+	if (sp == ~0UL)
+		childregs->sp = (unsigned long)childregs;
+
+	p->thread.sp = (unsigned long) childregs;
+	p->thread.sp0 = (unsigned long) (childregs+1);

 	set_tsk_thread_flag(p, TIF_FORK);

@@ -476,7 +444,7 @@ int copy_thread(int nr, unsigned long cl
 		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
 				IO_BITMAP_BYTES);
 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
-	}
+	}

 	/*
 	 * Set a new TLS for the child thread?
@@ -484,7 +452,8 @@ int copy_thread(int nr, unsigned long cl
 	if (clone_flags & CLONE_SETTLS) {
 #ifdef CONFIG_IA32_EMULATION
 		if (test_thread_flag(TIF_IA32))
-			err = ia32_child_tls(p, childregs);
+			err = do_set_thread_area(p, -1,
+				(struct user_desc __user *)childregs->si, 0);
 		else
 #endif
 			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
@@ -502,26 +471,32 @@ out:
 	return err;
 }

-static inline void __save_init_fpu( struct task_struct *tsk )
-{
-	asm volatile( "rex64 ; fxsave %0 ; fnclex"
-		      : "=m" (tsk->thread.i387.fxsave));
-	tsk->thread_info->status &= ~TS_USEDFPU;
-}
-
 /*
  * This special macro can be used to load a debugging register
  */
-#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
+#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

 static inline void __switch_to_xtra(struct task_struct *prev_p,
-			     	    struct task_struct *next_p)
+				    struct task_struct *next_p)
 {
 	struct thread_struct *prev, *next;
+	unsigned long debugctl;

 	prev = &prev_p->thread,
 	next = &next_p->thread;

+	debugctl = prev->debugctlmsr;
+	if (next->ds_area_msr != prev->ds_area_msr) {
+		/* we clear debugctl to make sure DS
+		 * is not in use when we change it */
+		debugctl = 0;
+		wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
+		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
+	}
+
+	if (next->debugctlmsr != debugctl)
+		wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
+
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
 		loaddebug(next, 0);
 		loaddebug(next, 1);
@@ -531,12 +506,20 @@ static inline void __switch_to_xtra(stru
 		loaddebug(next, 6);
 		loaddebug(next, 7);
 	}
+
+#ifdef X86_BTS
+	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
+		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
+
+	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
+		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
+#endif
 }

 /*
  *	switch_to(x,y) should switch tasks from x to y.
  *
- * This could still be optimized:
+ * This could still be optimized:
  * - fold all the options into a flag word and test it with a single test.
  * - could test fs/gs bitsliced
  *
@@ -547,7 +530,7 @@ __switch_to(struct task_struct *prev_p,
 {
 	struct thread_struct *prev = &prev_p->thread,
 				 *next = &next_p->thread;
-	int cpu = smp_processor_id();
+	int cpu = smp_processor_id();
 #ifndef CONFIG_X86_NO_TSS
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
 #endif
@@ -581,11 +564,12 @@ __switch_to(struct task_struct *prev_p,
 		prev_p->fpu_counter = 0;

 	/*
-	 * Reload esp0, LDT and the page table pointer:
+	 * Reload sp0.
+	 * This is load_sp0(tss, next) with a multicall.
 	 */
 	mcl->op      = __HYPERVISOR_stack_switch;
 	mcl->args[0] = __KERNEL_DS;
-	mcl->args[1] = next->rsp0;
+	mcl->args[1] = next->sp0;
 	mcl++;

 	/*
@@ -593,11 +577,12 @@ __switch_to(struct task_struct *prev_p,
 	 * This is load_TLS(next, cpu) with multicalls.
 	 */
 #define C(i) do {							\
-	if (unlikely(next->tls_array[i] != prev->tls_array[i])) {	\
+	if (unlikely(next->tls_array[i].a != prev->tls_array[i].a ||	\
+		     next->tls_array[i].b != prev->tls_array[i].b)) {	\
 		mcl->op      = __HYPERVISOR_update_descriptor;		\
 		mcl->args[0] = virt_to_machine(				\
-			&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]);		\
-		mcl->args[1] = next->tls_array[i];			\
+			&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
+		mcl->args[1] = *(u64 *)&next->tls_array[i];		\
 		mcl++;							\
 	}								\
 } while (0)
@@ -605,7 +590,7 @@ __switch_to(struct task_struct *prev_p,
 #undef C

 	if (unlikely(prev->iopl != next->iopl)) {
-		iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
+		iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
 #if CONFIG_XEN_COMPAT > 0x030002
 		mcl->op      = __HYPERVISOR_physdev_op;
 		mcl->args[0] = PHYSDEVOP_set_iopl;
@@ -669,8 +654,6 @@ __switch_to(struct task_struct *prev_p,
 	/*
 	 * Switch the PDA context.
 	 */
-	prev->userrsp = read_pda(oldrsp);
-	write_pda(oldrsp, next->userrsp);
 	write_pda(pcurrent, next_p);
 	write_pda(kernelstack,
 	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
@@ -687,7 +670,8 @@ __switch_to(struct task_struct *prev_p,
 	/*
 	 * Now maybe reload the debug registers
 	 */
-	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
+	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
+		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
 		__switch_to_xtra(prev_p, next_p);

 	/* If the task has used fpu the last 5 timeslices, just do a full
@@ -702,23 +686,18 @@ __switch_to(struct task_struct *prev_p,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage
+asmlinkage
 long sys_execve(char __user *name, char __user * __user *argv,
-		char __user * __user *envp, struct pt_regs regs)
+		char __user * __user *envp, struct pt_regs *regs)
 {
 	long error;
 	char * filename;

 	filename = getname(name);
 	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
+	if (IS_ERR(filename))
 		return error;
-	error = do_execve(filename, argv, envp, &regs);
-	if (error == 0) {
-		task_lock(current);
-		current->ptrace &= ~PT_DTRACE;
-		task_unlock(current);
-	}
+	error = do_execve(filename, argv, envp, regs);
 	putname(filename);
 	return error;
 }
@@ -728,18 +707,18 @@ void set_personality_64bit(void)
 	/* inherit personality from parent */

 	/* Make sure to be in 64bit mode */
-	clear_thread_flag(TIF_IA32);
+	clear_thread_flag(TIF_IA32);

 	/* TBD: overwrites user setup. Should have two bits.
 	   But 64bit processes have always behaved this way,
 	   so it's not too bad. The main problem is just that
-   	   32bit childs are affected again. */
+	   32bit childs are affected again. */
 	current->personality &= ~READ_IMPLIES_EXEC;
 }

 asmlinkage long sys_fork(struct pt_regs *regs)
 {
-	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
+	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
 }

 asmlinkage long
@@ -747,7 +726,7 @@ sys_clone(unsigned long clone_flags, uns
 	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
 {
 	if (!newsp)
-		newsp = regs->rsp;
+		newsp = regs->sp;
 	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
 }

@@ -763,29 +742,29 @@ sys_clone(unsigned long clone_flags, uns
  */
 asmlinkage long sys_vfork(struct pt_regs *regs)
 {
-	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
+	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
 		    NULL, NULL);
 }

 unsigned long get_wchan(struct task_struct *p)
 {
 	unsigned long stack;
-	u64 fp,rip;
+	u64 fp,ip;
 	int count = 0;

 	if (!p || p == current || p->state==TASK_RUNNING)
 		return 0;
 	stack = (unsigned long)task_stack_page(p);
-	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
+	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
 		return 0;
-	fp = *(u64 *)(p->thread.rsp);
+	fp = *(u64 *)(p->thread.sp);
 	do {
 		if (fp < (unsigned long)stack ||
 		    fp > (unsigned long)stack+THREAD_SIZE)
 			return 0;
-		rip = *(u64 *)(fp+8);
-		if (!in_sched_functions(rip))
-			return rip;
+		ip = *(u64 *)(fp+8);
+		if (!in_sched_functions(ip))
+			return ip;
 		fp = *(u64 *)fp;
 	} while (count++ < 16);
 	return 0;
@@ -827,19 +806,19 @@ long do_arch_prctl(struct task_struct *t
 		/* Not strictly needed for fs, but do it for symmetry
 		   with gs */
 		if (addr >= TASK_SIZE_OF(task))
-			return -EPERM;
+			return -EPERM;
 		cpu = get_cpu();
-		/* handle small bases via the GDT because that's faster to
+		/* handle small bases via the GDT because that's faster to
 		   switch. */
-		if (addr <= 0xffffffff) {
+		if (addr <= 0xffffffff) {
 			set_32bit_tls(task, FS_TLS, addr);
-			if (doit) {
-				load_TLS(&task->thread, cpu);
+			if (doit) {
+				load_TLS(&task->thread, cpu);
 				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
 			}
 			task->thread.fsindex = FS_TLS_SEL;
 			task->thread.fs = 0;
-		} else {
+		} else {
 			task->thread.fsindex = 0;
 			task->thread.fs = addr;
 			if (doit) {
@@ -852,24 +831,24 @@ long do_arch_prctl(struct task_struct *t
 		}
 		put_cpu();
 		break;
-	case ARCH_GET_FS: {
-		unsigned long base;
+	case ARCH_GET_FS: {
+		unsigned long base;
 		if (task->thread.fsindex == FS_TLS_SEL)
 			base = read_32bit_tls(task, FS_TLS);
 		else if (doit)
 			rdmsrl(MSR_FS_BASE, base);
 		else
 			base = task->thread.fs;
-		ret = put_user(base, (unsigned long __user *)addr);
-		break;
+		ret = put_user(base, (unsigned long __user *)addr);
+		break;
 	}
-	case ARCH_GET_GS: {
+	case ARCH_GET_GS: {
 		unsigned long base;
 		unsigned gsindex;
 		if (task->thread.gsindex == GS_TLS_SEL)
 			base = read_32bit_tls(task, GS_TLS);
 		else if (doit) {
- 			asm("movl %%gs,%0" : "=r" (gsindex));
+			asm("movl %%gs,%0" : "=r" (gsindex));
 			if (gsindex)
 				rdmsrl(MSR_KERNEL_GS_BASE, base);
 			else
@@ -877,40 +856,21 @@ long do_arch_prctl(struct task_struct *t
 		}
 		else
 			base = task->thread.gs;
-		ret = put_user(base, (unsigned long __user *)addr);
+		ret = put_user(base, (unsigned long __user *)addr);
 		break;
 	}

 	default:
 		ret = -EINVAL;
 		break;
-	}
+	}

-	return ret;
-}
+	return ret;
+}

 long sys_arch_prctl(int code, unsigned long addr)
 {
 	return do_arch_prctl(current, code, addr);
-}
-
-/*
- * Capture the user space registers if the task is not running (in user space)
- */
-int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
-{
-	struct pt_regs *pp, ptregs;
-
-	pp = task_pt_regs(tsk);
-
-	ptregs = *pp;
-	ptregs.cs &= 0xffff;
-	ptregs.ss &= 0xffff;
-
-	elf_core_copy_regs(regs, &ptregs);
-
-        boot_option_idle_override = 1;
-	return 1;
 }

 unsigned long arch_align_stack(unsigned long sp)
@@ -919,3 +879,9 @@ unsigned long arch_align_stack(unsigned
 		sp -= get_random_int() % 8192;
 	return sp & ~0xf;
 }
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{	/* Returns randomize_range()'s pick, or mm->brk itself if that is 0. */
+	unsigned long range_end = mm->brk + 0x02000000;	/* brk + 32 MiB */
+	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
--- head-2010-04-29.orig/arch/x86/kernel/quirks-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/quirks-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -9,7 +9,7 @@
 static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 {
 	u8 config, rev;
-	u32 word;
+	u16 word;

 	/* BIOS may enable hardware IRQ balancing for
 	 * E7520/E7320/E7525(revision ID 0x9 and below)
@@ -24,14 +24,17 @@ static void __devinit quirk_intel_irqbal
 	pci_read_config_byte(dev, 0xf4, &config);
 	pci_write_config_byte(dev, 0xf4, config|0x2);

-	/* read xTPR register */
-	raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
+	/*
+	 * read xTPR register.  We may not have a pci_dev for device 8
+	 * because it might be hidden until the above write.
+	 */
+	pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word);

 	if (!(word & (1 << 13))) {
 		struct xen_platform_op op;

-		printk(KERN_INFO "Intel E7520/7320/7525 detected. "
-			"Disabling irq balancing and affinity\n");
+		dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
+			"disabling irq balancing and affinity\n");
 		op.cmd = XENPF_platform_quirk;
 		op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
 		WARN_ON(HYPERVISOR_platform_op(&op));
@@ -102,14 +105,16 @@ static void ich_force_enable_hpet(struct
 	pci_read_config_dword(dev, 0xF0, &rcba);
 	rcba &= 0xFFFFC000;
 	if (rcba == 0) {
-		printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n");
+		dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
+			"cannot force enable HPET\n");
 		return;
 	}

 	/* use bits 31:14, 16 kB aligned */
 	rcba_base = ioremap_nocache(rcba, 0x4000);
 	if (rcba_base == NULL) {
-		printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n");
+		dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
+			"cannot force enable HPET\n");
 		return;
 	}

@@ -120,8 +125,8 @@ static void ich_force_enable_hpet(struct
 		/* HPET is enabled in HPTC. Just not reported by BIOS */
 		val = val & 0x3;
 		force_hpet_address = 0xFED00000 | (val << 12);
-		printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
-			       force_hpet_address);
+		dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+			"0x%lx\n", force_hpet_address);
 		iounmap(rcba_base);
 		return;
 	}
@@ -140,11 +145,12 @@ static void ich_force_enable_hpet(struct
 	if (err) {
 		force_hpet_address = 0;
 		iounmap(rcba_base);
-		printk(KERN_DEBUG "Failed to force enable HPET\n");
+		dev_printk(KERN_DEBUG, &dev->dev,
+			"Failed to force enable HPET\n");
 	} else {
 		force_hpet_resume_type = ICH_FORCE_HPET_RESUME;
-		printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
-			       force_hpet_address);
+		dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+			"0x%lx\n", force_hpet_address);
 	}
 }

@@ -160,6 +166,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
 			 ich_force_enable_hpet);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
 			 ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
+			 ich_force_enable_hpet);


 static struct pci_dev *cached_dev;
@@ -204,8 +212,8 @@ static void old_ich_force_enable_hpet(st
 	if (val & 0x4) {
 		val &= 0x3;
 		force_hpet_address = 0xFED00000 | (val << 12);
-		printk(KERN_DEBUG "HPET at base address 0x%lx\n",
-			       force_hpet_address);
+		dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
+			force_hpet_address);
 		return;
 	}

@@ -225,14 +233,14 @@ static void old_ich_force_enable_hpet(st
 		/* HPET is enabled in HPTC. Just not reported by BIOS */
 		val &= 0x3;
 		force_hpet_address = 0xFED00000 | (val << 12);
-		printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
-			       force_hpet_address);
+		dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+			"0x%lx\n", force_hpet_address);
 		cached_dev = dev;
 		force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
 		return;
 	}

-	printk(KERN_DEBUG "Failed to force enable HPET\n");
+	dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
 }

 /*
@@ -290,8 +298,8 @@ static void vt8237_force_enable_hpet(str
 	 */
 	if (val & 0x80) {
 		force_hpet_address = (val & ~0x3ff);
-		printk(KERN_DEBUG "HPET at base address 0x%lx\n",
-			       force_hpet_address);
+		dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
+			force_hpet_address);
 		return;
 	}

@@ -305,14 +313,14 @@ static void vt8237_force_enable_hpet(str
 	pci_read_config_dword(dev, 0x68, &val);
 	if (val & 0x80) {
 		force_hpet_address = (val & ~0x3ff);
-		printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
-			       force_hpet_address);
+		dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+			"0x%lx\n", force_hpet_address);
 		cached_dev = dev;
 		force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
 		return;
 	}

-	printk(KERN_DEBUG "Failed to force enable HPET\n");
+	dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
 }

 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
@@ -340,7 +348,7 @@ static void nvidia_force_enable_hpet(str
 	pci_read_config_dword(dev, 0x44, &val);
 	force_hpet_address = val & 0xfffffffe;
 	force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
-	printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
+	dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
 		force_hpet_address);
 	cached_dev = dev;
 	return;
@@ -353,6 +361,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
 			nvidia_force_enable_hpet);

 /* LPC bridges */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260,
+			nvidia_force_enable_hpet);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360,
 			nvidia_force_enable_hpet);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361,
@@ -373,19 +383,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N
 void force_hpet_resume(void)
 {
 	switch (force_hpet_resume_type) {
-	    case ICH_FORCE_HPET_RESUME:
-		return ich_force_hpet_resume();
-
-	    case OLD_ICH_FORCE_HPET_RESUME:
-		return old_ich_force_hpet_resume();
-
-	    case VT8237_FORCE_HPET_RESUME:
-		return vt8237_force_hpet_resume();
-
-	    case NVIDIA_FORCE_HPET_RESUME:
-		return nvidia_force_hpet_resume();
-
-	    default:
+	case ICH_FORCE_HPET_RESUME:
+		ich_force_hpet_resume();
+		return;
+	case OLD_ICH_FORCE_HPET_RESUME:
+		old_ich_force_hpet_resume();
+		return;
+	case VT8237_FORCE_HPET_RESUME:
+		vt8237_force_hpet_resume();
+		return;
+	case NVIDIA_FORCE_HPET_RESUME:
+		nvidia_force_hpet_resume();
+		return;
+	default:
 		break;
 	}
 }
--- head-2010-04-29.orig/arch/x86/kernel/rtc.c	2010-04-29 09:29:49.000000000 +0200
+++ head-2010-04-29/arch/x86/kernel/rtc.c	2010-03-24 15:10:37.000000000 +0100
@@ -171,6 +171,11 @@ int update_persistent_clock(struct times
 	unsigned long flags;
 	int retval;

+#ifdef CONFIG_XEN
+	if (xen_update_persistent_clock() < 0 || xen_independent_wallclock())
+		return 0;
+#endif
+
 	spin_lock_irqsave(&rtc_lock, flags);
 	retval = x86_platform.set_wallclock(now.tv_sec);
 	spin_unlock_irqrestore(&rtc_lock, flags);
@@ -183,6 +188,10 @@ void read_persistent_clock(struct timesp
 {
 	unsigned long retval, flags;

+#ifdef CONFIG_XEN
+	if (!is_initial_xendomain())
+		return xen_read_persistent_clock();
+#endif
 	spin_lock_irqsave(&rtc_lock, flags);
 	retval = x86_platform.get_wallclock();
 	spin_unlock_irqrestore(&rtc_lock, flags);
--- head-2010-04-29.orig/arch/x86/kernel/setup64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/setup64-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -31,7 +31,11 @@
 #include <asm/hypervisor.h>
 #endif

+#ifndef CONFIG_DEBUG_BOOT_PARAMS
 struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif

 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;

@@ -47,6 +51,7 @@ char boot_cpu_stack[IRQSTACKSIZE] __attr

 unsigned long __supported_pte_mask __read_mostly = ~0UL;
 EXPORT_SYMBOL(__supported_pte_mask);
+
 static int do_not_nx __cpuinitdata = 0;

 /* noexec=on|off
@@ -90,6 +95,45 @@ static int __init nonx32_setup(char *str
 __setup("noexec32=", nonx32_setup);

 /*
+ * Copy data used in early init routines from the initial arrays to the
+ * per cpu data areas.  These arrays then become expendable and the
+ * *_early_ptr's are zeroed indicating that the static arrays are gone.
+ */
+static void __init setup_per_cpu_maps(void)
+{
+#ifndef CONFIG_XEN	/* whole body is compiled out when CONFIG_XEN is set */
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+#ifdef CONFIG_SMP
+		if (per_cpu_offset(cpu)) {	/* zero offset: only log it */
+#endif /* CONFIG_SMP */
+			per_cpu(x86_cpu_to_apicid, cpu) =
+						x86_cpu_to_apicid_init[cpu];
+			per_cpu(x86_bios_cpu_apicid, cpu) =
+						x86_bios_cpu_apicid_init[cpu];
+#ifdef CONFIG_NUMA
+			per_cpu(x86_cpu_to_node_map, cpu) =
+						x86_cpu_to_node_map_init[cpu];
+#endif /* CONFIG_NUMA */
+#ifdef CONFIG_SMP
+		}
+		else
+			printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
+									cpu);
+#endif /* CONFIG_SMP */
+	}
+
+	/* indicate the early static arrays will soon be gone */
+	x86_cpu_to_apicid_early_ptr = NULL;
+	x86_bios_cpu_apicid_early_ptr = NULL;
+#ifdef CONFIG_NUMA
+	x86_cpu_to_node_map_early_ptr = NULL;
+#endif /* CONFIG_NUMA */
+#endif /* !CONFIG_XEN */
+}
+
+/*
  * Great future plan:
  * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
  * Always point %gs to its beginning
@@ -109,19 +153,24 @@ void __init setup_per_cpu_areas(void)
 	printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
 	for_each_cpu_mask (i, cpu_possible_map) {
 		char *ptr;
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+		ptr = alloc_bootmem_pages(size);
+#else
+		int node = early_cpu_to_node(i);

-		if (!NODE_DATA(cpu_to_node(i))) {
-			printk("cpu with no node %d, num_online_nodes %d\n",
-			       i, num_online_nodes());
+		if (!node_online(node) || !NODE_DATA(node))
 			ptr = alloc_bootmem_pages(size);
-		} else {
-			ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
-		}
+		else
+			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+#endif
 		if (!ptr)
 			panic("Cannot allocate cpu data for CPU %d\n", i);
 		cpu_pda(i)->data_offset = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 	}
+
+	/* setup percpu data maps early */
+	setup_per_cpu_maps();
 }

 #ifdef CONFIG_XEN
@@ -224,7 +273,8 @@ void syscall_init(void)
 	wrmsrl(MSR_CSTAR, ignore_sysret);

 	/* Flags to clear on syscall */
-	wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
+	wrmsrl(MSR_SYSCALL_MASK,
+	       X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
 #endif
 #ifdef CONFIG_IA32_EMULATION
 	syscall32_cpu_init ();
@@ -303,7 +353,7 @@ void __cpuinit cpu_init (void)
 	 */
 #ifndef CONFIG_XEN
 	if (cpu)
- 		memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
+		memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
 #endif

 	cpu_gdt_descr[cpu].size = GDT_SIZE;
@@ -334,10 +384,10 @@ void __cpuinit cpu_init (void)
 				      v, cpu);
 		}
 		estacks += PAGE_SIZE << order[v];
-		orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
+		orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
 	}

-	t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
 	/*
 	 * <= is required because the CPU will access up to
 	 * 8 bits beyond the end of the IO permission bitmap.
--- head-2010-04-29.orig/arch/x86/kernel/setup_32-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/setup_32-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -47,9 +47,12 @@
 #include <linux/crash_dump.h>
 #include <linux/dmi.h>
 #include <linux/pfn.h>
+#include <linux/pci.h>
+#include <linux/init_ohci1394_dma.h>

 #include <video/edid.h>

+#include <asm/mtrr.h>
 #include <asm/apic.h>
 #include <asm/e820.h>
 #include <asm/mpspec.h>
@@ -79,14 +82,83 @@ static struct notifier_block xen_panic_b
 	xen_panic_event, NULL, 0 /* try to go last */
 };

-int disable_pse __cpuinitdata = 0;
-
 /*
  * Machine setup..
  */
-extern struct resource code_resource;
-extern struct resource data_resource;
-extern struct resource bss_resource;
+static struct resource data_resource = {
+	.name	= "Kernel data",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource code_resource = {
+	.name	= "Kernel code",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource bss_resource = {
+	.name	= "Kernel bss",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource video_ram_resource = {
+	.name	= "Video RAM area",
+	.start	= 0xa0000,
+	.end	= 0xbffff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource standard_io_resources[] = { {
+	.name	= "dma1",
+	.start	= 0x0000,
+	.end	= 0x001f,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name	= "pic1",
+	.start	= 0x0020,
+	.end	= 0x0021,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name   = "timer0",
+	.start	= 0x0040,
+	.end    = 0x0043,
+	.flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name   = "timer1",
+	.start  = 0x0050,
+	.end    = 0x0053,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name	= "keyboard",
+	.start	= 0x0060,
+	.end	= 0x006f,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name	= "dma page reg",
+	.start	= 0x0080,
+	.end	= 0x008f,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name	= "pic2",
+	.start	= 0x00a0,
+	.end	= 0x00a1,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name	= "dma2",
+	.start	= 0x00c0,
+	.end	= 0x00df,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+	.name	= "fpu",
+	.start	= 0x00f0,
+	.end	= 0x00ff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_IO
+} };

 /* cpu data as detected by the assembly code in head.S */
 struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -94,13 +166,16 @@ struct cpuinfo_x86 new_cpu_data __cpuini
 struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
 EXPORT_SYMBOL(boot_cpu_data);

+#ifndef CONFIG_X86_PAE
 unsigned long mmu_cr4_features;
+#else
+unsigned long mmu_cr4_features = X86_CR4_PAE;
+#endif

 /* for MCA, but anyone else can use it if they want */
 unsigned int machine_id;
 unsigned int machine_submodel_id;
 unsigned int BIOS_revision;
-unsigned int mca_pentium_flag;

 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
 int bootloader_type;
@@ -131,13 +206,17 @@ extern int root_mountflags;

 unsigned long saved_videomode;

-#define RAMDISK_IMAGE_START_MASK  	0x07FF
+#define RAMDISK_IMAGE_START_MASK	0x07FF
 #define RAMDISK_PROMPT_FLAG		0x8000
-#define RAMDISK_LOAD_FLAG		0x4000
+#define RAMDISK_LOAD_FLAG		0x4000

 static char __initdata command_line[COMMAND_LINE_SIZE];

+#ifndef CONFIG_DEBUG_BOOT_PARAMS
 struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif

 /*
  * Point at the empty zero page to start with. We map the real shared_info
@@ -198,8 +277,7 @@ static int __init parse_mem(char *arg)
 		return -EINVAL;

 	if (strcmp(arg, "nopentium") == 0) {
-		clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
-		disable_pse = 1;
+		setup_clear_cpu_cap(X86_FEATURE_PSE);
 	} else {
 		/* If the user specifies memory size, we
 		 * limit the BIOS-provided memory map to
@@ -208,7 +286,7 @@ static int __init parse_mem(char *arg)
 		 * trim the existing memory map.
 		 */
 		unsigned long long mem_size;
-
+
 		mem_size = memparse(arg, &arg);
 		limit_regions(mem_size);
 		user_defined_memmap = 1;
@@ -350,7 +428,7 @@ static void __init reserve_ebda_region(v
 	unsigned int addr;
 	addr = get_bios_ebda();
 	if (addr)
-		reserve_bootmem(addr, PAGE_SIZE);
+		reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
 }
 #endif

@@ -365,8 +443,6 @@ static unsigned long __init setup_memory
  	min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
 		xen_start_info->nr_pt_frames;

-	find_max_pfn();
-
 	max_low_pfn = find_max_low_pfn();

 #ifdef CONFIG_HIGHMEM
@@ -449,7 +525,8 @@ static void __init reserve_crashkernel(v
 					(unsigned long)(total_mem >> 20));
 			crashk_res.start = crash_base;
 			crashk_res.end   = crash_base + crash_size - 1;
-			reserve_bootmem(crash_base, crash_size);
+			reserve_bootmem(crash_base, crash_size,
+					BOOTMEM_DEFAULT);
 		} else
 			printk(KERN_INFO "crashkernel reservation failed - "
 					"you have to specify a base address\n");
@@ -463,6 +540,99 @@ static inline void __init reserve_crashk
 {}
 #endif

+#ifdef CONFIG_BLK_DEV_INITRD
+
+static bool do_relocate_initrd = false;
+
+static void __init reserve_initrd(void)
+{	/* Reserve bootmem for the module described by xen_start_info. */
+	unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
+	unsigned long ramdisk_size  = xen_start_info->mod_len;
+	unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
+	unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+	unsigned long ramdisk_here;
+
+	initrd_start = 0;	/* remains 0 on the three early returns below */
+
+	if (!xen_start_info->mod_start || !ramdisk_size)
+		return;		/* No initrd provided by bootloader */
+
+	if (ramdisk_end < ramdisk_image) {
+		printk(KERN_ERR "initrd wraps around end of memory, "
+		       "disabling initrd\n");
+		return;
+	}
+	if (ramdisk_size >= end_of_lowmem/2) {
+		printk(KERN_ERR "initrd too large to handle, "
+		       "disabling initrd\n");
+		return;
+	}
+	if (ramdisk_end <= end_of_lowmem) {
+		/* All in lowmem, easy case: reserve it where it already is */
+		reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
+		initrd_start = ramdisk_image + PAGE_OFFSET;
+		initrd_end = initrd_start+ramdisk_size;
+		return;
+	}
+
+	/* We need to move the initrd down into lowmem */
+	ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
+
+	/* Note: this includes all the lowmem currently occupied by
+	   the initrd, we rely on that fact to keep the data intact. */
+	reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
+	initrd_start = ramdisk_here + PAGE_OFFSET;
+	initrd_end   = initrd_start + ramdisk_size;
+
+	do_relocate_initrd = true;	/* relocate_initrd() does the copy */
+}
+
+#define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
+
+/* Copy the initrd into the lowmem area that reserve_initrd() set aside. */
+static void __init relocate_initrd(void)
+{
+	/* Same Xen module range that reserve_initrd() sized the area for. */
+	unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
+	unsigned long ramdisk_size  = xen_start_info->mod_len;
+	unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+	unsigned long slop, clen, mapaddr;
+	char *p, *q;
+
+	if (!do_relocate_initrd)
+		return;
+
+	q = (char *)initrd_start;	/* destination, set by reserve_initrd() */
+
+	/* Copy any lowmem portion of the initrd */
+	if (ramdisk_image < end_of_lowmem) {
+		clen = end_of_lowmem - ramdisk_image;
+		p = (char *)__va(ramdisk_image);
+		memcpy(q, p, clen);
+		q += clen;
+		ramdisk_image += clen;
+		ramdisk_size  -= clen;
+	}
+
+	/* Copy the highmem portion of the initrd */
+	while (ramdisk_size) {
+		/* Map page-aligned, at most MAX_MAP_CHUNK bytes per pass. */
+		slop = ramdisk_image & ~PAGE_MASK;
+		clen = ramdisk_size;
+		if (clen > MAX_MAP_CHUNK-slop)
+			clen = MAX_MAP_CHUNK-slop;
+		mapaddr = ramdisk_image & PAGE_MASK;
+		p = early_ioremap(mapaddr, clen+slop);
+		memcpy(q, p+slop, clen);
+		early_iounmap(p, clen+slop);
+		q += clen;
+		ramdisk_image += clen;
+		ramdisk_size  -= clen;
+	}
+}
+
+#endif /* CONFIG_BLK_DEV_INITRD */
+
 void __init setup_bootmem_allocator(void)
 {
 	unsigned long bootmap_size;
@@ -480,14 +650,15 @@ void __init setup_bootmem_allocator(void
 	 * bootmem allocator with an invalid RAM area.
 	 */
 	reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
-			 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
+			 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
+			 BOOTMEM_DEFAULT);

 #ifndef CONFIG_XEN
 	/*
 	 * reserve physical page 0 - it's a special BIOS page on many boxes,
 	 * enabling clean reboots, SMP operation, laptop functions.
 	 */
-	reserve_bootmem(0, PAGE_SIZE);
+	reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);

 	/* reserve EBDA region, it's a 4K region */
 	reserve_ebda_region();
@@ -497,7 +668,7 @@ void __init setup_bootmem_allocator(void
        unless you have no PS/2 mouse plugged in. */
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
 	    boot_cpu_data.x86 == 6)
-	     reserve_bootmem(0xa0000 - 4096, 4096);
+	     reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);

 #ifdef CONFIG_SMP
 	/*
@@ -505,7 +676,7 @@ void __init setup_bootmem_allocator(void
 	 * FIXME: Don't need the extra page at 4K, but need to fix
 	 * trampoline before removing it. (see the GDT stuff)
 	 */
-	reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
+	reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
 #endif
 #ifdef CONFIG_ACPI_SLEEP
 	/*
@@ -513,29 +684,12 @@ void __init setup_bootmem_allocator(void
 	 */
 	acpi_reserve_bootmem();
 #endif
-	numa_kva_reserve();
 #endif /* !CONFIG_XEN */

 #ifdef CONFIG_BLK_DEV_INITRD
-	if (xen_start_info->mod_start) {
-		unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
-		unsigned long ramdisk_size  = xen_start_info->mod_len;
-		unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
-		unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
-
-		if (ramdisk_end <= end_of_lowmem) {
-			/*reserve_bootmem(ramdisk_image, ramdisk_size);*/
-			initrd_start = ramdisk_image + PAGE_OFFSET;
-			initrd_end = initrd_start+ramdisk_size;
-			initrd_below_start_ok = 1;
-		} else {
-			printk(KERN_ERR "initrd extends beyond end of memory "
-			       "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
-			       ramdisk_end, end_of_lowmem);
-			initrd_start = 0;
-		}
-	}
+	reserve_initrd();
 #endif
+	numa_kva_reserve();
 	reserve_crashkernel();
 }

@@ -602,20 +756,14 @@ void __init setup_arch(char **cmdline_p)
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
 	pre_setup_arch_hook();
 	early_cpu_init();
+	early_ioremap_init();
 #ifdef CONFIG_SMP
 	prefill_possible_map();
 #endif

-	/*
-	 * FIXME: This isn't an official loader_type right
-	 * now but does currently work with elilo.
-	 * If we were configured as an EFI kernel, check to make
-	 * sure that we were loaded correctly from elilo and that
-	 * the system table is valid.  If not, then initialize normally.
-	 */
 #ifdef CONFIG_EFI
-	if ((boot_params.hdr.type_of_loader == 0x50) &&
-	    boot_params.efi_info.efi_systab)
+	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+		     "EL32", 4))
 		efi_enabled = 1;
 #endif

@@ -655,12 +803,9 @@ void __init setup_arch(char **cmdline_p)
 #endif

 	ARCH_SETUP
-	if (efi_enabled)
-		efi_init();
-	else {
-		printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-		print_memory_map(memory_setup());
-	}
+
+	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+	print_memory_map(memory_setup());

 	copy_edd();

@@ -693,6 +838,17 @@ void __init setup_arch(char **cmdline_p)
 	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;

+	if (efi_enabled)
+		efi_init();
+
+	/* update e820 for memory not covered by WB MTRRs */
+	find_max_pfn();
+	mtrr_bp_init();
+#ifndef CONFIG_XEN
+	if (mtrr_trim_uncached_memory(max_pfn))
+		find_max_pfn();
+#endif
+
 	max_low_pfn = setup_memory();

 #ifdef CONFIG_VMI
@@ -717,6 +873,16 @@ void __init setup_arch(char **cmdline_p)
 	smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
 #endif
 	paging_init();
+
+	/*
+	 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
+	 */
+
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+	if (init_ohci1394_dma_early)
+		init_ohci1394_dma_on_all_controllers();
+#endif
+
 	remapped_pgdat_init();
 	sparse_init();
 	zone_sizes_init();
@@ -802,16 +968,20 @@ void __init setup_arch(char **cmdline_p)
 	 * NOTE: at this point the bootmem allocator is fully available.
 	 */

+#ifdef CONFIG_BLK_DEV_INITRD
+	relocate_initrd();
+#endif
+
 	paravirt_post_allocator_init();

 	if (is_initial_xendomain())
 		dmi_scan_machine();

+	io_delay_init();
+
 #ifdef CONFIG_X86_GENERICARCH
 	generic_apic_probe();
-#endif
-	if (efi_enabled)
-		efi_map_memmap();
+#endif

 	set_iopl.iopl = 1;
 	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
@@ -829,7 +999,7 @@ void __init setup_arch(char **cmdline_p)
 	acpi_boot_table_init();
 #endif

-#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
+#ifndef CONFIG_XEN
 	early_quirks();
 #endif

@@ -875,3 +1045,30 @@ xen_panic_event(struct notifier_block *t
 	/* we're never actually going to get here... */
 	return NOTIFY_DONE;
 }
+
+/*
+ * Request address space for all standard resources
+ *
+ * This is called just before pcibios_init(), which is also a
+ * subsys_initcall, but is linked in later (in arch/x86/pci/common.c).
+ */
+static int __init request_standard_resources(void)
+{
+	int i;
+
+	/* Nothing to do if not running in dom0. */
+	if (!is_initial_xendomain())
+		return 0;
+
+	printk(KERN_INFO "Setting up standard PCI resources\n");
+	init_iomem_resources(&code_resource, &data_resource, &bss_resource);
+
+	request_resource(&iomem_resource, &video_ram_resource);
+
+	/* request I/O space for devices used on all i[345]86 PCs */
+	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+		request_resource(&ioport_resource, &standard_io_resources[i]);
+	return 0;	/* request_resource() results are not checked */
+}
+
+subsys_initcall(request_standard_resources);
--- head-2010-04-29.orig/arch/x86/kernel/setup_64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/setup_64-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -15,7 +15,6 @@
 #include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/user.h>
-#include <linux/a.out.h>
 #include <linux/screen_info.h>
 #include <linux/ioport.h>
 #include <linux/delay.h>
@@ -30,6 +29,7 @@
 #include <linux/crash_dump.h>
 #include <linux/root_dev.h>
 #include <linux/pci.h>
+#include <linux/efi.h>
 #include <linux/acpi.h>
 #include <linux/kallsyms.h>
 #include <linux/edd.h>
@@ -39,10 +39,13 @@
 #include <linux/dmi.h>
 #include <linux/dma-mapping.h>
 #include <linux/ctype.h>
+#include <linux/uaccess.h>
+#include <linux/init_ohci1394_dma.h>

 #include <asm/mtrr.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
+#include <asm/vsyscall.h>
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/msr.h>
@@ -50,6 +53,7 @@
 #include <video/edid.h>
 #include <asm/e820.h>
 #include <asm/dma.h>
+#include <asm/gart.h>
 #include <asm/mpspec.h>
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
@@ -59,6 +63,9 @@
 #include <asm/sections.h>
 #include <asm/dmi.h>
 #include <asm/cacheflush.h>
+#include <asm/mce.h>
+#include <asm/ds.h>
+#include <asm/topology.h>
 #ifdef CONFIG_XEN
 #include <linux/percpu.h>
 #include <xen/interface/physdev.h>
@@ -108,6 +115,8 @@ EXPORT_SYMBOL(xen_start_info);
 struct cpuinfo_x86 boot_cpu_data __read_mostly;
 EXPORT_SYMBOL(boot_cpu_data);

+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
 unsigned long mmu_cr4_features;

 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
@@ -117,7 +126,7 @@ unsigned long saved_video_mode;

 int force_mwait __cpuinitdata;

-/*
+/*
  * Early DMI memory
  */
 int dmi_alloc_index;
@@ -163,25 +172,27 @@ struct resource standard_io_resources[]

 #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)

-struct resource data_resource = {
+static struct resource data_resource = {
 	.name = "Kernel data",
 	.start = 0,
 	.end = 0,
 	.flags = IORESOURCE_RAM,
 };
-struct resource code_resource = {
+static struct resource code_resource = {
 	.name = "Kernel code",
 	.start = 0,
 	.end = 0,
 	.flags = IORESOURCE_RAM,
 };
-struct resource bss_resource = {
+static struct resource bss_resource = {
 	.name = "Kernel bss",
 	.start = 0,
 	.end = 0,
 	.flags = IORESOURCE_RAM,
 };

+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
+
 #ifdef CONFIG_PROC_VMCORE
 /* elfcorehdr= specifies the location of elf core header
  * stored by the crashed kernel. This option will be passed
@@ -205,9 +216,10 @@ contig_initmem_init(unsigned long start_
 	unsigned long bootmap_size, bootmap;

 	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
-	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
+	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
+				 PAGE_SIZE);
 	if (bootmap == -1L)
-		panic("Cannot find bootmem map of size %ld\n",bootmap_size);
+		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
 	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
 	e820_register_active_regions(0, start_pfn, end_pfn);
 #ifdef CONFIG_XEN
@@ -216,8 +228,8 @@ contig_initmem_init(unsigned long start_
 	else
 #endif
 	free_bootmem_with_active_regions(0, end_pfn);
-	reserve_bootmem(bootmap, bootmap_size);
-}
+	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
+}
 #endif

 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
@@ -250,27 +262,35 @@ static inline void copy_edd(void)
 #ifndef CONFIG_XEN
 static void __init reserve_crashkernel(void)
 {
-	unsigned long long free_mem;
+	unsigned long long total_mem;
 	unsigned long long crash_size, crash_base;
 	int ret;

-	free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
+	total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;

-	ret = parse_crashkernel(boot_command_line, free_mem,
+	ret = parse_crashkernel(boot_command_line, total_mem,
 			&crash_size, &crash_base);
 	if (ret == 0 && crash_size) {
-		if (crash_base > 0) {
-			printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
-					"for crashkernel (System RAM: %ldMB)\n",
-					(unsigned long)(crash_size >> 20),
-					(unsigned long)(crash_base >> 20),
-					(unsigned long)(free_mem >> 20));
-			crashk_res.start = crash_base;
-			crashk_res.end   = crash_base + crash_size - 1;
-			reserve_bootmem(crash_base, crash_size);
-		} else
+		if (crash_base <= 0) {
 			printk(KERN_INFO "crashkernel reservation failed - "
 					"you have to specify a base address\n");
+			return;
+		}
+
+		if (reserve_bootmem(crash_base, crash_size,
+					BOOTMEM_EXCLUSIVE) < 0) {
+			printk(KERN_INFO "crashkernel reservation failed - "
+					"memory is in use\n");
+			return;
+		}
+
+		printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+				"for crashkernel (System RAM: %ldMB)\n",
+				(unsigned long)(crash_size >> 20),
+				(unsigned long)(crash_base >> 20),
+				(unsigned long)(total_mem >> 20));
+		crashk_res.start = crash_base;
+		crashk_res.end   = crash_base + crash_size - 1;
 	}
 }
 #else
@@ -281,37 +301,21 @@ static inline void __init reserve_crashk
 {}
 #endif

-#ifndef CONFIG_XEN
-#define EBDA_ADDR_POINTER 0x40E
-
-unsigned __initdata ebda_addr;
-unsigned __initdata ebda_size;
-
-static void discover_ebda(void)
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+void __attribute__((weak)) __init memory_setup(void)
 {
-	/*
-	 * there is a real-mode segmented pointer pointing to the
-	 * 4K EBDA area at 0x40E
-	 */
-	ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
-	ebda_addr <<= 4;
-
-	ebda_size = *(unsigned short *)__va(ebda_addr);
-
-	/* Round EBDA up to pages */
-	if (ebda_size == 0)
-		ebda_size = 1;
-	ebda_size <<= 10;
-	ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
-	if (ebda_size > 64*1024)
-		ebda_size = 64*1024;
+       machine_specific_memory_setup();
 }
-#else
-#define discover_ebda() ((void)0)
-#endif

+/*
+ * setup_arch - architecture-specific boot-time initializations
+ *
+ * Note: On x86_64, fixmaps are ready for use even before this is called.
+ */
 void __init setup_arch(char **cmdline_p)
 {
+	unsigned i;
+
 #ifdef CONFIG_XEN
 	extern struct e820map machine_e820;

@@ -320,6 +324,11 @@ void __init setup_arch(char **cmdline_p)
 	/* Register a call for panic conditions. */
 	atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);

+	WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+				     VMASST_TYPE_writable_pagetables));
+
+	early_ioremap_init();
+
  	ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
 	screen_info = boot_params.screen_info;

@@ -336,11 +345,6 @@ void __init setup_arch(char **cmdline_p)
 		screen_info.orig_video_isVGA = 0;

 	copy_edid();
-
-	WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
-				     VMASST_TYPE_writable_pagetables));
-
-	ARCH_SETUP
 #else
 	printk(KERN_INFO "Command line: %s\n", boot_command_line);

@@ -356,7 +360,15 @@ void __init setup_arch(char **cmdline_p)
 	rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
 	rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
 #endif
-	setup_memory_region();
+#ifdef CONFIG_EFI
+	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+		     "EL64", 4))
+		efi_enabled = 1;
+#endif
+
+	ARCH_SETUP
+
+	memory_setup();
 	copy_edd();

 	if (!boot_params.hdr.root_flags)
@@ -380,28 +392,51 @@ void __init setup_arch(char **cmdline_p)

 	parse_early_param();

+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+	if (init_ohci1394_dma_early)
+		init_ohci1394_dma_on_all_controllers();
+#endif
+
 	finish_e820_parsing();

+	early_gart_iommu_check();
+
 	e820_register_active_regions(0, 0, -1UL);
 	/*
 	 * partially used pages are not usable - thus
 	 * we are rounding upwards:
 	 */
 	end_pfn = e820_end_of_ram();
+	/* update e820 for memory not covered by WB MTRRs */
+	mtrr_bp_init();
+#ifndef CONFIG_XEN
+	if (mtrr_trim_uncached_memory(end_pfn)) {
+		e820_register_active_regions(0, 0, -1UL);
+		end_pfn = e820_end_of_ram();
+	}
+#endif
+
 	num_physpages = end_pfn;
+	max_mapnr = end_pfn;

 	check_efer();

-	discover_ebda();
-
 	init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
+	if (efi_enabled)
+		efi_init();

 	if (is_initial_xendomain())
 		dmi_scan_machine();

+	io_delay_init();
+
 #if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
-	/* setup to use the static apicid table during kernel startup */
-	x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init;
+	/* setup to use the early static init tables during kernel startup */
+	x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
+	x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
+#ifdef CONFIG_NUMA
+	x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
+#endif
 #endif

 	/* How many end-of-memory variables you have, grandma! */
@@ -420,54 +455,25 @@ void __init setup_arch(char **cmdline_p)
 #endif

 #ifdef CONFIG_NUMA
-	numa_initmem_init(0, end_pfn);
+	numa_initmem_init(0, end_pfn);
 #else
 	contig_initmem_init(0, end_pfn);
 #endif

-#ifdef CONFIG_XEN
-	/*
-	 * Reserve kernel, physmap, start info, initial page tables, and
-	 * direct mapping.
-	 */
-	reserve_bootmem_generic(__pa_symbol(&_text),
-	                        (table_end << PAGE_SHIFT) - __pa_symbol(&_text));
-#else
-	/* Reserve direct mapping */
-	reserve_bootmem_generic(table_start << PAGE_SHIFT,
-				(table_end - table_start) << PAGE_SHIFT);
-
-	/* reserve kernel */
-	reserve_bootmem_generic(__pa_symbol(&_text),
-				__pa_symbol(&_end) - __pa_symbol(&_text));
+	early_res_to_bootmem();

+#ifndef CONFIG_XEN
+#ifdef CONFIG_ACPI_SLEEP
 	/*
-	 * reserve physical page 0 - it's a special BIOS page on many boxes,
-	 * enabling clean reboots, SMP operation, laptop functions.
+	 * Reserve low memory region for sleep support.
 	 */
-	reserve_bootmem_generic(0, PAGE_SIZE);
-
-	/* reserve ebda region */
-	if (ebda_addr)
-		reserve_bootmem_generic(ebda_addr, ebda_size);
-#ifdef CONFIG_NUMA
-	/* reserve nodemap region */
-	if (nodemap_addr)
-		reserve_bootmem_generic(nodemap_addr, nodemap_size);
+       acpi_reserve_bootmem();
 #endif

-#ifdef CONFIG_SMP
-	/* Reserve SMP trampoline */
-	reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
-#endif
+	if (efi_enabled)
+		efi_reserve_bootmem();
 #endif

-#ifdef CONFIG_ACPI_SLEEP
-       /*
-        * Reserve low memory region for sleep support.
-        */
-       acpi_reserve_bootmem();
-#endif
 #ifdef CONFIG_BLK_DEV_INITRD
 #ifdef CONFIG_XEN
 	if (xen_start_info->mod_start) {
@@ -491,6 +497,8 @@ void __init setup_arch(char **cmdline_p)
 			initrd_below_start_ok = 1;
 #endif
 		} else {
+			/* Assumes everything on node 0 */
+			free_bootmem(ramdisk_image, ramdisk_size);
 			printk(KERN_ERR "initrd extends beyond end of memory "
 			       "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
 			       ramdisk_end, end_of_mem);
@@ -500,10 +508,11 @@ void __init setup_arch(char **cmdline_p)
 #endif
 	reserve_crashkernel();
 	paging_init();
+	map_vsyscall();
 #ifdef CONFIG_X86_LOCAL_APIC
 	/*
-	 * Find and reserve possible boot-time SMP configuration:
-	 */
+	* Find and reserve possible boot-time SMP configuration:
+	*/
 	find_smp_config();
 #endif
 #ifdef CONFIG_XEN
@@ -591,16 +600,10 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif

-#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
+#ifndef CONFIG_XEN
 	early_quirks();
 #endif

-	/*
-	 * set this early, so we dont allocate cpu0
-	 * if MADT list doesnt list BSP first
-	 * mpparse.c/MP_processor_info() allocates logical cpu numbers.
-	 */
-	cpu_set(0, cpu_present_map);
 #ifdef CONFIG_ACPI
 	/*
 	 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
@@ -624,6 +627,7 @@ void __init setup_arch(char **cmdline_p)
 		get_smp_config();
 #ifndef CONFIG_XEN
 	init_apic_mappings();
+	ioapic_init_mappings();
 #endif
 #endif
 #if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
@@ -635,18 +639,17 @@ void __init setup_arch(char **cmdline_p)
 	 */
 #ifdef CONFIG_XEN
 	if (is_initial_xendomain())
-		e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
+		e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
+				       &code_resource, &data_resource, &bss_resource);
 #else
-	e820_reserve_resources(e820.map, e820.nr_map);
+	e820_reserve_resources(e820.map, e820.nr_map,
+			       &code_resource, &data_resource, &bss_resource);
 	e820_mark_nosave_regions();
 #endif

-	{
-	unsigned i;
 	/* request I/O space for devices used on all i[345]86 PCs */
 	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
 		request_resource(&ioport_resource, &standard_io_resources[i]);
-	}

 #ifdef CONFIG_XEN
 	if (is_initial_xendomain())
@@ -680,7 +683,8 @@ void __init setup_arch(char **cmdline_p)

 #ifdef CONFIG_VT
 #if defined(CONFIG_VGA_CONSOLE)
-	conswitchp = &vga_con;
+	if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+		conswitchp = &vga_con;
 #elif defined(CONFIG_DUMMY_CONSOLE)
 	conswitchp = &dummy_con;
 #endif
@@ -724,9 +728,10 @@ static void __cpuinit display_cacheinfo(

 	if (n >= 0x80000005) {
 		cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
-		printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
-			edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
-		c->x86_cache_size=(ecx>>24)+(edx>>24);
+		printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
+		       "D cache %dK (%d bytes/line)\n",
+		       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+		c->x86_cache_size = (ecx>>24) + (edx>>24);
 		/* On K8 L1 TLB is inclusive, so don't count it */
 		c->x86_tlbsize = 0;
 	}
@@ -740,27 +745,25 @@ static void __cpuinit display_cacheinfo(
 		printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
 		c->x86_cache_size, ecx & 0xFF);
 	}
-
-	if (n >= 0x80000007)
-		cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
 	if (n >= 0x80000008) {
-		cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
+		cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
 		c->x86_virt_bits = (eax >> 8) & 0xff;
 		c->x86_phys_bits = eax & 0xff;
 	}
 }

 #ifdef CONFIG_NUMA
-static int nearby_node(int apicid)
+static int __cpuinit nearby_node(int apicid)
 {
-	int i;
+	int i, node;
+
 	for (i = apicid - 1; i >= 0; i--) {
-		int node = apicid_to_node[i];
+		node = apicid_to_node[i];
 		if (node != NUMA_NO_NODE && node_online(node))
 			return node;
 	}
 	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
-		int node = apicid_to_node[i];
+		node = apicid_to_node[i];
 		if (node != NUMA_NO_NODE && node_online(node))
 			return node;
 	}
@@ -772,7 +775,7 @@ static int nearby_node(int apicid)
  * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
  * Assumes number of cores is a power of two.
  */
-static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
+static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
 	unsigned bits;
@@ -781,7 +784,54 @@ static void __init amd_detect_cmp(struct
 	int node = 0;
 	unsigned apicid = hard_smp_processor_id();
 #endif
-	unsigned ecx = cpuid_ecx(0x80000008);
+	bits = c->x86_coreid_bits;
+
+	/* Low order bits define the core id (index of core in socket) */
+	c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
+	/* Convert the APIC ID into the socket ID */
+	c->phys_proc_id = phys_pkg_id(bits);
+
+#ifdef CONFIG_NUMA
+	node = c->phys_proc_id;
+	if (apicid_to_node[apicid] != NUMA_NO_NODE)
+		node = apicid_to_node[apicid];
+	if (!node_online(node)) {
+		/* Two possibilities here:
+		   - The CPU is missing memory and no node was created.
+		   In that case try picking one from a nearby CPU
+		   - The APIC IDs differ from the HyperTransport node IDs
+		   which the K8 northbridge parsing fills in.
+		   Assume they are all increased by a constant offset,
+		   but in the same order as the HT nodeids.
+		   If that doesn't result in a usable node fall back to the
+		   path for the previous case.  */
+
+		int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
+
+		if (ht_nodeid >= 0 &&
+		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+			node = apicid_to_node[ht_nodeid];
+		/* Pick a nearby node */
+		if (!node_online(node))
+			node = nearby_node(apicid);
+	}
+	numa_set_node(cpu, node);
+
+	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+#endif
+}
+
+static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	unsigned bits, ecx;
+
+	/* Multi core CPU? */
+	if (c->extended_cpuid_level < 0x80000008)
+		return;
+
+	ecx = cpuid_ecx(0x80000008);

 	c->x86_max_cores = (ecx & 0xff) + 1;

@@ -794,37 +844,8 @@ static void __init amd_detect_cmp(struct
 			bits++;
 	}

-	/* Low order bits define the core id (index of core in socket) */
-	c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
-	/* Convert the APIC ID into the socket ID */
-	c->phys_proc_id = phys_pkg_id(bits);
-
-#ifdef CONFIG_NUMA
-  	node = c->phys_proc_id;
- 	if (apicid_to_node[apicid] != NUMA_NO_NODE)
- 		node = apicid_to_node[apicid];
- 	if (!node_online(node)) {
- 		/* Two possibilities here:
- 		   - The CPU is missing memory and no node was created.
- 		   In that case try picking one from a nearby CPU
- 		   - The APIC IDs differ from the HyperTransport node IDs
- 		   which the K8 northbridge parsing fills in.
- 		   Assume they are all increased by a constant offset,
- 		   but in the same order as the HT nodeids.
- 		   If that doesn't result in a usable node fall back to the
- 		   path for the previous case.  */
-		int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
- 		if (ht_nodeid >= 0 &&
- 		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
- 			node = apicid_to_node[ht_nodeid];
- 		/* Pick a nearby node */
- 		if (!node_online(node))
- 			node = nearby_node(apicid);
- 	}
-	numa_set_node(cpu, node);
+	c->x86_coreid_bits = bits;

-	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
 #endif
 }

@@ -841,8 +862,8 @@ static void __init amd_detect_cmp(struct
 /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
 static __cpuinit int amd_apic_timer_broken(void)
 {
-	u32 lo, hi;
-	u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
+	u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
+
 	switch (eax & CPUID_XFAM) {
 	case CPUID_XFAM_K8:
 		if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
@@ -861,6 +882,15 @@ static __cpuinit int amd_apic_timer_brok
 }
 #endif

+static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+{
+	early_init_amd_mc(c);
+
+	/* c->x86_power holds CPUID 0x80000007 EDX; bit 8 = constant TSC */
+	if (c->x86_power & (1<<8))
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 {
 	unsigned level;
@@ -871,7 +901,7 @@ static void __cpuinit init_amd(struct cp
 	/*
 	 * Disable TLB flush filter by setting HWCR.FFDIS on K8
 	 * bit 6 of msr C001_0015
- 	 *
+	 *
 	 * Errata 63 for SH-B3 steppings
 	 * Errata 122 for all steppings (F+ have it disabled by default)
 	 */
@@ -884,35 +914,32 @@ static void __cpuinit init_amd(struct cp

 	/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
 	   3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
-	clear_bit(0*32+31, &c->x86_capability);
-
+	clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
+
 	/* On C+ stepping K8 rep microcode works well for copy/memset */
 	level = cpuid_eax(1);
-	if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
-		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
+	if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
+			     level >= 0x0f58))
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
 	if (c->x86 == 0x10 || c->x86 == 0x11)
-		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);

 	/* Enable workaround for FXSAVE leak */
 	if (c->x86 >= 6)
-		set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
+		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);

 	level = get_model_name(c);
 	if (!level) {
-		switch (c->x86) {
+		switch (c->x86) {
 		case 15:
 			/* Should distinguish Models here, but this is only
 			   a fallback anyways. */
 			strcpy(c->x86_model_id, "Hammer");
-			break;
-		}
-	}
+			break;
+		}
+	}
 	display_cacheinfo(c);

-	/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
-	if (c->x86_power & (1<<8))
-		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
-
 	/* Multi core CPU? */
 	if (c->extended_cpuid_level >= 0x80000008)
 		amd_detect_cmp(c);
@@ -924,14 +951,10 @@ static void __cpuinit init_amd(struct cp
 		num_cache_leaves = 3;

 	if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
-		set_bit(X86_FEATURE_K8, &c->x86_capability);
-
-	/* RDTSC can be speculated around */
-	clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+		set_cpu_cap(c, X86_FEATURE_K8);

-	/* Family 10 doesn't support C states in MWAIT so don't use it */
-	if (c->x86 == 0x10 && !force_mwait)
-		clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
+	/* MFENCE stops RDTSC speculation */
+	set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);

 #ifndef CONFIG_XEN
 	if (amd_apic_timer_broken())
@@ -939,28 +962,29 @@ static void __cpuinit init_amd(struct cp
 #endif
 }

-static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
-	u32 	eax, ebx, ecx, edx;
-	int 	index_msb, core_bits;
+	u32 eax, ebx, ecx, edx;
+	int index_msb, core_bits;

 	cpuid(1, &eax, &ebx, &ecx, &edx);


 	if (!cpu_has(c, X86_FEATURE_HT))
 		return;
- 	if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+	if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
 		goto out;

 	smp_num_siblings = (ebx & 0xff0000) >> 16;

 	if (smp_num_siblings == 1) {
 		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-	} else if (smp_num_siblings > 1 ) {
+	} else if (smp_num_siblings > 1) {

 		if (smp_num_siblings > NR_CPUS) {
-			printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
+			printk(KERN_WARNING "CPU: Unsupported number of "
+			       "siblings %d\n", smp_num_siblings);
 			smp_num_siblings = 1;
 			return;
 		}
@@ -970,7 +994,7 @@ static void __cpuinit detect_ht(struct c

 		smp_num_siblings = smp_num_siblings / c->x86_max_cores;

-		index_msb = get_count_order(smp_num_siblings) ;
+		index_msb = get_count_order(smp_num_siblings);

 		core_bits = get_count_order(c->x86_max_cores);

@@ -979,8 +1003,10 @@ static void __cpuinit detect_ht(struct c
 	}
 out:
 	if ((c->x86_max_cores * smp_num_siblings) > 1) {
-		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
-		printk(KERN_INFO  "CPU: Processor Core ID: %d\n", c->cpu_core_id);
+		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+		       c->phys_proc_id);
+		printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+		       c->cpu_core_id);
 	}

 #endif
@@ -1004,7 +1030,7 @@ static int __cpuinit intel_num_cpu_cores
 		return 1;
 }

-static void srat_detect_node(void)
+static void __cpuinit srat_detect_node(void)
 {
 #ifdef CONFIG_NUMA
 	unsigned node;
@@ -1014,7 +1040,7 @@ static void srat_detect_node(void)
 	/* Don't do the funky fallback heuristics the AMD version employs
 	   for now. */
 	node = apicid_to_node[apicid];
-	if (node == NUMA_NO_NODE)
+	if (node == NUMA_NO_NODE || !node_online(node))
 		node = first_node(node_online_map);
 	numa_set_node(cpu, node);

@@ -1022,28 +1048,39 @@ static void srat_detect_node(void)
 #endif
 }

+static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
+{
+	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 {
 	/* Cache sizes */
 	unsigned n;

 	init_intel_cacheinfo(c);
-	if (c->cpuid_level > 9 ) {
+	if (c->cpuid_level > 9) {
 		unsigned eax = cpuid_eax(10);
 		/* Check for version and the number of counters */
 		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
-			set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
+			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
 	}

 	if (cpu_has_ds) {
 		unsigned int l1, l2;
 		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
 		if (!(l1 & (1<<11)))
-			set_bit(X86_FEATURE_BTS, c->x86_capability);
+			set_cpu_cap(c, X86_FEATURE_BTS);
 		if (!(l1 & (1<<12)))
-			set_bit(X86_FEATURE_PEBS, c->x86_capability);
+			set_cpu_cap(c, X86_FEATURE_PEBS);
 	}

+
+	if (cpu_has_bts)
+		ds_init_intel(c);
+
 	n = c->extended_cpuid_level;
 	if (n >= 0x80000008) {
 		unsigned eax = cpuid_eax(0x80000008);
@@ -1060,14 +1097,11 @@ static void __cpuinit init_intel(struct
 		c->x86_cache_alignment = c->x86_clflush_size * 2;
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
 	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
-		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 	if (c->x86 == 6)
-		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
-	if (c->x86 == 15)
-		set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
-	else
-		clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
- 	c->x86_max_cores = intel_num_cpu_cores(c);
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+	c->x86_max_cores = intel_num_cpu_cores(c);

 	srat_detect_node();
 }
@@ -1084,18 +1118,12 @@ static void __cpuinit get_cpu_vendor(str
 		c->x86_vendor = X86_VENDOR_UNKNOWN;
 }

-struct cpu_model_info {
-	int vendor;
-	int family;
-	char *model_names[16];
-};
-
 /* Do some early cpuid on the boot CPU to get some parameter that are
    needed before check_bugs. Everything advanced is in identify_cpu
    below. */
-void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 {
-	u32 tfms;
+	u32 tfms, xlvl;

 	c->loops_per_jiffy = loops_per_jiffy;
 	c->x86_cache_size = -1;
@@ -1106,6 +1134,7 @@ void __cpuinit early_identify_cpu(struct
 	c->x86_clflush_size = 64;
 	c->x86_cache_alignment = c->x86_clflush_size;
 	c->x86_max_cores = 1;
+	c->x86_coreid_bits = 0;
 	c->extended_cpuid_level = 0;
 	memset(&c->x86_capability, 0, sizeof c->x86_capability);

@@ -1114,7 +1143,7 @@ void __cpuinit early_identify_cpu(struct
 	      (unsigned int *)&c->x86_vendor_id[0],
 	      (unsigned int *)&c->x86_vendor_id[8],
 	      (unsigned int *)&c->x86_vendor_id[4]);
-
+
 	get_cpu_vendor(c);

 	/* Initialize the standard set of capabilities */
@@ -1132,7 +1161,7 @@ void __cpuinit early_identify_cpu(struct
 			c->x86 += (tfms >> 20) & 0xff;
 		if (c->x86 >= 0x6)
 			c->x86_model += ((tfms >> 16) & 0xF) << 4;
-		if (c->x86_capability[0] & (1<<19))
+		if (c->x86_capability[0] & (1<<19))
 			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
 	} else {
 		/* Have CPUID level 0 only - unheard of */
@@ -1142,18 +1171,6 @@ void __cpuinit early_identify_cpu(struct
 #ifdef CONFIG_SMP
 	c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
 #endif
-}
-
-/*
- * This does the hard work of actually picking apart the CPU stuff...
- */
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
-{
-	int i;
-	u32 xlvl;
-
-	early_identify_cpu(c);
-
 	/* AMD-defined flags: level 0x80000001 */
 	xlvl = cpuid_eax(0x80000000);
 	c->extended_cpuid_level = xlvl;
@@ -1174,6 +1191,30 @@ void __cpuinit identify_cpu(struct cpuin
 			c->x86_capability[2] = cpuid_edx(0x80860001);
 	}

+	c->extended_cpuid_level = cpuid_eax(0x80000000);
+	if (c->extended_cpuid_level >= 0x80000007)
+		c->x86_power = cpuid_edx(0x80000007);
+
+	switch (c->x86_vendor) {
+	case X86_VENDOR_AMD:
+		early_init_amd(c);
+		break;
+	case X86_VENDOR_INTEL:
+		early_init_intel(c);
+		break;
+	}
+
+}
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+{
+	int i;
+
+	early_identify_cpu(c);
+
 	init_scattered_cpuid_features(c);

 #ifndef CONFIG_XEN
@@ -1205,8 +1246,7 @@ void __cpuinit identify_cpu(struct cpuin
 		break;
 	}

-	select_idle_routine(c);
-	detect_ht(c);
+	detect_ht(c);

 	/*
 	 * On SMP, boot_cpu_data holds the common feature set between
@@ -1216,31 +1256,55 @@ void __cpuinit identify_cpu(struct cpuin
 	 */
 	if (c != &boot_cpu_data) {
 		/* AND the already accumulated flags with these */
-		for (i = 0 ; i < NCAPINTS ; i++)
+		for (i = 0; i < NCAPINTS; i++)
 			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
 	}

+	/* Clear all flags overridden by options */
+	for (i = 0; i < NCAPINTS; i++)
+		c->x86_capability[i] &= ~cleared_cpu_caps[i];
+
 #ifdef CONFIG_X86_MCE
 	mcheck_init(c);
 #endif
+	select_idle_routine(c);
+
 	if (c != &boot_cpu_data)
 		mtrr_ap_init();
 #ifdef CONFIG_NUMA
 	numa_add_cpu(smp_processor_id());
 #endif
+
 }
-
+
+static __init int setup_noclflush(char *arg)
+{
+	setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+	return 1;
+}
+__setup("noclflush", setup_noclflush);

 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 {
 	if (c->x86_model_id[0])
-		printk("%s", c->x86_model_id);
+		printk(KERN_CONT "%s", c->x86_model_id);
+
+	if (c->x86_mask || c->cpuid_level >= 0)
+		printk(KERN_CONT " stepping %02x\n", c->x86_mask);
+	else
+		printk(KERN_CONT "\n");
+}

-	if (c->x86_mask || c->cpuid_level >= 0)
-		printk(" stepping %02x\n", c->x86_mask);
+static __init int setup_disablecpuid(char *arg)
+{
+	int bit;	/* feature bit number parsed from clearcpuid=<n> */
+	if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS*32)
+		setup_clear_cpu_cap(bit);
 	else
-		printk("\n");
+		return 0;
+	return 1;
 }
+__setup("clearcpuid=", setup_disablecpuid);

 /*
  *	Get CPU information for use by the procfs.
@@ -1249,116 +1313,41 @@ void __cpuinit print_cpu_info(struct cpu
 static int show_cpuinfo(struct seq_file *m, void *v)
 {
 	struct cpuinfo_x86 *c = v;
-	int cpu = 0;
-
-	/*
-	 * These flag bits must match the definitions in <asm/cpufeature.h>.
-	 * NULL means this bit is undefined or reserved; either way it doesn't
-	 * have meaning as far as Linux is concerned.  Note that it's important
-	 * to realize there is a difference between this table and CPUID -- if
-	 * applications want to get the raw CPUID data, they should access
-	 * /dev/cpu/<cpu_nr>/cpuid instead.
-	 */
-	static const char *const x86_cap_flags[] = {
-		/* Intel-defined */
-	        "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
-	        "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
-	        "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
-	        "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
-
-		/* AMD-defined */
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
-		NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
-		"3dnowext", "3dnow",
-
-		/* Transmeta-defined */
-		"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* Other (Linux-defined) */
-		"cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
-		NULL, NULL, NULL, NULL,
-		"constant_tsc", "up", NULL, "arch_perfmon",
-		"pebs", "bts", NULL, "sync_rdtsc",
-		"rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* Intel-defined (#2) */
-		"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
-		"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
-		NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* VIA/Cyrix/Centaur-defined */
-		NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
-		"ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* AMD-defined (#2) */
-		"lahf_lm", "cmp_legacy", "svm", "extapic",
-		"cr8_legacy", "abm", "sse4a", "misalignsse",
-		"3dnowprefetch", "osvw", "ibs", "sse5",
-		"skinit", "wdt", NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-		/* Auxiliary (Linux-defined) */
-		"ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	};
-	static const char *const x86_power_flags[] = {
-		"ts",	/* temperature sensor */
-		"fid",  /* frequency id control */
-		"vid",  /* voltage id control */
-		"ttp",  /* thermal trip */
-		"tm",
-		"stc",
-		"100mhzsteps",
-		"hwpstate",
-		"",	/* tsc invariant mapped to constant_tsc */
-		/* nothing */
-	};
-
+	int cpu = 0, i;

 #ifdef CONFIG_SMP
 	cpu = c->cpu_index;
 #endif

-	seq_printf(m,"processor\t: %u\n"
-		     "vendor_id\t: %s\n"
-		     "cpu family\t: %d\n"
-		     "model\t\t: %d\n"
-		     "model name\t: %s\n",
-		     (unsigned)cpu,
-		     c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
-		     c->x86,
-		     (int)c->x86_model,
-		     c->x86_model_id[0] ? c->x86_model_id : "unknown");
-
+	seq_printf(m, "processor\t: %u\n"
+		   "vendor_id\t: %s\n"
+		   "cpu family\t: %d\n"
+		   "model\t\t: %d\n"
+		   "model name\t: %s\n",
+		   (unsigned)cpu,
+		   c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
+		   c->x86,
+		   (int)c->x86_model,
+		   c->x86_model_id[0] ? c->x86_model_id : "unknown");
+
 	if (c->x86_mask || c->cpuid_level >= 0)
 		seq_printf(m, "stepping\t: %d\n", c->x86_mask);
 	else
 		seq_printf(m, "stepping\t: unknown\n");
-
-	if (cpu_has(c,X86_FEATURE_TSC)) {
+
+	if (cpu_has(c, X86_FEATURE_TSC)) {
 		unsigned int freq = cpufreq_quick_get((unsigned)cpu);
+
 		if (!freq)
 			freq = cpu_khz;
 		seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
-			     freq / 1000, (freq % 1000));
+			   freq / 1000, (freq % 1000));
 	}

 	/* Cache size */
-	if (c->x86_cache_size >= 0)
+	if (c->x86_cache_size >= 0)
 		seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
-
+
 #ifdef CONFIG_SMP
 	if (smp_num_siblings * c->x86_max_cores > 1) {
 		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
@@ -1367,48 +1356,43 @@ static int show_cpuinfo(struct seq_file
 		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
 		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
 	}
-#endif
+#endif

 	seq_printf(m,
-	        "fpu\t\t: yes\n"
-	        "fpu_exception\t: yes\n"
-	        "cpuid level\t: %d\n"
-	        "wp\t\t: yes\n"
-	        "flags\t\t:",
+		   "fpu\t\t: yes\n"
+		   "fpu_exception\t: yes\n"
+		   "cpuid level\t: %d\n"
+		   "wp\t\t: yes\n"
+		   "flags\t\t:",
 		   c->cpuid_level);

-	{
-		int i;
-		for ( i = 0 ; i < 32*NCAPINTS ; i++ )
-			if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
-				seq_printf(m, " %s", x86_cap_flags[i]);
-	}
-
+	for (i = 0; i < 32*NCAPINTS; i++)
+		if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
+			seq_printf(m, " %s", x86_cap_flags[i]);
+
 	seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
 		   c->loops_per_jiffy/(500000/HZ),
 		   (c->loops_per_jiffy/(5000/HZ)) % 100);

-	if (c->x86_tlbsize > 0)
+	if (c->x86_tlbsize > 0)
 		seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
 	seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
 	seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);

-	seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
+	seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
 		   c->x86_phys_bits, c->x86_virt_bits);

 	seq_printf(m, "power management:");
-	{
-		unsigned i;
-		for (i = 0; i < 32; i++)
-			if (c->x86_power & (1 << i)) {
-				if (i < ARRAY_SIZE(x86_power_flags) &&
-					x86_power_flags[i])
-					seq_printf(m, "%s%s",
-						x86_power_flags[i][0]?" ":"",
-						x86_power_flags[i]);
-				else
-					seq_printf(m, " [%d]", i);
-			}
+	for (i = 0; i < 32; i++) {
+		if (c->x86_power & (1 << i)) {
+			if (i < ARRAY_SIZE(x86_power_flags) &&
+			    x86_power_flags[i])
+				seq_printf(m, "%s%s",
+					   x86_power_flags[i][0]?" ":"",
+					   x86_power_flags[i]);
+			else
+				seq_printf(m, " [%d]", i);
+		}
 	}

 	seq_printf(m, "\n\n");
@@ -1435,8 +1419,8 @@ static void c_stop(struct seq_file *m, v
 {
 }

-struct seq_operations cpuinfo_op = {
-	.start =c_start,
+const struct seq_operations cpuinfo_op = {
+	.start = c_start,
 	.next =	c_next,
 	.stop =	c_stop,
 	.show =	show_cpuinfo,
--- head-2010-04-29.orig/arch/x86/kernel/smp_32-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/smp_32-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -168,7 +168,7 @@ void __send_IPI_shortcut(unsigned int sh
 	}
 }

-void fastcall send_IPI_self(int vector)
+void send_IPI_self(int vector)
 {
 	__send_IPI_shortcut(APIC_DEST_SELF, vector);
 }
@@ -224,13 +224,14 @@ static DEFINE_SPINLOCK(tlbstate_lock);
  * We need to reload %cr3 since the page tables may be going
  * away from under us..
  */
-void leave_mm(unsigned long cpu)
+void leave_mm(int cpu)
 {
 	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
 		BUG();
 	cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
 	load_cr3(swapper_pg_dir);
 }
+EXPORT_SYMBOL_GPL(leave_mm);

 /*
  *
--- head-2010-04-29.orig/arch/x86/kernel/smp_64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/smp_64-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -33,7 +33,7 @@

 #ifndef CONFIG_XEN
 /*
- *	Smarter SMP flushing macros.
+ *	Smarter SMP flushing macros.
  *		c/o Linus Torvalds.
  *
  *	These mean you can really definitely utterly forget about
@@ -41,15 +41,15 @@
  *
  *	Optimizations Manfred Spraul <manfred@colorfullife.com>
  *
- * 	More scalable flush, from Andi Kleen
+ *	More scalable flush, from Andi Kleen
  *
- * 	To avoid global state use 8 different call vectors.
- * 	Each CPU uses a specific vector to trigger flushes on other
- * 	CPUs. Depending on the received vector the target CPUs look into
+ *	To avoid global state use 8 different call vectors.
+ *	Each CPU uses a specific vector to trigger flushes on other
+ *	CPUs. Depending on the received vector the target CPUs look into
  *	the right per cpu variable for the flush data.
  *
- * 	With more than 8 CPUs they are hashed to the 8 available
- * 	vectors. The limited global vector space forces us to this right now.
+ *	With more than 8 CPUs they are hashed to the 8 available
+ *	vectors. The limited global vector space forces us to this right now.
  *	In future when interrupts are split into per CPU domains this could be
  *	fixed, at the cost of triggering multiple IPIs in some cases.
  */
@@ -59,7 +59,6 @@ union smp_flush_state {
 		cpumask_t flush_cpumask;
 		struct mm_struct *flush_mm;
 		unsigned long flush_va;
-#define FLUSH_ALL	-1ULL
 		spinlock_t tlbstate_lock;
 	};
 	char pad[SMP_CACHE_BYTES];
@@ -71,16 +70,17 @@ union smp_flush_state {
 static DEFINE_PER_CPU(union smp_flush_state, flush_state);

 /*
- * We cannot call mmdrop() because we are in interrupt context,
+ * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
  */
-static inline void leave_mm(unsigned long cpu)
+void leave_mm(int cpu)
 {
 	if (read_pda(mmu_state) == TLBSTATE_OK)
 		BUG();
 	cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
 	load_cr3(swapper_pg_dir);
 }
+EXPORT_SYMBOL_GPL(leave_mm);

 /*
  *
@@ -89,25 +89,25 @@ static inline void leave_mm(unsigned lon
  * 1) switch_mm() either 1a) or 1b)
  * 1a) thread switch to a different mm
  * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- * 	Stop ipi delivery for the old mm. This is not synchronized with
- * 	the other cpus, but smp_invalidate_interrupt ignore flush ipis
- * 	for the wrong mm, and in the worst case we perform a superfluous
- * 	tlb flush.
+ *	Stop ipi delivery for the old mm. This is not synchronized with
+ *	the other cpus, but smp_invalidate_interrupt ignore flush ipis
+ *	for the wrong mm, and in the worst case we perform a superfluous
+ *	tlb flush.
  * 1a2) set cpu mmu_state to TLBSTATE_OK
- * 	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
  *	was in lazy tlb mode.
  * 1a3) update cpu active_mm
- * 	Now cpu0 accepts tlb flushes for the new mm.
+ *	Now cpu0 accepts tlb flushes for the new mm.
  * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- * 	Now the other cpus will send tlb flush ipis.
+ *	Now the other cpus will send tlb flush ipis.
  * 1a4) change cr3.
  * 1b) thread switch without mm change
  *	cpu active_mm is correct, cpu0 already handles
  *	flush ipis.
  * 1b1) set cpu mmu_state to TLBSTATE_OK
  * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- * 	Atomically set the bit [other cpus will start sending flush ipis],
- * 	and test the bit.
+ *	Atomically set the bit [other cpus will start sending flush ipis],
+ *	and test the bit.
  * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
  * 2) switch %%esp, ie current
  *
@@ -141,12 +141,12 @@ asmlinkage void smp_invalidate_interrupt
 	 * orig_rax contains the negated interrupt vector.
 	 * Use that to determine where the sender put the data.
 	 */
-	sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
+	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
 	f = &per_cpu(flush_state, sender);

 	if (!cpu_isset(cpu, f->flush_cpumask))
 		goto out;
-		/*
+		/*
 		 * This was a BUG() but until someone can quote me the
 		 * line from the intel manual that guarantees an IPI to
 		 * multiple CPUs is retried _only_ on the erroring CPUs
@@ -154,10 +154,10 @@ asmlinkage void smp_invalidate_interrupt
 		 *
 		 * BUG();
 		 */
-
+
 	if (f->flush_mm == read_pda(active_mm)) {
 		if (read_pda(mmu_state) == TLBSTATE_OK) {
-			if (f->flush_va == FLUSH_ALL)
+			if (f->flush_va == TLB_FLUSH_ALL)
 				local_flush_tlb();
 			else
 				__flush_tlb_one(f->flush_va);
@@ -170,19 +170,22 @@ out:
 	add_pda(irq_tlb_count, 1);
 }

-static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
-						unsigned long va)
+void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
+			     unsigned long va)
 {
 	int sender;
 	union smp_flush_state *f;
+	cpumask_t cpumask = *cpumaskp;

 	/* Caller has disabled preemption */
 	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
 	f = &per_cpu(flush_state, sender);

-	/* Could avoid this lock when
-	   num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
-	   probably not worth checking this for a cache-hot lock. */
+	/*
+	 * Could avoid this lock when
+	 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
+	 * probably not worth checking this for a cache-hot lock.
+	 */
 	spin_lock(&f->tlbstate_lock);

 	f->flush_mm = mm;
@@ -206,14 +209,14 @@ static void flush_tlb_others(cpumask_t c
 int __cpuinit init_smp_flush(void)
 {
 	int i;
+
 	for_each_cpu_mask(i, cpu_possible_map) {
 		spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
 	}
 	return 0;
 }
-
 core_initcall(init_smp_flush);
-
+
 void flush_tlb_current_task(void)
 {
 	struct mm_struct *mm = current->mm;
@@ -225,10 +228,9 @@ void flush_tlb_current_task(void)

 	local_flush_tlb();
 	if (!cpus_empty(cpu_mask))
-		flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
+		flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
 	preempt_enable();
 }
-EXPORT_SYMBOL(flush_tlb_current_task);

 void flush_tlb_mm (struct mm_struct * mm)
 {
@@ -245,11 +247,10 @@ void flush_tlb_mm (struct mm_struct * mm
 			leave_mm(smp_processor_id());
 	}
 	if (!cpus_empty(cpu_mask))
-		flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
+		flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);

 	preempt_enable();
 }
-EXPORT_SYMBOL(flush_tlb_mm);

 void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
 {
@@ -263,8 +264,8 @@ void flush_tlb_page(struct vm_area_struc
 	if (current->active_mm == mm) {
 		if(current->mm)
 			__flush_tlb_one(va);
-		 else
-		 	leave_mm(smp_processor_id());
+		else
+			leave_mm(smp_processor_id());
 	}

 	if (!cpus_empty(cpu_mask))
@@ -272,7 +273,6 @@ void flush_tlb_page(struct vm_area_struc

 	preempt_enable();
 }
-EXPORT_SYMBOL(flush_tlb_page);

 static void do_flush_tlb_all(void* info)
 {
@@ -330,11 +330,9 @@ void unlock_ipi_call_lock(void)
  * this function sends a 'generic call function' IPI to all other CPU
  * of the system defined in the mask.
  */
-
-static int
-__smp_call_function_mask(cpumask_t mask,
-			 void (*func)(void *), void *info,
-			 int wait)
+static int __smp_call_function_mask(cpumask_t mask,
+				    void (*func)(void *), void *info,
+				    int wait)
 {
 	struct call_data_struct data;
 	cpumask_t allbutself;
@@ -422,11 +420,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
  */

 int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
-	int nonatomic, int wait)
+			      int nonatomic, int wait)
 {
 	/* prevent preemption and reschedule on another processor */
-	int ret;
-	int me = get_cpu();
+	int ret, me = get_cpu();

 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
@@ -476,9 +473,9 @@ static void stop_this_cpu(void *dummy)
 	 */
 	cpu_clear(smp_processor_id(), cpu_online_map);
 	disable_all_local_evtchn();
-	for (;;)
+	for (;;)
 		halt();
-}
+}

 void smp_send_stop(void)
 {
--- head-2010-04-29.orig/arch/x86/kernel/time-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/time-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -28,47 +28,19 @@
  *	serialize accesses to xtime/lost_ticks).
  */

-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/param.h>
-#include <linux/string.h>
-#include <linux/mm.h>
+#include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/time.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/bcd.h>
-#include <linux/efi.h>
 #include <linux/sysctl.h>
 #include <linux/percpu.h>
 #include <linux/kernel_stat.h>
 #include <linux/posix-timers.h>
 #include <linux/cpufreq.h>
 #include <linux/clocksource.h>
+#include <linux/sysdev.h>

-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/irq.h>
-#include <asm/msr.h>
 #include <asm/delay.h>
-#include <asm/mpspec.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/timer.h>
 #include <asm/time.h>
-#include <asm/sections.h>
-
-#include "mach_time.h"
-
-#include <linux/timex.h>
-
-#include <asm/hpet.h>
-
-#include <asm/arch_hooks.h>

 #include <xen/evtchn.h>
 #include <xen/sysctl.h>
@@ -88,9 +60,6 @@ volatile unsigned long __jiffies __secti
 unsigned int cpu_khz;	/* Detected as we calibrate the TSC */
 EXPORT_SYMBOL(cpu_khz);

-DEFINE_SPINLOCK(rtc_lock);
-EXPORT_SYMBOL(rtc_lock);
-
 /* These are peridically updated in shared_info, and then copied here. */
 struct shadow_time_info {
 	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
@@ -153,6 +122,11 @@ static int __init __independent_wallcloc
 }
 __setup("independent_wallclock", __independent_wallclock);

+int xen_independent_wallclock(void)
+{
+	return independent_wallclock;
+}
+
 /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
 static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
 static int __init __permitted_clock_jitter(char *str)
@@ -209,7 +183,6 @@ static inline u64 get64(volatile u64 *pt
 	return res;
 #else
 	return *ptr;
-#define cmpxchg64 cmpxchg
 #endif
 }

@@ -224,7 +197,6 @@ static inline u64 get64_local(volatile u
 	return res;
 #else
 	return *ptr;
-#define cmpxchg64_local cmpxchg_local
 #endif
 }

@@ -330,35 +302,6 @@ static inline int time_values_up_to_date
 	return (dst->version == src->version);
 }

-/*
- * This is a special lock that is owned by the CPU and holds the index
- * register we are working with.  It is required for NMI access to the
- * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
- */
-volatile unsigned long cmos_lock = 0;
-EXPORT_SYMBOL(cmos_lock);
-
-/* Routines for accessing the CMOS RAM/RTC. */
-unsigned char rtc_cmos_read(unsigned char addr)
-{
-	unsigned char val;
-	lock_cmos_prefix(addr);
-	outb_p(addr, RTC_PORT(0));
-	val = inb_p(RTC_PORT(1));
-	lock_cmos_suffix(addr);
-	return val;
-}
-EXPORT_SYMBOL(rtc_cmos_read);
-
-void rtc_cmos_write(unsigned char val, unsigned char addr)
-{
-	lock_cmos_prefix(addr);
-	outb_p(addr, RTC_PORT(0));
-	outb_p(val, RTC_PORT(1));
-	lock_cmos_suffix(addr);
-}
-EXPORT_SYMBOL(rtc_cmos_write);
-
 static void sync_xen_wallclock(unsigned long dummy);
 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
 static void sync_xen_wallclock(unsigned long dummy)
@@ -367,7 +310,8 @@ static void sync_xen_wallclock(unsigned
 	s64 nsec;
 	struct xen_platform_op op;

-	if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
+	BUG_ON(!is_initial_xendomain());
+	if (!ntp_synced() || independent_wallclock)
 		return;

 	write_seqlock_irq(&xtime_lock);
@@ -390,23 +334,6 @@ static void sync_xen_wallclock(unsigned
 	mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
 }

-static int set_rtc_mmss(unsigned long nowtime)
-{
-	int retval;
-	unsigned long flags;
-
-	if (independent_wallclock || !is_initial_xendomain())
-		return 0;
-
-	/* gets recalled with irq locally disabled */
-	/* XXX - does irqsave resolve this? -johnstul */
-	spin_lock_irqsave(&rtc_lock, flags);
-	retval = set_wallclock(nowtime);
-	spin_unlock_irqrestore(&rtc_lock, flags);
-
-	return retval;
-}
-
 static unsigned long long local_clock(void)
 {
 	unsigned int cpu = get_cpu();
@@ -416,7 +343,7 @@ static unsigned long long local_clock(vo

 	do {
 		local_time_version = shadow->version;
-		barrier();
+		rdtsc_barrier();
 		time = shadow->system_timestamp + get_nsec_offset(shadow);
 		if (!time_values_up_to_date(cpu))
 			get_time_values_from_xen(cpu);
@@ -489,28 +416,24 @@ unsigned long profile_pc(struct pt_regs

 #if defined(CONFIG_SMP) || defined(__x86_64__)
 # ifdef __i386__
-	if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs)
+	if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs)
 # else
 	if (!user_mode(regs)
 # endif
 	    && in_lock_functions(pc)) {
 # ifdef CONFIG_FRAME_POINTER
-#  ifdef __i386__
-		return ((unsigned long *)regs->ebp)[1];
-#  else
-		return ((unsigned long *)regs->rbp)[1];
-#  endif
+		return ((unsigned long *)regs->bp)[1];
 # else
 #  ifdef __i386__
-		unsigned long *sp = (unsigned long *)&regs->esp;
+		unsigned long *sp = (unsigned long *)&regs->sp;
 #  else
-		unsigned long *sp = (unsigned long *)regs->rsp;
+		unsigned long *sp = (unsigned long *)regs->sp;
 #  endif

 		/* Return address is either directly at stack pointer
-		   or above a saved eflags. Eflags has bits 22-31 zero,
+		   or above a saved flags. Eflags has bits 22-31 zero,
 		   kernel addresses don't. */
- 		if (sp[0] >> 22)
+		if (sp[0] >> 22)
 			return sp[0];
 		if (sp[1] >> 22)
 			return sp[1];
@@ -749,25 +672,32 @@ static void init_missing_ticks_accountin
 		runstate->time[RUNSTATE_offline];
 }

-/* not static: needed by APM */
-unsigned long read_persistent_clock(void)
+unsigned long xen_read_persistent_clock(void)
 {
-	unsigned long retval;
-	unsigned long flags;
-
-	spin_lock_irqsave(&rtc_lock, flags);
+	const shared_info_t *s = HYPERVISOR_shared_info;
+	u32 version, sec, nsec;
+	u64 delta;

-	retval = get_wallclock();
+	do {
+		version = s->wc_version;
+		rmb();
+		sec     = s->wc_sec;
+		nsec    = s->wc_nsec;
+		rmb();
+	} while ((s->wc_version & 1) | (version ^ s->wc_version));

-	spin_unlock_irqrestore(&rtc_lock, flags);
+	delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
+	do_div(delta, NSEC_PER_SEC);

-	return retval;
+	return delta;
 }

-int update_persistent_clock(struct timespec now)
+int xen_update_persistent_clock(void)
 {
+	if (!is_initial_xendomain())
+		return -1;
 	mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
-	return set_rtc_mmss(now.tv_sec);
+	return 0;
 }

 extern void (*late_time_init)(void);
--- head-2010-04-29.orig/arch/x86/kernel/traps_32-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/traps_32-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -79,7 +79,8 @@ char ignore_fpu_irq = 0;
  * F0 0F bug workaround.. We have a special link segment
  * for this.
  */
-struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
+gate_desc idt_table[256]
+	__attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
 #endif

 asmlinkage void divide_error(void);
@@ -109,6 +110,34 @@ asmlinkage void machine_check(void);
 int kstack_depth_to_print = 24;
 static unsigned int code_bytes = 64;

+void printk_address(unsigned long address, int reliable)
+{
+#ifdef CONFIG_KALLSYMS
+	unsigned long offset = 0, symsize;
+	const char *symname;
+	char *modname;
+	char *delim = ":";
+	char namebuf[128];
+	char reliab[4] = "";
+
+	symname = kallsyms_lookup(address, &symsize, &offset,
+					&modname, namebuf);
+	if (!symname) {
+		printk(" [<%08lx>]\n", address);
+		return;
+	}
+	if (!reliable)
+		strcpy(reliab, "? ");
+
+	if (!modname)
+		modname = delim = "";
+	printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
+		address, reliab, delim, modname, delim, symname, offset, symsize);
+#else
+	printk(" [<%08lx>]\n", address);
+#endif
+}
+
 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
 {
 	return	p > (void *)tinfo &&
@@ -122,48 +151,35 @@ struct stack_frame {
 };

 static inline unsigned long print_context_stack(struct thread_info *tinfo,
-				unsigned long *stack, unsigned long ebp,
+				unsigned long *stack, unsigned long bp,
 				const struct stacktrace_ops *ops, void *data)
 {
-#ifdef	CONFIG_FRAME_POINTER
-	struct stack_frame *frame = (struct stack_frame *)ebp;
-	while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
-		struct stack_frame *next;
-		unsigned long addr;
+	struct stack_frame *frame = (struct stack_frame *)bp;

-		addr = frame->return_address;
-		ops->address(data, addr);
-		/*
-		 * break out of recursive entries (such as
-		 * end_of_stack_stop_unwind_function). Also,
-		 * we can never allow a frame pointer to
-		 * move downwards!
-		 */
-		next = frame->next_frame;
-		if (next <= frame)
-			break;
-		frame = next;
-	}
-#else
 	while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
 		unsigned long addr;

-		addr = *stack++;
-		if (__kernel_text_address(addr))
-			ops->address(data, addr);
+		addr = *stack;
+		if (__kernel_text_address(addr)) {
+			if ((unsigned long) stack == bp + 4) {
+				ops->address(data, addr, 1);
+				frame = frame->next_frame;
+				bp = (unsigned long) frame;
+			} else {
+				ops->address(data, addr, bp == 0);
+			}
+		}
+		stack++;
 	}
-#endif
-	return ebp;
+	return bp;
 }

 #define MSG(msg) ops->warning(data, msg)

 void dump_trace(struct task_struct *task, struct pt_regs *regs,
-	        unsigned long *stack,
+		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
 {
-	unsigned long ebp = 0;
-
 	if (!task)
 		task = current;

@@ -171,17 +187,17 @@ void dump_trace(struct task_struct *task
 		unsigned long dummy;
 		stack = &dummy;
 		if (task != current)
-			stack = (unsigned long *)task->thread.esp;
+			stack = (unsigned long *)task->thread.sp;
 	}

 #ifdef CONFIG_FRAME_POINTER
-	if (!ebp) {
+	if (!bp) {
 		if (task == current) {
-			/* Grab ebp right from our regs */
-			asm ("movl %%ebp, %0" : "=r" (ebp) : );
+			/* Grab bp right from our regs */
+			asm ("movl %%ebp, %0" : "=r" (bp) : );
 		} else {
-			/* ebp is the last reg pushed by switch_to */
-			ebp = *(unsigned long *) task->thread.esp;
+			/* bp is the last reg pushed by switch_to */
+			bp = *(unsigned long *) task->thread.sp;
 		}
 	}
 #endif
@@ -190,7 +206,7 @@ void dump_trace(struct task_struct *task
 		struct thread_info *context;
 		context = (struct thread_info *)
 			((unsigned long)stack & (~(THREAD_SIZE - 1)));
-		ebp = print_context_stack(context, stack, ebp, ops, data);
+		bp = print_context_stack(context, stack, bp, ops, data);
 		/* Should be after the line below, but somewhere
 		   in early boot context comes out corrupted and we
 		   can't reference it -AK */
@@ -225,9 +241,11 @@ static int print_trace_stack(void *data,
 /*
  * Print one address/symbol entries per line.
  */
-static void print_trace_address(void *data, unsigned long addr)
+static void print_trace_address(void *data, unsigned long addr, int reliable)
 {
 	printk("%s [<%08lx>] ", (char *)data, addr);
+	if (!reliable)
+		printk("? ");
 	print_symbol("%s\n", addr);
 	touch_nmi_watchdog();
 }
@@ -241,32 +259,32 @@ static const struct stacktrace_ops print

 static void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		   unsigned long * stack, char *log_lvl)
+		unsigned long *stack, unsigned long bp, char *log_lvl)
 {
-	dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
+	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
 	printk("%s =======================\n", log_lvl);
 }

 void show_trace(struct task_struct *task, struct pt_regs *regs,
-		unsigned long * stack)
+		unsigned long *stack, unsigned long bp)
 {
-	show_trace_log_lvl(task, regs, stack, "");
+	show_trace_log_lvl(task, regs, stack, bp, "");
 }

 static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-			       unsigned long *esp, char *log_lvl)
+		       unsigned long *sp, unsigned long bp, char *log_lvl)
 {
 	unsigned long *stack;
 	int i;

-	if (esp == NULL) {
+	if (sp == NULL) {
 		if (task)
-			esp = (unsigned long*)task->thread.esp;
+			sp = (unsigned long*)task->thread.sp;
 		else
-			esp = (unsigned long *)&esp;
+			sp = (unsigned long *)&sp;
 	}

-	stack = esp;
+	stack = sp;
 	for(i = 0; i < kstack_depth_to_print; i++) {
 		if (kstack_end(stack))
 			break;
@@ -275,13 +293,13 @@ static void show_stack_log_lvl(struct ta
 		printk("%08lx ", *stack++);
 	}
 	printk("\n%sCall Trace:\n", log_lvl);
-	show_trace_log_lvl(task, regs, esp, log_lvl);
+	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }

-void show_stack(struct task_struct *task, unsigned long *esp)
+void show_stack(struct task_struct *task, unsigned long *sp)
 {
 	printk("       ");
-	show_stack_log_lvl(task, NULL, esp, "");
+	show_stack_log_lvl(task, NULL, sp, 0, "");
 }

 /*
@@ -290,13 +308,19 @@ void show_stack(struct task_struct *task
 void dump_stack(void)
 {
 	unsigned long stack;
+	unsigned long bp = 0;
+
+#ifdef CONFIG_FRAME_POINTER
+	if (!bp)
+		asm("movl %%ebp, %0" : "=r" (bp):);
+#endif

 	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
 		current->pid, current->comm, print_tainted(),
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
-	show_trace(current, NULL, &stack);
+	show_trace(current, NULL, &stack, bp);
 }

 EXPORT_SYMBOL(dump_stack);
@@ -315,30 +339,30 @@ void show_registers(struct pt_regs *regs
 	 * time of the fault..
 	 */
 	if (!user_mode_vm(regs)) {
-		u8 *eip;
+		u8 *ip;
 		unsigned int code_prologue = code_bytes * 43 / 64;
 		unsigned int code_len = code_bytes;
 		unsigned char c;

 		printk("\n" KERN_EMERG "Stack: ");
-		show_stack_log_lvl(NULL, regs, &regs->esp, KERN_EMERG);
+		show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);

 		printk(KERN_EMERG "Code: ");

-		eip = (u8 *)regs->eip - code_prologue;
-		if (eip < (u8 *)PAGE_OFFSET ||
-			probe_kernel_address(eip, c)) {
+		ip = (u8 *)regs->ip - code_prologue;
+		if (ip < (u8 *)PAGE_OFFSET ||
+			probe_kernel_address(ip, c)) {
 			/* try starting at EIP */
-			eip = (u8 *)regs->eip;
+			ip = (u8 *)regs->ip;
 			code_len = code_len - code_prologue + 1;
 		}
-		for (i = 0; i < code_len; i++, eip++) {
-			if (eip < (u8 *)PAGE_OFFSET ||
-				probe_kernel_address(eip, c)) {
+		for (i = 0; i < code_len; i++, ip++) {
+			if (ip < (u8 *)PAGE_OFFSET ||
+				probe_kernel_address(ip, c)) {
 				printk(" Bad EIP value.");
 				break;
 			}
-			if (eip == (u8 *)regs->eip)
+			if (ip == (u8 *)regs->ip)
 				printk("<%02x> ", c);
 			else
 				printk("%02x ", c);
@@ -347,18 +371,57 @@ void show_registers(struct pt_regs *regs
 	printk("\n");
 }

-int is_valid_bugaddr(unsigned long eip)
+int is_valid_bugaddr(unsigned long ip)
 {
 	unsigned short ud2;

-	if (eip < PAGE_OFFSET)
+	if (ip < PAGE_OFFSET)
 		return 0;
-	if (probe_kernel_address((unsigned short *)eip, ud2))
+	if (probe_kernel_address((unsigned short *)ip, ud2))
 		return 0;

 	return ud2 == 0x0b0f;
 }

+static int die_counter;
+
+int __kprobes __die(const char * str, struct pt_regs * regs, long err)
+{
+	unsigned long sp;
+	unsigned short ss;
+
+	printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+	printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+	printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	printk("DEBUG_PAGEALLOC");
+#endif
+	printk("\n");
+
+	if (notify_die(DIE_OOPS, str, regs, err,
+				current->thread.trap_no, SIGSEGV) !=
+			NOTIFY_STOP) {
+		show_registers(regs);
+		/* Executive summary in case the oops scrolled away */
+		sp = (unsigned long) (&regs->sp);
+		savesegment(ss, ss);
+		if (user_mode(regs)) {
+			sp = regs->sp;
+			ss = regs->ss & 0xffff;
+		}
+		printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+		print_symbol("%s", regs->ip);
+		printk(" SS:ESP %04x:%08lx\n", ss, sp);
+		return 0;
+	} else {
+		return 1;
+	}
+}
+
 /*
  * This is gone through when something in the kernel has done something bad and
  * is about to be terminated.
@@ -374,7 +437,6 @@ void die(const char * str, struct pt_reg
 		.lock_owner =		-1,
 		.lock_owner_depth =	0
 	};
-	static int die_counter;
 	unsigned long flags;

 	oops_enter();
@@ -390,43 +452,13 @@ void die(const char * str, struct pt_reg
 		raw_local_irq_save(flags);

 	if (++die.lock_owner_depth < 3) {
-		unsigned long esp;
-		unsigned short ss;
-
-		report_bug(regs->eip, regs);
-
-		printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff,
-		       ++die_counter);
-#ifdef CONFIG_PREEMPT
-		printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
-		printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-		printk("DEBUG_PAGEALLOC");
-#endif
-		printk("\n");
+		report_bug(regs->ip, regs);

-		if (notify_die(DIE_OOPS, str, regs, err,
-					current->thread.trap_no, SIGSEGV) !=
-				NOTIFY_STOP) {
-			show_registers(regs);
-			/* Executive summary in case the oops scrolled away */
-			esp = (unsigned long) (&regs->esp);
-			savesegment(ss, ss);
-			if (user_mode(regs)) {
-				esp = regs->esp;
-				ss = regs->xss & 0xffff;
-			}
-			printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
-			print_symbol("%s", regs->eip);
-			printk(" SS:ESP %04x:%08lx\n", ss, esp);
-		}
-		else
+		if (__die(str, regs, err))
 			regs = NULL;
-  	} else
+	} else {
 		printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
+	}

 	bust_spinlocks(0);
 	die.lock_owner = -1;
@@ -462,7 +494,7 @@ static void __kprobes do_trap(int trapnr
 {
 	struct task_struct *tsk = current;

-	if (regs->eflags & VM_MASK) {
+	if (regs->flags & VM_MASK) {
 		if (vm86)
 			goto vm86_trap;
 		goto trap_signal;
@@ -508,7 +540,7 @@ static void __kprobes do_trap(int trapnr
 }

 #define DO_ERROR(trapnr, signr, str, name) \
-fastcall void do_##name(struct pt_regs * regs, long error_code) \
+void do_##name(struct pt_regs * regs, long error_code) \
 { \
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
 						== NOTIFY_STOP) \
@@ -517,7 +549,7 @@ fastcall void do_##name(struct pt_regs *
 }

 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
-fastcall void do_##name(struct pt_regs * regs, long error_code) \
+void do_##name(struct pt_regs * regs, long error_code) \
 { \
 	siginfo_t info; \
 	if (irq) \
@@ -533,7 +565,7 @@ fastcall void do_##name(struct pt_regs *
 }

 #define DO_VM86_ERROR(trapnr, signr, str, name) \
-fastcall void do_##name(struct pt_regs * regs, long error_code) \
+void do_##name(struct pt_regs * regs, long error_code) \
 { \
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
 						== NOTIFY_STOP) \
@@ -542,7 +574,7 @@ fastcall void do_##name(struct pt_regs *
 }

 #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-fastcall void do_##name(struct pt_regs * regs, long error_code) \
+void do_##name(struct pt_regs * regs, long error_code) \
 { \
 	siginfo_t info; \
 	info.si_signo = signr; \
@@ -556,13 +588,13 @@ fastcall void do_##name(struct pt_regs *
 	do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
 }

-DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->eip)
+DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->ip)
 #ifndef CONFIG_KPROBES
 DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
 #endif
 DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
 DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
+DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
 DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
 DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
@@ -570,10 +602,10 @@ DO_ERROR(12, SIGBUS,  "stack segment", s
 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
 DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)

-fastcall void __kprobes do_general_protection(struct pt_regs * regs,
+void __kprobes do_general_protection(struct pt_regs * regs,
 					      long error_code)
 {
-	if (regs->eflags & VM_MASK)
+	if (regs->flags & VM_MASK)
 		goto gp_in_vm86;

 	if (!user_mode(regs))
@@ -582,11 +614,14 @@ fastcall void __kprobes do_general_prote
 	current->thread.error_code = error_code;
 	current->thread.trap_no = 13;
 	if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
-	    printk_ratelimit())
+	    printk_ratelimit()) {
 		printk(KERN_INFO
-		    "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
+		    "%s[%d] general protection ip:%lx sp:%lx error:%lx",
 		    current->comm, task_pid_nr(current),
-		    regs->eip, regs->esp, error_code);
+		    regs->ip, regs->sp, error_code);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}

 	force_sig(SIGSEGV, current);
 	return;
@@ -675,8 +710,8 @@ void __kprobes die_nmi(struct pt_regs *r
 	*/
 	bust_spinlocks(1);
 	printk(KERN_EMERG "%s", msg);
-	printk(" on CPU%d, eip %08lx, registers:\n",
-		smp_processor_id(), regs->eip);
+	printk(" on CPU%d, ip %08lx, registers:\n",
+		smp_processor_id(), regs->ip);
 	show_registers(regs);
 	console_silent();
 	spin_unlock(&nmi_print_lock);
@@ -733,7 +768,7 @@ static __kprobes void default_do_nmi(str

 static int ignore_nmis;

-fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
+__kprobes void do_nmi(struct pt_regs * regs, long error_code)
 {
 	int cpu;

@@ -762,7 +797,7 @@ void restart_nmi(void)
 }

 #ifdef CONFIG_KPROBES
-fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
+void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
 	trace_hardirqs_fixup();

@@ -798,7 +833,7 @@ fastcall void __kprobes do_int3(struct p
  * find every occurrence of the TF bit that could be saved away even
  * by user code)
  */
-fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
+void __kprobes do_debug(struct pt_regs * regs, long error_code)
 {
 	unsigned int condition;
 	struct task_struct *tsk = current;
@@ -807,24 +842,30 @@ fastcall void __kprobes do_debug(struct

 	get_debugreg(condition, 6);

+	/*
+	 * The processor cleared BTF, so don't mark that we need it set.
+	 */
+	clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
+	tsk->thread.debugctlmsr = 0;
+
 	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
 					SIGTRAP) == NOTIFY_STOP)
 		return;
 	/* It's safe to allow irq's after DR6 has been saved */
-	if (regs->eflags & X86_EFLAGS_IF)
+	if (regs->flags & X86_EFLAGS_IF)
 		local_irq_enable();

 	/* Mask out spurious debug traps due to lazy DR7 setting */
 	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-		if (!tsk->thread.debugreg[7])
+		if (!tsk->thread.debugreg7)
 			goto clear_dr7;
 	}

-	if (regs->eflags & VM_MASK)
+	if (regs->flags & VM_MASK)
 		goto debug_vm86;

 	/* Save debug status register where ptrace can see it */
-	tsk->thread.debugreg[6] = condition;
+	tsk->thread.debugreg6 = condition;

 	/*
 	 * Single-stepping through TF: make sure we ignore any events in
@@ -856,7 +897,7 @@ debug_vm86:

 clear_TF_reenable:
 	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-	regs->eflags &= ~TF_MASK;
+	regs->flags &= ~TF_MASK;
 	return;
 }

@@ -865,7 +906,7 @@ clear_TF_reenable:
  * the correct behaviour even in the presence of the asynchronous
  * IRQ13 behaviour
  */
-void math_error(void __user *eip)
+void math_error(void __user *ip)
 {
 	struct task_struct * task;
 	siginfo_t info;
@@ -881,7 +922,7 @@ void math_error(void __user *eip)
 	info.si_signo = SIGFPE;
 	info.si_errno = 0;
 	info.si_code = __SI_FAULT;
-	info.si_addr = eip;
+	info.si_addr = ip;
 	/*
 	 * (~cwd & swd) will mask out exceptions that are not set to unmasked
 	 * status.  0x3f is the exception bits in these regs, 0x200 is the
@@ -924,13 +965,13 @@ void math_error(void __user *eip)
 	force_sig_info(SIGFPE, &info, task);
 }

-fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
+void do_coprocessor_error(struct pt_regs * regs, long error_code)
 {
 	ignore_fpu_irq = 1;
-	math_error((void __user *)regs->eip);
+	math_error((void __user *)regs->ip);
 }

-static void simd_math_error(void __user *eip)
+static void simd_math_error(void __user *ip)
 {
 	struct task_struct * task;
 	siginfo_t info;
@@ -946,7 +987,7 @@ static void simd_math_error(void __user
 	info.si_signo = SIGFPE;
 	info.si_errno = 0;
 	info.si_code = __SI_FAULT;
-	info.si_addr = eip;
+	info.si_addr = ip;
 	/*
 	 * The SIMD FPU exceptions are handled a little differently, as there
 	 * is only a single status/control register.  Thus, to determine which
@@ -978,19 +1019,19 @@ static void simd_math_error(void __user
 	force_sig_info(SIGFPE, &info, task);
 }

-fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
+void do_simd_coprocessor_error(struct pt_regs * regs,
 					  long error_code)
 {
 	if (cpu_has_xmm) {
 		/* Handle SIMD FPU exceptions on PIII+ processors. */
 		ignore_fpu_irq = 1;
-		simd_math_error((void __user *)regs->eip);
+		simd_math_error((void __user *)regs->ip);
 	} else {
 		/*
 		 * Handle strange cache flush from user space exception
 		 * in all other cases.  This is undocumented behaviour.
 		 */
-		if (regs->eflags & VM_MASK) {
+		if (regs->flags & VM_MASK) {
 			handle_vm86_fault((struct kernel_vm86_regs *)regs,
 					  error_code);
 			return;
@@ -1003,7 +1044,7 @@ fastcall void do_simd_coprocessor_error(
 }

 #ifndef CONFIG_XEN
-fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
+void do_spurious_interrupt_bug(struct pt_regs * regs,
 					  long error_code)
 {
 #if 0
@@ -1012,7 +1053,7 @@ fastcall void do_spurious_interrupt_bug(
 #endif
 }

-fastcall unsigned long patch_espfix_desc(unsigned long uesp,
+unsigned long patch_espfix_desc(unsigned long uesp,
 					  unsigned long kesp)
 {
 	struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
@@ -1072,7 +1113,7 @@ asmlinkage void math_emulate(long arg)
  * NB. All these are "trap gates" (i.e. events_mask isn't set) except
  * for those that specify <dpl>|4 in the second field.
  */
-static trap_info_t __cpuinitdata trap_table[] = {
+static const trap_info_t __cpuinitconst trap_table[] = {
 	{  0, 0, __KERNEL_CS, (unsigned long)divide_error		},
 	{  1, 0|4, __KERNEL_CS, (unsigned long)debug			},
 	{  3, 3|4, __KERNEL_CS, (unsigned long)int3			},
@@ -1105,17 +1146,12 @@ void __init trap_init(void)
 	if (ret)
 		printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);

+	/*
+	 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
+	 * Generate a build-time error if the alignment is wrong.
+	 */
+	BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
 	if (cpu_has_fxsr) {
-		/*
-		 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
-		 * Generates a compile-time "error: zero width for bit-field" if
-		 * the alignment is wrong.
-		 */
-		struct fxsrAlignAssert {
-			int _:!(offsetof(struct task_struct,
-					thread.i387.fxsave) & 15);
-		};
-
 		printk(KERN_INFO "Enabling fast FPU save and restore... ");
 		set_in_cr4(X86_CR4_OSFXSR);
 		printk("done.\n");
--- head-2010-04-29.orig/arch/x86/kernel/traps_64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/traps_64-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -74,38 +74,41 @@ asmlinkage void alignment_check(void);
 asmlinkage void machine_check(void);
 asmlinkage void spurious_interrupt_bug(void);

+static unsigned int code_bytes = 64;
+
 static inline void conditional_sti(struct pt_regs *regs)
 {
-	if (regs->eflags & X86_EFLAGS_IF)
+	if (regs->flags & X86_EFLAGS_IF)
 		local_irq_enable();
 }

 static inline void preempt_conditional_sti(struct pt_regs *regs)
 {
-	preempt_disable();
-	if (regs->eflags & X86_EFLAGS_IF)
+	inc_preempt_count();
+	if (regs->flags & X86_EFLAGS_IF)
 		local_irq_enable();
 }

 static inline void preempt_conditional_cli(struct pt_regs *regs)
 {
-	if (regs->eflags & X86_EFLAGS_IF)
+	if (regs->flags & X86_EFLAGS_IF)
 		local_irq_disable();
 	/* Make sure to not schedule here because we could be running
 	   on an exception stack. */
-	preempt_enable_no_resched();
+	dec_preempt_count();
 }

 int kstack_depth_to_print = 12;

-#ifdef CONFIG_KALLSYMS
-void printk_address(unsigned long address)
+void printk_address(unsigned long address, int reliable)
 {
+#ifdef CONFIG_KALLSYMS
 	unsigned long offset = 0, symsize;
 	const char *symname;
 	char *modname;
 	char *delim = ":";
-	char namebuf[128];
+	char namebuf[KSYM_NAME_LEN];
+	char reliab[4] = "";

 	symname = kallsyms_lookup(address, &symsize, &offset,
 					&modname, namebuf);
@@ -113,17 +116,17 @@ void printk_address(unsigned long addres
 		printk(" [<%016lx>]\n", address);
 		return;
 	}
+	if (!reliable)
+		strcpy(reliab, "? ");
+
 	if (!modname)
-		modname = delim = "";
-	printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
-		address, delim, modname, delim, symname, offset, symsize);
-}
+		modname = delim = "";
+	printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
+		address, reliab, delim, modname, delim, symname, offset, symsize);
 #else
-void printk_address(unsigned long address)
-{
 	printk(" [<%016lx>]\n", address);
-}
 #endif
+}

 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
 					unsigned *usedp, char **idp)
@@ -210,14 +213,53 @@ static unsigned long *in_exception_stack
  * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
  */

-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+			void *p, unsigned int size, void *end)
 {
-	void *t = (void *)tinfo;
-        return p > t && p < t + THREAD_SIZE - 3;
+	void *t = tinfo;
+	if (end) {
+		if (p < end && p >= (end-THREAD_SIZE))
+			return 1;
+		else
+			return 0;
+	}
+	return p > t && p < t + THREAD_SIZE - size;
+}
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+	struct stack_frame *next_frame;
+	unsigned long return_address;
+};
+
+
+static inline unsigned long print_context_stack(struct thread_info *tinfo,
+				unsigned long *stack, unsigned long bp,
+				const struct stacktrace_ops *ops, void *data,
+				unsigned long *end)
+{
+	struct stack_frame *frame = (struct stack_frame *)bp;
+
+	while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
+		unsigned long addr;
+
+		addr = *stack;
+		if (__kernel_text_address(addr)) {
+			if ((unsigned long) stack == bp + 8) {
+				ops->address(data, addr, 1);
+				frame = frame->next_frame;
+				bp = (unsigned long) frame;
+			} else {
+				ops->address(data, addr, bp == 0);
+			}
+		}
+		stack++;
+	}
+	return bp;
 }

 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
-		unsigned long *stack,
+		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
 {
 	const unsigned cpu = get_cpu();
@@ -227,36 +269,28 @@ void dump_trace(struct task_struct *tsk,

 	if (!tsk)
 		tsk = current;
+	tinfo = task_thread_info(tsk);

 	if (!stack) {
 		unsigned long dummy;
 		stack = &dummy;
 		if (tsk && tsk != current)
-			stack = (unsigned long *)tsk->thread.rsp;
+			stack = (unsigned long *)tsk->thread.sp;
 	}

-	/*
-	 * Print function call entries within a stack. 'cond' is the
-	 * "end of stackframe" condition, that the 'stack++'
-	 * iteration will eventually trigger.
-	 */
-#define HANDLE_STACK(cond) \
-	do while (cond) { \
-		unsigned long addr = *stack++; \
-		/* Use unlocked access here because except for NMIs	\
-		   we should be already protected against module unloads */ \
-		if (__kernel_text_address(addr)) { \
-			/* \
-			 * If the address is either in the text segment of the \
-			 * kernel, or in the region which contains vmalloc'ed \
-			 * memory, it *may* be the address of a calling \
-			 * routine; if so, print it so that someone tracing \
-			 * down the cause of the crash will be able to figure \
-			 * out the call path that was taken. \
-			 */ \
-			ops->address(data, addr);   \
-		} \
-	} while (0)
+#ifdef CONFIG_FRAME_POINTER
+	if (!bp) {
+		if (tsk == current) {
+			/* Grab bp right from our regs */
+			asm("movq %%rbp, %0" : "=r" (bp):);
+		} else {
+			/* bp is the last reg pushed by switch_to */
+			bp = *(unsigned long *) tsk->thread.sp;
+		}
+	}
+#endif
+
+

 	/*
 	 * Print function call entries in all stacks, starting at the
@@ -272,7 +306,9 @@ void dump_trace(struct task_struct *tsk,
 		if (estack_end) {
 			if (ops->stack(data, id) < 0)
 				break;
-			HANDLE_STACK (stack < estack_end);
+
+			bp = print_context_stack(tinfo, stack, bp, ops,
+							data, estack_end);
 			ops->stack(data, "<EOE>");
 			/*
 			 * We link to the next stack via the
@@ -290,7 +326,8 @@ void dump_trace(struct task_struct *tsk,
 			if (stack >= irqstack && stack < irqstack_end) {
 				if (ops->stack(data, "IRQ") < 0)
 					break;
-				HANDLE_STACK (stack < irqstack_end);
+				bp = print_context_stack(tinfo, stack, bp,
+						ops, data, irqstack_end);
 				/*
 				 * We link to the next stack (which would be
 				 * the process stack normally) the last
@@ -308,9 +345,7 @@ void dump_trace(struct task_struct *tsk,
 	/*
 	 * This handles the process stack:
 	 */
-	tinfo = task_thread_info(tsk);
-	HANDLE_STACK (valid_stack_ptr(tinfo, stack));
-#undef HANDLE_STACK
+	bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
 	put_cpu();
 }
 EXPORT_SYMBOL(dump_trace);
@@ -333,10 +368,10 @@ static int print_trace_stack(void *data,
 	return 0;
 }

-static void print_trace_address(void *data, unsigned long addr)
+static void print_trace_address(void *data, unsigned long addr, int reliable)
 {
 	touch_nmi_watchdog();
-	printk_address(addr);
+	printk_address(addr, reliable);
 }

 static const struct stacktrace_ops print_trace_ops = {
@@ -347,15 +382,17 @@ static const struct stacktrace_ops print
 };

 void
-show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
+show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
+		unsigned long bp)
 {
 	printk("\nCall Trace:\n");
-	dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
+	dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
 	printk("\n");
 }

 static void
-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
+_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
+							unsigned long bp)
 {
 	unsigned long *stack;
 	int i;
@@ -366,14 +403,14 @@ _show_stack(struct task_struct *tsk, str
 	// debugging aid: "show_stack(NULL, NULL);" prints the
 	// back trace for this cpu.

-	if (rsp == NULL) {
+	if (sp == NULL) {
 		if (tsk)
-			rsp = (unsigned long *)tsk->thread.rsp;
+			sp = (unsigned long *)tsk->thread.sp;
 		else
-			rsp = (unsigned long *)&rsp;
+			sp = (unsigned long *)&sp;
 	}

-	stack = rsp;
+	stack = sp;
 	for(i=0; i < kstack_depth_to_print; i++) {
 		if (stack >= irqstack && stack <= irqstack_end) {
 			if (stack == irqstack_end) {
@@ -389,12 +426,12 @@ _show_stack(struct task_struct *tsk, str
 		printk(" %016lx", *stack++);
 		touch_nmi_watchdog();
 	}
-	show_trace(tsk, regs, rsp);
+	show_trace(tsk, regs, sp, bp);
 }

-void show_stack(struct task_struct *tsk, unsigned long * rsp)
+void show_stack(struct task_struct *tsk, unsigned long * sp)
 {
-	_show_stack(tsk, NULL, rsp);
+	_show_stack(tsk, NULL, sp, 0);
 }

 /*
@@ -403,13 +440,19 @@ void show_stack(struct task_struct *tsk,
 void dump_stack(void)
 {
 	unsigned long dummy;
+	unsigned long bp = 0;
+
+#ifdef CONFIG_FRAME_POINTER
+	if (!bp)
+		asm("movq %%rbp, %0" : "=r" (bp):);
+#endif

 	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
 		current->pid, current->comm, print_tainted(),
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
-	show_trace(NULL, NULL, &dummy);
+	show_trace(NULL, NULL, &dummy, bp);
 }

 EXPORT_SYMBOL(dump_stack);
@@ -417,12 +460,15 @@ EXPORT_SYMBOL(dump_stack);
 void show_registers(struct pt_regs *regs)
 {
 	int i;
-	int in_kernel = !user_mode(regs);
-	unsigned long rsp;
+	unsigned long sp;
 	const int cpu = smp_processor_id();
 	struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+	u8 *ip;
+	unsigned int code_prologue = code_bytes * 43 / 64;
+	unsigned int code_len = code_bytes;

-	rsp = regs->rsp;
+	sp = regs->sp;
+	ip = (u8 *) regs->ip - code_prologue;
 	printk("CPU %d ", cpu);
 	__show_regs(regs);
 	printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
@@ -432,45 +478,43 @@ void show_registers(struct pt_regs *regs
 	 * When in-kernel, we also print out the stack and code at the
 	 * time of the fault..
 	 */
-	if (in_kernel) {
+	if (!user_mode(regs)) {
+		unsigned char c;
 		printk("Stack: ");
-		_show_stack(NULL, regs, (unsigned long*)rsp);
+		_show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
+		printk("\n");

-		printk("\nCode: ");
-		if (regs->rip < PAGE_OFFSET)
-			goto bad;
-
-		for (i=0; i<20; i++) {
-			unsigned char c;
-			if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
-bad:
+		printk(KERN_EMERG "Code: ");
+		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
+			/* try starting at RIP */
+			ip = (u8 *) regs->ip;
+			code_len = code_len - code_prologue + 1;
+		}
+		for (i = 0; i < code_len; i++, ip++) {
+			if (ip < (u8 *)PAGE_OFFSET ||
+					probe_kernel_address(ip, c)) {
 				printk(" Bad RIP value.");
 				break;
 			}
-			printk("%02x ", c);
+			if (ip == (u8 *)regs->ip)
+				printk("<%02x> ", c);
+			else
+				printk("%02x ", c);
 		}
 	}
 	printk("\n");
 }

-int is_valid_bugaddr(unsigned long rip)
+int is_valid_bugaddr(unsigned long ip)
 {
 	unsigned short ud2;

-	if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
+	if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
 		return 0;

 	return ud2 == 0x0b0f;
 }

-#ifdef CONFIG_BUG
-void out_of_line_bug(void)
-{
-	BUG();
-}
-EXPORT_SYMBOL(out_of_line_bug);
-#endif
-
 static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
 static int die_owner = -1;
 static unsigned int die_nest_count;
@@ -498,7 +542,7 @@ unsigned __kprobes long oops_begin(void)
 	return flags;
 }

-void __kprobes oops_end(unsigned long flags)
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 {
 	die_owner = -1;
 	bust_spinlocks(0);
@@ -507,12 +551,17 @@ void __kprobes oops_end(unsigned long fl
 		/* Nest count reaches zero, release the lock. */
 		__raw_spin_unlock(&die_lock);
 	raw_local_irq_restore(flags);
+	if (!regs) {
+		oops_exit();
+		return;
+	}
 	if (panic_on_oops)
 		panic("Fatal exception");
 	oops_exit();
+	do_exit(signr);
 }

-void __kprobes __die(const char * str, struct pt_regs * regs, long err)
+int __kprobes __die(const char * str, struct pt_regs * regs, long err)
 {
 	static int die_counter;
 	printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
@@ -526,15 +575,17 @@ void __kprobes __die(const char * str, s
 	printk("DEBUG_PAGEALLOC");
 #endif
 	printk("\n");
-	notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
+	if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+		return 1;
 	show_registers(regs);
 	add_taint(TAINT_DIE);
 	/* Executive summary in case the oops scrolled away */
 	printk(KERN_ALERT "RIP ");
-	printk_address(regs->rip);
-	printk(" RSP <%016lx>\n", regs->rsp);
+	printk_address(regs->ip, 1);
+	printk(" RSP <%016lx>\n", regs->sp);
 	if (kexec_should_crash(current))
 		crash_kexec(regs);
+	return 0;
 }

 void die(const char * str, struct pt_regs * regs, long err)
@@ -542,11 +593,11 @@ void die(const char * str, struct pt_reg
 	unsigned long flags = oops_begin();

 	if (!user_mode(regs))
-		report_bug(regs->rip, regs);
+		report_bug(regs->ip, regs);

-	__die(str, regs, err);
-	oops_end(flags);
-	do_exit(SIGSEGV);
+	if (__die(str, regs, err))
+		regs = NULL;
+	oops_end(flags, regs, SIGSEGV);
 }

 #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
@@ -564,10 +615,10 @@ void __kprobes die_nmi(char *str, struct
 		crash_kexec(regs);
 	if (do_panic || panic_on_oops)
 		panic("Non maskable interrupt");
-	oops_end(flags);
+	oops_end(flags, NULL, SIGBUS);
 	nmi_exit();
 	local_irq_enable();
-	do_exit(SIGSEGV);
+	do_exit(SIGBUS);
 }
 #endif

@@ -592,11 +643,14 @@ static void __kprobes do_trap(int trapnr
 		tsk->thread.trap_no = trapnr;

 		if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
-		    printk_ratelimit())
+		    printk_ratelimit()) {
 			printk(KERN_INFO
-			       "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
+			       "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
 			       tsk->comm, tsk->pid, str,
-			       regs->rip, regs->rsp, error_code);
+			       regs->ip, regs->sp, error_code);
+			print_vma_addr(" in ", regs->ip);
+			printk("\n");
+		}

 		if (info)
 			force_sig_info(signr, info, tsk);
@@ -606,19 +660,12 @@ static void __kprobes do_trap(int trapnr
 	}


-	/* kernel trap */
-	{
-		const struct exception_table_entry *fixup;
-		fixup = search_exception_tables(regs->rip);
-		if (fixup)
-			regs->rip = fixup->fixup;
-		else {
-			tsk->thread.error_code = error_code;
-			tsk->thread.trap_no = trapnr;
-			die(str, regs, error_code);
-		}
-		return;
+	if (!fixup_exception(regs)) {
+		tsk->thread.error_code = error_code;
+		tsk->thread.trap_no = trapnr;
+		die(str, regs, error_code);
 	}
+	return;
 }

 #define DO_ERROR(trapnr, signr, str, name) \
@@ -647,10 +694,10 @@ asmlinkage void do_##name(struct pt_regs
 	do_trap(trapnr, signr, str, regs, error_code, &info); \
 }

-DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->rip)
+DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->ip)
 DO_ERROR( 4, SIGSEGV, "overflow", overflow)
 DO_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
+DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
 DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
 DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
@@ -698,32 +745,28 @@ asmlinkage void __kprobes do_general_pro
 		tsk->thread.trap_no = 13;

 		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-		    printk_ratelimit())
+		    printk_ratelimit()) {
 			printk(KERN_INFO
-		       "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
+		       "%s[%d] general protection ip:%lx sp:%lx error:%lx",
 			       tsk->comm, tsk->pid,
-			       regs->rip, regs->rsp, error_code);
+			       regs->ip, regs->sp, error_code);
+			print_vma_addr(" in ", regs->ip);
+			printk("\n");
+		}

 		force_sig(SIGSEGV, tsk);
 		return;
 	}

-	/* kernel gp */
-	{
-		const struct exception_table_entry *fixup;
-		fixup = search_exception_tables(regs->rip);
-		if (fixup) {
-			regs->rip = fixup->fixup;
-			return;
-		}
+	if (fixup_exception(regs))
+		return;

-		tsk->thread.error_code = error_code;
-		tsk->thread.trap_no = 13;
-		if (notify_die(DIE_GPF, "general protection fault", regs,
-					error_code, 13, SIGSEGV) == NOTIFY_STOP)
-			return;
-		die("general protection fault", regs, error_code);
-	}
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_no = 13;
+	if (notify_die(DIE_GPF, "general protection fault", regs,
+				error_code, 13, SIGSEGV) == NOTIFY_STOP)
+		return;
+	die("general protection fault", regs, error_code);
 }

 static __kprobes void
@@ -833,15 +876,15 @@ asmlinkage __kprobes struct pt_regs *syn
 {
 	struct pt_regs *regs = eregs;
 	/* Did already sync */
-	if (eregs == (struct pt_regs *)eregs->rsp)
+	if (eregs == (struct pt_regs *)eregs->sp)
 		;
 	/* Exception from user space */
 	else if (user_mode(eregs))
 		regs = task_pt_regs(current);
 	/* Exception from kernel and interrupts are enabled. Move to
  	   kernel process stack. */
-	else if (eregs->eflags & X86_EFLAGS_IF)
-		regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
+	else if (eregs->flags & X86_EFLAGS_IF)
+		regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
 	if (eregs != regs)
 		*regs = *eregs;
 	return regs;
@@ -859,6 +902,12 @@ asmlinkage void __kprobes do_debug(struc

 	get_debugreg(condition, 6);

+	/*
+	 * The processor cleared BTF, so don't mark that we need it set.
+	 */
+	clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
+	tsk->thread.debugctlmsr = 0;
+
 	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
 						SIGTRAP) == NOTIFY_STOP)
 		return;
@@ -874,27 +923,14 @@ asmlinkage void __kprobes do_debug(struc

 	tsk->thread.debugreg6 = condition;

-	/* Mask out spurious TF errors due to lazy TF clearing */
+
+	/*
+	 * Single-stepping through TF: make sure we ignore any events in
+	 * kernel space (but re-enable TF when returning to user mode).
+	 */
 	if (condition & DR_STEP) {
-		/*
-		 * The TF error should be masked out only if the current
-		 * process is not traced and if the TRAP flag has been set
-		 * previously by a tracing process (condition detected by
-		 * the PT_DTRACE flag); remember that the i386 TRAP flag
-		 * can be modified by the process itself in user mode,
-		 * allowing programs to debug themselves without the ptrace()
-		 * interface.
-		 */
                 if (!user_mode(regs))
                        goto clear_TF_reenable;
-		/*
-		 * Was the TF flag set by a debugger? If so, clear it now,
-		 * so that register information is correct.
-		 */
-		if (tsk->ptrace & PT_DTRACE) {
-			regs->eflags &= ~TF_MASK;
-			tsk->ptrace &= ~PT_DTRACE;
-		}
 	}

 	/* Ok, finally something we can handle */
@@ -903,7 +939,7 @@ asmlinkage void __kprobes do_debug(struc
 	info.si_signo = SIGTRAP;
 	info.si_errno = 0;
 	info.si_code = TRAP_BRKPT;
-	info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
+	info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
 	force_sig_info(SIGTRAP, &info, tsk);

 clear_dr7:
@@ -913,18 +949,15 @@ clear_dr7:

 clear_TF_reenable:
 	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-	regs->eflags &= ~TF_MASK;
+	regs->flags &= ~X86_EFLAGS_TF;
 	preempt_conditional_cli(regs);
 }

 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
 {
-	const struct exception_table_entry *fixup;
-	fixup = search_exception_tables(regs->rip);
-	if (fixup) {
-		regs->rip = fixup->fixup;
+	if (fixup_exception(regs))
 		return 1;
-	}
+
 	notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
 	/* Illegal floating point operation in the kernel */
 	current->thread.trap_no = trapnr;
@@ -939,7 +972,7 @@ static int kernel_math_error(struct pt_r
  */
 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
 {
-	void __user *rip = (void __user *)(regs->rip);
+	void __user *ip = (void __user *)(regs->ip);
 	struct task_struct * task;
 	siginfo_t info;
 	unsigned short cwd, swd;
@@ -959,7 +992,7 @@ asmlinkage void do_coprocessor_error(str
 	info.si_signo = SIGFPE;
 	info.si_errno = 0;
 	info.si_code = __SI_FAULT;
-	info.si_addr = rip;
+	info.si_addr = ip;
 	/*
 	 * (~cwd & swd) will mask out exceptions that are not set to unmasked
 	 * status.  0x3f is the exception bits in these regs, 0x200 is the
@@ -1008,7 +1041,7 @@ asmlinkage void bad_intr(void)

 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
 {
-	void __user *rip = (void __user *)(regs->rip);
+	void __user *ip = (void __user *)(regs->ip);
 	struct task_struct * task;
 	siginfo_t info;
 	unsigned short mxcsr;
@@ -1028,7 +1061,7 @@ asmlinkage void do_simd_coprocessor_erro
 	info.si_signo = SIGFPE;
 	info.si_errno = 0;
 	info.si_code = __SI_FAULT;
-	info.si_addr = rip;
+	info.si_addr = ip;
 	/*
 	 * The SIMD FPU exceptions are handled a little differently, as there
 	 * is only a single status/control register.  Thus, to determine which
@@ -1092,13 +1125,14 @@ asmlinkage void math_state_restore(void)
 	task_thread_info(me)->status |= TS_USEDFPU;
 	me->fpu_counter++;
 }
+EXPORT_SYMBOL_GPL(math_state_restore);


 /*
  * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
  * specify <dpl>|4 in the second field.
  */
-static trap_info_t __cpuinitdata trap_table[] = {
+static const trap_info_t __cpuinitconst trap_table[] = {
         {  0, 0|4, __KERNEL_CS, (unsigned long)divide_error               },
         {  1, 0|4, __KERNEL_CS, (unsigned long)debug                      },
         {  3, 3|4, __KERNEL_CS, (unsigned long)int3                       },
@@ -1169,3 +1203,14 @@ static int __init kstack_setup(char *s)
 	return 0;
 }
 early_param("kstack", kstack_setup);
+
+
+static int __init code_bytes_setup(char *s)
+{
+	code_bytes = simple_strtoul(s, NULL, 0);
+	if (code_bytes > 8192)
+		code_bytes = 8192;
+
+	return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
--- head-2010-04-29.orig/arch/x86/kernel/vsyscall_64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/kernel/vsyscall_64-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -43,12 +43,7 @@
 #include <asm/vgtod.h>

 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
-#define __syscall_clobber "r11","rcx","memory"
-#define __pa_vsymbol(x)			\
-	({unsigned long v;  		\
-	extern char __vsyscall_0; 	\
-	  asm("" : "=r" (v) : "0" (x)); \
-	  ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })
+#define __syscall_clobber "r11","cx","memory"

 /*
  * vsyscall_gtod_data contains data that is :
@@ -102,7 +97,7 @@ static __always_inline void do_get_tz(st
 static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
 {
 	int ret;
-	asm volatile("vsysc2: syscall"
+	asm volatile("syscall"
 		: "=a" (ret)
 		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
 		: __syscall_clobber );
@@ -112,7 +107,7 @@ static __always_inline int gettimeofday(
 static __always_inline long time_syscall(long *t)
 {
 	long secs;
-	asm volatile("vsysc1: syscall"
+	asm volatile("syscall"
 		: "=a" (secs)
 		: "0" (__NR_time),"D" (t) : __syscall_clobber);
 	return secs;
@@ -190,7 +185,7 @@ time_t __vsyscall(1) vtime(time_t *t)
 long __vsyscall(2)
 vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 {
-	unsigned int dummy, p;
+	unsigned int p;
 	unsigned long j = 0;

 	/* Fast cache - only recompute value once per jiffies and avoid
@@ -205,7 +200,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
 		p = tcache->blob[1];
 	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
 		/* Load per CPU data from RDTSCP */
-		rdtscp(dummy, dummy, p);
+		native_read_tscp(&p);
 	} else {
 		/* Load per CPU data from GDT */
 		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
@@ -228,42 +223,11 @@ long __vsyscall(3) venosys_1(void)

 #ifdef CONFIG_SYSCTL

-#define SYSCALL 0x050f
-#define NOP2    0x9090
-
-/*
- * NOP out syscall in vsyscall page when not needed.
- */
-static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
-                        void __user *buffer, size_t *lenp, loff_t *ppos)
+static int
+vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
+		       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	extern u16 vsysc1, vsysc2;
-	u16 __iomem *map1;
-	u16 __iomem *map2;
-	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
-	if (!write)
-		return ret;
-	/* gcc has some trouble with __va(__pa()), so just do it this
-	   way. */
-	map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
-	if (!map1)
-		return -ENOMEM;
-	map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
-	if (!map2) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	if (!vsyscall_gtod_data.sysctl_enabled) {
-		writew(SYSCALL, map1);
-		writew(SYSCALL, map2);
-	} else {
-		writew(NOP2, map1);
-		writew(NOP2, map2);
-	}
-	iounmap(map2);
-out:
-	iounmap(map1);
-	return ret;
+	return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
 }

 static ctl_table kernel_table2[] = {
@@ -279,7 +243,6 @@ static ctl_table kernel_root_table2[] =
 	  .child = kernel_table2 },
 	{}
 };
-
 #endif

 /* Assume __initcall executes before all user space. Hopefully kmod
@@ -301,7 +264,7 @@ static void __cpuinit vsyscall_set_cpu(i
 	d |= cpu;
 	d |= (node & 0xf) << 12;
 	d |= (node >> 4) << 48;
-	if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu)
+	if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
 							 + GDT_ENTRY_PER_CPU),
 					 d))
 		BUG();
@@ -322,7 +285,7 @@ cpu_vsyscall_notifier(struct notifier_bl
 	return NOTIFY_DONE;
 }

-static void __init map_vsyscall(void)
+void __init map_vsyscall(void)
 {
 	extern char __vsyscall_0;
 	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
@@ -338,7 +301,6 @@ static int __init vsyscall_init(void)
 	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
 	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
 	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
-	map_vsyscall();
 #ifdef CONFIG_XEN
 	vsyscall_gtod_data.sysctl_enabled = 0; /* disable vgettimeofay() */
  	if (boot_cpu_has(X86_FEATURE_RDTSCP))
--- head-2010-04-29.orig/arch/x86/kernel/xen_entry_64.S	2008-04-02 12:34:02.000000000 +0200
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,36 +0,0 @@
-/*
- * Copied from arch/xen/i386/kernel/entry.S
- */
-/* Offsets into shared_info_t. */
-#define evtchn_upcall_pending		/* 0 */
-#define evtchn_upcall_mask		1
-
-#define sizeof_vcpu_shift		6
-
-#ifdef CONFIG_SMP
-//#define preempt_disable(reg)	incl threadinfo_preempt_count(reg)
-//#define preempt_enable(reg)	decl threadinfo_preempt_count(reg)
-#define preempt_disable(reg)
-#define preempt_enable(reg)
-#define XEN_GET_VCPU_INFO(reg)	preempt_disable(%rbp)			; \
-				movq %gs:pda_cpunumber,reg		; \
-				shl  $32, reg				; \
-				shr  $32-sizeof_vcpu_shift,reg		; \
-				addq HYPERVISOR_shared_info,reg
-#define XEN_PUT_VCPU_INFO(reg)	preempt_enable(%rbp)			; \
-#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
-#else
-#define XEN_GET_VCPU_INFO(reg)	movq HYPERVISOR_shared_info,reg
-#define XEN_PUT_VCPU_INFO(reg)
-#define XEN_PUT_VCPU_INFO_fixup
-#endif
-
-#define XEN_LOCKED_BLOCK_EVENTS(reg)	movb $1,evtchn_upcall_mask(reg)
-#define XEN_LOCKED_UNBLOCK_EVENTS(reg)	movb $0,evtchn_upcall_mask(reg)
-#define XEN_BLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)			; \
-				XEN_LOCKED_BLOCK_EVENTS(reg)		; \
-    				XEN_PUT_VCPU_INFO(reg)
-#define XEN_UNBLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)			; \
-				XEN_LOCKED_UNBLOCK_EVENTS(reg)		; \
-    				XEN_PUT_VCPU_INFO(reg)
-#define XEN_TEST_PENDING(reg)	testb $0xFF,evtchn_upcall_pending(reg)
--- head-2010-04-29.orig/arch/x86/mach-xen/setup.c	2010-03-24 15:09:22.000000000 +0100
+++ head-2010-04-29/arch/x86/mach-xen/setup.c	2010-03-24 15:10:37.000000000 +0100
@@ -161,15 +161,12 @@ void __init machine_specific_arch_setup(

 	/* Do an early initialization of the fixmap area */
 	{
-		extern pte_t swapper_pg_pmd[PTRS_PER_PTE];
+		extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
 		unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
-		pgd_t *pgd = (pgd_t *)xen_start_info->pt_base;
-		pud_t *pud = pud_offset(pgd + pgd_index(addr), addr);
+		pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
 		pmd_t *pmd = pmd_offset(pud, addr);

-		swapper_pg_dir = pgd;
-		init_mm.pgd    = pgd;
-		make_lowmem_page_readonly(swapper_pg_pmd, XENFEAT_writable_page_tables);
-		set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_pmd) | _PAGE_TABLE));
+		make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
+		set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
 	}
 }
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2010-04-29/arch/x86/mm/fault-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -0,0 +1,1025 @@
+/*
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>		/* For unblank_screen() */
+#include <linux/compiler.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>		/* for max_low_pfn */
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+
+#include <asm/system.h>
+#include <asm/desc.h>
+#include <asm/segment.h>
+#include <asm/pgalloc.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+#include <asm-generic/sections.h>
+
+/*
+ * Page fault error code bits
+ *	bit 0 == 0 means no page found, 1 means protection fault
+ *	bit 1 == 0 means read, 1 means write
+ *	bit 2 == 0 means kernel, 1 means user-mode
+ *	bit 3 == 1 means use of reserved bit detected
+ *	bit 4 == 1 means fault was an instruction fetch
+ */
+#define PF_PROT		(1<<0)
+#define PF_WRITE	(1<<1)
+#define PF_USER		(1<<2)
+#define PF_RSVD		(1<<3)
+#define PF_INSTR	(1<<4)
+
+static inline int notify_page_fault(struct pt_regs *regs)
+{
+#ifdef CONFIG_KPROBES
+	int ret = 0;
+
+	/* kprobe_running() needs smp_processor_id(), hence preempt_disable() */
+#ifdef CONFIG_X86_32
+	if (!user_mode_vm(regs)) {
+#else /* !CONFIG_X86_32 */
+	if (!user_mode(regs)) {
+#endif
+		preempt_disable();
+		if (kprobe_running() && kprobe_fault_handler(regs, 14))
+			ret = 1;
+		preempt_enable();
+	}
+
+	return ret;	/* 1 only when a running kprobe's fault handler returned non-zero */
+#else /* !CONFIG_KPROBES */
+	return 0;	/* without kprobes the caller never sees a non-zero result */
+#endif
+}
+
+/*
+ * X86_32: sometimes AMD Athlon/Opteron CPUs report invalid exceptions on
+ * prefetch.  X86_64: sometimes the CPU reports invalid exceptions on
+ * prefetch.  Check for that here so the caller can ignore the fault.
+ *
+ * Scans at most 15 bytes starting at the address returned by
+ * convert_ip_to_linear(), skipping prefix bytes, and returns 1 if the
+ * opcode found is 0x0F 0x0D or 0x0F 0x18, 0 otherwise.  Faults flagged
+ * PF_INSTR are never reported as prefetches.
+ * Opcode checker based on code by Richard Brunner
+ */
+static int is_prefetch(struct pt_regs *regs, unsigned long addr,
+		       unsigned long error_code)
+{
+	unsigned char *instr;
+	int scan_more = 1;
+	int prefetch = 0;
+	unsigned char *max_instr;
+
+	/*
+	 * If it was an exec (instruction fetch) fault on NX page, then
+	 * do not ignore the fault:
+	 */
+	if (error_code & PF_INSTR)
+		return 0;
+
+	instr = (unsigned char *)convert_ip_to_linear(current, regs);
+	max_instr = instr + 15;	/* end of the 15-byte scan window */
+
+	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+		return 0;
+
+	while (scan_more && instr < max_instr) {
+		unsigned char opcode;
+		unsigned char instr_hi;
+		unsigned char instr_lo;
+
+		if (probe_kernel_address(instr, opcode))
+			break;
+
+		instr_hi = opcode & 0xf0;
+		instr_lo = opcode & 0x0f;
+		instr++;
+
+		switch (instr_hi) {
+		case 0x20:
+		case 0x30:
+			/*
+			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+			 * In X86_64 long mode, the CPU will signal invalid
+			 * opcode if some of these prefixes are present so
+			 * X86_64 will never get here anyway
+			 */
+			scan_more = ((instr_lo & 7) == 0x6);
+			break;
+#ifdef CONFIG_X86_64
+		case 0x40:
+			/*
+			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+			 * Need to figure out under what instruction mode the
+			 * instruction was issued. Could check the LDT for lm,
+			 * but for now it's good enough to assume that long
+			 * mode only uses well known segments or kernel.
+			 */
+			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
+			break;
+#endif
+		case 0x60:
+			/* 0x64 thru 0x67 are valid prefixes in all modes. */
+			scan_more = (instr_lo & 0xC) == 0x4;
+			break;
+		case 0xF0:
+			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
+			scan_more = !instr_lo || (instr_lo>>1) == 1;
+			break;
+		case 0x00:
+			/* Prefetch instruction is 0x0F0D or 0x0F18 */
+			scan_more = 0;
+
+			if (probe_kernel_address(instr, opcode))
+				break;
+			prefetch = (instr_lo == 0xF) &&
+				(opcode == 0x0D || opcode == 0x18);
+			break;
+		default:
+			scan_more = 0;	/* not a prefix byte: stop scanning */
+			break;
+		}
+	}
+	return prefetch;
+}
+
+static void force_sig_info_fault(int si_signo, int si_code,
+	unsigned long address, struct task_struct *tsk)
+{
+	siginfo_t info;	/* filled in below and handed to force_sig_info() */
+
+	info.si_signo = si_signo;
+	info.si_errno = 0;	/* always reported as 0 */
+	info.si_code = si_code;
+	info.si_addr = (void __user *)address;	/* caller-supplied address */
+	force_sig_info(si_signo, &info, tsk);
+}
+
+#ifdef CONFIG_X86_64
+static int bad_address(void *p)
+{
+	unsigned long dummy;	/* receives the probed word; the value is unused */
+	return probe_kernel_address((unsigned long *)p, dummy);	/* non-zero: callers treat p as unreadable */
+}
+#endif /* CONFIG_X86_64 */
+
+static void dump_pagetable(unsigned long address)
+{
+#ifdef CONFIG_X86_32
+	__typeof__(pte_val(__pte(0))) page;
+
+	page = read_cr3();	/* start from the table read_cr3() reports */
+	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
+#ifdef CONFIG_X86_PAE
+	printk("*pdpt = %016Lx ", page);
+	if ((page & _PAGE_PRESENT)
+	    && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) {
+		page = mfn_to_pfn(page >> PAGE_SHIFT);
+		page <<= PAGE_SHIFT;
+		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
+		                                         & (PTRS_PER_PMD - 1)];
+		printk(KERN_CONT "*pde = %016Lx ", page);
+		page &= ~_PAGE_NX;
+	}
+#else /* !CONFIG_X86_PAE */
+	printk("*pde = %08lx ", page);
+#endif /* CONFIG_X86_PAE */
+
+	/*
+	 * We must not directly access the pte in the highpte
+	 * case if the page table is located in highmem.
+	 * And let's rather not kmap-atomic the pte, just in case
+	 * it's allocated already.
+	 */
+	if ((page & _PAGE_PRESENT)
+	    && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn
+	    && !(page & _PAGE_PSE)) {
+		page = mfn_to_pfn(page >> PAGE_SHIFT);
+		page <<= PAGE_SHIFT;
+		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
+		                                         & (PTRS_PER_PTE - 1)];
+		printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page);
+	}
+
+	printk(KERN_CONT "\n");
+#else /* CONFIG_X86_64 */
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = (pgd_t *)read_cr3();
+
+	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
+	pgd += pgd_index(address);
+	if (bad_address(pgd)) goto bad;
+	printk("PGD %lx ", pgd_val(*pgd));
+	if (!pgd_present(*pgd)) goto ret;
+
+	pud = pud_offset(pgd, address);
+	if (bad_address(pud)) goto bad;
+	printk(KERN_CONT "PUD %lx ", pud_val(*pud));
+	if (!pud_present(*pud) || pud_large(*pud))
+		goto ret;
+
+	pmd = pmd_offset(pud, address);
+	if (bad_address(pmd)) goto bad;
+	printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
+	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
+
+	pte = pte_offset_kernel(pmd, address);
+	if (bad_address(pte)) goto bad;
+	printk(KERN_CONT "PTE %lx", pte_val(*pte));
+ret:
+	printk(KERN_CONT "\n");	/* terminate the line of entries printed so far */
+	return;
+bad:
+	printk("BAD\n");	/* bad_address() rejected one of the table pointers */
+#endif /* CONFIG_X86_32 */
+}
+
+#ifdef CONFIG_X86_32
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+	unsigned index = pgd_index(address);
+	pgd_t *pgd_k;	/* the *_k pointers walk init_mm's table, the others walk pgd */
+	pud_t *pud, *pud_k;
+	pmd_t *pmd, *pmd_k;
+
+	pgd += index;
+	pgd_k = init_mm.pgd + index;
+
+	if (!pgd_present(*pgd_k))
+		return NULL;
+
+	/*
+	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
+	 * and redundant with the set_pmd() on non-PAE. As would
+	 * set_pud.
+	 */
+
+	pud = pud_offset(pgd, address);
+	pud_k = pud_offset(pgd_k, address);
+	if (!pud_present(*pud_k))
+		return NULL;
+
+	pmd = pmd_offset(pud, address);
+	pmd_k = pmd_offset(pud_k, address);
+	if (!pmd_present(*pmd_k))
+		return NULL;
+	if (!pmd_present(*pmd)) {
+		bool lazy = x86_read_percpu(xen_lazy_mmu);	/* saved, restored below */
+
+		x86_write_percpu(xen_lazy_mmu, false);
+#if CONFIG_XEN_COMPAT > 0x030002
+		set_pmd(pmd, *pmd_k);
+#else
+		/*
+		 * When running on older Xen we must launder *pmd_k through
+		 * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
+		 */
+		set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
+#endif
+		x86_write_percpu(xen_lazy_mmu, lazy);
+	} else
+		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+	return pmd_k;	/* the init_mm pmd entry; NULL was returned above if any level is missing */
+}
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_64
+static const char errata93_warning[] =
+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
+KERN_ERR "******* Please consider a BIOS update.\n"
+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+#endif /* CONFIG_X86_64 */
+
+/* Workaround for K8 erratum #93 & buggy BIOS.
+   BIOS SMM functions are required to use a specific workaround
+   to avoid corruption of the 64bit RIP register on C stepping K8.
+   A lot of BIOS that didn't get tested properly miss this.
+   The OS sees this as a page fault with the upper 32bits of RIP cleared.
+   Try to work around it here.
+   Note we only handle faults in kernel here.
+   Does nothing for X86_32
+ */
+static int is_errata93(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+	static int warned;	/* makes the warning print only once */
+	if (address != regs->ip)	/* fault address must equal the saved ip */
+		return 0;
+	if ((address >> 32) != 0)	/* upper 32 bits must already be zero */
+		return 0;
+	address |= 0xffffffffUL << 32;	/* set the upper 32 bits to all ones */
+	if ((address >= (u64)_stext && address <= (u64)_etext) ||
+	    (address >= MODULES_VADDR && address <= MODULES_END)) {
+		if (!warned) {
+			printk(errata93_warning);
+			warned = 1;
+		}
+		regs->ip = address;	/* continue at the repaired address */
+		return 1;
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal
+ * addresses >4GB.  We catch this in the page fault handler because these
+ * addresses are not reachable. Just detect this case and return.  Any code
+ * segment in LDT is compatibility mode.  Always returns 0 on X86_32.
+ */
+static int is_errata100(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
+	    (address >> 32))	/* non-zero upper half: address is above 4GB */
+		return 1;
+#endif
+	return 0;
+}
+
+void do_invalid_op(struct pt_regs *, unsigned long);	/* called by is_f00f_bug() */
+
+static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_F00F_BUG
+	unsigned long nr;
+	/*
+	 * Pentium F0 0F C7 C8 bug workaround.
+	 */
+	if (boot_cpu_data.f00f_bug) {
+		nr = (address - idt_descr.address) >> 3;	/* offset from idt_descr.address in 8-byte units */
+
+		if (nr == 6) {
+			do_invalid_op(regs, 0);	/* slot 6 is handed to the invalid-op handler */
+			return 1;
+		}
+	}
+#endif
+	return 0;	/* not the f00f case (or workaround not configured) */
+}
+
+static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
+			    unsigned long address)
+{
+#ifdef CONFIG_X86_32
+	if (!oops_may_print())	/* X86_32 only: print nothing when oops output is not allowed */
+		return;
+#endif
+
+#ifdef CONFIG_X86_PAE
+	if (error_code & PF_INSTR) {
+		unsigned int level;
+		pte_t *pte = lookup_address(address, &level);
+
+		if (pte && pte_present(*pte) && !pte_exec(*pte))
+			printk(KERN_CRIT "kernel tried to execute "
+				"NX-protected page - exploit attempt? "
+				"(uid: %d)\n", current->uid);
+	}
+#endif /* CONFIG_X86_PAE */
+
+	printk(KERN_ALERT "BUG: unable to handle kernel ");
+	if (address < PAGE_SIZE)	/* addresses in the first page are reported as NULL dereferences */
+		printk(KERN_CONT "NULL pointer dereference");
+	else
+		printk(KERN_CONT "paging request");
+#ifdef CONFIG_X86_32
+	printk(KERN_CONT " at %08lx\n", address);
+#else
+	printk(KERN_CONT " at %016lx\n", address);
+#endif
+	printk(KERN_ALERT "IP:");
+	printk_address(regs->ip, 1);
+	dump_pagetable(address);	/* finish with the page-table entries for the address */
+}
+
+#ifdef CONFIG_X86_64
+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
+				 unsigned long error_code)
+{
+	unsigned long flags = oops_begin();	/* handed back to oops_end() below */
+	struct task_struct *tsk;
+
+	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+	       current->comm, address);
+	dump_pagetable(address);
+	tsk = current;
+	tsk->thread.cr2 = address;
+	tsk->thread.trap_no = 14;	/* same trap number do_page_fault() records */
+	tsk->thread.error_code = error_code;
+	if (__die("Bad pagetable", regs, error_code))
+		regs = NULL;	/* oops_end() gets NULL regs when __die() returned non-zero */
+	oops_end(flags, regs, SIGKILL);
+}
+#endif /* CONFIG_X86_64 */
+
+static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+{
+	if ((error_code & PF_WRITE) && !pte_write(*pte))	/* write fault, entry not writable */
+		return 0;
+	if ((error_code & PF_INSTR) && !pte_exec(*pte))	/* fetch fault, entry not executable */
+		return 0;
+
+	return 1;	/* the entry permits the access that faulted */
+}
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry.  This allows
+ * us to lazily refresh the TLB when increasing the permissions of a
+ * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
+ * expensive since that implies doing a full cross-processor TLB
+ * flush, even if no stale TLB entries exist on other processors.
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.  Returns 1 if spurious, else 0.
+ */
+static int spurious_fault(unsigned long address,
+			  unsigned long error_code)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	/* Reserved-bit violation or user access to kernel space? */
+	if (error_code & (PF_USER | PF_RSVD))
+		return 0;
+
+	pgd = init_mm.pgd + pgd_index(address);	/* walk the init_mm page table */
+	if (!pgd_present(*pgd))
+		return 0;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		return 0;
+
+	if (pud_large(*pud))	/* large mapping: check the pud entry itself */
+		return spurious_fault_check(error_code, (pte_t *) pud);
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		return 0;
+
+	if (pmd_large(*pmd))	/* large mapping: check the pmd entry itself */
+		return spurious_fault_check(error_code, (pte_t *) pmd);
+
+	pte = pte_offset_kernel(pmd, address);
+	if (!pte_present(*pte))
+		return 0;
+
+	return spurious_fault_check(error_code, pte);
+}
+
+/*
+ * X86_32: handle a fault on the vmalloc or module mapping area.
+ * X86_64: handle a fault on the vmalloc area.
+ *
+ * The table found via read_cr3() is checked against the kernel reference
+ * table; a missing entry (pmd on X86_32, pgd on X86_64) is copied over.
+ * Returns 0 when the reference table maps the address, -1 otherwise.
+ * This assumes no large pages in there.
+ */
+static int vmalloc_fault(unsigned long address)
+{
+#ifdef CONFIG_X86_32
+	unsigned long pgd_paddr;
+	pmd_t *pmd_k;
+	pte_t *pte_k;
+	/*
+	 * Synchronize this task's top level page-table
+	 * with the 'reference' page table.
+	 *
+	 * Do _not_ use "current" here. We might be inside
+	 * an interrupt in the middle of a task switch..
+	 */
+	pgd_paddr = read_cr3();
+	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+	if (!pmd_k)
+		return -1;
+	pte_k = pte_offset_kernel(pmd_k, address);
+	if (!pte_present(*pte_k))
+		return -1;
+	return 0;
+#else /* CONFIG_X86_64 */
+	pgd_t *pgd, *pgd_ref;
+	pud_t *pud, *pud_ref;
+	pmd_t *pmd, *pmd_ref;
+	pte_t *pte, *pte_ref;
+
+	/* Make sure we are in vmalloc area */
+	if (!(address >= VMALLOC_START && address < VMALLOC_END))
+		return -1;
+
+	/* Copy kernel mappings over when needed. This can also
+	   happen within a race in page table update. In the latter
+	   case just flush. */
+
+	/* On Xen the line below does not always work. Needs investigating! */
+	/*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
+	pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+	pgd += pgd_index(address);
+	pgd_ref = pgd_offset_k(address);
+	if (pgd_none(*pgd_ref))
+		return -1;
+	if (pgd_none(*pgd))
+		set_pgd(pgd, *pgd_ref);
+	else
+		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+	/* Below here mismatches are bugs because these lower tables
+	   are shared */
+
+	pud = pud_offset(pgd, address);
+	pud_ref = pud_offset(pgd_ref, address);
+	if (pud_none(*pud_ref))
+		return -1;
+	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
+		BUG();
+	pmd = pmd_offset(pud, address);
+	pmd_ref = pmd_offset(pud_ref, address);
+	if (pmd_none(*pmd_ref))
+		return -1;
+	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+		BUG();
+	pte_ref = pte_offset_kernel(pmd_ref, address);
+	if (!pte_present(*pte_ref))
+		return -1;
+	pte = pte_offset_kernel(pmd, address);
+	/* Don't use pte_page here, because the mappings can point
+	   outside mem_map, and the NUMA hash lookup cannot handle
+	   that. */
+	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+		BUG();
+	return 0;
+#endif /* CONFIG_X86_32 */
+}
+
+int show_unhandled_signals = 1;	/* gates the segfault printk in do_page_fault() */
+
+/*
+ * Page fault handler: reads the faulting address via read_cr2(), then uses
+ * the PF_* bits in error_code and the saved registers in regs to either
+ * resolve the fault, signal the task (SIGSEGV/SIGBUS), or oops.
+ */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	unsigned long address;
+	int write, si_code;
+	int fault;
+#ifdef CONFIG_X86_64
+	unsigned long flags;
+#endif
+
+	/*
+	 * We can fault from pretty much anywhere, with unknown IRQ state.
+	 */
+	trace_hardirqs_fixup();
+
+	/* Set the "privileged fault" bit to something sane. */
+	if (user_mode_vm(regs))
+		error_code |= PF_USER;
+	else
+		error_code &= ~PF_USER;
+
+	tsk = current;
+	mm = tsk->mm;
+	prefetchw(&mm->mmap_sem);
+
+	/* get the address */
+	address = read_cr2();
+
+	si_code = SEGV_MAPERR;
+
+	if (notify_page_fault(regs))
+		return;
+
+	/*
+	 * We fault-in kernel-space virtual memory on-demand. The
+	 * 'reference' page table is init_mm.pgd.
+	 *
+	 * NOTE! We MUST NOT take any locks for this case. We may
+	 * be in an interrupt or a critical region, and should
+	 * only copy the information from the master page table,
+	 * nothing more.
+	 *
+	 * This verifies that the fault happens in kernel space
+	 * (error_code & 4) == 0, and that the fault was not a
+	 * protection error (error_code & 9) == 0.
+	 */
+#ifdef CONFIG_X86_32
+	if (unlikely(address >= TASK_SIZE)) {
+#else
+	if (unlikely(address >= TASK_SIZE64)) {
+#endif
+		/* Faults in hypervisor area can never be patched up. */
+#if defined(CONFIG_X86_XEN)
+		if (address >= hypervisor_virt_start)
+			goto bad_area_nosemaphore;
+#elif defined(CONFIG_X86_64_XEN)
+		if (address >= HYPERVISOR_VIRT_START
+		    && address < HYPERVISOR_VIRT_END)
+			goto bad_area_nosemaphore;
+#endif
+		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+		    vmalloc_fault(address) >= 0)
+			return;
+
+		/* Can handle a stale RO->RW TLB */
+		if (spurious_fault(address, error_code))
+			return;
+
+		/*
+		 * Don't take the mm semaphore here. If we fixup a prefetch
+		 * fault we could otherwise deadlock.
+		 */
+		goto bad_area_nosemaphore;
+	}
+
+
+#ifdef CONFIG_X86_32
+	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
+	   fault has been handled. */
+	if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
+		local_irq_enable();
+
+	/*
+	 * If we're in an interrupt, have no user context or are running in an
+	 * atomic region then we must not take the fault.
+	 */
+	if (in_atomic() || !mm)
+		goto bad_area_nosemaphore;
+#else /* CONFIG_X86_64 */
+	if (likely(regs->flags & X86_EFLAGS_IF))
+		local_irq_enable();
+
+	if (unlikely(error_code & PF_RSVD))
+		pgtable_bad(address, regs, error_code);
+
+	/*
+	 * If we're in an interrupt, have no user context or are running in an
+	 * atomic region then we must not take the fault.
+	 */
+	if (unlikely(in_atomic() || !mm))
+		goto bad_area_nosemaphore;
+
+	/*
+	 * User-mode registers count as a user access even for any
+	 * potential system fault or CPU buglet.
+	 */
+	if (user_mode_vm(regs))
+		error_code |= PF_USER;
+again:
+#endif /* CONFIG_X86_32 */
+	/* When running in the kernel we expect faults to occur only to
+	 * addresses in user space.  All other faults represent errors in the
+	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
+	 * erroneous fault occurring in a code path which already holds mmap_sem
+	 * we will deadlock attempting to validate the fault against the
+	 * address space.  Luckily the kernel only validly references user
+	 * space from well defined areas of code, which are listed in the
+	 * exceptions table.
+	 *
+	 * As the vast majority of faults will be valid we will only perform
+	 * the source reference check when there is a possibility of a deadlock.
+	 * Attempt to lock the address space, if we cannot we then validate the
+	 * source.  If this is invalid we can skip the address space check,
+	 * thus avoiding the deadlock.
+	 */
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		if ((error_code & PF_USER) == 0 &&
+		    !search_exception_tables(regs->ip))
+			goto bad_area_nosemaphore;
+		down_read(&mm->mmap_sem);
+	}
+
+	vma = find_vma(mm, address);
+	if (!vma)
+		goto bad_area;
+	if (vma->vm_start <= address)
+		goto good_area;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		goto bad_area;
+	if (error_code & PF_USER) {
+		/*
+		 * Accessing the stack below %sp is always a bug.
+		 * The large cushion allows instructions like enter
+		 * and pusha to work.  ("enter $65535,$31" pushes
+		 * 32 pointers and then decrements %sp by 65535.)
+		 */
+		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
+			goto bad_area;
+	}
+	if (expand_stack(vma, address))
+		goto bad_area;
+/*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+	si_code = SEGV_ACCERR;
+	write = 0;
+	switch (error_code & (PF_PROT|PF_WRITE)) {
+	default:	/* 3: write, present */
+		/* fall through */
+	case PF_WRITE:		/* write, not present */
+		if (!(vma->vm_flags & VM_WRITE))
+			goto bad_area;
+		write++;
+		break;
+	case PF_PROT:		/* read, present */
+		goto bad_area;
+	case 0:			/* read, not present */
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+			goto bad_area;
+	}
+
+#ifdef CONFIG_X86_32
+survive:
+#endif
+	/*
+	 * If for any reason at all we couldn't handle the fault,
+	 * make sure we exit gracefully rather than endlessly redo
+	 * the fault.
+	 */
+	fault = handle_mm_fault(mm, vma, address, write);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
+	}
+	if (fault & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
+
+#ifdef CONFIG_X86_32
+	/*
+	 * Did it hit the DOS screen memory VA from vm86 mode?
+	 */
+	if (v8086_mode(regs)) {
+		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
+		if (bit < 32)
+			tsk->thread.screen_bitmap |= 1 << bit;
+	}
+#endif
+	up_read(&mm->mmap_sem);
+	return;
+
+/*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+bad_area:
+	up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+	/* User mode accesses just cause a SIGSEGV */
+	if (error_code & PF_USER) {
+		/*
+		 * It's possible to have interrupts off here.
+		 */
+		local_irq_enable();
+
+		/*
+		 * Valid to do another page fault here because this one came
+		 * from user space.
+		 */
+		if (is_prefetch(regs, address, error_code))
+			return;
+
+		if (is_errata100(regs, address))
+			return;
+
+		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+		    printk_ratelimit()) {
+			printk(
+#ifdef CONFIG_X86_32
+			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
+#else
+			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
+#endif
+			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+			tsk->comm, task_pid_nr(tsk), address, regs->ip,
+			regs->sp, error_code);
+			print_vma_addr(" in ", regs->ip);
+			printk("\n");
+		}
+
+		tsk->thread.cr2 = address;
+		/* Kernel addresses are always protection faults */
+		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+		tsk->thread.trap_no = 14;
+		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+		return;
+	}
+
+	if (is_f00f_bug(regs, address))
+		return;
+
+no_context:
+	/* Are we prepared to handle this kernel fault?  */
+	if (fixup_exception(regs))
+		return;
+
+	/*
+	 * X86_32
+	 * Valid to do another page fault here, because if this fault
+	 * had been triggered by is_prefetch fixup_exception would have
+	 * handled it.
+	 *
+	 * X86_64
+	 * Hall of shame of CPU/BIOS bugs.
+	 */
+	if (is_prefetch(regs, address, error_code))
+		return;
+
+	if (is_errata93(regs, address))
+		return;
+
+/*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+#ifdef CONFIG_X86_32
+	bust_spinlocks(1);
+#else
+	flags = oops_begin();
+#endif
+
+	show_fault_oops(regs, error_code, address);
+
+	tsk->thread.cr2 = address;
+	tsk->thread.trap_no = 14;
+	tsk->thread.error_code = error_code;
+
+#ifdef CONFIG_X86_32
+	die("Oops", regs, error_code);
+	bust_spinlocks(0);
+	do_exit(SIGKILL);
+#else
+	if (__die("Oops", regs, error_code))
+		regs = NULL;
+	/* Executive summary in case the body of the oops scrolled away */
+	printk(KERN_EMERG "CR2: %016lx\n", address);
+	oops_end(flags, regs, SIGKILL);
+#endif
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+	up_read(&mm->mmap_sem);
+	if (is_global_init(tsk)) {
+		yield();
+#ifdef CONFIG_X86_32
+		down_read(&mm->mmap_sem);
+		goto survive;
+#else
+		goto again;
+#endif
+	}
+
+	printk("VM: killing process %s\n", tsk->comm);
+	if (error_code & PF_USER)
+		do_group_exit(SIGKILL);
+	goto no_context;
+
+do_sigbus:
+	up_read(&mm->mmap_sem);
+
+	/* Kernel mode? Handle exceptions or die */
+	if (!(error_code & PF_USER))
+		goto no_context;
+#ifdef CONFIG_X86_32
+	/* User space => ok to do another page fault */
+	if (is_prefetch(regs, address, error_code))
+		return;
+#endif
+	tsk->thread.cr2 = address;
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_no = 14;
+	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+DEFINE_SPINLOCK(pgd_lock);	/* held while walking pgd_list below */
+LIST_HEAD(pgd_list);	/* pages linked through their lru member */
+
+void vmalloc_sync_all(void)
+{
+#ifdef CONFIG_X86_32
+	/*
+	 * Sync each PMD-sized range in [TASK_SIZE, hypervisor_virt_start) into
+	 * the page tables on pgd_list.  Races in the updates of insync and
+	 * start aren't problematic: insync can only get set bits added, and
+	 * updates to start only improve performance (correctness is unaffected
+	 * if undone).  XEN: To work on PAE, we iterate over PMDs rather than
+	 * PGDs; this works just fine with 2-level paging too.
+	 */
+#define sync_index(a) ((a) >> PMD_SHIFT)
+	static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
+	static unsigned long start = TASK_SIZE;
+	unsigned long address;
+
+	if (SHARED_KERNEL_PMD)
+		return;
+
+	BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
+	for (address = start;
+	     address < hypervisor_virt_start;
+	     address += PMD_SIZE) {
+		if (!test_bit(sync_index(address), insync)) {
+			unsigned long flags;
+			struct page *page;
+
+			spin_lock_irqsave(&pgd_lock, flags);
+			/* XEN: failure path assumes non-empty pgd_list. */
+			if (unlikely(list_empty(&pgd_list))) {
+				spin_unlock_irqrestore(&pgd_lock, flags);
+				return;
+			}
+			list_for_each_entry(page, &pgd_list, lru) {
+				if (!vmalloc_sync_one(page_address(page),
+						      address))
+					break;	/* nothing to sync for this range */
+			}
+			spin_unlock_irqrestore(&pgd_lock, flags);
+			if (&page->lru == &pgd_list)	/* walked the whole list; the cursor is never NULL */
+				set_bit(sync_index(address), insync);
+		}
+		if (address == start && test_bit(sync_index(address), insync))
+			start = address + PMD_SIZE;
+	}
+#else /* CONFIG_X86_64 */
+	/*
+	 * Note that races in the updates of insync and start aren't
+	 * problematic: insync can only get set bits added, and updates to
+	 * start are only improving performance (without affecting correctness
+	 * if undone).
+	 */
+	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+	static unsigned long start = VMALLOC_START & PGDIR_MASK;
+	unsigned long address;
+
+	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+		if (!test_bit(pgd_index(address), insync)) {
+			const pgd_t *pgd_ref = pgd_offset_k(address);
+			unsigned long flags;
+			struct page *page;
+
+			if (pgd_none(*pgd_ref))
+				continue;
+			spin_lock_irqsave(&pgd_lock, flags);
+			list_for_each_entry(page, &pgd_list, lru) {
+				pgd_t *pgd;
+				pgd = (pgd_t *)page_address(page) + pgd_index(address);
+				if (pgd_none(*pgd))
+					set_pgd(pgd, *pgd_ref);
+				else
+					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+			}
+			spin_unlock_irqrestore(&pgd_lock, flags);
+			set_bit(pgd_index(address), insync);
+		}
+		if (address == start)
+			start = address + PGDIR_SIZE;
+	}
+	/* Check that there is no need to do the same for the modules area. */
+	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+				(__START_KERNEL & PGDIR_MASK)));
+#endif /* CONFIG_X86_32 */
+}
--- head-2010-04-29.orig/arch/x86/mm/fault_32-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,757 +0,0 @@
-/*
- *  linux/arch/i386/mm/fault.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>		/* For unblank_screen() */
-#include <linux/highmem.h>
-#include <linux/bootmem.h>		/* for max_low_pfn */
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-#include <linux/kprobes.h>
-
-#include <asm/system.h>
-#include <asm/desc.h>
-#include <asm/segment.h>
-
-extern void die(const char *,struct pt_regs *,long);
-
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (!user_mode_vm(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, 14))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-	return 0;
-}
-#endif
-
-/*
- * Return EIP plus the CS segment base.  The segment limit is also
- * adjusted, clamped to the kernel/user address space (whichever is
- * appropriate), and returned in *eip_limit.
- *
- * The segment is checked, because it might have been changed by another
- * task between the original faulting instruction and here.
- *
- * If CS is no longer a valid code segment, or if EIP is beyond the
- * limit, or if it is a kernel address when CS is not a kernel segment,
- * then the returned value will be greater than *eip_limit.
- *
- * This is slow, but is very rarely executed.
- */
-static inline unsigned long get_segment_eip(struct pt_regs *regs,
-					    unsigned long *eip_limit)
-{
-	unsigned long eip = regs->eip;
-	unsigned seg = regs->xcs & 0xffff;
-	u32 seg_ar, seg_limit, base, *desc;
-
-	/* Unlikely, but must come before segment checks. */
-	if (unlikely(regs->eflags & VM_MASK)) {
-		base = seg << 4;
-		*eip_limit = base + 0xffff;
-		return base + (eip & 0xffff);
-	}
-
-	/* The standard kernel/user address space limit. */
-	*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
-
-	/* By far the most common cases. */
-	if (likely(SEGMENT_IS_FLAT_CODE(seg)))
-		return eip;
-
-	/* Check the segment exists, is within the current LDT/GDT size,
-	   that kernel/user (ring 0..3) has the appropriate privilege,
-	   that it's a code segment, and get the limit. */
-	__asm__ ("larl %3,%0; lsll %3,%1"
-		 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
-	if ((~seg_ar & 0x9800) || eip > seg_limit) {
-		*eip_limit = 0;
-		return 1;	 /* So that returned eip > *eip_limit. */
-	}
-
-	/* Get the GDT/LDT descriptor base.
-	   When you look for races in this code remember that
-	   LDT and other horrors are only used in user space. */
-	if (seg & (1<<2)) {
-		/* Must lock the LDT while reading it. */
-		mutex_lock(&current->mm->context.lock);
-		desc = current->mm->context.ldt;
-		desc = (void *)desc + (seg & ~7);
-	} else {
-		/* Must disable preemption while reading the GDT. */
- 		desc = (u32 *)get_cpu_gdt_table(get_cpu());
-		desc = (void *)desc + (seg & ~7);
-	}
-
-	/* Decode the code segment base from the descriptor */
-	base = get_desc_base((unsigned long *)desc);
-
-	if (seg & (1<<2)) {
-		mutex_unlock(&current->mm->context.lock);
-	} else
-		put_cpu();
-
-	/* Adjust EIP and segment limit, and clamp at the kernel limit.
-	   It's legitimate for segments to wrap at 0xffffffff. */
-	seg_limit += base;
-	if (seg_limit < *eip_limit && seg_limit >= base)
-		*eip_limit = seg_limit;
-	return eip + base;
-}
-
-/*
- * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- * Check that here and ignore it.
- */
-static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
-{
-	unsigned long limit;
-	unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
-	int scan_more = 1;
-	int prefetch = 0;
-	int i;
-
-	for (i = 0; scan_more && i < 15; i++) {
-		unsigned char opcode;
-		unsigned char instr_hi;
-		unsigned char instr_lo;
-
-		if (instr > (unsigned char *)limit)
-			break;
-		if (probe_kernel_address(instr, opcode))
-			break;
-
-		instr_hi = opcode & 0xf0;
-		instr_lo = opcode & 0x0f;
-		instr++;
-
-		switch (instr_hi) {
-		case 0x20:
-		case 0x30:
-			/* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
-			scan_more = ((instr_lo & 7) == 0x6);
-			break;
-
-		case 0x60:
-			/* 0x64 thru 0x67 are valid prefixes in all modes. */
-			scan_more = (instr_lo & 0xC) == 0x4;
-			break;
-		case 0xF0:
-			/* 0xF0, 0xF2, and 0xF3 are valid prefixes */
-			scan_more = !instr_lo || (instr_lo>>1) == 1;
-			break;
-		case 0x00:
-			/* Prefetch instruction is 0x0F0D or 0x0F18 */
-			scan_more = 0;
-			if (instr > (unsigned char *)limit)
-				break;
-			if (probe_kernel_address(instr, opcode))
-				break;
-			prefetch = (instr_lo == 0xF) &&
-				(opcode == 0x0D || opcode == 0x18);
-			break;
-		default:
-			scan_more = 0;
-			break;
-		}
-	}
-	return prefetch;
-}
-
-static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
-			      unsigned long error_code)
-{
-	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		     boot_cpu_data.x86 >= 6)) {
-		/* Catch an obscure case of prefetch inside an NX page. */
-		if (nx_enabled && (error_code & 16))
-			return 0;
-		return __is_prefetch(regs, addr);
-	}
-	return 0;
-}
-
-static noinline void force_sig_info_fault(int si_signo, int si_code,
-	unsigned long address, struct task_struct *tsk)
-{
-	siginfo_t info;
-
-	info.si_signo = si_signo;
-	info.si_errno = 0;
-	info.si_code = si_code;
-	info.si_addr = (void __user *)address;
-	force_sig_info(si_signo, &info, tsk);
-}
-
-fastcall void do_invalid_op(struct pt_regs *, unsigned long);
-
-#ifdef CONFIG_X86_PAE
-static void dump_fault_path(unsigned long address)
-{
-	unsigned long *p, page;
-	unsigned long mfn;
-
-	page = read_cr3();
-	p  = (unsigned long *)__va(page);
-	p += (address >> 30) * 2;
-	printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
-	if (p[0] & _PAGE_PRESENT) {
-		mfn  = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
-		page = mfn_to_pfn(mfn) << PAGE_SHIFT;
-		p  = (unsigned long *)__va(page);
-		address &= 0x3fffffff;
-		p += (address >> 21) * 2;
-		printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n",
-		       page, p[1], p[0]);
-		mfn  = (p[0] >> PAGE_SHIFT) | (p[1] << 20);
-#ifdef CONFIG_HIGHPTE
-		if (mfn_to_pfn(mfn) >= highstart_pfn)
-			return;
-#endif
-		if ((p[0] & _PAGE_PRESENT) && !(p[0] & _PAGE_PSE)) {
-			page = mfn_to_pfn(mfn) << PAGE_SHIFT;
-			p  = (unsigned long *) __va(page);
-			address &= 0x001fffff;
-			p += (address >> 12) * 2;
-			printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
-			       page, p[1], p[0]);
-		}
-	}
-}
-#else
-static void dump_fault_path(unsigned long address)
-{
-	unsigned long page;
-
-	page = read_cr3();
-	page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT];
-	printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
-	       machine_to_phys(page));
-	/*
-	 * We must not directly access the pte in the highpte
-	 * case if the page table is located in highmem.
-	 * And lets rather not kmap-atomic the pte, just in case
-	 * it's allocated already.
-	 */
-	if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
-	    && (page & _PAGE_PRESENT)
-	    && !(page & _PAGE_PSE)) {
-		page = machine_to_phys(page & PAGE_MASK);
-		page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
-		                                      & (PTRS_PER_PTE - 1)];
-		printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
-		       machine_to_phys(page));
-	}
-}
-#endif
-
-static int spurious_fault(struct pt_regs *regs,
-			  unsigned long address,
-			  unsigned long error_code)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	/* Reserved-bit violation or user access to kernel space? */
-	if (error_code & 0x0c)
-		return 0;
-
-	pgd = init_mm.pgd + pgd_index(address);
-	if (!pgd_present(*pgd))
-		return 0;
-
-	pud = pud_offset(pgd, address);
-	if (!pud_present(*pud))
-		return 0;
-
-	pmd = pmd_offset(pud, address);
-	if (!pmd_present(*pmd))
-		return 0;
-
-	pte = pte_offset_kernel(pmd, address);
-	if (!pte_present(*pte))
-		return 0;
-	if ((error_code & 0x02) && !pte_write(*pte))
-		return 0;
-#ifdef CONFIG_X86_PAE
-	if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX))
-		return 0;
-#endif
-
-	return 1;
-}
-
-static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
-{
-	unsigned index = pgd_index(address);
-	pgd_t *pgd_k;
-	pud_t *pud, *pud_k;
-	pmd_t *pmd, *pmd_k;
-
-	pgd += index;
-	pgd_k = init_mm.pgd + index;
-
-	if (!pgd_present(*pgd_k))
-		return NULL;
-
-	/*
-	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
-	 * and redundant with the set_pmd() on non-PAE. As would
-	 * set_pud.
-	 */
-
-	pud = pud_offset(pgd, address);
-	pud_k = pud_offset(pgd_k, address);
-	if (!pud_present(*pud_k))
-		return NULL;
-
-	pmd = pmd_offset(pud, address);
-	pmd_k = pmd_offset(pud_k, address);
-	if (!pmd_present(*pmd_k))
-		return NULL;
-	if (!pmd_present(*pmd)) {
-		bool lazy = x86_read_percpu(xen_lazy_mmu);
-
-		x86_write_percpu(xen_lazy_mmu, false);
-#if CONFIG_XEN_COMPAT > 0x030002
-		set_pmd(pmd, *pmd_k);
-#else
-		/*
-		 * When running on older Xen we must launder *pmd_k through
-		 * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
-		 */
-		set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
-#endif
-		x86_write_percpu(xen_lazy_mmu, lazy);
-	} else
-		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
-	return pmd_k;
-}
-
-/*
- * Handle a fault on the vmalloc or module mapping area
- *
- * This assumes no large pages in there.
- */
-static inline int vmalloc_fault(unsigned long address)
-{
-	unsigned long pgd_paddr;
-	pmd_t *pmd_k;
-	pte_t *pte_k;
-	/*
-	 * Synchronize this task's top level page-table
-	 * with the 'reference' page table.
-	 *
-	 * Do _not_ use "current" here. We might be inside
-	 * an interrupt in the middle of a task switch..
-	 */
-	pgd_paddr = read_cr3();
-	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
-	if (!pmd_k)
-		return -1;
-	pte_k = pte_offset_kernel(pmd_k, address);
-	if (!pte_present(*pte_k))
-		return -1;
-	return 0;
-}
-
-int show_unhandled_signals = 1;
-
-/*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- *
- * error_code:
- *	bit 0 == 0 means no page found, 1 means protection fault
- *	bit 1 == 0 means read, 1 means write
- *	bit 2 == 0 means kernel, 1 means user-mode
- *	bit 3 == 1 means use of reserved bit detected
- *	bit 4 == 1 means fault was an instruction fetch
- */
-fastcall void __kprobes do_page_fault(struct pt_regs *regs,
-				      unsigned long error_code)
-{
-	struct task_struct *tsk;
-	struct mm_struct *mm;
-	struct vm_area_struct * vma;
-	unsigned long address;
-	int write, si_code;
-	int fault;
-
-	/*
-	 * We can fault from pretty much anywhere, with unknown IRQ state.
-	 */
-	trace_hardirqs_fixup();
-
-	/* get the address */
-        address = read_cr2();
-
-	/* Set the "privileged fault" bit to something sane. */
-	error_code &= ~4;
-	error_code |= (regs->xcs & 2) << 1;
-	if (regs->eflags & X86_EFLAGS_VM)
-		error_code |= 4;
-
-	tsk = current;
-
-	si_code = SEGV_MAPERR;
-
-	/*
-	 * We fault-in kernel-space virtual memory on-demand. The
-	 * 'reference' page table is init_mm.pgd.
-	 *
-	 * NOTE! We MUST NOT take any locks for this case. We may
-	 * be in an interrupt or a critical region, and should
-	 * only copy the information from the master page table,
-	 * nothing more.
-	 *
-	 * This verifies that the fault happens in kernel space
-	 * (error_code & 4) == 0, and that the fault was not a
-	 * protection error (error_code & 9) == 0.
-	 */
-	if (unlikely(address >= TASK_SIZE)) {
-#ifdef CONFIG_XEN
-		/* Faults in hypervisor area can never be patched up. */
-		if (address >= hypervisor_virt_start)
-			goto bad_area_nosemaphore;
-#endif
-		if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
-			return;
-		/* Can take a spurious fault if mapping changes R/O -> R/W. */
-		if (spurious_fault(regs, address, error_code))
-			return;
-		if (notify_page_fault(regs))
-			return;
-		/*
-		 * Don't take the mm semaphore here. If we fixup a prefetch
-		 * fault we could otherwise deadlock.
-		 */
-		goto bad_area_nosemaphore;
-	}
-
-	if (notify_page_fault(regs))
-		return;
-
-	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
-	   fault has been handled. */
-	if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
-		local_irq_enable();
-
-	mm = tsk->mm;
-
-	/*
-	 * If we're in an interrupt, have no user context or are running in an
-	 * atomic region then we must not take the fault..
-	 */
-	if (in_atomic() || !mm)
-		goto bad_area_nosemaphore;
-
-	/* When running in the kernel we expect faults to occur only to
-	 * addresses in user space.  All other faults represent errors in the
-	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
-	 * erroneous fault occurring in a code path which already holds mmap_sem
-	 * we will deadlock attempting to validate the fault against the
-	 * address space.  Luckily the kernel only validly references user
-	 * space from well defined areas of code, which are listed in the
-	 * exceptions table.
-	 *
-	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibility of a deadlock.
-	 * Attempt to lock the address space, if we cannot we then validate the
-	 * source.  If this is invalid we can skip the address space check,
-	 * thus avoiding the deadlock.
-	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
-		if ((error_code & 4) == 0 &&
-		    !search_exception_tables(regs->eip))
-			goto bad_area_nosemaphore;
-		down_read(&mm->mmap_sem);
-	}
-
-	vma = find_vma(mm, address);
-	if (!vma)
-		goto bad_area;
-	if (vma->vm_start <= address)
-		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
-	if (error_code & 4) {
-		/*
-		 * Accessing the stack below %esp is always a bug.
-		 * The large cushion allows instructions like enter
-		 * and pusha to work.  ("enter $65535,$31" pushes
-		 * 32 pointers and then decrements %esp by 65535.)
-		 */
-		if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
-			goto bad_area;
-	}
-	if (expand_stack(vma, address))
-		goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
-	si_code = SEGV_ACCERR;
-	write = 0;
-	switch (error_code & 3) {
-		default:	/* 3: write, present */
-				/* fall through */
-		case 2:		/* write, not present */
-			if (!(vma->vm_flags & VM_WRITE))
-				goto bad_area;
-			write++;
-			break;
-		case 1:		/* read, present */
-			goto bad_area;
-		case 0:		/* read, not present */
-			if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-				goto bad_area;
-	}
-
- survive:
-	/*
-	 * If for any reason at all we couldn't handle the fault,
-	 * make sure we exit gracefully rather than endlessly redo
-	 * the fault.
-	 */
-	fault = handle_mm_fault(mm, vma, address, write);
-	if (unlikely(fault & VM_FAULT_ERROR)) {
-		if (fault & VM_FAULT_OOM)
-			goto out_of_memory;
-		else if (fault & VM_FAULT_SIGBUS)
-			goto do_sigbus;
-		BUG();
-	}
-	if (fault & VM_FAULT_MAJOR)
-		tsk->maj_flt++;
-	else
-		tsk->min_flt++;
-
-	/*
-	 * Did it hit the DOS screen memory VA from vm86 mode?
-	 */
-	if (regs->eflags & VM_MASK) {
-		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
-		if (bit < 32)
-			tsk->thread.screen_bitmap |= 1 << bit;
-	}
-	up_read(&mm->mmap_sem);
-	return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-	up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-	/* User mode accesses just cause a SIGSEGV */
-	if (error_code & 4) {
-		/*
-		 * It's possible to have interrupts off here.
-		 */
-		local_irq_enable();
-
-		/*
-		 * Valid to do another page fault here because this one came
-		 * from user space.
-		 */
-		if (is_prefetch(regs, address, error_code))
-			return;
-
-		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-		    printk_ratelimit()) {
-			printk("%s%s[%d]: segfault at %08lx eip %08lx "
-			    "esp %08lx error %lx\n",
-			    task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-			    tsk->comm, task_pid_nr(tsk), address, regs->eip,
-			    regs->esp, error_code);
-		}
-		tsk->thread.cr2 = address;
-		/* Kernel addresses are always protection faults */
-		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
-		tsk->thread.trap_no = 14;
-		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
-		return;
-	}
-
-#ifdef CONFIG_X86_F00F_BUG
-	/*
-	 * Pentium F0 0F C7 C8 bug workaround.
-	 */
-	if (boot_cpu_data.f00f_bug) {
-		unsigned long nr;
-
-		nr = (address - idt_descr.address) >> 3;
-
-		if (nr == 6) {
-			do_invalid_op(regs, 0);
-			return;
-		}
-	}
-#endif
-
-no_context:
-	/* Are we prepared to handle this kernel fault?  */
-	if (fixup_exception(regs))
-		return;
-
-	/*
-	 * Valid to do another page fault here, because if this fault
-	 * had been triggered by is_prefetch fixup_exception would have
-	 * handled it.
-	 */
- 	if (is_prefetch(regs, address, error_code))
- 		return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-
-	bust_spinlocks(1);
-
-	if (oops_may_print()) {
-#ifdef CONFIG_X86_PAE
-		if (error_code & 16) {
-			pte_t *pte = lookup_address(address);
-
-			if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
-				printk(KERN_CRIT "kernel tried to execute "
-					"NX-protected page - exploit attempt? "
-					"(uid: %d)\n", current->uid);
-		}
-#endif
-		if (address < PAGE_SIZE)
-			printk(KERN_ALERT "BUG: unable to handle kernel NULL "
-					"pointer dereference");
-		else
-			printk(KERN_ALERT "BUG: unable to handle kernel paging"
-					" request");
-		printk(" at virtual address %08lx\n",address);
-		printk(KERN_ALERT "printing eip: %08lx\n", regs->eip);
-		dump_fault_path(address);
-	}
-	tsk->thread.cr2 = address;
-	tsk->thread.trap_no = 14;
-	tsk->thread.error_code = error_code;
-	die("Oops", regs, error_code);
-	bust_spinlocks(0);
-	do_exit(SIGKILL);
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
-	up_read(&mm->mmap_sem);
-	if (is_global_init(tsk)) {
-		yield();
-		down_read(&mm->mmap_sem);
-		goto survive;
-	}
-	printk("VM: killing process %s\n", tsk->comm);
-	if (error_code & 4)
-		do_group_exit(SIGKILL);
-	goto no_context;
-
-do_sigbus:
-	up_read(&mm->mmap_sem);
-
-	/* Kernel mode? Handle exceptions or die */
-	if (!(error_code & 4))
-		goto no_context;
-
-	/* User space => ok to do another page fault */
-	if (is_prefetch(regs, address, error_code))
-		return;
-
-	tsk->thread.cr2 = address;
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = 14;
-	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
-}
-
-void vmalloc_sync_all(void)
-{
-	/*
-	 * Note that races in the updates of insync and start aren't
-	 * problematic: insync can only get set bits added, and updates to
-	 * start are only improving performance (without affecting correctness
-	 * if undone).
-	 * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
-	 *      This change works just fine with 2-level paging too.
-	 */
-#define sync_index(a) ((a) >> PMD_SHIFT)
-	static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
-	static unsigned long start = TASK_SIZE;
-	unsigned long address;
-
-	if (SHARED_KERNEL_PMD)
-		return;
-
-	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
-	for (address = start;
-	     address >= TASK_SIZE && address < hypervisor_virt_start;
-	     address += 1UL << PMD_SHIFT) {
-		if (!test_bit(sync_index(address), insync)) {
-			unsigned long flags;
-			struct page *page;
-
-			spin_lock_irqsave(&pgd_lock, flags);
-			/* XEN: failure path assumes non-empty pgd_list. */
-			if (unlikely(!pgd_list)) {
-				spin_unlock_irqrestore(&pgd_lock, flags);
-				return;
-			}
-			for (page = pgd_list; page; page =
-					(struct page *)page->index)
-				if (!vmalloc_sync_one(page_address(page),
-								address)) {
-					BUG_ON(page != pgd_list);
-					break;
-				}
-			spin_unlock_irqrestore(&pgd_lock, flags);
-			if (!page)
-				set_bit(sync_index(address), insync);
-		}
-		if (address == start && test_bit(sync_index(address), insync))
-			start = address + (1UL << PMD_SHIFT);
-	}
-}
--- head-2010-04-29.orig/arch/x86/mm/fault_64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,686 +0,0 @@
-/*
- *  linux/arch/x86-64/mm/fault.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>		/* For unblank_screen() */
-#include <linux/compiler.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-#include <linux/kprobes.h>
-
-#include <asm/system.h>
-#include <asm/pgalloc.h>
-#include <asm/smp.h>
-#include <asm/tlbflush.h>
-#include <asm/proto.h>
-#include <asm-generic/sections.h>
-
-/* Page fault error code bits */
-#define PF_PROT	(1<<0)		/* or no page found */
-#define PF_WRITE	(1<<1)
-#define PF_USER	(1<<2)
-#define PF_RSVD	(1<<3)
-#define PF_INSTR	(1<<4)
-
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (!user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, 14))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-	return 0;
-}
-#endif
-
-/* Sometimes the CPU reports invalid exceptions on prefetch.
-   Check that here and ignore.
-   Opcode checker based on code by Richard Brunner */
-static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
-				unsigned long error_code)
-{
-	unsigned char *instr;
-	int scan_more = 1;
-	int prefetch = 0;
-	unsigned char *max_instr;
-
-	/* If it was a exec fault ignore */
-	if (error_code & PF_INSTR)
-		return 0;
-
-	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
-	max_instr = instr + 15;
-
-	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
-		return 0;
-
-	while (scan_more && instr < max_instr) {
-		unsigned char opcode;
-		unsigned char instr_hi;
-		unsigned char instr_lo;
-
-		if (probe_kernel_address(instr, opcode))
-			break;
-
-		instr_hi = opcode & 0xf0;
-		instr_lo = opcode & 0x0f;
-		instr++;
-
-		switch (instr_hi) {
-		case 0x20:
-		case 0x30:
-			/* Values 0x26,0x2E,0x36,0x3E are valid x86
-			   prefixes.  In long mode, the CPU will signal
-			   invalid opcode if some of these prefixes are
-			   present so we will never get here anyway */
-			scan_more = ((instr_lo & 7) == 0x6);
-			break;
-
-		case 0x40:
-			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
-			   Need to figure out under what instruction mode the
-			   instruction was issued ... */
-			/* Could check the LDT for lm, but for now it's good
-			   enough to assume that long mode only uses well known
-			   segments or kernel. */
-			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
-			break;
-
-		case 0x60:
-			/* 0x64 thru 0x67 are valid prefixes in all modes. */
-			scan_more = (instr_lo & 0xC) == 0x4;
-			break;
-		case 0xF0:
-			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
-			scan_more = !instr_lo || (instr_lo>>1) == 1;
-			break;
-		case 0x00:
-			/* Prefetch instruction is 0x0F0D or 0x0F18 */
-			scan_more = 0;
-			if (probe_kernel_address(instr, opcode))
-				break;
-			prefetch = (instr_lo == 0xF) &&
-				(opcode == 0x0D || opcode == 0x18);
-			break;
-		default:
-			scan_more = 0;
-			break;
-		}
-	}
-	return prefetch;
-}
-
-static int bad_address(void *p)
-{
-	unsigned long dummy;
-	return probe_kernel_address((unsigned long *)p, dummy);
-}
-
-void dump_pagetable(unsigned long address)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pgd = (pgd_t *)read_cr3();
-
-	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
-	pgd += pgd_index(address);
-	if (bad_address(pgd)) goto bad;
-	printk("PGD %lx ", pgd_val(*pgd));
-	if (!pgd_present(*pgd)) goto ret;
-
-	pud = pud_offset(pgd, address);
-	if (bad_address(pud)) goto bad;
-	printk("PUD %lx ", pud_val(*pud));
-	if (!pud_present(*pud))	goto ret;
-
-	pmd = pmd_offset(pud, address);
-	if (bad_address(pmd)) goto bad;
-	printk("PMD %lx ", pmd_val(*pmd));
-	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
-
-	pte = pte_offset_kernel(pmd, address);
-	if (bad_address(pte)) goto bad;
-	printk("PTE %lx", pte_val(*pte));
-ret:
-	printk("\n");
-	return;
-bad:
-	printk("BAD\n");
-}
-
-static const char errata93_warning[] =
-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
-KERN_ERR "******* Please consider a BIOS update.\n"
-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
-
-/* Workaround for K8 erratum #93 & buggy BIOS.
-   BIOS SMM functions are required to use a specific workaround
-   to avoid corruption of the 64bit RIP register on C stepping K8.
-   A lot of BIOS that didn't get tested properly miss this.
-   The OS sees this as a page fault with the upper 32bits of RIP cleared.
-   Try to work around it here.
-   Note we only handle faults in kernel here. */
-
-static int is_errata93(struct pt_regs *regs, unsigned long address)
-{
-	static int warned;
-	if (address != regs->rip)
-		return 0;
-	if ((address >> 32) != 0)
-		return 0;
-	address |= 0xffffffffUL << 32;
-	if ((address >= (u64)_stext && address <= (u64)_etext) ||
-	    (address >= MODULES_VADDR && address <= MODULES_END)) {
-		if (!warned) {
-			printk(errata93_warning);
-			warned = 1;
-		}
-		regs->rip = address;
-		return 1;
-	}
-	return 0;
-}
-
-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
-				 unsigned long error_code)
-{
-	unsigned long flags = oops_begin();
-	struct task_struct *tsk;
-
-	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
-	       current->comm, address);
-	dump_pagetable(address);
-	tsk = current;
-	tsk->thread.cr2 = address;
-	tsk->thread.trap_no = 14;
-	tsk->thread.error_code = error_code;
-	__die("Bad pagetable", regs, error_code);
-	oops_end(flags);
-	do_exit(SIGKILL);
-}
-
-/*
- * Handle a fault on the vmalloc area
- *
- * This assumes no large pages in there.
- */
-static int vmalloc_fault(unsigned long address)
-{
-	pgd_t *pgd, *pgd_ref;
-	pud_t *pud, *pud_ref;
-	pmd_t *pmd, *pmd_ref;
-	pte_t *pte, *pte_ref;
-
-	/* Copy kernel mappings over when needed. This can also
-	   happen within a race in page table update. In the later
-	   case just flush. */
-
-	/* On Xen the line below does not always work. Needs investigating! */
-	/*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
-	pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
-	pgd += pgd_index(address);
-	pgd_ref = pgd_offset_k(address);
-	if (pgd_none(*pgd_ref))
-		return -1;
-	if (pgd_none(*pgd))
-		set_pgd(pgd, *pgd_ref);
-	else
-		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-
-	/* Below here mismatches are bugs because these lower tables
-	   are shared */
-
-	pud = pud_offset(pgd, address);
-	pud_ref = pud_offset(pgd_ref, address);
-	if (pud_none(*pud_ref))
-		return -1;
-	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
-		BUG();
-	pmd = pmd_offset(pud, address);
-	pmd_ref = pmd_offset(pud_ref, address);
-	if (pmd_none(*pmd_ref))
-		return -1;
-	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
-		BUG();
-	pte_ref = pte_offset_kernel(pmd_ref, address);
-	if (!pte_present(*pte_ref))
-		return -1;
-	pte = pte_offset_kernel(pmd, address);
-	/* Don't use pte_page here, because the mappings can point
-	   outside mem_map, and the NUMA hash lookup cannot handle
-	   that. */
-	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
-		BUG();
-	return 0;
-}
-
-int show_unhandled_signals = 1;
-
-
-#define MEM_VERBOSE 1
-
-#ifdef MEM_VERBOSE
-#define MEM_LOG(_f, _a...)			\
-	printk("fault.c:[%d]-> " _f "\n",	\
-	__LINE__ , ## _a )
-#else
-#define MEM_LOG(_f, _a...) ((void)0)
-#endif
-
-static int spurious_fault(struct pt_regs *regs,
-			  unsigned long address,
-			  unsigned long error_code)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-#ifdef CONFIG_XEN
-	/* Faults in hypervisor area are never spurious. */
-	if ((address >= HYPERVISOR_VIRT_START) &&
-	    (address < HYPERVISOR_VIRT_END))
-		return 0;
-#endif
-
-	/* Reserved-bit violation or user access to kernel space? */
-	if (error_code & (PF_RSVD|PF_USER))
-		return 0;
-
-	pgd = init_mm.pgd + pgd_index(address);
-	if (!pgd_present(*pgd))
-		return 0;
-
-	pud = pud_offset(pgd, address);
-	if (!pud_present(*pud))
-		return 0;
-
-	pmd = pmd_offset(pud, address);
-	if (!pmd_present(*pmd))
-		return 0;
-
-	pte = pte_offset_kernel(pmd, address);
-	if (!pte_present(*pte))
-		return 0;
-	if ((error_code & PF_WRITE) && !pte_write(*pte))
-		return 0;
-	if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX))
-		return 0;
-
-	return 1;
-}
-
-/*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- */
-asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
-					unsigned long error_code)
-{
-	struct task_struct *tsk;
-	struct mm_struct *mm;
-	struct vm_area_struct * vma;
-	unsigned long address;
-	const struct exception_table_entry *fixup;
-	int write, fault;
-	unsigned long flags;
-	siginfo_t info;
-
-	if (!user_mode(regs))
-		error_code &= ~PF_USER; /* means kernel */
-
-	/*
-	 * We can fault from pretty much anywhere, with unknown IRQ state.
-	 */
-	trace_hardirqs_fixup();
-
-	tsk = current;
-	mm = tsk->mm;
-	prefetchw(&mm->mmap_sem);
-
-	/* get the address */
-	address = read_cr2();
-
-	info.si_code = SEGV_MAPERR;
-
-
-	/*
-	 * We fault-in kernel-space virtual memory on-demand. The
-	 * 'reference' page table is init_mm.pgd.
-	 *
-	 * NOTE! We MUST NOT take any locks for this case. We may
-	 * be in an interrupt or a critical region, and should
-	 * only copy the information from the master page table,
-	 * nothing more.
-	 *
-	 * This verifies that the fault happens in kernel space
-	 * (error_code & 4) == 0, and that the fault was not a
-	 * protection error (error_code & 9) == 0.
-	 */
-	if (unlikely(address >= TASK_SIZE64)) {
-		/*
-		 * Don't check for the module range here: its PML4
-		 * is always initialized because it's shared with the main
-		 * kernel text. Only vmalloc may need PML4 syncups.
-		 */
-		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
-		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
-			if (vmalloc_fault(address) >= 0)
-				return;
-		}
-		/* Can take a spurious fault if mapping changes R/O -> R/W. */
-		if (spurious_fault(regs, address, error_code))
-			return;
-		if (notify_page_fault(regs))
-			return;
-		/*
-		 * Don't take the mm semaphore here. If we fixup a prefetch
-		 * fault we could otherwise deadlock.
-		 */
-		goto bad_area_nosemaphore;
-	}
-
-	if (notify_page_fault(regs))
-		return;
-
-	if (likely(regs->eflags & X86_EFLAGS_IF))
-		local_irq_enable();
-
-	if (unlikely(error_code & PF_RSVD))
-		pgtable_bad(address, regs, error_code);
-
-	/*
-	 * If we're in an interrupt or have no user
-	 * context, we must not take the fault..
-	 */
-	if (unlikely(in_atomic() || !mm))
-		goto bad_area_nosemaphore;
-
-	/*
-	 * User-mode registers count as a user access even for any
-	 * potential system fault or CPU buglet.
-	 */
-	if (user_mode_vm(regs))
-		error_code |= PF_USER;
-
- again:
-	/* When running in the kernel we expect faults to occur only to
-	 * addresses in user space.  All other faults represent errors in the
-	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
-	 * erroneous fault occurring in a code path which already holds mmap_sem
-	 * we will deadlock attempting to validate the fault against the
-	 * address space.  Luckily the kernel only validly references user
-	 * space from well defined areas of code, which are listed in the
-	 * exceptions table.
-	 *
-	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibility of a deadlock.
-	 * Attempt to lock the address space, if we cannot we then validate the
-	 * source.  If this is invalid we can skip the address space check,
-	 * thus avoiding the deadlock.
-	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
-		if ((error_code & PF_USER) == 0 &&
-		    !search_exception_tables(regs->rip))
-			goto bad_area_nosemaphore;
-		down_read(&mm->mmap_sem);
-	}
-
-	vma = find_vma(mm, address);
-	if (!vma)
-		goto bad_area;
-	if (likely(vma->vm_start <= address))
-		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
-	if (error_code & 4) {
-		/* Allow userspace just enough access below the stack pointer
-		 * to let the 'enter' instruction work.
-		 */
-		if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
-			goto bad_area;
-	}
-	if (expand_stack(vma, address))
-		goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
-	info.si_code = SEGV_ACCERR;
-	write = 0;
-	switch (error_code & (PF_PROT|PF_WRITE)) {
-		default:	/* 3: write, present */
-			/* fall through */
-		case PF_WRITE:		/* write, not present */
-			if (!(vma->vm_flags & VM_WRITE))
-				goto bad_area;
-			write++;
-			break;
-		case PF_PROT:		/* read, present */
-			goto bad_area;
-		case 0:			/* read, not present */
-			if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-				goto bad_area;
-	}
-
-	/*
-	 * If for any reason at all we couldn't handle the fault,
-	 * make sure we exit gracefully rather than endlessly redo
-	 * the fault.
-	 */
-	fault = handle_mm_fault(mm, vma, address, write);
-	if (unlikely(fault & VM_FAULT_ERROR)) {
-		if (fault & VM_FAULT_OOM)
-			goto out_of_memory;
-		else if (fault & VM_FAULT_SIGBUS)
-			goto do_sigbus;
-		BUG();
-	}
-	if (fault & VM_FAULT_MAJOR)
-		tsk->maj_flt++;
-	else
-		tsk->min_flt++;
-	up_read(&mm->mmap_sem);
-	return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-	up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-	/* User mode accesses just cause a SIGSEGV */
-	if (error_code & PF_USER) {
-
-		/*
-		 * It's possible to have interrupts off here.
-		 */
-		local_irq_enable();
-
-		if (is_prefetch(regs, address, error_code))
-			return;
-
-		/* Work around K8 erratum #100 K8 in compat mode
-		   occasionally jumps to illegal addresses >4GB.  We
-		   catch this here in the page fault handler because
-		   these addresses are not reachable. Just detect this
-		   case and return.  Any code segment in LDT is
-		   compatibility mode. */
-		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
-		    (address >> 32))
-			return;
-
-		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-		    printk_ratelimit()) {
-			printk(
-		       "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
-					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
-					tsk->comm, tsk->pid, address, regs->rip,
-					regs->rsp, error_code);
-		}
-
-		tsk->thread.cr2 = address;
-		/* Kernel addresses are always protection faults */
-		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
-		tsk->thread.trap_no = 14;
-		info.si_signo = SIGSEGV;
-		info.si_errno = 0;
-		/* info.si_code has been set above */
-		info.si_addr = (void __user *)address;
-		force_sig_info(SIGSEGV, &info, tsk);
-		return;
-	}
-
-no_context:
-
-	/* Are we prepared to handle this kernel fault?  */
-	fixup = search_exception_tables(regs->rip);
-	if (fixup) {
-		regs->rip = fixup->fixup;
-		return;
-	}
-
-	/*
-	 * Hall of shame of CPU/BIOS bugs.
-	 */
-
- 	if (is_prefetch(regs, address, error_code))
- 		return;
-
-	if (is_errata93(regs, address))
-		return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-
-	flags = oops_begin();
-
-	if (address < PAGE_SIZE)
-		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
-	else
-		printk(KERN_ALERT "Unable to handle kernel paging request");
-	printk(" at %016lx RIP: \n" KERN_ALERT,address);
-	printk_address(regs->rip);
-	dump_pagetable(address);
-	tsk->thread.cr2 = address;
-	tsk->thread.trap_no = 14;
-	tsk->thread.error_code = error_code;
-	__die("Oops", regs, error_code);
-	/* Executive summary in case the body of the oops scrolled away */
-	printk(KERN_EMERG "CR2: %016lx\n", address);
-	oops_end(flags);
-	do_exit(SIGKILL);
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
-	up_read(&mm->mmap_sem);
-	if (is_global_init(current)) {
-		yield();
-		goto again;
-	}
-	printk("VM: killing process %s\n", tsk->comm);
-	if (error_code & 4)
-		do_group_exit(SIGKILL);
-	goto no_context;
-
-do_sigbus:
-	up_read(&mm->mmap_sem);
-
-	/* Kernel mode? Handle exceptions or die */
-	if (!(error_code & PF_USER))
-		goto no_context;
-
-	tsk->thread.cr2 = address;
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = 14;
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRERR;
-	info.si_addr = (void __user *)address;
-	force_sig_info(SIGBUS, &info, tsk);
-	return;
-}
-
-DEFINE_SPINLOCK(pgd_lock);
-LIST_HEAD(pgd_list);
-
-void vmalloc_sync_all(void)
-{
-	/* Note that races in the updates of insync and start aren't
-	   problematic:
-	   insync can only get set bits added, and updates to start are only
-	   improving performance (without affecting correctness if undone). */
-	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-	static unsigned long start = VMALLOC_START & PGDIR_MASK;
-	unsigned long address;
-
-	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
-		if (!test_bit(pgd_index(address), insync)) {
-			const pgd_t *pgd_ref = pgd_offset_k(address);
-			struct page *page;
-
-			if (pgd_none(*pgd_ref))
-				continue;
-			spin_lock(&pgd_lock);
-			list_for_each_entry(page, &pgd_list, lru) {
-				pgd_t *pgd;
-				pgd = (pgd_t *)page_address(page) + pgd_index(address);
-				if (pgd_none(*pgd))
-					set_pgd(pgd, *pgd_ref);
-				else
-					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-			}
-			spin_unlock(&pgd_lock);
-			set_bit(pgd_index(address), insync);
-		}
-		if (address == start)
-			start = address + PGDIR_SIZE;
-	}
-	/* Check that there is no need to do the same for the modules area. */
-	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
-	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
-				(__START_KERNEL & PGDIR_MASK)));
-}
--- head-2010-04-29.orig/arch/x86/mm/highmem_32-xen.c	2010-03-24 15:09:22.000000000 +0100
+++ head-2010-04-29/arch/x86/mm/highmem_32-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -18,6 +18,49 @@ void kunmap(struct page *page)
 	kunmap_high(page);
 }

+static void debug_kmap_atomic_prot(enum km_type type)	/* warn when kmap slot 'type' does not fit the calling context */
+{
+#ifdef CONFIG_DEBUG_HIGHMEM
+	static unsigned warn_count = 10;	/* warnings left; every WARN_ON below decrements it */
+
+	if (unlikely(warn_count == 0))	/* NOTE(review): two checks can fire in one call, so an unsigned 1 can step past 0 and wrap */
+		return;
+
+	if (unlikely(in_interrupt())) {	/* first check: slot type against interrupt context */
+		if (in_irq()) {	/* in_irq(): only the IRQ, BIO-IRQ and bounce-read slots pass */
+			if (type != KM_IRQ0 && type != KM_IRQ1 &&
+			    type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
+			    type != KM_BOUNCE_READ) {
+				WARN_ON(1);
+				warn_count--;
+			}
+		} else if (!irqs_disabled()) {	/* softirq: IRQ, SOFTIRQ, SKB and bounce-read slots pass */
+			if (type != KM_IRQ0 && type != KM_IRQ1 &&
+			    type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
+			    type != KM_SKB_SUNRPC_DATA &&
+			    type != KM_SKB_DATA_SOFTIRQ &&
+			    type != KM_BOUNCE_READ) {
+				WARN_ON(1);
+				warn_count--;
+			}
+		}
+	}
+
+	if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
+			type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+		if (!irqs_disabled()) {	/* second check: these slots warn when used with IRQs enabled */
+			WARN_ON(1);
+			warn_count--;
+		}
+	} else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
+		if (irq_count() == 0 && !irqs_disabled()) {	/* softirq slots warn when irq_count() is 0 and IRQs are enabled */
+			WARN_ON(1);
+			warn_count--;
+		}
+	}
+#endif
+}
+
 /*
  * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
  * no global lock is needed and because the kmap code must perform a global TLB
@@ -37,6 +80,8 @@ void *kmap_atomic_prot(struct page *page
 	if (!PageHighMem(page))
 		return page_address(page);

+	debug_kmap_atomic_prot(type);
+
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 	BUG_ON(!pte_none(*(kmap_pte-idx)));
--- head-2010-04-29.orig/arch/x86/mm/hypervisor.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/mm/hypervisor.c	2010-03-24 15:10:37.000000000 +0100
@@ -873,15 +873,11 @@ int xen_limit_pages_to_max_mfn(
 }
 EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);

-#ifdef __i386__
-int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
+int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)	/* hand the descriptor at desc to the hypervisor for slot ldt[entry] */
 {
-	__u32 *lp = (__u32 *)((char *)ldt + entry * 8);
-	maddr_t mach_lp = arbitrary_virt_to_machine(lp);
-	return HYPERVISOR_update_descriptor(
-		mach_lp, (u64)entry_a | ((u64)entry_b<<32));
+	maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry);	/* machine address of &ldt[entry] */
+	return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);	/* desc is read as a single u64; the hypercall result is returned unchanged */
 }
-#endif

 #define MAX_BATCHED_FULL_PTES 32

--- head-2010-04-29.orig/arch/x86/mm/init_32-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/mm/init_32-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -27,13 +27,13 @@
 #include <linux/bootmem.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
-#include <linux/efi.h>
 #include <linux/memory_hotplug.h>
 #include <linux/initrd.h>
 #include <linux/cpumask.h>
 #include <linux/dma-mapping.h>
 #include <linux/scatterlist.h>

+#include <asm/asm.h>
 #include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -42,18 +42,22 @@
 #include <asm/fixmap.h>
 #include <asm/e820.h>
 #include <asm/apic.h>
+#include <asm/bugs.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
 #include <asm/sections.h>
 #include <asm/hypervisor.h>
 #include <asm/swiotlb.h>
+#include <asm/setup.h>
+#include <asm/cacheflush.h>

 unsigned int __VMALLOC_RESERVE = 128 << 20;

 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;

-static int noinline do_test_wp_bit(void);
+static noinline int do_test_wp_bit(void);

 /*
  * Creates a middle page table and puts a pointer to it in the
@@ -64,17 +68,16 @@ static pmd_t * __init one_md_table_init(
 {
 	pud_t *pud;
 	pmd_t *pmd_table;
-
+
 #ifdef CONFIG_X86_PAE
 	if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
 		pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);

-		paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
+		paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
 		make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
 		set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
 		pud = pud_offset(pgd, 0);
-		if (pmd_table != pmd_offset(pud, 0))
-			BUG();
+		BUG_ON(pmd_table != pmd_offset(pud, 0));
 	}
 #endif
 	pud = pud_offset(pgd, 0);
@@ -85,7 +88,7 @@ static pmd_t * __init one_md_table_init(

 /*
  * Create a page table and place a pointer to it in a middle page
- * directory entry.
+ * directory entry:
  */
 static pte_t * __init one_page_table_init(pmd_t *pmd)
 {
@@ -99,9 +102,10 @@ static pte_t * __init one_page_table_ini
 #ifdef CONFIG_DEBUG_PAGEALLOC
 		page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
 #endif
-		if (!page_table)
+		if (!page_table) {
 			page_table =
 				(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+		}

 		paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
 		make_lowmem_page_readonly(page_table,
@@ -114,22 +118,21 @@ static pte_t * __init one_page_table_ini
 }

 /*
- * This function initializes a certain range of kernel virtual memory
+ * This function initializes a certain range of kernel virtual memory
  * with new bootmem page tables, everywhere page tables are missing in
  * the given range.
- */
-
-/*
- * NOTE: The pagetables are allocated contiguous on the physical space
- * so we can cache the place of the first one and move around without
+ *
+ * NOTE: The pagetables are allocated contiguous on the physical space
+ * so we can cache the place of the first one and move around without
  * checking the pgd every time.
  */
-static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
+static void __init
+page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 {
-	pgd_t *pgd;
-	pmd_t *pmd;
 	int pgd_idx, pmd_idx;
 	unsigned long vaddr;
+	pgd_t *pgd;
+	pmd_t *pmd;

 	vaddr = start;
 	pgd_idx = pgd_index(vaddr);
@@ -139,7 +142,8 @@ static void __init page_table_range_init
 	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
 		pmd = one_md_table_init(pgd);
 		pmd = pmd + pmd_index(vaddr);
-		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
+		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+							pmd++, pmd_idx++) {
 			if (vaddr < hypervisor_virt_start)
 				one_page_table_init(pmd);

@@ -157,17 +161,17 @@ static inline int is_kernel_text(unsigne
 }

 /*
- * This maps the physical memory to kernel virtual address space, a total
- * of max_low_pfn pages, by creating page tables starting from address
- * PAGE_OFFSET.
+ * This maps the physical memory to kernel virtual address space, a total
+ * of max_low_pfn pages, by creating page tables starting from address
+ * PAGE_OFFSET:
  */
 static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 {
+	int pgd_idx, pmd_idx, pte_ofs;
 	unsigned long pfn;
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte;
-	int pgd_idx, pmd_idx, pte_ofs;

 	unsigned long max_ram_pfn = xen_start_info->nr_pages;
 	if (max_ram_pfn > max_low_pfn)
@@ -195,36 +199,49 @@ static void __init kernel_physical_mappi
 		if (pfn >= max_low_pfn)
 			continue;
 		pmd += pmd_idx;
-		for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
-			unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
-			if (address >= hypervisor_virt_start)
+		for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+		     pmd++, pmd_idx++) {
+			unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
+
+			if (addr >= hypervisor_virt_start)
 				continue;

-			/* Map with big pages if possible, otherwise create normal page tables. */
+			/*
+			 * Map with big pages if possible, otherwise
+			 * create normal page tables:
+			 */
 			if (cpu_has_pse) {
-				unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
-				if (is_kernel_text(address) || is_kernel_text(address2))
-					set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
-				else
-					set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
+				unsigned int addr2;
+				pgprot_t prot = PAGE_KERNEL_LARGE;
+
+				addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
+					PAGE_OFFSET + PAGE_SIZE-1;
+
+				if (is_kernel_text(addr) ||
+				    is_kernel_text(addr2))
+					prot = PAGE_KERNEL_LARGE_EXEC;
+
+				set_pmd(pmd, pfn_pmd(pfn, prot));

 				pfn += PTRS_PER_PTE;
-			} else {
-				pte = one_page_table_init(pmd);
+				continue;
+			}
+			pte = one_page_table_init(pmd);
+
+			for (pte += pte_ofs;
+			     pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+			     pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
+				pgprot_t prot = PAGE_KERNEL;
+
+				/* XEN: Only map initial RAM allocation. */
+				if ((pfn >= max_ram_pfn) || pte_present(*pte))
+					continue;
+				if (is_kernel_text(addr))
+					prot = PAGE_KERNEL_EXEC;

-				for (pte += pte_ofs;
-				     pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
-				     pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
-					/* XEN: Only map initial RAM allocation. */
-					if ((pfn >= max_ram_pfn) || pte_present(*pte))
-						continue;
-					if (is_kernel_text(address))
-						set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
-					else
-						set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
-				}
-				pte_ofs = 0;
+				set_pte(pte, pfn_pte(pfn, prot));
 			}
+			pte_ofs = 0;
 		}
 		pmd_idx = 0;
 	}
@@ -245,57 +262,23 @@ static inline int page_kills_ppro(unsign

 #endif

-int page_is_ram(unsigned long pagenr)
-{
-	int i;
-	unsigned long addr, end;
-
-	if (efi_enabled) {
-		efi_memory_desc_t *md;
-		void *p;
-
-		for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-			md = p;
-			if (!is_available_memory(md))
-				continue;
-			addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
-			end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
-
-			if ((pagenr >= addr) && (pagenr < end))
-				return 1;
-		}
-		return 0;
-	}
-
-	for (i = 0; i < e820.nr_map; i++) {
-
-		if (e820.map[i].type != E820_RAM)	/* not usable memory */
-			continue;
-		/*
-		 *	!!!FIXME!!! Some BIOSen report areas as RAM that
-		 *	are not. Notably the 640->1Mb area. We need a sanity
-		 *	check here.
-		 */
-		addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
-		end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
-		if  ((pagenr >= addr) && (pagenr < end))
-			return 1;
-	}
-	return 0;
-}
-
 #ifdef CONFIG_HIGHMEM
 pte_t *kmap_pte;
 pgprot_t kmap_prot;

-#define kmap_get_fixmap_pte(vaddr)					\
-	pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
+static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)	/* returns the pte pointer for vaddr via pgd_offset_k -> pud -> pmd -> pte */
+{
+	return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
+			vaddr), vaddr), vaddr);
+}

 static void __init kmap_init(void)
 {
 	unsigned long kmap_vstart;

-	/* cache the first kmap pte */
+	/*
+	 * Cache the first kmap pte:
+	 */
 	kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
 	kmap_pte = kmap_get_fixmap_pte(kmap_vstart);

@@ -304,11 +287,11 @@ static void __init kmap_init(void)

 static void __init permanent_kmaps_init(pgd_t *pgd_base)
 {
+	unsigned long vaddr;
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
-	unsigned long vaddr;

 	vaddr = PKMAP_BASE;
 	page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
@@ -317,7 +300,7 @@ static void __init permanent_kmaps_init(
 	pud = pud_offset(pgd, vaddr);
 	pmd = pmd_offset(pud, vaddr);
 	pte = pte_offset_kernel(pmd, vaddr);
-	pkmap_page_table = pte;
+	pkmap_page_table = pte;
 }

 static void __meminit free_new_highpage(struct page *page, int pfn)
@@ -336,7 +319,8 @@ void __init add_one_highpage_init(struct
 		SetPageReserved(page);
 }

-static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
+static int __meminit
+add_one_highpage_hotplug(struct page *page, unsigned long pfn)
 {
 	free_new_highpage(page, pfn);
 	totalram_pages++;
@@ -344,6 +328,7 @@ static int __meminit add_one_highpage_ho
 	max_mapnr = max(pfn, max_mapnr);
 #endif
 	num_physpages++;
+
 	return 0;
 }

@@ -351,7 +336,7 @@ static int __meminit add_one_highpage_ho
  * Not currently handling the NUMA case.
  * Assuming single node and all memory that
  * has been added dynamically that would be
- * onlined here is in HIGHMEM
+ * onlined here is in HIGHMEM.
  */
 void __meminit online_page(struct page *page)
 {
@@ -359,13 +344,11 @@ void __meminit online_page(struct page *
 	add_one_highpage_hotplug(page, page_to_pfn(page));
 }

-
-#ifdef CONFIG_NUMA
-extern void set_highmem_pages_init(int);
-#else
+#ifndef CONFIG_NUMA
 static void __init set_highmem_pages_init(int bad_ppro)
 {
 	int pfn;
+
 	for (pfn = highstart_pfn; pfn < highend_pfn
 				  && pfn < xen_start_info->nr_pages; pfn++) {
 		/*
@@ -383,23 +366,18 @@ static void __init set_highmem_pages_ini

 	totalram_pages += totalhigh_pages;
 }
-#endif /* CONFIG_FLATMEM */
+#endif /* !CONFIG_NUMA */

 #else
-#define kmap_init() do { } while (0)
-#define permanent_kmaps_init(pgd_base) do { } while (0)
-#define set_highmem_pages_init(bad_ppro) do { } while (0)
+# define kmap_init()				do { } while (0)
+# define permanent_kmaps_init(pgd_base)		do { } while (0)
+# define set_highmem_pages_init(bad_ppro)	do { } while (0)
 #endif /* CONFIG_HIGHMEM */

-unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
+pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
 EXPORT_SYMBOL(__PAGE_KERNEL);
-unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;

-#ifdef CONFIG_NUMA
-extern void __init remap_numa_kva(void);
-#else
-#define remap_numa_kva() do {} while (0)
-#endif
+pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;

 pgd_t *swapper_pg_dir;

@@ -417,9 +395,8 @@ static void __init xen_pagetable_setup_d
  * the boot process.
  *
  * If we're booting on native hardware, this will be a pagetable
- * constructed in arch/i386/kernel/head.S, and not running in PAE mode
- * (even if we'll end up running in PAE).  The root of the pagetable
- * will be swapper_pg_dir.
+ * constructed in arch/x86/kernel/head_32.S.  The root of the
+ * pagetable will be swapper_pg_dir.
  *
  * If we're booting paravirtualized under a hypervisor, then there are
  * more options: we may already be running PAE, and the pagetable may
@@ -431,10 +408,10 @@ static void __init xen_pagetable_setup_d
  * be partially populated, and so it avoids stomping on any existing
  * mappings.
  */
-static void __init pagetable_init (void)
+static void __init pagetable_init(void)
 {
-	unsigned long vaddr, end;
 	pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
+	unsigned long vaddr, end;

 	xen_pagetable_setup_start(pgd_base);

@@ -456,34 +433,36 @@ static void __init pagetable_init (void)
 	 * Fixed mappings, only the page table structure has to be
 	 * created - mappings will be set by set_fixmap():
 	 */
+	early_ioremap_clear();
 	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
 	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
 	page_table_range_init(vaddr, end, pgd_base);
+	early_ioremap_reset();

 	permanent_kmaps_init(pgd_base);

 	xen_pagetable_setup_done(pgd_base);
 }

-#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
+#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
 /*
- * Swap suspend & friends need this for resume because things like the intel-agp
+ * ACPI suspend needs this for resume, because things like the intel-agp
  * driver might have split up a kernel 4MB mapping.
  */
-char __nosavedata swsusp_pg_dir[PAGE_SIZE]
-	__attribute__ ((aligned (PAGE_SIZE)));
+char swsusp_pg_dir[PAGE_SIZE]
+	__attribute__ ((aligned(PAGE_SIZE)));

 static inline void save_pg_dir(void)
 {
 	memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
 }
-#else
+#else /* !CONFIG_ACPI_SLEEP */
 static inline void save_pg_dir(void)
 {
 }
-#endif
+#endif /* !CONFIG_ACPI_SLEEP */

-void zap_low_mappings (void)
+void zap_low_mappings(void)
 {
 	int i;

@@ -495,22 +474,24 @@ void zap_low_mappings (void)
 	 * Note that "pgd_clear()" doesn't do it for
 	 * us, because pgd_clear() is a no-op on i386.
 	 */
-	for (i = 0; i < USER_PTRS_PER_PGD; i++)
+	for (i = 0; i < USER_PTRS_PER_PGD; i++) {
 #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
 		set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
 #else
 		set_pgd(swapper_pg_dir+i, __pgd(0));
 #endif
+	}
 	flush_tlb_all();
 }

-int nx_enabled = 0;
+int nx_enabled;
+
+pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);

 #ifdef CONFIG_X86_PAE

-static int disable_nx __initdata = 0;
-u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
-EXPORT_SYMBOL_GPL(__supported_pte_mask);
+static int disable_nx __initdata;

 /*
  * noexec = on|off
@@ -527,11 +508,14 @@ static int __init noexec_setup(char *str
 			__supported_pte_mask |= _PAGE_NX;
 			disable_nx = 0;
 		}
-	} else if (!strcmp(str,"off")) {
-		disable_nx = 1;
-		__supported_pte_mask &= ~_PAGE_NX;
-	} else
-		return -EINVAL;
+	} else {
+		if (!strcmp(str, "off")) {
+			disable_nx = 1;
+			__supported_pte_mask &= ~_PAGE_NX;
+		} else {
+			return -EINVAL;
+		}
+	}

 	return 0;
 }
@@ -543,6 +527,7 @@ static void __init set_nx(void)

 	if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
 		cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+
 		if ((v[3] & (1 << 20)) && !disable_nx) {
 			rdmsr(MSR_EFER, l, h);
 			l |= EFER_NX;
@@ -552,35 +537,6 @@ static void __init set_nx(void)
 		}
 	}
 }
-
-/*
- * Enables/disables executability of a given kernel page and
- * returns the previous setting.
- */
-int __init set_kernel_exec(unsigned long vaddr, int enable)
-{
-	pte_t *pte;
-	int ret = 1;
-
-	if (!nx_enabled)
-		goto out;
-
-	pte = lookup_address(vaddr);
-	BUG_ON(!pte);
-
-	if (!pte_exec_kernel(*pte))
-		ret = 0;
-
-	if (enable)
-		pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
-	else
-		pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
-	pte_update_defer(&init_mm, vaddr, pte);
-	__flush_tlb_all();
-out:
-	return ret;
-}
-
 #endif

 /*
@@ -597,21 +553,10 @@ void __init paging_init(void)
 #ifdef CONFIG_X86_PAE
 	set_nx();
 	if (nx_enabled)
-		printk("NX (Execute Disable) protection: active\n");
+		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
 #endif
-
 	pagetable_init();

-#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
-	/*
-	 * We will bail out later - printk doesn't work right now so
-	 * the user would just see a hanging kernel.
-	 * when running as xen domain we are already in PAE mode at
-	 * this point.
-	 */
-	if (cpu_has_pae)
-		set_in_cr4(X86_CR4_PAE);
-#endif
 	__flush_tlb_all();

 	kmap_init();
@@ -638,10 +583,10 @@ void __init paging_init(void)
  * used to involve black magic jumps to work around some nasty CPU bugs,
  * but fortunately the switch to using exceptions got rid of all that.
  */
-
 static void __init test_wp_bit(void)
 {
-	printk("Checking if this processor honours the WP bit even in supervisor mode... ");
+	printk(KERN_INFO
+  "Checking if this processor honours the WP bit even in supervisor mode...");

 	/* Any page-aligned address will do, the test is non-destructive */
 	__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
@@ -649,23 +594,22 @@ static void __init test_wp_bit(void)
 	clear_fixmap(FIX_WP_TEST);

 	if (!boot_cpu_data.wp_works_ok) {
-		printk("No.\n");
+		printk(KERN_CONT "No.\n");
 #ifdef CONFIG_X86_WP_WORKS_OK
-		panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
+		panic(
+  "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
 #endif
 	} else {
-		printk("Ok.\n");
+		printk(KERN_CONT "Ok.\n");
 	}
 }

-static struct kcore_list kcore_mem, kcore_vmalloc;
+static struct kcore_list kcore_mem, kcore_vmalloc;

 void __init mem_init(void)
 {
-	extern int ppro_with_ram_bug(void);
 	int codesize, reservedpages, datasize, initsize;
-	int tmp;
-	int bad_ppro;
+	int tmp, bad_ppro;
 	unsigned long pfn;

 #if defined(CONFIG_SWIOTLB)
@@ -675,19 +619,19 @@ void __init mem_init(void)
 #ifdef CONFIG_FLATMEM
 	BUG_ON(!mem_map);
 #endif
-
 	bad_ppro = ppro_with_ram_bug();

 #ifdef CONFIG_HIGHMEM
 	/* check that fixmap and pkmap do not overlap */
-	if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
-		printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
+	if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
+		printk(KERN_ERR
+			"fixmap and kmap areas overlap - this will crash\n");
 		printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
-				PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
+				PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
+				FIXADDR_START);
 		BUG();
 	}
 #endif
-
 	/* this will put all low memory onto the freelists */
 	totalram_pages += free_all_bootmem();
 	/* XEN: init low-mem pages outside initial allocation. */
@@ -699,7 +643,7 @@ void __init mem_init(void)
 	reservedpages = 0;
 	for (tmp = 0; tmp < max_low_pfn; tmp++)
 		/*
-		 * Only count reserved RAM pages
+		 * Only count reserved RAM pages:
 		 */
 		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
 			reservedpages++;
@@ -710,11 +654,12 @@ void __init mem_init(void)
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

-	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
-	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
+	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
 		   VMALLOC_END-VMALLOC_START);

-	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
+	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
+			"%dk reserved, %dk data, %dk init, %ldk highmem)\n",
 		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
 		num_physpages << (PAGE_SHIFT-10),
 		codesize >> 10,
@@ -725,54 +670,53 @@ void __init mem_init(void)
 	       );

 #if 1 /* double-sanity-check paranoia */
-	printk("virtual kernel memory layout:\n"
-	       "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+	printk(KERN_INFO "virtual kernel memory layout:\n"
+		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #ifdef CONFIG_HIGHMEM
-	       "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+		"    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #endif
-	       "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
-	       "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
-	       "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-	       "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-	       "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
-	       FIXADDR_START, FIXADDR_TOP,
-	       (FIXADDR_TOP - FIXADDR_START) >> 10,
+		"    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+		"    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+		"      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+		"      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+		"      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
+		FIXADDR_START, FIXADDR_TOP,
+		(FIXADDR_TOP - FIXADDR_START) >> 10,

 #ifdef CONFIG_HIGHMEM
-	       PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
-	       (LAST_PKMAP*PAGE_SIZE) >> 10,
+		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+		(LAST_PKMAP*PAGE_SIZE) >> 10,
 #endif

-	       VMALLOC_START, VMALLOC_END,
-	       (VMALLOC_END - VMALLOC_START) >> 20,
+		VMALLOC_START, VMALLOC_END,
+		(VMALLOC_END - VMALLOC_START) >> 20,

-	       (unsigned long)__va(0), (unsigned long)high_memory,
-	       ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+		(unsigned long)__va(0), (unsigned long)high_memory,
+		((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,

-	       (unsigned long)&__init_begin, (unsigned long)&__init_end,
-	       ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
+		(unsigned long)&__init_begin, (unsigned long)&__init_end,
+		((unsigned long)&__init_end -
+		 (unsigned long)&__init_begin) >> 10,

-	       (unsigned long)&_etext, (unsigned long)&_edata,
-	       ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+		(unsigned long)&_etext, (unsigned long)&_edata,
+		((unsigned long)&_edata - (unsigned long)&_etext) >> 10,

-	       (unsigned long)&_text, (unsigned long)&_etext,
-	       ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+		(unsigned long)&_text, (unsigned long)&_etext,
+		((unsigned long)&_etext - (unsigned long)&_text) >> 10);

 #ifdef CONFIG_HIGHMEM
-	BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
-	BUG_ON(VMALLOC_END                     > PKMAP_BASE);
+	BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE	> FIXADDR_START);
+	BUG_ON(VMALLOC_END				> PKMAP_BASE);
 #endif
-	BUG_ON(VMALLOC_START                   > VMALLOC_END);
-	BUG_ON((unsigned long)high_memory      > VMALLOC_START);
+	BUG_ON(VMALLOC_START				> VMALLOC_END);
+	BUG_ON((unsigned long)high_memory		> VMALLOC_START);
 #endif /* double-sanity-check paranoia */

-#ifdef CONFIG_X86_PAE
-	if (!cpu_has_pae)
-		panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
-#endif
 	if (boot_cpu_data.wp_works_ok < 0)
 		test_wp_bit();

+	cpa_init();
+
 	/*
 	 * Subtle. SMP is doing it's boot stuff late (because it has to
 	 * fork idle threads) - but it also needs low mappings for the
@@ -796,49 +740,35 @@ int arch_add_memory(int nid, u64 start,

 	return __add_pages(zone, start_pfn, nr_pages);
 }
-
 #endif

-struct kmem_cache *pmd_cache;
-
-void __init pgtable_cache_init(void)
-{
-	if (PTRS_PER_PMD > 1)
-		pmd_cache = kmem_cache_create("pmd",
-					      PTRS_PER_PMD*sizeof(pmd_t),
-					      PTRS_PER_PMD*sizeof(pmd_t),
-					      SLAB_PANIC,
-					      pmd_ctor);
-}
-
 /*
  * This function cannot be __init, since exceptions don't work in that
  * section.  Put this after the callers, so that it cannot be inlined.
  */
-static int noinline do_test_wp_bit(void)
+static noinline int do_test_wp_bit(void)
 {
 	char tmp_reg;
 	int flag;

 	__asm__ __volatile__(
-		"	movb %0,%1	\n"
-		"1:	movb %1,%0	\n"
-		"	xorl %2,%2	\n"
+		"	movb %0, %1	\n"
+		"1:	movb %1, %0	\n"
+		"	xorl %2, %2	\n"
 		"2:			\n"
-		".section __ex_table,\"a\"\n"
-		"	.align 4	\n"
-		"	.long 1b,2b	\n"
-		".previous		\n"
+		_ASM_EXTABLE(1b,2b)
 		:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
 		 "=q" (tmp_reg),
 		 "=r" (flag)
 		:"2" (1)
 		:"memory");
-
+
 	return flag;
 }

 #ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);

 void mark_rodata_ro(void)
 {
@@ -851,32 +781,58 @@ void mark_rodata_ro(void)
 	if (num_possible_cpus() <= 1)
 #endif
 	{
-		change_page_attr(virt_to_page(start),
-		                 size >> PAGE_SHIFT, PAGE_KERNEL_RX);
-		printk("Write protecting the kernel text: %luk\n", size >> 10);
+		set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+		printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+			size >> 10);
+
+#ifdef CONFIG_CPA_DEBUG
+		printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
+			start, start+size);
+		set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+
+		printk(KERN_INFO "Testing CPA: write protecting again\n");
+		set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
+#endif
 	}
 #endif
 	start += size;
 	size = (unsigned long)__end_rodata - start;
-	change_page_attr(virt_to_page(start),
-	                 size >> PAGE_SHIFT, PAGE_KERNEL_RO);
-	printk("Write protecting the kernel read-only data: %luk\n",
-	       size >> 10);
+	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+		size >> 10);
+	rodata_test();
+
+#ifdef CONFIG_CPA_DEBUG
+	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+	set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);

-	/*
-	 * change_page_attr() requires a global_flush_tlb() call after it.
-	 * We do this after the printk so that if something went wrong in the
-	 * change, the printk gets out at least to give a better debug hint
-	 * of who is the culprit.
-	 */
-	global_flush_tlb();
+	printk(KERN_INFO "Testing CPA: write protecting again\n");
+	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+#endif
 }
 #endif

 void free_init_pages(char *what, unsigned long begin, unsigned long end)
 {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * If debugging page accesses then do not free this memory but
+	 * mark them not present - any buggy init-section access will
+	 * create a kernel page fault:
+	 */
+	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+		begin, PAGE_ALIGN(end));
+	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
 	unsigned long addr;

+	/*
+	 * We just marked the kernel text read-only above. Since we are
+	 * about to free part of it, that part must be made writable
+	 * again first.
+	 */
+	set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
@@ -885,6 +841,7 @@ void free_init_pages(char *what, unsigne
 		totalram_pages++;
 	}
 	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+#endif
 }

 void free_initmem(void)
@@ -900,4 +857,3 @@ void free_initrd_mem(unsigned long start
 	free_init_pages("initrd memory", start, end);
 }
 #endif
-
--- head-2010-04-29.orig/arch/x86/mm/init_64-xen.c	2010-04-29 09:48:00.000000000 +0200
+++ head-2010-04-29/arch/x86/mm/init_64-xen.c	2010-04-29 09:50:58.000000000 +0200
@@ -46,14 +46,13 @@
 #include <asm/proto.h>
 #include <asm/smp.h>
 #include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/numa.h>
+#include <asm/cacheflush.h>

 #include <xen/features.h>

-#ifndef Dprintk
-#define Dprintk(x...)
-#endif
-
-const struct dma_mapping_ops* dma_ops;
+const struct dma_mapping_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);

 #if CONFIG_XEN_COMPAT <= 0x030002
@@ -80,7 +79,21 @@ extern pte_t level1_fixmap_pgt[PTRS_PER_
 	(((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +	\
 	__START_KERNEL_map)))

-static void __meminit early_make_page_readonly(void *va, unsigned int feature)
+pmd_t *__init early_get_pmd(unsigned long va)
+{
+	unsigned long addr;
+	unsigned long *page = (unsigned long *)init_level4_pgt;
+
+	addr = page[pgd_index(va)];
+	addr_to_page(addr, page);
+
+	addr = page[pud_index(va)];
+	addr_to_page(addr, page);
+
+	return (pmd_t *)&page[pmd_index(va)];
+}
+
+void __meminit early_make_page_readonly(void *va, unsigned int feature)
 {
 	unsigned long addr, _va = (unsigned long)va;
 	pte_t pte, *ptep;
@@ -107,76 +120,6 @@ static void __meminit early_make_page_re
 		BUG();
 }

-static void __make_page_readonly(void *va)
-{
-	pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
-	unsigned long addr = (unsigned long) va;
-
-	pgd = pgd_offset_k(addr);
-	pud = pud_offset(pgd, addr);
-	pmd = pmd_offset(pud, addr);
-	ptep = pte_offset_kernel(pmd, addr);
-
-	pte.pte = ptep->pte & ~_PAGE_RW;
-	if (HYPERVISOR_update_va_mapping(addr, pte, 0))
-		xen_l1_entry_update(ptep, pte); /* fallback */
-
-	if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
-		__make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
-}
-
-static void __make_page_writable(void *va)
-{
-	pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
-	unsigned long addr = (unsigned long) va;
-
-	pgd = pgd_offset_k(addr);
-	pud = pud_offset(pgd, addr);
-	pmd = pmd_offset(pud, addr);
-	ptep = pte_offset_kernel(pmd, addr);
-
-	pte.pte = ptep->pte | _PAGE_RW;
-	if (HYPERVISOR_update_va_mapping(addr, pte, 0))
-		xen_l1_entry_update(ptep, pte); /* fallback */
-
-	if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
-		__make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
-}
-
-void make_page_readonly(void *va, unsigned int feature)
-{
-	if (!xen_feature(feature))
-		__make_page_readonly(va);
-}
-
-void make_page_writable(void *va, unsigned int feature)
-{
-	if (!xen_feature(feature))
-		__make_page_writable(va);
-}
-
-void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
-{
-	if (xen_feature(feature))
-		return;
-
-	while (nr-- != 0) {
-		__make_page_readonly(va);
-		va = (void*)((unsigned long)va + PAGE_SIZE);
-	}
-}
-
-void make_pages_writable(void *va, unsigned nr, unsigned int feature)
-{
-	if (xen_feature(feature))
-		return;
-
-	while (nr-- != 0) {
-		__make_page_writable(va);
-		va = (void*)((unsigned long)va + PAGE_SIZE);
-	}
-}
-
 /*
  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
  * physical space so we can cache the place of the first one and move
@@ -187,22 +130,26 @@ void show_mem(void)
 {
 	long i, total = 0, reserved = 0;
 	long shared = 0, cached = 0;
-	pg_data_t *pgdat;
 	struct page *page;
+	pg_data_t *pgdat;

 	printk(KERN_INFO "Mem-info:\n");
 	show_free_areas();
-	printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+	printk(KERN_INFO "Free swap:       %6ldkB\n",
+		nr_swap_pages << (PAGE_SHIFT-10));

 	for_each_online_pgdat(pgdat) {
-               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
-			/* this loop can take a while with 256 GB and 4k pages
-			   so update the NMI watchdog */
-			if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
+		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+			/*
+			 * This loop can take a while with 256 GB and
+			 * 4k pages so defer the NMI watchdog:
+			 */
+			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
 				touch_nmi_watchdog();
-			}
+
 			if (!pfn_valid(pgdat->node_start_pfn + i))
 				continue;
+
 			page = pfn_to_page(pgdat->node_start_pfn + i);
 			total++;
 			if (PageReserved(page))
@@ -211,58 +158,67 @@ void show_mem(void)
 				cached++;
 			else if (page_count(page))
 				shared += page_count(page) - 1;
-               }
+		}
 	}
-	printk(KERN_INFO "%lu pages of RAM\n", total);
-	printk(KERN_INFO "%lu reserved pages\n",reserved);
-	printk(KERN_INFO "%lu pages shared\n",shared);
-	printk(KERN_INFO "%lu pages swap cached\n",cached);
+	printk(KERN_INFO "%lu pages of RAM\n",		total);
+	printk(KERN_INFO "%lu reserved pages\n",	reserved);
+	printk(KERN_INFO "%lu pages shared\n",		shared);
+	printk(KERN_INFO "%lu pages swap cached\n",	cached);
 }

+static unsigned long __meminitdata table_start;
+static unsigned long __meminitdata table_end;

 static __init void *spp_getpage(void)
-{
+{
 	void *ptr;
+
 	if (after_bootmem)
-		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
+		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
 	else if (start_pfn < table_end) {
 		ptr = __va(start_pfn << PAGE_SHIFT);
 		start_pfn++;
 		memset(ptr, 0, PAGE_SIZE);
 	} else
 		ptr = alloc_bootmem_pages(PAGE_SIZE);
-	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
-		panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");

-	Dprintk("spp_getpage %p\n", ptr);
+	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
+		panic("set_pte_phys: cannot allocate page data %s\n",
+			after_bootmem ? "after bootmem" : "");
+	}
+
+	pr_debug("spp_getpage %p\n", ptr);
+
 	return ptr;
-}
+}

 #define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
 #define pud_offset_u(address) (level3_user_pgt + pud_index(address))

-static __init void set_pte_phys(unsigned long vaddr,
-			 unsigned long phys, pgprot_t prot, int user_mode)
+static __init void
+set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte, new_pte;

-	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
+	pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);

 	pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
 	if (pgd_none(*pgd)) {
-		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
+		printk(KERN_ERR
+			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
 		return;
 	}
 	pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
 	if (pud_none(*pud)) {
-		pmd = (pmd_t *) spp_getpage();
+		pmd = (pmd_t *) spp_getpage();
 		make_page_readonly(pmd, XENFEAT_writable_page_tables);
 		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
 		if (pmd != pmd_offset(pud, 0)) {
-			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
+			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+				pmd, pmd_offset(pud, 0));
 			return;
 		}
 	}
@@ -272,7 +228,7 @@ static __init void set_pte_phys(unsigned
 		make_page_readonly(pte, XENFEAT_writable_page_tables);
 		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
 		if (pte != pte_offset_kernel(pmd, 0)) {
-			printk("PAGETABLE BUG #02!\n");
+			printk(KERN_ERR "PAGETABLE BUG #02!\n");
 			return;
 		}
 	}
@@ -294,30 +250,30 @@ static __init void set_pte_phys(unsigned
 	__flush_tlb_one(vaddr);
 }

-static __init void set_pte_phys_ma(unsigned long vaddr,
-				   unsigned long phys, pgprot_t prot)
+static __init void
+set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte, new_pte;

-	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
+	pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);

 	pgd = pgd_offset_k(vaddr);
 	if (pgd_none(*pgd)) {
-		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
+		printk(KERN_ERR
+			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
 		return;
 	}
 	pud = pud_offset(pgd, vaddr);
 	if (pud_none(*pud)) {
-
-		pmd = (pmd_t *) spp_getpage();
+		pmd = (pmd_t *) spp_getpage();
 		make_page_readonly(pmd, XENFEAT_writable_page_tables);
 		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
 		if (pmd != pmd_offset(pud, 0)) {
-			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
-			return;
+			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
+			return;	/* as in set_pte_phys(): never use a pmd reported as bogus */
 		}
 	}
 	pmd = pmd_offset(pud, vaddr);
@@ -326,7 +282,7 @@ static __init void set_pte_phys_ma(unsig
 		make_page_readonly(pte, XENFEAT_writable_page_tables);
 		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
 		if (pte != pte_offset_kernel(pmd, 0)) {
-			printk("PAGETABLE BUG #02!\n");
+			printk(KERN_ERR "PAGETABLE BUG #02!\n");
 			return;
 		}
 	}
@@ -350,14 +306,44 @@ static __init void set_pte_phys_ma(unsig
 	__flush_tlb_one(vaddr);
 }

+#ifndef CONFIG_XEN
+/*
+ * The head.S code sets up the kernel high mapping:
+ *
+ *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
+ *
+ * phys_addr holds the negative offset to the kernel, which is added
+ * to the compile time generated pmds. This results in invalid pmds up
+ * to the point where we hit the physaddr 0 mapping.
+ *
+ * We limit the mappings to the region from _text to _end.  _end is
+ * rounded up to the 2MB boundary. This catches the invalid pmds as
+ * well, as they are located before _text:
+ */
+void __init cleanup_highmap(void)
+{
+	unsigned long vaddr = __START_KERNEL_map;
+	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
+	pmd_t *pmd = level2_kernel_pgt;
+	pmd_t *last_pmd = pmd + PTRS_PER_PMD;
+
+	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
+		if (!pmd_present(*pmd))
+			continue;
+		if (vaddr < (unsigned long) _text || vaddr > end)
+			set_pmd(pmd, __pmd(0));
+	}
+}
+#endif
+
 /* NOTE: this is meant to be run only at boot */
-void __init
-__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+void __init
+__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
 {
 	unsigned long address = __fix_to_virt(idx);

 	if (idx >= __end_of_fixed_addresses) {
-		printk("Invalid __set_fixmap\n");
+		printk(KERN_ERR "Invalid __set_fixmap\n");
 		return;
 	}
 	switch (idx) {
@@ -375,16 +361,14 @@ __set_fixmap (enum fixed_addresses idx,
 	}
 }

-unsigned long __meminitdata table_start, table_end;
-
 static __meminit void *alloc_static_page(unsigned long *phys)
 {
 	unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;

 	if (after_bootmem) {
 		void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
-
 		*phys = __pa(adr);
+
 		return adr;
 	}

@@ -396,7 +380,7 @@ static __meminit void *alloc_static_page

 #define PTE_SIZE PAGE_SIZE

-static inline int make_readonly(unsigned long paddr)
+static inline int __meminit make_readonly(unsigned long paddr)
 {
 	extern char __vsyscall_0;
 	int readonly = 0;
@@ -430,33 +414,38 @@ static inline int make_readonly(unsigned
 /* Must run before zap_low_mappings */
 __meminit void *early_ioremap(unsigned long addr, unsigned long size)
 {
-	unsigned long vaddr;
 	pmd_t *pmd, *last_pmd;
+	unsigned long vaddr;
 	int i, pmds;

 	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
 	vaddr = __START_KERNEL_map;
 	pmd = level2_kernel_pgt;
 	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
+
 	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
 		for (i = 0; i < pmds; i++) {
 			if (pmd_present(pmd[i]))
-				goto next;
+				goto continue_outer_loop;
 		}
 		vaddr += addr & ~PMD_MASK;
 		addr &= PMD_MASK;
+
 		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
-			set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
-		__flush_tlb();
+			set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+		__flush_tlb_all();
+
 		return (void *)vaddr;
-	next:
+continue_outer_loop:
 		;
 	}
 	printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
 	return NULL;
 }

-/* To avoid virtual aliases later */
+/*
+ * To avoid virtual aliases later:
+ */
 __meminit void early_iounmap(void *addr, unsigned long size)
 {
 	unsigned long vaddr;
@@ -466,9 +455,11 @@ __meminit void early_iounmap(void *addr,
 	vaddr = (unsigned long)addr;
 	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
 	pmd = level2_kernel_pgt + pmd_index(vaddr);
+
 	for (i = 0; i < pmds; i++)
 		pmd_clear(pmd + i);
-	__flush_tlb();
+
+	__flush_tlb_all();
 }
 #endif

@@ -517,18 +508,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
 static void __meminit
 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
 {
-	pmd_t *pmd = pmd_offset(pud,0);
+	pmd_t *pmd = pmd_offset(pud, 0);
 	spin_lock(&init_mm.page_table_lock);
 	phys_pmd_init(pmd, address, end);
 	spin_unlock(&init_mm.page_table_lock);
 	__flush_tlb_all();
 }

-static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
-{
+static void __meminit
+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+{
 	int i = pud_index(addr);

-	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
+	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
 		unsigned long pmd_phys;
 		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;
@@ -550,8 +542,8 @@ static void __meminit phys_pud_init(pud_

 		early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
 	}
-	__flush_tlb();
-}
+	__flush_tlb_all();
+}

 void __init xen_init_pt(void)
 {
@@ -637,6 +629,7 @@ void __init xen_init_pt(void)
 static void __init extend_init_mapping(unsigned long tables_space)
 {
 	unsigned long va = __START_KERNEL_map;
+	unsigned long start = start_pfn;
 	unsigned long phys, addr, *pte_page;
 	pmd_t *pmd;
 	pte_t *pte, new_pte;
@@ -713,6 +706,10 @@ static void __init extend_init_mapping(u
 			BUG();
 		va += PAGE_SIZE;
 	}
+
+	if (start_pfn > start)
+		reserve_early(start << PAGE_SHIFT,
+			      start_pfn << PAGE_SHIFT, "INITMAP");
 }

 static void __init find_early_table_space(unsigned long end)
@@ -737,7 +734,7 @@ static void __init find_early_table_spac
 		(table_start << PAGE_SHIFT) + tables);
 }

-static void xen_finish_init_mapping(void)
+static void __init xen_finish_init_mapping(void)
 {
 	unsigned long i, start, end;

@@ -769,13 +766,6 @@ static void xen_finish_init_mapping(void
 	/* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
 	table_end = ~0UL;

-	/*
-	 * Prefetch pte's for the bt_ioremap() area. It gets used before the
-	 * boot-time allocator is online, so allocate-on-demand would fail.
-	 */
-	for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
-		__set_fixmap(i, 0, __pgprot(0));
-
 	/* Switch to the real shared_info page, and clear the dummy page. */
 	set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
 	HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
@@ -795,20 +785,23 @@ static void xen_finish_init_mapping(void
 	table_end = start_pfn;
 }

-/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
-   This runs before bootmem is initialized and gets pages directly from the
-   physical memory. To access them they are temporarily mapped. */
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
 void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
-{
+{
 	unsigned long next;

-	Dprintk("init_memory_mapping\n");
+	pr_debug("init_memory_mapping\n");

-	/*
+	/*
 	 * Find space for the kernel direct mapping tables.
-	 * Later we should allocate these tables in the local node of the memory
-	 * mapped.  Unfortunately this is done currently before the nodes are
-	 * discovered.
+	 *
+	 * Later we should allocate these tables in the local node of the
+	 * memory mapped. Unfortunately this is done currently before the
+	 * nodes are discovered.
 	 */
 	if (!after_bootmem)
 		find_early_table_space(end);
@@ -817,8 +810,8 @@ void __init_refok init_memory_mapping(un
 	end = (unsigned long)__va(end);

 	for (; start < end; start = next) {
-		unsigned long pud_phys;
 		pgd_t *pgd = pgd_offset_k(start);
+		unsigned long pud_phys;
 		pud_t *pud;

 		if (after_bootmem)
@@ -826,8 +819,8 @@ void __init_refok init_memory_mapping(un
 		else
 			pud = alloc_static_page(&pud_phys);
 		next = start + PGDIR_SIZE;
-		if (next > end)
-			next = end;
+		if (next > end)
+			next = end;
 		phys_pud_init(pud, __pa(start), __pa(next));
 		if (!after_bootmem) {
 			early_make_page_readonly(pud, XENFEAT_writable_page_tables);
@@ -841,12 +834,17 @@ void __init_refok init_memory_mapping(un
 	}

 	__flush_tlb_all();
+
+	if (!after_bootmem)
+		reserve_early(table_start << PAGE_SHIFT,
+			      table_end << PAGE_SHIFT, "PGTABLE");
 }

 #ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
+
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
@@ -860,40 +858,6 @@ void __init paging_init(void)
 }
 #endif

-/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
-   from the CPU leading to inconsistent cache lines. address and size
-   must be aligned to 2MB boundaries.
-   Does nothing when the mapping doesn't exist. */
-void __init clear_kernel_mapping(unsigned long address, unsigned long size)
-{
-	unsigned long end = address + size;
-
-	BUG_ON(address & ~LARGE_PAGE_MASK);
-	BUG_ON(size & ~LARGE_PAGE_MASK);
-
-	for (; address < end; address += LARGE_PAGE_SIZE) {
-		pgd_t *pgd = pgd_offset_k(address);
-		pud_t *pud;
-		pmd_t *pmd;
-		if (pgd_none(*pgd))
-			continue;
-		pud = pud_offset(pgd, address);
-		if (pud_none(*pud))
-			continue;
-		pmd = pmd_offset(pud, address);
-		if (!pmd || pmd_none(*pmd))
-			continue;
-		if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
-			/* Could handle this, but it should not happen currently. */
-			printk(KERN_ERR
-	       "clear_kernel_mapping: mapping has been split. will leak memory\n");
-			pmd_ERROR(*pmd);
-		}
-		set_pmd(pmd, __pmd(0));
-	}
-	__flush_tlb_all();
-}
-
 /*
  * Memory hotplug specific functions
  */
@@ -919,16 +883,12 @@ int arch_add_memory(int nid, u64 start,
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;

-	init_memory_mapping(start, (start + size -1));
+	init_memory_mapping(start, start + size-1);

 	ret = __add_pages(zone, start_pfn, nr_pages);
-	if (ret)
-		goto error;
+	WARN_ON_ONCE(ret);	/* warn only when __add_pages() actually failed */

 	return ret;
-error:
-	printk("%s: Problem encountered in __add_pages!\n", __func__);
-	return ret;
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);

@@ -942,36 +902,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to

 #endif /* CONFIG_MEMORY_HOTPLUG */

-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-/*
- * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
- * just online the pages.
- */
-int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
-{
-	int err = -EIO;
-	unsigned long pfn;
-	unsigned long total = 0, mem = 0;
-	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
-		if (pfn_valid(pfn)) {
-			online_page(pfn_to_page(pfn));
-			err = 0;
-			mem++;
-		}
-		total++;
-	}
-	if (!err) {
-		z->spanned_pages += total;
-		z->present_pages += mem;
-		z->zone_pgdat->node_spanned_pages += total;
-		z->zone_pgdat->node_present_pages += mem;
-	}
-	return err;
-}
-#endif
-
-static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
-			 kcore_vsyscall;
+static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
+			 kcore_modules, kcore_vsyscall;

 void __init mem_init(void)
 {
@@ -980,8 +912,7 @@ void __init mem_init(void)

 	pci_iommu_alloc();

-	/* clear the zero-page */
-	memset(empty_zero_page, 0, PAGE_SIZE);
+	/* clear_bss() already clears the empty_zero_page */

 	reservedpages = 0;

@@ -998,7 +929,6 @@ void __init mem_init(void)
 	}
 	reservedpages = end_pfn - totalram_pages -
 					absent_pages_in_range(0, end_pfn);
-
 	after_bootmem = 1;

 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
@@ -1006,46 +936,64 @@ void __init mem_init(void)
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

 	/* Register memory areas for /proc/kcore */
-	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
-	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
+	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
 		   VMALLOC_END-VMALLOC_START);
 	kclist_add(&kcore_kernel, &_stext, _end - _stext);
 	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
-	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
+	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
 				 VSYSCALL_END - VSYSCALL_START);

-	printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
+	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
+				"%ldk reserved, %ldk data, %ldk init)\n",
 		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
 		end_pfn << (PAGE_SHIFT-10),
 		codesize >> 10,
 		reservedpages << (PAGE_SHIFT-10),
 		datasize >> 10,
 		initsize >> 10);
+
+	cpa_init();
 }

 void free_init_pages(char *what, unsigned long begin, unsigned long end)
 {
-	unsigned long addr;
+	unsigned long addr = begin;

-	if (begin >= end)
+	if (addr >= end)
 		return;

+	/*
+	 * If debugging page accesses then do not free this memory but
+	 * mark them not present - any buggy init-section access will
+	 * create a kernel page fault:
+	 */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+		begin, PAGE_ALIGN(end));
+	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
 	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
-	for (addr = begin; addr < end; addr += PAGE_SIZE) {
+
+	for (; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
 		memset((void *)(addr & ~(PAGE_SIZE-1)),
 		       POISON_FREE_INITMEM, PAGE_SIZE);
 		if (addr >= __START_KERNEL_map) {
 			/* make_readonly() reports all kernel addresses. */
-			__make_page_writable(__va(__pa(addr)));
-			change_page_attr_addr(addr, 1, __pgprot(0));
+			if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
+							 pfn_pte(__pa(addr) >> PAGE_SHIFT,
+								 PAGE_KERNEL),
+							 0))
+				BUG();
+			if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
+				BUG();
 		}
 		free_page(addr);
 		totalram_pages++;
 	}
-	if (addr > __START_KERNEL_map)
-		global_flush_tlb();
+#endif
 }

 void free_initmem(void)
@@ -1056,6 +1004,8 @@ void free_initmem(void)
 }

 #ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);

 void mark_rodata_ro(void)
 {
@@ -1077,18 +1027,27 @@ void mark_rodata_ro(void)
 	if (end <= start)
 		return;

-	change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);

 	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
 	       (end - start) >> 10);
+	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

 	/*
-	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
-	 * We do this after the printk so that if something went wrong in the
-	 * change, the printk gets out at least to give a better debug hint
-	 * of who is the culprit.
+	 * The rodata section (but not the kernel text!) should also be
+	 * not-executable.
 	 */
-	global_flush_tlb();
+	start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+	set_memory_nx(start, (end - start) >> PAGE_SHIFT);
+
+	rodata_test();
+
+#ifdef CONFIG_CPA_DEBUG
+	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
+	set_memory_rw(start, (end-start) >> PAGE_SHIFT);
+
+	printk(KERN_INFO "Testing CPA: again\n");
+	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
+#endif
 }
 #endif

@@ -1099,17 +1058,21 @@ void free_initrd_mem(unsigned long start
 }
 #endif

-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
-{
+void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+{
 #ifdef CONFIG_NUMA
 	int nid = phys_to_nid(phys);
 #endif
 	unsigned long pfn = phys >> PAGE_SHIFT;
+
 	if (pfn >= end_pfn) {
-		/* This can happen with kdump kernels when accessing firmware
-		   tables. */
+		/*
+		 * This can happen with kdump kernels when accessing
+		 * firmware tables:
+		 */
 		if (pfn < end_pfn_map)
 			return;
+
 		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
 				phys, len);
 		return;
@@ -1117,9 +1080,9 @@ void __init reserve_bootmem_generic(unsi

 	/* Should check here against the e820 map to avoid double free */
 #ifdef CONFIG_NUMA
-  	reserve_bootmem_node(NODE_DATA(nid), phys, len);
-#else
-	reserve_bootmem(phys, len);
+	reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+#else
+	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
 #endif
 #ifndef CONFIG_XEN
 	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
@@ -1129,17 +1092,17 @@ void __init reserve_bootmem_generic(unsi
 #endif
 }

-int kern_addr_valid(unsigned long addr)
-{
+int kern_addr_valid(unsigned long addr)
+{
 	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;

 	if (above != 0 && above != -1UL)
-		return 0;
-
+		return 0;
+
 #ifdef CONFIG_XEN
 	/*
 	 * Don't walk page tables for hypervisor addresses, but allow
@@ -1159,29 +1122,32 @@ int kern_addr_valid(unsigned long addr)

 	pud = pud_offset(pgd, addr);
 	if (pud_none(*pud))
-		return 0;
+		return 0;

 	pmd = pmd_offset(pud, addr);
 	if (pmd_none(*pmd))
 		return 0;
+
 	if (pmd_large(*pmd))
 		return pfn_valid(pmd_pfn(*pmd));

 	pte = pte_offset_kernel(pmd, addr);
 	if (pte_none(*pte))
 		return 0;
+
 	return pfn_valid(pte_pfn(*pte));
 }

-/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
-   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
-   not need special handling anymore. */
-
+/*
+ * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
+ * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
+ * not need special handling anymore:
+ */
 static struct vm_area_struct gate_vma = {
-	.vm_start = VSYSCALL_START,
-	.vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
-	.vm_page_prot = PAGE_READONLY_EXEC,
-	.vm_flags = VM_READ | VM_EXEC
+	.vm_start	= VSYSCALL_START,
+	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
+	.vm_page_prot	= PAGE_READONLY_EXEC,
+	.vm_flags	= VM_READ | VM_EXEC
 };

 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
@@ -1196,14 +1162,17 @@ struct vm_area_struct *get_gate_vma(stru
 int in_gate_area(struct task_struct *task, unsigned long addr)
 {
 	struct vm_area_struct *vma = get_gate_vma(task);
+
 	if (!vma)
 		return 0;
+
 	return (addr >= vma->vm_start) && (addr < vma->vm_end);
 }

-/* Use this when you have no reliable task/vma, typically from interrupt
- * context.  It is less reliable than using the task's vma and may give
- * false positives.
+/*
+ * Use this when you have no reliable task/vma, typically from interrupt
+ * context. It is less reliable than using the task's vma and may give
+ * false positives:
  */
 int in_gate_area_no_task(unsigned long addr)
 {
@@ -1223,8 +1192,8 @@ const char *arch_vma_name(struct vm_area
 /*
  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
  */
-int __meminit vmemmap_populate(struct page *start_page,
-						unsigned long size, int node)
+int __meminit
+vmemmap_populate(struct page *start_page, unsigned long size, int node)
 {
 	unsigned long addr = (unsigned long)start_page;
 	unsigned long end = (unsigned long)(start_page + size);
@@ -1239,6 +1208,7 @@ int __meminit vmemmap_populate(struct pa
 		pgd = vmemmap_pgd_populate(addr, node);
 		if (!pgd)
 			return -ENOMEM;
+
 		pud = vmemmap_pud_populate(pgd, addr, node);
 		if (!pud)
 			return -ENOMEM;
@@ -1246,20 +1216,22 @@ int __meminit vmemmap_populate(struct pa
 		pmd = pmd_offset(pud, addr);
 		if (pmd_none(*pmd)) {
 			pte_t entry;
-			void *p = vmemmap_alloc_block(PMD_SIZE, node);
+			void *p;
+
+			p = vmemmap_alloc_block(PMD_SIZE, node);
 			if (!p)
 				return -ENOMEM;

-			entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
-			mk_pte_huge(entry);
-			set_pmd(pmd, __pmd(pte_val(entry)));
+			entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+							PAGE_KERNEL_LARGE);
+			set_pmd(pmd, __pmd_ma(__pte_val(entry)));

 			printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
 				addr, addr + PMD_SIZE - 1, p, node);
-		} else
+		} else {
 			vmemmap_verify((pte_t *)pmd, node, addr, next);
+		}
 	}
-
 	return 0;
 }
 #endif
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2010-04-29/arch/x86/mm/ioremap-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -0,0 +1,687 @@
+/*
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/pfn.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+
+enum ioremap_mode {
+	IOR_MODE_UNCACHED,
+	IOR_MODE_CACHED,
+};
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+
+unsigned long __phys_addr(unsigned long x)
+{
+	if (x >= __START_KERNEL_map)
+		return x - __START_KERNEL_map + phys_base;
+	return x - PAGE_OFFSET;
+}
+EXPORT_SYMBOL(__phys_addr);
+
+#endif
+
+static int direct_remap_area_pte_fn(pte_t *pte,
+				    struct page *pmd_page,
+				    unsigned long address,
+				    void *data)
+{
+	mmu_update_t **v = (mmu_update_t **)data;
+
+	BUG_ON(!pte_none(*pte));
+
+	(*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
+		     PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
+	(*v)++;
+
+	return 0;
+}
+
+static int __direct_remap_pfn_range(struct mm_struct *mm,
+				    unsigned long address,
+				    unsigned long mfn,
+				    unsigned long size,
+				    pgprot_t prot,
+				    domid_t  domid)
+{
+	int rc;
+	unsigned long i, start_address;
+	mmu_update_t *u, *v, *w;
+
+	u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+	if (u == NULL)
+		return -ENOMEM;
+
+	start_address = address;
+
+	flush_cache_all();
+
+	for (i = 0; i < size; i += PAGE_SIZE) {
+		if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
+			/* Flush a full batch after filling in the PTE ptrs. */
+			rc = apply_to_page_range(mm, start_address,
+						 address - start_address,
+						 direct_remap_area_pte_fn, &w);
+			if (rc)
+				goto out;
+			rc = -EFAULT;
+			if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
+				goto out;
+			v = w = u;
+			start_address = address;
+		}
+
+		/*
+		 * Fill in the machine address: PTE ptr is done later by
+		 * apply_to_page_range().
+		 */
+		v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
+
+		mfn++;
+		address += PAGE_SIZE;
+		v++;
+	}
+
+	if (v != u) {
+		/* Final batch. */
+		rc = apply_to_page_range(mm, start_address,
+					 address - start_address,
+					 direct_remap_area_pte_fn, &w);
+		if (rc)
+			goto out;
+		rc = -EFAULT;
+		if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
+			goto out;
+	}
+
+	rc = 0;
+
+ out:
+	flush_tlb_all();
+
+	free_page((unsigned long)u);
+
+	return rc;
+}
+
+int direct_remap_pfn_range(struct vm_area_struct *vma,
+			   unsigned long address,
+			   unsigned long mfn,
+			   unsigned long size,
+			   pgprot_t prot,
+			   domid_t  domid)
+{
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return remap_pfn_range(vma, address, mfn, size, prot);
+
+	if (domid == DOMID_SELF)
+		return -EINVAL;
+
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+
+	vma->vm_mm->context.has_foreign_mappings = 1;
+
+	return __direct_remap_pfn_range(
+		vma->vm_mm, address, mfn, size, prot, domid);
+}
+EXPORT_SYMBOL(direct_remap_pfn_range);
+
+int direct_kernel_remap_pfn_range(unsigned long address,
+				  unsigned long mfn,
+				  unsigned long size,
+				  pgprot_t prot,
+				  domid_t  domid)
+{
+	return __direct_remap_pfn_range(
+		&init_mm, address, mfn, size, prot, domid);
+}
+EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
+
+static int lookup_pte_fn(
+	pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+	uint64_t *ptep = (uint64_t *)data;
+	if (ptep)
+		*ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
+			 PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
+	return 0;
+}
+
+int create_lookup_pte_addr(struct mm_struct *mm,
+			   unsigned long address,
+			   uint64_t *ptep)
+{
+	return apply_to_page_range(mm, address, PAGE_SIZE,
+				   lookup_pte_fn, ptep);
+}
+
+EXPORT_SYMBOL(create_lookup_pte_addr);
+
+static int noop_fn(
+	pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+	return 0;
+}
+
+int touch_pte_range(struct mm_struct *mm,
+		    unsigned long address,
+		    unsigned long size)
+{
+	return apply_to_page_range(mm, address, size, noop_fn, NULL);
+}
+
+EXPORT_SYMBOL(touch_pte_range);
+
+#ifdef CONFIG_X86_32
+int page_is_ram(unsigned long pagenr)
+{
+	unsigned long addr, end;
+	int i;
+
+#ifndef CONFIG_XEN
+	/*
+	 * A special case is the first 4Kb of memory;
+	 * This is a BIOS owned area, not kernel ram, but generally
+	 * not listed as such in the E820 table.
+	 */
+	if (pagenr == 0)
+		return 0;
+
+	/*
+	 * Second special case: Some BIOSen report the PC BIOS
+	 * area (640->1Mb) as ram even though it is not.
+	 */
+	if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
+		    pagenr < (BIOS_END >> PAGE_SHIFT))
+		return 0;
+#endif
+
+	for (i = 0; i < e820.nr_map; i++) {
+		/*
+		 * Not usable memory:
+		 */
+		if (e820.map[i].type != E820_RAM)
+			continue;
+		addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
+		end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
+
+
+		if ((pagenr >= addr) && (pagenr < end))
+			return 1;
+	}
+	return 0;
+}
+#endif
+
+/*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts.
+ */
+static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
+			       enum ioremap_mode mode)
+{
+	unsigned long nrpages = size >> PAGE_SHIFT;
+	int err;
+
+	switch (mode) {
+	case IOR_MODE_UNCACHED:
+	default:
+		err = set_memory_uc(vaddr, nrpages);
+		break;
+	case IOR_MODE_CACHED:
+		err = set_memory_wb(vaddr, nrpages);
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+ * directly.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
+			       enum ioremap_mode mode)
+{
+	unsigned long mfn, offset, last_addr, vaddr;
+	struct vm_struct *area;
+	pgprot_t prot;
+	domid_t domid = DOMID_IO;
+
+	/* Don't allow wraparound or zero size */
+	last_addr = phys_addr + size - 1;
+	if (!size || last_addr < phys_addr)
+		return NULL;
+
+	/*
+	 * Don't remap the low PCI/ISA area, it's always mapped..
+	 */
+	if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
+		return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
+
+	/*
+	 * Don't allow anybody to remap normal RAM that we're using..
+	 */
+	for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
+		unsigned long pfn = mfn_to_local_pfn(mfn);
+
+		if (pfn >= max_pfn)
+			continue;
+
+		domid = DOMID_SELF;
+
+		if (pfn >= max_pfn_mapped) /* bogus */
+			continue;
+
+		if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
+			return NULL;
+	}
+
+	switch (mode) {
+	case IOR_MODE_UNCACHED:
+	default:
+		/*
+		 * FIXME: we will use UC MINUS for now, as video fb drivers
+		 * depend on it. Upcoming ioremap_wc() will fix this behavior.
+		 */
+		prot = PAGE_KERNEL_UC_MINUS;
+		break;
+	case IOR_MODE_CACHED:
+		prot = PAGE_KERNEL;
+		break;
+	}
+
+	/*
+	 * Mappings have to be page-aligned
+	 */
+	offset = phys_addr & ~PAGE_MASK;
+	phys_addr &= PAGE_MASK;
+	size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+	/*
+	 * Ok, go for it..
+	 */
+	area = get_vm_area(size, VM_IOREMAP | (mode << 20));
+	if (!area)
+		return NULL;
+	area->phys_addr = phys_addr;
+	vaddr = (unsigned long) area->addr;
+	if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
+				     size, prot, domid)) {
+		free_vm_area(area);
+		return NULL;
+	}
+
+	if (ioremap_change_attr(vaddr, size, mode) < 0) {
+		iounmap((void __iomem *) vaddr);
+		return NULL;
+	}
+
+	return (void __iomem *) (vaddr + offset);
+}
+
+/**
+ * ioremap_nocache     -   map bus memory into CPU space
+ * @phys_addr: bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many
+ * busses. In particular driver authors should read up on PCI writes.
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
+{
+	return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
+}
+EXPORT_SYMBOL(ioremap_nocache);
+
+void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
+{
+	return __ioremap(phys_addr, size, IOR_MODE_CACHED);
+}
+EXPORT_SYMBOL(ioremap_cache);
+
+/**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+ *
+ * Caller must ensure there is only one unmapping for the same pointer.
+ */
+void iounmap(volatile void __iomem *addr)
+{
+	struct vm_struct *p, *o;
+
+	if ((void __force *)addr <= high_memory)
+		return;
+
+	/*
+	 * __ioremap special-cases the PCI/ISA range by not instantiating a
+	 * vm_area and by simply returning an address into the kernel mapping
+	 * of ISA space.   So handle that here.
+	 */
+	if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
+		return;
+
+	addr = (volatile void __iomem *)
+		(PAGE_MASK & (unsigned long __force)addr);
+
+	/* Use the vm area unlocked, assuming the caller ensures there isn't
+	   another iounmap for the same address in parallel. Reuse of the
+	   virtual address is prevented by leaving it in the global lists
+	   until we're done with it. cpa takes care of the direct mappings. */
+	read_lock(&vmlist_lock);
+	for (p = vmlist; p; p = p->next) {
+		if (p->addr == addr)
+			break;
+	}
+	read_unlock(&vmlist_lock);
+
+	if (!p) {
+		printk(KERN_ERR "iounmap: bad address %p\n", addr);
+		dump_stack();
+		return;
+	}
+
+	/* Reset attributes; p->phys_addr is a byte address, not a frame. */
+	if ((p->flags >> 20) != IOR_MODE_CACHED) {
+		unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
+		unsigned long mfn = PFN_DOWN(p->phys_addr);
+		unsigned long va = (unsigned long)addr;
+
+		for (; n > 0; n--, mfn++, va += PAGE_SIZE)
+			if (mfn_to_local_pfn(mfn) < max_pfn)
+				set_memory_wb(va, 1);
+	}
+
+	/* Finally remove it */
+	o = remove_vm_area((void *)addr);
+	BUG_ON(p != o || o == NULL);
+	kfree(p);
+}
+EXPORT_SYMBOL(iounmap);
+
+int __initdata early_ioremap_debug;
+
+static int __init early_ioremap_debug_setup(char *str)
+{
+	early_ioremap_debug = 1;
+
+	return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+static __initdata int after_paging_init;
+static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
+				__attribute__((aligned(PAGE_SIZE)));
+
+#ifdef CONFIG_X86_32
+static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
+{
+	/* Don't assume we're using swapper_pg_dir at this point */
+	pgd_t *base = __va(read_cr3());
+	pgd_t *pgd = &base[pgd_index(addr)];
+	pud_t *pud = pud_offset(pgd, addr);
+	pmd_t *pmd = pmd_offset(pud, addr);
+
+	return pmd;
+}
+#else
+#define early_ioremap_pmd early_get_pmd
+#define make_lowmem_page_readonly early_make_page_readonly
+#define make_lowmem_page_writable make_page_writable
+#endif
+
+static inline pte_t * __init early_ioremap_pte(unsigned long addr)
+{
+	return &bm_pte[pte_index(addr)];
+}
+
+void __init early_ioremap_init(void)
+{
+	pmd_t *pmd;
+
+	if (early_ioremap_debug)
+		printk(KERN_INFO "early_ioremap_init()\n");
+
+	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
+	memset(bm_pte, 0, sizeof(bm_pte));
+	make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables);
+	pmd_populate_kernel(&init_mm, pmd, bm_pte);
+
+	/*
+	 * The boot-ioremap range spans multiple pmds, for which
+	 * we are not prepared:
+	 */
+	if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
+		WARN_ON(1);
+		printk(KERN_WARNING "pmd %p != %p\n",
+		       pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
+		printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
+			fix_to_virt(FIX_BTMAP_BEGIN));
+		printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END):   %08lx\n",
+			fix_to_virt(FIX_BTMAP_END));
+
+		printk(KERN_WARNING "FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
+		printk(KERN_WARNING "FIX_BTMAP_BEGIN:     %d\n",
+		       FIX_BTMAP_BEGIN);
+	}
+}
+
+#ifdef CONFIG_X86_32
+void __init early_ioremap_clear(void)
+{
+	pmd_t *pmd;
+
+	if (early_ioremap_debug)
+		printk(KERN_INFO "early_ioremap_clear()\n");
+
+	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
+	pmd_clear(pmd);
+	make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
+	/* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
+	__flush_tlb_all();
+}
+
+void __init early_ioremap_reset(void)
+{
+	enum fixed_addresses idx;
+	unsigned long addr, phys;
+	pte_t *pte;
+
+	after_paging_init = 1;
+	for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
+		addr = fix_to_virt(idx);
+		pte = early_ioremap_pte(addr);
+		if (pte_present(*pte)) {
+			phys = __pte_val(*pte) & PAGE_MASK;
+			set_fixmap(idx, phys);
+		}
+	}
+}
+#endif /* CONFIG_X86_32 */
+
+static void __init __early_set_fixmap(enum fixed_addresses idx,
+				   unsigned long phys, pgprot_t flags)
+{
+	unsigned long addr = __fix_to_virt(idx);
+	pte_t *pte;
+
+	if (idx >= __end_of_fixed_addresses) {
+		BUG();
+		return;
+	}
+	pte = early_ioremap_pte(addr);
+	if (pgprot_val(flags))
+		set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
+	else
+		pte_clear(NULL, addr, pte);
+	__flush_tlb_one(addr);
+}
+
+static inline void __init early_set_fixmap(enum fixed_addresses idx,
+					unsigned long phys)
+{
+	if (after_paging_init)
+		set_fixmap(idx, phys);
+	else
+		__early_set_fixmap(idx, phys, PAGE_KERNEL);
+}
+
+static inline void __init early_clear_fixmap(enum fixed_addresses idx)
+{
+	if (after_paging_init)
+		clear_fixmap(idx);
+	else
+		__early_set_fixmap(idx, 0, __pgprot(0));
+}
+
+
+int __initdata early_ioremap_nested;
+
+static int __init check_early_ioremap_leak(void)
+{
+	if (!early_ioremap_nested)
+		return 0;
+
+	printk(KERN_WARNING
+	       "Debug warning: early ioremap leak of %d areas detected.\n",
+	       early_ioremap_nested);
+	printk(KERN_WARNING
+	       "please boot with early_ioremap_debug and report the dmesg.\n");
+	WARN_ON(1);
+
+	return 1;
+}
+late_initcall(check_early_ioremap_leak);
+
+/*
+ * Map [phys_addr, phys_addr + size) through the boot-time fixmap slots of
+ * the current nesting level; returns NULL (after a WARN) on a bad request.
+ */
+void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
+{
+	unsigned long offset, last_addr;
+	unsigned int nrpages, nesting;
+	enum fixed_addresses idx0, idx;
+
+	WARN_ON(system_state != SYSTEM_BOOTING);
+
+	nesting = early_ioremap_nested;
+	if (early_ioremap_debug) {
+		printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
+		       phys_addr, size, nesting);
+		dump_stack();
+	}
+
+	/* Don't allow wraparound or zero size */
+	last_addr = phys_addr + size - 1;
+	if (!size || last_addr < phys_addr) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	if (nesting >= FIX_BTMAPS_NESTING) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	/* Mappings have to be page-aligned; last_addr is inclusive, so +1 */
+	offset = phys_addr & ~PAGE_MASK;
+	phys_addr &= PAGE_MASK;
+	size = PAGE_ALIGN(last_addr + 1) - phys_addr;
+
+	/* Mappings have to fit in the FIX_BTMAP area. */
+	nrpages = size >> PAGE_SHIFT;
+	if (nrpages > NR_FIX_BTMAPS) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	/* Claim the nesting level only now that nothing can fail. */
+	early_ioremap_nested++;
+	idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+	idx = idx0;
+	while (nrpages > 0) {
+		early_set_fixmap(idx, phys_addr);
+		phys_addr += PAGE_SIZE;
+		--idx;
+		--nrpages;
+	}
+	if (early_ioremap_debug)
+		printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+
+	return (void *) (offset + fix_to_virt(idx0));
+}
+
+/* Undo early_ioremap() for the innermost nesting level. */
+void __init early_iounmap(void *addr, unsigned long size)
+{
+	unsigned long virt_addr;
+	unsigned long offset;
+	unsigned int nrpages;
+	enum fixed_addresses idx;
+	int nesting;
+
+	nesting = --early_ioremap_nested;
+	WARN_ON(nesting < 0);
+
+	if (early_ioremap_debug) {
+		printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
+		       size, nesting);
+		dump_stack();
+	}
+
+	virt_addr = (unsigned long)addr;
+	if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
+		WARN_ON(1);
+		return;
+	}
+	offset = virt_addr & ~PAGE_MASK;
+	nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
+
+	idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+	while (nrpages > 0) {
+		early_clear_fixmap(idx);
+		--idx;
+		--nrpages;
+	}
+}
+
+void __this_fixmap_does_not_exist(void)
+{
+	WARN_ON(1);
+}
--- head-2010-04-29.orig/arch/x86/mm/ioremap_32-xen.c	2010-03-24 15:09:22.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,445 +0,0 @@
-/*
- * arch/i386/mm/ioremap.c
- *
- * Re-map IO memory to kernel address space so that we can access it.
- * This is needed for high PCI addresses that aren't mapped in the
- * 640k-1MB IO memory area on PC's
- *
- * (C) Copyright 1995 1996 Linus Torvalds
- */
-
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/io.h>
-#include <linux/sched.h>
-#include <asm/fixmap.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-
-#define ISA_START_ADDRESS	0x0
-#define ISA_END_ADDRESS		0x100000
-
-static int direct_remap_area_pte_fn(pte_t *pte,
-				    struct page *pmd_page,
-				    unsigned long address,
-				    void *data)
-{
-	mmu_update_t **v = (mmu_update_t **)data;
-
-	BUG_ON(!pte_none(*pte));
-
-	(*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
-		     PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
-	(*v)++;
-
-	return 0;
-}
-
-static int __direct_remap_pfn_range(struct mm_struct *mm,
-				    unsigned long address,
-				    unsigned long mfn,
-				    unsigned long size,
-				    pgprot_t prot,
-				    domid_t  domid)
-{
-	int rc;
-	unsigned long i, start_address;
-	mmu_update_t *u, *v, *w;
-
-	u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
-	if (u == NULL)
-		return -ENOMEM;
-
-	start_address = address;
-
-	flush_cache_all();
-
-	for (i = 0; i < size; i += PAGE_SIZE) {
-		if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
-			/* Flush a full batch after filling in the PTE ptrs. */
-			rc = apply_to_page_range(mm, start_address,
-						 address - start_address,
-						 direct_remap_area_pte_fn, &w);
-			if (rc)
-				goto out;
-			rc = -EFAULT;
-			if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
-				goto out;
-			v = w = u;
-			start_address = address;
-		}
-
-		/*
-		 * Fill in the machine address: PTE ptr is done later by
-		 * apply_to_page_range().
-		 */
-		v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
-
-		mfn++;
-		address += PAGE_SIZE;
-		v++;
-	}
-
-	if (v != u) {
-		/* Final batch. */
-		rc = apply_to_page_range(mm, start_address,
-					 address - start_address,
-					 direct_remap_area_pte_fn, &w);
-		if (rc)
-			goto out;
-		rc = -EFAULT;
-		if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
-			goto out;
-	}
-
-	rc = 0;
-
- out:
-	flush_tlb_all();
-
-	free_page((unsigned long)u);
-
-	return rc;
-}
-
-int direct_remap_pfn_range(struct vm_area_struct *vma,
-			   unsigned long address,
-			   unsigned long mfn,
-			   unsigned long size,
-			   pgprot_t prot,
-			   domid_t  domid)
-{
-	if (xen_feature(XENFEAT_auto_translated_physmap))
-		return remap_pfn_range(vma, address, mfn, size, prot);
-
-	if (domid == DOMID_SELF)
-		return -EINVAL;
-
-	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
-
-	vma->vm_mm->context.has_foreign_mappings = 1;
-
-	return __direct_remap_pfn_range(
-		vma->vm_mm, address, mfn, size, prot, domid);
-}
-EXPORT_SYMBOL(direct_remap_pfn_range);
-
-int direct_kernel_remap_pfn_range(unsigned long address,
-				  unsigned long mfn,
-				  unsigned long size,
-				  pgprot_t prot,
-				  domid_t  domid)
-{
-	return __direct_remap_pfn_range(
-		&init_mm, address, mfn, size, prot, domid);
-}
-EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
-
-static int lookup_pte_fn(
-	pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
-{
-	uint64_t *ptep = (uint64_t *)data;
-	if (ptep)
-		*ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
-			 PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
-	return 0;
-}
-
-int create_lookup_pte_addr(struct mm_struct *mm,
-			   unsigned long address,
-			   uint64_t *ptep)
-{
-	return apply_to_page_range(mm, address, PAGE_SIZE,
-				   lookup_pte_fn, ptep);
-}
-
-EXPORT_SYMBOL(create_lookup_pte_addr);
-
-static int noop_fn(
-	pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
-{
-	return 0;
-}
-
-int touch_pte_range(struct mm_struct *mm,
-		    unsigned long address,
-		    unsigned long size)
-{
-	return apply_to_page_range(mm, address, size, noop_fn, NULL);
-}
-
-EXPORT_SYMBOL(touch_pte_range);
-
-/*
- * Does @address reside within a non-highmem page that is local to this virtual
- * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
- * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
- * why this works.
- */
-static inline int is_local_lowmem(unsigned long address)
-{
-	extern unsigned long max_low_pfn;
-	return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
-}
-
-/*
- * Generic mapping function (not visible outside):
- */
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
-{
-	void __iomem * addr;
-	struct vm_struct * area;
-	unsigned long offset, last_addr;
-	pgprot_t prot;
-	domid_t domid = DOMID_IO;
-
-	/* Don't allow wraparound or zero size */
-	last_addr = phys_addr + size - 1;
-	if (!size || last_addr < phys_addr)
-		return NULL;
-
-	/*
-	 * Don't remap the low PCI/ISA area, it's always mapped..
-	 */
-	if (is_initial_xendomain() &&
-	    phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
-		return (void __iomem *) isa_bus_to_virt(phys_addr);
-
-	/*
-	 * Don't allow anybody to remap normal RAM that we're using..
-	 */
-	if (is_local_lowmem(phys_addr)) {
-		char *t_addr, *t_end;
-		struct page *page;
-
-		t_addr = bus_to_virt(phys_addr);
-		t_end = t_addr + (size - 1);
-
-		for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
-			if(!PageReserved(page))
-				return NULL;
-
-		domid = DOMID_SELF;
-	}
-
-	prot = __pgprot(_KERNPG_TABLE | flags);
-
-	/*
-	 * Mappings have to be page-aligned
-	 */
-	offset = phys_addr & ~PAGE_MASK;
-	phys_addr &= PAGE_MASK;
-	size = PAGE_ALIGN(last_addr+1) - phys_addr;
-
-	/*
-	 * Ok, go for it..
-	 */
-	area = get_vm_area(size, VM_IOREMAP | (flags << 20));
-	if (!area)
-		return NULL;
-	area->phys_addr = phys_addr;
-	addr = (void __iomem *) area->addr;
-	if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
-				     phys_addr>>PAGE_SHIFT,
-				     size, prot, domid)) {
-		vunmap((void __force *) addr);
-		return NULL;
-	}
-	return (void __iomem *) (offset + (char __iomem *)addr);
-}
-EXPORT_SYMBOL(__ioremap);
-
-/**
- * ioremap_nocache     -   map bus memory into CPU space
- * @offset:    bus address of the memory
- * @size:      size of the resource to map
- *
- * ioremap_nocache performs a platform specific sequence of operations to
- * make bus memory CPU accessible via the readb/readw/readl/writeb/
- * writew/writel functions and the other mmio helpers. The returned
- * address is not guaranteed to be usable directly as a virtual
- * address.
- *
- * This version of ioremap ensures that the memory is marked uncachable
- * on the CPU as well as honouring existing caching rules from things like
- * the PCI bus. Note that there are other caches and buffers on many
- * busses. In particular driver authors should read up on PCI writes
- *
- * It's useful if some control registers are in such an area and
- * write combining or read caching is not desirable:
- *
- * Must be freed with iounmap.
- */
-
-void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
-{
-	unsigned long last_addr;
-	void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
-	if (!p)
-		return p;
-
-	/* Guaranteed to be > phys_addr, as per __ioremap() */
-	last_addr = phys_addr + size - 1;
-
-	if (is_local_lowmem(last_addr)) {
-		struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
-		unsigned long npages;
-
-		phys_addr &= PAGE_MASK;
-
-		/* This might overflow and become zero.. */
-		last_addr = PAGE_ALIGN(last_addr);
-
-		/* .. but that's ok, because modulo-2**n arithmetic will make
-	 	* the page-aligned "last - first" come out right.
-	 	*/
-		npages = (last_addr - phys_addr) >> PAGE_SHIFT;
-
-		if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
-			iounmap(p);
-			p = NULL;
-		}
-		global_flush_tlb();
-	}
-
-	return p;
-}
-EXPORT_SYMBOL(ioremap_nocache);
-
-/**
- * iounmap - Free a IO remapping
- * @addr: virtual address from ioremap_*
- *
- * Caller must ensure there is only one unmapping for the same pointer.
- */
-void iounmap(volatile void __iomem *addr)
-{
-	struct vm_struct *p, *o;
-
-	if ((void __force *)addr <= high_memory)
-		return;
-
-	/*
-	 * __ioremap special-cases the PCI/ISA range by not instantiating a
-	 * vm_area and by simply returning an address into the kernel mapping
-	 * of ISA space.   So handle that here.
-	 */
-	if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
-		return;
-
-	addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
-
-	/* Use the vm area unlocked, assuming the caller
-	   ensures there isn't another iounmap for the same address
-	   in parallel. Reuse of the virtual address is prevented by
-	   leaving it in the global lists until we're done with it.
-	   cpa takes care of the direct mappings. */
-	read_lock(&vmlist_lock);
-	for (p = vmlist; p; p = p->next) {
-		if (p->addr == addr)
-			break;
-	}
-	read_unlock(&vmlist_lock);
-
-	if (!p) {
-		printk("iounmap: bad address %p\n", addr);
-		dump_stack();
-		return;
-	}
-
-	/* Reset the direct mapping. Can block */
-	if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
-		change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
-				 get_vm_area_size(p) >> PAGE_SHIFT,
-				 PAGE_KERNEL);
-		global_flush_tlb();
-	}
-
-	/* Finally remove it */
-	o = remove_vm_area((void *)addr);
-	BUG_ON(p != o || o == NULL);
-	kfree(p);
-}
-EXPORT_SYMBOL(iounmap);
-
-void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
-{
-	unsigned long offset, last_addr;
-	unsigned int nrpages;
-	enum fixed_addresses idx;
-
-	/* Don't allow wraparound or zero size */
-	last_addr = phys_addr + size - 1;
-	if (!size || last_addr < phys_addr)
-		return NULL;
-
-	/*
-	 * Don't remap the low PCI/ISA area, it's always mapped..
-	 */
-	if (is_initial_xendomain() &&
-	    phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
-		return isa_bus_to_virt(phys_addr);
-
-	/*
-	 * Mappings have to be page-aligned
-	 */
-	offset = phys_addr & ~PAGE_MASK;
-	phys_addr &= PAGE_MASK;
-	size = PAGE_ALIGN(last_addr) - phys_addr;
-
-	/*
-	 * Mappings have to fit in the FIX_BTMAP area.
-	 */
-	nrpages = size >> PAGE_SHIFT;
-	if (nrpages > NR_FIX_BTMAPS)
-		return NULL;
-
-	/*
-	 * Ok, go for it..
-	 */
-	idx = FIX_BTMAP_BEGIN;
-	while (nrpages > 0) {
-		set_fixmap(idx, phys_addr);
-		phys_addr += PAGE_SIZE;
-		--idx;
-		--nrpages;
-	}
-	return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
-}
-
-void __init bt_iounmap(void *addr, unsigned long size)
-{
-	unsigned long virt_addr;
-	unsigned long offset;
-	unsigned int nrpages;
-	enum fixed_addresses idx;
-
-	virt_addr = (unsigned long)addr;
-	if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
-		return;
-	if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
-		return;
-	offset = virt_addr & ~PAGE_MASK;
-	nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
-
-	idx = FIX_BTMAP_BEGIN;
-	while (nrpages > 0) {
-		clear_fixmap(idx);
-		--idx;
-		--nrpages;
-	}
-}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2010-04-29/arch/x86/mm/pageattr-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -0,0 +1,1414 @@
+/*
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Thanks to Ben LaHaise for precious feedback.
+ */
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+#include <asm/e820.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/proto.h>
+#include <asm/mmu_context.h>
+
+#ifndef CONFIG_X86_64
+#define TASK_SIZE64 TASK_SIZE
+#endif
+
+static void _pin_lock(struct mm_struct *mm, int lock) {
+	if (lock)
+		spin_lock(&mm->page_table_lock);
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+	/* While mm->page_table_lock protects us against insertions and
+	 * removals of higher level page table pages, it doesn't protect
+	 * against updates of pte-s. Such updates, however, require the
+	 * pte pages to be in consistent state (unpinned+writable or
+	 * pinned+readonly). The pinning and attribute changes, however
+	 * cannot be done atomically, which is why such updates must be
+	 * prevented from happening concurrently.
+	 * Note that no pte lock can ever elsewhere be acquired nesting
+	 * with an already acquired one in the same mm, or with the mm's
+	 * page_table_lock already acquired, as that would break in the
+	 * non-split case (where all these are actually resolving to the
+	 * one page_table_lock). Thus acquiring all of them here is not
+	 * going to result in dead locks, and the order of acquires
+	 * doesn't matter.
+	 */
+	{
+		pgd_t *pgd = mm->pgd;
+		unsigned g;
+
+		for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+			pud_t *pud;
+			unsigned u;
+
+			if (pgd_none(*pgd))
+				continue;
+			pud = pud_offset(pgd, 0);
+			for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+				pmd_t *pmd;
+				unsigned m;
+
+				if (pud_none(*pud))
+					continue;
+				pmd = pmd_offset(pud, 0);
+				for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+					spinlock_t *ptl;
+
+					if (pmd_none(*pmd))
+						continue;
+					ptl = pte_lockptr(0, pmd);
+					if (lock)
+						spin_lock(ptl);
+					else
+						spin_unlock(ptl);
+				}
+			}
+		}
+	}
+#endif
+	if (!lock)
+		spin_unlock(&mm->page_table_lock);
+}
+#define pin_lock(mm) _pin_lock(mm, 1)
+#define pin_unlock(mm) _pin_lock(mm, 0)
+
+#define PIN_BATCH sizeof(void *)
+static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
+
+static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
+					     unsigned int cpu, unsigned int seq)
+{
+	unsigned long pfn = page_to_pfn(page);
+
+	if (PageHighMem(page)) {
+		if (pgprot_val(flags) & _PAGE_RW)
+			ClearPagePinned(page);
+		else
+			SetPagePinned(page);
+	} else {
+		MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
+					(unsigned long)__va(pfn << PAGE_SHIFT),
+					pfn_pte(pfn, flags), 0);
+		if (unlikely(++seq == PIN_BATCH)) {
+			if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
+								PIN_BATCH, NULL)))
+				BUG();
+			seq = 0;
+		}
+	}
+
+	return seq;
+}
+
+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+{
+	pgd_t       *pgd = pgd_base;
+	pud_t       *pud;
+	pmd_t       *pmd;
+	int          g,u,m;
+	unsigned int cpu, seq;
+	multicall_entry_t *mcl;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return;
+
+	cpu = get_cpu();
+
+	/*
+	 * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
+	 * may not be the 'current' task's pagetables (e.g., current may be
+	 * 32-bit, but the pagetables may be for a 64-bit task).
+	 * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
+	 * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
+	 */
+	for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+		if (pgd_none(*pgd))
+			continue;
+		pud = pud_offset(pgd, 0);
+		if (PTRS_PER_PUD > 1) /* not folded */
+			seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
+		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+			if (pud_none(*pud))
+				continue;
+			pmd = pmd_offset(pud, 0);
+			if (PTRS_PER_PMD > 1) /* not folded */
+				seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
+			for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+				if (pmd_none(*pmd))
+					continue;
+				seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
+			}
+		}
+	}
+
+	mcl = per_cpu(pb_mcl, cpu);
+#ifdef CONFIG_X86_64
+	if (unlikely(seq > PIN_BATCH - 2)) {
+		if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
+			BUG();
+		seq = 0;
+	}
+	MULTI_update_va_mapping(mcl + seq,
+	       (unsigned long)__user_pgd(pgd_base),
+	       pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
+	       0);
+	MULTI_update_va_mapping(mcl + seq + 1,
+	       (unsigned long)pgd_base,
+	       pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+	       UVMF_TLB_FLUSH);
+	if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
+		BUG();
+#else
+	if (likely(seq != 0)) {
+		MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
+			(unsigned long)pgd_base,
+			pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+			UVMF_TLB_FLUSH);
+		if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
+		                                        seq + 1, NULL)))
+			BUG();
+	} else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
+			pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+			UVMF_TLB_FLUSH))
+		BUG();
+#endif
+
+	put_cpu();
+}
+
+static void __pgd_pin(pgd_t *pgd)
+{
+	pgd_walk(pgd, PAGE_KERNEL_RO);
+	kmap_flush_unused();
+	xen_pgd_pin(__pa(pgd)); /* kernel */
+#ifdef CONFIG_X86_64
+	xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
+#endif
+	SetPagePinned(virt_to_page(pgd));
+}
+
+static void __pgd_unpin(pgd_t *pgd)
+{
+	xen_pgd_unpin(__pa(pgd));
+#ifdef CONFIG_X86_64
+	xen_pgd_unpin(__pa(__user_pgd(pgd)));
+#endif
+	pgd_walk(pgd, PAGE_KERNEL);
+	ClearPagePinned(virt_to_page(pgd));
+}
+
+void pgd_test_and_unpin(pgd_t *pgd)
+{
+	if (PagePinned(virt_to_page(pgd)))
+		__pgd_unpin(pgd);
+}
+
+void mm_pin(struct mm_struct *mm)
+{
+	if (xen_feature(XENFEAT_writable_page_tables))
+		return;
+
+	pin_lock(mm);
+	__pgd_pin(mm->pgd);
+	pin_unlock(mm);
+}
+
+void mm_unpin(struct mm_struct *mm)
+{
+	if (xen_feature(XENFEAT_writable_page_tables))
+		return;
+
+	pin_lock(mm);
+	__pgd_unpin(mm->pgd);
+	pin_unlock(mm);
+}
+
+void mm_pin_all(void)
+{
+	struct page *page;
+	unsigned long flags;
+
+	if (xen_feature(XENFEAT_writable_page_tables))
+		return;
+
+	/*
+	 * Allow uninterrupted access to the pgd_list. Also protects
+	 * __pgd_pin() by disabling preemption.
+	 * All other CPUs must be at a safe point (e.g., in stop_machine
+	 * or offlined entirely).
+	 */
+	spin_lock_irqsave(&pgd_lock, flags);
+	list_for_each_entry(page, &pgd_list, lru) {
+		if (!PagePinned(page))
+			__pgd_pin((pgd_t *)page_address(page));
+	}
+	spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+	if (!PagePinned(virt_to_page(mm->pgd)))
+		mm_pin(mm);
+}
+
+void arch_exit_mmap(struct mm_struct *mm)
+{
+	struct task_struct *tsk = current;
+
+	task_lock(tsk);
+
+	/*
+	 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+	 * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+	 */
+	if (tsk->active_mm == mm) {
+		tsk->active_mm = &init_mm;
+		atomic_inc(&init_mm.mm_count);
+
+		switch_mm(mm, &init_mm, tsk);
+
+		atomic_dec(&mm->mm_count);
+		BUG_ON(atomic_read(&mm->mm_count) == 0);
+	}
+
+	task_unlock(tsk);
+
+	if (PagePinned(virt_to_page(mm->pgd))
+	    && atomic_read(&mm->mm_count) == 1
+	    && !mm->context.has_foreign_mappings)
+		mm_unpin(mm);
+}
+
+static void _pte_free(struct page *page, unsigned int order)
+{
+	BUG_ON(order);
+	__pte_free(page);
+}
+
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+	struct page *pte;
+
+#ifdef CONFIG_HIGHPTE
+	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+#else
+	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+#endif
+	if (pte) {
+		pgtable_page_ctor(pte);
+		SetPageForeign(pte, _pte_free);
+		init_page_count(pte);
+	}
+	return pte;
+}
+
+void __pte_free(pgtable_t pte)
+{
+	if (!PageHighMem(pte)) {
+		unsigned long va = (unsigned long)page_address(pte);
+		unsigned int level;
+		pte_t *ptep = lookup_address(va, &level);
+
+		BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
+		if (!pte_write(*ptep)
+		    && HYPERVISOR_update_va_mapping(va,
+						    mk_pte(pte, PAGE_KERNEL),
+						    0))
+			BUG();
+	} else
+#ifdef CONFIG_HIGHPTE
+		ClearPagePinned(pte);
+#else
+		BUG();
+#endif
+
+	ClearPageForeign(pte);
+	init_page_count(pte);
+	pgtable_page_dtor(pte);
+	__free_page(pte);
+}
+
+#if PAGETABLE_LEVELS >= 3
+static void _pmd_free(struct page *page, unsigned int order)
+{
+	BUG_ON(order);
+	__pmd_free(page);
+}
+
+pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+	struct page *pmd;
+
+	pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+	if (!pmd)
+		return NULL;
+	SetPageForeign(pmd, _pmd_free);
+	init_page_count(pmd);
+	return page_address(pmd);
+}
+
+void __pmd_free(pgtable_t pmd)
+{
+	unsigned long va = (unsigned long)page_address(pmd);
+	unsigned int level;
+	pte_t *ptep = lookup_address(va, &level);
+
+	BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
+	if (!pte_write(*ptep)
+	    && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
+		BUG();
+
+	ClearPageForeign(pmd);
+	init_page_count(pmd);
+	__free_page(pmd);
+}
+#endif
+
+/* blktap and gntdev need this, as otherwise they would implicitly (and
+ * needlessly, as they never use it) reference init_mm. */
+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
+				  unsigned long addr, pte_t *ptep, int full)
+{
+	return ptep_get_and_clear_full(vma ? vma->vm_mm : &init_mm,
+				       addr, ptep, full);
+}
+EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
+
+/*
+ * The current flushing context - we pass it instead of 5 arguments:
+ */
+struct cpa_data {
+	unsigned long	vaddr;
+	pgprot_t	mask_set;
+	pgprot_t	mask_clr;
+	int		numpages;
+	int		flushtlb;
+	unsigned long	pfn;
+};
+
+#ifdef CONFIG_X86_64
+
+static inline unsigned long highmap_start_pfn(void)
+{
+	return __pa(_text) >> PAGE_SHIFT;
+}
+
+static inline unsigned long highmap_end_pfn(void)
+{
+	return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+# define debug_pagealloc 1
+#else
+# define debug_pagealloc 0
+#endif
+
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+	return addr >= start && addr < end;
+}
+
+/*
+ * Flushing functions
+ */
+
+/**
+ * clflush_cache_range - flush a cache range with clflush
+ * @vaddr:	virtual start address
+ * @size:	number of bytes to flush
+ *
+ * clflush is an unordered instruction which needs fencing with mfence
+ * to avoid ordering issues.
+ */
+void clflush_cache_range(void *vaddr, unsigned int size)
+{
+	void *vend = vaddr + size - 1;
+
+	mb();
+
+	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
+		clflush(vaddr);
+	/*
+	 * Flush any possible final partial cacheline:
+	 */
+	clflush(vend);
+
+	mb();
+}
+
+static void __cpa_flush_all(void *arg)
+{
+	unsigned long cache = (unsigned long)arg;
+
+	/*
+	 * Flush all to work around Errata in early athlons regarding
+	 * large page flushing.
+	 */
+	__flush_tlb_all();
+
+	if (cache && boot_cpu_data.x86_model >= 4)
+		wbinvd();
+}
+
+static void cpa_flush_all(unsigned long cache)
+{
+	BUG_ON(irqs_disabled());
+
+	on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
+}
+
+static void __cpa_flush_range(void *arg)
+{
+	/*
+	 * We could optimize that further and do individual per page
+	 * tlb invalidates for a low number of pages. Caveat: we must
+	 * flush the high aliases on 64bit as well.
+	 */
+	__flush_tlb_all();
+}
+
+static void cpa_flush_range(unsigned long start, int numpages, int cache)
+{
+	unsigned int i, level;
+	unsigned long addr;
+
+	BUG_ON(irqs_disabled());
+	WARN_ON(PAGE_ALIGN(start) != start);
+
+	on_each_cpu(__cpa_flush_range, NULL, 1, 1);
+
+	if (!cache)
+		return;
+
+	/*
+	 * We only need to flush on one CPU,
+	 * clflush is a MESI-coherent instruction that
+	 * will cause all other CPUs to flush the same
+	 * cachelines:
+	 */
+	for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
+		pte_t *pte = lookup_address(addr, &level);
+
+		/*
+		 * Only flush present addresses:
+		 */
+		if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
+			clflush_cache_range((void *) addr, PAGE_SIZE);
+	}
+}
+
+/*
+ * Certain areas of memory on x86 require very specific protection flags,
+ * for example the BIOS area or kernel text. Callers don't always get this
+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
+ * checks and fixes these known static required protection bits.
+ */
+static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
+				   unsigned long pfn)
+{
+	pgprot_t forbidden = __pgprot(0);
+
+#ifndef CONFIG_XEN
+	/*
+	 * The BIOS area between 640k and 1Mb needs to be executable for
+	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
+	 */
+	if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+		pgprot_val(forbidden) |= _PAGE_NX;
+#endif
+
+	/*
+	 * The kernel text needs to be executable for obvious reasons
+	 * Does not cover __inittext since that is gone later on. On
+	 * 64bit we do not enforce !NX on the low mapping
+	 */
+	if (within(address, (unsigned long)_text, (unsigned long)_etext))
+		pgprot_val(forbidden) |= _PAGE_NX;
+
+	/*
+	 * The .rodata section needs to be read-only. Using the pfn
+	 * catches all aliases.
+	 */
+	if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
+		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
+		pgprot_val(forbidden) |= _PAGE_RW;
+
+	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+
+	return prot;
+}
+
+/*
+ * Look up the kernel page table entry for a virtual address. Returns a
+ * pointer to the entry (NULL if a none entry is hit) and its level in *level.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexistent mapping.
+ */
+pte_t *lookup_address(unsigned long address, unsigned int *level)
+{
+	pgd_t *pgd = pgd_offset_k(address);
+	pud_t *pud;
+	pmd_t *pmd;
+
+	*level = PG_LEVEL_NONE;
+
+	if (pgd_none(*pgd))
+		return NULL;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud))
+		return NULL;
+
+	*level = PG_LEVEL_1G;
+	if (pud_large(*pud) || !pud_present(*pud))
+		return (pte_t *)pud;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd))
+		return NULL;
+
+	*level = PG_LEVEL_2M;
+	if (pmd_large(*pmd) || !pmd_present(*pmd))
+		return (pte_t *)pmd;
+
+	*level = PG_LEVEL_4K;
+
+	return pte_offset_kernel(pmd, address);
+}
+
+/*
+ * Set the new pmd in all the pgds we know about:
+ */
+static void __set_pmd_pte(pte_t *kpte, unsigned long address,
+			  unsigned int level, pte_t pte)
+{
+	/* change init_mm */
+	switch(level) {
+	case PG_LEVEL_2M:
+		xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte)));
+		break;
+#ifdef CONFIG_X86_64
+	case PG_LEVEL_1G:
+		xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte)));
+		break;
+#endif
+	default:
+		BUG();
+	}
+#ifdef CONFIG_X86_32
+	if (!SHARED_KERNEL_PMD) {
+		struct page *page;
+
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd;
+			pud_t *pud;
+			pmd_t *pmd;
+
+			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			pud = pud_offset(pgd, address);
+			pmd = pmd_offset(pud, address);
+			xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte)));
+		}
+	}
+#endif
+}
+
+static int
+try_preserve_large_page(pte_t *kpte, unsigned long address,
+			struct cpa_data *cpa)
+{
+	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
+	pte_t new_pte, old_pte, *tmp;
+	pgprot_t old_prot, new_prot;
+	int i, do_split = 1;
+	unsigned int level;
+
+	spin_lock_irqsave(&pgd_lock, flags);
+	/*
+	 * Check for races, another CPU might have split this page
+	 * up already:
+	 */
+	tmp = lookup_address(address, &level);
+	if (tmp != kpte)
+		goto out_unlock;
+
+	switch (level) {
+	case PG_LEVEL_2M:
+		psize = PMD_PAGE_SIZE;
+		pmask = PMD_PAGE_MASK;
+		break;
+#ifdef CONFIG_X86_64
+	case PG_LEVEL_1G:
+		psize = PUD_PAGE_SIZE;
+		pmask = PUD_PAGE_MASK;
+		break;
+#endif
+	default:
+		do_split = -EINVAL;
+		goto out_unlock;
+	}
+
+	/*
+	 * Calculate the number of pages, which fit into this large
+	 * page starting at address:
+	 */
+	nextpage_addr = (address + psize) & pmask;
+	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
+	if (numpages < cpa->numpages)
+		cpa->numpages = numpages;
+
+	/*
+	 * We are safe now. Check whether the new pgprot is the same:
+	 */
+	old_pte = *kpte;
+	old_prot = new_prot = pte_pgprot(old_pte);
+
+	pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
+	pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+
+	/*
+	 * old_pte points to the large page base address. So we need
+	 * to add the offset of the virtual address:
+	 */
+	pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
+	cpa->pfn = pfn;
+
+	new_prot = static_protections(new_prot, address, pfn);
+
+	/*
+	 * We need to check the full range, whether
+	 * static_protections() requires a different pgprot for one of
+	 * the pages in the range we try to preserve:
+	 */
+	if (pfn < max_mapnr) {
+		addr = address + PAGE_SIZE;
+		for (i = 1; i < cpa->numpages && ++pfn < max_mapnr;
+		     i++, addr += PAGE_SIZE) {
+			pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
+
+			if (pgprot_val(chk_prot) != pgprot_val(new_prot))
+				goto out_unlock;
+		}
+	}
+
+	/*
+	 * If there are no changes, return. cpa->numpages has been updated
+	 * above:
+	 */
+	if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
+		do_split = 0;
+		goto out_unlock;
+	}
+
+	/*
+	 * We need to change the attributes. Check, whether we can
+	 * change the large page in one go. We request a split, when
+	 * the address is not aligned or the number of pages is
+	 * smaller than the number of pages in the large page. Note
+	 * that we limited the number of possible pages already to
+	 * the number of pages in the large page.
+	 */
+	if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+		/*
+		 * The address is aligned and the number of pages
+		 * covers the full page.
+		 */
+		new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot));
+		__set_pmd_pte(kpte, address, level, new_pte);
+		cpa->flushtlb = 1;
+		do_split = 0;
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&pgd_lock, flags);
+
+	return do_split;
+}
+
+static LIST_HEAD(page_pool);
+static unsigned long pool_size, pool_pages, pool_low;
+static unsigned long pool_used, pool_failed;
+
+static void cpa_fill_pool(struct page **ret)
+{
+	gfp_t gfp = GFP_KERNEL;
+	unsigned long flags;
+	struct page *p;
+
+	/*
+	 * Avoid recursion (on debug-pagealloc) and also signal
+	 * our priority to get to these pagetables:
+	 */
+	if (current->flags & PF_MEMALLOC)
+		return;
+	current->flags |= PF_MEMALLOC;
+
+	/*
+	 * Allocate atomically from atomic contexts:
+	 */
+	if (in_atomic() || irqs_disabled() || debug_pagealloc)
+		gfp =  GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
+
+	while (pool_pages < pool_size || (ret && !*ret)) {
+		p = alloc_pages(gfp, 0);
+		if (!p) {
+			pool_failed++;
+			break;
+		}
+		/*
+		 * If the call site needs a page right now, provide it:
+		 */
+		if (ret && !*ret) {
+			*ret = p;
+			continue;
+		}
+		spin_lock_irqsave(&pgd_lock, flags);
+		list_add(&p->lru, &page_pool);
+		pool_pages++;
+		spin_unlock_irqrestore(&pgd_lock, flags);
+	}
+
+	current->flags &= ~PF_MEMALLOC;
+}
+
+#define SHIFT_MB		(20 - PAGE_SHIFT)
+#define ROUND_MB_GB		((1 << 10) - 1)
+#define SHIFT_MB_GB		10
+#define POOL_PAGES_PER_GB	16
+
+void __init cpa_init(void)
+{
+	struct sysinfo si;
+	unsigned long gb;
+
+	si_meminfo(&si);
+	/*
+	 * Calculate the number of pool pages:
+	 *
+	 * Convert totalram (nr of pages) to MiB and round to the next
+	 * GiB. Shift MiB to GiB and multiply the result by
+	 * POOL_PAGES_PER_GB (without debug_pagealloc the pool is one page):
+	 */
+	if (debug_pagealloc) {
+		gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
+		pool_size = POOL_PAGES_PER_GB * gb;
+	} else {
+		pool_size = 1;
+	}
+	pool_low = pool_size;
+
+	cpa_fill_pool(NULL);
+	printk(KERN_DEBUG
+	       "CPA: page pool initialized %lu of %lu pages preallocated\n",
+	       pool_pages, pool_size);
+}
+
+static int split_large_page(pte_t *kpte, unsigned long address)
+{
+	unsigned long flags, mfn, mfninc = 1;
+	unsigned int i, level;
+	pte_t *pbase, *tmp;
+	pgprot_t ref_prot;
+	struct page *base;
+
+	/*
+	 * Get a page from the pool. The pool list is protected by the
+	 * pgd_lock, which we have to take anyway for the split
+	 * operation:
+	 */
+	spin_lock_irqsave(&pgd_lock, flags);
+	if (list_empty(&page_pool)) {
+		spin_unlock_irqrestore(&pgd_lock, flags);
+		base = NULL;
+		cpa_fill_pool(&base);
+		if (!base)
+			return -ENOMEM;
+		spin_lock_irqsave(&pgd_lock, flags);
+	} else {
+		base = list_first_entry(&page_pool, struct page, lru);
+		list_del(&base->lru);
+		pool_pages--;
+
+		if (pool_pages < pool_low)
+			pool_low = pool_pages;
+	}
+
+	/*
+	 * Check for races, another CPU might have split this page
+	 * up for us already:
+	 */
+	tmp = lookup_address(address, &level);
+	if (tmp != kpte)
+		goto out_unlock;
+
+	pbase = (pte_t *)page_address(base);
+#ifdef CONFIG_X86_32
+	paravirt_alloc_pt(&init_mm, page_to_pfn(base));
+#endif
+	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+
+#ifdef CONFIG_X86_64
+	if (level == PG_LEVEL_1G) {
+		mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+		pgprot_val(ref_prot) |= _PAGE_PSE;
+	}
+#endif
+
+	/*
+	 * Get the target mfn from the original entry:
+	 */
+	mfn = __pte_mfn(*kpte);
+	for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc)
+		set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot));
+
+	/*
+	 * Install the new, split up pagetable. Important details here:
+	 *
+	 * On Intel the NX bit of all levels must be cleared to make a
+	 * page executable (see section 4.13.2 of the Intel 64 and IA-32
+	 * Architectures Software Developer's Manual).
+	 *
+	 * Mark the entry present. The current mapping might be
+	 * set to not present, which we preserved above.
+	 */
+	if (!xen_feature(XENFEAT_writable_page_tables) &&
+	    HYPERVISOR_update_va_mapping((unsigned long)pbase,
+					 mk_pte(base, PAGE_KERNEL_RO), 0))
+		BUG();
+	ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
+	pgprot_val(ref_prot) |= _PAGE_PRESENT;
+	__set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot));
+	base = NULL;
+
+out_unlock:
+	/*
+	 * If we dropped out via the lookup_address check under
+	 * pgd_lock then stick the page back into the pool:
+	 */
+	if (base) {
+		list_add(&base->lru, &page_pool);
+		pool_pages++;
+	} else
+		pool_used++;
+	spin_unlock_irqrestore(&pgd_lock, flags);
+
+	return 0;
+}
+
+static int __change_page_attr(struct cpa_data *cpa, int primary)
+{
+	unsigned long address = cpa->vaddr;
+	int do_split, err;
+	unsigned int level;
+	pte_t *kpte, old_pte;
+
+repeat:
+	kpte = lookup_address(address, &level);
+	if (!kpte)
+		return primary ? -EINVAL : 0;
+
+	old_pte = *kpte;
+	if (!__pte_val(old_pte)) {
+		if (!primary)
+			return 0;
+		printk(KERN_WARNING "CPA: called for zero pte. "
+		       "vaddr = %lx cpa->vaddr = %lx\n", address,
+		       cpa->vaddr);
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	if (level == PG_LEVEL_4K) {
+		pte_t new_pte;
+		pgprot_t new_prot = pte_pgprot(old_pte);
+		unsigned long mfn = __pte_mfn(old_pte);
+
+		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
+		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+
+		new_prot = static_protections(new_prot, address,
+					      mfn_to_local_pfn(mfn));
+
+		/*
+		 * We need to keep the mfn from the existing PTE,
+		 * after all we're only going to change its attributes
+		 * not the memory it points to
+		 */
+		new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot));
+		cpa->pfn = mfn_to_local_pfn(mfn);
+		/*
+		 * Do we really change anything ?
+		 */
+		if (__pte_val(old_pte) != __pte_val(new_pte)) {
+			set_pte_atomic(kpte, new_pte);
+			cpa->flushtlb = 1;
+		}
+		cpa->numpages = 1;
+		return 0;
+	}
+
+	/*
+	 * Check, whether we can keep the large page intact
+	 * and just change the pte:
+	 */
+	do_split = try_preserve_large_page(kpte, address, cpa);
+	/*
+	 * When the range fits into the existing large page,
+	 * return. cpa->numpages and cpa->flushtlb have been updated in
+	 * try_preserve_large_page():
+	 */
+	if (do_split <= 0)
+		return do_split;
+
+	/*
+	 * We have to split the large page:
+	 */
+	err = split_large_page(kpte, address);
+	if (!err) {
+		cpa->flushtlb = 1;
+		goto repeat;
+	}
+
+	return err;
+}
+
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
+
+static int cpa_process_alias(struct cpa_data *cpa)
+{
+	struct cpa_data alias_cpa;
+	int ret = 0;
+
+	if (cpa->pfn > max_pfn_mapped)
+		return 0;
+
+	/*
+	 * No need to redo, when the primary call touched the direct
+	 * mapping already:
+	 */
+	if (!within(cpa->vaddr, PAGE_OFFSET,
+		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+
+		alias_cpa = *cpa;
+		alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
+
+		ret = __change_page_attr_set_clr(&alias_cpa, 0);
+	}
+
+#ifdef CONFIG_X86_64
+	if (ret)
+		return ret;
+	/*
+	 * No need to redo, when the primary call touched the high
+	 * mapping already:
+	 */
+	if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
+		return 0;
+
+	/*
+	 * If the physical address is inside the kernel map, we need
+	 * to touch the high mapped kernel as well:
+	 */
+	if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
+		return 0;
+
+	alias_cpa = *cpa;
+	alias_cpa.vaddr =
+		(cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map;
+
+	/*
+	 * The high mapping range is imprecise, so ignore the return value.
+	 */
+	__change_page_attr_set_clr(&alias_cpa, 0);
+#endif
+	return ret;
+}
+
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
+{
+	int ret, numpages = cpa->numpages;
+
+	while (numpages) {
+		/*
+		 * Store the remaining nr of pages for the large page
+		 * preservation check.
+		 */
+		cpa->numpages = numpages;
+
+		ret = __change_page_attr(cpa, checkalias);
+		if (ret)
+			return ret;
+
+		if (checkalias) {
+			ret = cpa_process_alias(cpa);
+			if (ret)
+				return ret;
+		}
+
+		/*
+		 * Adjust the number of pages with the result of the
+		 * CPA operation. Either a large page has been
+		 * preserved or a single page update happened.
+		 */
+		BUG_ON(cpa->numpages > numpages);
+		numpages -= cpa->numpages;
+		cpa->vaddr += cpa->numpages * PAGE_SIZE;
+	}
+	return 0;
+}
+
+static inline int cache_attr(pgprot_t attr)
+{
+	return pgprot_val(attr) &
+		(_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
+}
+
+static int change_page_attr_set_clr(unsigned long addr, int numpages,
+				    pgprot_t mask_set, pgprot_t mask_clr)
+{
+	struct cpa_data cpa;
+	int ret, cache, checkalias;
+
+	/*
+	 * Check, if we are requested to change a not supported
+	 * feature:
+	 */
+	mask_set = canon_pgprot(mask_set);
+	mask_clr = canon_pgprot(mask_clr);
+	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
+		return 0;
+
+	/* Ensure we are PAGE_SIZE aligned */
+	if (addr & ~PAGE_MASK) {
+		addr &= PAGE_MASK;
+		/*
+		 * People should not be passing in unaligned addresses:
+		 */
+		WARN_ON_ONCE(1);
+	}
+
+	cpa.vaddr = addr;
+	cpa.numpages = numpages;
+	cpa.mask_set = mask_set;
+	cpa.mask_clr = mask_clr;
+	cpa.flushtlb = 0;
+
+	/* No alias checking for _NX bit modifications */
+	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
+
+	ret = __change_page_attr_set_clr(&cpa, checkalias);
+
+	/*
+	 * Check whether we really changed something:
+	 */
+	if (!cpa.flushtlb)
+		goto out;
+
+	/*
+	 * No need to flush, when we did not set any of the caching
+	 * attributes:
+	 */
+	cache = cache_attr(mask_set);
+
+	/*
+	 * On success we use clflush, when the CPU supports it to
+	 * avoid the wbinvd. If the CPU does not support it and in the
+	 * error case we fall back to cpa_flush_all (which uses
+	 * wbinvd):
+	 */
+	if (!ret && cpu_has_clflush)
+		cpa_flush_range(addr, numpages, cache);
+	else
+		cpa_flush_all(cache);
+
+out:
+	cpa_fill_pool(NULL);
+
+	return ret;
+}
+
+static inline int change_page_attr_set(unsigned long addr, int numpages,
+				       pgprot_t mask)
+{
+	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
+}
+
+static inline int change_page_attr_clear(unsigned long addr, int numpages,
+					 pgprot_t mask)
+{
+	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
+}
+
+int set_memory_uc(unsigned long addr, int numpages)
+{
+	return change_page_attr_set(addr, numpages,
+				    __pgprot(_PAGE_PCD));
+}
+EXPORT_SYMBOL(set_memory_uc);
+
+int set_memory_wb(unsigned long addr, int numpages)
+{
+	return change_page_attr_clear(addr, numpages,
+				      __pgprot(_PAGE_PCD | _PAGE_PWT));
+}
+EXPORT_SYMBOL(set_memory_wb);
+
+int set_memory_x(unsigned long addr, int numpages)
+{
+	return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
+}
+EXPORT_SYMBOL(set_memory_x);
+
+int set_memory_nx(unsigned long addr, int numpages)
+{
+	return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
+}
+EXPORT_SYMBOL(set_memory_nx);
+
+int set_memory_ro(unsigned long addr, int numpages)
+{
+	return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
+}
+
+int set_memory_rw(unsigned long addr, int numpages)
+{
+	return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
+}
+
+int set_memory_np(unsigned long addr, int numpages)
+{
+	return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
+}
+
+int set_pages_uc(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_uc(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_uc);
+
+int set_pages_wb(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_wb(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_wb);
+
+int set_pages_x(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_x(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_x);
+
+int set_pages_nx(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_nx(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_nx);
+
+int set_pages_ro(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_ro(addr, numpages);
+}
+
+int set_pages_rw(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_rw(addr, numpages);
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+
+static int __set_pages_p(struct page *page, int numpages)
+{
+	struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+				.numpages = numpages,
+				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+				.mask_clr = __pgprot(0)};
+
+	return __change_page_attr_set_clr(&cpa, 1);
+}
+
+static int __set_pages_np(struct page *page, int numpages)
+{
+	struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+				.numpages = numpages,
+				.mask_set = __pgprot(0),
+				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
+
+	return __change_page_attr_set_clr(&cpa, 1);
+}
+
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+	if (PageHighMem(page))
+		return;
+	if (!enable) {
+		debug_check_no_locks_freed(page_address(page),
+					   numpages * PAGE_SIZE);
+	}
+
+	/*
+	 * If page allocator is not up yet then do not call c_p_a():
+	 */
+	if (!debug_pagealloc_enabled)
+		return;
+
+	/*
+	 * The return value is ignored as the calls cannot fail.
+	 * Large pages are kept enabled at boot time, and are
+	 * split up quickly with DEBUG_PAGEALLOC. If a splitup
+	 * fails here (due to temporary memory shortage) no damage
+	 * is done because we just keep the largepage intact up
+	 * to the next attempt when it will likely be split up:
+	 */
+	if (enable)
+		__set_pages_p(page, numpages);
+	else
+		__set_pages_np(page, numpages);
+
+	/*
+	 * We should perform an IPI and flush all tlbs,
+	 * but that can deadlock->flush only current cpu:
+	 */
+	__flush_tlb_all();
+
+	/*
+	 * Try to refill the page pool here. We can do this only after
+	 * the tlb flush.
+	 */
+	cpa_fill_pool(NULL);
+}
+
+#ifdef CONFIG_HIBERNATION
+/* Is @page present in the kernel mapping? Highmem pages report false. */
+bool kernel_page_present(struct page *page)
+{
+	unsigned int level;
+	pte_t *pte;
+
+	if (PageHighMem(page))
+		return false;
+	/* lookup_address() yields NULL on an empty upper level: not present. */
+	pte = lookup_address((unsigned long)page_address(page), &level);
+	return pte && (__pte_val(*pte) & _PAGE_PRESENT);
+}
+
+#endif /* CONFIG_HIBERNATION */
+
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+static inline int in_secondary_range(unsigned long va)
+{
+#ifdef CONFIG_X86_64
+	return va >= VMALLOC_START && va < VMALLOC_END;
+#else
+	return va >= (unsigned long)high_memory;
+#endif
+}
+
+static void __make_page_readonly(unsigned long va)
+{
+	pte_t *pte;
+	unsigned int level;
+
+	pte = lookup_address(va, &level);
+	BUG_ON(!pte || level != PG_LEVEL_4K);
+	if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0))
+		BUG();
+	if (in_secondary_range(va)) {
+		unsigned long pfn = pte_pfn(*pte);
+
+#ifdef CONFIG_HIGHMEM
+		if (pfn >= highstart_pfn)
+			kmap_flush_unused(); /* flush stale writable kmaps */
+		else
+#endif
+			__make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT));
+	}
+}
+
+static void __make_page_writable(unsigned long va)
+{
+	pte_t *pte;
+	unsigned int level;
+
+	pte = lookup_address(va, &level);
+	BUG_ON(!pte || level != PG_LEVEL_4K);
+	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
+		BUG();
+	if (in_secondary_range(va)) {
+		unsigned long pfn = pte_pfn(*pte);
+
+#ifdef CONFIG_HIGHMEM
+		if (pfn < highstart_pfn)
+#endif
+			__make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT));
+	}
+}
+
+void make_page_readonly(void *va, unsigned int feature)
+{
+	if (!xen_feature(feature))
+		__make_page_readonly((unsigned long)va);
+}
+
+void make_page_writable(void *va, unsigned int feature)
+{
+	if (!xen_feature(feature))
+		__make_page_writable((unsigned long)va);
+}
+
+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
+{
+	unsigned long addr;
+
+	if (xen_feature(feature))
+		return;
+
+	for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
+		__make_page_readonly(addr);
+}
+
+void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
+{
+	unsigned long addr;
+
+	if (xen_feature(feature))
+		return;
+
+	for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE)
+		__make_page_writable(addr);
+}
+
+/*
+ * The testcases use internal knowledge of the implementation that shouldn't
+ * be exposed to the rest of the kernel. Include these directly here.
+ */
+#ifdef CONFIG_CPA_DEBUG
+#include "pageattr-test.c"
+#endif
--- head-2010-04-29.orig/arch/x86/mm/pageattr_64-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,542 +0,0 @@
-/*
- * Copyright 2002 Andi Kleen, SuSE Labs.
- * Thanks to Ben LaHaise for precious feedback.
- */
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-
-#ifdef CONFIG_XEN
-#include <asm/pgalloc.h>
-#include <asm/mmu_context.h>
-
-static void _pin_lock(struct mm_struct *mm, int lock) {
-	if (lock)
-		spin_lock(&mm->page_table_lock);
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
-	/* While mm->page_table_lock protects us against insertions and
-	 * removals of higher level page table pages, it doesn't protect
-	 * against updates of pte-s. Such updates, however, require the
-	 * pte pages to be in consistent state (unpinned+writable or
-	 * pinned+readonly). The pinning and attribute changes, however
-	 * cannot be done atomically, which is why such updates must be
-	 * prevented from happening concurrently.
-	 * Note that no pte lock can ever elsewhere be acquired nesting
-	 * with an already acquired one in the same mm, or with the mm's
-	 * page_table_lock already acquired, as that would break in the
-	 * non-split case (where all these are actually resolving to the
-	 * one page_table_lock). Thus acquiring all of them here is not
-	 * going to result in dead locks, and the order of acquires
-	 * doesn't matter.
-	 */
-	{
-		pgd_t *pgd = mm->pgd;
-		unsigned g;
-
-		for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
-			pud_t *pud;
-			unsigned u;
-
-			if (pgd_none(*pgd))
-				continue;
-			pud = pud_offset(pgd, 0);
-			for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
-				pmd_t *pmd;
-				unsigned m;
-
-				if (pud_none(*pud))
-					continue;
-				pmd = pmd_offset(pud, 0);
-				for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
-					spinlock_t *ptl;
-
-					if (pmd_none(*pmd))
-						continue;
-					ptl = pte_lockptr(0, pmd);
-					if (lock)
-						spin_lock(ptl);
-					else
-						spin_unlock(ptl);
-				}
-			}
-		}
-	}
-#endif
-	if (!lock)
-		spin_unlock(&mm->page_table_lock);
-}
-#define pin_lock(mm) _pin_lock(mm, 1)
-#define pin_unlock(mm) _pin_lock(mm, 0)
-
-#define PIN_BATCH 8
-static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
-
-static inline unsigned int pgd_walk_set_prot(void *pt, pgprot_t flags,
-					     unsigned int cpu, unsigned int seq)
-{
-	struct page *page = virt_to_page(pt);
-	unsigned long pfn = page_to_pfn(page);
-
-	MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
-		(unsigned long)__va(pfn << PAGE_SHIFT),
-		pfn_pte(pfn, flags), 0);
-	if (unlikely(++seq == PIN_BATCH)) {
-		if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
-	                                                PIN_BATCH, NULL)))
-			BUG();
-		seq = 0;
-	}
-
-	return seq;
-}
-
-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
-{
-	pgd_t       *pgd = pgd_base;
-	pud_t       *pud;
-	pmd_t       *pmd;
-	pte_t       *pte;
-	int          g,u,m;
-	unsigned int cpu, seq;
-	multicall_entry_t *mcl;
-
-	cpu = get_cpu();
-
-	/*
-	 * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
-	 * be the 'current' task's pagetables (e.g., current may be 32-bit,
-	 * but the pagetables may be for a 64-bit task).
-	 * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
-	 * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
-	 */
-	for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
-		if (pgd_none(*pgd))
-			continue;
-		pud = pud_offset(pgd, 0);
-		if (PTRS_PER_PUD > 1) /* not folded */
-			seq = pgd_walk_set_prot(pud,flags,cpu,seq);
-		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
-			if (pud_none(*pud))
-				continue;
-			pmd = pmd_offset(pud, 0);
-			if (PTRS_PER_PMD > 1) /* not folded */
-				seq = pgd_walk_set_prot(pmd,flags,cpu,seq);
-			for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
-				if (pmd_none(*pmd))
-					continue;
-				pte = pte_offset_kernel(pmd,0);
-				seq = pgd_walk_set_prot(pte,flags,cpu,seq);
-			}
-		}
-	}
-
-	mcl = per_cpu(pb_mcl, cpu);
-	if (unlikely(seq > PIN_BATCH - 2)) {
-		if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
-			BUG();
-		seq = 0;
-	}
-	MULTI_update_va_mapping(mcl + seq,
-	       (unsigned long)__user_pgd(pgd_base),
-	       pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
-	       0);
-	MULTI_update_va_mapping(mcl + seq + 1,
-	       (unsigned long)pgd_base,
-	       pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
-	       UVMF_TLB_FLUSH);
-	if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
-		BUG();
-
-	put_cpu();
-}
-
-static void __pgd_pin(pgd_t *pgd)
-{
-	pgd_walk(pgd, PAGE_KERNEL_RO);
-	xen_pgd_pin(__pa(pgd)); /* kernel */
-	xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
-	SetPagePinned(virt_to_page(pgd));
-}
-
-static void __pgd_unpin(pgd_t *pgd)
-{
-	xen_pgd_unpin(__pa(pgd));
-	xen_pgd_unpin(__pa(__user_pgd(pgd)));
-	pgd_walk(pgd, PAGE_KERNEL);
-	ClearPagePinned(virt_to_page(pgd));
-}
-
-void pgd_test_and_unpin(pgd_t *pgd)
-{
-	if (PagePinned(virt_to_page(pgd)))
-		__pgd_unpin(pgd);
-}
-
-void mm_pin(struct mm_struct *mm)
-{
-	if (xen_feature(XENFEAT_writable_page_tables))
-		return;
-
-	pin_lock(mm);
-	__pgd_pin(mm->pgd);
-	pin_unlock(mm);
-}
-
-void mm_unpin(struct mm_struct *mm)
-{
-	if (xen_feature(XENFEAT_writable_page_tables))
-		return;
-
-	pin_lock(mm);
-	__pgd_unpin(mm->pgd);
-	pin_unlock(mm);
-}
-
-void mm_pin_all(void)
-{
-	struct page *page;
-	unsigned long flags;
-
-	if (xen_feature(XENFEAT_writable_page_tables))
-		return;
-
-	/*
-	 * Allow uninterrupted access to the pgd_list. Also protects
-	 * __pgd_pin() by disabling preemption.
-	 * All other CPUs must be at a safe point (e.g., in stop_machine
-	 * or offlined entirely).
-	 */
-	spin_lock_irqsave(&pgd_lock, flags);
-	list_for_each_entry(page, &pgd_list, lru) {
-		if (!PagePinned(page))
-			__pgd_pin((pgd_t *)page_address(page));
-	}
-	spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
-{
-	if (!PagePinned(virt_to_page(mm->pgd)))
-		mm_pin(mm);
-}
-
-void arch_exit_mmap(struct mm_struct *mm)
-{
-	struct task_struct *tsk = current;
-
-	task_lock(tsk);
-
-	/*
-	 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
-	 * *much* faster this way, as no tlb flushes means bigger wrpt batches.
-	 */
-	if (tsk->active_mm == mm) {
-		tsk->active_mm = &init_mm;
-		atomic_inc(&init_mm.mm_count);
-
-		switch_mm(mm, &init_mm, tsk);
-
-		atomic_dec(&mm->mm_count);
-		BUG_ON(atomic_read(&mm->mm_count) == 0);
-	}
-
-	task_unlock(tsk);
-
-	if (PagePinned(virt_to_page(mm->pgd))
-	    && (atomic_read(&mm->mm_count) == 1)
-	    && !mm->context.has_foreign_mappings)
-		mm_unpin(mm);
-}
-
-static void _pte_free(struct page *page, unsigned int order)
-{
-	BUG_ON(order);
-	pte_free(page);
-}
-
-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
-{
-	struct page *pte;
-
-	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-	if (pte) {
-		SetPageForeign(pte, _pte_free);
-		init_page_count(pte);
-	}
-	return pte;
-}
-
-void pte_free(struct page *pte)
-{
-	unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
-
-	if (!pte_write(*virt_to_ptep(va)))
-		if (HYPERVISOR_update_va_mapping(
-			va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0))
-			BUG();
-
-	ClearPageForeign(pte);
-	init_page_count(pte);
-
-	__free_page(pte);
-}
-#endif	/* CONFIG_XEN */
-
-pte_t *lookup_address(unsigned long address)
-{
-	pgd_t *pgd = pgd_offset_k(address);
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	if (pgd_none(*pgd))
-		return NULL;
-	pud = pud_offset(pgd, address);
-	if (!pud_present(*pud))
-		return NULL;
-	pmd = pmd_offset(pud, address);
-	if (!pmd_present(*pmd))
-		return NULL;
-	if (pmd_large(*pmd))
-		return (pte_t *)pmd;
-	pte = pte_offset_kernel(pmd, address);
-	if (pte && !pte_present(*pte))
-		pte = NULL;
-	return pte;
-}
-
-static struct page *split_large_page(unsigned long address, pgprot_t prot,
-				     pgprot_t ref_prot)
-{
-	int i;
-	unsigned long addr;
-	struct page *base = alloc_pages(GFP_KERNEL, 0);
-	pte_t *pbase;
-	if (!base)
-		return NULL;
-	/*
-	 * page_private is used to track the number of entries in
-	 * the page table page have non standard attributes.
-	 */
-	SetPagePrivate(base);
-	page_private(base) = 0;
-
-	address = __pa(address);
-	addr = address & LARGE_PAGE_MASK;
-	pbase = (pte_t *)page_address(base);
-	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
-		pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
-				   addr == address ? prot : ref_prot);
-	}
-	return base;
-}
-
-void clflush_cache_range(void *adr, int size)
-{
-	int i;
-	for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
-		clflush(adr+i);
-}
-
-static void flush_kernel_map(void *arg)
-{
-	struct list_head *l = (struct list_head *)arg;
-	struct page *pg;
-
-	/* When clflush is available always use it because it is
-	   much cheaper than WBINVD. */
-	/* clflush is still broken. Disable for now. */
-	if (1 || !cpu_has_clflush)
-		asm volatile("wbinvd" ::: "memory");
-	else list_for_each_entry(pg, l, lru) {
-		void *adr = page_address(pg);
-		clflush_cache_range(adr, PAGE_SIZE);
-	}
-	__flush_tlb_all();
-}
-
-static inline void flush_map(struct list_head *l)
-{
-	on_each_cpu(flush_kernel_map, l, 1, 1);
-}
-
-static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
-
-static inline void save_page(struct page *fpage)
-{
-	if (!test_and_set_bit(PG_arch_1, &fpage->flags))
-		list_add(&fpage->lru, &deferred_pages);
-}
-
-/*
- * No more special protections in this 2/4MB area - revert to a
- * large page again.
- */
-static void revert_page(unsigned long address, pgprot_t ref_prot)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t large_pte;
-	unsigned long pfn;
-
-	pgd = pgd_offset_k(address);
-	BUG_ON(pgd_none(*pgd));
-	pud = pud_offset(pgd,address);
-	BUG_ON(pud_none(*pud));
-	pmd = pmd_offset(pud, address);
-	BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
-	pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
-	large_pte = pfn_pte(pfn, ref_prot);
-	large_pte = pte_mkhuge(large_pte);
-	set_pte((pte_t *)pmd, large_pte);
-}
-
-static int
-__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
-				   pgprot_t ref_prot)
-{
-	pte_t *kpte;
-	struct page *kpte_page;
-	pgprot_t ref_prot2;
-
-	kpte = lookup_address(address);
-	if (!kpte) return 0;
-	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
-	BUG_ON(PageLRU(kpte_page));
-	BUG_ON(PageCompound(kpte_page));
-	if (pgprot_val(prot) != pgprot_val(ref_prot)) {
-		if (!pte_huge(*kpte)) {
-			set_pte(kpte, pfn_pte(pfn, prot));
-		} else {
- 			/*
-			 * split_large_page will take the reference for this
-			 * change_page_attr on the split page.
- 			 */
-			struct page *split;
-			ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
-			split = split_large_page(address, prot, ref_prot2);
-			if (!split)
-				return -ENOMEM;
-			pgprot_val(ref_prot2) &= ~_PAGE_NX;
-			set_pte(kpte, mk_pte(split, ref_prot2));
-			kpte_page = split;
-		}
-		page_private(kpte_page)++;
-	} else if (!pte_huge(*kpte)) {
-		set_pte(kpte, pfn_pte(pfn, ref_prot));
-		BUG_ON(page_private(kpte_page) == 0);
-		page_private(kpte_page)--;
-	} else
-		BUG();
-
-	/* on x86-64 the direct mapping set at boot is not using 4k pages */
-	/*
-	 * ..., but the XEN guest kernels (currently) do:
-	 * If the pte was reserved, it means it was created at boot
-	 * time (not via split_large_page) and in turn we must not
-	 * replace it with a large page.
-	 */
-#ifndef CONFIG_XEN
- 	BUG_ON(PageReserved(kpte_page));
-#else
-	if (PageReserved(kpte_page))
-		return 0;
-#endif
-
-	save_page(kpte_page);
-	if (page_private(kpte_page) == 0)
-		revert_page(address, ref_prot);
-	return 0;
-}
-
-/*
- * Change the page attributes of an page in the linear mapping.
- *
- * This should be used when a page is mapped with a different caching policy
- * than write-back somewhere - some CPUs do not like it when mappings with
- * different caching policies exist. This changes the page attributes of the
- * in kernel linear mapping too.
- *
- * The caller needs to ensure that there are no conflicting mappings elsewhere.
- * This function only deals with the kernel linear map.
- *
- * Caller must call global_flush_tlb() after this.
- */
-int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
-{
-	int err = 0, kernel_map = 0;
-	int i;
-
-	if (address >= __START_KERNEL_map
-	    && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
-		address = (unsigned long)__va(__pa(address));
-		kernel_map = 1;
-	}
-
-	down_write(&init_mm.mmap_sem);
-	for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
-		unsigned long pfn = __pa(address) >> PAGE_SHIFT;
-
-		if (!kernel_map || pte_present(pfn_pte(0, prot))) {
-			err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
-			if (err)
-				break;
-		}
-		/* Handle kernel mapping too which aliases part of the
-		 * lowmem */
-		if (__pa(address) < KERNEL_TEXT_SIZE) {
-			unsigned long addr2;
-			pgprot_t prot2;
-			addr2 = __START_KERNEL_map + __pa(address);
-			/* Make sure the kernel mappings stay executable */
-			prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
-			err = __change_page_attr(addr2, pfn, prot2,
-						 PAGE_KERNEL_EXEC);
-		}
-	}
-	up_write(&init_mm.mmap_sem);
-	return err;
-}
-
-/* Don't call this for MMIO areas that may not have a mem_map entry */
-int change_page_attr(struct page *page, int numpages, pgprot_t prot)
-{
-	unsigned long addr = (unsigned long)page_address(page);
-	return change_page_attr_addr(addr, numpages, prot);
-}
-
-void global_flush_tlb(void)
-{
-	struct page *pg, *next;
-	struct list_head l;
-
-	/*
-	 * Write-protect the semaphore, to exclude two contexts
-	 * doing a list_replace_init() call in parallel and to
-	 * exclude new additions to the deferred_pages list:
-	 */
-	down_write(&init_mm.mmap_sem);
-	list_replace_init(&deferred_pages, &l);
-	up_write(&init_mm.mmap_sem);
-
-	flush_map(&l);
-
-	list_for_each_entry_safe(pg, next, &l, lru) {
-		list_del(&pg->lru);
-		clear_bit(PG_arch_1, &pg->flags);
-		if (page_private(pg) != 0)
-			continue;
-		ClearPagePrivate(pg);
-		__free_page(pg);
-	}
-}
-
-EXPORT_SYMBOL(change_page_attr);
-EXPORT_SYMBOL(global_flush_tlb);
--- head-2010-04-29.orig/arch/x86/mm/pgtable_32-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/mm/pgtable_32-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -29,8 +29,6 @@
 #include <xen/features.h>
 #include <asm/hypervisor.h>

-static void pgd_test_and_unpin(pgd_t *pgd);
-
 void show_mem(void)
 {
 	int total = 0, reserved = 0;
@@ -167,53 +165,6 @@ pte_t *pte_alloc_one_kernel(struct mm_st
 	return pte;
 }

-static void _pte_free(struct page *page, unsigned int order)
-{
-	BUG_ON(order);
-	pte_free(page);
-}
-
-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
-{
-	struct page *pte;
-
-#ifdef CONFIG_HIGHPTE
-	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
-#else
-	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-#endif
-	if (pte) {
-		SetPageForeign(pte, _pte_free);
-		init_page_count(pte);
-	}
-	return pte;
-}
-
-void pte_free(struct page *pte)
-{
-	unsigned long pfn = page_to_pfn(pte);
-
-	if (!PageHighMem(pte)) {
-		unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT);
-
-		if (!pte_write(*virt_to_ptep(va)))
-			if (HYPERVISOR_update_va_mapping(
-				va, pfn_pte(pfn, PAGE_KERNEL), 0))
-				BUG();
-	} else
-		ClearPagePinned(pte);
-
-	ClearPageForeign(pte);
-	init_page_count(pte);
-
-	__free_page(pte);
-}
-
-void pmd_ctor(struct kmem_cache *cache, void *pmd)
-{
-	memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
-}
-
 /*
  * List of all pgd's needed for non-PAE so it can invalidate entries
  * in both cached and uncached pgd's; not needed for PAE since the
@@ -224,224 +175,191 @@ void pmd_ctor(struct kmem_cache *cache,
  * vmalloc faults work because attached pagetables are never freed.
  * -- wli
  */
-DEFINE_SPINLOCK(pgd_lock);
-struct page *pgd_list;
-
 static inline void pgd_list_add(pgd_t *pgd)
 {
 	struct page *page = virt_to_page(pgd);
-	page->index = (unsigned long)pgd_list;
-	if (pgd_list)
-		set_page_private(pgd_list, (unsigned long)&page->index);
-	pgd_list = page;
-	set_page_private(page, (unsigned long)&pgd_list);
+
+	list_add(&page->lru, &pgd_list);	/* linked through the pgd page's lru field; callers here hold pgd_lock */
 }

 static inline void pgd_list_del(pgd_t *pgd)
 {
-	struct page *next, **pprev, *page = virt_to_page(pgd);
-	next = (struct page *)page->index;
-	pprev = (struct page **)page_private(page);
-	*pprev = next;
-	if (next)
-		set_page_private(next, (unsigned long)pprev);
-}
+	struct page *page = virt_to_page(pgd);

+	list_del(&page->lru);	/* counterpart of pgd_list_add(): unlink the pgd page's lru entry */
+}

+#define UNSHARED_PTRS_PER_PGD				\
+	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)

-#if (PTRS_PER_PMD == 1)
-/* Non-PAE pgd constructor */
-static void pgd_ctor(void *pgd)
+static void pgd_ctor(void *p)
 {
+	pgd_t *pgd = p;	/* typed alias, so the pointer arithmetic below needs no cast */
 	unsigned long flags;

-	/* !PAE, no pagetable sharing */
+	pgd_test_and_unpin(pgd);
+
+	/* Clear usermode parts of PGD */
 	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));

 	spin_lock_irqsave(&pgd_lock, flags);

-	/* must happen under lock */
-	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
-			swapper_pg_dir + USER_PTRS_PER_PGD,
-			KERNEL_PGD_PTRS);
-
-	paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
-				__pa(swapper_pg_dir) >> PAGE_SHIFT,
-				USER_PTRS_PER_PGD,
-				KERNEL_PGD_PTRS);
-	pgd_list_add(pgd);
-	spin_unlock_irqrestore(&pgd_lock, flags);
-}
-#else  /* PTRS_PER_PMD > 1 */
-/* PAE pgd constructor */
-static void pgd_ctor(void *pgd)
-{
-	/* PAE, kernel PMD may be shared */
-
-	if (SHARED_KERNEL_PMD) {
-		clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+	/* If the pgd points to a shared pagetable level (either the
+	   ptes in non-PAE, or shared PMD in PAE), then just copy the
+	   references from swapper_pg_dir. */
+	if (PAGETABLE_LEVELS == 2 ||
+	    (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
+		clone_pgd_range(pgd + USER_PTRS_PER_PGD,
 				swapper_pg_dir + USER_PTRS_PER_PGD,
 				KERNEL_PGD_PTRS);
-	} else {
-		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+		paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
+					__pa(swapper_pg_dir) >> PAGE_SHIFT,
+					USER_PTRS_PER_PGD,
+					KERNEL_PGD_PTRS);
 	}
+
+	/* list required to sync kernel mapping updates */
+	if (PAGETABLE_LEVELS == 2)	/* the PAE pgd_prepopulate_pmd() does its own pgd_list_add() */
+		pgd_list_add(pgd);
+
+	spin_unlock_irqrestore(&pgd_lock, flags);
 }
-#endif	/* PTRS_PER_PMD */

 static void pgd_dtor(void *pgd)
 {
 	unsigned long flags; /* can be called from interrupt context */

-	if (SHARED_KERNEL_PMD)
-		return;
-
-	paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
-	spin_lock_irqsave(&pgd_lock, flags);
-	pgd_list_del(pgd);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	if (!SHARED_KERNEL_PMD) {	/* NOTE(review): PAE pgd_prepopulate_pmd() lists the pgd unconditionally - confirm SHARED_KERNEL_PMD is 0 there */
+		spin_lock_irqsave(&pgd_lock, flags);
+		pgd_list_del(pgd);
+		spin_unlock_irqrestore(&pgd_lock, flags);
+	}

 	pgd_test_and_unpin(pgd);
 }

-#define UNSHARED_PTRS_PER_PGD				\
-	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
-
-/* If we allocate a pmd for part of the kernel address space, then
-   make sure its initialized with the appropriate kernel mappings.
-   Otherwise use a cached zeroed pmd.  */
-static pmd_t *pmd_cache_alloc(int idx)
+#ifdef CONFIG_X86_PAE
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 {
-	pmd_t *pmd;
+	int i;

-	if (idx >= USER_PTRS_PER_PGD) {
-		pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
+	for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+		pgd_t pgd = pgdp[i];

-#ifndef CONFIG_XEN
-		if (pmd)
-			memcpy(pmd,
-			       (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
-			       sizeof(pmd_t) * PTRS_PER_PMD);
-#endif
-	} else
-		pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+		if (__pgd_val(pgd) != 0) {	/* slot still references a pmd page */
+			pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

-	return pmd;
-}
+			pgdp[i] = xen_make_pgd(0);	/* clear the slot before the pmd is released and freed */

-static void pmd_cache_free(pmd_t *pmd, int idx)
-{
-	if (idx >= USER_PTRS_PER_PGD) {
-		make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables);
-		memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
-		free_page((unsigned long)pmd);
-	} else
-		kmem_cache_free(pmd_cache, pmd);
+			paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
+			pmd_free(mm, pmd);
+		}
+	}
 }

-pgd_t *pgd_alloc(struct mm_struct *mm)
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update.  Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
 {
+	pud_t *pud;
+	pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
+	unsigned long addr, flags;
 	int i;
-	pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
-	pmd_t **pmds = NULL;
-	unsigned long flags;
-
-	pgd_test_and_unpin(pgd);
-
-	if (PTRS_PER_PMD == 1 || !pgd)
-		return pgd;
-
-#ifdef CONFIG_XEN
-	if (!SHARED_KERNEL_PMD) {
-		/*
-		 * We can race save/restore (if we sleep during a GFP_KERNEL memory
-		 * allocation). We therefore store virtual addresses of pmds as they
-		 * do not change across save/restore, and poke the machine addresses
-		 * into the pgdir under the pgd_lock.
-		 */
-		pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
-		if (!pmds) {
-			quicklist_free(0, pgd_dtor, pgd);
-			return NULL;
-		}
-	}
-#endif

-	/* Allocate pmds, remember virtual addresses. */
-	for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
-		pmd_t *pmd = pmd_cache_alloc(i);
-
-		if (!pmd)
+	/*
+	 * We can race save/restore (if we sleep during a GFP_KERNEL memory
+	 * allocation). We therefore store virtual addresses of pmds as they
+	 * do not change across save/restore, and poke the machine addresses
+	 * into the pgdir under the pgd_lock.
+	 */
+ 	for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
+		pmds[i] = pmd_alloc_one(mm, addr);
+		if (!pmds[i])
 			goto out_oom;
-
-		paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
-		if (pmds)
-			pmds[i] = pmd;
-		else
-			set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
 	}

-#ifdef CONFIG_XEN
-	if (SHARED_KERNEL_PMD)
-		return pgd;
-
 	spin_lock_irqsave(&pgd_lock, flags);

 	/* Protect against save/restore: move below 4GB under pgd_lock. */
-	if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
-		int rc = xen_create_contiguous_region(
-			(unsigned long)pgd, 0, 32);
-		if (rc) {
-			spin_unlock_irqrestore(&pgd_lock, flags);
-			goto out_oom;
-		}
+	if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
+	    && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
+		spin_unlock_irqrestore(&pgd_lock, flags);
+out_oom:	/* also the goto target of the allocation loop (lock not held there); i = number of pmds allocated */
+		while (i--)
+			pmd_free(mm, pmds[i]);
+		return 0;	/* failure: pgd_alloc() frees the pgd */
 	}

 	/* Copy kernel pmd contents and write-protect the new pmds. */
-	for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
-		memcpy(pmds[i],
-		       (void *)pgd_page_vaddr(swapper_pg_dir[i]),
-		       sizeof(pmd_t) * PTRS_PER_PMD);
-		make_lowmem_page_readonly(
-			pmds[i], XENFEAT_writable_page_tables);
-	}
+	pud = pud_offset(pgd, 0);
+ 	for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
+	     i++, pud++, addr += PUD_SIZE) {
+		if (i >= USER_PTRS_PER_PGD) {	/* kernel-range slot: start from swapper_pg_dir's pmd contents */
+			memcpy(pmds[i],
+			       (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+			       sizeof(pmd_t) * PTRS_PER_PMD);
+			make_lowmem_page_readonly(
+				pmds[i], XENFEAT_writable_page_tables);
+		}

-	/* It is safe to poke machine addresses of pmds under the pmd_lock. */
-	for (i = 0; i < PTRS_PER_PGD; i++)
-		set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i])));
+		/* It is safe to poke machine addresses of pmds under the pgd_lock. */
+		pud_populate(mm, pud, pmds[i]);
+	}

-	/* Ensure this pgd gets picked up and pinned on save/restore. */
+	/* List required to sync kernel mapping updates and
+	 * to pin/unpin on save/restore. */
 	pgd_list_add(pgd);

 	spin_unlock_irqrestore(&pgd_lock, flags);

-	kfree(pmds);
-#endif
+	return 1;	/* success: every unshared pgd slot now has a pmd */
+}
+#else  /* !CONFIG_X86_PAE */
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+	return 1;	/* non-zero is the success value pgd_alloc() tests for */
+}

-	return pgd;
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{	/* empty: the non-PAE pgd_prepopulate_pmd() allocates nothing */
+}
+#endif	/* CONFIG_X86_PAE */

-out_oom:
-	if (!pmds) {
-		for (i--; i >= 0; i--) {
-			pgd_t pgdent = pgd[i];
-			void* pmd = (void *)__va(pgd_val(pgdent)-1);
-			paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-			pmd_cache_free(pmd, i);
-		}
-	} else {
-		for (i--; i >= 0; i--) {
-			paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT);
-			pmd_cache_free(pmds[i], i);
-		}
-		kfree(pmds);
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+
+	/* so that alloc_pd can use it */
+	mm->pgd = pgd;
+	if (pgd)
+		pgd_ctor(pgd);
+
+	if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {	/* only the PAE variant can return 0 */
+		free_page((unsigned long)pgd);	/* PAE lists the pgd only on success, so no pgd_list_del() here */
+		pgd = NULL;
 	}
-	quicklist_free(0, pgd_dtor, pgd);
-	return NULL;
+
+	return pgd;	/* NULL if the page or pmd allocation failed */
 }

-void pgd_free(pgd_t *pgd)
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	int i;
-
 	/*
 	 * After this the pgd should not be pinned for the duration of this
 	 * function's execution. We should never sleep and thus never race:
@@ -450,39 +368,43 @@ void pgd_free(pgd_t *pgd)
 	 *  2. The machine addresses in PGD entries will not become invalid
 	 *     due to a concurrent save/restore.
 	 */
-	pgd_test_and_unpin(pgd);
+	pgd_dtor(pgd);	/* unlinks from pgd_list (when !SHARED_KERNEL_PMD) and calls pgd_test_and_unpin() */

-	/* in the PAE case user pgd entries are overwritten before usage */
-	if (PTRS_PER_PMD > 1) {
-		for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
-			pgd_t pgdent = pgd[i];
-			void* pmd = (void *)__va(pgd_val(pgdent)-1);
-			paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-			pmd_cache_free(pmd, i);
-		}
+	if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
+		xen_destroy_contiguous_region((unsigned long)pgd, 0);	/* undoes the xen_create_contiguous_region() of pgd_prepopulate_pmd() */

-		if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
-			xen_destroy_contiguous_region((unsigned long)pgd, 0);
-	}
+	pgd_mop_up_pmds(mm, pgd);	/* frees pmds still referenced by the pgd (empty stub without PAE) */
+	free_page((unsigned long)pgd);
+}

-	/* in the non-PAE case, free_pgtables() clears user pgd entries */
-	quicklist_free(0, pgd_dtor, pgd);
+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+	pgtable_page_dtor(pte);
+	paravirt_release_pt(page_to_pfn(pte));
+	tlb_remove_page(tlb, pte);	/* the page is handed to tlb_remove_page() rather than freed here */
 }

-void check_pgt_cache(void)
+#ifdef CONFIG_X86_PAE
+
+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
-	quicklist_trim(0, pgd_dtor, 25, 16);
+	paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+	tlb_remove_page(tlb, virt_to_page(pmd));	/* the pmd page is handed to tlb_remove_page() rather than freed here */
 }

+#endif
+
 void make_lowmem_page_readonly(void *va, unsigned int feature)
 {
 	pte_t *pte;
+	unsigned int level;
 	int rc;

 	if (xen_feature(feature))
 		return;

-	pte = virt_to_ptep(va);
+	pte = lookup_address((unsigned long)va, &level);
+	BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));	/* va must have a present 4k pte */
 	rc = HYPERVISOR_update_va_mapping(
 		(unsigned long)va, pte_wrprotect(*pte), 0);
 	BUG_ON(rc);
@@ -491,313 +413,15 @@ void make_lowmem_page_readonly(void *va,
 void make_lowmem_page_writable(void *va, unsigned int feature)
 {
 	pte_t *pte;
+	unsigned int level;
 	int rc;

 	if (xen_feature(feature))
 		return;

-	pte = virt_to_ptep(va);
+	pte = lookup_address((unsigned long)va, &level);
+	BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));	/* va must have a present 4k pte */
 	rc = HYPERVISOR_update_va_mapping(
 		(unsigned long)va, pte_mkwrite(*pte), 0);
 	BUG_ON(rc);
 }
-
-void make_page_readonly(void *va, unsigned int feature)
-{
-	pte_t *pte;
-	int rc;
-
-	if (xen_feature(feature))
-		return;
-
-	pte = virt_to_ptep(va);
-	rc = HYPERVISOR_update_va_mapping(
-		(unsigned long)va, pte_wrprotect(*pte), 0);
-	if (rc) /* fallback? */
-		xen_l1_entry_update(pte, pte_wrprotect(*pte));
-	if ((unsigned long)va >= (unsigned long)high_memory) {
-		unsigned long pfn = pte_pfn(*pte);
-#ifdef CONFIG_HIGHMEM
-		if (pfn >= highstart_pfn)
-			kmap_flush_unused(); /* flush stale writable kmaps */
-		else
-#endif
-			make_lowmem_page_readonly(
-				phys_to_virt(pfn << PAGE_SHIFT), feature);
-	}
-}
-
-void make_page_writable(void *va, unsigned int feature)
-{
-	pte_t *pte;
-	int rc;
-
-	if (xen_feature(feature))
-		return;
-
-	pte = virt_to_ptep(va);
-	rc = HYPERVISOR_update_va_mapping(
-		(unsigned long)va, pte_mkwrite(*pte), 0);
-	if (rc) /* fallback? */
-		xen_l1_entry_update(pte, pte_mkwrite(*pte));
-	if ((unsigned long)va >= (unsigned long)high_memory) {
-		unsigned long pfn = pte_pfn(*pte);
-#ifdef CONFIG_HIGHMEM
-		if (pfn < highstart_pfn)
-#endif
-			make_lowmem_page_writable(
-				phys_to_virt(pfn << PAGE_SHIFT), feature);
-	}
-}
-
-void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
-{
-	if (xen_feature(feature))
-		return;
-
-	while (nr-- != 0) {
-		make_page_readonly(va, feature);
-		va = (void *)((unsigned long)va + PAGE_SIZE);
-	}
-}
-
-void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
-{
-	if (xen_feature(feature))
-		return;
-
-	while (nr-- != 0) {
-		make_page_writable(va, feature);
-		va = (void *)((unsigned long)va + PAGE_SIZE);
-	}
-}
-
-static void _pin_lock(struct mm_struct *mm, int lock) {
-	if (lock)
-		spin_lock(&mm->page_table_lock);
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
-	/* While mm->page_table_lock protects us against insertions and
-	 * removals of higher level page table pages, it doesn't protect
-	 * against updates of pte-s. Such updates, however, require the
-	 * pte pages to be in consistent state (unpinned+writable or
-	 * pinned+readonly). The pinning and attribute changes, however
-	 * cannot be done atomically, which is why such updates must be
-	 * prevented from happening concurrently.
-	 * Note that no pte lock can ever elsewhere be acquired nesting
-	 * with an already acquired one in the same mm, or with the mm's
-	 * page_table_lock already acquired, as that would break in the
-	 * non-split case (where all these are actually resolving to the
-	 * one page_table_lock). Thus acquiring all of them here is not
-	 * going to result in dead locks, and the order of acquires
-	 * doesn't matter.
-	 */
-	{
-		pgd_t *pgd = mm->pgd;
-		unsigned g;
-
-		for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
-			pud_t *pud;
-			unsigned u;
-
-			if (pgd_none(*pgd))
-				continue;
-			pud = pud_offset(pgd, 0);
-			for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
-				pmd_t *pmd;
-				unsigned m;
-
-				if (pud_none(*pud))
-					continue;
-				pmd = pmd_offset(pud, 0);
-				for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
-					spinlock_t *ptl;
-
-					if (pmd_none(*pmd))
-						continue;
-					ptl = pte_lockptr(0, pmd);
-					if (lock)
-						spin_lock(ptl);
-					else
-						spin_unlock(ptl);
-				}
-			}
-		}
-	}
-#endif
-	if (!lock)
-		spin_unlock(&mm->page_table_lock);
-}
-#define pin_lock(mm) _pin_lock(mm, 1)
-#define pin_unlock(mm) _pin_lock(mm, 0)
-
-#define PIN_BATCH 4
-static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
-
-static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
-                                             unsigned int cpu, unsigned seq)
-{
-	unsigned long pfn = page_to_pfn(page);
-
-	if (PageHighMem(page)) {
-		if (pgprot_val(flags) & _PAGE_RW)
-			ClearPagePinned(page);
-		else
-			SetPagePinned(page);
-	} else {
-		MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
-				(unsigned long)__va(pfn << PAGE_SHIFT),
-				pfn_pte(pfn, flags), 0);
-		if (unlikely(++seq == PIN_BATCH)) {
-			if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
-		                                                PIN_BATCH, NULL)))
-				BUG();
-			seq = 0;
-		}
-	}
-
-	return seq;
-}
-
-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
-{
-	pgd_t *pgd = pgd_base;
-	pud_t *pud;
-	pmd_t *pmd;
-	int    g, u, m;
-	unsigned int cpu, seq;
-
-	if (xen_feature(XENFEAT_auto_translated_physmap))
-		return;
-
-	cpu = get_cpu();
-
-	for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
-		if (pgd_none(*pgd))
-			continue;
-		pud = pud_offset(pgd, 0);
-		if (PTRS_PER_PUD > 1) /* not folded */
-			seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
-		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
-			if (pud_none(*pud))
-				continue;
-			pmd = pmd_offset(pud, 0);
-			if (PTRS_PER_PMD > 1) /* not folded */
-				seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
-			for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
-				if (pmd_none(*pmd))
-					continue;
-				seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
-			}
-		}
-	}
-
-	if (likely(seq != 0)) {
-		MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
-			(unsigned long)pgd_base,
-			pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
-			UVMF_TLB_FLUSH);
-		if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
-		                                        seq + 1, NULL)))
-			BUG();
-	} else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
-			pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
-			UVMF_TLB_FLUSH))
-		BUG();
-
-	put_cpu();
-}
-
-static void __pgd_pin(pgd_t *pgd)
-{
-	pgd_walk(pgd, PAGE_KERNEL_RO);
-	kmap_flush_unused();
-	xen_pgd_pin(__pa(pgd));
-	SetPagePinned(virt_to_page(pgd));
-}
-
-static void __pgd_unpin(pgd_t *pgd)
-{
-	xen_pgd_unpin(__pa(pgd));
-	pgd_walk(pgd, PAGE_KERNEL);
-	ClearPagePinned(virt_to_page(pgd));
-}
-
-static void pgd_test_and_unpin(pgd_t *pgd)
-{
-	if (PagePinned(virt_to_page(pgd)))
-		__pgd_unpin(pgd);
-}
-
-void mm_pin(struct mm_struct *mm)
-{
-	if (xen_feature(XENFEAT_writable_page_tables))
-		return;
-	pin_lock(mm);
-	__pgd_pin(mm->pgd);
-	pin_unlock(mm);
-}
-
-void mm_unpin(struct mm_struct *mm)
-{
-	if (xen_feature(XENFEAT_writable_page_tables))
-		return;
-	pin_lock(mm);
-	__pgd_unpin(mm->pgd);
-	pin_unlock(mm);
-}
-
-void mm_pin_all(void)
-{
-	struct page *page;
-	unsigned long flags;
-
-	if (xen_feature(XENFEAT_writable_page_tables))
-		return;
-
-	/*
-	 * Allow uninterrupted access to the pgd_list. Also protects
-	 * __pgd_pin() by disabling preemption.
-	 * All other CPUs must be at a safe point (e.g., in stop_machine
-	 * or offlined entirely).
-	 */
-	spin_lock_irqsave(&pgd_lock, flags);
-	for (page = pgd_list; page; page = (struct page *)page->index) {
-		if (!PagePinned(page))
-			__pgd_pin((pgd_t *)page_address(page));
-	}
-	spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
-{
-	if (!PagePinned(virt_to_page(mm->pgd)))
-		mm_pin(mm);
-}
-
-void arch_exit_mmap(struct mm_struct *mm)
-{
-	struct task_struct *tsk = current;
-
-	task_lock(tsk);
-
-	/*
-	 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
-	 * *much* faster this way, as no tlb flushes means bigger wrpt batches.
-	 */
-	if (tsk->active_mm == mm) {
-		tsk->active_mm = &init_mm;
-		atomic_inc(&init_mm.mm_count);
-
-		switch_mm(mm, &init_mm, tsk);
-
-		atomic_dec(&mm->mm_count);
-		BUG_ON(atomic_read(&mm->mm_count) == 0);
-	}
-
-	task_unlock(tsk);
-
-	if (PagePinned(virt_to_page(mm->pgd)) &&
-	    (atomic_read(&mm->mm_count) == 1) &&
-	    !mm->context.has_foreign_mappings)
-		mm_unpin(mm);
-}
--- head-2010-04-29.orig/arch/x86/pci/irq-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/pci/irq-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -204,6 +204,7 @@ static int pirq_ali_get(struct pci_dev *
 {
 	static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };

+	WARN_ON_ONCE(pirq >= 16);
 	return irqmap[read_config_nybble(router, 0x48, pirq-1)];
 }

@@ -211,7 +212,8 @@ static int pirq_ali_set(struct pci_dev *
 {
 	static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
 	unsigned int val = irqmap[irq];
-
+
+	WARN_ON_ONCE(pirq >= 16);
 	if (val) {
 		write_config_nybble(router, 0x48, pirq-1, val);
 		return 1;
@@ -261,12 +263,16 @@ static int pirq_via_set(struct pci_dev *
 static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
 {
 	static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
+
+	WARN_ON_ONCE(pirq >= 5);
 	return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
 }

 static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
 {
 	static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
+
+	WARN_ON_ONCE(pirq >= 5);
 	write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
 	return 1;
 }
@@ -279,12 +285,16 @@ static int pirq_via586_set(struct pci_de
 static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
 {
 	static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+
+	WARN_ON_ONCE(pirq >= 4);
 	return read_config_nybble(router,0x43, pirqmap[pirq-1]);
 }

 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
 {
 	static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+
+	WARN_ON_ONCE(pirq >= 4);
 	write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
 	return 1;
 }
@@ -423,6 +433,7 @@ static int pirq_sis_set(struct pci_dev *

 static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
 {
+	WARN_ON_ONCE(pirq >= 9);
 	if (pirq > 8) {
 		printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
 		return 0;
@@ -432,6 +443,7 @@ static int pirq_vlsi_get(struct pci_dev

 static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
 {
+	WARN_ON_ONCE(pirq >= 9);
 	if (pirq > 8) {
 		printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
 		return 0;
@@ -453,14 +465,14 @@ static int pirq_vlsi_set(struct pci_dev
  */
 static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
 {
-	outb_p(pirq, 0xc00);
+	outb(pirq, 0xc00);
 	return inb(0xc01) & 0xf;
 }

 static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
 {
-	outb_p(pirq, 0xc00);
-	outb_p(irq, 0xc01);
+	outb(pirq, 0xc00);
+	outb(irq, 0xc01);
 	return 1;
 }

@@ -575,6 +587,10 @@ static __init int intel_router_probe(str
 		case PCI_DEVICE_ID_INTEL_ICH9_4:
 		case PCI_DEVICE_ID_INTEL_ICH9_5:
 		case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
+		case PCI_DEVICE_ID_INTEL_ICH10_0:
+		case PCI_DEVICE_ID_INTEL_ICH10_1:
+		case PCI_DEVICE_ID_INTEL_ICH10_2:
+		case PCI_DEVICE_ID_INTEL_ICH10_3:
 			r->name = "PIIX/ICH";
 			r->get = pirq_piix_get;
 			r->set = pirq_piix_set;
--- head-2010-04-29.orig/arch/x86/vdso/Makefile	2010-03-24 15:01:37.000000000 +0100
+++ head-2010-04-29/arch/x86/vdso/Makefile	2010-03-24 15:10:37.000000000 +0100
@@ -66,6 +66,7 @@ vdso32.so-$(VDSO32-y)		+= int80
 vdso32.so-$(CONFIG_COMPAT)	+= syscall
 vdso32.so-$(VDSO32-y)		+= sysenter
 xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
+xen-vdso32-$(CONFIG_X86_32)	+= syscall
 vdso32.so-$(CONFIG_XEN)		+= $(xen-vdso32-y)

 vdso32-images			= $(vdso32.so-y:%=vdso32-%.so)
--- head-2010-04-29.orig/arch/x86/vdso/vdso32/syscall.S	2010-04-29 09:29:49.000000000 +0200
+++ head-2010-04-29/arch/x86/vdso/vdso32/syscall.S	2010-03-24 15:10:37.000000000 +0100
@@ -19,8 +19,10 @@ __kernel_vsyscall:
 .Lpush_ebp:
 	movl	%ecx, %ebp
 	syscall
+#ifndef CONFIG_XEN
 	movl	$__USER32_DS, %ecx
 	movl	%ecx, %ss
+#endif
 	movl	%ebp, %ecx
 	popl	%ebp
 .Lpop_ebp:
--- head-2010-04-29.orig/arch/x86/vdso/vdso32.S	2010-04-29 09:29:49.000000000 +0200
+++ head-2010-04-29/arch/x86/vdso/vdso32.S	2010-03-24 15:10:37.000000000 +0100
@@ -19,4 +19,16 @@ vdso32_sysenter_start:
 	.incbin "arch/x86/vdso/vdso32-sysenter.so"
 vdso32_sysenter_end:

+#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
+	.globl vdso32_int80_start, vdso32_int80_end
+vdso32_int80_start:
+	.incbin "arch/x86/vdso/vdso32-int80.so"
+vdso32_int80_end:
+#elif defined(CONFIG_X86_XEN)
+	.globl vdso32_syscall_start, vdso32_syscall_end
+vdso32_syscall_start:
+	.incbin "arch/x86/vdso/vdso32-syscall.so"
+vdso32_syscall_end:
+#endif
+
 __FINIT
--- head-2010-04-29.orig/arch/x86/vdso/vdso32-setup.c	2010-03-24 15:01:37.000000000 +0100
+++ head-2010-04-29/arch/x86/vdso/vdso32-setup.c	2010-03-24 15:10:37.000000000 +0100
@@ -26,10 +26,6 @@
 #include <asm/vdso.h>
 #include <asm/proto.h>

-#ifdef CONFIG_XEN
-#include <xen/interface/callback.h>
-#endif
-
 enum {
 	VDSO_DISABLED = 0,
 	VDSO_ENABLED = 1,
@@ -229,7 +225,6 @@ static inline void map_compat_vdso(int m

 void enable_sep_cpu(void)
 {
-#ifndef CONFIG_XEN
 	int cpu = get_cpu();
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);

@@ -244,35 +239,6 @@ void enable_sep_cpu(void)
 	wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
 	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
 	put_cpu();
-#else
-	extern asmlinkage void ia32pv_sysenter_target(void);
-	static struct callback_register sysenter = {
-		.type = CALLBACKTYPE_sysenter,
-		.address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
-	};
-
-	if (!boot_cpu_has(X86_FEATURE_SEP))
-		return;
-
-	get_cpu();
-
-	if (xen_feature(XENFEAT_supervisor_mode_kernel))
-		sysenter.address.eip = (unsigned long)ia32_sysenter_target;
-
-	switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
-	case 0:
-		break;
-#if CONFIG_XEN_COMPAT < 0x030200
-	case -ENOSYS:
-		sysenter.type = CALLBACKTYPE_sysenter_deprecated;
-		if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
-			break;
-#endif
-	default:
-		clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
-		break;
-	}
-#endif
 }

 static struct vm_area_struct gate_vma;
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2010-04-29/arch/x86/vdso/vdso32-setup-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -0,0 +1,506 @@
+/*
+ * (C) Copyright 2002 Linus Torvalds
+ * Portions based on the vdso-randomization code from exec-shield:
+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
+ *
+ * This file contains the needed initializations to support sysenter.
+ */
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/thread_info.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/module.h>
+
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+#include <asm/pgtable.h>
+#include <asm/unistd.h>
+#include <asm/elf.h>
+#include <asm/tlbflush.h>
+#include <asm/vdso.h>
+#include <asm/proto.h>
+
+#include <xen/interface/callback.h>
+
+enum {
+	VDSO_DISABLED = 0,
+	VDSO_ENABLED = 1,
+	VDSO_COMPAT = 2,
+};
+
+#ifdef CONFIG_COMPAT_VDSO
+#define VDSO_DEFAULT	VDSO_COMPAT
+#else
+#define VDSO_DEFAULT	VDSO_ENABLED
+#endif
+
+#ifdef CONFIG_X86_64
+#define vdso_enabled			sysctl_vsyscall32
+#define arch_setup_additional_pages	syscall32_setup_pages
+#endif
+
+/*
+ * This is the difference between the prelinked addresses in the vDSO images
+ * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
+ * in the user address space.
+ */
+#define VDSO_ADDR_ADJUST	(VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
+
+/*
+ * Should the kernel map a VDSO page into processes and pass its
+ * address down to glibc upon exec()?
+ */
+unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
+
+static int __init vdso_setup(char *s)
+{
+	vdso_enabled = simple_strtoul(s, NULL, 0);
+
+	return 1;
+}
+
+/*
+ * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
+ * behavior on both 64-bit and 32-bit kernels.
+ * On 32-bit kernels, vdso=[012] means the same thing.
+ */
+__setup("vdso32=", vdso_setup);
+
+#ifdef CONFIG_X86_32
+__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
+
+EXPORT_SYMBOL_GPL(vdso_enabled);
+#endif
+
+static __init void reloc_symtab(Elf32_Ehdr *ehdr,
+				unsigned offset, unsigned size)
+{
+	Elf32_Sym *sym = (void *)ehdr + offset;
+	unsigned nsym = size / sizeof(*sym);
+	unsigned i;
+
+	for(i = 0; i < nsym; i++, sym++) {
+		if (sym->st_shndx == SHN_UNDEF ||
+		    sym->st_shndx == SHN_ABS)
+			continue;  /* skip */
+
+		if (sym->st_shndx > SHN_LORESERVE) {
+			printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
+			       sym->st_shndx);
+			continue;
+		}
+
+		switch(ELF_ST_TYPE(sym->st_info)) {
+		case STT_OBJECT:
+		case STT_FUNC:
+		case STT_SECTION:
+		case STT_FILE:
+			sym->st_value += VDSO_ADDR_ADJUST;
+		}
+	}
+}
+
+static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
+{
+	Elf32_Dyn *dyn = (void *)ehdr + offset;
+
+	for(; dyn->d_tag != DT_NULL; dyn++)
+		switch(dyn->d_tag) {
+		case DT_PLTGOT:
+		case DT_HASH:
+		case DT_STRTAB:
+		case DT_SYMTAB:
+		case DT_RELA:
+		case DT_INIT:
+		case DT_FINI:
+		case DT_REL:
+		case DT_DEBUG:
+		case DT_JMPREL:
+		case DT_VERSYM:
+		case DT_VERDEF:
+		case DT_VERNEED:
+		case DT_ADDRRNGLO ... DT_ADDRRNGHI:
+			/* definitely pointers needing relocation */
+			dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
+			break;
+
+		case DT_ENCODING ... OLD_DT_LOOS-1:
+		case DT_LOOS ... DT_HIOS-1:
+			/* Tags above DT_ENCODING are pointers if
+			   they're even */
+			if (dyn->d_tag >= DT_ENCODING &&
+			    (dyn->d_tag & 1) == 0)
+				dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
+			break;
+
+		case DT_VERDEFNUM:
+		case DT_VERNEEDNUM:
+		case DT_FLAGS_1:
+		case DT_RELACOUNT:
+		case DT_RELCOUNT:
+		case DT_VALRNGLO ... DT_VALRNGHI:
+			/* definitely not pointers */
+			break;
+
+		case OLD_DT_LOOS ... DT_LOOS-1:
+		case DT_HIOS ... DT_VALRNGLO-1:
+		default:
+			if (dyn->d_tag > DT_ENCODING)
+				printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
+				       dyn->d_tag);
+			break;
+		}
+}
+
+static __init void relocate_vdso(Elf32_Ehdr *ehdr)
+{
+	Elf32_Phdr *phdr;
+	Elf32_Shdr *shdr;
+	int i;
+
+	BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
+	       !elf_check_arch_ia32(ehdr) ||
+	       ehdr->e_type != ET_DYN);
+
+	ehdr->e_entry += VDSO_ADDR_ADJUST;
+
+	/* rebase phdrs */
+	phdr = (void *)ehdr + ehdr->e_phoff;
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
+
+		/* relocate dynamic stuff */
+		if (phdr[i].p_type == PT_DYNAMIC)
+			reloc_dyn(ehdr, phdr[i].p_offset);
+	}
+
+	/* rebase sections */
+	shdr = (void *)ehdr + ehdr->e_shoff;
+	for(i = 0; i < ehdr->e_shnum; i++) {
+		if (!(shdr[i].sh_flags & SHF_ALLOC))
+			continue;
+
+		shdr[i].sh_addr += VDSO_ADDR_ADJUST;
+
+		if (shdr[i].sh_type == SHT_SYMTAB ||
+		    shdr[i].sh_type == SHT_DYNSYM)
+			reloc_symtab(ehdr, shdr[i].sh_offset,
+				     shdr[i].sh_size);
+	}
+}
+
+/*
+ * These symbols are defined by vdso32.S to mark the bounds
+ * of the ELF DSO images included therein.
+ */
+extern const char vdso32_default_start, vdso32_default_end;
+extern const char vdso32_sysenter_start, vdso32_sysenter_end;
+static struct page *vdso32_pages[1];
+
+#ifdef CONFIG_X86_64
+
+#if CONFIG_XEN_COMPAT < 0x030200
+static int use_int80 = 1;
+#endif
+static int use_sysenter __read_mostly = -1;
+
+#define	vdso32_sysenter()	(use_sysenter > 0)
+
+/* May not be __init: called during resume */
+void syscall32_cpu_init(void)
+{
+	static const struct callback_register cstar = {
+		.type = CALLBACKTYPE_syscall32,
+		.address = (unsigned long)ia32_cstar_target
+	};
+	static const struct callback_register sysenter = {
+		.type = CALLBACKTYPE_sysenter,
+		.address = (unsigned long)ia32_sysenter_target
+	};
+
+	if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
+	    (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
+#if CONFIG_XEN_COMPAT < 0x030200
+		return;
+	use_int80 = 0;
+#else
+		BUG();
+#endif
+
+	if (use_sysenter < 0)
+		use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
+}
+
+#define compat_uses_vma		1
+
+static inline void map_compat_vdso(int map)
+{
+}
+
+#else  /* CONFIG_X86_32 */
+
+#define vdso32_sysenter()	(boot_cpu_has(X86_FEATURE_SEP))
+
+extern asmlinkage void ia32pv_cstar_target(void);
+static const struct callback_register __cpuinitconst cstar = {
+	.type = CALLBACKTYPE_syscall32,
+	.address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target },
+};
+
+void __cpuinit enable_sep_cpu(void)
+{
+	extern asmlinkage void ia32pv_sysenter_target(void);
+	static struct callback_register __cpuinitdata sysenter = {
+		.type = CALLBACKTYPE_sysenter,
+		.address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
+	};
+
+	if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
+		if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
+			BUG();
+		return;
+	}
+
+	if (!boot_cpu_has(X86_FEATURE_SEP))
+		return;
+
+	if (xen_feature(XENFEAT_supervisor_mode_kernel))
+		sysenter.address.eip = (unsigned long)ia32_sysenter_target;
+
+	switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
+	case 0:
+		break;
+#if CONFIG_XEN_COMPAT < 0x030200
+	case -ENOSYS:
+		sysenter.type = CALLBACKTYPE_sysenter_deprecated;
+		if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
+			break;
+#endif
+	default:
+		setup_clear_cpu_cap(X86_FEATURE_SEP);
+		break;
+	}
+}
+
+static struct vm_area_struct gate_vma;
+
+static int __init gate_vma_init(void)
+{
+	gate_vma.vm_mm = NULL;
+	gate_vma.vm_start = FIXADDR_USER_START;
+	gate_vma.vm_end = FIXADDR_USER_END;
+	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+	gate_vma.vm_page_prot = __P101;
+	/*
+	 * Make sure the vDSO gets into every core dump.
+	 * Dumping its contents makes post-mortem fully interpretable later
+	 * without matching up the same kernel and hardware config to see
+	 * what PC values meant.
+	 */
+	gate_vma.vm_flags |= VM_ALWAYSDUMP;
+	return 0;
+}
+
+#define compat_uses_vma		0
+
+static void map_compat_vdso(int map)
+{
+	static int vdso_mapped;
+
+	if (map == vdso_mapped)
+		return;
+
+	vdso_mapped = map;
+
+	__set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
+		     map ? PAGE_READONLY_EXEC : PAGE_NONE);
+
+	/* flush stray tlbs */
+	flush_tlb_all();
+}
+
+#endif	/* CONFIG_X86_64 */
+
+int __init sysenter_setup(void)
+{
+	void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
+	const void *vsyscall;
+	size_t vsyscall_len;
+
+	vdso32_pages[0] = virt_to_page(syscall_page);
+
+#ifdef CONFIG_X86_32
+	gate_vma_init();
+
+	printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
+#endif
+
+#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
+	if (use_int80) {
+		extern const char vdso32_int80_start, vdso32_int80_end;
+
+		vsyscall = &vdso32_int80_start;
+		vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
+	} else
+#elif defined(CONFIG_X86_32)
+	if (boot_cpu_has(X86_FEATURE_SYSCALL)
+	    && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
+		|| HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
+		setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
+	barrier(); /* until clear_bit()'s constraints are correct ... */
+	if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
+		extern const char vdso32_syscall_start, vdso32_syscall_end;
+
+		vsyscall = &vdso32_syscall_start;
+		vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
+	} else
+#endif
+	if (!vdso32_sysenter()) {
+		vsyscall = &vdso32_default_start;
+		vsyscall_len = &vdso32_default_end - &vdso32_default_start;
+	} else {
+		vsyscall = &vdso32_sysenter_start;
+		vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
+	}
+
+	memcpy(syscall_page, vsyscall, vsyscall_len);
+	relocate_vdso(syscall_page);
+
+	return 0;
+}
+
+/* Setup a VMA at program startup for the vsyscall page */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr;
+	int ret = 0;
+	bool compat;
+
+	down_write(&mm->mmap_sem);
+
+	/* Test compat mode once here, in case someone
+	   changes it via sysctl */
+	compat = (vdso_enabled == VDSO_COMPAT);
+
+	map_compat_vdso(compat);
+
+	if (compat)
+		addr = VDSO_HIGH_BASE;
+	else {
+		addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+		if (IS_ERR_VALUE(addr)) {
+			ret = addr;
+			goto up_fail;
+		}
+	}
+
+	if (compat_uses_vma || !compat) {
+		/*
+		 * MAYWRITE to allow gdb to COW and set breakpoints
+		 *
+		 * Make sure the vDSO gets into every core dump.
+		 * Dumping its contents makes post-mortem fully
+		 * interpretable later without matching up the same
+		 * kernel and hardware config to see what PC values
+		 * meant.
+		 */
+		ret = install_special_mapping(mm, addr, PAGE_SIZE,
+					      VM_READ|VM_EXEC|
+					      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+					      VM_ALWAYSDUMP,
+					      vdso32_pages);
+
+		if (ret)
+			goto up_fail;
+	}
+
+	current->mm->context.vdso = (void *)addr;
+	current_thread_info()->sysenter_return =
+		VDSO32_SYMBOL(addr, SYSENTER_RETURN);
+
+  up_fail:
+	up_write(&mm->mmap_sem);
+
+	return ret;
+}
+
+#ifdef CONFIG_X86_64
+
+/*
+ * This must be done early in case we have an initrd containing 32-bit
+ * binaries (e.g., hotplug). This could be pushed upstream.
+ */
+core_initcall(sysenter_setup);
+
+#ifdef CONFIG_SYSCTL
+/* Register vsyscall32 into the ABI table */
+#include <linux/sysctl.h>
+
+static ctl_table abi_table2[] = {
+	{
+		.procname	= "vsyscall32",
+		.data		= &sysctl_vsyscall32,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{}
+};
+
+static ctl_table abi_root_table2[] = {
+	{
+		.ctl_name = CTL_ABI,
+		.procname = "abi",
+		.mode = 0555,
+		.child = abi_table2
+	},
+	{}
+};
+
+static __init int ia32_binfmt_init(void)
+{
+	register_sysctl_table(abi_root_table2);
+	return 0;
+}
+__initcall(ia32_binfmt_init);
+#endif
+
+#else  /* CONFIG_X86_32 */
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+		return "[vdso]";
+	return NULL;
+}
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+	struct mm_struct *mm = tsk->mm;
+
+	/* Check to see if this task was created in compat vdso mode */
+	if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
+		return &gate_vma;
+	return NULL;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+	const struct vm_area_struct *vma = get_gate_vma(task);
+
+	return vma && addr >= vma->vm_start && addr < vma->vm_end;
+}
+
+int in_gate_area_no_task(unsigned long addr)
+{
+	return 0;
+}
+
+#endif	/* CONFIG_X86_64 */
--- head-2010-04-29.orig/drivers/pci/msi-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/drivers/pci/msi-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -45,6 +45,53 @@ struct msi_pirq_entry {
 	int entry_nr;
 };

+/* Arch hooks */
+
+int __attribute__ ((weak))
+arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
+{
+	return 0;
+}
+
+#ifndef CONFIG_XEN
+int __attribute__ ((weak))
+arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
+{
+	return 0;
+}
+
+int __attribute__ ((weak))
+arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	struct msi_desc *entry;
+	int ret;
+
+	list_for_each_entry(entry, &dev->msi_list, list) {
+		ret = arch_setup_msi_irq(dev, entry);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
+{
+	return;
+}
+
+void __attribute__ ((weak))
+arch_teardown_msi_irqs(struct pci_dev *dev)
+{
+	struct msi_desc *entry;
+
+	list_for_each_entry(entry, &dev->msi_list, list) {
+		if (entry->irq != 0)
+			arch_teardown_msi_irq(entry->irq);
+	}
+}
+#endif
+
 static void msi_set_enable(struct pci_dev *dev, int enable)
 {
 	int pos;
@@ -266,7 +313,6 @@ static void pci_intx_for_msi(struct pci_
 		pci_intx(dev, enable);
 }

-#ifdef CONFIG_PM
 void pci_restore_msi_state(struct pci_dev *dev)
 {
 	int rc;
@@ -286,7 +332,7 @@ void pci_restore_msi_state(struct pci_de
 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &restore);
 	WARN(rc && rc != -ENOSYS, "restore_msi -> %d\n", rc);
 }
-#endif	/* CONFIG_PM */
+EXPORT_SYMBOL_GPL(pci_restore_msi_state);

 /**
  * msi_capability_init - configure device's MSI capability structure
@@ -707,51 +753,3 @@ void pci_msi_init_pci_dev(struct pci_dev
 	INIT_LIST_HEAD(&dev->msi_list);
 #endif
 }
-
-
-/* Arch hooks */
-
-int __attribute__ ((weak))
-arch_msi_check_device(struct pci_dev* dev, int nvec, int type)
-{
-	return 0;
-}
-
-#ifndef CONFIG_XEN
-int __attribute__ ((weak))
-arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry)
-{
-	return 0;
-}
-
-int __attribute__ ((weak))
-arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
-	struct msi_desc *entry;
-	int ret;
-
-	list_for_each_entry(entry, &dev->msi_list, list) {
-		ret = arch_setup_msi_irq(dev, entry);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq)
-{
-	return;
-}
-
-void __attribute__ ((weak))
-arch_teardown_msi_irqs(struct pci_dev *dev)
-{
-	struct msi_desc *entry;
-
-	list_for_each_entry(entry, &dev->msi_list, list) {
-		if (entry->irq != 0)
-			arch_teardown_msi_irq(entry->irq);
-	}
-}
-#endif
--- head-2010-04-29.orig/drivers/pci/pci.c	2010-04-29 09:30:41.000000000 +0200
+++ head-2010-04-29/drivers/pci/pci.c	2010-04-15 09:56:06.000000000 +0200
@@ -458,7 +458,12 @@ pci_find_parent_resource(const struct pc
  * Restore the BAR values for a given device, so as to make it
  * accessible by its driver.
  */
+#ifndef CONFIG_XEN
 static void
+#else
+EXPORT_SYMBOL_GPL(pci_restore_bars);
+void
+#endif
 pci_restore_bars(struct pci_dev *dev)
 {
 	int i;
--- head-2010-04-29.orig/drivers/xen/balloon/sysfs.c	2010-03-24 15:09:08.000000000 +0100
+++ head-2010-04-29/drivers/xen/balloon/sysfs.c	2010-03-24 15:10:37.000000000 +0100
@@ -104,7 +104,7 @@ static struct attribute_group balloon_in
 };

 static struct sysdev_class balloon_sysdev_class = {
-	set_kset_name(BALLOON_CLASS_NAME),
+	.name = BALLOON_CLASS_NAME,
 };

 static struct sys_device balloon_sysdev;
--- head-2010-04-29.orig/drivers/xen/blkback/blkback.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/drivers/xen/blkback/blkback.c	2010-03-24 15:10:37.000000000 +0100
@@ -150,7 +150,7 @@ static void unplug_queue(blkif_t *blkif)
 		return;
 	if (blkif->plug->unplug_fn)
 		blkif->plug->unplug_fn(blkif->plug);
-	blk_put_queue(blkif->plug);
+	kobject_put(&blkif->plug->kobj);
 	blkif->plug = NULL;
 }

@@ -161,7 +161,8 @@ static void plug_queue(blkif_t *blkif, s
 	if (q == blkif->plug)
 		return;
 	unplug_queue(blkif);
-	blk_get_queue(q);
+	WARN_ON(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags));
+	kobject_get(&q->kobj);
 	blkif->plug = q;
 }

--- head-2010-04-29.orig/drivers/xen/blkfront/blkfront.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/drivers/xen/blkfront/blkfront.c	2010-03-24 15:10:37.000000000 +0100
@@ -739,7 +739,6 @@ static irqreturn_t blkif_int(int irq, vo
 	RING_IDX i, rp;
 	unsigned long flags;
 	struct blkfront_info *info = (struct blkfront_info *)dev_id;
-	int uptodate;

 	spin_lock_irqsave(&blkif_io_lock, flags);

@@ -764,13 +763,13 @@ static irqreturn_t blkif_int(int irq, vo

 		ADD_ID_TO_FREELIST(info, id);

-		uptodate = (bret->status == BLKIF_RSP_OKAY);
+		ret = bret->status == BLKIF_RSP_OKAY ? 0 : -EIO;
 		switch (bret->operation) {
 		case BLKIF_OP_WRITE_BARRIER:
 			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
 				printk("blkfront: %s: write barrier op failed\n",
 				       info->gd->disk_name);
-				uptodate = -EOPNOTSUPP;
+				ret = -EOPNOTSUPP;
 				info->feature_barrier = 0;
 			        xlvbd_barrier(info);
 			}
@@ -781,10 +780,8 @@ static irqreturn_t blkif_int(int irq, vo
 				DPRINTK("Bad return from blkdev data "
 					"request: %x\n", bret->status);

-			ret = end_that_request_first(req, uptodate,
-				req->hard_nr_sectors);
+			ret = __blk_end_request(req, ret, blk_rq_bytes(req));
 			BUG_ON(ret);
-			end_that_request_last(req, uptodate);
 			break;
 		default:
 			BUG();
--- head-2010-04-29.orig/drivers/xen/blktap/blktap.c	2010-03-24 15:08:58.000000000 +0100
+++ head-2010-04-29/drivers/xen/blktap/blktap.c	2010-03-24 15:10:37.000000000 +0100
@@ -336,8 +336,8 @@ static pte_t blktap_clear_pte(struct vm_
 		uvstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
 	}
 	if (vma->vm_file == NULL || uvaddr < uvstart)
-		return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
-					       ptep, is_fullmm);
+		return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
+						   is_fullmm);

 	/* TODO Should these be changed to if statements? */
 	BUG_ON(!info);
@@ -380,8 +380,8 @@ static pte_t blktap_clear_pte(struct vm_
 		BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));

 		/* USING SHADOW PAGE TABLES. */
-		copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
-					       is_fullmm);
+		copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
+						   is_fullmm);
 	}

 	if (count) {
--- head-2010-04-29.orig/drivers/xen/blktap2/device.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/drivers/xen/blktap2/device.c	2010-03-24 15:10:37.000000000 +0100
@@ -163,9 +163,9 @@ blktap_map_uaddr_fn(pte_t *ptep, struct
 }

 static int
-blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte)
+blktap_map_uaddr(struct vm_area_struct *vma, unsigned long address, pte_t pte)
 {
-	return apply_to_page_range(mm, address,
+	return apply_to_page_range(vma ? vma->vm_mm : NULL, address,
 				   PAGE_SIZE, blktap_map_uaddr_fn, &pte);
 }

@@ -173,18 +173,29 @@ static int
 blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page,
 		     unsigned long addr, void *data)
 {
-	struct mm_struct *mm = (struct mm_struct *)data;
+	struct vm_area_struct *vma = data;

 	BTDBG("ptep %p\n", ptep);
-	pte_clear(mm, addr, ptep);
+	xen_ptep_get_and_clear_full(vma, addr, ptep, 1);
 	return 0;
 }

 static int
-blktap_umap_uaddr(struct mm_struct *mm, unsigned long address)
+blktap_umap_uaddr(struct vm_area_struct *vma, unsigned long address)
 {
+	struct mm_struct *mm = NULL;
+
+	if (!vma) {
+#ifdef CONFIG_X86
+		if (HYPERVISOR_update_va_mapping(address, __pte(0),
+						 UVMF_INVLPG|UVMF_ALL))
+			BUG();
+		return 1;
+#endif
+	} else
+		mm = vma->vm_mm;
 	return apply_to_page_range(mm, address,
-				   PAGE_SIZE, blktap_umap_uaddr_fn, mm);
+				   PAGE_SIZE, blktap_umap_uaddr_fn, vma);
 }

 static inline void
@@ -198,17 +209,10 @@ flush_tlb_kernel_page(unsigned long kvad
 }

 static void
-blktap_device_end_dequeued_request(struct blktap_device *dev,
-				   struct request *req, int uptodate)
+blktap_device_end_dequeued_request(struct request *req, int ret)
 {
-	int ret;
-
-	ret = end_that_request_first(req, uptodate, req->hard_nr_sectors);
-	BUG_ON(ret);
-
-	spin_lock_irq(&dev->lock);
-	end_that_request_last(req, uptodate);
-	spin_unlock_irq(&dev->lock);
+	if (blk_end_request(req, ret, blk_rq_bytes(req)))
+		BUG();
 }

 /*
@@ -336,8 +340,8 @@ blktap_unmap(struct blktap *tap, struct

 		if (!xen_feature(XENFEAT_auto_translated_physmap) &&
 		    request->handles[i].kernel == INVALID_GRANT_HANDLE) {
-			blktap_umap_uaddr(&init_mm, kvaddr);
-			flush_tlb_kernel_page(kvaddr);
+			if (blktap_umap_uaddr(NULL, kvaddr) == 0)
+				flush_tlb_kernel_page(kvaddr);
 			set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
 					    INVALID_P2M_ENTRY);
 		}
@@ -377,7 +381,7 @@ blktap_device_fail_pending_requests(stru

 		blktap_unmap(tap, request);
 		req = (struct request *)(unsigned long)request->id;
-		blktap_device_end_dequeued_request(dev, req, 0);
+		blktap_device_end_dequeued_request(req, -ENODEV);
 		blktap_request_free(tap, request);
 	}

@@ -400,16 +404,11 @@ blktap_device_finish_request(struct blkt
 			     blkif_response_t *res,
 			     struct blktap_request *request)
 {
-	int uptodate;
 	struct request *req;
-	struct blktap_device *dev;
-
-	dev = &tap->device;

 	blktap_unmap(tap, request);

 	req = (struct request *)(unsigned long)request->id;
-	uptodate = (res->status == BLKIF_RSP_OKAY);

 	BTDBG("req %p res status %d operation %d/%d id %lld\n", req,
 	      res->status, res->operation, request->operation,
@@ -421,7 +420,8 @@ blktap_device_finish_request(struct blkt
 		if (unlikely(res->status != BLKIF_RSP_OKAY))
 			BTERR("Bad return from device data "
 				"request: %x\n", res->status);
-		blktap_device_end_dequeued_request(dev, req, uptodate);
+		blktap_device_end_dequeued_request(req,
+			res->status == BLKIF_RSP_OKAY ? 0 : -EIO);
 		break;
 	default:
 		BUG();
@@ -571,9 +571,9 @@ blktap_map(struct blktap *tap,

 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
 		pte = mk_pte(page, ring->vma->vm_page_prot);
-		blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte));
+		blktap_map_uaddr(ring->vma, uvaddr, pte_mkwrite(pte));
 		flush_tlb_page(ring->vma, uvaddr);
-		blktap_map_uaddr(&init_mm, kvaddr, mk_pte(page, PAGE_KERNEL));
+		blktap_map_uaddr(NULL, kvaddr, mk_pte(page, PAGE_KERNEL));
 		flush_tlb_kernel_page(kvaddr);

 		set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
@@ -896,7 +896,7 @@ blktap_device_run_queue(struct blktap *t
 		if (!err)
 			queued++;
 		else {
-			blktap_device_end_dequeued_request(dev, req, 0);
+			blktap_device_end_dequeued_request(req, err);
 			blktap_request_free(tap, request);
 		}

--- head-2010-04-29.orig/drivers/xen/blktap2/ring.c	2010-04-29 09:34:47.000000000 +0200
+++ head-2010-04-29/drivers/xen/blktap2/ring.c	2010-03-24 15:10:37.000000000 +0100
@@ -103,8 +103,8 @@ blktap_ring_clear_pte(struct vm_area_str
 	 * mapped region.
 	 */
 	if (uvaddr < ring->user_vstart)
-		return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
-					       ptep, is_fullmm);
+		return xen_ptep_get_and_clear_full(vma, uvaddr,
+						   ptep, is_fullmm);

 	offset  = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
 	usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
@@ -146,8 +146,8 @@ blktap_ring_clear_pte(struct vm_area_str
 				    khandle->user);
 		count++;
 	} else
-		copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
-					       is_fullmm);
+		copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
+						   is_fullmm);

 	if (count)
 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
--- head-2010-04-29.orig/drivers/xen/core/Makefile	2008-07-21 11:00:33.000000000 +0200
+++ head-2010-04-29/drivers/xen/core/Makefile	2010-03-24 15:10:37.000000000 +0100
@@ -10,5 +10,6 @@ obj-$(CONFIG_SYS_HYPERVISOR)	+= hypervis
 obj-$(CONFIG_HOTPLUG_CPU)	+= cpu_hotplug.o
 obj-$(CONFIG_XEN_SYSFS)		+= xen_sysfs.o
 obj-$(CONFIG_XEN_SMPBOOT)	+= smpboot.o
+obj-$(CONFIG_X86_SMP)		+= spinlock.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o
 obj-$(CONFIG_XEN_XENCOMM)	+= xencomm.o
--- head-2010-04-29.orig/drivers/xen/core/evtchn.c	2010-03-24 15:09:08.000000000 +0100
+++ head-2010-04-29/drivers/xen/core/evtchn.c	2010-04-23 15:15:37.000000000 +0200
@@ -194,7 +194,7 @@ static inline unsigned int cpu_from_evtc

 /* Upcall to generic IRQ layer. */
 #ifdef CONFIG_X86
-extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
+extern unsigned int do_IRQ(struct pt_regs *regs);
 void __init xen_init_IRQ(void);
 void __init init_IRQ(void)
 {
@@ -203,13 +203,11 @@ void __init init_IRQ(void)
 }
 #if defined (__i386__)
 static inline void exit_idle(void) {}
-#define IRQ_REG orig_eax
 #elif defined (__x86_64__)
 #include <asm/idle.h>
-#define IRQ_REG orig_rax
 #endif
 #define do_IRQ(irq, regs) do {		\
-	(regs)->IRQ_REG = ~(irq);	\
+	(regs)->orig_ax = ~(irq);	\
 	do_IRQ((regs));			\
 } while (0)
 #endif
@@ -676,13 +674,12 @@ static void set_affinity_irq(unsigned in
 int resend_irq_on_evtchn(unsigned int irq)
 {
 	int masked, evtchn = evtchn_from_irq(irq);
-	shared_info_t *s = HYPERVISOR_shared_info;

 	if (!VALID_EVTCHN(evtchn))
 		return 1;

 	masked = test_and_set_evtchn_mask(evtchn);
-	synch_set_bit(evtchn, s->evtchn_pending);
+	set_evtchn(evtchn);
 	if (!masked)
 		unmask_evtchn(evtchn);

@@ -971,6 +968,43 @@ void disable_all_local_evtchn(void)
 			synch_set_bit(i, &s->evtchn_mask[0]);
 }

+/* Clear an irq's pending state, in preparation for polling on it. */
+void xen_clear_irq_pending(int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	if (VALID_EVTCHN(evtchn))
+		clear_evtchn(evtchn);
+}
+
+/* Set an irq's pending state, to avoid blocking on it. */
+void xen_set_irq_pending(int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	if (VALID_EVTCHN(evtchn))
+		set_evtchn(evtchn);
+}
+
+/* Test an irq's pending state. */
+int xen_test_irq_pending(int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	return VALID_EVTCHN(evtchn) && test_evtchn(evtchn);
+}
+
+/* Poll waiting for an irq to become pending.  In the usual case, the
+   irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq)
+{
+	evtchn_port_t evtchn = evtchn_from_irq(irq);
+
+	if (VALID_EVTCHN(evtchn)
+	    && HYPERVISOR_poll_no_timeout(&evtchn, 1))
+		BUG();
+}
+
 static void restore_cpu_virqs(unsigned int cpu)
 {
 	struct evtchn_bind_virq bind_virq;
@@ -1024,8 +1058,8 @@ static void restore_cpu_ipis(unsigned in
 		bind_evtchn_to_cpu(evtchn, cpu);

 		/* Ready for use. */
-		unmask_evtchn(evtchn);
-
+		if (!(irq_desc[irq].status & IRQ_DISABLED))
+			unmask_evtchn(evtchn);
 	}
 }

--- head-2010-04-29.orig/drivers/xen/core/hypervisor_sysfs.c	2010-03-24 15:09:15.000000000 +0100
+++ head-2010-04-29/drivers/xen/core/hypervisor_sysfs.c	2010-03-24 15:10:37.000000000 +0100
@@ -50,7 +50,7 @@ static int __init hypervisor_subsys_init
 	if (!is_running_on_xen())
 		return -ENODEV;

-	hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type;
+	hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
 	return 0;
 }

--- head-2010-04-29.orig/drivers/xen/core/smpboot.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/drivers/xen/core/smpboot.c	2010-03-24 15:10:37.000000000 +0100
@@ -72,6 +72,10 @@ void __init prefill_possible_map(void)
 		return;

 	for (i = 0; i < NR_CPUS; i++) {
+#ifndef CONFIG_HOTPLUG_CPU
+		if (i >= setup_max_cpus)
+			break;
+#endif
 		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
 		if (rc >= 0)
 			cpu_set(i, cpu_possible_map);
@@ -134,6 +138,10 @@ static int __cpuinit xen_smp_intr_init(u
 		goto fail;
 	per_cpu(callfunc_irq, cpu) = rc;

+	rc = xen_spinlock_init(cpu);
+	if (rc < 0)
+		goto fail;
+
 	if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
 		goto fail;

@@ -144,6 +152,7 @@ static int __cpuinit xen_smp_intr_init(u
 		unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
 	if (per_cpu(callfunc_irq, cpu) >= 0)
 		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+	xen_spinlock_cleanup(cpu);
 	return rc;
 }

@@ -155,6 +164,7 @@ static void xen_smp_intr_exit(unsigned i

 	unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
 	unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+	xen_spinlock_cleanup(cpu);
 }
 #endif

@@ -207,36 +217,25 @@ static void __cpuinit cpu_initialize_con
 	smp_trap_init(ctxt.trap_ctxt);

 	ctxt.ldt_ents = 0;
-	ctxt.gdt_ents = GDT_SIZE / 8;
-
-#ifdef __i386__
 	ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
+	ctxt.gdt_ents = GDT_SIZE / 8;

 	ctxt.user_regs.cs = __KERNEL_CS;
-	ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
+	ctxt.user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);

 	ctxt.kernel_ss = __KERNEL_DS;
-	ctxt.kernel_sp = idle->thread.esp0;
+	ctxt.kernel_sp = idle->thread.sp0;

-	ctxt.event_callback_cs     = __KERNEL_CS;
 	ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
-	ctxt.failsafe_callback_cs  = __KERNEL_CS;
 	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
+#ifdef __i386__
+	ctxt.event_callback_cs     = __KERNEL_CS;
+	ctxt.failsafe_callback_cs  = __KERNEL_CS;

 	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));

 	ctxt.user_regs.fs = __KERNEL_PERCPU;
 #else /* __x86_64__ */
-	ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address);
-
-	ctxt.user_regs.cs = __KERNEL_CS;
-	ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
-
-	ctxt.kernel_ss = __KERNEL_DS;
-	ctxt.kernel_sp = idle->thread.rsp0;
-
-	ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
-	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
 	ctxt.syscall_callback_eip  = (unsigned long)system_call;

 	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2010-04-29/drivers/xen/core/spinlock.c	2010-03-24 15:10:37.000000000 +0100
@@ -0,0 +1,246 @@
+/*
+ *	Xen spinlock functions
+ *
+ *	See arch/x86/xen/smp.c for copyright and credits for derived
+ *	portions of this file.
+ */
+#define XEN_SPINLOCK_SOURCE
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <xen/evtchn.h>
+
+extern irqreturn_t smp_reschedule_interrupt(int, void *);
+
+static DEFINE_PER_CPU(int, spinlock_irq) = -1;
+static char spinlock_name[NR_CPUS][15];
+
+struct spinning {
+	raw_spinlock_t *lock;
+	unsigned int ticket;
+	struct spinning *prev;
+};
+static DEFINE_PER_CPU(struct spinning *, spinning);
+/*
+ * Protect removal of objects: Addition can be done lockless, and even
+ * removal itself doesn't need protection - what needs to be prevented is
+ * removed objects going out of scope (as they're allocated on the stack).
+ */
+static DEFINE_PER_CPU(raw_rwlock_t, spinning_rm_lock) = __RAW_RW_LOCK_UNLOCKED;
+
+/* Bind @cpu's spinlock kicker IPI and record its irq; returns 0 or < 0. */
+int __cpuinit xen_spinlock_init(unsigned int cpu)
+{
+	int rc;
+
+	/* Bounded write: the per-CPU name slot is only 15 bytes long. */
+	snprintf(spinlock_name[cpu], sizeof(spinlock_name[cpu]),
+		 "spinlock%u", cpu);
+	rc = bind_ipi_to_irqhandler(SPIN_UNLOCK_VECTOR, cpu,
+				    smp_reschedule_interrupt,
+				    IRQF_DISABLED|IRQF_NOBALANCING,
+				    spinlock_name[cpu], NULL);
+	if (rc < 0)
+		return rc;
+
+	disable_irq(rc); /* make sure it's never delivered */
+	per_cpu(spinlock_irq, cpu) = rc;
+	return 0;
+}
+
+void __cpuinit xen_spinlock_cleanup(unsigned int cpu)
+{
+	if (per_cpu(spinlock_irq, cpu) >= 0)
+		unbind_from_irqhandler(per_cpu(spinlock_irq, cpu), NULL);
+	per_cpu(spinlock_irq, cpu) = -1;
+}
+
+static unsigned int spin_adjust(struct spinning *spinning,
+				const raw_spinlock_t *lock,
+				unsigned int token)
+{
+	for (; spinning; spinning = spinning->prev)
+		if (spinning->lock == lock) {
+			unsigned int ticket = spinning->ticket;
+
+			if (unlikely(!(ticket + 1)))
+				break;
+			spinning->ticket = token >> TICKET_SHIFT;
+			token = (token & ((1 << TICKET_SHIFT) - 1))
+				| (ticket << TICKET_SHIFT);
+			break;
+		}
+
+	return token;
+}
+
+unsigned int xen_spin_adjust(const raw_spinlock_t *lock, unsigned int token)
+{
+	return spin_adjust(__get_cpu_var(spinning), lock, token);
+}
+
+bool xen_spin_wait(raw_spinlock_t *lock, unsigned int *ptok,
+                   unsigned int flags)
+{
+	int irq = __get_cpu_var(spinlock_irq);
+	bool rc;
+	typeof(vcpu_info(0)->evtchn_upcall_mask) upcall_mask;
+	raw_rwlock_t *rm_lock;
+	struct spinning spinning, *other;
+
+	/* If kicker interrupt not initialized yet, just spin. */
+	if (unlikely(irq < 0) || unlikely(!cpu_online(raw_smp_processor_id())))
+		return false;
+
+	/* announce we're spinning */
+	spinning.ticket = *ptok >> TICKET_SHIFT;
+	spinning.lock = lock;
+	spinning.prev = __get_cpu_var(spinning);
+	smp_wmb();
+	__get_cpu_var(spinning) = &spinning;
+	upcall_mask = current_vcpu_info()->evtchn_upcall_mask;
+
+	do {
+		bool nested = false;
+
+		xen_clear_irq_pending(irq);
+
+		/*
+		 * Check again to make sure it didn't become free while
+		 * we weren't looking.
+		 */
+		if (lock->cur == spinning.ticket) {
+			/*
+			 * If we interrupted another spinlock while it was
+			 * blocking, make sure it doesn't block (again)
+			 * without rechecking the lock.
+			 */
+			if (spinning.prev)
+				xen_set_irq_pending(irq);
+			rc = true;
+			break;
+		}
+
+		for (other = spinning.prev; other; other = other->prev) {
+			if (other->lock == lock)
+				nested = true;
+			else {
+				/*
+				 * Return the ticket if we now own the lock.
+				 * While just being desirable generally (to
+				 * reduce latency on other CPUs), this is
+				 * essential in the case where interrupts
+				 * get re-enabled below.
+				 * Try to get a new ticket right away (to
+				 * reduce latency after the current lock was
+				 * released), but don't acquire the lock.
+				 */
+				raw_spinlock_t *lock = other->lock;
+
+				raw_local_irq_disable();
+				while (lock->cur == other->ticket) {
+					unsigned int token;
+					bool kick, free;
+
+					other->ticket = -1;
+					__raw_spin_unlock_body;
+					if (!kick)
+						break;
+					xen_spin_kick(lock, token);
+					__raw_spin_lock_preamble;
+					if (!free)
+						token = spin_adjust(
+							other->prev, lock,
+							token);
+					other->ticket = token >> TICKET_SHIFT;
+					smp_mb();
+				}
+			}
+		}
+
+		/*
+		 * No need to use raw_local_irq_restore() here, as the
+		 * intended event processing will happen with the poll
+		 * call.
+		 */
+		current_vcpu_info()->evtchn_upcall_mask =
+			nested ? upcall_mask : flags;
+
+		xen_poll_irq(irq);
+
+		current_vcpu_info()->evtchn_upcall_mask = upcall_mask;
+
+		rc = !xen_test_irq_pending(irq);
+		kstat_this_cpu.irqs[irq] += !rc;
+	} while (spinning.prev || rc);
+
+	/*
+	 * Leave the irq pending so that any interrupted blocker will
+	 * re-check.
+	 */
+
+	/* announce we're done */
+	__get_cpu_var(spinning) = other = spinning.prev;
+	rm_lock = &__get_cpu_var(spinning_rm_lock);
+	raw_local_irq_disable();
+	__raw_write_lock(rm_lock);
+	__raw_write_unlock(rm_lock);
+	*ptok = lock->cur | (spinning.ticket << TICKET_SHIFT);
+
+	/*
+	 * Obtain new tickets for (or acquire) all those locks where
+	 * above we avoided acquiring them.
+	 */
+	for (; other; other = other->prev)
+		if (!(other->ticket + 1)) {
+			unsigned int token;
+			bool free;
+
+			lock = other->lock;
+			__raw_spin_lock_preamble;
+			if (!free)
+				token = spin_adjust(other->prev, lock, token);
+			other->ticket = token >> TICKET_SHIFT;
+		}
+	raw_local_irq_restore(upcall_mask);
+
+	return rc;
+}
+
+void xen_spin_kick(raw_spinlock_t *lock, unsigned int token)
+{
+	unsigned int cpu;
+
+	token &= (1U << TICKET_SHIFT) - 1;
+	for_each_online_cpu(cpu) {
+		raw_rwlock_t *rm_lock;
+		unsigned long flags;
+		struct spinning *spinning;
+
+		if (cpu == raw_smp_processor_id())
+			continue;
+
+		rm_lock = &per_cpu(spinning_rm_lock, cpu);
+		raw_local_irq_save(flags);
+		__raw_read_lock(rm_lock);
+
+		spinning = per_cpu(spinning, cpu);
+		smp_rmb();
+		while (spinning) {
+			if (spinning->lock == lock && spinning->ticket == token)
+				break;
+			spinning = spinning->prev;
+		}
+
+		__raw_read_unlock(rm_lock);
+		raw_local_irq_restore(flags);
+
+		if (unlikely(spinning)) {
+			notify_remote_via_irq(per_cpu(spinlock_irq, cpu));
+			return;
+		}
+	}
+}
+EXPORT_SYMBOL(xen_spin_kick);
--- head-2010-04-29.orig/drivers/xen/core/xen_sysfs.c	2010-03-24 15:09:15.000000000 +0100
+++ head-2010-04-29/drivers/xen/core/xen_sysfs.c	2010-03-24 15:10:37.000000000 +0100
@@ -30,12 +30,12 @@ HYPERVISOR_ATTR_RO(type);

 static int __init xen_sysfs_type_init(void)
 {
-	return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr);
+	return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
 }

 static void xen_sysfs_type_destroy(void)
 {
-	sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr);
+	sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
 }

 /* xen version attributes */
@@ -91,13 +91,12 @@ static struct attribute_group version_gr

 static int __init xen_sysfs_version_init(void)
 {
-	return sysfs_create_group(&hypervisor_subsys.kobj,
-				  &version_group);
+	return sysfs_create_group(hypervisor_kobj, &version_group);
 }

 static void xen_sysfs_version_destroy(void)
 {
-	sysfs_remove_group(&hypervisor_subsys.kobj, &version_group);
+	sysfs_remove_group(hypervisor_kobj, &version_group);
 }

 /* UUID */
@@ -126,12 +125,12 @@ HYPERVISOR_ATTR_RO(uuid);

 static int __init xen_sysfs_uuid_init(void)
 {
-	return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
+	return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
 }

 static void xen_sysfs_uuid_destroy(void)
 {
-	sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
+	sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
 }

 /* xen compilation attributes */
@@ -204,14 +203,12 @@ static struct attribute_group xen_compil

 int __init static xen_compilation_init(void)
 {
-	return sysfs_create_group(&hypervisor_subsys.kobj,
-				  &xen_compilation_group);
+	return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
 }

 static void xen_compilation_destroy(void)
 {
-	sysfs_remove_group(&hypervisor_subsys.kobj,
-			   &xen_compilation_group);
+	sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
 }

 /* xen properties info */
@@ -325,14 +322,12 @@ static struct attribute_group xen_proper

 static int __init xen_properties_init(void)
 {
-	return sysfs_create_group(&hypervisor_subsys.kobj,
-				  &xen_properties_group);
+	return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
 }

 static void xen_properties_destroy(void)
 {
-	sysfs_remove_group(&hypervisor_subsys.kobj,
-			   &xen_properties_group);
+	sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
 }

 #ifdef CONFIG_KEXEC
@@ -350,13 +345,12 @@ HYPERVISOR_ATTR_RO(vmcoreinfo);

 static int __init xen_sysfs_vmcoreinfo_init(void)
 {
-	return sysfs_create_file(&hypervisor_subsys.kobj,
-				 &vmcoreinfo_attr.attr);
+	return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
 }

 static void xen_sysfs_vmcoreinfo_destroy(void)
 {
-	sysfs_remove_file(&hypervisor_subsys.kobj, &vmcoreinfo_attr.attr);
+	sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
 }

 #endif
--- head-2010-04-29.orig/drivers/xen/gntdev/gntdev.c	2010-03-24 15:06:12.000000000 +0100
+++ head-2010-04-29/drivers/xen/gntdev/gntdev.c	2010-03-24 15:10:37.000000000 +0100
@@ -791,7 +791,7 @@ static pte_t gntdev_clear_pte(struct vm_
 				       op.status);
 		} else {
 			/* USING SHADOW PAGE TABLES. */
-			copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
+			copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
 		}

 		/* Finally, we unmap the grant from kernel space. */
@@ -819,7 +819,7 @@ static pte_t gntdev_clear_pte(struct vm_
 			INVALID_P2M_ENTRY);

 	} else {
-		copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
+		copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
 	}

 	return copy;
--- head-2010-04-29.orig/drivers/xen/scsifront/scsifront.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/drivers/xen/scsifront/scsifront.c	2010-03-24 15:10:37.000000000 +0100
@@ -260,19 +260,19 @@ static int map_data_for_request(struct v
 		return -ENOMEM;
 	}

-	if (sc->use_sg) {
+	if (scsi_bufflen(sc)) {
 		/* quoted scsi_lib.c/scsi_req_map_sg . */
-		struct scatterlist *sg, *sgl = (struct scatterlist *)sc->request_buffer;
-		unsigned int data_len = sc->request_bufflen;
+		struct scatterlist *sg, *sgl = scsi_sglist(sc);
+		unsigned int data_len = scsi_bufflen(sc);

-		nr_pages = (sc->request_bufflen + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		nr_pages = (data_len + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		if (nr_pages > VSCSIIF_SG_TABLESIZE) {
 			printk(KERN_ERR "scsifront: Unable to map request_buffer for command!\n");
 			ref_cnt = (-E2BIG);
 			goto big_to_sg;
 		}

-		for_each_sg (sgl, sg, sc->use_sg, i) {
+		for_each_sg (sgl, sg, scsi_sg_count(sc), i) {
 			page = sg_page(sg);
 			off = sg->offset;
 			len = sg->length;
@@ -306,45 +306,6 @@ static int map_data_for_request(struct v
 				ref_cnt++;
 			}
 		}
-	} else if (sc->request_bufflen) {
-		unsigned long end   = ((unsigned long)sc->request_buffer
-					+ sc->request_bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		unsigned long start = (unsigned long)sc->request_buffer >> PAGE_SHIFT;
-
-		page = virt_to_page(sc->request_buffer);
-		nr_pages = end - start;
-		len = sc->request_bufflen;
-
-		if (nr_pages > VSCSIIF_SG_TABLESIZE) {
-			ref_cnt = (-E2BIG);
-			goto big_to_sg;
-		}
-
-		buffer_pfn = page_to_phys(page) >> PAGE_SHIFT;
-
-		off = offset_in_page((unsigned long)sc->request_buffer);
-		for (i = 0; i < nr_pages; i++) {
-			bytes = PAGE_SIZE - off;
-
-			if (bytes > len)
-				bytes = len;
-
-			ref = gnttab_claim_grant_reference(&gref_head);
-			BUG_ON(ref == -ENOSPC);
-
-			gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id,
-				buffer_pfn, write);
-
-			info->shadow[id].gref[i] = ref;
-			ring_req->seg[i].gref     = ref;
-			ring_req->seg[i].offset   = (uint16_t)off;
-			ring_req->seg[i].length   = (uint16_t)bytes;
-
-			buffer_pfn++;
-			len -= bytes;
-			off = 0;
-			ref_cnt++;
-		}
 	}

 big_to_sg:
--- head-2010-04-29.orig/drivers/xen/usbfront/usbfront-dbg.c	2010-03-24 15:06:12.000000000 +0100
+++ head-2010-04-29/drivers/xen/usbfront/usbfront-dbg.c	2010-03-24 15:10:37.000000000 +0100
@@ -43,17 +43,16 @@
  * DEALINGS IN THE SOFTWARE.
  */

-static ssize_t show_statistics(struct class_device *class_dev, char *buf)
+static ssize_t show_statistics(struct device *dev,
+			       struct device_attribute *attr, char *buf)
 {
-	struct usb_bus *bus;
 	struct usb_hcd *hcd;
 	struct usbfront_info *info;
 	unsigned long flags;
 	unsigned temp, size;
 	char *next;

-	bus = class_get_devdata(class_dev);
-	hcd = bus->hcpriv;
+	hcd = dev_get_drvdata(dev);
 	info = hcd_to_info(hcd);
 	next = buf;
 	size = PAGE_SIZE;
@@ -85,18 +84,18 @@ static ssize_t show_statistics(struct cl
 	return PAGE_SIZE - size;
 }

-static CLASS_DEVICE_ATTR(statistics, S_IRUGO, show_statistics, NULL);
+static DEVICE_ATTR(statistics, S_IRUGO, show_statistics, NULL);

 static inline void create_debug_file(struct usbfront_info *info)
 {
-	struct class_device *cldev = info_to_hcd(info)->self.class_dev;
-	if (class_device_create_file(cldev, &class_device_attr_statistics))
+	struct device *dev = info_to_hcd(info)->self.controller;
+	if (device_create_file(dev, &dev_attr_statistics))
 		printk(KERN_WARNING "statistics file not created for %s\n",
 		       info_to_hcd(info)->self.bus_name);
 }

 static inline void remove_debug_file(struct usbfront_info *info)
 {
-	struct class_device *cldev = info_to_hcd(info)->self.class_dev;
-	class_device_remove_file(cldev, &class_device_attr_statistics);
+	struct device *dev = info_to_hcd(info)->self.controller;
+	device_remove_file(dev, &dev_attr_statistics);
 }
--- head-2010-04-29.orig/drivers/xen/xenoprof/xenoprofile.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/drivers/xen/xenoprof/xenoprofile.c	2010-03-24 15:10:37.000000000 +0100
@@ -78,7 +78,7 @@ static int xenoprof_resume(struct sys_de


 static struct sysdev_class oprofile_sysclass = {
-	set_kset_name("oprofile"),
+	.name		= "oprofile",
 	.resume		= xenoprof_resume,
 	.suspend	= xenoprof_suspend
 };
--- head-2010-04-29.orig/arch/x86/include/asm/e820.h	2010-04-29 09:29:49.000000000 +0200
+++ head-2010-04-29/arch/x86/include/asm/e820.h	2010-03-24 15:10:37.000000000 +0100
@@ -61,7 +61,11 @@ struct e820map {
 	struct e820entry map[E820_X_MAX];
 };

+#ifndef CONFIG_XEN
 #define ISA_START_ADDRESS	0xa0000
+#else
+#define ISA_START_ADDRESS	0
+#endif
 #define ISA_END_ADDRESS		0x100000

 #define BIOS_BEGIN		0x000a0000
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/agp.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/agp.h	2010-03-24 15:10:37.000000000 +0100
@@ -13,18 +13,13 @@
  * page. This avoids data corruption on some CPUs.
  */

-/*
- * Caller's responsibility to call global_flush_tlb() for performance
- * reasons
- */
 #define map_page_into_agp(page) ( \
 	xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
-	?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE))
+	?: set_pages_uc(page, 1))
 #define unmap_page_from_agp(page) ( \
 	xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
 	/* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
-	change_page_attr(page, 1, PAGE_KERNEL))
-#define flush_agp_mappings() global_flush_tlb()
+	set_pages_wb(page, 1))

 /*
  * Could use CLFLUSH here if the cpu supports it. But then it would
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/desc.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/desc.h	2010-03-24 15:10:37.000000000 +0100
@@ -1,5 +1,404 @@
+#ifndef _ASM_DESC_H_
+#define _ASM_DESC_H_
+
+#ifndef __ASSEMBLY__
+#include <asm/desc_defs.h>
+#include <asm/ldt.h>
+#include <asm/mmu.h>
+#include <linux/smp.h>
+
+static inline void fill_ldt(struct desc_struct *desc,
+			    const struct user_desc *info)
+{
+	desc->limit0 = info->limit & 0x0ffff;
+	desc->base0 = info->base_addr & 0x0000ffff;
+
+	desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
+	desc->type = (info->read_exec_only ^ 1) << 1;
+	desc->type |= info->contents << 2;
+	desc->s = 1;
+	desc->dpl = 0x3;
+	desc->p = info->seg_not_present ^ 1;
+	desc->limit = (info->limit & 0xf0000) >> 16;
+	desc->avl = info->useable;
+	desc->d = info->seg_32bit;
+	desc->g = info->limit_in_pages;
+	desc->base2 = (info->base_addr & 0xff000000) >> 24;
+}
+
+#ifndef CONFIG_X86_NO_IDT
+extern struct desc_ptr idt_descr;
+extern gate_desc idt_table[];
+#endif
+
+#ifdef CONFIG_X86_64
+extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
+extern struct desc_ptr cpu_gdt_descr[];
+/* the cpu gdt accessor */
+#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
+
+static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
+			     unsigned dpl, unsigned ist, unsigned seg)
+{
+	gate->offset_low = PTR_LOW(func);
+	gate->segment = __KERNEL_CS;
+	gate->ist = ist;
+	gate->p = 1;
+	gate->dpl = dpl;
+	gate->zero0 = 0;
+	gate->zero1 = 0;
+	gate->type = type;
+	gate->offset_middle = PTR_MIDDLE(func);
+	gate->offset_high = PTR_HIGH(func);
+}
+
+#else
+struct gdt_page {
+	struct desc_struct gdt[GDT_ENTRIES];
+} __attribute__((aligned(PAGE_SIZE)));
+DECLARE_PER_CPU(struct gdt_page, gdt_page);
+
+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+{
+	return per_cpu(gdt_page, cpu).gdt;
+}
+
+static inline void pack_gate(gate_desc *gate, unsigned char type,
+       unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
+
+{
+	gate->a = (seg << 16) | (base & 0xffff);
+	gate->b = (base & 0xffff0000) |
+		  (((0x80 | type | (dpl << 5)) & 0xff) << 8);
+}
+
+#endif
+
+static inline int desc_empty(const void *ptr)
+{
+	const u32 *desc = ptr;
+	return !(desc[0] | desc[1]);
+}
+
+#ifndef CONFIG_XEN
+#define load_TR_desc() native_load_tr_desc()
+#define load_gdt(dtr) native_load_gdt(dtr)
+#define load_idt(dtr) native_load_idt(dtr)
+#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr) native_store_gdt(dtr)
+#define store_idt(dtr) native_store_idt(dtr)
+#define store_tr(tr) (tr = native_store_tr())
+#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
+
+#define load_TLS(t, cpu) native_load_tls(t, cpu)
+#define set_ldt native_set_ldt
+
+#define write_ldt_entry(dt, entry, desc) \
+				native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type) \
+				native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
+
+static inline void native_write_idt_entry(gate_desc *idt, int entry,
+					  const gate_desc *gate)
+{
+	memcpy(&idt[entry], gate, sizeof(*gate));
+}
+
+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
+					  const void *desc)
+{
+	memcpy(&ldt[entry], desc, 8);
+}
+
+static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
+					  const void *desc, int type)
+{
+	unsigned int size;
+	switch (type) {
+	case DESC_TSS:
+		size = sizeof(tss_desc);
+		break;
+	case DESC_LDT:
+		size = sizeof(ldt_desc);
+		break;
+	default:
+		size = sizeof(struct desc_struct);
+		break;
+	}
+	memcpy(&gdt[entry], desc, size);
+}
+#endif
+
+static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
+				   unsigned long limit, unsigned char type,
+				   unsigned char flags)
+{
+	desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
+	desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
+		  (limit & 0x000f0000) | ((type & 0xff) << 8) |
+		  ((flags & 0xf) << 20);
+	desc->p = 1;
+}
+
+
+#ifndef CONFIG_XEN
+static inline void set_tssldt_descriptor(void *d, unsigned long addr,
+					 unsigned type, unsigned size)
+{
+#ifdef CONFIG_X86_64
+	struct ldttss_desc64 *desc = d;
+	memset(desc, 0, sizeof(*desc));
+	desc->limit0 = size & 0xFFFF;
+	desc->base0 = PTR_LOW(addr);
+	desc->base1 = PTR_MIDDLE(addr) & 0xFF;
+	desc->type = type;
+	desc->p = 1;
+	desc->limit1 = (size >> 16) & 0xF;
+	desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
+	desc->base3 = PTR_HIGH(addr);
+#else
+
+	pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
+#endif
+}
+
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+{
+	struct desc_struct *d = get_cpu_gdt_table(cpu);
+	tss_desc tss;
+
+	/*
+	 * sizeof(unsigned long) coming from an extra "long" at the end
+	 * of the iobitmap. See tss_struct definition in processor.h
+	 *
+	 * -1? seg base+limit should be pointing to the address of the
+	 * last valid byte
+	 */
+	set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
+		IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
+	write_gdt_entry(d, entry, &tss, DESC_TSS);
+}
+
+#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+
+static inline void native_set_ldt(const void *addr, unsigned int entries)
+{
+	if (likely(entries == 0))
+		__asm__ __volatile__("lldt %w0"::"q" (0));
+	else {
+		unsigned cpu = smp_processor_id();
+		ldt_desc ldt;
+
+		set_tssldt_descriptor(&ldt, (unsigned long)addr,
+				      DESC_LDT, entries * sizeof(ldt) - 1);
+		write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
+				&ldt, DESC_LDT);
+		__asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+	}
+}
+
+static inline void native_load_tr_desc(void)
+{
+	asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static inline void native_load_gdt(const struct desc_ptr *dtr)
+{
+	asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static inline void native_load_idt(const struct desc_ptr *dtr)
+{
+	asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static inline void native_store_gdt(struct desc_ptr *dtr)
+{
+	asm volatile("sgdt %0":"=m" (*dtr));
+}
+
+static inline void native_store_idt(struct desc_ptr *dtr)
+{
+	asm volatile("sidt %0":"=m" (*dtr));
+}
+
+static inline unsigned long native_store_tr(void)
+{
+	unsigned long tr;
+	asm volatile("str %0":"=r" (tr));
+	return tr;
+}
+
+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+	unsigned int i;
+	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+
+	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+		gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
+}
+#else
+#define load_TLS(t, cpu) xen_load_tls(t, cpu)
+#define set_ldt xen_set_ldt
+
+extern int write_ldt_entry(struct desc_struct *ldt, int entry,
+			   const void *desc);
+extern int write_gdt_entry(struct desc_struct *gdt, int entry,
+			   const void *desc, int type);
+
+static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+	unsigned int i;
+	struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
+
+	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+		if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
+						 *(u64 *)&t->tls_array[i]))
+			BUG();
+}
+#endif
+
+#define _LDT_empty(info) (\
+	(info)->base_addr	== 0	&& \
+	(info)->limit		== 0	&& \
+	(info)->contents	== 0	&& \
+	(info)->read_exec_only	== 1	&& \
+	(info)->seg_32bit	== 0	&& \
+	(info)->limit_in_pages	== 0	&& \
+	(info)->seg_not_present	== 1	&& \
+	(info)->useable		== 0)
+
+#ifdef CONFIG_X86_64
+#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
+#else
+#define LDT_empty(info) (_LDT_empty(info))
+#endif
+
+static inline void clear_LDT(void)
+{
+	set_ldt(NULL, 0);
+}
+
+/*
+ * load one particular LDT into the current CPU
+ */
+static inline void load_LDT_nolock(mm_context_t *pc)
+{
+	set_ldt(pc->ldt, pc->size);
+}
+
+static inline void load_LDT(mm_context_t *pc)
+{
+	preempt_disable();
+	load_LDT_nolock(pc);
+	preempt_enable();
+}
+
+static inline unsigned long get_desc_base(const struct desc_struct *desc)
+{	/* keep the result 32-bit so a set bit 31 is never sign-extended */
+	return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
+}
+
+static inline unsigned long get_desc_limit(const struct desc_struct *desc)
+{
+	return desc->limit0 | (desc->limit << 16);
+}
+
+#ifndef CONFIG_X86_NO_IDT
+static inline void _set_gate(int gate, unsigned type, void *addr,
+			      unsigned dpl, unsigned ist, unsigned seg)
+{
+	gate_desc s;
+	pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
+	/*
+	 * does not need to be atomic because it is only done once at
+	 * setup time
+	 */
+	write_idt_entry(idt_table, gate, &s);
+}
+
+/*
+ * This needs to use 'idt_table' rather than 'idt', and
+ * thus use the _nonmapped_ version of the IDT, as the
+ * Pentium F0 0F bugfix can have resulted in the mapped
+ * IDT being write-protected.
+ */
+static inline void set_intr_gate(unsigned int n, void *addr)
+{
+	BUG_ON((unsigned)n > 0xFF);
+	_set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
+}
+
+/*
+ * This routine sets up an interrupt gate at descriptor privilege level 3.
+ */
+static inline void set_system_intr_gate(unsigned int n, void *addr)
+{
+	BUG_ON((unsigned)n > 0xFF);
+	_set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
+}
+
+static inline void set_trap_gate(unsigned int n, void *addr)
+{
+	BUG_ON((unsigned)n > 0xFF);
+	_set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
+}
+
+static inline void set_system_gate(unsigned int n, void *addr)
+{
+	BUG_ON((unsigned)n > 0xFF);
 #ifdef CONFIG_X86_32
-# include "desc_32.h"
+	_set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
+#else
+	_set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
+#endif
+}
+
+static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
+{
+	BUG_ON((unsigned)n > 0xFF);
+	_set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
+}
+
+static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
+{
+	BUG_ON((unsigned)n > 0xFF);
+	_set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
+}
+
+static inline void set_system_gate_ist(int n, void *addr, unsigned ist)
+{
+	BUG_ON((unsigned)n > 0xFF);
+	_set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
+}
+#endif
+
 #else
-# include "desc_64.h"
+/*
+ * GET_DESC_BASE reads the descriptor base of the specified segment.
+ *
+ * Args:
+ *    idx - descriptor index
+ *    gdt - GDT pointer
+ *    base - 32bit register to which the base will be written
+ *    lo_w - lo word of the "base" register
+ *    lo_b - lo byte of the "base" register
+ *    hi_b - hi byte of the low word of the "base" register
+ *
+ * Example:
+ *    GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
+ *    Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
+ */
+#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
+	movb idx*8+4(gdt), lo_b; \
+	movb idx*8+7(gdt), hi_b; \
+	shll $16, base; \
+	movw idx*8+2(gdt), lo_w;
+
+
+#endif /* __ASSEMBLY__ */
+
 #endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/desc_32.h	2010-03-24 15:09:15.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,262 +0,0 @@
-#ifndef __ARCH_DESC_H
-#define __ARCH_DESC_H
-
-#include <asm/ldt.h>
-#include <asm/segment.h>
-
-#ifndef __ASSEMBLY__
-
-#include <linux/preempt.h>
-#include <linux/smp.h>
-
-#include <asm/mmu.h>
-
-struct Xgt_desc_struct {
-	unsigned short size;
-	unsigned long address __attribute__((packed));
-	unsigned short pad;
-} __attribute__ ((packed));
-
-struct gdt_page
-{
-	struct desc_struct gdt[GDT_ENTRIES];
-} __attribute__((aligned(PAGE_SIZE)));
-DECLARE_PER_CPU(struct gdt_page, gdt_page);
-
-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
-{
-	return per_cpu(gdt_page, cpu).gdt;
-}
-
-extern struct Xgt_desc_struct idt_descr;
-extern struct desc_struct idt_table[];
-extern void set_intr_gate(unsigned int irq, void * addr);
-
-static inline void pack_descriptor(__u32 *a, __u32 *b,
-	unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
-{
-	*a = ((base & 0xffff) << 16) | (limit & 0xffff);
-	*b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
-		(limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
-}
-
-static inline void pack_gate(__u32 *a, __u32 *b,
-	unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
-{
-	*a = (seg << 16) | (base & 0xffff);
-	*b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
-}
-
-#define DESCTYPE_LDT 	0x82	/* present, system, DPL-0, LDT */
-#define DESCTYPE_TSS 	0x89	/* present, system, DPL-0, 32-bit TSS */
-#define DESCTYPE_TASK	0x85	/* present, system, DPL-0, task gate */
-#define DESCTYPE_INT	0x8e	/* present, system, DPL-0, interrupt gate */
-#define DESCTYPE_TRAP	0x8f	/* present, system, DPL-0, trap gate */
-#define DESCTYPE_DPL3	0x60	/* DPL-3 */
-#define DESCTYPE_S	0x10	/* !system */
-
-#ifndef CONFIG_XEN
-#define load_TR_desc() native_load_tr_desc()
-#define load_gdt(dtr) native_load_gdt(dtr)
-#define load_idt(dtr) native_load_idt(dtr)
-#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
-#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
-
-#define store_gdt(dtr) native_store_gdt(dtr)
-#define store_idt(dtr) native_store_idt(dtr)
-#define store_tr(tr) (tr = native_store_tr())
-#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
-
-#define load_TLS(t, cpu) native_load_tls(t, cpu)
-#define set_ldt native_set_ldt
-
-#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
-#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
-#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
-
-static inline void write_dt_entry(struct desc_struct *dt,
-				  int entry, u32 entry_low, u32 entry_high)
-{
-	dt[entry].a = entry_low;
-	dt[entry].b = entry_high;
-}
-
-static inline void native_set_ldt(const void *addr, unsigned int entries)
-{
-	if (likely(entries == 0))
-		__asm__ __volatile__("lldt %w0"::"q" (0));
-	else {
-		unsigned cpu = smp_processor_id();
-		__u32 a, b;
-
-		pack_descriptor(&a, &b, (unsigned long)addr,
-				entries * sizeof(struct desc_struct) - 1,
-				DESCTYPE_LDT, 0);
-		write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
-		__asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
-	}
-}
-
-
-static inline void native_load_tr_desc(void)
-{
-	asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
-}
-
-static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
-{
-	asm volatile("lgdt %0"::"m" (*dtr));
-}
-
-static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
-{
-	asm volatile("lidt %0"::"m" (*dtr));
-}
-
-static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
-{
-	asm ("sgdt %0":"=m" (*dtr));
-}
-
-static inline void native_store_idt(struct Xgt_desc_struct *dtr)
-{
-	asm ("sidt %0":"=m" (*dtr));
-}
-
-static inline unsigned long native_store_tr(void)
-{
-	unsigned long tr;
-	asm ("str %0":"=r" (tr));
-	return tr;
-}
-
-static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-	unsigned int i;
-	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-
-	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
-		gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
-}
-#else
-#define load_TLS(t, cpu) xen_load_tls(t, cpu)
-#define set_ldt xen_set_ldt
-
-extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
-extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
-
-static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-	unsigned int i;
-	struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
-
-	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
-		if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
-						 *(u64 *)&t->tls_array[i]))
-			BUG();
-}
-#endif
-
-#ifndef CONFIG_X86_NO_IDT
-static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
-{
-	__u32 a, b;
-	pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
-	write_idt_entry(idt_table, gate, a, b);
-}
-#endif
-
-#ifndef CONFIG_X86_NO_TSS
-static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
-{
-	__u32 a, b;
-	pack_descriptor(&a, &b, (unsigned long)addr,
-			offsetof(struct tss_struct, __cacheline_filler) - 1,
-			DESCTYPE_TSS, 0);
-	write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
-}
-#endif
-
-
-#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
-
-#define LDT_entry_a(info) \
-	((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
-
-#define LDT_entry_b(info) \
-	(((info)->base_addr & 0xff000000) | \
-	(((info)->base_addr & 0x00ff0000) >> 16) | \
-	((info)->limit & 0xf0000) | \
-	(((info)->read_exec_only ^ 1) << 9) | \
-	((info)->contents << 10) | \
-	(((info)->seg_not_present ^ 1) << 15) | \
-	((info)->seg_32bit << 22) | \
-	((info)->limit_in_pages << 23) | \
-	((info)->useable << 20) | \
-	0x7000)
-
-#define LDT_empty(info) (\
-	(info)->base_addr	== 0	&& \
-	(info)->limit		== 0	&& \
-	(info)->contents	== 0	&& \
-	(info)->read_exec_only	== 1	&& \
-	(info)->seg_32bit	== 0	&& \
-	(info)->limit_in_pages	== 0	&& \
-	(info)->seg_not_present	== 1	&& \
-	(info)->useable		== 0	)
-
-static inline void clear_LDT(void)
-{
-	set_ldt(NULL, 0);
-}
-
-/*
- * load one particular LDT into the current CPU
- */
-static inline void load_LDT_nolock(mm_context_t *pc)
-{
-	set_ldt(pc->ldt, pc->size);
-}
-
-static inline void load_LDT(mm_context_t *pc)
-{
-	preempt_disable();
-	load_LDT_nolock(pc);
-	preempt_enable();
-}
-
-static inline unsigned long get_desc_base(unsigned long *desc)
-{
-	unsigned long base;
-	base = ((desc[0] >> 16)  & 0x0000ffff) |
-		((desc[1] << 16) & 0x00ff0000) |
-		(desc[1] & 0xff000000);
-	return base;
-}
-
-#else /* __ASSEMBLY__ */
-
-/*
- * GET_DESC_BASE reads the descriptor base of the specified segment.
- *
- * Args:
- *    idx - descriptor index
- *    gdt - GDT pointer
- *    base - 32bit register to which the base will be written
- *    lo_w - lo word of the "base" register
- *    lo_b - lo byte of the "base" register
- *    hi_b - hi byte of the low word of the "base" register
- *
- * Example:
- *    GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
- *    Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
- */
-#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
-	movb idx*8+4(gdt), lo_b; \
-	movb idx*8+7(gdt), hi_b; \
-	shll $16, base; \
-	movw idx*8+2(gdt), lo_w;
-
-#endif /* !__ASSEMBLY__ */
-
-#endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/desc_64.h	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,228 +0,0 @@
-/* Written 2000 by Andi Kleen */
-#ifndef __ARCH_DESC_H
-#define __ARCH_DESC_H
-
-#include <linux/threads.h>
-#include <asm/ldt.h>
-
-#ifndef __ASSEMBLY__
-
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <asm/desc_defs.h>
-
-#include <asm/segment.h>
-#include <asm/mmu.h>
-
-extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
-
-extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
-
-#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
-#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
-
-static inline void clear_LDT(void)
-{
-	int cpu = get_cpu();
-
-	/*
-	 * NB. We load the default_ldt for lcall7/27 handling on demand, as
-	 * it slows down context switching. Noone uses it anyway.
-	 */
-	cpu = cpu;              /* XXX avoid compiler warning */
-	xen_set_ldt(NULL, 0);
-	put_cpu();
-}
-
-#ifndef CONFIG_X86_NO_TSS
-static inline unsigned long __store_tr(void)
-{
-       unsigned long tr;
-
-       asm volatile ("str %w0":"=r" (tr));
-       return tr;
-}
-
-#define store_tr(tr) (tr) = __store_tr()
-#endif
-
-/*
- * This is the ldt that every process will get unless we need
- * something other than this.
- */
-extern struct desc_struct default_ldt[];
-#ifndef CONFIG_X86_NO_IDT
-extern struct gate_struct idt_table[];
-#endif
-extern struct desc_ptr cpu_gdt_descr[];
-
-/* the cpu gdt accessor */
-#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
-
-#ifndef CONFIG_XEN
-static inline void load_gdt(const struct desc_ptr *ptr)
-{
-	asm volatile("lgdt %w0"::"m" (*ptr));
-}
-
-static inline void store_gdt(struct desc_ptr *ptr)
-{
-       asm("sgdt %w0":"=m" (*ptr));
-}
-#endif
-
-static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
-{
-	struct gate_struct s;
-	s.offset_low = PTR_LOW(func);
-	s.segment = __KERNEL_CS;
-	s.ist = ist;
-	s.p = 1;
-	s.dpl = dpl;
-	s.zero0 = 0;
-	s.zero1 = 0;
-	s.type = type;
-	s.offset_middle = PTR_MIDDLE(func);
-	s.offset_high = PTR_HIGH(func);
-	/* does not need to be atomic because it is only done once at setup time */
-	memcpy(adr, &s, 16);
-}
-
-#ifndef CONFIG_X86_NO_IDT
-static inline void set_intr_gate(int nr, void *func)
-{
-	BUG_ON((unsigned)nr > 0xFF);
-	_set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
-}
-
-static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
-{
-	BUG_ON((unsigned)nr > 0xFF);
-	_set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
-}
-
-static inline void set_system_gate(int nr, void *func)
-{
-	BUG_ON((unsigned)nr > 0xFF);
-	_set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
-}
-
-static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
-{
-	_set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
-}
-
-static inline void load_idt(const struct desc_ptr *ptr)
-{
-	asm volatile("lidt %w0"::"m" (*ptr));
-}
-
-static inline void store_idt(struct desc_ptr *dtr)
-{
-       asm("sidt %w0":"=m" (*dtr));
-}
-#endif
-
-static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
-					 unsigned size)
-{
-	struct ldttss_desc d;
-	memset(&d,0,sizeof(d));
-	d.limit0 = size & 0xFFFF;
-	d.base0 = PTR_LOW(tss);
-	d.base1 = PTR_MIDDLE(tss) & 0xFF;
-	d.type = type;
-	d.p = 1;
-	d.limit1 = (size >> 16) & 0xF;
-	d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
-	d.base3 = PTR_HIGH(tss);
-	memcpy(ptr, &d, 16);
-}
-
-#ifndef CONFIG_X86_NO_TSS
-static inline void set_tss_desc(unsigned cpu, void *addr)
-{
-	/*
-	 * sizeof(unsigned long) coming from an extra "long" at the end
-	 * of the iobitmap. See tss_struct definition in processor.h
-	 *
-	 * -1? seg base+limit should be pointing to the address of the
-	 * last valid byte
-	 */
-	set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
-		(unsigned long)addr, DESC_TSS,
-		IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
-}
-#endif
-
-static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
-{
-	set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
-			      DESC_LDT, size * 8 - 1);
-}
-
-#define LDT_entry_a(info) \
-	((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
-/* Don't allow setting of the lm bit. It is useless anyways because
-   64bit system calls require __USER_CS. */
-#define LDT_entry_b(info) \
-	(((info)->base_addr & 0xff000000) | \
-	(((info)->base_addr & 0x00ff0000) >> 16) | \
-	((info)->limit & 0xf0000) | \
-	(((info)->read_exec_only ^ 1) << 9) | \
-	((info)->contents << 10) | \
-	(((info)->seg_not_present ^ 1) << 15) | \
-	((info)->seg_32bit << 22) | \
-	((info)->limit_in_pages << 23) | \
-	((info)->useable << 20) | \
-	/* ((info)->lm << 21) | */ \
-	0x7000)
-
-#define LDT_empty(info) (\
-	(info)->base_addr	== 0	&& \
-	(info)->limit		== 0	&& \
-	(info)->contents	== 0	&& \
-	(info)->read_exec_only	== 1	&& \
-	(info)->seg_32bit	== 0	&& \
-	(info)->limit_in_pages	== 0	&& \
-	(info)->seg_not_present	== 1	&& \
-	(info)->useable		== 0	&& \
-	(info)->lm		== 0)
-
-static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
-{
-	unsigned int i;
-	u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
-
-	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
-		if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
-						 t->tls_array[i]))
-			BUG();
-}
-
-/*
- * load one particular LDT into the current CPU
- */
-static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
-{
-	void *segments = pc->ldt;
-	int count = pc->size;
-
-	if (likely(!count))
-		segments = NULL;
-
-	xen_set_ldt(segments, count);
-}
-
-static inline void load_LDT(mm_context_t *pc)
-{
-	int cpu = get_cpu();
-	load_LDT_nolock(pc, cpu);
-	put_cpu();
-}
-
-extern struct desc_ptr idt_descr;
-
-#endif /* !__ASSEMBLY__ */
-
-#endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/fixmap_32.h	2010-03-24 15:09:22.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/fixmap_32.h	2010-03-24 15:10:37.000000000 +0100
@@ -64,7 +64,7 @@ enum fixed_addresses {
 #endif
 #ifdef CONFIG_X86_VISWS_APIC
 	FIX_CO_CPU,	/* Cobalt timer */
-	FIX_CO_APIC,	/* Cobalt APIC Redirection Table */
+	FIX_CO_APIC,	/* Cobalt APIC Redirection Table */
 	FIX_LI_PCIA,	/* Lithium PCI Bridge A */
 	FIX_LI_PCIB,	/* Lithium PCI Bridge B */
 #endif
@@ -73,7 +73,7 @@ enum fixed_addresses {
 #endif
 #ifdef CONFIG_X86_CYCLONE_TIMER
 	FIX_CYCLONE_TIMER, /*cyclone timer register*/
-#endif
+#endif
 #ifdef CONFIG_HIGHMEM
 	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
 	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
@@ -93,11 +93,23 @@ enum fixed_addresses {
 	FIX_ISAMAP_END,
 	FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
 	__end_of_permanent_fixed_addresses,
-	/* temporary boot-time mappings, used before ioremap() is functional */
-#define NR_FIX_BTMAPS	16
-	FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
-	FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
+	/*
+	 * NR_FIX_BTMAPS * FIX_BTMAPS_NESTING = 256 temporary boot-time
+	 * mappings, used by early_ioremap() before ioremap() is functional.
+	 *
+	 * FIX_BTMAP_END is rounded up to the next 512-page boundary so
+	 * that we can have a single pgd entry and a single pte table:
+	 */
+#define NR_FIX_BTMAPS		64
+#define FIX_BTMAPS_NESTING	4
+	FIX_BTMAP_END =
+		__end_of_permanent_fixed_addresses + 512 -
+			(__end_of_permanent_fixed_addresses & 511),
+	FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
 	FIX_WP_TEST,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+	FIX_OHCI1394_BASE,
+#endif
 	__end_of_fixed_addresses
 };

--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/fixmap_64.h	2010-03-24 15:09:23.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/fixmap_64.h	2010-03-24 15:10:37.000000000 +0100
@@ -15,6 +15,7 @@
 #include <asm/apicdef.h>
 #include <asm/page.h>
 #include <asm/vsyscall.h>
+#include <asm/efi.h>
 #include <asm/acpi.h>

 /*
@@ -46,6 +47,10 @@ enum fixed_addresses {
 	FIX_IO_APIC_BASE_0,
 	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
 #endif
+#ifdef CONFIG_EFI
+	FIX_EFI_IO_MAP_LAST_PAGE,
+	FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
+#endif
 #ifdef CONFIG_ACPI
 	FIX_ACPI_BEGIN,
 	FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
@@ -55,10 +60,22 @@ enum fixed_addresses {
 	FIX_ISAMAP_END,
 	FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
 	__end_of_permanent_fixed_addresses,
-	/* temporary boot-time mappings, used before ioremap() is functional */
-#define NR_FIX_BTMAPS	16
-	FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
-	FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
+	/*
+	 * NR_FIX_BTMAPS * FIX_BTMAPS_NESTING = 256 temporary boot-time
+	 * mappings, used by early_ioremap() before ioremap() is functional.
+	 *
+	 * FIX_BTMAP_END is rounded up to the next 512-page boundary so
+	 * that we can have a single pgd entry and a single pte table:
+	 */
+#define NR_FIX_BTMAPS		64
+#define FIX_BTMAPS_NESTING	4
+	FIX_BTMAP_END =
+		__end_of_permanent_fixed_addresses + 512 -
+			(__end_of_permanent_fixed_addresses & 511),
+	FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+	FIX_OHCI1394_BASE,
+#endif
 	__end_of_fixed_addresses
 };

--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/highmem.h	2010-03-24 17:04:33.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/highmem.h	2010-03-24 15:10:37.000000000 +0100
@@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table;
  * easily, subsequent pte tables have to be allocated in one physical
  * chunk of RAM.
  */
-#ifdef CONFIG_X86_PAE
-#define LAST_PKMAP 512
-#else
-#define LAST_PKMAP 1024
-#endif
 /*
  * Ordering is:
  *
@@ -57,13 +52,12 @@ extern pte_t *pkmap_page_table;
  * VMALLOC_START
  * high_memory
  */
-#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
 #define LAST_PKMAP_MASK (LAST_PKMAP-1)
 #define PKMAP_NR(virt)  ((virt-PKMAP_BASE) >> PAGE_SHIFT)
 #define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))

-extern void * FASTCALL(kmap_high(struct page *page));
-extern void FASTCALL(kunmap_high(struct page *page));
+extern void *kmap_high(struct page *page);
+extern void kunmap_high(struct page *page);

 void *kmap(struct page *page);
 void kunmap(struct page *page);
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/hypervisor.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/hypervisor.h	2010-03-24 15:10:37.000000000 +0100
@@ -271,6 +271,25 @@ HYPERVISOR_poll(
 	return rc;
 }

+static inline int __must_check
+HYPERVISOR_poll_no_timeout(
+	evtchn_port_t *ports, unsigned int nr_ports)
+{
+	int rc;
+	struct sched_poll sched_poll = {
+		.nr_ports = nr_ports
+	};
+	set_xen_guest_handle(sched_poll.ports, ports);
+
+	rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
+#if CONFIG_XEN_COMPAT <= 0x030002
+	if (rc == -ENOSYS)
+		rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
+#endif
+
+	return rc;
+}
+
 #ifdef CONFIG_XEN

 static inline void
@@ -310,4 +329,6 @@ MULTI_grant_table_op(multicall_entry_t *

 #endif

+#define uvm_multi(cpumask) ((unsigned long)cpus_addr(cpumask) | UVMF_MULTI)
+
 #endif /* __HYPERVISOR_H__ */
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/irqflags.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/irqflags.h	2010-03-24 15:10:37.000000000 +0100
@@ -1,5 +1,249 @@
-#ifdef CONFIG_X86_32
-# include "irqflags_32.h"
+#ifndef _X86_IRQFLAGS_H_
+#define _X86_IRQFLAGS_H_
+
+#include <asm/processor-flags.h>
+
+#ifndef __ASSEMBLY__
+/*
+ * The use of 'barrier' in the following reflects their use as local-lock
+ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
+ * critical operations are executed. All critical operations must complete
+ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
+ * includes these barriers, for example.
+ */
+
+#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
+
+#define xen_restore_fl(f)					\
+do {								\
+	vcpu_info_t *_vcpu;					\
+	barrier();						\
+	_vcpu = current_vcpu_info();				\
+	if ((_vcpu->evtchn_upcall_mask = (f)) == 0) {		\
+		barrier(); /* unmask then check (avoid races) */\
+		if (unlikely(_vcpu->evtchn_upcall_pending))	\
+			force_evtchn_callback();		\
+	}							\
+} while (0)
+
+#define xen_irq_disable()					\
+do {								\
+	current_vcpu_info()->evtchn_upcall_mask = 1;		\
+	barrier();						\
+} while (0)
+
+#define xen_irq_enable()					\
+do {								\
+	vcpu_info_t *_vcpu;					\
+	barrier();						\
+	_vcpu = current_vcpu_info();				\
+	_vcpu->evtchn_upcall_mask = 0;				\
+	barrier(); /* unmask then check (avoid races) */	\
+	if (unlikely(_vcpu->evtchn_upcall_pending))		\
+		force_evtchn_callback();			\
+} while (0)
+
+void xen_safe_halt(void);
+
+void xen_halt(void);
+
+#define __raw_local_save_flags() xen_save_fl()
+
+#define raw_local_irq_restore(flags) xen_restore_fl(flags)
+
+#define raw_local_irq_disable()	xen_irq_disable()
+
+#define raw_local_irq_enable() xen_irq_enable()
+
+/*
+ * Used in the idle loop; sti takes one instruction cycle
+ * to complete:
+ */
+static inline void raw_safe_halt(void)
+{
+	xen_safe_halt();
+}
+
+/*
+ * Used when interrupts are already enabled or to
+ * shut down the processor:
+ */
+static inline void halt(void)
+{
+	xen_halt();
+}
+
+/*
+ * For spinlocks, etc:
+ */
+#define __raw_local_irq_save()						\
+({									\
+	unsigned long flags = __raw_local_save_flags();			\
+									\
+	raw_local_irq_disable();					\
+									\
+	flags;								\
+})
 #else
-# include "irqflags_64.h"
+
+/* Offsets into shared_info_t (pending is 0, hence an empty expansion). */
+#define evtchn_upcall_pending		/* 0 */
+#define evtchn_upcall_mask		1
+
+#define sizeof_vcpu_shift		6
+
+#ifdef CONFIG_X86_64
+# define __REG_si %rsi
+# define __CPU_num %gs:pda_cpunumber
+#else
+# define __REG_si %esi
+# define __CPU_num TI_cpu(%ebp)
+#endif
+
+#ifdef CONFIG_SMP
+#define GET_VCPU_INFO		movl __CPU_num,%esi			; \
+				shl $sizeof_vcpu_shift,%esi		; \
+				add HYPERVISOR_shared_info,__REG_si
+#else
+#define GET_VCPU_INFO		mov HYPERVISOR_shared_info,__REG_si
+#endif
+
+#define __DISABLE_INTERRUPTS	movb $1,evtchn_upcall_mask(__REG_si)
+#define __ENABLE_INTERRUPTS	movb $0,evtchn_upcall_mask(__REG_si)
+#define __TEST_PENDING		testb $0xFF,evtchn_upcall_pending(__REG_si)
+#define DISABLE_INTERRUPTS(clb)	GET_VCPU_INFO				; \
+				__DISABLE_INTERRUPTS
+#define ENABLE_INTERRUPTS(clb)	GET_VCPU_INFO				; \
+				__ENABLE_INTERRUPTS
+
+#ifndef CONFIG_X86_64
+#define INTERRUPT_RETURN		iret
+#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS		; \
+sysexit_scrit:	/**** START OF SYSEXIT CRITICAL REGION ****/		; \
+	__TEST_PENDING							; \
+	jnz  14f	/* process more events if necessary... */	; \
+	movl PT_ESI(%esp), %esi						; \
+	sysexit								; \
+14:	__DISABLE_INTERRUPTS						; \
+	TRACE_IRQS_OFF							; \
+sysexit_ecrit:	/**** END OF SYSEXIT CRITICAL REGION ****/		; \
+	mov  $__KERNEL_PERCPU, %ecx					; \
+	push %esp							; \
+	mov  %ecx, %fs							; \
+	call evtchn_do_upcall						; \
+	add  $4,%esp							; \
+	jmp  ret_from_intr
+#endif
+
+
+#endif /* __ASSEMBLY__ */
+
+#ifndef __ASSEMBLY__
+#define raw_local_save_flags(flags) \
+		do { (flags) = __raw_local_save_flags(); } while (0)
+
+#define raw_local_irq_save(flags) \
+		do { (flags) = __raw_local_irq_save(); } while (0)
+
+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+	return (flags != 0);
+}
+
+#define raw_irqs_disabled()						\
+({									\
+	unsigned long flags = __raw_local_save_flags();			\
+									\
+	raw_irqs_disabled_flags(flags);					\
+})
+
+/*
+ * Makes the traced hardirq state match the machine state.
+ *
+ * Should be a rarely used function, only in places where it's
+ * otherwise impossible to know the irq state, like in traps.
+ */
+static inline void trace_hardirqs_fixup_flags(unsigned long flags)
+{
+	if (raw_irqs_disabled_flags(flags))
+		trace_hardirqs_off();
+	else
+		trace_hardirqs_on();
+}
+
+#define trace_hardirqs_fixup() \
+	trace_hardirqs_fixup_flags(__raw_local_save_flags())
+
+#else
+
+#ifdef CONFIG_X86_64
+/*
+ * Currently paravirt can't handle swapgs nicely when we
+ * don't have a stack we can rely on (such as a user space
+ * stack).  So we either find a way around these or just fault
+ * and emulate if a guest tries to call swapgs directly.
+ *
+ * Either way, this is a good way to document that we don't
+ * have a reliable stack. x86_64 only.
+ */
+#define SWAPGS_UNSAFE_STACK	swapgs
+#define ARCH_TRACE_IRQS_ON		call trace_hardirqs_on_thunk
+#define ARCH_TRACE_IRQS_OFF		call trace_hardirqs_off_thunk
+#define ARCH_LOCKDEP_SYS_EXIT		call lockdep_sys_exit_thunk
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ	\
+	TRACE_IRQS_ON; \
+	ENABLE_INTERRUPTS(CLBR_NONE); \
+	SAVE_REST; \
+	LOCKDEP_SYS_EXIT; \
+	RESTORE_REST; \
+	__DISABLE_INTERRUPTS; \
+	TRACE_IRQS_OFF;
+
+#else
+#define ARCH_TRACE_IRQS_ON			\
+	pushl %eax;				\
+	pushl %ecx;				\
+	pushl %edx;				\
+	call trace_hardirqs_on;			\
+	popl %edx;				\
+	popl %ecx;				\
+	popl %eax;
+
+#define ARCH_TRACE_IRQS_OFF			\
+	pushl %eax;				\
+	pushl %ecx;				\
+	pushl %edx;				\
+	call trace_hardirqs_off;		\
+	popl %edx;				\
+	popl %ecx;				\
+	popl %eax;
+
+#define ARCH_LOCKDEP_SYS_EXIT			\
+	pushl %eax;				\
+	pushl %ecx;				\
+	pushl %edx;				\
+	call lockdep_sys_exit;			\
+	popl %edx;				\
+	popl %ecx;				\
+	popl %eax;
+
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+#  define TRACE_IRQS_ON		ARCH_TRACE_IRQS_ON
+#  define TRACE_IRQS_OFF	ARCH_TRACE_IRQS_OFF
+#else
+#  define TRACE_IRQS_ON
+#  define TRACE_IRQS_OFF
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#  define LOCKDEP_SYS_EXIT	ARCH_LOCKDEP_SYS_EXIT
+#  define LOCKDEP_SYS_EXIT_IRQ	ARCH_LOCKDEP_SYS_EXIT_IRQ
+# else
+#  define LOCKDEP_SYS_EXIT
+#  define LOCKDEP_SYS_EXIT_IRQ
+# endif
+
+#endif /* __ASSEMBLY__ */
 #endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/irqflags_32.h	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,214 +0,0 @@
-/*
- * include/asm-i386/irqflags.h
- *
- * IRQ flags handling
- *
- * This file gets included from lowlevel asm headers too, to provide
- * wrapped versions of the local_irq_*() APIs, based on the
- * raw_local_irq_*() functions from the lowlevel headers.
- */
-#ifndef _ASM_IRQFLAGS_H
-#define _ASM_IRQFLAGS_H
-
-#ifndef __ASSEMBLY__
-#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
-
-#define xen_restore_fl(f)					\
-do {								\
-	vcpu_info_t *_vcpu;					\
-	barrier();						\
-	_vcpu = current_vcpu_info();				\
-	if ((_vcpu->evtchn_upcall_mask = (f)) == 0) {		\
-		barrier(); /* unmask then check (avoid races) */\
-		if (unlikely(_vcpu->evtchn_upcall_pending))	\
-			force_evtchn_callback();		\
-	}							\
-} while (0)
-
-#define xen_irq_disable()					\
-do {								\
-	current_vcpu_info()->evtchn_upcall_mask = 1;		\
-	barrier();						\
-} while (0)
-
-#define xen_irq_enable()					\
-do {								\
-	vcpu_info_t *_vcpu;					\
-	barrier();						\
-	_vcpu = current_vcpu_info();				\
-	_vcpu->evtchn_upcall_mask = 0;				\
-	barrier(); /* unmask then check (avoid races) */	\
-	if (unlikely(_vcpu->evtchn_upcall_pending))		\
-		force_evtchn_callback();			\
-} while (0)
-
-void xen_safe_halt(void);
-
-void xen_halt(void);
-
-/*
- * The use of 'barrier' in the following reflects their use as local-lock
- * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
- * critical operations are executed. All critical operations must complete
- * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
- * includes these barriers, for example.
- */
-
-#define __raw_local_save_flags() xen_save_fl()
-
-#define raw_local_irq_restore(flags) xen_restore_fl(flags)
-
-#define raw_local_irq_disable()	xen_irq_disable()
-
-#define raw_local_irq_enable() xen_irq_enable()
-
-/*
- * Used in the idle loop; sti takes one instruction cycle
- * to complete:
- */
-static inline void raw_safe_halt(void)
-{
-	xen_safe_halt();
-}
-
-/*
- * Used when interrupts are already enabled or to
- * shutdown the processor:
- */
-static inline void halt(void)
-{
-	xen_halt();
-}
-
-/*
- * For spinlocks, etc:
- */
-#define __raw_local_irq_save()						\
-({									\
-	unsigned long flags = __raw_local_save_flags();			\
-									\
-	raw_local_irq_disable();					\
-									\
-	flags;								\
-})
-
-#else
-/* Offsets into shared_info_t. */
-#define evtchn_upcall_pending		/* 0 */
-#define evtchn_upcall_mask		1
-
-#define sizeof_vcpu_shift		6
-
-#ifdef CONFIG_SMP
-#define GET_VCPU_INFO		movl TI_cpu(%ebp),%esi			; \
-				shl  $sizeof_vcpu_shift,%esi		; \
-				addl HYPERVISOR_shared_info,%esi
-#else
-#define GET_VCPU_INFO		movl HYPERVISOR_shared_info,%esi
-#endif
-
-#define __DISABLE_INTERRUPTS	movb $1,evtchn_upcall_mask(%esi)
-#define __ENABLE_INTERRUPTS	movb $0,evtchn_upcall_mask(%esi)
-#define __TEST_PENDING		testb $0xFF,evtchn_upcall_pending(%esi)
-#define DISABLE_INTERRUPTS(clb)	GET_VCPU_INFO				; \
-				__DISABLE_INTERRUPTS
-#define ENABLE_INTERRUPTS(clb)	GET_VCPU_INFO				; \
-				__ENABLE_INTERRUPTS
-#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS			; \
-sysexit_scrit:	/**** START OF SYSEXIT CRITICAL REGION ****/		; \
-	__TEST_PENDING							; \
-	jnz  14f	/* process more events if necessary... */	; \
-	movl PT_ESI(%esp), %esi						; \
-	sysexit								; \
-14:	__DISABLE_INTERRUPTS						; \
-	TRACE_IRQS_OFF							; \
-sysexit_ecrit:	/**** END OF SYSEXIT CRITICAL REGION ****/		; \
-	mov  $__KERNEL_PERCPU, %ecx					; \
-	push %esp							; \
-	mov  %ecx, %fs							; \
-	call evtchn_do_upcall						; \
-	add  $4,%esp							; \
-	jmp  ret_from_intr
-#define INTERRUPT_RETURN	iret
-#endif /* __ASSEMBLY__ */
-
-#ifndef __ASSEMBLY__
-#define raw_local_save_flags(flags) \
-		do { (flags) = __raw_local_save_flags(); } while (0)
-
-#define raw_local_irq_save(flags) \
-		do { (flags) = __raw_local_irq_save(); } while (0)
-
-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
-	return (flags != 0);
-}
-
-#define raw_irqs_disabled()						\
-({									\
-	unsigned long flags = __raw_local_save_flags();			\
-									\
-	raw_irqs_disabled_flags(flags);					\
-})
-
-/*
- * makes the traced hardirq state match with the machine state
- *
- * should be a rarely used function, only in places where its
- * otherwise impossible to know the irq state, like in traps.
- */
-static inline void trace_hardirqs_fixup_flags(unsigned long flags)
-{
-	if (raw_irqs_disabled_flags(flags))
-		trace_hardirqs_off();
-	else
-		trace_hardirqs_on();
-}
-
-#define trace_hardirqs_fixup() \
-	trace_hardirqs_fixup_flags(__raw_local_save_flags())
-#endif /* __ASSEMBLY__ */
-
-/*
- * Do the CPU's IRQ-state tracing from assembly code. We call a
- * C function, so save all the C-clobbered registers:
- */
-#ifdef CONFIG_TRACE_IRQFLAGS
-
-# define TRACE_IRQS_ON				\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_on;			\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
-# define TRACE_IRQS_OFF				\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_off;		\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
-#else
-# define TRACE_IRQS_ON
-# define TRACE_IRQS_OFF
-#endif
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define LOCKDEP_SYS_EXIT			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call lockdep_sys_exit;			\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-#else
-# define LOCKDEP_SYS_EXIT
-#endif
-
-#endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/irqflags_64.h	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,178 +0,0 @@
-/*
- * include/asm-x86_64/irqflags.h
- *
- * IRQ flags handling
- *
- * This file gets included from lowlevel asm headers too, to provide
- * wrapped versions of the local_irq_*() APIs, based on the
- * raw_local_irq_*() functions from the lowlevel headers.
- */
-#ifndef _ASM_IRQFLAGS_H
-#define _ASM_IRQFLAGS_H
-#include <asm/processor-flags.h>
-
-#ifndef __ASSEMBLY__
-/*
- * Interrupt control:
- */
-
-/*
- * The use of 'barrier' in the following reflects their use as local-lock
- * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
- * critical operations are executed. All critical operations must complete
- * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
- * includes these barriers, for example.
- */
-
-#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
-
-#define raw_local_save_flags(flags) \
-		do { (flags) = __raw_local_save_flags(); } while (0)
-
-#define raw_local_irq_restore(x)					\
-do {									\
-	vcpu_info_t *_vcpu;						\
-	barrier();							\
-	_vcpu = current_vcpu_info();		\
-	if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {			\
-		barrier(); /* unmask then check (avoid races) */	\
-		if ( unlikely(_vcpu->evtchn_upcall_pending) )		\
-			force_evtchn_callback();			\
-	}								\
-} while (0)
-
-#ifdef CONFIG_X86_VSMP
-
-/*
- * Interrupt control for the VSMP architecture:
- */
-
-static inline void raw_local_irq_disable(void)
-{
-	unsigned long flags = __raw_local_save_flags();
-
-	raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
-}
-
-static inline void raw_local_irq_enable(void)
-{
-	unsigned long flags = __raw_local_save_flags();
-
-	raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
-}
-
-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
-	return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
-}
-
-#else /* CONFIG_X86_VSMP */
-
-#define raw_local_irq_disable()						\
-do {									\
-	current_vcpu_info()->evtchn_upcall_mask = 1;					\
-	barrier();							\
-} while (0)
-
-#define raw_local_irq_enable()						\
-do {									\
-	vcpu_info_t *_vcpu;						\
-	barrier();							\
-	_vcpu = current_vcpu_info();		\
-	_vcpu->evtchn_upcall_mask = 0;					\
-	barrier(); /* unmask then check (avoid races) */		\
-	if ( unlikely(_vcpu->evtchn_upcall_pending) )			\
-		force_evtchn_callback();				\
-} while (0)
-
-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
-	return (flags != 0);
-}
-
-#endif
-
-/*
- * For spinlocks, etc.:
- */
-
-#define __raw_local_irq_save()						\
-({									\
-	unsigned long flags = __raw_local_save_flags();			\
-									\
-	raw_local_irq_disable();					\
-									\
-	flags;								\
-})
-
-#define raw_local_irq_save(flags) \
-		do { (flags) = __raw_local_irq_save(); } while (0)
-
-#define raw_irqs_disabled()						\
-({									\
-	unsigned long flags = __raw_local_save_flags();			\
-									\
-	raw_irqs_disabled_flags(flags);					\
-})
-
-/*
- * makes the traced hardirq state match with the machine state
- *
- * should be a rarely used function, only in places where its
- * otherwise impossible to know the irq state, like in traps.
- */
-static inline void trace_hardirqs_fixup_flags(unsigned long flags)
-{
-	if (raw_irqs_disabled_flags(flags))
-		trace_hardirqs_off();
-	else
-		trace_hardirqs_on();
-}
-
-#define trace_hardirqs_fixup() \
-	trace_hardirqs_fixup_flags(__raw_local_save_flags())
-/*
- * Used in the idle loop; sti takes one instruction cycle
- * to complete:
- */
-void xen_safe_halt(void);
-static inline void raw_safe_halt(void)
-{
-	xen_safe_halt();
-}
-
-/*
- * Used when interrupts are already enabled or to
- * shutdown the processor:
- */
-void xen_halt(void);
-static inline void halt(void)
-{
-	xen_halt();
-}
-
-#else /* __ASSEMBLY__: */
-# ifdef CONFIG_TRACE_IRQFLAGS
-#  define TRACE_IRQS_ON		call trace_hardirqs_on_thunk
-#  define TRACE_IRQS_OFF	call trace_hardirqs_off_thunk
-# else
-#  define TRACE_IRQS_ON
-#  define TRACE_IRQS_OFF
-# endif
-# ifdef CONFIG_DEBUG_LOCK_ALLOC
-#  define LOCKDEP_SYS_EXIT	call lockdep_sys_exit_thunk
-#  define LOCKDEP_SYS_EXIT_IRQ	\
-	TRACE_IRQS_ON; \
-	sti; \
-	SAVE_REST; \
-	LOCKDEP_SYS_EXIT; \
-	RESTORE_REST; \
-	cli; \
-	TRACE_IRQS_OFF;
-# else
-#  define LOCKDEP_SYS_EXIT
-#  define LOCKDEP_SYS_EXIT_IRQ
-# endif
-#endif
-
-#endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/maddr_32.h	2010-03-24 15:09:23.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/maddr_32.h	2010-03-24 15:10:37.000000000 +0100
@@ -1,6 +1,7 @@
 #ifndef _I386_MADDR_H
 #define _I386_MADDR_H

+#include <asm/bug.h>
 #include <xen/features.h>
 #include <xen/interface/xen.h>

@@ -151,25 +152,9 @@ static inline paddr_t pte_machine_to_phy
 	phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
 	return phys;
 }
-#endif
-
-#ifdef CONFIG_X86_PAE
-#define __pte_ma(x)	((pte_t) { (x), (maddr_t)(x) >> 32 } )
-extern unsigned long long __supported_pte_mask;
-static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
-{
-	pte_t pte;
-
-	pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
-					(pgprot_val(pgprot) >> 32);
-	pte.pte_high &= (__supported_pte_mask >> 32);
-	pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
-							__supported_pte_mask;
-	return pte;
-}
 #else
-#define __pte_ma(x)	((pte_t) { (x) } )
-#define pfn_pte_ma(pfn, prot)	__pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
+#define pte_phys_to_machine phys_to_machine
+#define pte_machine_to_phys machine_to_phys
 #endif

 #else /* !CONFIG_XEN */
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/maddr_64.h	2007-06-12 13:14:13.000000000 +0200
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/maddr_64.h	2010-03-24 15:10:37.000000000 +0100
@@ -1,6 +1,7 @@
 #ifndef _X86_64_MADDR_H
 #define _X86_64_MADDR_H

+#include <asm/bug.h>
 #include <xen/features.h>
 #include <xen/interface/xen.h>

@@ -16,6 +17,7 @@ typedef unsigned long maddr_t;
 #ifdef CONFIG_XEN

 extern unsigned long *phys_to_machine_mapping;
+extern unsigned long  max_mapnr;

 #undef machine_to_phys_mapping
 extern unsigned long *machine_to_phys_mapping;
@@ -25,7 +27,7 @@ static inline unsigned long pfn_to_mfn(u
 {
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return pfn;
-	BUG_ON(end_pfn && pfn >= end_pfn);
+	BUG_ON(max_mapnr && pfn >= max_mapnr);
 	return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
 }

@@ -33,7 +35,7 @@ static inline int phys_to_machine_mappin
 {
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return 1;
-	BUG_ON(end_pfn && pfn >= end_pfn);
+	BUG_ON(max_mapnr && pfn >= max_mapnr);
 	return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
 }

@@ -45,7 +47,7 @@ static inline unsigned long mfn_to_pfn(u
 		return mfn;

 	if (unlikely((mfn >> machine_to_phys_order) != 0))
-		return end_pfn;
+		return max_mapnr;

 	/* The array access can fail (e.g., device space beyond end of RAM). */
 	asm (
@@ -60,7 +62,7 @@ static inline unsigned long mfn_to_pfn(u
 		"	.quad 1b,3b\n"
 		".previous"
 		: "=r" (pfn)
-		: "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
+		: "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );

 	return pfn;
 }
@@ -88,16 +90,16 @@ static inline unsigned long mfn_to_pfn(u
 static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
 {
 	unsigned long pfn = mfn_to_pfn(mfn);
-	if ((pfn < end_pfn)
+	if ((pfn < max_mapnr)
 	    && !xen_feature(XENFEAT_auto_translated_physmap)
 	    && (phys_to_machine_mapping[pfn] != mfn))
-		return end_pfn; /* force !pfn_valid() */
+		return max_mapnr; /* force !pfn_valid() */
 	return pfn;
 }

 static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
-	BUG_ON(end_pfn && pfn >= end_pfn);
+	BUG_ON(max_mapnr && pfn >= max_mapnr);
 	if (xen_feature(XENFEAT_auto_translated_physmap)) {
 		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
 		return;
@@ -135,9 +137,6 @@ static inline paddr_t pte_machine_to_phy
 	return phys;
 }

-#define __pte_ma(x)     ((pte_t) { (x) } )
-#define pfn_pte_ma(pfn, prot)	__pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
-
 #else /* !CONFIG_XEN */

 #define pfn_to_mfn(pfn) (pfn)
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h	2010-03-24 15:09:23.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/mmu_context_32.h	2010-03-24 15:10:37.000000000 +0100
@@ -51,8 +51,6 @@ static inline void __prepare_arch_switch
 		: : "r" (0) );
 }

-void leave_mm(unsigned long cpu);
-
 static inline void switch_mm(struct mm_struct *prev,
 			     struct mm_struct *next,
 			     struct task_struct *tsk)
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/mmu_context_64.h	2010-03-24 15:09:23.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/mmu_context_64.h	2010-03-24 15:10:37.000000000 +0100
@@ -62,12 +62,6 @@ extern void mm_pin(struct mm_struct *mm)
 extern void mm_unpin(struct mm_struct *mm);
 void mm_pin_all(void);

-static inline void load_cr3(pgd_t *pgd)
-{
-	asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
-		     "memory");
-}
-
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			     struct task_struct *tsk)
 {
@@ -97,7 +91,7 @@ static inline void switch_mm(struct mm_s
 		op++;

 		if (unlikely(next->context.ldt != prev->context.ldt)) {
-			/* load_LDT_nolock(&next->context, cpu) */
+			/* load_LDT_nolock(&next->context) */
 			op->cmd = MMUEXT_SET_LDT;
 			op->arg1.linear_addr = (unsigned long)next->context.ldt;
 			op->arg2.nr_ents     = next->context.size;
@@ -110,7 +104,7 @@ static inline void switch_mm(struct mm_s
 	else {
 		write_pda(mmu_state, TLBSTATE_OK);
 		if (read_pda(active_mm) != next)
-			out_of_line_bug();
+			BUG();
 		if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
 			/* We were in lazy tlb mode and leave_mm disabled
 			 * tlb flush IPI delivery. We must reload CR3
@@ -118,7 +112,7 @@ static inline void switch_mm(struct mm_s
 			 */
                         load_cr3(next->pgd);
                         xen_new_user_pt(__pa(__user_pgd(next->pgd)));
-			load_LDT_nolock(&next->context, cpu);
+			load_LDT_nolock(&next->context);
 		}
 	}
 #endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pci.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/pci.h	2010-03-24 15:10:37.000000000 +0100
@@ -71,6 +71,7 @@ extern int pci_mmap_page_range(struct pc


 #ifdef CONFIG_PCI
+extern void early_quirks(void);
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -78,9 +79,10 @@ static inline void pci_dma_burst_advice(
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+#else
+static inline void early_quirks(void) { }
 #endif

-
 #endif  /* __KERNEL__ */

 #ifdef CONFIG_X86_32
@@ -95,6 +97,19 @@ static inline void pci_dma_burst_advice(
 /* generic pci stuff */
 #include <asm-generic/pci.h>

+#ifdef CONFIG_NUMA
+/* Returns the node based on pci bus */
+static inline int __pcibus_to_node(struct pci_bus *bus)
+{
+	struct pci_sysdata *sd = bus->sysdata;
+
+	return sd->node;
+}

+static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
+{
+	return node_to_cpumask(__pcibus_to_node(bus));
+}
+#endif

 #endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgalloc_32.h	2010-03-24 15:09:23.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/pgalloc_32.h	2010-03-24 15:10:37.000000000 +0100
@@ -3,69 +3,109 @@

 #include <linux/threads.h>
 #include <linux/mm.h>		/* for struct page */
+#include <linux/pagemap.h>
+#include <asm/tlb.h>
+#include <asm-generic/tlb.h>
 #include <asm/io.h>		/* for phys_to_virt and page_to_pseudophys */

 #define paravirt_alloc_pt(mm, pfn) do { } while (0)
-#define paravirt_alloc_pd(pfn) do { } while (0)
-#define paravirt_alloc_pd(pfn) do { } while (0)
+#define paravirt_alloc_pd(mm, pfn) do { } while (0)
 #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
 #define paravirt_release_pt(pfn) do { } while (0)
 #define paravirt_release_pd(pfn) do { } while (0)

-#define pmd_populate_kernel(mm, pmd, pte)			\
-do {								\
-	paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);		\
-	set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));		\
-} while (0)
-
-#define pmd_populate(mm, pmd, pte) 					\
-do {									\
-	unsigned long pfn = page_to_pfn(pte);				\
-	paravirt_alloc_pt(mm, pfn);					\
-	if (PagePinned(virt_to_page((mm)->pgd))) {			\
-		if (!PageHighMem(pte))					\
-			BUG_ON(HYPERVISOR_update_va_mapping(		\
-			  (unsigned long)__va(pfn << PAGE_SHIFT),	\
-			  pfn_pte(pfn, PAGE_KERNEL_RO), 0));		\
-		else if (!test_and_set_bit(PG_pinned, &pte->flags))	\
-			kmap_flush_unused();				\
-		set_pmd(pmd,						\
-		        __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \
-	} else							\
-		*(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \
-} while (0)
+static inline void pmd_populate_kernel(struct mm_struct *mm,
+				       pmd_t *pmd, pte_t *pte)
+{
+	paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
+	set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
+{
+	unsigned long pfn = page_to_pfn(pte);
+
+	paravirt_alloc_pt(mm, pfn);
+	if (PagePinned(virt_to_page(mm->pgd))) {
+		if (!PageHighMem(pte))
+			BUG_ON(HYPERVISOR_update_va_mapping(
+			  (unsigned long)__va(pfn << PAGE_SHIFT),
+			  pfn_pte(pfn, PAGE_KERNEL_RO), 0));
+		else if (!test_and_set_bit(PG_pinned, &pte->flags))
+			kmap_flush_unused();
+		set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
+	} else
+		*pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
+}
+#define pmd_pgtable(pmd) pmd_page(pmd)

 /*
  * Allocate and free page tables.
  */
+extern void pgd_test_and_unpin(pgd_t *);
 extern pgd_t *pgd_alloc(struct mm_struct *);
-extern void pgd_free(pgd_t *pgd);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);

 extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
-extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
+extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);

-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
 	free_page((unsigned long)pte);
 }

-extern void pte_free(struct page *pte);
+extern void __pte_free(pgtable_t);
+static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
+{
+	__pte_free(pte);
+}
+

-#define __pte_free_tlb(tlb,pte) 					\
-do {									\
-	paravirt_release_pt(page_to_pfn(pte));				\
-	tlb_remove_page((tlb),(pte));					\
-} while (0)
+extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);

 #ifdef CONFIG_X86_PAE
 /*
  * In the PAE case we free the pmds as part of the pgd.
  */
-#define pmd_alloc_one(mm, addr)		({ BUG(); ((pmd_t *)2); })
-#define pmd_free(x)			do { } while (0)
-#define __pmd_free_tlb(tlb,x)		do { } while (0)
-#define pud_populate(mm, pmd, pte)	BUG()
-#endif
+extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
+
+extern void __pmd_free(pgtable_t);
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+	BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+	__pmd_free(virt_to_page(pmd));
+}
+
+extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+
+static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+	struct page *page = virt_to_page(pmd);
+	unsigned long pfn = page_to_pfn(page);
+
+	paravirt_alloc_pd(mm, pfn);
+
+	/* Note: almost everything apart from _PAGE_PRESENT is
+	   reserved at the pmd (PDPT) level. */
+	if (PagePinned(virt_to_page(mm->pgd))) {
+		BUG_ON(PageHighMem(page));
+		BUG_ON(HYPERVISOR_update_va_mapping(
+			  (unsigned long)__va(pfn << PAGE_SHIFT),
+			  pfn_pte(pfn, PAGE_KERNEL_RO), 0));
+		set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+	} else
+		*pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
+
+	/*
+	 * According to Intel App note "TLBs, Paging-Structure Caches,
+	 * and Their Invalidation", April 2007, document 317080-001,
+	 * section 8.1: in PAE mode we explicitly have to flush the
+	 * TLB via cr3 if the top-level pgd is changed...
+	 */
+	if (mm == current->active_mm)
+		xen_tlb_flush();
+}
+#endif	/* CONFIG_X86_PAE */

 #endif /* _I386_PGALLOC_H */
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgalloc_64.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/pgalloc_64.h	2010-03-24 15:10:37.000000000 +0100
@@ -6,30 +6,13 @@
 #include <linux/mm.h>
 #include <asm/io.h>		/* for phys_to_virt and page_to_pseudophys */

-#include <xen/features.h>
-void make_page_readonly(void *va, unsigned int feature);
-void make_page_writable(void *va, unsigned int feature);
-void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
-void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
+pmd_t *early_get_pmd(unsigned long va);
+void early_make_page_readonly(void *va, unsigned int feature);

 #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)

-static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
-{
-	set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
-}
-
-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
-{
-	if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
-		BUG_ON(HYPERVISOR_update_va_mapping(
-			       (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
-			       pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
-		set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
-	} else {
-		*(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
-	}
-}
+#define pmd_populate_kernel(mm, pmd, pte) \
+		set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))

 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 {
@@ -63,53 +46,58 @@ static inline void pgd_populate(struct m
 	}
 }

-extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr);
-extern void pte_free(struct page *pte);
+#define pmd_pgtable(pmd) pmd_page(pmd)

-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
 {
-	struct page *pg;
-
-	pg = pte_alloc_one(mm, addr);
-	return pg ? page_address(pg) : NULL;
+	if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
+		BUG_ON(HYPERVISOR_update_va_mapping(
+			       (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
+			       pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
+		set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
+	} else {
+		*(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
+	}
 }

-static inline void pmd_free(pmd_t *pmd)
+extern void __pmd_free(pgtable_t);
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
 	BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
-	pte_free(virt_to_page(pmd));
+	__pmd_free(virt_to_page(pmd));
 }

+extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
+
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	struct page *pg;
-
-	pg = pte_alloc_one(mm, addr);
-	return pg ? page_address(pg) : NULL;
+	return (pud_t *)pmd_alloc_one(mm, addr);
 }

-static inline void pud_free(pud_t *pud)
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
 	BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
-	pte_free(virt_to_page(pud));
+	__pmd_free(virt_to_page(pud));
 }

 static inline void pgd_list_add(pgd_t *pgd)
 {
 	struct page *page = virt_to_page(pgd);
+	unsigned long flags;

-	spin_lock(&pgd_lock);
+	spin_lock_irqsave(&pgd_lock, flags);
 	list_add(&page->lru, &pgd_list);
-	spin_unlock(&pgd_lock);
+	spin_unlock_irqrestore(&pgd_lock, flags);
 }

 static inline void pgd_list_del(pgd_t *pgd)
 {
 	struct page *page = virt_to_page(pgd);
+	unsigned long flags;

-	spin_lock(&pgd_lock);
+	spin_lock_irqsave(&pgd_lock, flags);
 	list_del(&page->lru);
-	spin_unlock(&pgd_lock);
+	spin_unlock_irqrestore(&pgd_lock, flags);
 }

 extern void pgd_test_and_unpin(pgd_t *);
@@ -145,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm
 	return pgd;
 }

-static inline void pgd_free(pgd_t *pgd)
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 	pgd_test_and_unpin(pgd);
 	pgd_list_del(pgd);
@@ -161,17 +149,30 @@ static inline pte_t *pte_alloc_one_kerne
 	return pte;
 }

+extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
+
 /* Should really implement gc for free page table pages. This could be
    done with a reference count in struct page. */

-static inline void pte_free_kernel(pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
 	BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
 	make_page_writable(pte, XENFEAT_writable_page_tables);
 	free_page((unsigned long)pte);
 }

-#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
+extern void __pte_free(pgtable_t);
+static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
+{
+	__pte_free(pte);
+}
+
+#define __pte_free_tlb(tlb,pte)				\
+do {							\
+	pgtable_page_dtor((pte));				\
+	tlb_remove_page((tlb), (pte));			\
+} while (0)
+
 #define __pmd_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))
 #define __pud_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))

--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgtable.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/pgtable.h	2010-03-24 15:10:37.000000000 +0100
@@ -1,5 +1,467 @@
+#ifndef _ASM_X86_PGTABLE_H
+#define _ASM_X86_PGTABLE_H
+
+#define USER_PTRS_PER_PGD	((TASK_SIZE-1)/PGDIR_SIZE+1)
+#define FIRST_USER_ADDRESS	0
+
+#define _PAGE_BIT_PRESENT	0
+#define _PAGE_BIT_RW		1
+#define _PAGE_BIT_USER		2
+#define _PAGE_BIT_PWT		3
+#define _PAGE_BIT_PCD		4
+#define _PAGE_BIT_ACCESSED	5
+#define _PAGE_BIT_DIRTY		6
+#define _PAGE_BIT_FILE		6
+#define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page */
+#define _PAGE_BIT_PAT		7	/* on 4KB pages */
+#define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
+#define _PAGE_BIT_IO		9	/* Mapped page is I/O or foreign and
+					 * has no associated page struct. */
+#define _PAGE_BIT_UNUSED2	10	/* available for programmer */
+#define _PAGE_BIT_UNUSED3	11
+#define _PAGE_BIT_PAT_LARGE	12	/* On 2MB or 1GB pages */
+#define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
+
+/*
+ * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
+ * sign-extended value on 32-bit with all 1's in the upper word,
+ * which preserves the upper pte values on 64-bit ptes:
+ */
+#define _PAGE_PRESENT	(_AC(1, L)<<_PAGE_BIT_PRESENT)
+#define _PAGE_RW	(_AC(1, L)<<_PAGE_BIT_RW)
+#define _PAGE_USER	(_AC(1, L)<<_PAGE_BIT_USER)
+#define _PAGE_PWT	(_AC(1, L)<<_PAGE_BIT_PWT)
+#define _PAGE_PCD	(_AC(1, L)<<_PAGE_BIT_PCD)
+#define _PAGE_ACCESSED	(_AC(1, L)<<_PAGE_BIT_ACCESSED)
+#define _PAGE_DIRTY	(_AC(1, L)<<_PAGE_BIT_DIRTY)
+#define _PAGE_PSE	(_AC(1, L)<<_PAGE_BIT_PSE)	/* 2MB page */
+#define _PAGE_GLOBAL	(_AC(1, L)<<_PAGE_BIT_GLOBAL)	/* Global TLB entry */
+#define _PAGE_IO	(_AC(1, L)<<_PAGE_BIT_IO)
+#define _PAGE_UNUSED2	(_AC(1, L)<<_PAGE_BIT_UNUSED2)
+#define _PAGE_UNUSED3	(_AC(1, L)<<_PAGE_BIT_UNUSED3)
+#define _PAGE_PAT	(_AC(1, L)<<_PAGE_BIT_PAT)
+#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_NX	(_AC(1, ULL) << _PAGE_BIT_NX)
+#else
+#define _PAGE_NX	0
+#endif
+
+/* If _PAGE_PRESENT is clear, we use these: */
+#define _PAGE_FILE	_PAGE_DIRTY	/* nonlinear file mapping, saved PTE; unset:swap */
+#define _PAGE_PROTNONE	_PAGE_PSE	/* if the user mapped it with PROT_NONE;
+					   pte_present gives true */
+
+#ifndef __ASSEMBLY__
+#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
+extern unsigned int __kernel_page_user;
+#else
+#define __kernel_page_user 0
+#endif
+#endif
+
+#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
+
+#define _PAGE_CHG_MASK	(PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
+
+#define PAGE_NONE	__pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
+#define PAGE_SHARED	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+
+#define PAGE_SHARED_EXEC	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_COPY_NOEXEC	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_COPY_EXEC		__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_COPY		PAGE_COPY_NOEXEC
+#define PAGE_READONLY		__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_READONLY_EXEC	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
+
+#ifdef CONFIG_X86_32
+#define _PAGE_KERNEL_EXEC \
+	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
+#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
+
+#ifndef __ASSEMBLY__
+extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
+#endif	/* __ASSEMBLY__ */
+#else
+#define __PAGE_KERNEL_EXEC						\
+	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
+#define __PAGE_KERNEL		(__PAGE_KERNEL_EXEC | _PAGE_NX)
+#endif
+
+#define __PAGE_KERNEL_RO		(__PAGE_KERNEL & ~_PAGE_RW)
+#define __PAGE_KERNEL_RX		(__PAGE_KERNEL_EXEC & ~_PAGE_RW)
+#define __PAGE_KERNEL_EXEC_NOCACHE	(__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_NOCACHE		(__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_UC_MINUS		(__PAGE_KERNEL | _PAGE_PCD)
+#define __PAGE_KERNEL_VSYSCALL		(__PAGE_KERNEL_RX | _PAGE_USER)
+#define __PAGE_KERNEL_VSYSCALL_NOCACHE	(__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_LARGE		(__PAGE_KERNEL | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_EXEC	(__PAGE_KERNEL_EXEC | _PAGE_PSE)
+
+/*
+ * We don't support GLOBAL pages in xenolinux64
+ */
+#define MAKE_GLOBAL(x)			__pgprot((x))
+
+#define PAGE_KERNEL			MAKE_GLOBAL(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO			MAKE_GLOBAL(__PAGE_KERNEL_RO)
+#define PAGE_KERNEL_EXEC		MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX			MAKE_GLOBAL(__PAGE_KERNEL_RX)
+#define PAGE_KERNEL_NOCACHE		MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_UC_MINUS		MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
+#define PAGE_KERNEL_EXEC_NOCACHE	MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
+#define PAGE_KERNEL_LARGE		MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_LARGE_EXEC		MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
+#define PAGE_KERNEL_VSYSCALL		MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VSYSCALL_NOCACHE	MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+
+/*         xwr */
+#define __P000	PAGE_NONE
+#define __P001	PAGE_READONLY
+#define __P010	PAGE_COPY
+#define __P011	PAGE_COPY
+#define __P100	PAGE_READONLY_EXEC
+#define __P101	PAGE_READONLY_EXEC
+#define __P110	PAGE_COPY_EXEC
+#define __P111	PAGE_COPY_EXEC
+
+#define __S000	PAGE_NONE
+#define __S001	PAGE_READONLY
+#define __S010	PAGE_SHARED
+#define __S011	PAGE_SHARED
+#define __S100	PAGE_READONLY_EXEC
+#define __S101	PAGE_READONLY_EXEC
+#define __S110	PAGE_SHARED_EXEC
+#define __S111	PAGE_SHARED_EXEC
+
+#ifndef __ASSEMBLY__
+
+/*
+ * ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc.
+ */
+extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+
+extern spinlock_t pgd_lock;
+extern struct list_head pgd_list;
+
+/*
+ * The following only work if pte_present() is true.
+ * Undefined behaviour if not.
+ */
+static inline int pte_dirty(pte_t pte)		{ return __pte_val(pte) & _PAGE_DIRTY; }
+static inline int pte_young(pte_t pte)		{ return __pte_val(pte) & _PAGE_ACCESSED; }
+static inline int pte_write(pte_t pte)		{ return __pte_val(pte) & _PAGE_RW; }
+static inline int pte_file(pte_t pte)		{ return __pte_val(pte) & _PAGE_FILE; }
+static inline int pte_huge(pte_t pte)		{ return __pte_val(pte) & _PAGE_PSE; }
+static inline int pte_global(pte_t pte) 	{ return 0; }
+static inline int pte_exec(pte_t pte)		{ return !(__pte_val(pte) & _PAGE_NX); }
+
+static inline int pmd_large(pmd_t pte) {
+	return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
+		(_PAGE_PSE|_PAGE_PRESENT);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)	{ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
+static inline pte_t pte_mkold(pte_t pte)	{ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
+static inline pte_t pte_wrprotect(pte_t pte)	{ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
+static inline pte_t pte_mkexec(pte_t pte)	{ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
+static inline pte_t pte_mkdirty(pte_t pte)	{ return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
+static inline pte_t pte_mkyoung(pte_t pte)	{ return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
+static inline pte_t pte_mkwrite(pte_t pte)	{ return __pte_ma(__pte_val(pte) | _PAGE_RW); }
+static inline pte_t pte_mkhuge(pte_t pte)	{ return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
+static inline pte_t pte_clrhuge(pte_t pte)	{ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
+static inline pte_t pte_mkglobal(pte_t pte)	{ return pte; }
+static inline pte_t pte_clrglobal(pte_t pte)	{ return pte; }
+
+extern pteval_t __supported_pte_mask;
+
+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+	return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
+		      pgprot_val(pgprot)) & __supported_pte_mask);
+}
+
+static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
+{
+	return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) |
+			 pgprot_val(pgprot)) & __supported_pte_mask);
+}
+
+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
+{
+	return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
+		      pgprot_val(pgprot)) & __supported_pte_mask);
+}
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+	pteval_t val = pte_val(pte);
+
+	val &= _PAGE_CHG_MASK;
+	val |= pgprot_val(newprot) & __supported_pte_mask;
+
+	return __pte(val);
+}
+
+#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
+
+#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
+
+#define set_pte(ptep, pte)		xen_set_pte(ptep, pte)
+#define set_pte_at(mm, addr, ptep, pte)	xen_set_pte_at(mm, addr, ptep, pte)
+
+#define set_pte_atomic(ptep, pte)					\
+	xen_set_pte_atomic(ptep, pte)
+
+#define set_pmd(pmdp, pmd)		xen_set_pmd(pmdp, pmd)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define set_pgd(pgdp, pgd)		xen_set_pgd(pgdp, pgd)
+#define pgd_clear(pgd)			xen_pgd_clear(pgd)
+#endif
+
+#ifndef set_pud
+# define set_pud(pudp, pud)		xen_set_pud(pudp, pud)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pud_clear(pud)			xen_pud_clear(pud)
+#endif
+
+#define pte_clear(mm, addr, ptep)	xen_pte_clear(mm, addr, ptep)
+#define pmd_clear(pmd)			xen_pmd_clear(pmd)
+
+#define pte_update(mm, addr, ptep)              do { } while (0)
+#define pte_update_defer(mm, addr, ptep)        do { } while (0)
+
+#endif	/* __ASSEMBLY__ */
+
 #ifdef CONFIG_X86_32
 # include "pgtable_32.h"
 #else
 # include "pgtable_64.h"
 #endif
+
+#ifndef __ASSEMBLY__
+
+enum {
+	PG_LEVEL_NONE,
+	PG_LEVEL_4K,
+	PG_LEVEL_2M,
+	PG_LEVEL_1G,
+};
+
+/*
+ * Helper function that returns the kernel pagetable entry controlling
+ * the virtual address 'address'. NULL means no pagetable entry present.
+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
+ * as a pte too.
+ */
+extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+
+/* local pte updates need not use xchg for locking */
+static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
+{
+	xen_set_pte(ptep, __pte(0));
+	return res;
+}
+
+static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+				  pte_t *ptep , pte_t pte)
+{
+	if ((mm != current->mm && mm != &init_mm) ||
+	    HYPERVISOR_update_va_mapping(addr, pte, 0))
+		xen_set_pte(ptep, pte);
+}
+
+static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr,
+				 pte_t *ptep)
+{
+	if ((mm != current->mm && mm != &init_mm)
+	    || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
+		__xen_pte_clear(ptep);
+}
+
+#ifndef CONFIG_PARAVIRT
+/*
+ * Rules for using pte_update - it must be called after any PTE update which
+ * has not been done using the set_pte / pte_clear interfaces.  It is used by
+ * shadow mode hypervisors to resynchronize the shadow page tables.  Kernel PTE
+ * updates should either be sets, clears, or set_pte_atomic for P->P
+ * transitions, which means this hook should only be called for user PTEs.
+ * This hook implies a P->P protection or access change has taken place, which
+ * requires a subsequent TLB flush.  The notification can optionally be delayed
+ * until the TLB flush event by using the pte_update_defer form of the
+ * interface, but care must be taken to ensure that the flush happens while
+ * still holding the same page table lock so that the shadow and primary pages
+ * do not become out of sync on SMP.
+ */
+#define pte_update(mm, addr, ptep)		do { } while (0)
+#define pte_update_defer(mm, addr, ptep)	do { } while (0)
+#endif
+
+/*
+ * We only update the dirty/accessed state if we set
+ * the dirty bit by hand in the kernel, since the hardware
+ * will do the accessed bit for us, and we don't want to
+ * race with other CPUs that might be updating the dirty
+ * bit at the same time.
+ */
+#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+#define ptep_set_access_flags(vma, address, ptep, entry, dirty)		\
+({									\
+	int __changed = !pte_same(*(ptep), entry);			\
+	if (__changed && (dirty)) {					\
+		if ( likely((vma)->vm_mm == current->mm) ) {		\
+			BUG_ON(HYPERVISOR_update_va_mapping(address,	\
+				entry,					\
+				uvm_multi((vma)->vm_mm->cpu_vm_mask) |	\
+					UVMF_INVLPG));			\
+		} else {						\
+			xen_l1_entry_update(ptep, entry);		\
+			flush_tlb_page(vma, address);			\
+		}							\
+	}								\
+	__changed;							\
+})
+
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+#define ptep_test_and_clear_young(vma, addr, ptep) ({			\
+	int __ret = 0;							\
+	if (pte_young(*(ptep)))						\
+		__ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,		\
+					   &(ptep)->pte);		\
+	if (__ret)							\
+		pte_update((vma)->vm_mm, addr, ptep);			\
+	__ret;								\
+})
+
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+#define ptep_clear_flush_young(vma, address, ptep)			\
+({									\
+	pte_t __pte = *(ptep);						\
+	int __young = pte_young(__pte);					\
+	__pte = pte_mkold(__pte);					\
+	if (PagePinned(virt_to_page((vma)->vm_mm->pgd)))		\
+		(void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
+	else if (__young)						\
+		(ptep)->pte_low = __pte.pte_low;			\
+	__young;							\
+})
+
+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
+#define ptep_clear_flush(vma, addr, ptep)			\
+({								\
+	pte_t *__ptep = (ptep);					\
+	pte_t __res = *__ptep;					\
+	if (!pte_none(__res) &&					\
+	    ((vma)->vm_mm != current->mm ||			\
+	     HYPERVISOR_update_va_mapping(addr,	__pte(0),	\
+			uvm_multi((vma)->vm_mm->cpu_vm_mask) |	\
+				UVMF_INVLPG))) {		\
+		__xen_pte_clear(__ptep);			\
+		flush_tlb_page(vma, addr);			\
+	}							\
+	__res;							\
+})
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	pte_t pte = *ptep;
+	if (!pte_none(pte)
+	    && (mm != &init_mm
+	        || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
+		pte = xen_ptep_get_and_clear(ptep, pte);
+		pte_update(mm, addr, ptep);
+	}
+	return pte;
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+#define ptep_get_and_clear_full(mm, addr, ptep, full)		\
+	((full) ? ({						\
+		pte_t *__ptep = (ptep);				\
+		pte_t __res = *__ptep;				\
+		if (!PagePinned(virt_to_page((mm)->pgd)))	\
+			__xen_pte_clear(__ptep);		\
+		else if (!pte_none(__res))			\
+			xen_l1_entry_update(__ptep, __pte(0));	\
+		__res;						\
+	 }) :							\
+	 ptep_get_and_clear(mm, addr, ptep))
+
+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	pte_t pte = *ptep;
+	if (pte_write(pte))
+		set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
+}
+
+#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
+	xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
+
+#define arbitrary_virt_to_machine(va)					\
+({									\
+	unsigned int __lvl;						\
+	pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl);	\
+	BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
+	(((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT)			\
+	 | ((unsigned long)(va) & (PAGE_SIZE - 1)));			\
+})
+
+#ifdef CONFIG_HIGHPTE
+#include <asm/io.h>
+struct page *kmap_atomic_to_page(void *);
+#define ptep_to_machine(ptep)						\
+({									\
+	pte_t *__ptep = (ptep);						\
+	page_to_phys(kmap_atomic_to_page(__ptep))			\
+		| ((unsigned long)__ptep & (PAGE_SIZE - 1));		\
+})
+#else
+#define ptep_to_machine(ptep)	virt_to_machine(ptep)
+#endif
+
+#include <asm-generic/pgtable.h>
+
+#include <xen/features.h>
+void make_page_readonly(void *va, unsigned int feature);
+void make_page_writable(void *va, unsigned int feature);
+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
+void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
+
+struct vm_area_struct;
+
+int direct_remap_pfn_range(struct vm_area_struct *vma,
+                           unsigned long address,
+                           unsigned long mfn,
+                           unsigned long size,
+                           pgprot_t prot,
+                           domid_t  domid);
+int direct_kernel_remap_pfn_range(unsigned long address,
+				  unsigned long mfn,
+				  unsigned long size,
+				  pgprot_t prot,
+				  domid_t  domid);
+int create_lookup_pte_addr(struct mm_struct *mm,
+                           unsigned long address,
+                           uint64_t *ptep);
+int touch_pte_range(struct mm_struct *mm,
+                    unsigned long address,
+                    unsigned long size);
+
+int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+		unsigned long addr, unsigned long end, pgprot_t newprot,
+		int dirty_accountable);
+
+#endif	/* __ASSEMBLY__ */
+
+#endif	/* _ASM_X86_PGTABLE_H */
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h	2010-03-24 15:09:23.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/pgtable-3level.h	2010-03-24 15:10:37.000000000 +0100
@@ -18,16 +18,18 @@
 	printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
 	       &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)

-#define pud_none(pud)				0
-#define pud_bad(pud)				0
-#define pud_present(pud)			1

-/*
- * All present pages with !NX bit are kernel-executable:
- */
-static inline int pte_exec_kernel(pte_t pte)
+static inline int pud_none(pud_t pud)
+{
+	return __pud_val(pud) == 0;
+}
+static inline int pud_bad(pud_t pud)
 {
-	return !(__pte_val(pte) & _PAGE_NX);
+	return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
+}
+static inline int pud_present(pud_t pud)
+{
+	return __pud_val(pud) & _PAGE_PRESENT;
 }

 /* Rules for using set_pte: the pte being assigned *must* be
@@ -44,14 +46,6 @@ static inline void xen_set_pte(pte_t *pt
 	ptep->pte_low = pte.pte_low;
 }

-static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
-				  pte_t *ptep , pte_t pte)
-{
-	if ((mm != current->mm && mm != &init_mm) ||
-	    HYPERVISOR_update_va_mapping(addr, pte, 0))
-		xen_set_pte(ptep, pte);
-}
-
 static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
 	set_64bit((unsigned long long *)(ptep),__pte_val(pte));
@@ -70,14 +64,11 @@ static inline void xen_set_pud(pud_t *pu
  * entry, so clear the bottom half first and enforce ordering with a compiler
  * barrier.
  */
-static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static inline void __xen_pte_clear(pte_t *ptep)
 {
-	if ((mm != current->mm && mm != &init_mm)
-	    || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
-		ptep->pte_low = 0;
-		smp_wmb();
-		ptep->pte_high = 0;
-	}
+	ptep->pte_low = 0;
+	smp_wmb();
+	ptep->pte_high = 0;
 }

 static inline void xen_pmd_clear(pmd_t *pmd)
@@ -85,21 +76,25 @@ static inline void xen_pmd_clear(pmd_t *
 	xen_l2_entry_update(pmd, __pmd(0));
 }

-#define set_pte(ptep, pte)			xen_set_pte(ptep, pte)
-#define set_pte_at(mm, addr, ptep, pte)		xen_set_pte_at(mm, addr, ptep, pte)
-#define set_pte_atomic(ptep, pte)		xen_set_pte_atomic(ptep, pte)
-#define set_pmd(pmdp, pmd)			xen_set_pmd(pmdp, pmd)
-#define set_pud(pudp, pud)			xen_set_pud(pudp, pud)
-#define pte_clear(mm, addr, ptep)		xen_pte_clear(mm, addr, ptep)
-#define pmd_clear(pmd)				xen_pmd_clear(pmd)
+static inline void pud_clear(pud_t *pudp)
+{
+	pgdval_t pgd;
+
+	set_pud(pudp, __pud(0));

-/*
- * Pentium-II erratum A13: in PAE mode we explicitly have to flush
- * the TLB via cr3 if the top-level pgd is changed...
- * We do not let the generic code free and clear pgd entries due to
- * this erratum.
- */
-static inline void pud_clear (pud_t * pud) { }
+	/*
+	 * According to Intel App note "TLBs, Paging-Structure Caches,
+	 * and Their Invalidation", April 2007, document 317080-001,
+	 * section 8.1: in PAE mode we explicitly have to flush the
+	 * TLB via cr3 if the top-level pgd is changed...
+	 *
+	 * Make sure the pud entry we're updating is within the
+	 * current pgd to avoid unnecessary TLB flushes.
+	 */
+	pgd = read_cr3();
+	if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
+		xen_tlb_flush();
+}

 #define pud_page(pud) \
 ((struct page *) __va(pud_val(pud) & PAGE_MASK))
@@ -128,24 +123,6 @@ static inline pte_t xen_ptep_get_and_cle
 #define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
 #endif

-#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
-#define ptep_clear_flush(vma, addr, ptep)			\
-({								\
-	pte_t *__ptep = (ptep);					\
-	pte_t __res = *__ptep;					\
-	if (!pte_none(__res) &&					\
-	    ((vma)->vm_mm != current->mm ||			\
-	     HYPERVISOR_update_va_mapping(addr,	__pte(0),	\
-			(unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
-				UVMF_INVLPG|UVMF_MULTI))) {	\
-		__ptep->pte_low = 0;				\
-		smp_wmb();					\
-		__ptep->pte_high = 0;				\
-		flush_tlb_page(vma, addr);			\
-	}							\
-	__res;							\
-})
-
 #define __HAVE_ARCH_PTE_SAME
 static inline int pte_same(pte_t a, pte_t b)
 {
@@ -168,26 +145,12 @@ static inline int pte_none(pte_t pte)
 		       mfn_to_local_pfn(__pte_mfn(_pte)) :	\
 		       __pte_mfn(_pte))

-extern unsigned long long __supported_pte_mask;
-
-static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
-{
-	return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
-		      pgprot_val(pgprot)) & __supported_pte_mask);
-}
-
-static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
-{
-	return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
-		      pgprot_val(pgprot)) & __supported_pte_mask);
-}
-
 /*
  * Bits 0, 6 and 7 are taken in the low part of the pte,
  * put the 32 bits of offset into the high part.
  */
 #define pte_to_pgoff(pte) ((pte).pte_high)
-#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
+#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
 #define PTE_FILE_MAX_BITS       32

 /* Encode and de-code a swap entry */
@@ -195,8 +158,6 @@ static inline pmd_t pfn_pmd(unsigned lon
 #define __swp_offset(x)			((x).val >> 5)
 #define __swp_entry(type, offset)	((swp_entry_t){(type) | (offset) << 5})
 #define __pte_to_swp_entry(pte)		((swp_entry_t){ (pte).pte_high })
-#define __swp_entry_to_pte(x)		((pte_t){ 0, (x).val })
-
-#define __pmd_free_tlb(tlb, x)		do { } while (0)
+#define __swp_entry_to_pte(x)		((pte_t){ { .pte_high = (x).val } })

 #endif /* _I386_PGTABLE_3LEVEL_H */
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgtable_32.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/pgtable_32.h	2010-03-24 15:10:37.000000000 +0100
@@ -1,8 +1,6 @@
 #ifndef _I386_PGTABLE_H
 #define _I386_PGTABLE_H

-#include <asm/hypervisor.h>
-
 /*
  * The Linux memory management assumes a three-level page table setup. On
  * the i386, we use that, but "fold" the mid level into the top-level page
@@ -25,20 +23,10 @@

 struct vm_area_struct;

-/*
- * ZERO_PAGE is a global shared page that is always zero: used
- * for zero-mapped memory areas etc..
- */
-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
-extern unsigned long empty_zero_page[1024];
 extern pgd_t *swapper_pg_dir;
-extern struct kmem_cache *pmd_cache;
-extern spinlock_t pgd_lock;
-extern struct page *pgd_list;
-void check_pgt_cache(void);

-void pmd_ctor(struct kmem_cache *, void *);
-void pgtable_cache_init(void);
+static inline void pgtable_cache_init(void) { }
+static inline void check_pgt_cache(void) { }
 void paging_init(void);


@@ -58,16 +46,9 @@ void paging_init(void);
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))

-#define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
-#define FIRST_USER_ADDRESS	0
-
 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
 #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)

-#define TWOLEVEL_PGDIR_SHIFT	22
-#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
-#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
-
 /* Just any arbitrary offset to the start of the vmalloc VM area: the
  * current 8MB value just means that there will be a 8MB "hole" after the
  * physical memory until the kernel virtual memory starts.  That means that
@@ -78,121 +59,19 @@ void paging_init(void);
 #define VMALLOC_OFFSET	(8*1024*1024)
 #define VMALLOC_START	(((unsigned long) high_memory + \
 			2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
-#ifdef CONFIG_HIGHMEM
-# define VMALLOC_END	(PKMAP_BASE-2*PAGE_SIZE)
-#else
-# define VMALLOC_END	(FIXADDR_START-2*PAGE_SIZE)
-#endif
-
-/*
- * _PAGE_PSE set in the page directory entry just means that
- * the page directory entry points directly to a 4MB-aligned block of
- * memory.
- */
-#define _PAGE_BIT_PRESENT	0
-#define _PAGE_BIT_RW		1
-#define _PAGE_BIT_USER		2
-#define _PAGE_BIT_PWT		3
-#define _PAGE_BIT_PCD		4
-#define _PAGE_BIT_ACCESSED	5
-#define _PAGE_BIT_DIRTY		6
-#define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page, Pentium+, if present.. */
-#define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
-/*#define _PAGE_BIT_UNUSED1	9*/	/* available for programmer */
-#define _PAGE_BIT_UNUSED2	10
-#define _PAGE_BIT_UNUSED3	11
-#define _PAGE_BIT_NX		63
-
-#define _PAGE_PRESENT	0x001
-#define _PAGE_RW	0x002
-#define _PAGE_USER	0x004
-#define _PAGE_PWT	0x008
-#define _PAGE_PCD	0x010
-#define _PAGE_ACCESSED	0x020
-#define _PAGE_DIRTY	0x040
-#define _PAGE_PSE	0x080	/* 4 MB (or 2MB) page, Pentium+, if present.. */
-#define _PAGE_GLOBAL	0x100	/* Global TLB entry PPro+ */
-/*#define _PAGE_UNUSED1	0x200*/	/* available for programmer */
-#define _PAGE_UNUSED2	0x400
-#define _PAGE_UNUSED3	0x800
-
-/* If _PAGE_PRESENT is clear, we use these: */
-#define _PAGE_FILE	0x040	/* nonlinear file mapping, saved PTE; unset:swap */
-#define _PAGE_PROTNONE	0x080	/* if the user mapped it with PROT_NONE;
-				   pte_present gives true */
 #ifdef CONFIG_X86_PAE
-#define _PAGE_NX	(1ULL<<_PAGE_BIT_NX)
+#define LAST_PKMAP 512
 #else
-#define _PAGE_NX	0
+#define LAST_PKMAP 1024
 #endif

-/* Mapped page is I/O or foreign and has no associated page struct. */
-#define _PAGE_IO	0x200
+#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)

-#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _PAGE_CHG_MASK	(PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
-
-#define PAGE_NONE \
-	__pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
-#define PAGE_SHARED \
-	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
-
-#define PAGE_SHARED_EXEC \
-	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_COPY_NOEXEC \
-	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_COPY_EXEC \
-	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_COPY \
-	PAGE_COPY_NOEXEC
-#define PAGE_READONLY \
-	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_READONLY_EXEC \
-	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-
-#define _PAGE_KERNEL \
-	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
-#define _PAGE_KERNEL_EXEC \
-	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
-
-extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
-#define __PAGE_KERNEL_RO		(__PAGE_KERNEL & ~_PAGE_RW)
-#define __PAGE_KERNEL_RX		(__PAGE_KERNEL_EXEC & ~_PAGE_RW)
-#define __PAGE_KERNEL_NOCACHE		(__PAGE_KERNEL | _PAGE_PCD)
-#define __PAGE_KERNEL_LARGE		(__PAGE_KERNEL | _PAGE_PSE)
-#define __PAGE_KERNEL_LARGE_EXEC	(__PAGE_KERNEL_EXEC | _PAGE_PSE)
-
-#define PAGE_KERNEL		__pgprot(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX		__pgprot(__PAGE_KERNEL_RX)
-#define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_LARGE_EXEC	__pgprot(__PAGE_KERNEL_LARGE_EXEC)
-
-/*
- * The i386 can't do page protection for execute, and considers that
- * the same are read. Also, write permissions imply read permissions.
- * This is the closest we can get..
- */
-#define __P000	PAGE_NONE
-#define __P001	PAGE_READONLY
-#define __P010	PAGE_COPY
-#define __P011	PAGE_COPY
-#define __P100	PAGE_READONLY_EXEC
-#define __P101	PAGE_READONLY_EXEC
-#define __P110	PAGE_COPY_EXEC
-#define __P111	PAGE_COPY_EXEC
-
-#define __S000	PAGE_NONE
-#define __S001	PAGE_READONLY
-#define __S010	PAGE_SHARED
-#define __S011	PAGE_SHARED
-#define __S100	PAGE_READONLY_EXEC
-#define __S101	PAGE_READONLY_EXEC
-#define __S110	PAGE_SHARED_EXEC
-#define __S111	PAGE_SHARED_EXEC
+#ifdef CONFIG_HIGHMEM
+# define VMALLOC_END	(PKMAP_BASE-2*PAGE_SIZE)
+#else
+# define VMALLOC_END	(FIXADDR_START-2*PAGE_SIZE)
+#endif

 /*
  * Define this if things work differently on an i386 and an i486:
@@ -221,28 +100,6 @@ extern unsigned long pg0[];

 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))

-/*
- * The following only work if pte_present() is true.
- * Undefined behaviour if not..
- */
-static inline int pte_dirty(pte_t pte)		{ return (pte).pte_low & _PAGE_DIRTY; }
-static inline int pte_young(pte_t pte)		{ return (pte).pte_low & _PAGE_ACCESSED; }
-static inline int pte_write(pte_t pte)		{ return (pte).pte_low & _PAGE_RW; }
-static inline int pte_huge(pte_t pte)		{ return (pte).pte_low & _PAGE_PSE; }
-
-/*
- * The following only works if pte_present() is not true.
- */
-static inline int pte_file(pte_t pte)		{ return (pte).pte_low & _PAGE_FILE; }
-
-static inline pte_t pte_mkclean(pte_t pte)	{ (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
-static inline pte_t pte_mkold(pte_t pte)	{ (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
-static inline pte_t pte_wrprotect(pte_t pte)	{ (pte).pte_low &= ~_PAGE_RW; return pte; }
-static inline pte_t pte_mkdirty(pte_t pte)	{ (pte).pte_low |= _PAGE_DIRTY; return pte; }
-static inline pte_t pte_mkyoung(pte_t pte)	{ (pte).pte_low |= _PAGE_ACCESSED; return pte; }
-static inline pte_t pte_mkwrite(pte_t pte)	{ (pte).pte_low |= _PAGE_RW; return pte; }
-static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= _PAGE_PSE; return pte; }
-
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
 #else
@@ -250,111 +107,6 @@ static inline pte_t pte_mkhuge(pte_t pte
 #endif

 /*
- * Rules for using pte_update - it must be called after any PTE update which
- * has not been done using the set_pte / clear_pte interfaces.  It is used by
- * shadow mode hypervisors to resynchronize the shadow page tables.  Kernel PTE
- * updates should either be sets, clears, or set_pte_atomic for P->P
- * transitions, which means this hook should only be called for user PTEs.
- * This hook implies a P->P protection or access change has taken place, which
- * requires a subsequent TLB flush.  The notification can optionally be delayed
- * until the TLB flush event by using the pte_update_defer form of the
- * interface, but care must be taken to assure that the flush happens while
- * still holding the same page table lock so that the shadow and primary pages
- * do not become out of sync on SMP.
- */
-#define pte_update(mm, addr, ptep)		do { } while (0)
-#define pte_update_defer(mm, addr, ptep)	do { } while (0)
-
-/* local pte updates need not use xchg for locking */
-static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
-{
-	xen_set_pte(ptep, __pte(0));
-	return res;
-}
-
-/*
- * We only update the dirty/accessed state if we set
- * the dirty bit by hand in the kernel, since the hardware
- * will do the accessed bit for us, and we don't want to
- * race with other CPU's that might be updating the dirty
- * bit at the same time.
- */
-#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define ptep_set_access_flags(vma, address, ptep, entry, dirty)		\
-({									\
-	int __changed = !pte_same(*(ptep), entry);			\
-	if (__changed && (dirty)) {					\
-		if ( likely((vma)->vm_mm == current->mm) ) {		\
-			BUG_ON(HYPERVISOR_update_va_mapping(address,	\
-				entry,					\
-				(unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
-					UVMF_INVLPG|UVMF_MULTI));	\
-		} else {						\
-			xen_l1_entry_update(ptep, entry);		\
-			flush_tlb_page(vma, address);			\
-		}							\
-	}								\
-	__changed;							\
-})
-
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define ptep_test_and_clear_young(vma, addr, ptep) ({			\
-	int __ret = 0;							\
-	if (pte_young(*(ptep)))						\
-		__ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,		\
-						&(ptep)->pte_low);	\
-	if (__ret)							\
-		pte_update((vma)->vm_mm, addr, ptep);			\
-	__ret;								\
-})
-
-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define ptep_clear_flush_young(vma, address, ptep)			\
-({									\
-	pte_t __pte = *(ptep);						\
-	int __young = pte_young(__pte);					\
-	__pte = pte_mkold(__pte);					\
-	if (PagePinned(virt_to_page((vma)->vm_mm->pgd)))		\
-		(void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
-	else if (__young)						\
-		(ptep)->pte_low = __pte.pte_low;			\
-	__young;							\
-})
-
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	if (!pte_none(pte)
-	    && (mm != &init_mm
-	        || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
-		pte = xen_ptep_get_and_clear(ptep, pte);
-		pte_update(mm, addr, ptep);
-	}
-	return pte;
-}
-
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
-#define ptep_get_and_clear_full(mm, addr, ptep, full)			\
-	((full) ? ({							\
-		pte_t __res = *(ptep);					\
-		if (PagePinned(virt_to_page((mm)->pgd)))		\
-			xen_l1_entry_update(ptep, __pte(0));		\
-		else							\
-			*(ptep) = __pte(0);				\
-		__res;							\
-	 }) :								\
-	 ptep_get_and_clear(mm, addr, ptep))
-
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	if (pte_write(pte))
-		set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
-}
-
-/*
  * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
  *
  *  dst - pointer to pgd range anwhere on a pgd page
@@ -383,26 +135,6 @@ static inline void clone_pgd_range(pgd_t

 #define mk_pte(page, pgprot)	pfn_pte(page_to_pfn(page), (pgprot))

-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-{
-	/*
-	 * Since this might change the present bit (which controls whether
-	 * a pte_t object has undergone p2m translation), we must use
-	 * pte_val() on the input pte and __pte() for the return value.
-	 */
-	paddr_t pteval = pte_val(pte);
-
-	pteval &= _PAGE_CHG_MASK;
-	pteval |= pgprot_val(newprot);
-#ifdef CONFIG_X86_PAE
-	pteval &= __supported_pte_mask;
-#endif
-	return __pte(pteval);
-}
-
-#define pmd_large(pmd) \
-((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
-
 /*
  * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
  *
@@ -424,6 +156,8 @@ static inline pte_t pte_modify(pte_t pte
  */
 #define pgd_offset_k(address) pgd_offset(&init_mm, address)

+static inline int pud_large(pud_t pud) { return 0; }
+
 /*
  * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
  *
@@ -449,26 +183,6 @@ static inline pte_t pte_modify(pte_t pte
 #define pmd_page_vaddr(pmd) \
 		((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))

-/*
- * Helper function that returns the kernel pagetable entry controlling
- * the virtual address 'address'. NULL means no pagetable entry present.
- * NOTE: the return type is pte_t but if the pmd is PSE then we return it
- * as a pte too.
- */
-extern pte_t *lookup_address(unsigned long address);
-
-/*
- * Make a given kernel text page executable/non-executable.
- * Returns the previous executability setting of that page (which
- * is used to restore the previous state). Used by the SMP bootup code.
- * NOTE: this is an __init function for security reasons.
- */
-#ifdef CONFIG_X86_PAE
- extern int set_kernel_exec(unsigned long vaddr, int enable);
-#else
- static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
-#endif
-
 #if defined(CONFIG_HIGHPTE)
 #define pte_offset_map(dir, address) \
 	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
@@ -496,72 +210,22 @@ extern pte_t *lookup_address(unsigned lo
  */
 #define update_mmu_cache(vma,address,pte) do { } while (0)

-#include <xen/features.h>
 void make_lowmem_page_readonly(void *va, unsigned int feature);
 void make_lowmem_page_writable(void *va, unsigned int feature);
-void make_page_readonly(void *va, unsigned int feature);
-void make_page_writable(void *va, unsigned int feature);
-void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
-void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
-
-#define virt_to_ptep(va)						\
-({									\
-	pte_t *__ptep = lookup_address((unsigned long)(va));		\
-	BUG_ON(!__ptep || !pte_present(*__ptep));			\
-	__ptep;								\
-})
-
-#define arbitrary_virt_to_machine(va)					\
-	(((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT)		\
-	 | ((unsigned long)(va) & (PAGE_SIZE - 1)))
-
-#ifdef CONFIG_HIGHPTE
-#include <asm/io.h>
-struct page *kmap_atomic_to_page(void *);
-#define ptep_to_machine(ptep)						\
-({									\
-	pte_t *__ptep = (ptep);						\
-	page_to_phys(kmap_atomic_to_page(__ptep))			\
-		| ((unsigned long)__ptep & (PAGE_SIZE - 1));		\
-})
-#else
-#define ptep_to_machine(ptep)	virt_to_machine(ptep)
-#endif

 #endif /* !__ASSEMBLY__ */

+/*
+ * kern_addr_valid() is (1) for FLATMEM and (0) for
+ * SPARSEMEM and DISCONTIGMEM
+ */
 #ifdef CONFIG_FLATMEM
 #define kern_addr_valid(addr)	(1)
-#endif /* CONFIG_FLATMEM */
-
-int direct_remap_pfn_range(struct vm_area_struct *vma,
-                           unsigned long address,
-                           unsigned long mfn,
-                           unsigned long size,
-                           pgprot_t prot,
-                           domid_t  domid);
-int direct_kernel_remap_pfn_range(unsigned long address,
-				  unsigned long mfn,
-				  unsigned long size,
-				  pgprot_t prot,
-				  domid_t  domid);
-int create_lookup_pte_addr(struct mm_struct *mm,
-                           unsigned long address,
-                           uint64_t *ptep);
-int touch_pte_range(struct mm_struct *mm,
-                    unsigned long address,
-                    unsigned long size);
-
-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
-		unsigned long addr, unsigned long end, pgprot_t newprot,
-		int dirty_accountable);
-
-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
-	xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
+#else
+#define kern_addr_valid(kaddr)	(0)
+#endif

 #define io_remap_pfn_range(vma,from,pfn,size,prot) \
 direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)

-#include <asm-generic/pgtable.h>
-
 #endif /* _I386_PGTABLE_H */
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgtable_64.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/pgtable_64.h	2010-03-24 15:10:37.000000000 +0100
@@ -13,49 +13,26 @@
 #include <linux/threads.h>
 #include <linux/sched.h>
 #include <asm/pda.h>
-#ifdef CONFIG_XEN
-#include <asm/hypervisor.h>

+#ifdef CONFIG_XEN
 extern pud_t level3_user_pgt[512];

 extern void xen_init_pt(void);
-
-extern pte_t *lookup_address(unsigned long address);
-
-#define virt_to_ptep(va)						\
-({									\
-	pte_t *__ptep = lookup_address((unsigned long)(va));		\
-	BUG_ON(!__ptep || !pte_present(*__ptep));			\
-	__ptep;								\
-})
-
-#define arbitrary_virt_to_machine(va)					\
-	(((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT)		\
-	 | ((unsigned long)(va) & (PAGE_SIZE - 1)))
-
-#define ptep_to_machine(ptep)	virt_to_machine(ptep)
 #endif

 extern pud_t level3_kernel_pgt[512];
 extern pud_t level3_ident_pgt[512];
 extern pmd_t level2_kernel_pgt[512];
 extern pgd_t init_level4_pgt[];
-extern unsigned long __supported_pte_mask;

 #define swapper_pg_dir init_level4_pgt

 extern void paging_init(void);
-extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
-
-/*
- * ZERO_PAGE is a global shared page that is always zero: used
- * for zero-mapped memory areas etc..
- */
-extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))

 #endif /* !__ASSEMBLY__ */

+#define SHARED_KERNEL_PMD	1
+
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
  */
@@ -98,31 +75,63 @@ extern unsigned long empty_zero_page[PAG
 #define pgd_none(x)	(!__pgd_val(x))
 #define pud_none(x)	(!__pud_val(x))

-static inline void set_pte(pte_t *dst, pte_t val)
+struct mm_struct;
+
+#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
+
+static inline void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+	*ptep = pte;
+}
+
+static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
-	*dst = val;
+	xen_set_pte(ptep, pte);
 }

-#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
-#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
-#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
+#ifdef CONFIG_SMP
+static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret)
+{
+	return __pte_ma(xchg(&xp->pte, 0));
+}
+#else
+#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
+#endif

-static inline void pud_clear (pud_t * pud)
+static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
-	set_pud(pud, __pud(0));
+	xen_l2_entry_update(pmdp, pmd);
+}
+
+static inline void xen_pmd_clear(pmd_t *pmd)
+{
+	xen_set_pmd(pmd, xen_make_pmd(0));
+}
+
+static inline void xen_set_pud(pud_t *pudp, pud_t pud)
+{
+	xen_l3_entry_update(pudp, pud);
+}
+
+static inline void xen_pud_clear(pud_t *pud)
+{
+	xen_set_pud(pud, xen_make_pud(0));
 }

 #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)

-static inline void pgd_clear (pgd_t * pgd)
+static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
-        set_pgd(pgd, __pgd(0));
-        set_pgd(__user_pgd(pgd), __pgd(0));
+	xen_l4_entry_update(pgdp, pgd);
 }

-#define pte_same(a, b)		((a).pte == (b).pte)
+static inline void xen_pgd_clear(pgd_t * pgd)
+{
+	xen_set_pgd(pgd, xen_make_pgd(0));
+	xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
+}

-#define pte_pgprot(a)	(__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
+#define pte_same(a, b)		((a).pte == (b).pte)

 #endif /* !__ASSEMBLY__ */

@@ -133,8 +142,6 @@ static inline void pgd_clear (pgd_t * pg
 #define PGDIR_SIZE	(_AC(1,UL) << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))

-#define USER_PTRS_PER_PGD	((TASK_SIZE-1)/PGDIR_SIZE+1)
-#define FIRST_USER_ADDRESS	0

 #define MAXMEM		 _AC(0x6fffffffff, UL)
 #define VMALLOC_START    _AC(0xffffc20000000000, UL)
@@ -144,105 +151,6 @@ static inline void pgd_clear (pgd_t * pg
 #define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)

-#define _PAGE_BIT_PRESENT	0
-#define _PAGE_BIT_RW		1
-#define _PAGE_BIT_USER		2
-#define _PAGE_BIT_PWT		3
-#define _PAGE_BIT_PCD		4
-#define _PAGE_BIT_ACCESSED	5
-#define _PAGE_BIT_DIRTY		6
-#define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page */
-#define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
-#define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
-
-#define _PAGE_PRESENT	0x001
-#define _PAGE_RW	0x002
-#define _PAGE_USER	0x004
-#define _PAGE_PWT	0x008
-#define _PAGE_PCD	0x010
-#define _PAGE_ACCESSED	0x020
-#define _PAGE_DIRTY	0x040
-#define _PAGE_PSE	0x080	/* 2MB page */
-#define _PAGE_FILE	0x040	/* nonlinear file mapping, saved PTE; unset:swap */
-#define _PAGE_GLOBAL	0x100	/* Global TLB entry */
-
-#define _PAGE_PROTNONE	0x080	/* If not present */
-#define _PAGE_NX        (_AC(1,UL)<<_PAGE_BIT_NX)
-
-/* Mapped page is I/O or foreign and has no associated page struct. */
-#define _PAGE_IO	0x200
-
-#ifndef __ASSEMBLY__
-#if CONFIG_XEN_COMPAT <= 0x030002
-extern unsigned int __kernel_page_user;
-#else
-#define __kernel_page_user 0
-#endif
-#endif
-
-#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
-
-#define _PAGE_CHG_MASK	(PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
-
-#define PAGE_NONE	__pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
-#define PAGE_SHARED	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_COPY PAGE_COPY_NOEXEC
-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_READONLY	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-#define __PAGE_KERNEL \
-	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
-#define __PAGE_KERNEL_EXEC \
-	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
-#define __PAGE_KERNEL_NOCACHE \
-	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
-#define __PAGE_KERNEL_RO \
-	(_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user)
-#define __PAGE_KERNEL_VSYSCALL \
-	(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
-	(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
-#define __PAGE_KERNEL_LARGE \
-	(__PAGE_KERNEL | _PAGE_PSE)
-#define __PAGE_KERNEL_LARGE_EXEC \
-	(__PAGE_KERNEL_EXEC | _PAGE_PSE)
-
-/*
- * We don't support GLOBAL page in xenolinux64
- */
-#define MAKE_GLOBAL(x) __pgprot((x))
-
-#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
-#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
-
-/*         xwr */
-#define __P000	PAGE_NONE
-#define __P001	PAGE_READONLY
-#define __P010	PAGE_COPY
-#define __P011	PAGE_COPY
-#define __P100	PAGE_READONLY_EXEC
-#define __P101	PAGE_READONLY_EXEC
-#define __P110	PAGE_COPY_EXEC
-#define __P111	PAGE_COPY_EXEC
-
-#define __S000	PAGE_NONE
-#define __S001	PAGE_READONLY
-#define __S010	PAGE_SHARED
-#define __S011	PAGE_SHARED
-#define __S100	PAGE_READONLY_EXEC
-#define __S101	PAGE_READONLY_EXEC
-#define __S110	PAGE_SHARED_EXEC
-#define __S111	PAGE_SHARED_EXEC
-
 #ifndef __ASSEMBLY__

 static inline unsigned long pgd_bad(pgd_t pgd)
@@ -260,119 +168,26 @@ static inline unsigned long pmd_bad(pmd_
 	return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
 }

-#define set_pte_at(_mm,addr,ptep,pteval) do {				\
-	if (((_mm) != current->mm && (_mm) != &init_mm) ||		\
-	    HYPERVISOR_update_va_mapping((addr), (pteval), 0))		\
-		set_pte((ptep), (pteval));				\
-} while (0)
-
 #define pte_none(x)	(!(x).pte)
 #define pte_present(x)	((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
-#define pte_clear(mm,addr,xp)	do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)

-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
+#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))	/* FIXME: is this right? */

 #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
 #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
 	__pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
-#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn :	\
+#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr :	\
 		       (_pte).pte & _PAGE_PRESENT ?		\
 		       mfn_to_local_pfn(__pte_mfn(_pte)) :	\
 		       __pte_mfn(_pte))

 #define pte_page(x)	pfn_to_page(pte_pfn(x))

-static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
-{
-	unsigned long pte = page_nr << PAGE_SHIFT;
-	pte |= pgprot_val(pgprot);
-	pte &= __supported_pte_mask;
-	return __pte(pte);
-}
-
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	if (!pte_none(pte)) {
-		if ((mm != &init_mm) ||
-		    HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
-			pte = __pte_ma(xchg(&ptep->pte, 0));
-	}
-	return pte;
-}
-
-static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
-{
-	if (full) {
-		pte_t pte = *ptep;
-		if (PagePinned(virt_to_page(mm->pgd)))
-			xen_l1_entry_update(ptep, __pte(0));
-		else
-			*ptep = __pte(0);
-		return pte;
-	}
-	return ptep_get_and_clear(mm, addr, ptep);
-}
-
-#define ptep_clear_flush(vma, addr, ptep)			\
-({								\
-	pte_t *__ptep = (ptep);					\
-	pte_t __res = *__ptep;					\
-	if (!pte_none(__res) &&					\
-	    ((vma)->vm_mm != current->mm ||			\
-	     HYPERVISOR_update_va_mapping(addr,	__pte(0), 	\
-			(unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
-				UVMF_INVLPG|UVMF_MULTI))) {	\
-		__ptep->pte = 0;				\
-		flush_tlb_page(vma, addr);			\
-	}							\
-	__res;							\
-})
-
-/*
- * The following only work if pte_present() is true.
- * Undefined behaviour if not..
- */
-#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
-static inline int pte_dirty(pte_t pte)		{ return __pte_val(pte) & _PAGE_DIRTY; }
-static inline int pte_young(pte_t pte)		{ return __pte_val(pte) & _PAGE_ACCESSED; }
-static inline int pte_write(pte_t pte)		{ return __pte_val(pte) & _PAGE_RW; }
-static inline int pte_file(pte_t pte)		{ return __pte_val(pte) & _PAGE_FILE; }
-static inline int pte_huge(pte_t pte)		{ return __pte_val(pte) & _PAGE_PSE; }
-
-static inline pte_t pte_mkclean(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
-static inline pte_t pte_mkold(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
-static inline pte_t pte_wrprotect(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_RW; return pte; }
-static inline pte_t pte_mkexec(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_NX; return pte; }
-static inline pte_t pte_mkdirty(pte_t pte)	{ __pte_val(pte) |= _PAGE_DIRTY; return pte; }
-static inline pte_t pte_mkyoung(pte_t pte)	{ __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
-static inline pte_t pte_mkwrite(pte_t pte)	{ __pte_val(pte) |= _PAGE_RW; return pte; }
-static inline pte_t pte_mkhuge(pte_t pte)	{ __pte_val(pte) |= _PAGE_PSE; return pte; }
-static inline pte_t pte_clrhuge(pte_t pte)	{ __pte_val(pte) &= ~_PAGE_PSE; return pte; }
-
-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-	if (!pte_young(*ptep))
-		return 0;
-	return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
-}
-
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	if (pte_write(pte))
-		set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
-}
-
 /*
  * Macro to mark a page protection value as "uncacheable".
  */
 #define pgprot_noncached(prot)	(__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))

-static inline int pmd_large(pmd_t pte) {
-	return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
-}
-

 /*
  * Conversion functions: convert a page and protection to a page entry,
@@ -388,6 +203,7 @@ static inline int pmd_large(pmd_t pte) {
 #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
 #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
 #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
+static inline int pgd_large(pgd_t pgd) { return 0; }
 #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)

 /* PUD - Level3 access */
@@ -398,6 +214,12 @@ static inline int pmd_large(pmd_t pte) {
 #define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
 #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)

+static inline int pud_large(pud_t pte)
+{
+	return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
+		(_PAGE_PSE|_PAGE_PRESENT);
+}
+
 /* PMD  - Level 2 access */
 #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
 #define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
@@ -413,36 +235,18 @@ static inline int pmd_large(pmd_t pte) {
 #else
 #define pmd_present(x)	(__pmd_val(x) & _PAGE_PRESENT)
 #endif
-#define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
 #define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)

 #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
-#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
+#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
 #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT

 /* PTE - Level 1 access. */

 /* page, protection -> pte */
 #define mk_pte(page, pgprot)	pfn_pte(page_to_pfn(page), (pgprot))
-#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)

-/* Change flags of a PTE */
-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-{
-	/*
-	 * Since this might change the present bit (which controls whether
-	 * a pte_t object has undergone p2m translation), we must use
-	 * pte_val() on the input pte and __pte() for the return value.
-	 */
-	unsigned long pteval = pte_val(pte);
-
-	pteval &= _PAGE_CHG_MASK;
-	pteval |= pgprot_val(newprot);
-	pteval &= __supported_pte_mask;
-	return __pte(pteval);
-}
-
 #define pte_index(address) \
 		(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
@@ -456,101 +260,21 @@ static inline pte_t pte_modify(pte_t pte

 #define update_mmu_cache(vma,address,pte) do { } while (0)

-/*
- * Rules for using ptep_establish: the pte MUST be a user pte, and
- * must be a present->present transition.
- */
-#define __HAVE_ARCH_PTEP_ESTABLISH
-#define ptep_establish(vma, address, ptep, pteval)			\
-	do {								\
-		if ( likely((vma)->vm_mm == current->mm) ) {		\
-			BUG_ON(HYPERVISOR_update_va_mapping(address,	\
-				pteval,					\
-				(unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
-					UVMF_INVLPG|UVMF_MULTI));	\
-		} else {						\
-			xen_l1_entry_update(ptep, pteval);		\
-			flush_tlb_page(vma, address);			\
-		}							\
-	} while (0)
-
-/* We only update the dirty/accessed state if we set
- * the dirty bit by hand in the kernel, since the hardware
- * will do the accessed bit for us, and we don't want to
- * race with other CPU's that might be updating the dirty
- * bit at the same time. */
-#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define ptep_set_access_flags(vma, address, ptep, entry, dirty)		\
-({									\
-	int __changed = !pte_same(*(ptep), entry);			\
-	if (__changed && (dirty))					\
-		ptep_establish(vma, address, ptep, entry);		\
-	__changed;							\
-})
-
-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define ptep_clear_flush_young(vma, address, ptep)			\
-({									\
-	pte_t __pte = *(ptep);						\
-	int __young = pte_young(__pte);					\
-	__pte = pte_mkold(__pte);					\
-	if (PagePinned(virt_to_page((vma)->vm_mm->pgd)))		\
-		(void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
-	else if (__young)						\
-		set_pte(ptep, __pte);					\
-	__young;							\
-})
-
 /* Encode and de-code a swap entry */
 #define __swp_type(x)			(((x).val >> 1) & 0x3f)
 #define __swp_offset(x)			((x).val >> 8)
 #define __swp_entry(type, offset)	((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { __pte_val(pte) })
-#define __swp_entry_to_pte(x)		((pte_t) { (x).val })
-
-extern spinlock_t pgd_lock;
-extern struct list_head pgd_list;
+#define __swp_entry_to_pte(x)		((pte_t) { .pte = (x).val })

 extern int kern_addr_valid(unsigned long addr);
-
-#define DOMID_LOCAL (0xFFFFU)
-
-struct vm_area_struct;
-
-int direct_remap_pfn_range(struct vm_area_struct *vma,
-                            unsigned long address,
-                            unsigned long mfn,
-                            unsigned long size,
-                            pgprot_t prot,
-                            domid_t  domid);
-
-int direct_kernel_remap_pfn_range(unsigned long address,
-				  unsigned long mfn,
-				  unsigned long size,
-				  pgprot_t prot,
-				  domid_t  domid);
-
-int create_lookup_pte_addr(struct mm_struct *mm,
-                           unsigned long address,
-                           uint64_t *ptep);
-
-int touch_pte_range(struct mm_struct *mm,
-                    unsigned long address,
-                    unsigned long size);
-
-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
-		unsigned long addr, unsigned long end, pgprot_t newprot,
-		int dirty_accountable);
-
-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
-	xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
-
-pte_t *lookup_address(unsigned long addr);
+extern void cleanup_highmap(void);

 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
 		direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)

 #define HAVE_ARCH_UNMAPPED_AREA
+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN

 #define pgtable_cache_init()   do { } while (0)
 #define check_pgt_cache()      do { } while (0)
@@ -563,13 +287,7 @@ pte_t *lookup_address(unsigned long addr
 #define	kc_offset_to_vaddr(o) \
    (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))

-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
-#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
 #define __HAVE_ARCH_PTE_SAME
-#include <asm-generic/pgtable.h>
 #endif /* !__ASSEMBLY__ */

 #endif /* _X86_64_PGTABLE_H */
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/processor.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/processor.h	2010-03-24 15:10:37.000000000 +0100
@@ -1,5 +1,789 @@
+#ifndef __ASM_X86_PROCESSOR_H
+#define __ASM_X86_PROCESSOR_H
+
+#include <asm/processor-flags.h>
+
+/* migration helpers, for KVM - will be removed in 2.6.25: */
+#include <asm/vm86.h>
+#define Xgt_desc_struct	desc_ptr
+
+/* Forward declaration, a strange C thing */
+struct task_struct;
+struct mm_struct;
+
+#include <asm/vm86.h>
+#include <asm/math_emu.h>
+#include <asm/segment.h>
+#include <asm/types.h>
+#include <asm/sigcontext.h>
+#include <asm/current.h>
+#include <asm/cpufeature.h>
+#include <asm/system.h>
+#include <asm/page.h>
+#include <asm/percpu.h>
+#include <asm/msr.h>
+#include <asm/desc_defs.h>
+#include <asm/nops.h>
+#include <linux/personality.h>
+#include <linux/cpumask.h>
+#include <linux/cache.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <xen/interface/physdev.h>
+
+/*
+ * Default implementation of the helper that returns the current
+ * instruction pointer ("program counter").
+ */
+static inline void *current_text_addr(void)
+{
+	void *pc;
+	asm volatile("mov $1f,%0\n1:":"=r" (pc));
+	return pc;
+}
+
+#ifdef CONFIG_X86_VSMP
+#define ARCH_MIN_TASKALIGN	(1 << INTERNODE_CACHE_SHIFT)
+#define ARCH_MIN_MMSTRUCT_ALIGN	(1 << INTERNODE_CACHE_SHIFT)
+#else
+#define ARCH_MIN_TASKALIGN	16
+#define ARCH_MIN_MMSTRUCT_ALIGN	0
+#endif
+
+/*
+ *  CPU type and hardware bug flags. Kept separately for each CPU.
+ *  Members of this structure are referenced in head.S, so think twice
+ *  before touching them. [mj]
+ */
+
+struct cpuinfo_x86 {
+	__u8	x86;		/* CPU family */
+	__u8	x86_vendor;	/* CPU vendor */
+	__u8	x86_model;
+	__u8	x86_mask;
+#ifdef CONFIG_X86_32
+	char	wp_works_ok;	/* It doesn't on 386's */
+	char	hlt_works_ok;	/* Problems on some 486Dx4's and old 386's */
+	char	hard_math;
+	char	rfu;
+	char	fdiv_bug;
+	char	f00f_bug;
+	char	coma_bug;
+	char	pad0;
+#else
+	/* Number of 4K pages in DTLB/ITLB combined (in pages) */
+	int     x86_tlbsize;
+	__u8    x86_virt_bits, x86_phys_bits;
+	/* cpuid returned core id bits */
+	__u8    x86_coreid_bits;
+	/* Max extended CPUID function supported */
+	__u32   extended_cpuid_level;
+#endif
+	int	cpuid_level;	/* Maximum supported CPUID level, -1=no CPUID */
+	__u32	x86_capability[NCAPINTS];
+	char	x86_vendor_id[16];
+	char	x86_model_id[64];
+	int 	x86_cache_size;  /* in KB - valid for CPUS which support this
+				    call  */
+	int 	x86_cache_alignment;	/* In bytes */
+	int	x86_power;
+	unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+	cpumask_t llc_shared_map;	/* cpus sharing the last level cache */
+#endif
+	u16 x86_max_cores;		/* cpuid returned max cores value */
+	u16 apicid;
+	u16 x86_clflush_size;
+#ifdef CONFIG_SMP
+	u16 booted_cores;		/* number of cores as seen by OS */
+	u16 phys_proc_id; 		/* Physical processor id. */
+	u16 cpu_core_id;  		/* Core id */
+	u16 cpu_index;			/* index into per_cpu list */
+#endif
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define X86_VENDOR_INTEL 0
+#define X86_VENDOR_CYRIX 1
+#define X86_VENDOR_AMD 2
+#define X86_VENDOR_UMC 3
+#define X86_VENDOR_NEXGEN 4
+#define X86_VENDOR_CENTAUR 5
+#define X86_VENDOR_TRANSMETA 7
+#define X86_VENDOR_NSC 8
+#define X86_VENDOR_NUM 9
+#define X86_VENDOR_UNKNOWN 0xff
+
+/*
+ * capabilities of CPUs
+ */
+extern struct cpuinfo_x86 boot_cpu_data;
+extern struct cpuinfo_x86 new_cpu_data;
+extern __u32 cleared_cpu_caps[NCAPINTS];
+
+#ifdef CONFIG_SMP
+DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
+#define cpu_data(cpu)		per_cpu(cpu_info, cpu)
+#define current_cpu_data	cpu_data(smp_processor_id())
+#else
+#define cpu_data(cpu)		boot_cpu_data
+#define current_cpu_data	boot_cpu_data
+#endif
+
+void cpu_detect(struct cpuinfo_x86 *c);
+
+extern void identify_cpu(struct cpuinfo_x86 *);
+extern void identify_boot_cpu(void);
+extern void identify_secondary_cpu(struct cpuinfo_x86 *);
+extern void print_cpu_info(struct cpuinfo_x86 *);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern unsigned short num_cache_leaves;
+
+#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64)
+extern void detect_ht(struct cpuinfo_x86 *c);
+#else
+static inline void detect_ht(struct cpuinfo_x86 *c) {}
+#endif
+
+static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+			     unsigned int *ecx, unsigned int *edx)
+{
+	/* ecx is often an input as well as an output. */
+	__asm__(XEN_CPUID
+		: "=a" (*eax),
+		  "=b" (*ebx),
+		  "=c" (*ecx),
+		  "=d" (*edx)
+		: "0" (*eax), "2" (*ecx));
+}
+
+static inline void load_cr3(pgd_t *pgdir)
+{
+	write_cr3(__pa(pgdir));
+}
+
+#ifndef CONFIG_X86_NO_TSS
+#ifdef CONFIG_X86_32
+/* This is the TSS defined by the hardware. */
+struct x86_hw_tss {
+	unsigned short	back_link, __blh;
+	unsigned long	sp0;
+	unsigned short	ss0, __ss0h;
+	unsigned long	sp1;
+	unsigned short	ss1, __ss1h;	/* ss1 caches MSR_IA32_SYSENTER_CS */
+	unsigned long	sp2;
+	unsigned short	ss2, __ss2h;
+	unsigned long	__cr3;
+	unsigned long	ip;
+	unsigned long	flags;
+	unsigned long	ax, cx, dx, bx;
+	unsigned long	sp, bp, si, di;
+	unsigned short	es, __esh;
+	unsigned short	cs, __csh;
+	unsigned short	ss, __ssh;
+	unsigned short	ds, __dsh;
+	unsigned short	fs, __fsh;
+	unsigned short	gs, __gsh;
+	unsigned short	ldt, __ldth;
+	unsigned short	trace, io_bitmap_base;
+} __attribute__((packed));
+extern struct tss_struct doublefault_tss;
+#else
+struct x86_hw_tss {
+	u32 reserved1;
+	u64 sp0;
+	u64 sp1;
+	u64 sp2;
+	u64 reserved2;
+	u64 ist[7];
+	u32 reserved3;
+	u32 reserved4;
+	u16 reserved5;
+	u16 io_bitmap_base;
+} __attribute__((packed)) ____cacheline_aligned;
+#endif
+#endif /* CONFIG_X86_NO_TSS */
+
+/*
+ * Size of io_bitmap.
+ */
+#define IO_BITMAP_BITS  65536
+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
+#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
+#define INVALID_IO_BITMAP_OFFSET 0x8000
+#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
+
+#ifndef CONFIG_X86_NO_TSS
+struct tss_struct {
+	struct x86_hw_tss x86_tss;
+
+	/*
+	 * The extra 1 is there because the CPU will access an
+	 * additional byte beyond the end of the IO permission
+	 * bitmap. The extra byte must be all 1 bits, and must
+	 * be within the limit.
+	 */
+	unsigned long	io_bitmap[IO_BITMAP_LONGS + 1];
+	/*
+	 * Cache the current maximum and the last task that used the bitmap:
+	 */
+	unsigned long io_bitmap_max;
+	struct thread_struct *io_bitmap_owner;
+	/*
+	 * pads the TSS to be cacheline-aligned (size is 0x100)
+	 */
+	unsigned long __cacheline_filler[35];
+	/*
+	 * .. and then another 0x100 bytes for emergency kernel stack
+	 */
+	unsigned long stack[64];
+} __attribute__((packed));
+
+DECLARE_PER_CPU(struct tss_struct, init_tss);
+
+/* Save the original ist values for checking stack pointers during debugging */
+struct orig_ist {
+	unsigned long ist[7];
+};
+#endif /* CONFIG_X86_NO_TSS */
+
+#define	MXCSR_DEFAULT		0x1f80
+
+struct i387_fsave_struct {
+	u32	cwd;
+	u32	swd;
+	u32	twd;
+	u32	fip;
+	u32	fcs;
+	u32	foo;
+	u32	fos;
+	u32	st_space[20];	/* 8*10 bytes for each FP-reg = 80 bytes */
+	u32	status;		/* software status information */
+};
+
+struct i387_fxsave_struct {
+	u16	cwd;
+	u16	swd;
+	u16	twd;
+	u16	fop;
+	union {
+		struct {
+			u64	rip;
+			u64	rdp;
+		};
+		struct {
+			u32	fip;
+			u32	fcs;
+			u32	foo;
+			u32	fos;
+		};
+	};
+	u32	mxcsr;
+	u32	mxcsr_mask;
+	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
+	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
+	u32	padding[24];
+} __attribute__((aligned(16)));
+
+struct i387_soft_struct {
+	u32	cwd;
+	u32	swd;
+	u32	twd;
+	u32	fip;
+	u32	fcs;
+	u32	foo;
+	u32	fos;
+	u32	st_space[20];	/* 8*10 bytes for each FP-reg = 80 bytes */
+	u8	ftop, changed, lookahead, no_update, rm, alimit;
+	struct info	*info;
+	u32	entry_eip;
+};
+
+union i387_union {
+	struct i387_fsave_struct	fsave;
+	struct i387_fxsave_struct	fxsave;
+	struct i387_soft_struct 	soft;
+};
+
+#ifdef CONFIG_X86_32
+DECLARE_PER_CPU(u8, cpu_llc_id);
+#elif !defined(CONFIG_X86_NO_TSS)
+DECLARE_PER_CPU(struct orig_ist, orig_ist);
+#endif
+
+extern void print_cpu_info(struct cpuinfo_x86 *);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern unsigned short num_cache_leaves;
+
+struct thread_struct {
+/* Cached TLS descriptors. */
+	struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+	unsigned long	sp0;
+	unsigned long	sp;
+#ifdef CONFIG_X86_32
+	unsigned long	sysenter_cs;
+#else
+	unsigned short	es, ds, fsindex, gsindex;
+#endif
+	unsigned long	ip;
+	unsigned long	fs;
+	unsigned long	gs;
+/* Hardware debugging registers */
+	unsigned long	debugreg0;
+	unsigned long	debugreg1;
+	unsigned long	debugreg2;
+	unsigned long	debugreg3;
+	unsigned long	debugreg6;
+	unsigned long	debugreg7;
+/* Fault info */
+	unsigned long	cr2, trap_no, error_code;
+/* Floating point info (kept 16-byte aligned) */
+	union i387_union	i387 __attribute__((aligned(16)));
+#ifdef CONFIG_X86_32
+/* Virtual 86 mode info */
+	struct vm86_struct __user *vm86_info;
+	unsigned long		screen_bitmap;
+	unsigned long		v86flags, v86mask, saved_sp0;
+	unsigned int		saved_fs, saved_gs;
+#endif
+/* IO permissions */
+	unsigned long	*io_bitmap_ptr;
+	unsigned long	iopl;
+/* Max allowed port in the bitmap, in bytes: */
+	unsigned io_bitmap_max;
+/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set.  */
+	unsigned long	debugctlmsr;
+/* Debug Store - if not 0 points to a DS Save Area configuration;
+ *               goes into MSR_IA32_DS_AREA */
+	unsigned long	ds_area_msr;
+};
+
+static inline unsigned long xen_get_debugreg(int regno)
+{
+	return HYPERVISOR_get_debugreg(regno);
+}
+
+static inline void xen_set_debugreg(int regno, unsigned long value)
+{
+	WARN_ON(HYPERVISOR_set_debugreg(regno, value));
+}
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void xen_set_iopl_mask(unsigned mask)
+{
+	/* Force the change at ring 0: a zero mask still requests IOPL 1. */
+	struct physdev_set_iopl op = {
+		.iopl = mask ? (mask >> 12) & 3 : 1,
+	};
+	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &op));
+}
+
+#ifndef CONFIG_X86_NO_TSS
+static inline void native_load_sp0(struct tss_struct *tss,
+				   struct thread_struct *thread)
+{
+	tss->x86_tss.sp0 = thread->sp0;
+#ifdef CONFIG_X86_32
+	/* Only happens when SEP is enabled, no need to test "SEP"arately */
+	if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+		tss->x86_tss.ss1 = thread->sysenter_cs;
+		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+	}
+#endif
+}
+#else
+#define xen_load_sp0(tss, thread) do { \
+	if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \
+		BUG(); \
+} while (0)
+#endif
+
+#define __cpuid xen_cpuid
+#define paravirt_enabled() 0
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register)				\
+	(var) = xen_get_debugreg(register)
+#define set_debugreg(value, register)				\
+	xen_set_debugreg(register, value)
+
+#define load_sp0 xen_load_sp0
+
+#define set_iopl_mask xen_set_iopl_mask
+
+/*
+ * Save the cr4 feature set we're using (i.e.
+ * Pentium 4MB enable and PPro Global page
+ * enable), so that any CPUs that boot up
+ * after us can get the correct flags.
+ */
+extern unsigned long mmu_cr4_features;
+
+static inline void set_in_cr4(unsigned long mask)
+{
+	unsigned long cr4;	/* as wide as mask; 'unsigned' truncates on 64-bit */
+	mmu_cr4_features |= mask;
+	cr4 = read_cr4();
+	cr4 |= mask;
+	write_cr4(cr4);
+}
+
+static inline void clear_in_cr4(unsigned long mask)
+{
+	unsigned long cr4;	/* as wide as mask; 'unsigned' truncates on 64-bit */
+	mmu_cr4_features &= ~mask;
+	cr4 = read_cr4();
+	cr4 &= ~mask;
+	write_cr4(cr4);
+}
+
+struct microcode_header {
+	unsigned int hdrver;
+	unsigned int rev;
+	unsigned int date;
+	unsigned int sig;
+	unsigned int cksum;
+	unsigned int ldrver;
+	unsigned int pf;
+	unsigned int datasize;
+	unsigned int totalsize;
+	unsigned int reserved[3];
+};
+
+struct microcode {
+	struct microcode_header hdr;
+	unsigned int bits[0];
+};
+
+typedef struct microcode microcode_t;
+typedef struct microcode_header microcode_header_t;
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+	unsigned int sig;
+	unsigned int pf;
+	unsigned int cksum;
+};
+
+struct extended_sigtable {
+	unsigned int count;
+	unsigned int cksum;
+	unsigned int reserved[3];
+	struct extended_signature sigs[0];
+};
+
+typedef struct {
+	unsigned long seg;
+} mm_segment_t;
+
+
+/*
+ * create a kernel thread without removing it from tasklists
+ */
+extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+
+/* Free all resources held by a thread. */
+extern void release_thread(struct task_struct *);
+
+/* Prepare to copy thread state - unlazy all lazy status */
+extern void prepare_to_copy(struct task_struct *tsk);
+
+unsigned long get_wchan(struct task_struct *p);
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(unsigned int op,
+			 unsigned int *eax, unsigned int *ebx,
+			 unsigned int *ecx, unsigned int *edx)
+{
+	*eax = op;
+	*ecx = 0;
+	__cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(unsigned int op, int count,
+			       unsigned int *eax, unsigned int *ebx,
+			       unsigned int *ecx, unsigned int *edx)
+{
+	*eax = op;
+	*ecx = count;
+	__cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return eax;
+}
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return ebx;
+}
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return ecx;
+}
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return edx;
+}
+
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static inline void rep_nop(void)
+{
+	__asm__ __volatile__("rep;nop": : :"memory");
+}
+
+/* Stop speculative execution */
+static inline void sync_core(void)
+{
+	int tmp;
+	asm volatile("cpuid" : "=a" (tmp) : "0" (1)
+					  : "ebx", "ecx", "edx", "memory");
+}
+
+#define cpu_relax()   rep_nop()
+
+static inline void __monitor(const void *eax, unsigned long ecx,
+		unsigned long edx)
+{
+	/* "monitor %eax,%ecx,%edx;" */
+	asm volatile(
+		".byte 0x0f,0x01,0xc8;"
+		: :"a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void __mwait(unsigned long eax, unsigned long ecx)
+{
+	/* "mwait %eax,%ecx;" */
+	asm volatile(
+		".byte 0x0f,0x01,0xc9;"
+		: :"a" (eax), "c" (ecx));
+}
+
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+	/* "sti; mwait %eax,%ecx;" -- sti is issued right before the mwait bytes */
+	asm volatile(
+		"sti; .byte 0x0f,0x01,0xc9;"
+		: :"a" (eax), "c" (ecx));
+}
+
+extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
+
+extern int force_mwait;
+
+extern void select_idle_routine(const struct cpuinfo_x86 *c);
+
+extern unsigned long boot_option_idle_override;
+
+extern void enable_sep_cpu(void);
+extern int sysenter_setup(void);
+
+/* Defined in head.S */
+extern struct desc_ptr early_gdt_descr;
+
+extern void cpu_set_gdt(int);
+extern void switch_to_new_gdt(void);
+extern void cpu_init(void);
+extern void init_gdt(int cpu);
+
+/* from system description table in BIOS.  Mostly for MCA use, but
+ * others may find it useful. */
+extern unsigned int machine_id;
+extern unsigned int machine_submodel_id;
+extern unsigned int BIOS_revision;
+
+/* Boot loader type from the setup header */
+extern int bootloader_type;
+
+extern char ignore_fpu_irq;
+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
+
+#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
+#define ARCH_HAS_PREFETCHW
+#define ARCH_HAS_SPINLOCK_PREFETCH
+
+#ifdef CONFIG_X86_32
+#define BASE_PREFETCH	ASM_NOP4
+#define ARCH_HAS_PREFETCH
+#else
+#define BASE_PREFETCH	"prefetcht0 (%1)"
+#endif
+
+/* Prefetch instructions for Pentium III and AMD Athlon */
+/* It's not worth caring about 3dnow! prefetches for the K6
+   because they are microcoded there and very slow.
+   However, we currently don't do prefetches for pre-XP Athlons.
+   That should be fixed. */
+static inline void prefetch(const void *x)
+{
+	alternative_input(BASE_PREFETCH,
+			  "prefetchnta (%1)",
+			  X86_FEATURE_XMM,
+			  "r" (x));
+}
+
+/* 3dnow! prefetch to get an exclusive cache line. Useful for
+   spinlocks to avoid one state transition in the cache coherency protocol. */
+static inline void prefetchw(const void *x)
+{
+	alternative_input(BASE_PREFETCH,	/* default instruction */
+			  "prefetchw (%1)",	/* replacement instruction */
+			  X86_FEATURE_3DNOW,	/* feature that enables the replacement */
+			  "r" (x));
+}
+
+#define spin_lock_prefetch(x)	prefetchw(x)
 #ifdef CONFIG_X86_32
-# include "processor_32.h"
+/*
+ * User space process size: 3GB (default).
+ */
+#define TASK_SIZE	(PAGE_OFFSET)
+#define STACK_TOP	TASK_SIZE
+#define STACK_TOP_MAX	STACK_TOP
+
+#define INIT_THREAD  {							\
+	.sp0 = sizeof(init_stack) + (long)&init_stack,			\
+	.vm86_info = NULL,						\
+	.sysenter_cs = __KERNEL_CS,					\
+	.io_bitmap_ptr = NULL,						\
+	.fs = __KERNEL_PERCPU,						\
+}
+
+/*
+ * Note that the .io_bitmap member must be extra-big: the CPU will access
+ * an additional byte beyond the end of the IO permission bitmap. That
+ * extra byte must be all 1 bits and must be within the limit, hence the
+ * inclusive range [0 ... IO_BITMAP_LONGS] (one extra long) filled with ~0.
+ */
+#define INIT_TSS  {							\
+	.x86_tss = {							\
+		.sp0		= sizeof(init_stack) + (long)&init_stack, \
+		.ss0		= __KERNEL_DS,				\
+		.ss1		= __KERNEL_CS, /* ss1 caches SYSENTER_CS */ \
+		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,		\
+	 },								\
+	.io_bitmap	= { [0 ... IO_BITMAP_LONGS] = ~0 },		\
+}
+
+#define start_thread(regs, new_eip, new_esp) do {		\
+	__asm__("movl %0,%%gs": :"r" (0)); /* zero the live %gs */ \
+	regs->fs = 0;	/* NOTE(review): args expand unparenthesized */ \
+	set_fs(USER_DS);					\
+	regs->ds = __USER_DS;	/* ds/es/ss get the user data selector */ \
+	regs->es = __USER_DS;					\
+	regs->ss = __USER_DS;					\
+	regs->cs = __USER_CS;					\
+	regs->ip = new_eip;	/* ip and sp are supplied by the caller */ \
+	regs->sp = new_esp;					\
+} while (0)
+
+
+extern unsigned long thread_saved_pc(struct task_struct *tsk);
+
+#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
+#define KSTK_TOP(info)                                                 \
+({                                                                     \
+       unsigned long *__ptr = (unsigned long *)(info);                 \
+       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
+})
+
+/*
+ * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+ * This is necessary to guarantee that the entire "struct pt_regs"
+ * is accessible even if the CPU hasn't stored the SS/ESP registers
+ * on the stack (interrupt gate does not save these registers
+ * when switching to the same priv ring).
+ * Therefore beware: accessing the ss/esp fields of the
+ * "struct pt_regs" is possible, but they may contain
+ * completely wrong values.
+ */
+#define task_pt_regs(task)                                             \
+({                                                                     \
+       struct pt_regs *__regs__;                                       \
+       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
+       __regs__ - 1;     /* step back one whole struct pt_regs */      \
+})
+
 #else
-# include "processor_64.h"
+/*
+ * User space process size. 47bits minus one guard page.
+ */
+#define TASK_SIZE64	(0x800000000000UL - 4096)
+
+/* Upper address limit for 32-bit (TIF_IA32) tasks: 0xc0000000 when the
+ * ADDR_LIMIT_3GB personality bit is set, 0xFFFFe000 otherwise.
+ */
+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
+			   0xc0000000 : 0xFFFFe000)
+
+#define TASK_SIZE 		(test_thread_flag(TIF_IA32) ? \
+				 IA32_PAGE_OFFSET : TASK_SIZE64)
+#define TASK_SIZE_OF(child) 	((test_tsk_thread_flag(child, TIF_IA32)) ? \
+				  IA32_PAGE_OFFSET : TASK_SIZE64)
+
+#define STACK_TOP		TASK_SIZE
+#define STACK_TOP_MAX		TASK_SIZE64
+
+#define INIT_THREAD  { \
+	.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+#define INIT_TSS  { \
+	.x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+#define start_thread(regs, new_rip, new_rsp) do { 			     \
+	asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0));  \
+	load_gs_index(0);						     \
+	(regs)->ip = (new_rip);						     \
+	(regs)->sp = (new_rsp);						     \
+	(regs)->cs = __USER_CS;						     \
+	(regs)->ss = __USER_DS;						     \
+	(regs)->flags = 0x200;	/* only IF (interrupt enable) is set */	     \
+	set_fs(USER_DS);						     \
+} while (0)
+
+/*
+ * Return saved PC of a blocked thread, read from 8 bytes below thread.sp.
+ * What is this good for? It will always be the scheduler or ret_from_fork.
+ */
+#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
+
+#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)	/* the pt_regs that ends at thread.sp0 */
+#endif /* CONFIG_X86_64 */
+
+/* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define TASK_UNMAPPED_BASE	(PAGE_ALIGN(TASK_SIZE / 3))
+
+#define KSTK_EIP(task) (task_pt_regs(task)->ip)
+#define KSTK_ESP(task) (task_pt_regs(task)->sp)
+
 #endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/processor_32.h	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,751 +0,0 @@
-/*
- * include/asm-i386/processor.h
- *
- * Copyright (C) 1994 Linus Torvalds
- */
-
-#ifndef __ASM_I386_PROCESSOR_H
-#define __ASM_I386_PROCESSOR_H
-
-#include <asm/vm86.h>
-#include <asm/math_emu.h>
-#include <asm/segment.h>
-#include <asm/page.h>
-#include <asm/types.h>
-#include <asm/sigcontext.h>
-#include <asm/cpufeature.h>
-#include <asm/msr.h>
-#include <asm/system.h>
-#include <linux/cache.h>
-#include <linux/threads.h>
-#include <asm/percpu.h>
-#include <linux/cpumask.h>
-#include <linux/init.h>
-#include <asm/processor-flags.h>
-#include <xen/interface/physdev.h>
-
-/* flag for disabling the tsc */
-#define tsc_disable 0
-
-struct desc_struct {
-	unsigned long a,b;
-};
-
-#define desc_empty(desc) \
-		(!((desc)->a | (desc)->b))
-
-#define desc_equal(desc1, desc2) \
-		(((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
-/*
- * Default implementation of macro that returns current
- * instruction pointer ("program counter").
- */
-#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
-
-/*
- *  CPU type and hardware bug flags. Kept separately for each CPU.
- *  Members of this structure are referenced in head.S, so think twice
- *  before touching them. [mj]
- */
-
-struct cpuinfo_x86 {
-	__u8	x86;		/* CPU family */
-	__u8	x86_vendor;	/* CPU vendor */
-	__u8	x86_model;
-	__u8	x86_mask;
-	char	wp_works_ok;	/* It doesn't on 386's */
-	char	hlt_works_ok;	/* Problems on some 486Dx4's and old 386's */
-	char	hard_math;
-	char	rfu;
-       	int	cpuid_level;	/* Maximum supported CPUID level, -1=no CPUID */
-	unsigned long	x86_capability[NCAPINTS];
-	char	x86_vendor_id[16];
-	char	x86_model_id[64];
-	int 	x86_cache_size;  /* in KB - valid for CPUS which support this
-				    call  */
-	int 	x86_cache_alignment;	/* In bytes */
-	char	fdiv_bug;
-	char	f00f_bug;
-	char	coma_bug;
-	char	pad0;
-	int	x86_power;
-	unsigned long loops_per_jiffy;
-#ifdef CONFIG_SMP
-	cpumask_t llc_shared_map;	/* cpus sharing the last level cache */
-#endif
-	unsigned char x86_max_cores;	/* cpuid returned max cores value */
-	unsigned char apicid;
-	unsigned short x86_clflush_size;
-#ifdef CONFIG_SMP
-	unsigned char booted_cores;	/* number of cores as seen by OS */
-	__u8 phys_proc_id; 		/* Physical processor id. */
-	__u8 cpu_core_id;  		/* Core id */
-	__u8 cpu_index;			/* index into per_cpu list */
-#endif
-} __attribute__((__aligned__(SMP_CACHE_BYTES)));
-
-#define X86_VENDOR_INTEL 0
-#define X86_VENDOR_CYRIX 1
-#define X86_VENDOR_AMD 2
-#define X86_VENDOR_UMC 3
-#define X86_VENDOR_NEXGEN 4
-#define X86_VENDOR_CENTAUR 5
-#define X86_VENDOR_TRANSMETA 7
-#define X86_VENDOR_NSC 8
-#define X86_VENDOR_NUM 9
-#define X86_VENDOR_UNKNOWN 0xff
-
-/*
- * capabilities of CPUs
- */
-
-extern struct cpuinfo_x86 boot_cpu_data;
-extern struct cpuinfo_x86 new_cpu_data;
-#ifndef CONFIG_X86_NO_TSS
-extern struct tss_struct doublefault_tss;
-DECLARE_PER_CPU(struct tss_struct, init_tss);
-#endif
-
-#ifdef CONFIG_SMP
-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
-#define cpu_data(cpu)		per_cpu(cpu_info, cpu)
-#define current_cpu_data	cpu_data(smp_processor_id())
-#else
-#define cpu_data(cpu)		boot_cpu_data
-#define current_cpu_data	boot_cpu_data
-#endif
-
-/*
- * the following now lives in the per cpu area:
- * extern	int cpu_llc_id[NR_CPUS];
- */
-DECLARE_PER_CPU(u8, cpu_llc_id);
-extern char ignore_fpu_irq;
-
-void __init cpu_detect(struct cpuinfo_x86 *c);
-
-extern void identify_boot_cpu(void);
-extern void identify_secondary_cpu(struct cpuinfo_x86 *);
-extern void print_cpu_info(struct cpuinfo_x86 *);
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern unsigned short num_cache_leaves;
-
-#ifdef CONFIG_X86_HT
-extern void detect_ht(struct cpuinfo_x86 *c);
-#else
-static inline void detect_ht(struct cpuinfo_x86 *c) {}
-#endif
-
-static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
-			     unsigned int *ecx, unsigned int *edx)
-{
-	/* ecx is often an input as well as an output. */
-	__asm__(XEN_CPUID
-		: "=a" (*eax),
-		  "=b" (*ebx),
-		  "=c" (*ecx),
-		  "=d" (*edx)
-		: "0" (*eax), "2" (*ecx));
-}
-
-#define load_cr3(pgdir) write_cr3(__pa(pgdir))
-
-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-extern unsigned long mmu_cr4_features;
-
-static inline void set_in_cr4 (unsigned long mask)
-{
-	unsigned cr4;
-	mmu_cr4_features |= mask;
-	cr4 = read_cr4();
-	cr4 |= mask;
-	write_cr4(cr4);
-}
-
-static inline void clear_in_cr4 (unsigned long mask)
-{
-	unsigned cr4;
-	mmu_cr4_features &= ~mask;
-	cr4 = read_cr4();
-	cr4 &= ~mask;
-	write_cr4(cr4);
-}
-
-/* Stop speculative execution */
-static inline void sync_core(void)
-{
-	int tmp;
-	asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
-}
-
-static inline void __monitor(const void *eax, unsigned long ecx,
-		unsigned long edx)
-{
-	/* "monitor %eax,%ecx,%edx;" */
-	asm volatile(
-		".byte 0x0f,0x01,0xc8;"
-		: :"a" (eax), "c" (ecx), "d"(edx));
-}
-
-static inline void __mwait(unsigned long eax, unsigned long ecx)
-{
-	/* "mwait %eax,%ecx;" */
-	asm volatile(
-		".byte 0x0f,0x01,0xc9;"
-		: :"a" (eax), "c" (ecx));
-}
-
-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
-
-/* from system description table in BIOS.  Mostly for MCA use, but
-others may find it useful. */
-extern unsigned int machine_id;
-extern unsigned int machine_submodel_id;
-extern unsigned int BIOS_revision;
-extern unsigned int mca_pentium_flag;
-
-/* Boot loader type from the setup header */
-extern int bootloader_type;
-
-/*
- * User space process size: 3GB (default).
- */
-#define TASK_SIZE	(PAGE_OFFSET)
-
-/* This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define TASK_UNMAPPED_BASE	(PAGE_ALIGN(TASK_SIZE / 3))
-
-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
-extern void hard_disable_TSC(void);
-extern void disable_TSC(void);
-extern void hard_enable_TSC(void);
-
-/*
- * Size of io_bitmap.
- */
-#define IO_BITMAP_BITS  65536
-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
-#ifndef CONFIG_X86_NO_TSS
-#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
-#endif
-#define INVALID_IO_BITMAP_OFFSET 0x8000
-#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
-
-struct i387_fsave_struct {
-	long	cwd;
-	long	swd;
-	long	twd;
-	long	fip;
-	long	fcs;
-	long	foo;
-	long	fos;
-	long	st_space[20];	/* 8*10 bytes for each FP-reg = 80 bytes */
-	long	status;		/* software status information */
-};
-
-struct i387_fxsave_struct {
-	unsigned short	cwd;
-	unsigned short	swd;
-	unsigned short	twd;
-	unsigned short	fop;
-	long	fip;
-	long	fcs;
-	long	foo;
-	long	fos;
-	long	mxcsr;
-	long	mxcsr_mask;
-	long	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
-	long	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
-	long	padding[56];
-} __attribute__ ((aligned (16)));
-
-struct i387_soft_struct {
-	long	cwd;
-	long	swd;
-	long	twd;
-	long	fip;
-	long	fcs;
-	long	foo;
-	long	fos;
-	long	st_space[20];	/* 8*10 bytes for each FP-reg = 80 bytes */
-	unsigned char	ftop, changed, lookahead, no_update, rm, alimit;
-	struct info	*info;
-	unsigned long	entry_eip;
-};
-
-union i387_union {
-	struct i387_fsave_struct	fsave;
-	struct i387_fxsave_struct	fxsave;
-	struct i387_soft_struct soft;
-};
-
-typedef struct {
-	unsigned long seg;
-} mm_segment_t;
-
-struct thread_struct;
-
-#ifndef CONFIG_X86_NO_TSS
-/* This is the TSS defined by the hardware. */
-struct i386_hw_tss {
-	unsigned short	back_link,__blh;
-	unsigned long	esp0;
-	unsigned short	ss0,__ss0h;
-	unsigned long	esp1;
-	unsigned short	ss1,__ss1h;	/* ss1 is used to cache MSR_IA32_SYSENTER_CS */
-	unsigned long	esp2;
-	unsigned short	ss2,__ss2h;
-	unsigned long	__cr3;
-	unsigned long	eip;
-	unsigned long	eflags;
-	unsigned long	eax,ecx,edx,ebx;
-	unsigned long	esp;
-	unsigned long	ebp;
-	unsigned long	esi;
-	unsigned long	edi;
-	unsigned short	es, __esh;
-	unsigned short	cs, __csh;
-	unsigned short	ss, __ssh;
-	unsigned short	ds, __dsh;
-	unsigned short	fs, __fsh;
-	unsigned short	gs, __gsh;
-	unsigned short	ldt, __ldth;
-	unsigned short	trace, io_bitmap_base;
-} __attribute__((packed));
-
-struct tss_struct {
-	struct i386_hw_tss x86_tss;
-
-	/*
-	 * The extra 1 is there because the CPU will access an
-	 * additional byte beyond the end of the IO permission
-	 * bitmap. The extra byte must be all 1 bits, and must
-	 * be within the limit.
-	 */
-	unsigned long	io_bitmap[IO_BITMAP_LONGS + 1];
-	/*
-	 * Cache the current maximum and the last task that used the bitmap:
-	 */
-	unsigned long io_bitmap_max;
-	struct thread_struct *io_bitmap_owner;
-	/*
-	 * pads the TSS to be cacheline-aligned (size is 0x100)
-	 */
-	unsigned long __cacheline_filler[35];
-	/*
-	 * .. and then another 0x100 bytes for emergency kernel stack
-	 */
-	unsigned long stack[64];
-} __attribute__((packed));
-#endif
-
-#define ARCH_MIN_TASKALIGN	16
-
-struct thread_struct {
-/* cached TLS descriptors. */
-	struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
-	unsigned long	esp0;
-	unsigned long	sysenter_cs;
-	unsigned long	eip;
-	unsigned long	esp;
-	unsigned long	fs;
-	unsigned long	gs;
-/* Hardware debugging registers */
-	unsigned long	debugreg[8];  /* %%db0-7 debug registers */
-/* fault info */
-	unsigned long	cr2, trap_no, error_code;
-/* floating point info */
-	union i387_union	i387;
-/* virtual 86 mode info */
-	struct vm86_struct __user * vm86_info;
-	unsigned long		screen_bitmap;
-	unsigned long		v86flags, v86mask, saved_esp0;
-	unsigned int		saved_fs, saved_gs;
-/* IO permissions */
-	unsigned long	*io_bitmap_ptr;
- 	unsigned long	iopl;
-/* max allowed port in the bitmap, in bytes: */
-	unsigned long	io_bitmap_max;
-};
-
-#define INIT_THREAD  {							\
-	.esp0 = sizeof(init_stack) + (long)&init_stack,			\
-	.vm86_info = NULL,						\
-	.sysenter_cs = __KERNEL_CS,					\
-	.io_bitmap_ptr = NULL,						\
-	.fs = __KERNEL_PERCPU,						\
-}
-
-/*
- * Note that the .io_bitmap member must be extra-big. This is because
- * the CPU will access an additional byte beyond the end of the IO
- * permission bitmap. The extra byte must be all 1 bits, and must
- * be within the limit.
- */
-#define INIT_TSS  {							\
-	.x86_tss = {							\
-		.esp0		= sizeof(init_stack) + (long)&init_stack, \
-		.ss0		= __KERNEL_DS,				\
-		.ss1		= __KERNEL_CS,				\
-		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,		\
-	 },								\
-	.io_bitmap	= { [ 0 ... IO_BITMAP_LONGS] = ~0 },		\
-}
-
-#define start_thread(regs, new_eip, new_esp) do {		\
-	__asm__("movl %0,%%gs": :"r" (0));			\
-	regs->xfs = 0;						\
-	set_fs(USER_DS);					\
-	regs->xds = __USER_DS;					\
-	regs->xes = __USER_DS;					\
-	regs->xss = __USER_DS;					\
-	regs->xcs = __USER_CS;					\
-	regs->eip = new_eip;					\
-	regs->esp = new_esp;					\
-} while (0)
-
-/* Forward declaration, a strange C thing */
-struct task_struct;
-struct mm_struct;
-
-/* Free all resources held by a thread. */
-extern void release_thread(struct task_struct *);
-
-/* Prepare to copy thread state - unlazy all lazy status */
-extern void prepare_to_copy(struct task_struct *tsk);
-
-/*
- * create a kernel thread without removing it from tasklists
- */
-extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
-
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
-
-unsigned long get_wchan(struct task_struct *p);
-
-#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
-#define KSTK_TOP(info)                                                 \
-({                                                                     \
-       unsigned long *__ptr = (unsigned long *)(info);                 \
-       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
-})
-
-/*
- * The below -8 is to reserve 8 bytes on top of the ring0 stack.
- * This is necessary to guarantee that the entire "struct pt_regs"
- * is accessable even if the CPU haven't stored the SS/ESP registers
- * on the stack (interrupt gate does not save these registers
- * when switching to the same priv ring).
- * Therefore beware: accessing the xss/esp fields of the
- * "struct pt_regs" is possible, but they may contain the
- * completely wrong values.
- */
-#define task_pt_regs(task)                                             \
-({                                                                     \
-       struct pt_regs *__regs__;                                       \
-       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
-       __regs__ - 1;                                                   \
-})
-
-#define KSTK_EIP(task) (task_pt_regs(task)->eip)
-#define KSTK_ESP(task) (task_pt_regs(task)->esp)
-
-
-struct microcode_header {
-	unsigned int hdrver;
-	unsigned int rev;
-	unsigned int date;
-	unsigned int sig;
-	unsigned int cksum;
-	unsigned int ldrver;
-	unsigned int pf;
-	unsigned int datasize;
-	unsigned int totalsize;
-	unsigned int reserved[3];
-};
-
-struct microcode {
-	struct microcode_header hdr;
-	unsigned int bits[0];
-};
-
-typedef struct microcode microcode_t;
-typedef struct microcode_header microcode_header_t;
-
-/* microcode format is extended from prescott processors */
-struct extended_signature {
-	unsigned int sig;
-	unsigned int pf;
-	unsigned int cksum;
-};
-
-struct extended_sigtable {
-	unsigned int count;
-	unsigned int cksum;
-	unsigned int reserved[3];
-	struct extended_signature sigs[0];
-};
-
-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static inline void rep_nop(void)
-{
-	__asm__ __volatile__("rep;nop": : :"memory");
-}
-
-#define cpu_relax()	rep_nop()
-
-#ifndef CONFIG_X86_NO_TSS
-static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
-{
-	tss->x86_tss.esp0 = thread->esp0;
-	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
-	if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
-		tss->x86_tss.ss1 = thread->sysenter_cs;
-		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-	}
-}
-#else
-#define xen_load_esp0(tss, thread) do { \
-	if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
-		BUG(); \
-} while (0)
-#endif
-
-
-static inline unsigned long xen_get_debugreg(int regno)
-{
-	return HYPERVISOR_get_debugreg(regno);
-}
-
-static inline void xen_set_debugreg(int regno, unsigned long value)
-{
-	WARN_ON(HYPERVISOR_set_debugreg(regno, value));
-}
-
-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static inline void xen_set_iopl_mask(unsigned mask)
-{
-	struct physdev_set_iopl set_iopl;
-
-	/* Force the change at ring 0. */
-	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
-	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
-}
-
-
-#define paravirt_enabled() 0
-#define __cpuid xen_cpuid
-
-#define load_esp0 xen_load_esp0
-
-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register)				\
-	(var) = xen_get_debugreg(register)
-#define set_debugreg(value, register)				\
-	xen_set_debugreg(register, value)
-
-#define set_iopl_mask xen_set_iopl_mask
-
-/*
- * Generic CPUID function
- * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
- * resulting in stale register contents being returned.
- */
-static inline void cpuid(unsigned int op,
-			 unsigned int *eax, unsigned int *ebx,
-			 unsigned int *ecx, unsigned int *edx)
-{
-	*eax = op;
-	*ecx = 0;
-	__cpuid(eax, ebx, ecx, edx);
-}
-
-/* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(unsigned int op, int count,
-			       unsigned int *eax, unsigned int *ebx,
-			       unsigned int *ecx, unsigned int *edx)
-{
-	*eax = op;
-	*ecx = count;
-	__cpuid(eax, ebx, ecx, edx);
-}
-
-/*
- * CPUID functions returning a single datum
- */
-static inline unsigned int cpuid_eax(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return eax;
-}
-static inline unsigned int cpuid_ebx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return ebx;
-}
-static inline unsigned int cpuid_ecx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return ecx;
-}
-static inline unsigned int cpuid_edx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return edx;
-}
-
-/* generic versions from gas */
-#define GENERIC_NOP1	".byte 0x90\n"
-#define GENERIC_NOP2    	".byte 0x89,0xf6\n"
-#define GENERIC_NOP3        ".byte 0x8d,0x76,0x00\n"
-#define GENERIC_NOP4        ".byte 0x8d,0x74,0x26,0x00\n"
-#define GENERIC_NOP5        GENERIC_NOP1 GENERIC_NOP4
-#define GENERIC_NOP6	".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
-#define GENERIC_NOP7	".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
-#define GENERIC_NOP8	GENERIC_NOP1 GENERIC_NOP7
-
-/* Opteron nops */
-#define K8_NOP1 GENERIC_NOP1
-#define K8_NOP2	".byte 0x66,0x90\n"
-#define K8_NOP3	".byte 0x66,0x66,0x90\n"
-#define K8_NOP4	".byte 0x66,0x66,0x66,0x90\n"
-#define K8_NOP5	K8_NOP3 K8_NOP2
-#define K8_NOP6	K8_NOP3 K8_NOP3
-#define K8_NOP7	K8_NOP4 K8_NOP3
-#define K8_NOP8	K8_NOP4 K8_NOP4
-
-/* K7 nops */
-/* uses eax dependencies (arbitary choice) */
-#define K7_NOP1  GENERIC_NOP1
-#define K7_NOP2	".byte 0x8b,0xc0\n"
-#define K7_NOP3	".byte 0x8d,0x04,0x20\n"
-#define K7_NOP4	".byte 0x8d,0x44,0x20,0x00\n"
-#define K7_NOP5	K7_NOP4 ASM_NOP1
-#define K7_NOP6	".byte 0x8d,0x80,0,0,0,0\n"
-#define K7_NOP7        ".byte 0x8D,0x04,0x05,0,0,0,0\n"
-#define K7_NOP8        K7_NOP7 ASM_NOP1
-
-/* P6 nops */
-/* uses eax dependencies (Intel-recommended choice) */
-#define P6_NOP1	GENERIC_NOP1
-#define P6_NOP2	".byte 0x66,0x90\n"
-#define P6_NOP3	".byte 0x0f,0x1f,0x00\n"
-#define P6_NOP4	".byte 0x0f,0x1f,0x40,0\n"
-#define P6_NOP5	".byte 0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP6	".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP7	".byte 0x0f,0x1f,0x80,0,0,0,0\n"
-#define P6_NOP8	".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
-
-#ifdef CONFIG_MK8
-#define ASM_NOP1 K8_NOP1
-#define ASM_NOP2 K8_NOP2
-#define ASM_NOP3 K8_NOP3
-#define ASM_NOP4 K8_NOP4
-#define ASM_NOP5 K8_NOP5
-#define ASM_NOP6 K8_NOP6
-#define ASM_NOP7 K8_NOP7
-#define ASM_NOP8 K8_NOP8
-#elif defined(CONFIG_MK7)
-#define ASM_NOP1 K7_NOP1
-#define ASM_NOP2 K7_NOP2
-#define ASM_NOP3 K7_NOP3
-#define ASM_NOP4 K7_NOP4
-#define ASM_NOP5 K7_NOP5
-#define ASM_NOP6 K7_NOP6
-#define ASM_NOP7 K7_NOP7
-#define ASM_NOP8 K7_NOP8
-#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \
-      defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \
-      defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4)
-#define ASM_NOP1 P6_NOP1
-#define ASM_NOP2 P6_NOP2
-#define ASM_NOP3 P6_NOP3
-#define ASM_NOP4 P6_NOP4
-#define ASM_NOP5 P6_NOP5
-#define ASM_NOP6 P6_NOP6
-#define ASM_NOP7 P6_NOP7
-#define ASM_NOP8 P6_NOP8
-#else
-#define ASM_NOP1 GENERIC_NOP1
-#define ASM_NOP2 GENERIC_NOP2
-#define ASM_NOP3 GENERIC_NOP3
-#define ASM_NOP4 GENERIC_NOP4
-#define ASM_NOP5 GENERIC_NOP5
-#define ASM_NOP6 GENERIC_NOP6
-#define ASM_NOP7 GENERIC_NOP7
-#define ASM_NOP8 GENERIC_NOP8
-#endif
-
-#define ASM_NOP_MAX 8
-
-/* Prefetch instructions for Pentium III and AMD Athlon */
-/* It's not worth to care about 3dnow! prefetches for the K6
-   because they are microcoded there and very slow.
-   However we don't do prefetches for pre XP Athlons currently
-   That should be fixed. */
-#define ARCH_HAS_PREFETCH
-static inline void prefetch(const void *x)
-{
-	alternative_input(ASM_NOP4,
-			  "prefetchnta (%1)",
-			  X86_FEATURE_XMM,
-			  "r" (x));
-}
-
-#define ARCH_HAS_PREFETCH
-#define ARCH_HAS_PREFETCHW
-#define ARCH_HAS_SPINLOCK_PREFETCH
-
-/* 3dnow! prefetch to get an exclusive cache line. Useful for
-   spinlocks to avoid one state transition in the cache coherency protocol. */
-static inline void prefetchw(const void *x)
-{
-	alternative_input(ASM_NOP4,
-			  "prefetchw (%1)",
-			  X86_FEATURE_3DNOW,
-			  "r" (x));
-}
-#define spin_lock_prefetch(x)	prefetchw(x)
-
-extern void select_idle_routine(const struct cpuinfo_x86 *c);
-
-#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
-
-extern unsigned long boot_option_idle_override;
-extern void enable_sep_cpu(void);
-extern int sysenter_setup(void);
-
-/* Defined in head.S */
-extern struct Xgt_desc_struct early_gdt_descr;
-
-extern void cpu_set_gdt(int);
-extern void switch_to_new_gdt(void);
-extern void cpu_init(void);
-extern void init_gdt(int cpu);
-
-extern int force_mwait;
-
-#endif /* __ASM_I386_PROCESSOR_H */
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/processor_64.h	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,461 +0,0 @@
-/*
- * include/asm-x86_64/processor.h
- *
- * Copyright (C) 1994 Linus Torvalds
- */
-
-#ifndef __ASM_X86_64_PROCESSOR_H
-#define __ASM_X86_64_PROCESSOR_H
-
-#include <asm/segment.h>
-#include <asm/page.h>
-#include <asm/types.h>
-#include <asm/sigcontext.h>
-#include <asm/cpufeature.h>
-#include <linux/threads.h>
-#include <asm/msr.h>
-#include <asm/current.h>
-#include <asm/system.h>
-#include <asm/mmsegment.h>
-#include <asm/percpu.h>
-#include <linux/personality.h>
-#include <linux/cpumask.h>
-#include <asm/processor-flags.h>
-
-#define TF_MASK		0x00000100
-#define IF_MASK		0x00000200
-#define IOPL_MASK	0x00003000
-#define NT_MASK		0x00004000
-#define VM_MASK		0x00020000
-#define AC_MASK		0x00040000
-#define VIF_MASK	0x00080000	/* virtual interrupt flag */
-#define VIP_MASK	0x00100000	/* virtual interrupt pending */
-#define ID_MASK		0x00200000
-
-#define desc_empty(desc) \
-               (!((desc)->a | (desc)->b))
-
-#define desc_equal(desc1, desc2) \
-               (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
-
-/*
- * Default implementation of macro that returns current
- * instruction pointer ("program counter").
- */
-#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
-
-/*
- *  CPU type and hardware bug flags. Kept separately for each CPU.
- */
-
-struct cpuinfo_x86 {
-	__u8	x86;		/* CPU family */
-	__u8	x86_vendor;	/* CPU vendor */
-	__u8	x86_model;
-	__u8	x86_mask;
-	int	cpuid_level;	/* Maximum supported CPUID level, -1=no CPUID */
-	__u32	x86_capability[NCAPINTS];
-	char	x86_vendor_id[16];
-	char	x86_model_id[64];
-	int 	x86_cache_size;  /* in KB */
-	int	x86_clflush_size;
-	int	x86_cache_alignment;
-	int	x86_tlbsize;	/* number of 4K pages in DTLB/ITLB combined(in pages)*/
-        __u8    x86_virt_bits, x86_phys_bits;
-	__u8	x86_max_cores;	/* cpuid returned max cores value */
-        __u32   x86_power;
-	__u32   extended_cpuid_level;	/* Max extended CPUID function supported */
-	unsigned long loops_per_jiffy;
-#ifdef CONFIG_SMP
-	cpumask_t llc_shared_map;	/* cpus sharing the last level cache */
-#endif
-	__u8	apicid;
-#ifdef CONFIG_SMP
-	__u8	booted_cores;	/* number of cores as seen by OS */
-	__u8	phys_proc_id;	/* Physical Processor id. */
-	__u8	cpu_core_id;	/* Core id. */
-	__u8	cpu_index;	/* index into per_cpu list */
-#endif
-} ____cacheline_aligned;
-
-#define X86_VENDOR_INTEL 0
-#define X86_VENDOR_CYRIX 1
-#define X86_VENDOR_AMD 2
-#define X86_VENDOR_UMC 3
-#define X86_VENDOR_NEXGEN 4
-#define X86_VENDOR_CENTAUR 5
-#define X86_VENDOR_TRANSMETA 7
-#define X86_VENDOR_NUM 8
-#define X86_VENDOR_UNKNOWN 0xff
-
-#ifdef CONFIG_SMP
-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
-#define cpu_data(cpu)		per_cpu(cpu_info, cpu)
-#define current_cpu_data	cpu_data(smp_processor_id())
-#else
-#define cpu_data(cpu)		boot_cpu_data
-#define current_cpu_data	boot_cpu_data
-#endif
-
-extern char ignore_irq13;
-
-extern void identify_cpu(struct cpuinfo_x86 *);
-extern void print_cpu_info(struct cpuinfo_x86 *);
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern unsigned short num_cache_leaves;
-
-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-extern unsigned long mmu_cr4_features;
-
-static inline void set_in_cr4 (unsigned long mask)
-{
-	mmu_cr4_features |= mask;
-	__asm__("movq %%cr4,%%rax\n\t"
-		"orq %0,%%rax\n\t"
-		"movq %%rax,%%cr4\n"
-		: : "irg" (mask)
-		:"ax");
-}
-
-static inline void clear_in_cr4 (unsigned long mask)
-{
-	mmu_cr4_features &= ~mask;
-	__asm__("movq %%cr4,%%rax\n\t"
-		"andq %0,%%rax\n\t"
-		"movq %%rax,%%cr4\n"
-		: : "irg" (~mask)
-		:"ax");
-}
-
-
-/*
- * User space process size. 47bits minus one guard page.
- */
-#define TASK_SIZE64	(0x800000000000UL - 4096)
-
-/* This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
-
-#define TASK_SIZE 		(test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
-#define TASK_SIZE_OF(child) 	((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
-
-#define TASK_UNMAPPED_BASE	PAGE_ALIGN(TASK_SIZE/3)
-
-/*
- * Size of io_bitmap.
- */
-#define IO_BITMAP_BITS  65536
-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
-#ifndef CONFIG_X86_NO_TSS
-#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
-#endif
-#define INVALID_IO_BITMAP_OFFSET 0x8000
-
-struct i387_fxsave_struct {
-	u16	cwd;
-	u16	swd;
-	u16	twd;
-	u16	fop;
-	u64	rip;
-	u64	rdp;
-	u32	mxcsr;
-	u32	mxcsr_mask;
-	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
-	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
-	u32	padding[24];
-} __attribute__ ((aligned (16)));
-
-union i387_union {
-	struct i387_fxsave_struct	fxsave;
-};
-
-#ifndef CONFIG_X86_NO_TSS
-struct tss_struct {
-	u32 reserved1;
-	u64 rsp0;
-	u64 rsp1;
-	u64 rsp2;
-	u64 reserved2;
-	u64 ist[7];
-	u32 reserved3;
-	u32 reserved4;
-	u16 reserved5;
-	u16 io_bitmap_base;
-	/*
-	 * The extra 1 is there because the CPU will access an
-	 * additional byte beyond the end of the IO permission
-	 * bitmap. The extra byte must be all 1 bits, and must
-	 * be within the limit. Thus we have:
-	 *
-	 * 128 bytes, the bitmap itself, for ports 0..0x3ff
-	 * 8 bytes, for an extra "long" of ~0UL
-	 */
-	unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
-} __attribute__((packed)) ____cacheline_aligned;
-
-DECLARE_PER_CPU(struct tss_struct,init_tss);
-#endif
-
-
-extern struct cpuinfo_x86 boot_cpu_data;
-#ifndef CONFIG_X86_NO_TSS
-/* Save the original ist values for checking stack pointers during debugging */
-struct orig_ist {
-	unsigned long ist[7];
-};
-DECLARE_PER_CPU(struct orig_ist, orig_ist);
-#endif
-
-#ifdef CONFIG_X86_VSMP
-#define ARCH_MIN_TASKALIGN	(1 << INTERNODE_CACHE_SHIFT)
-#define ARCH_MIN_MMSTRUCT_ALIGN	(1 << INTERNODE_CACHE_SHIFT)
-#else
-#define ARCH_MIN_TASKALIGN	16
-#define ARCH_MIN_MMSTRUCT_ALIGN	0
-#endif
-
-struct thread_struct {
-	unsigned long	rsp0;
-	unsigned long	rsp;
-	unsigned long 	userrsp;	/* Copy from PDA */
-	unsigned long	fs;
-	unsigned long	gs;
-	unsigned short	es, ds, fsindex, gsindex;
-/* Hardware debugging registers */
-	unsigned long	debugreg0;
-	unsigned long	debugreg1;
-	unsigned long	debugreg2;
-	unsigned long	debugreg3;
-	unsigned long	debugreg6;
-	unsigned long	debugreg7;
-/* fault info */
-	unsigned long	cr2, trap_no, error_code;
-/* floating point info */
-	union i387_union	i387  __attribute__((aligned(16)));
-/* IO permissions. the bitmap could be moved into the GDT, that would make
-   switch faster for a limited number of ioperm using tasks. -AK */
-	int		ioperm;
-	unsigned long	*io_bitmap_ptr;
-	unsigned io_bitmap_max;
-/* cached TLS descriptors. */
-	u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
-	unsigned int	iopl;
-} __attribute__((aligned(16)));
-
-#define INIT_THREAD  { \
-	.rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
-}
-
-#ifndef CONFIG_X86_NO_TSS
-#define INIT_TSS  { \
-	.rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
-}
-#endif
-
-#define INIT_MMAP \
-{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
-#define start_thread(regs,new_rip,new_rsp) do { \
-	asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0));	 \
-	load_gs_index(0);							\
-	(regs)->rip = (new_rip);						 \
-	(regs)->rsp = (new_rsp);						 \
-	write_pda(oldrsp, (new_rsp));						 \
-	(regs)->cs = __USER_CS;							 \
-	(regs)->ss = __USER_DS;							 \
-	(regs)->eflags = 0x200;							 \
-	set_fs(USER_DS);							 \
-} while(0)
-
-#define get_debugreg(var, register)				\
-	var = HYPERVISOR_get_debugreg(register)
-#define set_debugreg(value, register) do {			\
-	if (HYPERVISOR_set_debugreg(register, value))		\
-		BUG();						\
-} while (0)
-
-struct task_struct;
-struct mm_struct;
-
-/* Free all resources held by a thread. */
-extern void release_thread(struct task_struct *);
-
-/* Prepare to copy thread state - unlazy all lazy status */
-extern void prepare_to_copy(struct task_struct *tsk);
-
-/*
- * create a kernel thread without removing it from tasklists
- */
-extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
-
-/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
- */
-#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
-
-extern unsigned long get_wchan(struct task_struct *p);
-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
-#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
-#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
-
-
-struct microcode_header {
-	unsigned int hdrver;
-	unsigned int rev;
-	unsigned int date;
-	unsigned int sig;
-	unsigned int cksum;
-	unsigned int ldrver;
-	unsigned int pf;
-	unsigned int datasize;
-	unsigned int totalsize;
-	unsigned int reserved[3];
-};
-
-struct microcode {
-	struct microcode_header hdr;
-	unsigned int bits[0];
-};
-
-typedef struct microcode microcode_t;
-typedef struct microcode_header microcode_header_t;
-
-/* microcode format is extended from prescott processors */
-struct extended_signature {
-	unsigned int sig;
-	unsigned int pf;
-	unsigned int cksum;
-};
-
-struct extended_sigtable {
-	unsigned int count;
-	unsigned int cksum;
-	unsigned int reserved[3];
-	struct extended_signature sigs[0];
-};
-
-
-#if defined(CONFIG_MPSC) || defined(CONFIG_MCORE2)
-#define ASM_NOP1 P6_NOP1
-#define ASM_NOP2 P6_NOP2
-#define ASM_NOP3 P6_NOP3
-#define ASM_NOP4 P6_NOP4
-#define ASM_NOP5 P6_NOP5
-#define ASM_NOP6 P6_NOP6
-#define ASM_NOP7 P6_NOP7
-#define ASM_NOP8 P6_NOP8
-#else
-#define ASM_NOP1 K8_NOP1
-#define ASM_NOP2 K8_NOP2
-#define ASM_NOP3 K8_NOP3
-#define ASM_NOP4 K8_NOP4
-#define ASM_NOP5 K8_NOP5
-#define ASM_NOP6 K8_NOP6
-#define ASM_NOP7 K8_NOP7
-#define ASM_NOP8 K8_NOP8
-#endif
-
-/* Opteron nops */
-#define K8_NOP1 ".byte 0x90\n"
-#define K8_NOP2	".byte 0x66,0x90\n"
-#define K8_NOP3	".byte 0x66,0x66,0x90\n"
-#define K8_NOP4	".byte 0x66,0x66,0x66,0x90\n"
-#define K8_NOP5	K8_NOP3 K8_NOP2
-#define K8_NOP6	K8_NOP3 K8_NOP3
-#define K8_NOP7	K8_NOP4 K8_NOP3
-#define K8_NOP8	K8_NOP4 K8_NOP4
-
-/* P6 nops */
-/* uses eax dependencies (Intel-recommended choice) */
-#define P6_NOP1	".byte 0x90\n"
-#define P6_NOP2	".byte 0x66,0x90\n"
-#define P6_NOP3	".byte 0x0f,0x1f,0x00\n"
-#define P6_NOP4	".byte 0x0f,0x1f,0x40,0\n"
-#define P6_NOP5	".byte 0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP6	".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP7	".byte 0x0f,0x1f,0x80,0,0,0,0\n"
-#define P6_NOP8	".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
-
-#define ASM_NOP_MAX 8
-
-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static inline void rep_nop(void)
-{
-	__asm__ __volatile__("rep;nop": : :"memory");
-}
-
-/* Stop speculative execution */
-static inline void sync_core(void)
-{
-	int tmp;
-	asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
-}
-
-#define ARCH_HAS_PREFETCHW 1
-static inline void prefetchw(void *x)
-{
-	alternative_input("prefetcht0 (%1)",
-			  "prefetchw (%1)",
-			  X86_FEATURE_3DNOW,
-			  "r" (x));
-}
-
-#define ARCH_HAS_SPINLOCK_PREFETCH 1
-
-#define spin_lock_prefetch(x)  prefetchw(x)
-
-#define cpu_relax()   rep_nop()
-
-static inline void __monitor(const void *eax, unsigned long ecx,
-		unsigned long edx)
-{
-	/* "monitor %eax,%ecx,%edx;" */
-	asm volatile(
-		".byte 0x0f,0x01,0xc8;"
-		: :"a" (eax), "c" (ecx), "d"(edx));
-}
-
-static inline void __mwait(unsigned long eax, unsigned long ecx)
-{
-	/* "mwait %eax,%ecx;" */
-	asm volatile(
-		".byte 0x0f,0x01,0xc9;"
-		: :"a" (eax), "c" (ecx));
-}
-
-static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
-{
-	/* "mwait %eax,%ecx;" */
-	asm volatile(
-		"sti; .byte 0x0f,0x01,0xc9;"
-		: :"a" (eax), "c" (ecx));
-}
-
-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
-
-#define stack_current() \
-({								\
-	struct thread_info *ti;					\
-	asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));	\
-	ti->task;					\
-})
-
-#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
-
-extern unsigned long boot_option_idle_override;
-/* Boot loader type from the setup header */
-extern int bootloader_type;
-
-#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
-
-#endif /* __ASM_X86_64_PROCESSOR_H */
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/smp_32.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/smp_32.h	2010-03-24 15:10:37.000000000 +0100
@@ -1,56 +1,51 @@
 #ifndef __ASM_SMP_H
 #define __ASM_SMP_H

+#ifndef __ASSEMBLY__
+#include <linux/cpumask.h>
+#include <linux/init.h>
+
 /*
  * We need the APIC definitions automatically as part of 'smp.h'
  */
-#ifndef __ASSEMBLY__
-#include <linux/kernel.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
+#ifdef CONFIG_X86_LOCAL_APIC
+# include <asm/mpspec.h>
+# include <asm/apic.h>
+# ifdef CONFIG_X86_IO_APIC
+#  include <asm/io_apic.h>
+# endif
 #endif

-#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
-#include <linux/bitops.h>
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#ifdef CONFIG_X86_IO_APIC
-#include <asm/io_apic.h>
-#endif
-#endif
+#define cpu_callout_map cpu_possible_map
+#define cpu_callin_map cpu_possible_map

-#define BAD_APICID 0xFFu
-#ifdef CONFIG_SMP
-#ifndef __ASSEMBLY__
+extern int smp_num_siblings;
+extern unsigned int num_processors;

-/*
- * Private routines/data
- */
-
 extern void smp_alloc_memory(void);
-extern int pic_mode;
-extern int smp_num_siblings;
-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
-DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+extern void lock_ipi_call_lock(void);
+extern void unlock_ipi_call_lock(void);

 extern void (*mtrr_hook) (void);
 extern void zap_low_mappings (void);
-extern void lock_ipi_call_lock(void);
-extern void unlock_ipi_call_lock(void);

-#define MAX_APICID 256
-extern u8 __initdata x86_cpu_to_apicid_init[];
-extern void *x86_cpu_to_apicid_ptr;
+DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
+DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+DECLARE_PER_CPU(u8, cpu_llc_id);
 DECLARE_PER_CPU(u8, x86_cpu_to_apicid);

-#define cpu_physical_id(cpu)	per_cpu(x86_cpu_to_apicid, cpu)
-
 #ifdef CONFIG_HOTPLUG_CPU
 extern void cpu_exit_clear(void);
 extern void cpu_uninit(void);
 #endif

+#ifdef CONFIG_SMP
+
 #ifndef CONFIG_XEN
+
+/* Globals due to paravirt */
+extern void set_cpu_sibling_map(int cpu);
+
 struct smp_ops
 {
 	void (*smp_prepare_boot_cpu)(void);
@@ -104,11 +99,7 @@ void native_smp_prepare_cpus(unsigned in
 int native_cpu_up(unsigned int cpunum);
 void native_smp_cpus_done(unsigned int max_cpus);

-#define startup_ipi_hook(phys_apicid, start_eip, start_esp) 		\
-do { } while (0)
-
-#else
-
+#else /* CONFIG_XEN */

 void xen_smp_send_stop(void);
 void xen_smp_send_reschedule(int cpu);
@@ -120,7 +111,12 @@ int xen_smp_call_function_mask(cpumask_t
 #define smp_send_reschedule	xen_smp_send_reschedule
 #define smp_call_function_mask	xen_smp_call_function_mask

-#endif
+extern void prefill_possible_map(void);
+
+#endif /* CONFIG_XEN */
+
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);

 /*
  * This function is needed by all SMP systems. It must _always_ be valid
@@ -130,64 +126,49 @@ int xen_smp_call_function_mask(cpumask_t
 DECLARE_PER_CPU(int, cpu_number);
 #define raw_smp_processor_id() (x86_read_percpu(cpu_number))

-extern cpumask_t cpu_possible_map;
-#define cpu_callin_map cpu_possible_map
+#define cpu_physical_id(cpu)	per_cpu(x86_cpu_to_apicid, cpu)
+
+#define safe_smp_processor_id() smp_processor_id()

 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
 static inline int num_booting_cpus(void)
 {
-	return cpus_weight(cpu_possible_map);
+	return cpus_weight(cpu_callout_map);
 }

-#define safe_smp_processor_id() smp_processor_id()
-extern int __cpu_disable(void);
-extern void __cpu_die(unsigned int cpu);
-extern void prefill_possible_map(void);
-extern unsigned int num_processors;
-
-#endif /* !__ASSEMBLY__ */
-
 #else /* CONFIG_SMP */

 #define safe_smp_processor_id()		0
 #define cpu_physical_id(cpu)		boot_cpu_physical_apicid

-#define NO_PROC_ID		0xFF		/* No processor magic marker */
-
-#endif /* CONFIG_SMP */
-
-#ifndef __ASSEMBLY__
+#endif /* !CONFIG_SMP */

 #ifdef CONFIG_X86_LOCAL_APIC

-#ifdef APIC_DEFINITION
+static __inline int logical_smp_processor_id(void)
+{
+	/* we don't want to mark this access volatile - bad code generation */
+	return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+}
+
+# ifdef APIC_DEFINITION
 extern int hard_smp_processor_id(void);
-#else
-#include <mach_apicdef.h>
+# else
+#  include <mach_apicdef.h>
 static inline int hard_smp_processor_id(void)
 {
 	/* we don't want to mark this access volatile - bad code generation */
-	return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
+	return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
 }
-#endif /* APIC_DEFINITION */
+# endif /* APIC_DEFINITION */

 #else /* CONFIG_X86_LOCAL_APIC */

-#ifndef CONFIG_SMP
-#define hard_smp_processor_id()		0
-#endif
+# ifndef CONFIG_SMP
+#  define hard_smp_processor_id()	0
+# endif

 #endif /* CONFIG_X86_LOCAL_APIC */

-extern u8 apicid_2_node[];
-
-#ifdef CONFIG_X86_LOCAL_APIC
-static __inline int logical_smp_processor_id(void)
-{
-	/* we don't want to mark this access volatile - bad code generation */
-	return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
-}
-#endif
-#endif
-
+#endif /* !__ASSEMBLY__ */
 #endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/smp_64.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/smp_64.h	2010-03-24 15:10:37.000000000 +0100
@@ -1,139 +1,103 @@
 #ifndef __ASM_SMP_H
 #define __ASM_SMP_H

-/*
- * We need the APIC definitions automatically as part of 'smp.h'
- */
-#include <linux/threads.h>
 #include <linux/cpumask.h>
-#include <linux/bitops.h>
 #include <linux/init.h>
-extern int disable_apic;

 #ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
+/*
+ * We need the APIC definitions automatically as part of 'smp.h'
+ */
 #include <asm/apic.h>
 #ifdef CONFIG_X86_IO_APIC
 #include <asm/io_apic.h>
 #endif
-#include <asm/thread_info.h>
+#include <asm/mpspec.h>
 #endif
-
-#ifdef CONFIG_SMP
-
 #include <asm/pda.h>
+#include <asm/thread_info.h>

-struct pt_regs;
-
-extern cpumask_t cpu_present_mask;
-extern cpumask_t cpu_possible_map;
-extern cpumask_t cpu_online_map;
 extern cpumask_t cpu_initialized;

-/*
- * Private routines/data
- */
-
+extern int smp_num_siblings;
+extern unsigned int num_processors;
+
 extern void smp_alloc_memory(void);
-extern volatile unsigned long smp_invalidate_needed;
 extern void lock_ipi_call_lock(void);
 extern void unlock_ipi_call_lock(void);
-extern int smp_num_siblings;
-extern void smp_send_reschedule(int cpu);
+
 extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
 				  void *info, int wait);

-/*
- * cpu_sibling_map and cpu_core_map now live
- * in the per cpu area
- *
- * extern cpumask_t cpu_sibling_map[NR_CPUS];
- * extern cpumask_t cpu_core_map[NR_CPUS];
- */
 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
-DECLARE_PER_CPU(u8, cpu_llc_id);
-
-#define SMP_TRAMPOLINE_BASE 0x6000
+DECLARE_PER_CPU(u16, cpu_llc_id);
+DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
+DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);

-/*
- * On x86 all CPUs are mapped 1:1 to the APIC space.
- * This simplifies scheduling and IPI sending and
- * compresses data structures.
- */
-
-static inline int num_booting_cpus(void)
+#ifdef CONFIG_X86_LOCAL_APIC
+static inline int cpu_present_to_apicid(int mps_cpu)
 {
-	return cpus_weight(cpu_possible_map);
+	if (cpu_present(mps_cpu))
+		return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+	else
+		return BAD_APICID;
 }
+#endif

-#define raw_smp_processor_id() read_pda(cpunumber)
+#ifdef CONFIG_SMP
+
+#define SMP_TRAMPOLINE_BASE 0x6000

 extern int __cpu_disable(void);
 extern void __cpu_die(unsigned int cpu);
 extern void prefill_possible_map(void);
-extern unsigned num_processors;
 extern unsigned __cpuinitdata disabled_cpus;

-#define NO_PROC_ID		0xFF		/* No processor magic marker */
-
-#endif /* CONFIG_SMP */
+#define raw_smp_processor_id()	read_pda(cpunumber)
+#define cpu_physical_id(cpu)	per_cpu(x86_cpu_to_apicid, cpu)

-#define safe_smp_processor_id()		smp_processor_id()
-
-#ifdef CONFIG_X86_LOCAL_APIC
-static inline int hard_smp_processor_id(void)
-{
-	/* we don't want to mark this access volatile - bad code generation */
-	return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
-}
-#endif
+#define stack_smp_processor_id()					\
+	({								\
+	struct thread_info *ti;						\
+	__asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));	\
+	ti->cpu;							\
+})

 /*
- * Some lowlevel functions might want to know about
- * the real APIC ID <-> CPU # mapping.
+ * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
+ * scheduling and IPI sending and compresses data structures.
  */
-extern u8 __initdata x86_cpu_to_apicid_init[];
-extern void *x86_cpu_to_apicid_ptr;
-DECLARE_PER_CPU(u8, x86_cpu_to_apicid);	/* physical ID */
-extern u8 bios_cpu_apicid[];
-
-#ifdef CONFIG_X86_LOCAL_APIC
-static inline int cpu_present_to_apicid(int mps_cpu)
+static inline int num_booting_cpus(void)
 {
-	if (mps_cpu < NR_CPUS)
-		return (int)bios_cpu_apicid[mps_cpu];
-	else
-		return BAD_APICID;
+	return cpus_weight(cpu_possible_map);
 }
-#endif

-#ifndef CONFIG_SMP
+extern void smp_send_reschedule(int cpu);
+
+#else /* CONFIG_SMP */
+
+extern unsigned int boot_cpu_id;
+#define cpu_physical_id(cpu)	boot_cpu_id
 #define stack_smp_processor_id() 0
-#define cpu_logical_map(x) (x)
-#else
-#include <asm/thread_info.h>
-#define stack_smp_processor_id() \
-({ 								\
-	struct thread_info *ti;					\
-	__asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));	\
-	ti->cpu;						\
-})
-#endif
+
+#endif /* !CONFIG_SMP */
+
+#define safe_smp_processor_id()		smp_processor_id()

 #ifdef CONFIG_X86_LOCAL_APIC
 static __inline int logical_smp_processor_id(void)
 {
 	/* we don't want to mark this access volatile - bad code generation */
-	return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
+	return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+}
+
+static inline int hard_smp_processor_id(void)
+{
+	/* we don't want to mark this access volatile - bad code generation */
+	return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
 }
 #endif

-#ifdef CONFIG_SMP
-#define cpu_physical_id(cpu)		per_cpu(x86_cpu_to_apicid, cpu)
-#else
-extern unsigned int boot_cpu_id;
-#define cpu_physical_id(cpu)		boot_cpu_id
-#endif /* !CONFIG_SMP */
 #endif

--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/spinlock.h	2010-03-24 15:10:37.000000000 +0100
@@ -0,0 +1,341 @@
+#ifndef _X86_SPINLOCK_H_
+#define _X86_SPINLOCK_H_
+
+#include <asm/atomic.h>
+#include <asm/rwlock.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
+
+/*
+ * Your basic SMP spinlocks, allowing only a single CPU anywhere
+ *
+ * Simple spin lock operations.  There are two variants, one clears IRQ's
+ * on the local processor, one does not.
+ *
+ * These are fair FIFO ticket locks; the ticket width (8 or 16 bits) is
+ * chosen at build time via TICKET_SHIFT.
+ *
+ * (the type definitions are in asm/spinlock_types.h)
+ */
+
+#ifdef CONFIG_X86_32
+# define LOCK_PTR_REG "a"
+# define REG_PTR_MODE "k"
+#else
+# define LOCK_PTR_REG "D"
+# define REG_PTR_MODE "q"
+#endif
+
+#if defined(CONFIG_X86_32) && \
+	(defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
+/*
+ * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
+ * (PPro errata 66, 92)
+ */
+# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
+#else
+# define UNLOCK_LOCK_PREFIX
+#endif
+
+#include <asm/irqflags.h>
+
+int xen_spinlock_init(unsigned int cpu);
+void xen_spinlock_cleanup(unsigned int cpu);
+bool xen_spin_wait(raw_spinlock_t *, unsigned int *token,
+		   unsigned int flags);
+unsigned int xen_spin_adjust(const raw_spinlock_t *, unsigned int token);
+void xen_spin_kick(raw_spinlock_t *, unsigned int token);
+
+/*
+ * Ticket locks are conceptually two parts, one indicating the current head of
+ * the queue, and the other indicating the current tail. The lock is acquired
+ * by atomically noting the tail and incrementing it by one (thus adding
+ * ourself to the queue and noting our position), then waiting until the head
+ * becomes equal to the initial value of the tail.
+ *
+ * We use an xadd covering *both* parts of the lock, to increment the tail and
+ * also load the position of the head, which takes care of memory ordering
+ * issues and should be optimal for the uncontended case. Note the tail must be
+ * in the high part, because a wide xadd increment of the low part would carry
+ * up and contaminate the high part.
+ *
+ * With fewer than 2^8 possible tickets, we can use x86's partial registers to
+ * save some instructions and make the code more elegant. There really isn't
+ * much between them in performance though, especially as locks are out of line.
+ */
+#if TICKET_SHIFT == 8
+#define __raw_spin_lock_preamble \
+	asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
+	    "cmpb %h0, %b0\n\t" \
+	    "sete %1" \
+	    : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
+	    : "0" (0x0100) \
+	    : "memory", "cc")
+#define __raw_spin_lock_body \
+	asm("1:\t" \
+	    "cmpb %h0, %b0\n\t" \
+	    "je 2f\n\t" \
+	    "decl %1\n\t" \
+	    "jz 2f\n\t" \
+	    "rep ; nop\n\t" \
+	    "movb %2, %b0\n\t" \
+	    /* don't need lfence here, because loads are in-order */ \
+	    "jmp 1b\n" \
+	    "2:" \
+	    : "+Q" (token), "+g" (count) \
+	    : "m" (lock->slock) \
+	    : "memory", "cc")
+#define __raw_spin_unlock_body \
+	asm(UNLOCK_LOCK_PREFIX "incb %2\n\t" \
+	    "movzwl %2, %0\n\t" \
+	    "cmpb %h0, %b0\n\t" \
+	    "setne %1" \
+	    : "=&Q" (token), "=qm" (kick), "+m" (lock->slock) \
+	    : \
+	    : "memory", "cc")
+
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+	int tmp, new;
+
+	asm("movzwl %2, %0\n\t"
+	    "cmpb %h0, %b0\n\t"
+	    "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
+	    "jne 1f\n\t"
+	    LOCK_PREFIX "cmpxchgw %w1, %2\n\t"
+	    "1:\t"
+	    "sete %b1\n\t"
+	    "movzbl %b1, %0\n\t"
+	    : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
+	    :
+	    : "memory", "cc");
+
+	return tmp;
+}
+#elif TICKET_SHIFT == 16
+#define __raw_spin_lock_preamble \
+	do { \
+		unsigned int tmp; \
+		asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
+		    "shldl $16, %0, %3\n\t" \
+		    "cmpw %w3, %w0\n\t" \
+		    "sete %1" \
+		    : "=&r" (token), "=qm" (free), "+m" (lock->slock), \
+		      "=&g" (tmp) \
+		    : "0" (0x00010000) \
+		    : "memory", "cc"); \
+	} while (0)
+#define __raw_spin_lock_body \
+	do { \
+		unsigned int tmp; \
+		asm("shldl $16, %0, %2\n" \
+		    "1:\t" \
+		    "cmpw %w2, %w0\n\t" \
+		    "je 2f\n\t" \
+		    "decl %1\n\t" \
+		    "jz 2f\n\t" \
+		    "rep ; nop\n\t" \
+		    "movw %3, %w0\n\t" \
+		    /* don't need lfence here, because loads are in-order */ \
+		    "jmp 1b\n" \
+		    "2:" \
+		    : "+r" (token), "+g" (count), "=&g" (tmp) \
+		    : "m" (lock->slock) \
+		    : "memory", "cc"); \
+	} while (0)
+#define __raw_spin_unlock_body \
+	do { \
+		unsigned int tmp; \
+		asm(UNLOCK_LOCK_PREFIX "incw %2\n\t" \
+		    "movl %2, %0\n\t" \
+		    "shldl $16, %0, %3\n\t" \
+		    "cmpw %w3, %w0\n\t" \
+		    "setne %1" \
+		    : "=&r" (token), "=qm" (kick), "+m" (lock->slock), \
+		      "=&r" (tmp) \
+		    : \
+		    : "memory", "cc"); \
+	} while (0)
+
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+	int tmp;
+	int new;
+
+	asm("movl %2, %0\n\t"
+	    "movl %0, %1\n\t"
+	    "roll $16, %0\n\t"
+	    "cmpl %0, %1\n\t"
+	    "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
+	    "jne 1f\n\t"
+	    LOCK_PREFIX "cmpxchgl %1, %2\n"
+	    "1:\t"
+	    "sete %b1\n\t"
+	    "movzbl %b1, %0\n\t"
+	    : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
+	    :
+	    : "memory", "cc");
+
+	return tmp;
+}
+#endif
+
+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+{
+	int tmp = *(volatile signed int *)(&(lock)->slock);
+
+	return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
+}
+
+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+{
+	int tmp = *(volatile signed int *)(&(lock)->slock);
+
+	return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
+}
+
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+	unsigned int token, count;
+	unsigned int flags = __raw_local_irq_save();
+	bool free;
+
+	__raw_spin_lock_preamble;
+	if (likely(free)) {
+		raw_local_irq_restore(flags);
+		return;
+	}
+	token = xen_spin_adjust(lock, token);
+	raw_local_irq_restore(flags);
+	do {
+		count = 1 << 10;
+		__raw_spin_lock_body;
+	} while (unlikely(!count) && !xen_spin_wait(lock, &token, flags));
+}
+
+static inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
+					 unsigned long flags)
+{
+	unsigned int token, count;
+	bool free;
+
+	__raw_spin_lock_preamble;
+	if (likely(free))
+		return;
+	token = xen_spin_adjust(lock, token);
+	do {
+		count = 1 << 10;
+		__raw_spin_lock_body;
+	} while (unlikely(!count) && !xen_spin_wait(lock, &token, flags));
+}
+
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+	unsigned int token;
+	bool kick;
+
+	__raw_spin_unlock_body;
+	if (kick)
+		xen_spin_kick(lock, token);
+}
+
+#ifndef XEN_SPINLOCK_SOURCE
+#undef __raw_spin_lock_preamble
+#undef __raw_spin_lock_body
+#undef __raw_spin_unlock_body
+#endif
+
+static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
+{
+	while (__raw_spin_is_locked(lock))
+		cpu_relax();
+}
+
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! it is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get an
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ *
+ * On x86, we implement read-write locks as a 32-bit counter
+ * with the high bit (sign) being the "contended" bit.
+ */
+
+/**
+ * read_can_lock - would read_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int __raw_read_can_lock(raw_rwlock_t *lock)
+{
+	return (int)(lock)->lock > 0;
+}
+
+/**
+ * write_can_lock - would write_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int __raw_write_can_lock(raw_rwlock_t *lock)
+{
+	return (lock)->lock == RW_LOCK_BIAS;
+}
+
+static inline void __raw_read_lock(raw_rwlock_t *rw)
+{
+	asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
+		     "jns 1f\n"
+		     "call __read_lock_failed\n\t"
+		     "1:\n"
+		     ::LOCK_PTR_REG (rw) : "memory");
+}
+
+static inline void __raw_write_lock(raw_rwlock_t *rw)
+{
+	asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
+		     "jz 1f\n"
+		     "call __write_lock_failed\n\t"
+		     "1:\n"
+		     ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
+}
+
+static inline int __raw_read_trylock(raw_rwlock_t *lock)
+{
+	atomic_t *count = (atomic_t *)lock;
+
+	atomic_dec(count);
+	if (atomic_read(count) >= 0)
+		return 1;
+	atomic_inc(count);
+	return 0;
+}
+
+static inline int __raw_write_trylock(raw_rwlock_t *lock)
+{
+	atomic_t *count = (atomic_t *)lock;
+
+	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
+		return 1;
+	atomic_add(RW_LOCK_BIAS, count);
+	return 0;
+}
+
+static inline void __raw_read_unlock(raw_rwlock_t *rw)
+{
+	asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
+}
+
+static inline void __raw_write_unlock(raw_rwlock_t *rw)
+{
+	asm volatile(LOCK_PREFIX "addl %1, %0"
+		     : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
+}
+
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
+#endif
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/spinlock_types.h	2010-03-24 15:10:37.000000000 +0100
@@ -0,0 +1,36 @@
+#ifndef __ASM_SPINLOCK_TYPES_H
+#define __ASM_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+#include <asm/types.h>
+
+typedef union {
+	unsigned int slock;
+	struct {
+/*
+ * On Xen we support a single level of interrupt re-enabling per lock. Hence
+ * we can have twice as many outstanding tickets. Thus the cut-off for using
+ * byte register pairs must be at half the number of CPUs.
+ */
+#if 2 * CONFIG_NR_CPUS < 256
+# define TICKET_SHIFT 8
+		u8 cur, seq;
+#else
+# define TICKET_SHIFT 16
+		u16 cur, seq;
+#endif
+	};
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 0 }
+
+typedef struct {
+	unsigned int lock;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED		{ RW_LOCK_BIAS }
+
+#endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/system.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/system.h	2010-03-24 15:10:37.000000000 +0100
@@ -1,5 +1,393 @@
+#ifndef _ASM_X86_SYSTEM_H_
+#define _ASM_X86_SYSTEM_H_
+
+#include <asm/asm.h>
+#include <asm/segment.h>
+#include <asm/cpufeature.h>
+#include <asm/cmpxchg.h>
+#include <asm/nops.h>
+#include <asm/hypervisor.h>
+
+#include <linux/kernel.h>
+#include <linux/irqflags.h>
+
+/* entries in ARCH_DLINFO: */
+#ifdef CONFIG_IA32_EMULATION
+# define AT_VECTOR_SIZE_ARCH 2
+#else
+# define AT_VECTOR_SIZE_ARCH 1
+#endif
+
+#ifdef CONFIG_X86_32
+
+struct task_struct; /* one of the stranger aspects of C forward declarations */
+struct task_struct *__switch_to(struct task_struct *prev,
+				struct task_struct *next);
+
+/*
+ * Saving eflags is important. It switches not only IOPL between tasks,
+ * it also protects other tasks from NT leaking through sysenter etc.
+ */
+#define switch_to(prev, next, last) do {				\
+	unsigned long esi, edi;						\
+	asm volatile("pushfl\n\t"		/* Save flags */	\
+		     "pushl %%ebp\n\t"					\
+		     "movl %%esp,%0\n\t"	/* save ESP */		\
+		     "movl %5,%%esp\n\t"	/* restore ESP */	\
+		     "movl $1f,%1\n\t"		/* save EIP */		\
+		     "pushl %6\n\t"		/* restore EIP */	\
+		     "jmp __switch_to\n"				\
+		     "1:\t"						\
+		     "popl %%ebp\n\t"					\
+		     "popfl"						\
+		     :"=m" (prev->thread.sp), "=m" (prev->thread.ip),	\
+		      "=a" (last), "=S" (esi), "=D" (edi)		\
+		     :"m" (next->thread.sp), "m" (next->thread.ip),	\
+		      "2" (prev), "d" (next));				\
+} while (0)
+
+/*
+ * disable hlt during certain critical i/o operations
+ */
+#define HAVE_DISABLE_HLT
+#else
+#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
+#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
+
+/* frame pointer must be last for get_wchan */
+#define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
+
+#define __EXTRA_CLOBBER  \
+	, "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
+	  "r12", "r13", "r14", "r15"
+
+/* Save and restore flags so that a set NT flag cannot leak between tasks */
+#define switch_to(prev, next, last) \
+	asm volatile(SAVE_CONTEXT						    \
+	     "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */	  \
+	     "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */	  \
+	     "call __switch_to\n\t"					  \
+	     ".globl thread_return\n"					  \
+	     "thread_return:\n\t"					  \
+	     "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"			  \
+	     "movq %P[thread_info](%%rsi),%%r8\n\t"			  \
+	     LOCK_PREFIX "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"	  \
+	     "movq %%rax,%%rdi\n\t" 					  \
+	     "jc   ret_from_fork\n\t"					  \
+	     RESTORE_CONTEXT						  \
+	     : "=a" (last)					  	  \
+	     : [next] "S" (next), [prev] "D" (prev),			  \
+	       [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
+	       [ti_flags] "i" (offsetof(struct thread_info, flags)),	  \
+	       [tif_fork] "i" (TIF_FORK),			  	  \
+	       [thread_info] "i" (offsetof(struct task_struct, stack)),   \
+	       [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent))  \
+	     : "memory", "cc" __EXTRA_CLOBBER)
+#endif
+
+#ifdef __KERNEL__
+#define _set_base(addr, base) do { unsigned long __pr; \
+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
+	"rorl $16,%%edx\n\t" \
+	"movb %%dl,%2\n\t" \
+	"movb %%dh,%3" \
+	:"=&d" (__pr) \
+	:"m" (*((addr)+2)), \
+	 "m" (*((addr)+4)), \
+	 "m" (*((addr)+7)), \
+	 "0" (base) \
+	); } while (0)
+
+#define _set_limit(addr, limit) do { unsigned long __lr; \
+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
+	"rorl $16,%%edx\n\t" \
+	"movb %2,%%dh\n\t" \
+	"andb $0xf0,%%dh\n\t" \
+	"orb %%dh,%%dl\n\t" \
+	"movb %%dl,%2" \
+	:"=&d" (__lr) \
+	:"m" (*(addr)), \
+	 "m" (*((addr)+6)), \
+	 "0" (limit) \
+	); } while (0)
+
+#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
+#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
+
+extern void load_gs_index(unsigned);
+
+/*
+ * Load a segment. Fall back on loading the zero
+ * segment if something goes wrong..
+ */
+#define loadsegment(seg, value)			\
+	asm volatile("\n"			\
+		"1:\t"				\
+		"movl %k0,%%" #seg "\n"		\
+		"2:\n"				\
+		".section .fixup,\"ax\"\n"	\
+		"3:\t"				\
+		"movl %k1, %%" #seg "\n\t"	\
+		"jmp 2b\n"			\
+		".previous\n"			\
+		_ASM_EXTABLE(1b,3b)		\
+		: :"r" (value), "r" (0))
+
+
+/*
+ * Save a segment register away
+ */
+#define savesegment(seg, value) \
+	asm volatile("mov %%" #seg ",%0":"=rm" (value))
+
+static inline unsigned long get_limit(unsigned long segment)
+{
+	unsigned long __limit;
+	__asm__("lsll %1,%0"
+		:"=r" (__limit):"r" (segment));
+	return __limit+1;
+}
+
+static inline void xen_clts(void)
+{
+	HYPERVISOR_fpu_taskswitch(0);
+}
+
+static inline void xen_stts(void)
+{
+	HYPERVISOR_fpu_taskswitch(1);
+}
+
+/*
+ * Volatile isn't enough to prevent the compiler from reordering the
+ * read/write functions for the control registers and messing everything up.
+ * A memory clobber would solve the problem, but would prevent reordering of
+ * all loads stores around it, which can hurt performance. Solution is to
+ * use a variable and mimic reads and writes to it to enforce serialization
+ */
+static unsigned long __force_order;
+
+static inline unsigned long xen_read_cr0(void)
+{
+	unsigned long val;
+	asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
+	return val;
+}
+
+static inline void xen_write_cr0(unsigned long val)
+{
+	asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
+}
+
+#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
+#define xen_write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
+
+static inline unsigned long xen_read_cr3(void)
+{
+	unsigned long val;
+	asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
+#ifdef CONFIG_X86_32
+	return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
+#else
+	return machine_to_phys(val);
+#endif
+}
+
+static inline void xen_write_cr3(unsigned long val)
+{
+#ifdef CONFIG_X86_32
+	val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
+#else
+	val = phys_to_machine(val);
+#endif
+	asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
+}
+
+static inline unsigned long xen_read_cr4(void)
+{
+	unsigned long val;
+	asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
+	return val;
+}
+
+#define xen_read_cr4_safe() xen_read_cr4()
+
+static inline void xen_write_cr4(unsigned long val)
+{
+	asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long xen_read_cr8(void)
+{
+	return 0;
+}
+
+static inline void xen_write_cr8(unsigned long val)
+{
+	BUG_ON(val);
+}
+#endif
+
+static inline void xen_wbinvd(void)
+{
+	asm volatile("wbinvd": : :"memory");
+}
+#define read_cr0()	(xen_read_cr0())
+#define write_cr0(x)	(xen_write_cr0(x))
+#define read_cr2()	(xen_read_cr2())
+#define write_cr2(x)	(xen_write_cr2(x))
+#define read_cr3()	(xen_read_cr3())
+#define write_cr3(x)	(xen_write_cr3(x))
+#define read_cr4()	(xen_read_cr4())
+#define read_cr4_safe()	(xen_read_cr4_safe())
+#define write_cr4(x)	(xen_write_cr4(x))
+#define wbinvd()	(xen_wbinvd())
+#ifdef CONFIG_X86_64
+#define read_cr8()	(xen_read_cr8())
+#define write_cr8(x)	(xen_write_cr8(x))
+#endif
+
+/* Clear and set the 'TS' bit respectively */
+#define clts()		(xen_clts())
+#define stts()		(xen_stts())
+
+#endif /* __KERNEL__ */
+
+static inline void clflush(volatile void *__p)
+{
+	asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
+}
+
+#define nop() __asm__ __volatile__ ("nop")
+
+void disable_hlt(void);
+void enable_hlt(void);
+
+extern int es7000_plat;
+void cpu_idle_wait(void);
+
+extern unsigned long arch_align_stack(unsigned long sp);
+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+
+void default_idle(void);
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
 #ifdef CONFIG_X86_32
-# include "system_32.h"
+/*
+ * For now, "wmb()" doesn't actually do anything, as all
+ * Intel CPU's follow what Intel calls a *Processor Order*,
+ * in which all writes are seen in the program order even
+ * outside the CPU.
+ *
+ * I expect future Intel CPU's to have a weaker ordering,
+ * but I'd also expect them to finally get their act together
+ * and add some real memory barriers if so.
+ *
+ * Some non intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ */
+#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
+#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
+#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+#else
+#define mb() 	asm volatile("mfence":::"memory")
+#define rmb()	asm volatile("lfence":::"memory")
+#define wmb()	asm volatile("sfence" ::: "memory")
+#endif
+
+/**
+ * read_barrier_depends - Flush all pending reads that subsequent reads
+ * depend on.
+ *
+ * No data-dependent reads from memory-like regions are ever reordered
+ * over this barrier.  All reads preceding this primitive are guaranteed
+ * to access memory (but not necessarily other CPUs' caches) before any
+ * reads following this primitive that depend on the data returned by
+ * any of the preceding reads.  This primitive is much lighter weight than
+ * rmb() on most CPUs, and is never heavier weight than is
+ * rmb().
+ *
+ * These ordering constraints are respected by both the local CPU
+ * and the compiler.
+ *
+ * Ordering is not guaranteed by anything other than these primitives,
+ * not even by data dependencies.  See the documentation for
+ * memory_barrier() for examples and URLs to more information.
+ *
+ * For example, the following code would force ordering (the initial
+ * value of "a" is zero, "b" is one, and "p" is "&a"):
+ *
+ * <programlisting>
+ *	CPU 0				CPU 1
+ *
+ *	b = 2;
+ *	memory_barrier();
+ *	p = &b;				q = p;
+ *					read_barrier_depends();
+ *					d = *q;
+ * </programlisting>
+ *
+ * because the read of "*q" depends on the read of "p" and these
+ * two reads are separated by a read_barrier_depends().  However,
+ * the following code, with the same initial values for "a" and "b":
+ *
+ * <programlisting>
+ *	CPU 0				CPU 1
+ *
+ *	a = 2;
+ *	memory_barrier();
+ *	b = 3;				y = b;
+ *					read_barrier_depends();
+ *					x = a;
+ * </programlisting>
+ *
+ * does not enforce ordering, since there is no data dependency between
+ * the read of "a" and the read of "b".  Therefore, on some CPUs, such
+ * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
+ * in cases like this where there are no data dependencies.
+ **/
+
+#define read_barrier_depends()	do { } while (0)
+
+#ifdef CONFIG_SMP
+#define smp_mb()	mb()
+#ifdef CONFIG_X86_PPRO_FENCE
+# define smp_rmb()	rmb()
 #else
-# include "system_64.h"
+# define smp_rmb()	barrier()
+#endif
+#ifdef CONFIG_X86_OOSTORE
+# define smp_wmb() 	wmb()
+#else
+# define smp_wmb()	barrier()
+#endif
+#define smp_read_barrier_depends()	read_barrier_depends()
+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
+#else
+#define smp_mb()	barrier()
+#define smp_rmb()	barrier()
+#define smp_wmb()	barrier()
+#define smp_read_barrier_depends()	do { } while (0)
+#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#endif
+
+/*
+ * Stop RDTSC speculation. This is needed when you need to use RDTSC
+ * (or get_cycles or vread that possibly accesses the TSC) in a defined
+ * code region.
+ *
+ * (Could use an alternative three way for this if there was one.)
+ */
+static inline void rdtsc_barrier(void)
+{
+	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
+	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+}
+
 #endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/system_32.h	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,312 +0,0 @@
-#ifndef __ASM_SYSTEM_H
-#define __ASM_SYSTEM_H
-
-#include <linux/kernel.h>
-#include <asm/segment.h>
-#include <asm/cpufeature.h>
-#include <asm/cmpxchg.h>
-#include <asm/synch_bitops.h>
-#include <asm/hypervisor.h>
-
-#ifdef __KERNEL__
-#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
-
-struct task_struct;	/* one of the stranger aspects of C forward declarations.. */
-extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
-
-/*
- * Saving eflags is important. It switches not only IOPL between tasks,
- * it also protects other tasks from NT leaking through sysenter etc.
- */
-#define switch_to(prev,next,last) do {					\
-	unsigned long esi,edi;						\
-	asm volatile("pushfl\n\t"		/* Save flags */	\
-		     "pushl %%ebp\n\t"					\
-		     "movl %%esp,%0\n\t"	/* save ESP */		\
-		     "movl %5,%%esp\n\t"	/* restore ESP */	\
-		     "movl $1f,%1\n\t"		/* save EIP */		\
-		     "pushl %6\n\t"		/* restore EIP */	\
-		     "jmp __switch_to\n"				\
-		     "1:\t"						\
-		     "popl %%ebp\n\t"					\
-		     "popfl"						\
-		     :"=m" (prev->thread.esp),"=m" (prev->thread.eip),	\
-		      "=a" (last),"=S" (esi),"=D" (edi)			\
-		     :"m" (next->thread.esp),"m" (next->thread.eip),	\
-		      "2" (prev), "d" (next));				\
-} while (0)
-
-#define _set_base(addr,base) do { unsigned long __pr; \
-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
-	"rorl $16,%%edx\n\t" \
-	"movb %%dl,%2\n\t" \
-	"movb %%dh,%3" \
-	:"=&d" (__pr) \
-	:"m" (*((addr)+2)), \
-	 "m" (*((addr)+4)), \
-	 "m" (*((addr)+7)), \
-         "0" (base) \
-        ); } while(0)
-
-#define _set_limit(addr,limit) do { unsigned long __lr; \
-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
-	"rorl $16,%%edx\n\t" \
-	"movb %2,%%dh\n\t" \
-	"andb $0xf0,%%dh\n\t" \
-	"orb %%dh,%%dl\n\t" \
-	"movb %%dl,%2" \
-	:"=&d" (__lr) \
-	:"m" (*(addr)), \
-	 "m" (*((addr)+6)), \
-	 "0" (limit) \
-        ); } while(0)
-
-#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
-#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
-
-/*
- * Load a segment. Fall back on loading the zero
- * segment if something goes wrong..
- */
-#define loadsegment(seg,value)			\
-	asm volatile("\n"			\
-		"1:\t"				\
-		"mov %0,%%" #seg "\n"		\
-		"2:\n"				\
-		".section .fixup,\"ax\"\n"	\
-		"3:\t"				\
-		"pushl $0\n\t"			\
-		"popl %%" #seg "\n\t"		\
-		"jmp 2b\n"			\
-		".previous\n"			\
-		".section __ex_table,\"a\"\n\t"	\
-		".align 4\n\t"			\
-		".long 1b,3b\n"			\
-		".previous"			\
-		: :"rm" (value))
-
-/*
- * Save a segment register away
- */
-#define savesegment(seg, value) \
-	asm volatile("mov %%" #seg ",%0":"=rm" (value))
-
-static inline void xen_clts(void)
-{
-	HYPERVISOR_fpu_taskswitch(0);
-}
-
-static inline unsigned long xen_read_cr0(void)
-{
-	unsigned long val;
-	asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
-	return val;
-}
-
-static inline void xen_write_cr0(unsigned long val)
-{
-	asm volatile("movl %0,%%cr0": :"r" (val));
-}
-
-#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
-
-static inline void xen_write_cr2(unsigned long val)
-{
-	asm volatile("movl %0,%%cr2": :"r" (val));
-}
-
-static inline unsigned long xen_read_cr3(void)
-{
-	unsigned long val;
-	asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
-	return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
-}
-
-static inline void xen_write_cr3(unsigned long val)
-{
-	val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
-	asm volatile("movl %0,%%cr3": :"r" (val));
-}
-
-static inline unsigned long xen_read_cr4(void)
-{
-	unsigned long val;
-	asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
-	return val;
-}
-
-static inline unsigned long xen_read_cr4_safe(void)
-{
-	unsigned long val;
-	/* This could fault if %cr4 does not exist */
-	asm volatile("1: movl %%cr4, %0		\n"
-		"2:				\n"
-		".section __ex_table,\"a\"	\n"
-		".long 1b,2b			\n"
-		".previous			\n"
-		: "=r" (val): "0" (0));
-	return val;
-}
-
-static inline void xen_write_cr4(unsigned long val)
-{
-	asm volatile("movl %0,%%cr4": :"r" (val));
-}
-
-static inline void xen_wbinvd(void)
-{
-	asm volatile("wbinvd": : :"memory");
-}
-
-static inline void clflush(volatile void *__p)
-{
-	asm volatile("clflush %0" : "+m" (*(char __force *)__p));
-}
-
-#define read_cr0()	(xen_read_cr0())
-#define write_cr0(x)	(xen_write_cr0(x))
-#define read_cr2()	(xen_read_cr2())
-#define write_cr2(x)	(xen_write_cr2(x))
-#define read_cr3()	(xen_read_cr3())
-#define write_cr3(x)	(xen_write_cr3(x))
-#define read_cr4()	(xen_read_cr4())
-#define read_cr4_safe()	(xen_read_cr4_safe())
-#define write_cr4(x)	(xen_write_cr4(x))
-#define wbinvd()	(xen_wbinvd())
-
-/* Clear the 'TS' bit */
-#define clts()		(xen_clts())
-
-/* Set the 'TS' bit */
-#define stts() (HYPERVISOR_fpu_taskswitch(1))
-
-#endif	/* __KERNEL__ */
-
-static inline unsigned long get_limit(unsigned long segment)
-{
-	unsigned long __limit;
-	__asm__("lsll %1,%0"
-		:"=r" (__limit):"r" (segment));
-	return __limit+1;
-}
-
-#define nop() __asm__ __volatile__ ("nop")
-
-/*
- * Force strict CPU ordering.
- * And yes, this is required on UP too when we're talking
- * to devices.
- *
- * For now, "wmb()" doesn't actually do anything, as all
- * Intel CPU's follow what Intel calls a *Processor Order*,
- * in which all writes are seen in the program order even
- * outside the CPU.
- *
- * I expect future Intel CPU's to have a weaker ordering,
- * but I'd also expect them to finally get their act together
- * and add some real memory barriers if so.
- *
- * Some non intel clones support out of order store. wmb() ceases to be a
- * nop for these.
- */
-
-
-#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
-#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
-#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
-
-/**
- * read_barrier_depends - Flush all pending reads that subsequents reads
- * depend on.
- *
- * No data-dependent reads from memory-like regions are ever reordered
- * over this barrier.  All reads preceding this primitive are guaranteed
- * to access memory (but not necessarily other CPUs' caches) before any
- * reads following this primitive that depend on the data return by
- * any of the preceding reads.  This primitive is much lighter weight than
- * rmb() on most CPUs, and is never heavier weight than is
- * rmb().
- *
- * These ordering constraints are respected by both the local CPU
- * and the compiler.
- *
- * Ordering is not guaranteed by anything other than these primitives,
- * not even by data dependencies.  See the documentation for
- * memory_barrier() for examples and URLs to more information.
- *
- * For example, the following code would force ordering (the initial
- * value of "a" is zero, "b" is one, and "p" is "&a"):
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	b = 2;
- *	memory_barrier();
- *	p = &b;				q = p;
- *					read_barrier_depends();
- *					d = *q;
- * </programlisting>
- *
- * because the read of "*q" depends on the read of "p" and these
- * two reads are separated by a read_barrier_depends().  However,
- * the following code, with the same initial values for "a" and "b":
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	a = 2;
- *	memory_barrier();
- *	b = 3;				y = b;
- *					read_barrier_depends();
- *					x = a;
- * </programlisting>
- *
- * does not enforce ordering, since there is no data dependency between
- * the read of "a" and the read of "b".  Therefore, on some CPUs, such
- * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
- * in cases like this where there are no data dependencies.
- **/
-
-#define read_barrier_depends()	do { } while(0)
-
-#ifdef CONFIG_SMP
-#define smp_mb()	mb()
-#ifdef CONFIG_X86_PPRO_FENCE
-# define smp_rmb()	rmb()
-#else
-# define smp_rmb()	barrier()
-#endif
-#ifdef CONFIG_X86_OOSTORE
-# define smp_wmb() 	wmb()
-#else
-# define smp_wmb()	barrier()
-#endif
-#define smp_read_barrier_depends()	read_barrier_depends()
-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
-#else
-#define smp_mb()	barrier()
-#define smp_rmb()	barrier()
-#define smp_wmb()	barrier()
-#define smp_read_barrier_depends()	do { } while(0)
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
-#endif
-
-#include <linux/irqflags.h>
-
-/*
- * disable hlt during certain critical i/o operations
- */
-#define HAVE_DISABLE_HLT
-void disable_hlt(void);
-void enable_hlt(void);
-
-extern int es7000_plat;
-void cpu_idle_wait(void);
-
-extern unsigned long arch_align_stack(unsigned long sp);
-extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
-
-void default_idle(void);
-void __show_registers(struct pt_regs *, int all);
-
-#endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/system_64.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/system_64.h	2010-03-24 15:10:37.000000000 +0100
@@ -1,122 +1,9 @@
 #ifndef __ASM_SYSTEM_H
 #define __ASM_SYSTEM_H

-#include <linux/kernel.h>
 #include <asm/segment.h>
 #include <asm/cmpxchg.h>

-#include <asm/synch_bitops.h>
-#include <asm/hypervisor.h>
-#include <xen/interface/arch-x86_64.h>
-
-#ifdef __KERNEL__
-
-/* entries in ARCH_DLINFO: */
-#ifdef CONFIG_IA32_EMULATION
-# define AT_VECTOR_SIZE_ARCH 2
-#else
-# define AT_VECTOR_SIZE_ARCH 1
-#endif
-
-#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
-#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
-
-/* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t"
-
-#define __EXTRA_CLOBBER  \
-	,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
-
-/* Save restore flags to clear handle leaking NT */
-#define switch_to(prev,next,last) \
-	asm volatile(SAVE_CONTEXT						    \
-		     "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */	  \
-		     "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */	  \
-		     "call __switch_to\n\t"					  \
-		     ".globl thread_return\n"					\
-		     "thread_return:\n\t"					    \
-		     "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"			  \
-		     "movq %P[thread_info](%%rsi),%%r8\n\t"			  \
-		     LOCK_PREFIX "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"	  \
-		     "movq %%rax,%%rdi\n\t" 					  \
-		     "jc   ret_from_fork\n\t"					  \
-		     RESTORE_CONTEXT						    \
-		     : "=a" (last)					  	  \
-		     : [next] "S" (next), [prev] "D" (prev),			  \
-		       [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
-		       [ti_flags] "i" (offsetof(struct thread_info, flags)),\
-		       [tif_fork] "i" (TIF_FORK),			  \
-		       [thread_info] "i" (offsetof(struct task_struct, stack)), \
-		       [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent))   \
-		     : "memory", "cc" __EXTRA_CLOBBER)
-
-extern void load_gs_index(unsigned);
-
-/*
- * Load a segment. Fall back on loading the zero
- * segment if something goes wrong..
- */
-#define loadsegment(seg,value)	\
-	asm volatile("\n"			\
-		"1:\t"				\
-		"movl %k0,%%" #seg "\n"		\
-		"2:\n"				\
-		".section .fixup,\"ax\"\n"	\
-		"3:\t"				\
-		"movl %1,%%" #seg "\n\t" 	\
-		"jmp 2b\n"			\
-		".previous\n"			\
-		".section __ex_table,\"a\"\n\t"	\
-		".align 8\n\t"			\
-		".quad 1b,3b\n"			\
-		".previous"			\
-		: :"r" (value), "r" (0))
-
-/*
- * Clear and set 'TS' bit respectively
- */
-#define clts() (HYPERVISOR_fpu_taskswitch(0))
-
-static inline unsigned long read_cr0(void)
-{
-	unsigned long cr0;
-	asm volatile("movq %%cr0,%0" : "=r" (cr0));
-	return cr0;
-}
-
-static inline void write_cr0(unsigned long val)
-{
-	asm volatile("movq %0,%%cr0" :: "r" (val));
-}
-
-#define read_cr2() current_vcpu_info()->arch.cr2
-
-#define write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val)))
-
-#define read_cr3() ({ \
-	unsigned long __dummy; \
-	asm volatile("movq %%cr3,%0" : "=r" (__dummy)); \
-	machine_to_phys(__dummy); \
-})
-
-static inline void write_cr3(unsigned long val)
-{
-	val = phys_to_machine(val);
-	asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
-}
-
-static inline unsigned long read_cr4(void)
-{
-	unsigned long cr4;
-	asm volatile("movq %%cr4,%0" : "=r" (cr4));
-	return cr4;
-}
-
-static inline void write_cr4(unsigned long val)
-{
-	asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
-}

 static inline unsigned long read_cr8(void)
 {
@@ -128,52 +15,6 @@ static inline void write_cr8(unsigned lo
 	BUG_ON(val);
 }

-#define stts() (HYPERVISOR_fpu_taskswitch(1))
-
-#define wbinvd() \
-	__asm__ __volatile__ ("wbinvd": : :"memory")
-
-#endif	/* __KERNEL__ */
-
-static inline void clflush(volatile void *__p)
-{
-	asm volatile("clflush %0" : "+m" (*(char __force *)__p));
-}
-
-#define nop() __asm__ __volatile__ ("nop")
-
-#ifdef CONFIG_SMP
-#define smp_mb()	mb()
-#define smp_rmb()	barrier()
-#define smp_wmb()	barrier()
-#define smp_read_barrier_depends()	do {} while(0)
-#else
-#define smp_mb()	barrier()
-#define smp_rmb()	barrier()
-#define smp_wmb()	barrier()
-#define smp_read_barrier_depends()	do {} while(0)
-#endif
-
-
-/*
- * Force strict CPU ordering.
- * And yes, this is required on UP too when we're talking
- * to devices.
- */
-#define mb() 	asm volatile("mfence":::"memory")
-#define rmb()	asm volatile("lfence":::"memory")
-#define wmb()	asm volatile("sfence" ::: "memory")
-
-#define read_barrier_depends()	do {} while(0)
-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
-
-#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
-
 #include <linux/irqflags.h>

-void cpu_idle_wait(void);
-
-extern unsigned long arch_align_stack(unsigned long sp);
-extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
-
 #endif
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/tlbflush.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/mach-xen/asm/tlbflush.h	2010-03-24 15:10:37.000000000 +0100
@@ -1,5 +1,106 @@
+#ifndef _ASM_X86_TLBFLUSH_H
+#define _ASM_X86_TLBFLUSH_H
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#define __flush_tlb() xen_tlb_flush()
+#define __flush_tlb_global() xen_tlb_flush()
+#define __flush_tlb_single(addr) xen_invlpg(addr)
+#define __flush_tlb_all() xen_tlb_flush()
+#define __flush_tlb_one(addr) xen_invlpg(addr)
+
 #ifdef CONFIG_X86_32
-# include "tlbflush_32.h"
+# define TLB_FLUSH_ALL	0xffffffff
 #else
-# include "tlbflush_64.h"
+# define TLB_FLUSH_ALL	-1ULL
 #endif
+
+/*
+ * TLB flushing:
+ *
+ *  - flush_tlb() flushes the current mm struct TLBs
+ *  - flush_tlb_all() flushes all processes TLBs
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
+ *
+ * ..but the i386 has somewhat limited tlb flushing capabilities,
+ * and page-granular flushes are available only on i486 and up.
+ *
+ * x86-64 can only flush individual pages or full VMs. For a range flush
+ * we always do the full VM. Might be worth trying if for a small
+ * range a few INVLPGs in a row are a win.
+ */
+
+#ifndef CONFIG_SMP
+
+#define flush_tlb() __flush_tlb()
+#define flush_tlb_all() __flush_tlb_all()
+#define local_flush_tlb() __flush_tlb()
+
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+	if (mm == current->active_mm)
+		__flush_tlb();
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma,
+				  unsigned long addr)
+{
+	if (vma->vm_mm == current->active_mm)
+		__flush_tlb_one(addr);
+}
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+				   unsigned long start, unsigned long end)
+{
+	if (vma->vm_mm == current->active_mm)
+		__flush_tlb();
+}
+
+#else  /* SMP */
+
+#include <asm/smp.h>
+
+#define local_flush_tlb() __flush_tlb()
+
+#define flush_tlb_all xen_tlb_flush_all
+#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
+#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
+#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
+
+#define flush_tlb()	flush_tlb_current_task()
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+				   unsigned long start, unsigned long end)
+{
+	flush_tlb_mm(vma->vm_mm);
+}
+
+#define TLBSTATE_OK	1
+#define TLBSTATE_LAZY	2
+
+#ifdef CONFIG_X86_32
+struct tlb_state
+{
+	struct mm_struct *active_mm;
+	int state;
+	char __cacheline_padding[L1_CACHE_BYTES-8];
+};
+DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
+#endif
+
+#endif	/* SMP */
+
+static inline void flush_tlb_kernel_range(unsigned long start,
+					  unsigned long end)
+{
+	flush_tlb_all();
+}
+
+#endif /* _ASM_X86_TLBFLUSH_H */
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/tlbflush_32.h	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,99 +0,0 @@
-#ifndef _I386_TLBFLUSH_H
-#define _I386_TLBFLUSH_H
-
-#include <linux/mm.h>
-#include <asm/processor.h>
-
-#define __flush_tlb() xen_tlb_flush()
-#define __flush_tlb_global() xen_tlb_flush()
-#define __flush_tlb_all() xen_tlb_flush()
-
-#define cpu_has_invlpg	(boot_cpu_data.x86 > 3)
-
-#define __flush_tlb_single(addr) xen_invlpg(addr)
-
-#define __flush_tlb_one(addr) __flush_tlb_single(addr)
-
-/*
- * TLB flushing:
- *
- *  - flush_tlb() flushes the current mm struct TLBs
- *  - flush_tlb_all() flushes all processes TLBs
- *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
- *  - flush_tlb_page(vma, vmaddr) flushes one page
- *  - flush_tlb_range(vma, start, end) flushes a range of pages
- *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- *
- * ..but the i386 has somewhat limited tlb flushing capabilities,
- * and page-granular flushes are available only on i486 and up.
- */
-
-#define TLB_FLUSH_ALL	0xffffffff
-
-
-#ifndef CONFIG_SMP
-
-#include <linux/sched.h>
-
-#define flush_tlb() __flush_tlb()
-#define flush_tlb_all() __flush_tlb_all()
-#define local_flush_tlb() __flush_tlb()
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
-	if (mm == current->active_mm)
-		__flush_tlb();
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-	unsigned long addr)
-{
-	if (vma->vm_mm == current->active_mm)
-		__flush_tlb_one(addr);
-}
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-	unsigned long start, unsigned long end)
-{
-	if (vma->vm_mm == current->active_mm)
-		__flush_tlb();
-}
-
-#else  /* SMP */
-
-#include <asm/smp.h>
-
-#define local_flush_tlb() \
-	__flush_tlb()
-
-#define flush_tlb_all xen_tlb_flush_all
-#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
-#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
-#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
-
-#define flush_tlb()	flush_tlb_current_task()
-
-static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
-{
-	flush_tlb_mm(vma->vm_mm);
-}
-
-#define TLBSTATE_OK	1
-#define TLBSTATE_LAZY	2
-
-struct tlb_state
-{
-	struct mm_struct *active_mm;
-	int state;
-	char __cacheline_padding[L1_CACHE_BYTES-8];
-};
-DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
-#endif	/* SMP */
-
-static inline void flush_tlb_kernel_range(unsigned long start,
-					unsigned long end)
-{
-	flush_tlb_all();
-}
-
-#endif /* _I386_TLBFLUSH_H */
--- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/tlbflush_64.h	2010-03-24 15:10:29.000000000 +0100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,97 +0,0 @@
-#ifndef _X8664_TLBFLUSH_H
-#define _X8664_TLBFLUSH_H
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <asm/processor.h>
-#include <asm/system.h>
-
-#define __flush_tlb()	xen_tlb_flush()
-
-/*
- * Global pages have to be flushed a bit differently. Not a real
- * performance problem because this does not happen often.
- */
-#define __flush_tlb_global()	xen_tlb_flush()
-
-#define __flush_tlb_all() __flush_tlb_global()
-
-#define __flush_tlb_one(addr)	xen_invlpg((unsigned long)addr)
-
-
-/*
- * TLB flushing:
- *
- *  - flush_tlb() flushes the current mm struct TLBs
- *  - flush_tlb_all() flushes all processes TLBs
- *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
- *  - flush_tlb_page(vma, vmaddr) flushes one page
- *  - flush_tlb_range(vma, start, end) flushes a range of pages
- *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- *
- * x86-64 can only flush individual pages or full VMs. For a range flush
- * we always do the full VM. Might be worth trying if for a small
- * range a few INVLPGs in a row are a win.
- */
-
-#ifndef CONFIG_SMP
-
-#define flush_tlb() __flush_tlb()
-#define flush_tlb_all() __flush_tlb_all()
-#define local_flush_tlb() __flush_tlb()
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
-	if (mm == current->active_mm)
-		__flush_tlb();
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-	unsigned long addr)
-{
-	if (vma->vm_mm == current->active_mm)
-		__flush_tlb_one(addr);
-}
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-	unsigned long start, unsigned long end)
-{
-	if (vma->vm_mm == current->active_mm)
-		__flush_tlb();
-}
-
-#else
-
-#include <asm/smp.h>
-
-#define local_flush_tlb() \
-	__flush_tlb()
-
-#define flush_tlb_all xen_tlb_flush_all
-#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask)
-#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask)
-#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va)
-
-#define flush_tlb()	flush_tlb_current_task()
-
-static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
-{
-	flush_tlb_mm(vma->vm_mm);
-}
-
-#define TLBSTATE_OK	1
-#define TLBSTATE_LAZY	2
-
-/* Roughly an IPI every 20MB with 4k pages for freeing page table
-   ranges. Cost is about 42k of memory for each CPU. */
-#define ARCH_FREE_PTE_NR 5350
-
-#endif
-
-static inline void flush_tlb_kernel_range(unsigned long start,
-					unsigned long end)
-{
-	flush_tlb_all();
-}
-
-#endif /* _X8664_TLBFLUSH_H */
--- head-2010-04-29.orig/arch/x86/include/mach-xen/irq_vectors.h	2008-09-25 13:55:32.000000000 +0200
+++ head-2010-04-29/arch/x86/include/mach-xen/irq_vectors.h	2010-03-24 15:10:37.000000000 +0100
@@ -82,7 +82,8 @@

 #define RESCHEDULE_VECTOR	0
 #define CALL_FUNCTION_VECTOR	1
-#define NR_IPIS			2
+#define SPIN_UNLOCK_VECTOR	2
+#define NR_IPIS			3

 /*
  * The maximum number of vectors supported by i386 processors
--- head-2010-04-29.orig/arch/x86/include/asm/mmu.h	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/arch/x86/include/asm/mmu.h	2010-03-24 15:10:37.000000000 +0100
@@ -18,7 +18,7 @@ typedef struct {
 	void *vdso;
 } mm_context_t;

-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
 void leave_mm(int cpu);
 #else
 static inline void leave_mm(int cpu)
--- head-2010-04-29.orig/arch/x86/include/asm/ptrace.h	2010-04-29 09:29:49.000000000 +0200
+++ head-2010-04-29/arch/x86/include/asm/ptrace.h	2010-03-24 15:12:22.000000000 +0100
@@ -298,7 +298,9 @@ static inline unsigned long regs_get_ker
 }

 #define arch_has_single_step()	(1)
-#ifdef CONFIG_X86_DEBUGCTLMSR
+#if defined(CONFIG_XEN)
+#define arch_has_block_step()	(0)
+#elif defined(CONFIG_X86_DEBUGCTLMSR)
 #define arch_has_block_step()	(1)
 #else
 #define arch_has_block_step()	(boot_cpu_data.x86 >= 6)
--- head-2010-04-29.orig/arch/x86/include/asm/thread_info.h	2010-03-24 15:09:23.000000000 +0100
+++ head-2010-04-29/arch/x86/include/asm/thread_info.h	2010-03-24 15:10:37.000000000 +0100
@@ -96,6 +96,9 @@ struct thread_info {
 #define TIF_DS_AREA_MSR		26      /* uses thread_struct.ds_area_msr */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
+#ifdef CONFIG_X86_XEN
+#define TIF_CSTAR		31      /* cstar-based syscall (special handling) */
+#endif

 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -119,6 +122,7 @@ struct thread_info {
 #define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_CSTAR		(1 << TIF_CSTAR)

 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
@@ -150,12 +154,12 @@ struct thread_info {
 #define _TIF_WORK_CTXSW							\
 	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)

-#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
-#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
 #else
-#define _TIF_WORK_CTXSW_NEXT (_TIF_NOTSC | _TIF_DEBUG)
-#define _TIF_WORK_CTXSW_PREV (_TIF_NOTSC)
+#define _TIF_WORK_CTXSW (_TIF_NOTSC \
+     /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/)
 #endif
+#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
+#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)

 #define PREEMPT_ACTIVE		0x10000000

--- head-2010-04-29.orig/arch/x86/include/asm/time.h	2010-04-29 09:29:49.000000000 +0200
+++ head-2010-04-29/arch/x86/include/asm/time.h	2010-03-24 15:10:37.000000000 +0100
@@ -7,4 +7,10 @@ extern void hpet_time_init(void);

 extern void time_init(void);

+#ifdef CONFIG_XEN
+extern int xen_independent_wallclock(void);
+extern unsigned long xen_read_persistent_clock(void);
+extern int xen_update_persistent_clock(void);
+#endif
+
 #endif /* _ASM_X86_TIME_H */
--- head-2010-04-29.orig/include/linux/page-flags.h	2010-03-24 15:09:23.000000000 +0100
+++ head-2010-04-29/include/linux/page-flags.h	2010-03-24 15:10:37.000000000 +0100
@@ -129,8 +129,8 @@ enum pageflags {
 #ifdef CONFIG_XEN
 	PG_pinned = PG_locked,	/* Cannot alias with PG_owner_priv_1 since
 				 * bad_page() checks should include this bit.
-				 * Also cannot use PG_arch_1 since that now
-				 * has a different purpose on x86. */
+				 * Should not use PG_arch_1 as that may have
+				 * a different purpose elsewhere. */
 #else
 	PG_pinned = PG_owner_priv_1,
 	PG_savepinned = PG_dirty,
--- head-2010-04-29.orig/include/linux/pci.h	2010-03-24 15:09:15.000000000 +0100
+++ head-2010-04-29/include/linux/pci.h	2010-03-24 15:10:37.000000000 +0100
@@ -781,6 +781,9 @@ int pci_reset_function(struct pci_dev *d
 void pci_update_resource(struct pci_dev *dev, int resno);
 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
+#ifdef CONFIG_XEN
+void pci_restore_bars(struct pci_dev *);
+#endif

 /* ROM control related routines */
 int pci_enable_rom(struct pci_dev *pdev);
--- head-2010-04-29.orig/include/xen/evtchn.h	2010-03-24 15:06:12.000000000 +0100
+++ head-2010-04-29/include/xen/evtchn.h	2010-03-24 15:10:37.000000000 +0100
@@ -133,12 +133,37 @@ static inline void clear_evtchn(int port
 	synch_clear_bit(port, s->evtchn_pending);
 }

+static inline void set_evtchn(int port)
+{
+	shared_info_t *s = HYPERVISOR_shared_info;
+	synch_set_bit(port, s->evtchn_pending); /* flag @port as pending */
+}
+
+static inline int test_evtchn(int port)
+{
+	shared_info_t *s = HYPERVISOR_shared_info;
+	return synch_test_bit(port, s->evtchn_pending); /* pending bit */
+}
+
 static inline void notify_remote_via_evtchn(int port)
 {
 	struct evtchn_send send = { .port = port };
 	VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
 }

+/* Clear an irq's pending state, in preparation for polling on it. */
+void xen_clear_irq_pending(int irq);
+
+/* Set an irq's pending state, to avoid blocking on it. */
+void xen_set_irq_pending(int irq);
+
+/* Test an irq's pending state. */
+int xen_test_irq_pending(int irq);
+
+/* Poll waiting for an irq to become pending.  In the usual case, the
+   irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq);
+
 /*
  * Use these to access the event channel underlying the IRQ handle returned
  * by bind_*_to_irqhandler().
--- head-2010-04-29.orig/kernel/sysctl_binary.c	2010-04-15 09:55:30.000000000 +0200
+++ head-2010-04-29/kernel/sysctl_binary.c	2010-04-15 09:55:52.000000000 +0200
@@ -875,7 +875,7 @@ static const struct bin_table bin_bus_ta


 #ifdef CONFIG_XEN
-static struct trans_ctl_table trans_xen_table[] = {
+static const struct trans_ctl_table trans_xen_table[] = {
 	{ CTL_XEN_INDEPENDENT_WALLCLOCK,	"independent_wallclock" },
 	{ CTL_XEN_PERMITTED_CLOCK_JITTER,	"permitted_clock_jitter" },
 	{}
--- head-2010-04-29.orig/lib/swiotlb-xen.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-04-29/lib/swiotlb-xen.c	2010-03-24 15:10:37.000000000 +0100
@@ -30,7 +30,6 @@
 #include <asm/gnttab_dma.h>

 int swiotlb;
-EXPORT_SYMBOL(swiotlb);

 #define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))

@@ -289,6 +288,15 @@ __sync_single(struct phys_addr buffer, c
 	}
 }

+static inline unsigned int is_span_boundary(unsigned int index,
+					    unsigned int nslots,
+					    unsigned long offset_slots,
+					    unsigned long max_slots)
+{	/* NOTE(review): mask acts as modulo only if max_slots is 2^n */
+	unsigned long offset = (offset_slots + index) & (max_slots - 1);
+	return offset + nslots > max_slots; /* run overruns max_slots? */
+}
+
 /*
  * Allocates bounce buffer and returns its kernel virtual address.
  */
@@ -300,6 +308,15 @@ map_single(struct device *hwdev, struct
 	unsigned int nslots, stride, index, wrap;
 	struct phys_addr slot_buf;
 	int i;
+	unsigned long mask;
+	unsigned long offset_slots;
+	unsigned long max_slots;
+
+	mask = dma_get_seg_boundary(hwdev);
+	offset_slots = -IO_TLB_SEGSIZE;
+	max_slots = mask + 1
+		    ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
+		    : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);

 	/*
 	 * For mappings greater than a page, we limit the stride (and
@@ -319,12 +336,21 @@ map_single(struct device *hwdev, struct
 	 */
 	spin_lock_irqsave(&io_tlb_lock, flags);
 	{
-		wrap = index = ALIGN(io_tlb_index, stride);
-
+		index = ALIGN(io_tlb_index, stride);
 		if (index >= iotlb_nslabs)
-			wrap = index = 0;
+			index = 0;
+		wrap = index;

 		do {
+			while (is_span_boundary(index, nslots, offset_slots,
+						max_slots)) {
+				index += stride;
+				if (index >= iotlb_nslabs)
+					index = 0;
+				if (index == wrap)
+					goto not_found;
+			}
+
 			/*
 			 * If we find a slot that indicates we have 'nslots'
 			 * number of contiguous buffers, we allocate the
@@ -359,6 +385,7 @@ map_single(struct device *hwdev, struct
 				index = 0;
 		} while (index != wrap);

+  not_found:
 		spin_unlock_irqrestore(&io_tlb_lock, flags);
 		return NULL;
 	}
--- head-2010-04-29.orig/mm/memory.c	2010-04-15 09:51:22.000000000 +0200
+++ head-2010-04-29/mm/memory.c	2010-04-15 09:55:57.000000000 +0200
@@ -2117,6 +2117,10 @@ int apply_to_page_range(struct mm_struct
 	unsigned long start = addr, end = addr + size;
 	int err;

+#ifdef CONFIG_XEN
+	if (!mm)
+		mm = &init_mm;
+#endif
 	BUG_ON(addr >= end);
 	mmu_notifier_invalidate_range_start(mm, start, end);
 	pgd = pgd_offset(mm, addr);