qubes-linux-kernel/patches.xen/xen3-patch-2.6.27

From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: [PATCH] Linux: Update to 2.6.27
Patch-mainline: 2.6.27
This patch contains the differences between Linux 2.6.26 and 2.6.27.
Acked-by: Jeff Mahoney <jeffm@suse.com>
Automatically created from "patches.kernel.org/patch-2.6.27" by xen-port-patches.py
Removed the NO_HZ -> NO_HZ || NO_IDLE_HZ adjustments from kernel/{hr,}timer.c,
as they would get removed again by xen-clockevents (and really shouldn't
have been needed - see SLE11 SPn).
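
For reference, the dropped NO_HZ adjustment usually amounts to widening a
CONFIG_NO_HZ preprocessor guard. The fragment below is an illustrative sketch
only, assuming the common guard pattern; it is not part of the patch that
follows.

    /* Sketch of the kind of change xen-clockevents makes redundant in
     * kernel/timer.c and kernel/hrtimer.c; identifiers other than the
     * CONFIG_* symbols are placeholders.
     */
    #if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)  /* was: #ifdef CONFIG_NO_HZ */
    /* ... tick-skipping helpers (e.g. get_next_timer_interrupt()) ... */
    #endif
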
--- head-2011-03-11.orig/arch/x86/Kconfig 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/Kconfig 2011-02-01 14:38:38.000000000 +0100
@@ -723,7 +723,7 @@ config AMD_IOMMU
bool "AMD IOMMU support"
select SWIOTLB
select PCI_MSI
- depends on X86_64 && PCI && ACPI
+ depends on X86_64 && PCI && ACPI && !XEN
---help---
With this option you can enable support for AMD IOMMU hardware in
your system. An IOMMU is a hardware component which provides
@@ -1465,7 +1465,7 @@ config MTRR
config MTRR_SANITIZER
def_bool y
prompt "MTRR cleanup support"
- depends on MTRR
+ depends on MTRR && !XEN
---help---
Convert MTRR layout from continuous to discrete, so X drivers can
add writeback entries.
--- head-2011-03-11.orig/arch/x86/Kconfig.debug 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/Kconfig.debug 2011-02-01 14:38:38.000000000 +0100
@@ -25,6 +25,7 @@ config STRICT_DEVMEM
config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
+ depends on !XEN
---help---
Enables the informational output from the decompression stage
(e.g. bzImage) of the boot. If you disable this you will still
@@ -179,6 +180,7 @@ config IOMMU_LEAK
config HAVE_MMIOTRACE_SUPPORT
def_bool y
+ depends on !XEN
config X86_DECODER_SELFTEST
bool "x86 instruction decoder selftest"
--- head-2011-03-11.orig/arch/x86/Makefile 2011-02-01 14:11:04.000000000 +0100
+++ head-2011-03-11/arch/x86/Makefile 2011-02-01 14:38:38.000000000 +0100
@@ -117,8 +117,8 @@ endif
KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
# Xen subarch support
-mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-x86/mach-xen
-mcore-$(CONFIG_X86_XEN) := arch/x86/mach-xen/
+mflags-$(CONFIG_XEN) := -Iinclude/asm-x86/mach-xen
+mcore-$(CONFIG_XEN) := arch/x86/mach-xen/
KBUILD_CFLAGS += $(mflags-y)
KBUILD_AFLAGS += $(mflags-y)
--- head-2011-03-11.orig/arch/x86/ia32/ia32entry-xen.S 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/ia32/ia32entry-xen.S 2011-02-01 14:38:38.000000000 +0100
@@ -15,6 +15,16 @@
#include <asm/irqflags.h>
#include <linux/linkage.h>
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE 0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+#define sysexit_audit int_ret_from_sys_call
+#define sysretl_audit int_ret_from_sys_call
+#endif
+
#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
.macro IA32_ARG_FIXUP noebp=0
@@ -37,6 +47,11 @@
movq %rax,R8(%rsp)
.endm
+ /*
+ * Reload arg registers from stack in case ptrace changed them.
+ * We don't reload %eax because syscall_trace_enter() returned
+ * the value it wants us to use in the table lookup.
+ */
.macro LOAD_ARGS32 offset
movl \offset(%rsp),%r11d
movl \offset+8(%rsp),%r10d
@@ -46,7 +61,6 @@
movl \offset+48(%rsp),%edx
movl \offset+56(%rsp),%esi
movl \offset+64(%rsp),%edi
- movl \offset+72(%rsp),%eax
.endm
.macro CFI_STARTPROC32 simple
@@ -61,6 +75,19 @@
CFI_UNDEFINED r15
.endm
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret32)
+ swapgs
+ sysretl
+ENDPROC(native_usergs_sysret32)
+
+ENTRY(native_irq_enable_sysexit)
+ swapgs
+ sti
+ sysexit
+ENDPROC(native_irq_enable_sysexit)
+#endif
+
/*
* 32bit SYSENTER instruction entry.
*
@@ -98,7 +125,7 @@ ENTRY(ia32_sysenter_target)
CFI_RESTORE rcx
movl %ebp,%ebp /* zero extension */
movl %eax,%eax
- movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
+ movl 48-THREAD_SIZE+TI_sysenter_return(%rsp),%r10d
movl $__USER32_DS,40(%rsp)
movq %rbp,32(%rsp)
movl $__USER32_CS,16(%rsp)
@@ -113,19 +140,75 @@ ENTRY(ia32_sysenter_target)
.quad 1b,ia32_badarg
.previous
GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,threadinfo_status(%r10)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ orl $TS_COMPAT,TI_status(%r10)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
jnz sysenter_tracesys
-sysenter_do_call:
cmpl $(IA32_NR_syscalls-1),%eax
ja ia32_badsys
+sysenter_do_call:
IA32_ARG_FIXUP 1
+sysenter_dispatch:
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
+ GET_THREAD_INFO(%r10)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
+ jnz sysexit_audit
jmp int_ret_from_sys_call
+#ifdef CONFIG_AUDITSYSCALL
+ .macro auditsys_entry_common
+ movl %esi,%r9d /* 6th arg: 4th syscall arg */
+ movl %edx,%r8d /* 5th arg: 3rd syscall arg */
+ /* (already in %ecx) 4th arg: 2nd syscall arg */
+ movl %ebx,%edx /* 3rd arg: 1st syscall arg */
+ movl %eax,%esi /* 2nd arg: syscall number */
+ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
+ call audit_syscall_entry
+ movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja ia32_badsys
+ movl %ebx,%edi /* reload 1st syscall arg */
+ movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
+ movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
+ movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
+ movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
+ .endm
+
+ .macro auditsys_exit exit,ebpsave=RBP
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+ jnz int_ret_from_sys_call
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ movl %eax,%esi /* second arg, syscall return value */
+ cmpl $0,%eax /* is it < 0? */
+ setl %al /* 1 if so, 0 if not */
+ movzbl %al,%edi /* zero-extend that into %edi */
+ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
+ call audit_syscall_exit
+ movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
+ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp int_with_check
+ .endm
+
+sysenter_auditsys:
+ auditsys_entry_common
+ movl %ebp,%r9d /* reload 6th syscall arg */
+ jmp sysenter_dispatch
+
+sysexit_audit:
+ auditsys_exit sysexit_from_sys_call
+#endif
+
sysenter_tracesys:
xchgl %r9d,%ebp
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+ jz sysenter_auditsys
+#endif
SAVE_REST
CLEAR_RREGS
movq %r9,R9(%rsp)
@@ -186,18 +269,38 @@ ENTRY(ia32_cstar_target)
.quad 1b,ia32_badarg
.previous
GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,threadinfo_status(%r10)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ orl $TS_COMPAT,TI_status(%r10)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
jnz cstar_tracesys
cstar_do_call:
cmpl $IA32_NR_syscalls-1,%eax
ja ia32_badsys
IA32_ARG_FIXUP 1
+cstar_dispatch:
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
+ GET_THREAD_INFO(%r10)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
+ jnz sysretl_audit
jmp int_ret_from_sys_call
-cstar_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+cstar_auditsys:
+ movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
+ auditsys_entry_common
+ movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
+ jmp cstar_dispatch
+
+sysretl_audit:
+ auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
+#endif
+
+cstar_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+ jz cstar_auditsys
+#endif
xchgl %r9d,%ebp
SAVE_REST
CLEAR_RREGS
@@ -263,8 +366,8 @@ ENTRY(ia32_syscall)
this could be a problem. */
SAVE_ARGS 0,0,1
GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,threadinfo_status(%r10)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ orl $TS_COMPAT,TI_status(%r10)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
jnz ia32_tracesys
ia32_do_syscall:
cmpl $(IA32_NR_syscalls-1),%eax
@@ -309,13 +412,11 @@ quiet_ni_syscall:
PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
- PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
PTREGSCALL stub32_execve, sys32_execve, %rcx
PTREGSCALL stub32_fork, sys_fork, %rdi
PTREGSCALL stub32_clone, sys32_clone, %rdx
PTREGSCALL stub32_vfork, sys_vfork, %rdi
PTREGSCALL stub32_iopl, sys_iopl, %rsi
- PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
ENTRY(ia32_ptregs_common)
popq %r11
@@ -415,7 +516,7 @@ ia32_sys_call_table:
.quad sys_ssetmask
.quad sys_setreuid16 /* 70 */
.quad sys_setregid16
- .quad stub32_sigsuspend
+ .quad sys32_sigsuspend
.quad compat_sys_sigpending
.quad sys_sethostname
.quad compat_sys_setrlimit /* 75 */
@@ -522,7 +623,7 @@ ia32_sys_call_table:
.quad sys32_rt_sigpending
.quad compat_sys_rt_sigtimedwait
.quad sys32_rt_sigqueueinfo
- .quad stub32_rt_sigsuspend
+ .quad sys_rt_sigsuspend
.quad sys32_pread /* 180 */
.quad sys32_pwrite
.quad sys_chown16
@@ -670,4 +771,10 @@ ia32_sys_call_table:
.quad sys32_fallocate
.quad compat_sys_timerfd_settime /* 325 */
.quad compat_sys_timerfd_gettime
+ .quad compat_sys_signalfd4
+ .quad sys_eventfd2
+ .quad sys_epoll_create1
+ .quad sys_dup3 /* 330 */
+ .quad sys_pipe2
+ .quad sys_inotify_init1
ia32_syscall_end:
--- head-2011-03-11.orig/arch/x86/kernel/Makefile 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/Makefile 2011-02-01 14:38:38.000000000 +0100
@@ -125,9 +125,11 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
- obj-$(CONFIG_XEN) += nmi_64.o
+ obj-$(CONFIG_XEN) += nmi.o
time_64-$(CONFIG_XEN) += time_32.o
endif
-disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
- pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
+disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \
+ i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o smpboot.o \
+ tlb_$(BITS).o tsc.o tsc_sync.o vsmp_64.o
+disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms_32.o
--- head-2011-03-11.orig/arch/x86/kernel/acpi/boot.c 2011-03-11 10:59:02.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/acpi/boot.c 2011-03-11 10:59:30.000000000 +0100
@@ -1349,6 +1349,7 @@ static int __init dmi_disable_acpi(const
return 0;
}
+#ifndef CONFIG_XEN
/*
* Force ignoring BIOS IRQ0 pin2 override
*/
@@ -1366,6 +1367,7 @@ static int __init dmi_ignore_irq0_timer_
}
return 0;
}
+#endif
static int __init force_acpi_rsdt(const struct dmi_system_id *d)
{
@@ -1486,6 +1488,7 @@ static struct dmi_system_id __initdata a
{}
};
+#ifndef CONFIG_XEN
/* second table for DMI checks that should run after early-quirks */
static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
/*
@@ -1532,6 +1535,7 @@ static struct dmi_system_id __initdata a
},
{}
};
+#endif
/*
* acpi_boot_table_init() and acpi_boot_init()
@@ -1604,8 +1608,10 @@ int __init early_acpi_boot_init(void)
int __init acpi_boot_init(void)
{
+#ifndef CONFIG_XEN
/* those are executed after early-quirks are executed */
dmi_check_system(acpi_dmi_table_late);
+#endif
/*
* If acpi_disabled, bail out
--- head-2011-03-11.orig/arch/x86/kernel/acpi/sleep-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/acpi/sleep-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -9,6 +9,7 @@
#include <linux/bootmem.h>
#include <linux/dmi.h>
#include <linux/cpumask.h>
+#include <asm/segment.h>
#include "realmode/wakeup.h"
#include "sleep.h"
@@ -20,7 +21,7 @@ unsigned long acpi_realmode_flags;
/* address in low memory of the wakeup routine. */
static unsigned long acpi_realmode;
-#ifdef CONFIG_64BIT
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
static char temp_stack[10240];
#endif
#endif
@@ -54,18 +55,27 @@ int acpi_save_state_mem(void)
header->video_mode = saved_video_mode;
header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
+
+ /*
+ * Set up the wakeup GDT. We set these up as Big Real Mode,
+ * that is, with limits set to 4 GB. At least the Lenovo
+ * Thinkpad X61 is known to need this for the video BIOS
+ * initialization quirk to work; this is likely to also
+ * be the case for other laptops or integrated video devices.
+ */
+
/* GDT[0]: GDT self-pointer */
header->wakeup_gdt[0] =
(u64)(sizeof(header->wakeup_gdt) - 1) +
((u64)(acpi_wakeup_address +
((char *)&header->wakeup_gdt - (char *)acpi_realmode))
<< 16);
- /* GDT[1]: real-mode-like code segment */
- header->wakeup_gdt[1] = (0x009bULL << 40) +
- ((u64)acpi_wakeup_address << 16) + 0xffff;
- /* GDT[2]: real-mode-like data segment */
- header->wakeup_gdt[2] = (0x0093ULL << 40) +
- ((u64)acpi_wakeup_address << 16) + 0xffff;
+ /* GDT[1]: big real mode-like code segment */
+ header->wakeup_gdt[1] =
+ GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
+ /* GDT[2]: big real mode-like data segment */
+ header->wakeup_gdt[2] =
+ GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
#ifndef CONFIG_64BIT
store_gdt((struct desc_ptr *)&header->pmode_gdt);
@@ -79,7 +89,7 @@ int acpi_save_state_mem(void)
#endif /* !CONFIG_64BIT */
header->pmode_cr0 = read_cr0();
- header->pmode_cr4 = read_cr4();
+ header->pmode_cr4 = read_cr4_safe();
header->realmode_flags = acpi_realmode_flags;
header->real_magic = 0x12345678;
@@ -89,7 +99,9 @@ int acpi_save_state_mem(void)
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
header->trampoline_segment = setup_trampoline() >> 4;
- init_rsp = (unsigned long)temp_stack + 4096;
+#ifdef CONFIG_SMP
+ stack_start.sp = temp_stack + 4096;
+#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
#endif /* CONFIG_64BIT */
@@ -145,6 +157,12 @@ static int __init acpi_sleep_setup(char
acpi_realmode_flags |= 2;
if (strncmp(str, "s3_beep", 7) == 0)
acpi_realmode_flags |= 4;
+#ifdef CONFIG_HIBERNATION
+ if (strncmp(str, "s4_nohwsig", 10) == 0)
+ acpi_no_s4_hw_signature();
+#endif
+ if (strncmp(str, "old_ordering", 12) == 0)
+ acpi_old_suspend_ordering();
str = strchr(str, ',');
if (str != NULL)
str += strspn(str, ", \t");
--- head-2011-03-11.orig/arch/x86/kernel/amd_nb.c 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/amd_nb.c 2011-02-01 14:38:38.000000000 +0100
@@ -15,6 +15,10 @@ static u32 *flush_words;
struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+#ifdef CONFIG_XEN
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, /* Fam12, Fam14 */
+#endif
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
{}
};
--- head-2011-03-11.orig/arch/x86/kernel/apic/apic-xen.c 2011-02-24 15:45:13.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/apic/apic-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -59,7 +59,10 @@ static cpumask_t timer_bcast_ipi;
/*
* Debug level, exported for io_apic.c
*/
-int apic_verbosity;
+unsigned int apic_verbosity;
+
+/* Have we found an MP table */
+int smp_found_config;
#ifndef CONFIG_XEN
static int modern_apic(void)
--- head-2011-03-11.orig/arch/x86/kernel/asm-offsets_64.c 2010-01-19 16:00:48.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/asm-offsets_64.c 2011-02-01 14:38:38.000000000 +0100
@@ -132,7 +132,7 @@ int main(void)
BLANK();
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
BLANK();
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
--- head-2011-03-11.orig/arch/x86/kernel/cpu/amd.c 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/cpu/amd.c 2011-02-01 14:38:38.000000000 +0100
@@ -575,6 +575,7 @@ static void __cpuinit init_amd(struct cp
fam10h_check_enable_mmcfg();
}
+#ifndef CONFIG_XEN
if (c == &boot_cpu_data && c->x86 >= 0xf) {
unsigned long long tseg;
@@ -594,6 +595,7 @@ static void __cpuinit init_amd(struct cp
}
}
#endif
+#endif
}
#ifdef CONFIG_X86_32
--- head-2011-03-11.orig/arch/x86/kernel/cpu/bugs_64.c 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/cpu/bugs_64.c 2011-02-01 14:38:38.000000000 +0100
@@ -20,6 +20,7 @@ void __init check_bugs(void)
#endif
alternative_instructions();
+#ifndef CONFIG_XEN
/*
* Make sure the first 2MB area is not mapped by huge pages
* There are typically fixed size MTRRs in there and overlapping
@@ -30,4 +31,5 @@ void __init check_bugs(void)
*/
if (!direct_gbpages)
set_memory_4k((unsigned long)__va(0), 1);
+#endif
}
--- head-2011-03-11.orig/arch/x86/kernel/cpu/common-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/cpu/common-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -13,6 +13,7 @@
#include <asm/mtrr.h>
#include <asm/mce.h>
#include <asm/pat.h>
+#include <asm/asm.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/mpspec.h>
#include <asm/apic.h>
@@ -341,11 +342,24 @@ static void __init early_cpu_detect(void
get_cpu_vendor(c, 1);
+ early_get_cap(c);
+
if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
cpu_devs[c->x86_vendor]->c_early_init)
cpu_devs[c->x86_vendor]->c_early_init(c);
+}
- early_get_cap(c);
+/*
+ * The NOPL instruction is supposed to exist on all CPUs with
+ * family >= 6; unfortunately, that's not true in practice because
+ * of early VIA chips and (more importantly) broken virtualizers that
+ * are not easy to detect. In the latter case it doesn't even *fail*
+ * reliably, so probing for it doesn't even work. Disable it completely
+ * unless we can find a reliable way to detect all the broken cases.
+ */
+static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
+{
+ clear_cpu_cap(c, X86_FEATURE_NOPL);
}
static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -404,8 +418,8 @@ static void __cpuinit generic_identify(s
}
init_scattered_cpuid_features(c);
+ detect_nopl(c);
}
-
}
static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
@@ -436,7 +450,7 @@ __setup("serialnumber", x86_serial_nr_se
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
{
int i;
@@ -452,6 +466,8 @@ void __cpuinit identify_cpu(struct cpuin
#endif
c->x86_clflush_size = 32;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ if (boot_cpu_has(X86_FEATURE_SYSCALL32))
+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
if (!have_cpuid_p()) {
/*
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-11/arch/x86/kernel/cpu/common_64-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -0,0 +1,777 @@
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/kgdb.h>
+#include <linux/topology.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <asm/i387.h>
+#include <asm/msr.h>
+#include <asm/io.h>
+#include <asm/linkage.h>
+#include <asm/mmu_context.h>
+#include <asm/mtrr.h>
+#include <asm/mce.h>
+#include <asm/pat.h>
+#include <asm/asm.h>
+#include <asm/numa.h>
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <mach_apic.h>
+#elif defined(CONFIG_XEN)
+#include <mach_apic.h>
+#endif
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/genapic.h>
+
+#include "cpu.h"
+
+/* We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ */
+/* The TLS descriptors are currently at a different place compared to i386.
+ Hopefully nobody expects them at a fixed place (Wine?) */
+DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+} };
+EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+void switch_to_new_gdt(void)
+{
+#ifndef CONFIG_XEN
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+#else
+ void *va, *gdt_addr = get_cpu_gdt_table(smp_processor_id());
+ unsigned long frames[16];
+ unsigned int f = 0;
+
+ for (va = gdt_addr; va < gdt_addr + GDT_SIZE; va += PAGE_SIZE) {
+ frames[f++] = virt_to_mfn(va);
+ make_page_readonly(va, XENFEAT_writable_descriptor_tables);
+ }
+ if (HYPERVISOR_set_gdt(frames, GDT_SIZE / sizeof(struct desc_struct)))
+ BUG();
+#endif
+}
+
+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+ display_cacheinfo(c);
+}
+
+static struct cpu_dev __cpuinitdata default_cpu = {
+ .c_init = default_init,
+ .c_vendor = "Unknown",
+};
+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
+int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+{
+ unsigned int *v;
+
+ if (c->extended_cpuid_level < 0x80000004)
+ return 0;
+
+ v = (unsigned int *) c->x86_model_id;
+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+ c->x86_model_id[48] = 0;
+ return 1;
+}
+
+
+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+{
+ unsigned int n, dummy, ebx, ecx, edx;
+
+ n = c->extended_cpuid_level;
+
+ if (n >= 0x80000005) {
+ cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
+ "D cache %dK (%d bytes/line)\n",
+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+ c->x86_cache_size = (ecx>>24) + (edx>>24);
+ /* On K8 L1 TLB is inclusive, so don't count it */
+ c->x86_tlbsize = 0;
+ }
+
+ if (n >= 0x80000006) {
+ cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
+ ecx = cpuid_ecx(0x80000006);
+ c->x86_cache_size = ecx >> 16;
+ c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+
+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+ c->x86_cache_size, ecx & 0xFF);
+ }
+}
+
+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+{
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+ u32 eax, ebx, ecx, edx;
+ int index_msb, core_bits;
+
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+
+
+ if (!cpu_has(c, X86_FEATURE_HT))
+ return;
+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+ goto out;
+
+ smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+ if (smp_num_siblings == 1) {
+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
+ } else if (smp_num_siblings > 1) {
+
+ if (smp_num_siblings > NR_CPUS) {
+ printk(KERN_WARNING "CPU: Unsupported number of "
+ "siblings %d", smp_num_siblings);
+ smp_num_siblings = 1;
+ return;
+ }
+
+ index_msb = get_count_order(smp_num_siblings);
+ c->phys_proc_id = phys_pkg_id(index_msb);
+
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+ index_msb = get_count_order(smp_num_siblings);
+
+ core_bits = get_count_order(c->x86_max_cores);
+
+ c->cpu_core_id = phys_pkg_id(index_msb) &
+ ((1 << core_bits) - 1);
+ }
+out:
+ if ((c->x86_max_cores * smp_num_siblings) > 1) {
+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
+ c->phys_proc_id);
+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
+ c->cpu_core_id);
+ }
+
+#endif
+}
+
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+{
+ char *v = c->x86_vendor_id;
+ int i;
+ static int printed;
+
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ if (cpu_devs[i]) {
+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+ (cpu_devs[i]->c_ident[1] &&
+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+ c->x86_vendor = i;
+ this_cpu = cpu_devs[i];
+ return;
+ }
+ }
+ }
+ if (!printed) {
+ printed++;
+ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
+ printk(KERN_ERR "CPU: Your system may be unstable.\n");
+ }
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+}
+
+static void __init early_cpu_support_print(void)
+{
+ int i,j;
+ struct cpu_dev *cpu_devx;
+
+ printk("KERNEL supported cpus:\n");
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ cpu_devx = cpu_devs[i];
+ if (!cpu_devx)
+ continue;
+ for (j = 0; j < 2; j++) {
+ if (!cpu_devx->c_ident[j])
+ continue;
+ printk(" %s %s\n", cpu_devx->c_vendor,
+ cpu_devx->c_ident[j]);
+ }
+ }
+}
+
+/*
+ * The NOPL instruction is supposed to exist on all CPUs with
+ * family >= 6, unfortunately, that's not true in practice because
+ * of early VIA chips and (more importantly) broken virtualizers that
+ * are not easy to detect. Hence, probe for it based on first
+ * principles.
+ *
+ * Note: no 64-bit chip is known to lack these, but put the code here
+ * for consistency with 32 bits, and to make it utterly trivial to
+ * diagnose the problem should it ever surface.
+ */
+static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
+{
+ const u32 nopl_signature = 0x888c53b1; /* Random number */
+ u32 has_nopl = nopl_signature;
+
+ clear_cpu_cap(c, X86_FEATURE_NOPL);
+ if (c->x86 >= 6) {
+ asm volatile("\n"
+ "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
+ "2:\n"
+ " .section .fixup,\"ax\"\n"
+ "3: xor %0,%0\n"
+ " jmp 2b\n"
+ " .previous\n"
+ _ASM_EXTABLE(1b,3b)
+ : "+a" (has_nopl));
+
+ if (has_nopl == nopl_signature)
+ set_cpu_cap(c, X86_FEATURE_NOPL);
+ }
+}
+
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
+
+void __init early_cpu_init(void)
+{
+ struct cpu_vendor_dev *cvdev;
+
+ for (cvdev = __x86cpuvendor_start ;
+ cvdev < __x86cpuvendor_end ;
+ cvdev++)
+ cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+ early_cpu_support_print();
+ early_identify_cpu(&boot_cpu_data);
+}
+
+/* Do some early cpuid on the boot CPU to get some parameter that are
+ needed before check_bugs. Everything advanced is in identify_cpu
+ below. */
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
+{
+ u32 tfms, xlvl;
+
+ c->loops_per_jiffy = loops_per_jiffy;
+ c->x86_cache_size = -1;
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+ c->x86_model = c->x86_mask = 0; /* So far unknown... */
+ c->x86_vendor_id[0] = '\0'; /* Unset */
+ c->x86_model_id[0] = '\0'; /* Unset */
+ c->x86_clflush_size = 64;
+ c->x86_cache_alignment = c->x86_clflush_size;
+#ifndef CONFIG_XEN
+ c->x86_max_cores = 1;
+ c->x86_coreid_bits = 0;
+#endif
+ c->extended_cpuid_level = 0;
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+ /* Get vendor name */
+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+ (unsigned int *)&c->x86_vendor_id[0],
+ (unsigned int *)&c->x86_vendor_id[8],
+ (unsigned int *)&c->x86_vendor_id[4]);
+
+ get_cpu_vendor(c);
+
+ /* Initialize the standard set of capabilities */
+ /* Note that the vendor-specific code below might override */
+
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ __u32 misc;
+ cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
+ &c->x86_capability[0]);
+ c->x86 = (tfms >> 8) & 0xf;
+ c->x86_model = (tfms >> 4) & 0xf;
+ c->x86_mask = tfms & 0xf;
+ if (c->x86 == 0xf)
+ c->x86 += (tfms >> 20) & 0xff;
+ if (c->x86 >= 0x6)
+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+ } else {
+ /* Have CPUID level 0 only - unheard of */
+ c->x86 = 4;
+ }
+
+#ifndef CONFIG_XEN
+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
+#ifdef CONFIG_SMP
+ c->phys_proc_id = c->initial_apicid;
+#endif
+#endif
+ /* AMD-defined flags: level 0x80000001 */
+ xlvl = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = xlvl;
+ if ((xlvl & 0xffff0000) == 0x80000000) {
+ if (xlvl >= 0x80000001) {
+ c->x86_capability[1] = cpuid_edx(0x80000001);
+ c->x86_capability[6] = cpuid_ecx(0x80000001);
+ }
+ if (xlvl >= 0x80000004)
+ get_model_name(c); /* Default name */
+ }
+
+ /* Transmeta-defined flags: level 0x80860001 */
+ xlvl = cpuid_eax(0x80860000);
+ if ((xlvl & 0xffff0000) == 0x80860000) {
+ /* Don't set x86_cpuid_level here for now to not confuse. */
+ if (xlvl >= 0x80860001)
+ c->x86_capability[2] = cpuid_edx(0x80860001);
+ }
+
+ if (c->extended_cpuid_level >= 0x80000007)
+ c->x86_power = cpuid_edx(0x80000007);
+
+ if (c->extended_cpuid_level >= 0x80000008) {
+ u32 eax = cpuid_eax(0x80000008);
+
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ }
+
+ detect_nopl(c);
+
+ if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
+ cpu_devs[c->x86_vendor]->c_early_init)
+ cpu_devs[c->x86_vendor]->c_early_init(c);
+
+ validate_pat_support(c);
+}
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ early_identify_cpu(c);
+
+ init_scattered_cpuid_features(c);
+
+#ifndef CONFIG_XEN
+ c->apicid = phys_pkg_id(0);
+#endif
+
+ /*
+ * Vendor-specific initialization. In this section we
+ * canonicalize the feature flags, meaning if there are
+ * features a certain CPU supports which CPUID doesn't
+ * tell us, CPUID claiming incorrect flags, or other bugs,
+ * we handle them here.
+ *
+ * At the end of this section, c->x86_capability better
+ * indicate the features this CPU genuinely supports!
+ */
+ if (this_cpu->c_init)
+ this_cpu->c_init(c);
+
+ detect_ht(c);
+
+ /*
+ * On SMP, boot_cpu_data holds the common feature set between
+ * all CPUs; so make sure that we indicate which features are
+ * common between the CPUs. The first time this routine gets
+ * executed, c == &boot_cpu_data.
+ */
+ if (c != &boot_cpu_data) {
+ /* AND the already accumulated flags with these */
+ for (i = 0; i < NCAPINTS; i++)
+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+ }
+
+ /* Clear all flags overriden by options */
+ for (i = 0; i < NCAPINTS; i++)
+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
+
+#ifdef CONFIG_X86_MCE
+ mcheck_init(c);
+#endif
+ select_idle_routine(c);
+
+#ifdef CONFIG_NUMA
+ numa_add_cpu(smp_processor_id());
+#endif
+
+}
+
+void __cpuinit identify_boot_cpu(void)
+{
+ identify_cpu(&boot_cpu_data);
+}
+
+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+ BUG_ON(c == &boot_cpu_data);
+ identify_cpu(c);
+ mtrr_ap_init();
+}
+
+static __init int setup_noclflush(char *arg)
+{
+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+ return 1;
+}
+__setup("noclflush", setup_noclflush);
+
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+{
+ if (c->x86_model_id[0])
+ printk(KERN_CONT "%s", c->x86_model_id);
+
+ if (c->x86_mask || c->cpuid_level >= 0)
+ printk(KERN_CONT " stepping %02x\n", c->x86_mask);
+ else
+ printk(KERN_CONT "\n");
+}
+
+static __init int setup_disablecpuid(char *arg)
+{
+ int bit;
+ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+ setup_clear_cpu_cap(bit);
+ else
+ return 0;
+ return 1;
+}
+__setup("clearcpuid=", setup_disablecpuid);
+
+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
+
+struct x8664_pda **_cpu_pda __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
+
+#ifndef CONFIG_X86_NO_IDT
+struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
+#endif
+
+char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+
+unsigned long __supported_pte_mask __read_mostly = ~0UL;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+static int do_not_nx __cpuinitdata;
+
+/* noexec=on|off
+Control non executable mappings for 64bit processes.
+
+on Enable(default)
+off Disable
+*/
+static int __init nonx_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!strncmp(str, "on", 2)) {
+ __supported_pte_mask |= _PAGE_NX;
+ do_not_nx = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ do_not_nx = 1;
+ __supported_pte_mask &= ~_PAGE_NX;
+ }
+ return 0;
+}
+early_param("noexec", nonx_setup);
+
+int force_personality32;
+
+/* noexec32=on|off
+Control non executable heap for 32bit processes.
+To control the stack too use noexec=off
+
+on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
+off PROT_READ implies PROT_EXEC
+*/
+static int __init nonx32_setup(char *str)
+{
+ if (!strcmp(str, "on"))
+ force_personality32 &= ~READ_IMPLIES_EXEC;
+ else if (!strcmp(str, "off"))
+ force_personality32 |= READ_IMPLIES_EXEC;
+ return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
+static void __init_refok switch_pt(int cpu)
+{
+#ifdef CONFIG_XEN
+ if (cpu == 0)
+ xen_init_pt();
+ xen_pt_switch(__pa_symbol(init_level4_pgt));
+ xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
+#endif
+}
+
+void pda_init(int cpu)
+{
+ struct x8664_pda *pda = cpu_pda(cpu);
+
+ /* Setup up data that may be needed in __get_free_pages early */
+ loadsegment(fs, 0);
+ loadsegment(gs, 0);
+#ifndef CONFIG_XEN
+ /* Memory clobbers used to order PDA accessed */
+ mb();
+ wrmsrl(MSR_GS_BASE, pda);
+ mb();
+#else
+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
+ (unsigned long)pda))
+ BUG();
+#endif
+
+ pda->cpunumber = cpu;
+ pda->irqcount = -1;
+ pda->kernelstack = (unsigned long)stack_thread_info() -
+ PDA_STACKOFFSET + THREAD_SIZE;
+ pda->active_mm = &init_mm;
+ pda->mmu_state = 0;
+
+ if (cpu == 0) {
+ /* others are initialized in smpboot.c */
+ pda->pcurrent = &init_task;
+ pda->irqstackptr = boot_cpu_stack;
+ pda->irqstackptr += IRQSTACKSIZE - 64;
+ } else {
+ if (!pda->irqstackptr) {
+ pda->irqstackptr = (char *)
+ __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
+ if (!pda->irqstackptr)
+ panic("cannot allocate irqstack for cpu %d",
+ cpu);
+ pda->irqstackptr += IRQSTACKSIZE - 64;
+ }
+
+ if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
+ pda->nodenumber = cpu_to_node(cpu);
+ }
+
+ switch_pt(cpu);
+}
+
+#ifndef CONFIG_X86_NO_TSS
+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
+ DEBUG_STKSZ] __page_aligned_bss;
+#endif
+
+extern asmlinkage void ignore_sysret(void);
+
+void __cpuinit syscall_init(void)
+{
+#ifndef CONFIG_XEN
+ /*
+ * LSTAR and STAR live in a bit strange symbiosis.
+ * They both write to the same internal register. STAR allows to
+ * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
+ */
+ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
+ wrmsrl(MSR_LSTAR, system_call);
+ wrmsrl(MSR_CSTAR, ignore_sysret);
+
+ /* Flags to clear on syscall */
+ wrmsrl(MSR_SYSCALL_MASK,
+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+#endif
+#ifdef CONFIG_IA32_EMULATION
+ syscall32_cpu_init();
+#else
+ static const struct callback_register __cpuinitconst cstar = {
+ .type = CALLBACKTYPE_syscall32,
+ .address = (unsigned long)ignore_sysret
+ };
+
+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
+ printk(KERN_WARNING "Unable to register CSTAR callback\n");
+#endif
+}
+
+void __cpuinit check_efer(void)
+{
+ unsigned long efer;
+
+ rdmsrl(MSR_EFER, efer);
+ if (!(efer & EFER_NX) || do_not_nx)
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+
+unsigned long kernel_eflags;
+
+#ifndef CONFIG_X86_NO_TSS
+/*
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+#endif
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init.
+ */
+void __cpuinit cpu_init(void)
+{
+ int cpu = stack_smp_processor_id();
+#ifndef CONFIG_X86_NO_TSS
+ struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
+ unsigned long v;
+ char *estacks = NULL;
+ int i;
+#endif
+ struct task_struct *me;
+
+ /* CPU 0 is initialised in head64.c */
+ if (cpu != 0)
+ pda_init(cpu);
+#ifndef CONFIG_X86_NO_TSS
+ else
+ estacks = boot_exception_stacks;
+#endif
+
+ me = current;
+
+ if (cpu_test_and_set(cpu, cpu_initialized))
+ panic("CPU#%d already initialized!\n", cpu);
+
+ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+ /*
+ * Initialize the per-CPU GDT with the boot GDT,
+ * and set up the GDT descriptor:
+ */
+
+ switch_to_new_gdt();
+#ifndef CONFIG_X86_NO_IDT
+ load_idt((const struct desc_ptr *)&idt_descr);
+#endif
+
+ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
+
+ wrmsrl(MSR_FS_BASE, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ barrier();
+
+ check_efer();
+
+#ifndef CONFIG_X86_NO_TSS
+ /*
+ * set up and load the per-CPU TSS
+ */
+ if (!orig_ist->ist[0]) {
+ static const unsigned int order[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+ };
+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+ if (cpu) {
+ estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
+ if (!estacks)
+ panic("Cannot allocate exception "
+ "stack %ld %d\n", v, cpu);
+ }
+ estacks += PAGE_SIZE << order[v];
+ orig_ist->ist[v] = t->x86_tss.ist[v] =
+ (unsigned long)estacks;
+ }
+ }
+
+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ /*
+ * <= is required because the CPU will access up to
+ * 8 bits beyond the end of the IO permission bitmap.
+ */
+ for (i = 0; i <= IO_BITMAP_LONGS; i++)
+ t->io_bitmap[i] = ~0UL;
+#endif
+
+ atomic_inc(&init_mm.mm_count);
+ me->active_mm = &init_mm;
+ if (me->mm)
+ BUG();
+ enter_lazy_tlb(&init_mm, me);
+
+ load_sp0(t, &current->thread);
+#ifndef CONFIG_X86_NO_TSS
+ set_tss_desc(cpu, t);
+ load_TR_desc();
+#endif
+ load_LDT(&init_mm.context);
+
+#ifdef CONFIG_KGDB
+ /*
+ * If the kgdb is connected no debug regs should be altered. This
+ * is only applicable when KGDB and a KGDB I/O module are built
+ * into the kernel and you are using early debugging with
+ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
+ */
+ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ else {
+#endif
+ /*
+ * Clear all 6 debug registers:
+ */
+
+ set_debugreg(0UL, 0);
+ set_debugreg(0UL, 1);
+ set_debugreg(0UL, 2);
+ set_debugreg(0UL, 3);
+ set_debugreg(0UL, 6);
+ set_debugreg(0UL, 7);
+#ifdef CONFIG_KGDB
+ /* If the kgdb is connected no debug regs should be altered. */
+ }
+#endif
+
+ fpu_init();
+
+ asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
+ if (raw_irqs_disabled())
+ kernel_eflags &= ~X86_EFLAGS_IF;
+
+ if (is_uv_system())
+ uv_cpu_init();
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-11/arch/x86/kernel/e820-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -0,0 +1,1553 @@
+/*
+ * Handle the memory map.
+ * The functions here do the job until bootmem takes over.
+ *
+ * Getting sanitize_e820_map() in sync with i386 version by applying change:
+ * - Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach <xela@slit.de>, December 2002.
+ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/suspend.h>
+#include <linux/firmware-map.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <xen/interface/memory.h>
+
+/*
+ * The e820 map is the map that gets modified e.g. with command line parameters
+ * and that is also registered with modifications in the kernel resource tree
+ * with the iomem_resource as parent.
+ *
+ * The e820_saved is directly saved after the BIOS-provided memory map is
+ * copied. It doesn't get modified afterwards. It's registered for the
+ * /sys/firmware/memmap interface.
+ *
+ * That memory map is not modified and is used as base for kexec. The kexec'd
+ * kernel should get the same memory map as the firmware provides. Then the
+ * user can e.g. boot the original kernel with mem=1G while still booting the
+ * next kernel with full memory.
+ */
+struct e820map e820;
+#ifndef CONFIG_XEN
+struct e820map e820_saved;
+#else
+static struct e820map machine_e820;
+#define e820_saved machine_e820
+#endif
+
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0xaeedbabe;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
+
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+int
+e820_any_mapped(u64 start, u64 end, unsigned type)
+{
+ int i;
+
+#ifndef CONFIG_XEN
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+#else
+ if (!is_initial_xendomain())
+ return 0;
+ for (i = 0; i < machine_e820.nr_map; ++i) {
+ const struct e820entry *ei = &machine_e820.map[i];
+#endif
+
+ if (type && ei->type != type)
+ continue;
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_mapped);
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ *
+ * Note: this function only works correct if the e820 table is sorted and
+ * not-overlapping, which is the case
+ */
+int __init e820_all_mapped(u64 start, u64 end, unsigned type)
+{
+ int i;
+
+#ifndef CONFIG_XEN
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+#else
+ if (!is_initial_xendomain())
+ return 0;
+ for (i = 0; i < machine_e820.nr_map; ++i) {
+ const struct e820entry *ei = &machine_e820.map[i];
+#endif
+
+ if (type && ei->type != type)
+ continue;
+ /* is the region (part) in overlap with the current region ?*/
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+
+ /* if the region is at the beginning of <start,end> we move
+ * start to the end of the region since it's ok until there
+ */
+ if (ei->addr <= start)
+ start = ei->addr + ei->size;
+ /*
+ * if start is now at or beyond end, we're done, full
+ * coverage
+ */
+ if (start >= end)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Add a memory region to the kernel e820 map.
+ */
+void __init e820_add_region(u64 start, u64 size, int type)
+{
+ int x = e820.nr_map;
+
+ if (x == ARRAY_SIZE(e820.map)) {
+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ return;
+ }
+
+ e820.map[x].addr = start;
+ e820.map[x].size = size;
+ e820.map[x].type = type;
+ e820.nr_map++;
+}
+
+static void __init _e820_print_map(const struct e820map *e820, const char *who)
+{
+ int i;
+
+ for (i = 0; i < e820->nr_map; i++) {
+ printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+ (unsigned long long) e820->map[i].addr,
+ (unsigned long long)
+ (e820->map[i].addr + e820->map[i].size));
+ switch (e820->map[i].type) {
+ case E820_RAM:
+ case E820_RESERVED_KERN:
+ printk(KERN_CONT "(usable)\n");
+ break;
+ case E820_RESERVED:
+ printk(KERN_CONT "(reserved)\n");
+ break;
+ case E820_ACPI:
+ printk(KERN_CONT "(ACPI data)\n");
+ break;
+ case E820_NVS:
+ printk(KERN_CONT "(ACPI NVS)\n");
+ break;
+ default:
+ printk(KERN_CONT "type %u\n", e820->map[i].type);
+ break;
+ }
+ }
+}
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries. The following
+ * replaces the original e820 map with a new one, removing overlaps,
+ * and resolving conflicting memory types in favor of highest
+ * numbered type.
+ *
+ * The input parameter biosmap points to an array of 'struct
+ * e820entry' which on entry has elements in the range [0, *pnr_map)
+ * valid, and which has space for up to max_nr_map entries.
+ * On return, the resulting sanitized e820 map entries will be in
+ * overwritten in the same location, starting at biosmap.
+ *
+ * The integer pointed to by pnr_map must be valid on entry (the
+ * current number of valid entries located at biosmap) and will
+ * be updated on return, with the new number of valid entries
+ * (something no more than max_nr_map.)
+ *
+ * The return value from sanitize_e820_map() is zero if it
+ * successfully 'sanitized' the map entries passed in, and is -1
+ * if it did nothing, which can happen if either of (1) it was
+ * only passed one map entry, or (2) any of the input map entries
+ * were invalid (start + size < start, meaning that the size was
+ * so big the described memory range wrapped around through zero.)
+ *
+ * Visually we're performing the following
+ * (1,2,3,4 = memory types)...
+ *
+ * Sample memory map (w/overlaps):
+ * ____22__________________
+ * ______________________4_
+ * ____1111________________
+ * _44_____________________
+ * 11111111________________
+ * ____________________33__
+ * ___________44___________
+ * __________33333_________
+ * ______________22________
+ * ___________________2222_
+ * _________111111111______
+ * _____________________11_
+ * _________________4______
+ *
+ * Sanitized equivalent (no overlap):
+ * 1_______________________
+ * _44_____________________
+ * ___1____________________
+ * ____22__________________
+ * ______11________________
+ * _________1______________
+ * __________3_____________
+ * ___________44___________
+ * _____________33_________
+ * _______________2________
+ * ________________1_______
+ * _________________4______
+ * ___________________2____
+ * ____________________33__
+ * ______________________4_
+ */
+
+int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
+ int *pnr_map)
+{
+ struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+ unsigned long long addr; /* address for this change point */
+ };
+ static struct change_member change_point_list[2*E820_X_MAX] __initdata;
+ static struct change_member *change_point[2*E820_X_MAX] __initdata;
+ static struct e820entry *overlap_list[E820_X_MAX] __initdata;
+ static struct e820entry new_bios[E820_X_MAX] __initdata;
+ struct change_member *change_tmp;
+ unsigned long current_type, last_type;
+ unsigned long long last_addr;
+ int chgidx, still_changing;
+ int overlap_entries;
+ int new_bios_entry;
+ int old_nr, new_nr, chg_nr;
+ int i;
+
+ /* if there's only one memory region, don't bother */
+#ifdef CONFIG_XEN
+ if (*pnr_map == 1)
+ return 0;
+#endif
+ if (*pnr_map < 2)
+ return -1;
+
+ old_nr = *pnr_map;
+ BUG_ON(old_nr > max_nr_map);
+
+ /* bail out if we find any unreasonable addresses in bios map */
+ for (i = 0; i < old_nr; i++)
+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+ return -1;
+
+ /* create pointers for initial change-point information (for sorting) */
+ for (i = 0; i < 2 * old_nr; i++)
+ change_point[i] = &change_point_list[i];
+
+ /* record all known change-points (starting and ending addresses),
+ omitting those that are for empty memory regions */
+ chgidx = 0;
+ for (i = 0; i < old_nr; i++) {
+ if (biosmap[i].size != 0) {
+ change_point[chgidx]->addr = biosmap[i].addr;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ change_point[chgidx]->addr = biosmap[i].addr +
+ biosmap[i].size;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ }
+ }
+ chg_nr = chgidx;
+
+ /* sort change-point list by memory addresses (low -> high) */
+ still_changing = 1;
+ while (still_changing) {
+ still_changing = 0;
+ for (i = 1; i < chg_nr; i++) {
+ unsigned long long curaddr, lastaddr;
+ unsigned long long curpbaddr, lastpbaddr;
+
+ curaddr = change_point[i]->addr;
+ lastaddr = change_point[i - 1]->addr;
+ curpbaddr = change_point[i]->pbios->addr;
+ lastpbaddr = change_point[i - 1]->pbios->addr;
+
+ /*
+ * swap entries, when:
+ *
+ * curaddr > lastaddr or
+ * curaddr == lastaddr and curaddr == curpbaddr and
+ * lastaddr != lastpbaddr
+ */
+ if (curaddr < lastaddr ||
+ (curaddr == lastaddr && curaddr == curpbaddr &&
+ lastaddr != lastpbaddr)) {
+ change_tmp = change_point[i];
+ change_point[i] = change_point[i-1];
+ change_point[i-1] = change_tmp;
+ still_changing = 1;
+ }
+ }
+ }
+
+ /* create a new bios memory map, removing overlaps */
+ overlap_entries = 0; /* number of entries in the overlap table */
+ new_bios_entry = 0; /* index for creating new bios map entries */
+ last_type = 0; /* start with undefined memory type */
+ last_addr = 0; /* start with 0 as last starting address */
+
+ /* loop through change-points, determining affect on the new bios map */
+ for (chgidx = 0; chgidx < chg_nr; chgidx++) {
+ /* keep track of all overlapping bios entries */
+ if (change_point[chgidx]->addr ==
+ change_point[chgidx]->pbios->addr) {
+ /*
+ * add map entry to overlap list (> 1 entry
+ * implies an overlap)
+ */
+ overlap_list[overlap_entries++] =
+ change_point[chgidx]->pbios;
+ } else {
+ /*
+ * remove entry from list (order independent,
+ * so swap with last)
+ */
+ for (i = 0; i < overlap_entries; i++) {
+ if (overlap_list[i] ==
+ change_point[chgidx]->pbios)
+ overlap_list[i] =
+ overlap_list[overlap_entries-1];
+ }
+ overlap_entries--;
+ }
+ /*
+ * if there are overlapping entries, decide which
+ * "type" to use (larger value takes precedence --
+ * 1=usable, 2,3,4,4+=unusable)
+ */
+ current_type = 0;
+ for (i = 0; i < overlap_entries; i++)
+ if (overlap_list[i]->type > current_type)
+ current_type = overlap_list[i]->type;
+ /*
+ * continue building up new bios map based on this
+ * information
+ */
+ if (current_type != last_type) {
+ if (last_type != 0) {
+ new_bios[new_bios_entry].size =
+ change_point[chgidx]->addr - last_addr;
+ /*
+ * move forward only if the new size
+ * was non-zero
+ */
+ if (new_bios[new_bios_entry].size != 0)
+ /*
+ * no more space left for new
+ * bios entries ?
+ */
+ if (++new_bios_entry >= max_nr_map)
+ break;
+ }
+ if (current_type != 0) {
+ new_bios[new_bios_entry].addr =
+ change_point[chgidx]->addr;
+ new_bios[new_bios_entry].type = current_type;
+ last_addr = change_point[chgidx]->addr;
+ }
+ last_type = current_type;
+ }
+ }
+ /* retain count for new bios entries */
+ new_nr = new_bios_entry;
+
+ /* copy new bios mapping into original location */
+ memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
+ *pnr_map = new_nr;
+
+ return 0;
+}
+
+static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
+{
+ while (nr_map) {
+ u64 start = biosmap->addr;
+ u64 size = biosmap->size;
+ u64 end = start + size;
+ u32 type = biosmap->type;
+
+ /* Overflow in 64 bits? Ignore the memory map. */
+ if (start > end)
+ return -1;
+
+ e820_add_region(start, size, type);
+
+ biosmap++;
+ nr_map--;
+ }
+ return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory. If we aren't, we'll fake a memory map.
+ */
+static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
+{
+#ifndef CONFIG_XEN
+ /* Only one memory region (or negative)? Ignore it */
+ if (nr_map < 2)
+ return -1;
+#else
+ BUG_ON(nr_map < 1);
+#endif
+
+ return __append_e820_map(biosmap, nr_map);
+}
+
+static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
+ u64 size, unsigned old_type,
+ unsigned new_type)
+{
+ unsigned int i, x;
+ u64 real_updated_size = 0;
+
+ BUG_ON(old_type == new_type);
+
+ if (size > (ULLONG_MAX - start))
+ size = ULLONG_MAX - start;
+
+ for (i = 0; i < e820x->nr_map; i++) {
+ struct e820entry *ei = &e820x->map[i];
+ u64 final_start, final_end;
+ if (ei->type != old_type)
+ continue;
+ /* totally covered? */
+ if (ei->addr >= start &&
+ (ei->addr + ei->size) <= (start + size)) {
+ ei->type = new_type;
+ real_updated_size += ei->size;
+ continue;
+ }
+ /* partially covered */
+ final_start = max(start, ei->addr);
+ final_end = min(start + size, ei->addr + ei->size);
+ if (final_start >= final_end)
+ continue;
+
+ x = e820x->nr_map;
+ if (x == ARRAY_SIZE(e820x->map)) {
+ printk(KERN_ERR "Too many memory map entries!\n");
+ break;
+ }
+ e820x->map[x].addr = final_start;
+ e820x->map[x].size = final_end - final_start;
+ e820x->map[x].type = new_type;
+ e820x->nr_map++;
+
+ real_updated_size += final_end - final_start;
+
+ if (ei->addr < final_start)
+ continue;
+ ei->addr = final_end;
+ ei->size -= final_end - final_start;
+ }
+ return real_updated_size;
+}
+
+u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
+ unsigned new_type)
+{
+ return e820_update_range_map(&e820, start, size, old_type, new_type);
+}
+
+static u64 __init e820_update_range_saved(u64 start, u64 size,
+ unsigned old_type, unsigned new_type)
+{
+#ifdef CONFIG_XEN
+ if (is_initial_xendomain())
+ return e820_update_range_map(&machine_e820,
+ phys_to_machine(start), size,
+ old_type, new_type);
+#endif
+ return e820_update_range_map(&e820_saved, start, size, old_type,
+ new_type);
+}
+
+/* make e820 not cover the range */
+u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
+ int checktype)
+{
+ int i;
+ u64 real_removed_size = 0;
+
+ if (size > (ULLONG_MAX - start))
+ size = ULLONG_MAX - start;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 final_start, final_end;
+
+ if (checktype && ei->type != old_type)
+ continue;
+ /* totally covered? */
+ if (ei->addr >= start &&
+ (ei->addr + ei->size) <= (start + size)) {
+ real_removed_size += ei->size;
+ memset(ei, 0, sizeof(struct e820entry));
+ continue;
+ }
+ /* partially covered */
+ final_start = max(start, ei->addr);
+ final_end = min(start + size, ei->addr + ei->size);
+ if (final_start >= final_end)
+ continue;
+ real_removed_size += final_end - final_start;
+
+ ei->size -= final_end - final_start;
+ if (ei->addr < final_start)
+ continue;
+ ei->addr = final_end;
+ }
+ return real_removed_size;
+}
+
+void __init update_e820(void)
+{
+ int nr_map;
+
+ nr_map = e820.nr_map;
+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
+ return;
+ e820.nr_map = nr_map;
+ printk(KERN_INFO "modified physical RAM map:\n");
+ _e820_print_map(&e820, "modified");
+}
+static void __init update_e820_saved(void)
+{
+ int nr_map;
+
+ nr_map = e820_saved.nr_map;
+ if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
+ return;
+ e820_saved.nr_map = nr_map;
+}
+
+#ifdef CONFIG_XEN
+#define e820 machine_e820
+#endif
+
+#define MAX_GAP_END 0x100000000ull
+/*
+ * Search for a gap in the e820 memory space from start_addr to end_addr.
+ */
+__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
+ unsigned long start_addr, unsigned long long end_addr)
+{
+ unsigned long long last;
+ int i = e820.nr_map;
+ int found = 0;
+
+ last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
+#ifdef CONFIG_X86_64
+ if (start_addr >= MAX_GAP_END)
+ last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits);
+#endif
+
+ while (--i >= 0) {
+ unsigned long long start = e820.map[i].addr;
+ unsigned long long end = start + e820.map[i].size;
+
+ if (end < start_addr)
+ continue;
+
+ /*
+ * Since "last" is at most 4GB, we know we'll
+ * fit in 32 bits if this condition is true
+ */
+ if (last > end) {
+ unsigned long gap = last - end;
+
+ if (gap >= *gapsize) {
+ *gapsize = gap;
+ *gapstart = end;
+ found = 1;
+ }
+ }
+ if (start < last)
+ last = start;
+ }
+ return found;
+}
+
+/*
+ * Search for the biggest gap in the low 32 bits of the e820
+ * memory space. We pass this space to PCI to assign MMIO resources
+ * for hotplug or unconfigured devices in.
+ * Hopefully the BIOS let enough space left.
+ */
+__init void e820_setup_gap(void)
+{
+ unsigned long gapstart, gapsize, round;
+ int found;
+
+ gapstart = 0x10000000;
+ gapsize = 0x400000;
+ found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
+
+#ifdef CONFIG_X86_64
+ if (!found) {
+ printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
+ "address range\n"
+ KERN_ERR "PCI: Unassigned devices with 32bit resource "
+ "registers may break!\n");
+ found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0);
+ WARN_ON(!found);
+ }
+#endif
+
+ /*
+ * See how much we want to round up: start off with
+ * rounding to the next 1MB area.
+ */
+ round = 0x100000;
+ while ((gapsize >> 4) > round)
+ round += round;
+ /* Fun with two's complement */
+ pci_mem_start = (gapstart + round) & -round;
+
+ printk(KERN_INFO
+ "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
+ pci_mem_start, gapstart, gapsize);
+}
+
+#undef e820
+
+#ifndef CONFIG_XEN
+/**
+ * Because of the size limitation of struct boot_params, only the first
+ * 128 E820 memory entries are passed to the kernel via
+ * boot_params.e820_map; the rest are passed via the SETUP_E820_EXT node
+ * of the linked list of struct setup_data, which is parsed here.
+ */
+void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
+{
+ u32 map_len;
+ int entries;
+ struct e820entry *extmap;
+
+ entries = sdata->len / sizeof(struct e820entry);
+ map_len = sdata->len + sizeof(struct setup_data);
+ if (map_len > PAGE_SIZE)
+ sdata = early_ioremap(pa_data, map_len);
+ extmap = (struct e820entry *)(sdata->data);
+ __append_e820_map(extmap, entries);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ if (map_len > PAGE_SIZE)
+ early_iounmap(sdata, map_len);
+ printk(KERN_INFO "extended physical RAM map:\n");
+ _e820_print_map(&e820, "extended");
+}
+
+#if defined(CONFIG_X86_64) || \
+ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
+/**
+ * Find the ranges of physical addresses that do not correspond to
+ * e820 RAM areas and mark the corresponding pages as nosave for
+ * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(unsigned long limit_pfn)
+{
+ int i;
+ unsigned long pfn;
+
+ pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
+ for (i = 1; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (pfn < PFN_UP(ei->addr))
+ register_nosave_region(pfn, PFN_UP(ei->addr));
+
+ pfn = PFN_DOWN(ei->addr + ei->size);
+ if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
+ register_nosave_region(PFN_UP(ei->addr), pfn);
+
+ if (pfn >= limit_pfn)
+ break;
+ }
+}
+#endif
+#endif
+
+/*
+ * Early reserved memory areas.
+ */
+#define MAX_EARLY_RES 20
+
+struct early_res {
+ u64 start, end;
+ char name[16];
+ char overlap_ok;
+};
+static struct early_res early_res[MAX_EARLY_RES] __initdata = {
+#ifndef CONFIG_XEN
+ { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
+#endif
+#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
+ /*
+ * But first pinch a few for the stack/trampoline stuff
+ * FIXME: Don't need the extra page at 4K, but need to fix
+ * trampoline before removing it. (see the GDT stuff)
+ */
+ { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
+ /*
+ * Has to be in very low memory so we can execute
+ * real-mode AP code.
+ */
+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
+#endif
+#endif
+ {}
+};
+
+static int __init find_overlapped_early(u64 start, u64 end)
+{
+ int i;
+ struct early_res *r;
+
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ r = &early_res[i];
+ if (end > r->start && start < r->end)
+ break;
+ }
+
+ return i;
+}
+
+/*
+ * Drop the i-th range from the early reservation map,
+ * by copying any higher ranges down one over it, and
+ * clearing what had been the last slot.
+ */
+static void __init drop_range(int i)
+{
+ int j;
+
+ for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
+ ;
+
+ memmove(&early_res[i], &early_res[i + 1],
+ (j - 1 - i) * sizeof(struct early_res));
+
+ early_res[j - 1].end = 0;
+}
+
+/*
+ * Split any existing ranges that:
+ * 1) are marked 'overlap_ok', and
+ * 2) overlap with the stated range [start, end)
+ * into whatever portion (if any) of the existing range is entirely
+ * below or entirely above the stated range. Drop the portion
+ * of the existing range that overlaps with the stated range,
+ * which will allow the caller of this routine to then add that
+ * stated range without conflicting with any existing range.
+ */
+static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
+{
+ int i;
+ struct early_res *r;
+ u64 lower_start, lower_end;
+ u64 upper_start, upper_end;
+ char name[16];
+
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ r = &early_res[i];
+
+ /* Continue past non-overlapping ranges */
+ if (end <= r->start || start >= r->end)
+ continue;
+
+ /*
+ * Leave non-ok overlaps as is; let caller
+ * panic "Overlapping early reservations"
+ * when it hits this overlap.
+ */
+ if (!r->overlap_ok)
+ return;
+
+ /*
+ * We have an ok overlap. We will drop it from the early
+ * reservation map, and add back in any non-overlapping
+ * portions (lower or upper) as separate, overlap_ok,
+ * non-overlapping ranges.
+ */
+
+ /* 1. Note any non-overlapping (lower or upper) ranges. */
+ strncpy(name, r->name, sizeof(name) - 1);
+
+ lower_start = lower_end = 0;
+ upper_start = upper_end = 0;
+ if (r->start < start) {
+ lower_start = r->start;
+ lower_end = start;
+ }
+ if (r->end > end) {
+ upper_start = end;
+ upper_end = r->end;
+ }
+
+ /* 2. Drop the original ok overlapping range */
+ drop_range(i);
+
+ i--; /* resume for-loop on copied down entry */
+
+ /* 3. Add back in any non-overlapping ranges. */
+ if (lower_end)
+ reserve_early_overlap_ok(lower_start, lower_end, name);
+ if (upper_end)
+ reserve_early_overlap_ok(upper_start, upper_end, name);
+ }
+}
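
The lower/upper split noted in step 1 above can be seen in isolation in this small sketch (the variable names and values are invented for illustration):

#include <stdio.h>

int main(void)
{
	/* existing overlap_ok reservation and the new range colliding with it */
	unsigned long long r_start = 0x1000, r_end = 0x9000;
	unsigned long long start = 0x3000, end = 0x5000;
	unsigned long long lower_start = 0, lower_end = 0;
	unsigned long long upper_start = 0, upper_end = 0;

	if (r_start < start) {		/* piece below the new range survives */
		lower_start = r_start;
		lower_end = start;
	}
	if (r_end > end) {		/* piece above the new range survives */
		upper_start = end;
		upper_end = r_end;
	}
	printf("lower: [%#llx-%#llx)  upper: [%#llx-%#llx)\n",
	       lower_start, lower_end, upper_start, upper_end);
	return 0;
}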
+
+static void __init __reserve_early(u64 start, u64 end, char *name,
+ int overlap_ok)
+{
+ int i;
+ struct early_res *r;
+
+ i = find_overlapped_early(start, end);
+ if (i >= MAX_EARLY_RES)
+ panic("Too many early reservations");
+ r = &early_res[i];
+ if (r->end)
+ panic("Overlapping early reservations "
+ "%llx-%llx %s to %llx-%llx %s\n",
+ start, end - 1, name?name:"", r->start,
+ r->end - 1, r->name);
+ r->start = start;
+ r->end = end;
+ r->overlap_ok = overlap_ok;
+ if (name)
+ strncpy(r->name, name, sizeof(r->name) - 1);
+}
+
+/*
+ * A few early reservations come here.
+ *
+ * The 'overlap_ok' in the name of this routine does -not- mean it
+ * is ok for these reservations to overlap an earlier reservation.
+ * Rather it means that it is ok for subsequent reservations to
+ * overlap this one.
+ *
+ * Use this entry point to reserve early ranges when you are doing
+ * so out of "Paranoia", reserving perhaps more memory than you need,
+ * just in case, and don't mind a subsequent overlapping reservation
+ * that is known to be needed.
+ *
+ * The drop_overlaps_that_are_ok() call here isn't really needed.
+ * It would be needed if we had two colliding 'overlap_ok'
+ * reservations, so that the second such would not panic on the
+ * overlap with the first. No such reservations exist as of this
+ * writing, but we may as well tolerate them if they appear in
+ * the future.
+ */
+void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
+{
+ drop_overlaps_that_are_ok(start, end);
+ __reserve_early(start, end, name, 1);
+}
+
+/*
+ * Most early reservations come here.
+ *
+ * We first have drop_overlaps_that_are_ok() drop any pre-existing
+ * 'overlap_ok' ranges, so that we can then reserve this memory
+ * range without risk of panicking on an overlapping overlap_ok
+ * early reservation.
+ */
+void __init reserve_early(u64 start, u64 end, char *name)
+{
+ drop_overlaps_that_are_ok(start, end);
+ __reserve_early(start, end, name, 0);
+}
+
+void __init free_early(u64 start, u64 end)
+{
+ struct early_res *r;
+ int i;
+
+ i = find_overlapped_early(start, end);
+ r = &early_res[i];
+ if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
+ panic("free_early on not reserved area: %llx-%llx!",
+ start, end - 1);
+
+ drop_range(i);
+}
+
+void __init early_res_to_bootmem(u64 start, u64 end)
+{
+ int i, count;
+ u64 final_start, final_end;
+
+ count = 0;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
+ count++;
+
+ printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
+ count, start, end);
+ for (i = 0; i < count; i++) {
+ struct early_res *r = &early_res[i];
+ printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
+ r->start, r->end, r->name);
+ final_start = max(start, r->start);
+ final_end = min(end, r->end);
+ if (final_start >= final_end) {
+ printk(KERN_CONT "\n");
+ continue;
+ }
+ printk(KERN_CONT " ==> [%010llx - %010llx]\n",
+ final_start, final_end);
+ reserve_bootmem_generic(final_start, final_end - final_start,
+ BOOTMEM_DEFAULT);
+ }
+}
+
+/* Check for already reserved areas */
+static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
+{
+ int i;
+ u64 addr = *addrp;
+ int changed = 0;
+ struct early_res *r;
+again:
+ i = find_overlapped_early(addr, addr + size);
+ r = &early_res[i];
+ if (i < MAX_EARLY_RES && r->end) {
+ *addrp = addr = round_up(r->end, align);
+ changed = 1;
+ goto again;
+ }
+ return changed;
+}
+
+/* Check for already reserved areas */
+static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
+{
+ int i;
+ u64 addr = *addrp, last;
+ u64 size = *sizep;
+ int changed = 0;
+again:
+ last = addr + size;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ struct early_res *r = &early_res[i];
+ if (last > r->start && addr < r->start) {
+ size = r->start - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last > r->end && addr < r->end) {
+ addr = round_up(r->end, align);
+ size = last - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last <= r->end && addr >= r->start) {
+ (*sizep)++;
+ return 0;
+ }
+ }
+ if (changed) {
+ *addrp = addr;
+ *sizep = size;
+ }
+ return changed;
+}
+
+/*
+ * Find a free area with specified alignment in a specific range.
+ */
+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr, last;
+ u64 ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ while (bad_addr(&addr, size, align) && addr+size <= ei_last)
+ ;
+ last = addr + size;
+ if (last > ei_last)
+ continue;
+ if (last > end)
+ continue;
+ return addr;
+ }
+ return -1ULL;
+}
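
find_e820_area() leans on round_up() for alignment; a tiny standalone sketch of the power-of-two rounding it assumes (round_up_p2/round_down_p2 are helper names invented here, not the kernel macros):

#include <stdio.h>

static unsigned long long round_up_p2(unsigned long long x,
				      unsigned long long align)
{
	return (x + align - 1) & ~(align - 1);
}

static unsigned long long round_down_p2(unsigned long long x,
					unsigned long long align)
{
	return x & ~(align - 1);
}

int main(void)
{
	/* 0x1234 rounded to a 0x1000 (4 KiB) boundary */
	printf("up: %#llx  down: %#llx\n",
	       round_up_p2(0x1234, 0x1000), round_down_p2(0x1234, 0x1000));
	return 0;
}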
+
+/*
+ * Find next free range after *start
+ */
+u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr, last;
+ u64 ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ *sizep = ei_last - addr;
+ while (bad_addr_size(&addr, sizep, align) &&
+ addr + *sizep <= ei_last)
+ ;
+ last = addr + *sizep;
+ if (last > ei_last)
+ continue;
+ return addr;
+ }
+
+ return -1ULL;
+}
+
+/*
+ * Pre-allocate a memory area and reserve it in the e820 map.
+ */
+u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
+{
+ u64 size = 0;
+ u64 addr;
+ u64 start;
+#ifdef CONFIG_XEN
+ unsigned int order = get_order(sizet);
+
+ if (is_initial_xendomain()) {
+ sizet = PAGE_SIZE << order;
+ if (align < PAGE_SIZE)
+ align = PAGE_SIZE;
+ }
+#endif
+ for (start = startt; ; start += size) {
+ start = find_e820_area_size(start, &size, align);
+ if (!(start + 1))
+ return 0;
+ if (size >= sizet)
+ break;
+ }
+
+#ifdef CONFIG_X86_32
+ if (start >= MAXMEM)
+ return 0;
+ if (start + size > MAXMEM)
+ size = MAXMEM - start;
+#endif
+#ifdef CONFIG_XEN
+ if ((start >> PAGE_SHIFT) >= xen_start_info->nr_pages)
+ return 0;
+ if (PFN_UP(start + size) > xen_start_info->nr_pages)
+ size = ((u64)xen_start_info->nr_pages << PAGE_SHIFT) - start;
+#endif
+
+ addr = round_down(start + size - sizet, align);
+ if (addr < start)
+ return 0;
+#ifdef CONFIG_XEN
+ if (is_initial_xendomain()) {
+ int rc;
+ unsigned long max_initmap_pfn;
+
+ max_initmap_pfn = ALIGN(PFN_UP(__pa(xen_start_info->pt_base))
+ + xen_start_info->nr_pt_frames
+ + 1 + (1 << (19 - PAGE_SHIFT)),
+ 1UL << (22 - PAGE_SHIFT));
+#ifdef CONFIG_X86_32
+ if ((addr >> PAGE_SHIFT)
+ < max(max_initmap_pfn, max_pfn_mapped))
+ rc = xen_create_contiguous_region((unsigned long)
+ __va(addr),
+ order, 32);
+#else
+ if ((addr >> PAGE_SHIFT) < max_pfn_mapped)
+ rc = xen_create_contiguous_region((unsigned long)
+ __va(addr),
+ order, 32);
+ else if ((addr >> PAGE_SHIFT) < max_initmap_pfn)
+ rc = xen_create_contiguous_region(__START_KERNEL_map
+ + addr,
+ order, 32);
+#endif
+ else
+ rc = early_create_contiguous_region(addr >> PAGE_SHIFT,
+ order, 32);
+ if (rc)
+ return 0;
+ }
+#endif
+ e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
+ e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
+ printk(KERN_INFO "update e820 for early_reserve_e820\n");
+ update_e820();
+ update_e820_saved();
+
+ return addr;
+}
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_PAE
+# define MAX_ARCH_PFN (1ULL<<(40-PAGE_SHIFT))
+# else
+# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
+# endif
+#else /* CONFIG_X86_32 */
+# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
+#endif
+
+/*
+ * Find the highest page frame number we have available
+ */
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+{
+ int i;
+ unsigned long last_pfn = 0;
+ unsigned long max_arch_pfn = MAX_ARCH_PFN;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+
+ if (ei->type != type)
+ continue;
+
+ start_pfn = ei->addr >> PAGE_SHIFT;
+ end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
+
+ if (start_pfn >= limit_pfn)
+ continue;
+ if (end_pfn > limit_pfn) {
+ last_pfn = limit_pfn;
+ break;
+ }
+ if (end_pfn > last_pfn)
+ last_pfn = end_pfn;
+ }
+
+ if (last_pfn > max_arch_pfn)
+ last_pfn = max_arch_pfn;
+
+ printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
+ last_pfn, max_arch_pfn);
+ return last_pfn;
+}
+unsigned long __init e820_end_of_ram_pfn(void)
+{
+ return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+}
+
+unsigned long __init e820_end_of_low_ram_pfn(void)
+{
+ return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
+}
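
The PFN conversions used throughout (PFN_UP/PFN_DOWN and the explicit >> PAGE_SHIFT shifts) reduce to the arithmetic below, shown as a standalone sketch assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long addr = 0x12345;

	/* 0x12345 lies inside page 0x12: round up to 0x13, down to 0x12 */
	printf("PFN_UP=%#lx PFN_DOWN=%#lx\n", PFN_UP(addr), PFN_DOWN(addr));
	return 0;
}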
+/*
+ * Finds an active region in the address range from start_pfn to last_pfn and
+ * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
+ */
+int __init e820_find_active_region(const struct e820entry *ei,
+ unsigned long start_pfn,
+ unsigned long last_pfn,
+ unsigned long *ei_startpfn,
+ unsigned long *ei_endpfn)
+{
+ u64 align = PAGE_SIZE;
+
+#ifdef CONFIG_XEN
+ if (last_pfn > xen_start_info->nr_pages)
+ last_pfn = xen_start_info->nr_pages;
+#endif
+
+ *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
+ *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
+
+ /* Skip map entries smaller than a page */
+ if (*ei_startpfn >= *ei_endpfn)
+ return 0;
+
+ /* Skip if map is outside the node */
+ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
+ *ei_startpfn >= last_pfn)
+ return 0;
+
+ /* Check for overlaps */
+ if (*ei_startpfn < start_pfn)
+ *ei_startpfn = start_pfn;
+ if (*ei_endpfn > last_pfn)
+ *ei_endpfn = last_pfn;
+
+ return 1;
+}
+
+/* Walk the e820 map and register active regions within a node */
+void __init e820_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long last_pfn)
+{
+ unsigned long ei_startpfn;
+ unsigned long ei_endpfn;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++)
+ if (e820_find_active_region(&e820.map[i],
+ start_pfn, last_pfn,
+ &ei_startpfn, &ei_endpfn))
+ add_active_range(nid, ei_startpfn, ei_endpfn);
+#ifdef CONFIG_XEN
+ BUG_ON(nid);
+ add_active_range(nid, last_pfn, last_pfn);
+#endif
+}
+
+/*
+ * Find the hole size (in bytes) in the memory range.
+ * @start: starting address of the memory range to scan
+ * @end: ending address of the memory range to scan
+ */
+u64 __init e820_hole_size(u64 start, u64 end)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long last_pfn = end >> PAGE_SHIFT;
+ unsigned long ei_startpfn, ei_endpfn, ram = 0;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ if (e820_find_active_region(&e820.map[i],
+ start_pfn, last_pfn,
+ &ei_startpfn, &ei_endpfn))
+ ram += ei_endpfn - ei_startpfn;
+ }
+ return end - start - ((u64)ram << PAGE_SHIFT);
+}
+
+static void early_panic(char *msg)
+{
+ early_printk(msg);
+ panic(msg);
+}
+
+static int userdef __initdata;
+
+/* "mem=nopentium" disables the 4MB page tables. */
+static int __init parse_memopt(char *p)
+{
+ u64 mem_size, current_end;
+ unsigned int i;
+
+ if (!p)
+ return -EINVAL;
+
+#ifdef CONFIG_X86_32
+ if (!strcmp(p, "nopentium")) {
+ setup_clear_cpu_cap(X86_FEATURE_PSE);
+ return 0;
+ }
+#endif
+
+ userdef = 1;
+ mem_size = memparse(p, &p);
+ e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+
+ i = e820.nr_map - 1;
+ current_end = e820.map[i].addr + e820.map[i].size;
+ if (current_end < mem_size) {
+ /*
+ * The e820 map ends before our requested size so
+ * extend the final entry to the requested address.
+ */
+ if (e820.map[i].type == E820_RAM)
+ e820.map[i].size = mem_size - e820.map[i].addr;
+ else
+ e820_add_region(current_end, mem_size - current_end, E820_RAM);
+ }
+
+ return 0;
+}
+early_param("mem", parse_memopt);
+
+#ifndef CONFIG_XEN
+static int __init parse_memmap_opt(char *p)
+{
+ char *oldp;
+ u64 start_at, mem_size;
+
+ if (!p)
+ return -EINVAL;
+
+ if (!strncmp(p, "exactmap", 8)) {
+#ifdef CONFIG_CRASH_DUMP
+ /*
+ * If we are doing a crash dump, we still need to know
+ * the real mem size before original memory map is
+ * reset.
+ */
+ saved_max_pfn = e820_end_of_ram_pfn();
+#endif
+ e820.nr_map = 0;
+ userdef = 1;
+ return 0;
+ }
+
+ oldp = p;
+ mem_size = memparse(p, &p);
+ if (p == oldp)
+ return -EINVAL;
+
+ userdef = 1;
+ if (*p == '@') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_RAM);
+ } else if (*p == '#') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_ACPI);
+ } else if (*p == '$') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_RESERVED);
+ } else
+ e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+
+ return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", parse_memmap_opt);
+#endif
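
parse_memopt() and parse_memmap_opt() both rely on memparse() to accept sizes with K/M/G suffixes, e.g. "mem=512M" or "memmap=64M@0x1000000". A simplified standalone sketch of that suffix handling (parse_size() is an invented stand-in, not the kernel helper):

#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *p, char **retp)
{
	unsigned long long v = strtoull(p, retp, 0);

	switch (**retp) {
	case 'G': case 'g':
		v <<= 10;		/* fall through */
	case 'M': case 'm':
		v <<= 10;		/* fall through */
	case 'K': case 'k':
		v <<= 10;
		(*retp)++;
	}
	return v;
}

int main(void)
{
	char *rest;
	/* the "64M@0x1000000" form handled by parse_memmap_opt() above */
	unsigned long long size = parse_size("64M@0x1000000", &rest);

	printf("size=%#llx rest=%s\n", size, rest);
	return 0;
}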
+
+void __init finish_e820_parsing(void)
+{
+ if (userdef) {
+ int nr = e820.nr_map;
+
+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
+ early_panic("Invalid user supplied memory map");
+ e820.nr_map = nr;
+
+ printk(KERN_INFO "user-defined physical RAM map:\n");
+ _e820_print_map(&e820, "user");
+ }
+}
+
+static inline const char *e820_type_to_string(int e820_type)
+{
+ switch (e820_type) {
+ case E820_RESERVED_KERN:
+ case E820_RAM: return "System RAM";
+ case E820_ACPI: return "ACPI Tables";
+ case E820_NVS: return "ACPI Non-volatile Storage";
+ default: return "reserved";
+ }
+}
+
+#ifdef CONFIG_XEN
+#define e820 machine_e820
+#endif
+
+/*
+ * Mark e820 reserved areas as busy for the resource manager.
+ */
+void __init e820_reserve_resources(void)
+{
+ int i;
+ struct resource *res;
+ u64 end;
+
+ res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
+ for (i = 0; i < e820.nr_map; i++) {
+ end = e820.map[i].addr + e820.map[i].size - 1;
+#ifndef CONFIG_RESOURCES_64BIT
+ if (end > 0x100000000ULL) {
+ res++;
+ continue;
+ }
+#endif
+ res->name = e820_type_to_string(e820.map[i].type);
+ res->start = e820.map[i].addr;
+ res->end = end;
+
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ insert_resource(&iomem_resource, res);
+ res++;
+ }
+
+ for (i = 0; i < e820_saved.nr_map; i++) {
+ struct e820entry *entry = &e820_saved.map[i];
+ firmware_map_add_early(entry->addr,
+ entry->addr + entry->size - 1,
+ e820_type_to_string(entry->type));
+ }
+}
+
+#undef e820
+
+#ifndef CONFIG_XEN
+char *__init default_machine_specific_memory_setup(void)
+{
+ char *who = "BIOS-e820";
+ int new_nr;
+ /*
+ * Try to copy the BIOS-supplied E820-map.
+ *
+ * Otherwise fake a memory map; one section from 0k->640k,
+ * the next section from 1mb->appropriate_mem_k
+ */
+ new_nr = boot_params.e820_entries;
+ sanitize_e820_map(boot_params.e820_map,
+ ARRAY_SIZE(boot_params.e820_map),
+ &new_nr);
+ boot_params.e820_entries = new_nr;
+ if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
+ < 0) {
+ u64 mem_size;
+
+ /* compare results from other methods and take the greater */
+ if (boot_params.alt_mem_k
+ < boot_params.screen_info.ext_mem_k) {
+ mem_size = boot_params.screen_info.ext_mem_k;
+ who = "BIOS-88";
+ } else {
+ mem_size = boot_params.alt_mem_k;
+ who = "BIOS-e801";
+ }
+
+ e820.nr_map = 0;
+ e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+ e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
+ }
+
+ /* In case someone cares... */
+ return who;
+}
+
+char *__init __attribute__((weak)) machine_specific_memory_setup(void)
+{
+ if (x86_quirks->arch_memory_setup) {
+ char *who = x86_quirks->arch_memory_setup();
+
+ if (who)
+ return who;
+ }
+ return default_machine_specific_memory_setup();
+}
+#endif
+
+static char * __init _memory_setup(void)
+{
+ int rc, nr_map;
+ struct xen_memory_map memmap;
+ static struct e820entry __initdata map[E820MAX];
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+ if (rc == -ENOSYS) {
+ memmap.nr_entries = 1;
+ map[0].addr = 0ULL;
+ map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
+ /* 8MB slack (to balance backend allocations). */
+ map[0].size += 8ULL << 20;
+ map[0].type = E820_RAM;
+ rc = 0;
+ }
+ BUG_ON(rc);
+
+ nr_map = memmap.nr_entries;
+ sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map);
+
+ if (append_e820_map(map, nr_map) < 0)
+ BUG();
+
+#ifdef CONFIG_XEN
+ if (is_initial_xendomain()) {
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, machine_e820.map);
+
+ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
+ BUG();
+ machine_e820.nr_map = memmap.nr_entries;
+ }
+#endif
+
+ return "Xen";
+}
+
+void __init setup_memory_map(void)
+{
+ char *who;
+
+ who = _memory_setup();
+#ifdef CONFIG_XEN
+ if (is_initial_xendomain()) {
+ printk(KERN_INFO "Xen-provided machine memory map:\n");
+ _e820_print_map(&machine_e820, "BIOS");
+ } else
+#endif
+ memcpy(&e820_saved, &e820, sizeof(struct e820map));
+ printk(KERN_INFO "Xen-provided physical RAM map:\n");
+ _e820_print_map(&e820, who);
+}
--- head-2011-03-11.orig/arch/x86/kernel/e820_32-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,860 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/pfn.h>
-#include <linux/uaccess.h>
-#include <linux/suspend.h>
-
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-#include <xen/interface/memory.h>
-
-struct e820map e820;
-struct change_member {
- struct e820entry *pbios; /* pointer to original bios entry */
- unsigned long long addr; /* address for this change point */
-};
-static struct change_member change_point_list[2*E820MAX] __initdata;
-static struct change_member *change_point[2*E820MAX] __initdata;
-static struct e820entry *overlap_list[E820MAX] __initdata;
-static struct e820entry new_bios[E820MAX] __initdata;
-/* For PCI or other memory-mapped resources */
-unsigned long pci_mem_start = 0x10000000;
-#ifdef CONFIG_PCI
-EXPORT_SYMBOL(pci_mem_start);
-#endif
-extern int user_defined_memmap;
-
-static struct resource system_rom_resource = {
- .name = "System ROM",
- .start = 0xf0000,
- .end = 0xfffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource extension_rom_resource = {
- .name = "Extension ROM",
- .start = 0xe0000,
- .end = 0xeffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource adapter_rom_resources[] = { {
- .name = "Adapter ROM",
- .start = 0xc8000,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-} };
-
-static struct resource video_rom_resource = {
- .name = "Video ROM",
- .start = 0xc0000,
- .end = 0xc7fff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-#define ROMSIGNATURE 0xaa55
-
-static int __init romsignature(const unsigned char *rom)
-{
- const unsigned short * const ptr = (const unsigned short *)rom;
- unsigned short sig;
-
- return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
-}
-
-static int __init romchecksum(const unsigned char *rom, unsigned long length)
-{
- unsigned char sum, c;
-
- for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
- sum += c;
- return !length && !sum;
-}
-
-static void __init probe_roms(void)
-{
- const unsigned char *rom;
- unsigned long start, length, upper;
- unsigned char c;
- int i;
-
-#ifdef CONFIG_XEN
- /* Nothing to do if not running in dom0. */
- if (!is_initial_xendomain())
- return;
-#endif
-
- /* video rom */
- upper = adapter_rom_resources[0].start;
- for (start = video_rom_resource.start; start < upper; start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- video_rom_resource.start = start;
-
- if (probe_kernel_address(rom + 2, c) != 0)
- continue;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = c * 512;
-
- /* if checksum okay, trust length byte */
- if (length && romchecksum(rom, length))
- video_rom_resource.end = start + length - 1;
-
- request_resource(&iomem_resource, &video_rom_resource);
- break;
- }
-
- start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
- if (start < upper)
- start = upper;
-
- /* system rom */
- request_resource(&iomem_resource, &system_rom_resource);
- upper = system_rom_resource.start;
-
- /* check for extension rom (ignore length byte!) */
- rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
- if (romsignature(rom)) {
- length = extension_rom_resource.end - extension_rom_resource.start + 1;
- if (romchecksum(rom, length)) {
- request_resource(&iomem_resource, &extension_rom_resource);
- upper = extension_rom_resource.start;
- }
- }
-
- /* check for adapter roms on 2k boundaries */
- for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- if (probe_kernel_address(rom + 2, c) != 0)
- continue;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = c * 512;
-
- /* but accept any length that fits if checksum okay */
- if (!length || start + length > upper || !romchecksum(rom, length))
- continue;
-
- adapter_rom_resources[i].start = start;
- adapter_rom_resources[i].end = start + length - 1;
- request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
- start = adapter_rom_resources[i++].end & ~2047UL;
- }
-}
-
-#ifdef CONFIG_XEN
-static struct e820map machine_e820;
-#define e820 machine_e820
-#endif
-
-/*
- * Request address space for all standard RAM and ROM resources
- * and also for regions reported as reserved by the e820.
- */
-void __init init_iomem_resources(struct resource *code_resource,
- struct resource *data_resource,
- struct resource *bss_resource)
-{
- int i;
-
- probe_roms();
- for (i = 0; i < e820.nr_map; i++) {
- struct resource *res;
-#ifndef CONFIG_RESOURCES_64BIT
- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
- continue;
-#endif
- res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
- switch (e820.map[i].type) {
- case E820_RAM: res->name = "System RAM"; break;
- case E820_ACPI: res->name = "ACPI Tables"; break;
- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
- default: res->name = "reserved";
- }
- res->start = e820.map[i].addr;
- res->end = res->start + e820.map[i].size - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- if (request_resource(&iomem_resource, res)) {
- kfree(res);
- continue;
- }
- if (e820.map[i].type == E820_RAM) {
- /*
- * We don't know which RAM region contains kernel data,
- * so we try it repeatedly and let the resource manager
- * test it.
- */
-#ifndef CONFIG_XEN
- request_resource(res, code_resource);
- request_resource(res, data_resource);
- request_resource(res, bss_resource);
-#endif
-#ifdef CONFIG_KEXEC
- if (crashk_res.start != crashk_res.end)
- request_resource(res, &crashk_res);
-#ifdef CONFIG_XEN
- xen_machine_kexec_register_resources(res);
-#endif
-#endif
- }
- }
-}
-
-#undef e820
-
-#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
-/**
- * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
- * correspond to e820 RAM areas and mark the corresponding pages as nosave for
- * hibernation.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
- int i;
- unsigned long pfn;
-
- pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
- for (i = 1; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-
- if (pfn < PFN_UP(ei->addr))
- register_nosave_region(pfn, PFN_UP(ei->addr));
-
- pfn = PFN_DOWN(ei->addr + ei->size);
- if (ei->type != E820_RAM)
- register_nosave_region(PFN_UP(ei->addr), pfn);
-
- if (pfn >= max_low_pfn)
- break;
- }
-}
-#endif
-
-void __init add_memory_region(unsigned long long start,
- unsigned long long size, int type)
-{
- int x;
-
- x = e820.nr_map;
-
- if (x == E820MAX) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
- return;
- }
-
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
-} /* add_memory_region */
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
-{
- struct change_member *change_tmp;
- unsigned long current_type, last_type;
- unsigned long long last_addr;
- int chgidx, still_changing;
- int overlap_entries;
- int new_bios_entry;
- int old_nr, new_nr, chg_nr;
- int i;
-
- /*
- Visually we're performing the following (1,2,3,4 = memory types)...
-
- Sample memory map (w/overlaps):
- ____22__________________
- ______________________4_
- ____1111________________
- _44_____________________
- 11111111________________
- ____________________33__
- ___________44___________
- __________33333_________
- ______________22________
- ___________________2222_
- _________111111111______
- _____________________11_
- _________________4______
-
- Sanitized equivalent (no overlap):
- 1_______________________
- _44_____________________
- ___1____________________
- ____22__________________
- ______11________________
- _________1______________
- __________3_____________
- ___________44___________
- _____________33_________
- _______________2________
- ________________1_______
- _________________4______
- ___________________2____
- ____________________33__
- ______________________4_
- */
- /* if there's only one memory region, don't bother */
- if (*pnr_map < 2) {
- return -1;
- }
-
- old_nr = *pnr_map;
-
- /* bail out if we find any unreasonable addresses in bios map */
- for (i=0; i<old_nr; i++)
- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
- return -1;
- }
-
- /* create pointers for initial change-point information (for sorting) */
- for (i=0; i < 2*old_nr; i++)
- change_point[i] = &change_point_list[i];
-
- /* record all known change-points (starting and ending addresses),
- omitting those that are for empty memory regions */
- chgidx = 0;
- for (i=0; i < old_nr; i++) {
- if (biosmap[i].size != 0) {
- change_point[chgidx]->addr = biosmap[i].addr;
- change_point[chgidx++]->pbios = &biosmap[i];
- change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
- change_point[chgidx++]->pbios = &biosmap[i];
- }
- }
- chg_nr = chgidx; /* true number of change-points */
-
- /* sort change-point list by memory addresses (low -> high) */
- still_changing = 1;
- while (still_changing) {
- still_changing = 0;
- for (i=1; i < chg_nr; i++) {
- /* if <current_addr> > <last_addr>, swap */
- /* or, if current=<start_addr> & last=<end_addr>, swap */
- if ((change_point[i]->addr < change_point[i-1]->addr) ||
- ((change_point[i]->addr == change_point[i-1]->addr) &&
- (change_point[i]->addr == change_point[i]->pbios->addr) &&
- (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
- )
- {
- change_tmp = change_point[i];
- change_point[i] = change_point[i-1];
- change_point[i-1] = change_tmp;
- still_changing=1;
- }
- }
- }
-
- /* create a new bios memory map, removing overlaps */
- overlap_entries=0; /* number of entries in the overlap table */
- new_bios_entry=0; /* index for creating new bios map entries */
- last_type = 0; /* start with undefined memory type */
- last_addr = 0; /* start with 0 as last starting address */
- /* loop through change-points, determining affect on the new bios map */
- for (chgidx=0; chgidx < chg_nr; chgidx++)
- {
- /* keep track of all overlapping bios entries */
- if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
- {
- /* add map entry to overlap list (> 1 entry implies an overlap) */
- overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
- }
- else
- {
- /* remove entry from list (order independent, so swap with last) */
- for (i=0; i<overlap_entries; i++)
- {
- if (overlap_list[i] == change_point[chgidx]->pbios)
- overlap_list[i] = overlap_list[overlap_entries-1];
- }
- overlap_entries--;
- }
- /* if there are overlapping entries, decide which "type" to use */
- /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
- current_type = 0;
- for (i=0; i<overlap_entries; i++)
- if (overlap_list[i]->type > current_type)
- current_type = overlap_list[i]->type;
- /* continue building up new bios map based on this information */
- if (current_type != last_type) {
- if (last_type != 0) {
- new_bios[new_bios_entry].size =
- change_point[chgidx]->addr - last_addr;
- /* move forward only if the new size was non-zero */
- if (new_bios[new_bios_entry].size != 0)
- if (++new_bios_entry >= E820MAX)
- break; /* no more space left for new bios entries */
- }
- if (current_type != 0) {
- new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
- new_bios[new_bios_entry].type = current_type;
- last_addr=change_point[chgidx]->addr;
- }
- last_type = current_type;
- }
- }
- new_nr = new_bios_entry; /* retain count for new bios entries */
-
- /* copy new bios mapping into original location */
- memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
- *pnr_map = new_nr;
-
- return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory. If we aren't, we'll fake a memory map.
- *
- * We check to see that the memory map contains at least 2 elements
- * before we'll use it, because the detection code in setup.S may
- * not be perfect and most every PC known to man has two memory
- * regions: one from 0 to 640k, and one from 1mb up. (The IBM
- * thinkpad 560x, for example, does not cooperate with the memory
- * detection code.)
- */
-int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
-{
-#ifndef CONFIG_XEN
- /* Only one memory region (or negative)? Ignore it */
- if (nr_map < 2)
- return -1;
-#else
- BUG_ON(nr_map < 1);
-#endif
-
- do {
- u64 start = biosmap->addr;
- u64 size = biosmap->size;
- u64 end = start + size;
- u32 type = biosmap->type;
-
- /* Overflow in 64 bits? Ignore the memory map. */
- if (start > end)
- return -1;
-
- add_memory_region(start, size, type);
- } while (biosmap++, --nr_map);
-
-#ifdef CONFIG_XEN
- if (is_initial_xendomain()) {
- struct xen_memory_map memmap;
-
- memmap.nr_entries = E820MAX;
- set_xen_guest_handle(memmap.buffer, machine_e820.map);
-
- if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
- BUG();
- machine_e820.nr_map = memmap.nr_entries;
- } else
- machine_e820 = e820;
-#endif
-
- return 0;
-}
-
-/*
- * Find the highest page frame number we have available
- */
-void __init propagate_e820_map(void)
-{
- int i;
-
- max_pfn = 0;
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long start, end;
- /* RAM? */
- if (e820.map[i].type != E820_RAM)
- continue;
- start = PFN_UP(e820.map[i].addr);
- end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
- if (start >= end)
- continue;
- if (end > max_pfn)
- max_pfn = end;
- memory_present(0, start, end);
- }
-}
-
-/*
- * Register fully available low RAM pages with the bootmem allocator.
- */
-void __init register_bootmem_low_pages(unsigned long max_low_pfn)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long curr_pfn, last_pfn, size;
- /*
- * Reserve usable low memory
- */
- if (e820.map[i].type != E820_RAM)
- continue;
- /*
- * We are rounding up the start address of usable memory:
- */
- curr_pfn = PFN_UP(e820.map[i].addr);
- if (curr_pfn >= max_low_pfn)
- continue;
- /*
- * ... and at the end of the usable range downwards:
- */
- last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-
-#ifdef CONFIG_XEN
- /*
- * Truncate to the number of actual pages currently
- * present.
- */
- if (last_pfn > xen_start_info->nr_pages)
- last_pfn = xen_start_info->nr_pages;
-#endif
-
- if (last_pfn > max_low_pfn)
- last_pfn = max_low_pfn;
-
- /*
- * .. finally, did all the rounding and playing
- * around just make the area go away?
- */
- if (last_pfn <= curr_pfn)
- continue;
-
- size = last_pfn - curr_pfn;
- free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
- }
-}
-
-void __init e820_register_memory(void)
-{
- unsigned long gapstart, gapsize, round;
- unsigned long long last;
- int i;
-
-#ifdef CONFIG_XEN
-#define e820 machine_e820
-#endif
- /*
- * Search for the biggest gap in the low 32 bits of the e820
- * memory space.
- */
- last = 0x100000000ull;
- gapstart = 0x10000000;
- gapsize = 0x400000;
- i = e820.nr_map;
- while (--i >= 0) {
- unsigned long long start = e820.map[i].addr;
- unsigned long long end = start + e820.map[i].size;
-
- /*
- * Since "last" is at most 4GB, we know we'll
- * fit in 32 bits if this condition is true
- */
- if (last > end) {
- unsigned long gap = last - end;
-
- if (gap > gapsize) {
- gapsize = gap;
- gapstart = end;
- }
- }
- if (start < last)
- last = start;
- }
-#undef e820
-
- /*
- * See how much we want to round up: start off with
- * rounding to the next 1MB area.
- */
- round = 0x100000;
- while ((gapsize >> 4) > round)
- round += round;
- /* Fun with two's complement */
- pci_mem_start = (gapstart + round) & -round;
-
- printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
- pci_mem_start, gapstart, gapsize);
-}
-
-void __init print_memory_map(char *who)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- printk(" %s: %016Lx - %016Lx ", who,
- e820.map[i].addr,
- e820.map[i].addr + e820.map[i].size);
- switch (e820.map[i].type) {
- case E820_RAM: printk("(usable)\n");
- break;
- case E820_RESERVED:
- printk("(reserved)\n");
- break;
- case E820_ACPI:
- printk("(ACPI data)\n");
- break;
- case E820_NVS:
- printk("(ACPI NVS)\n");
- break;
- default: printk("type %u\n", e820.map[i].type);
- break;
- }
- }
-}
-
-void __init limit_regions(unsigned long long size)
-{
- unsigned long long current_addr = 0;
- int i;
-
- print_memory_map("limit_regions start");
- for (i = 0; i < e820.nr_map; i++) {
- current_addr = e820.map[i].addr + e820.map[i].size;
- if (current_addr < size)
- continue;
-
- if (e820.map[i].type != E820_RAM)
- continue;
-
- if (e820.map[i].addr >= size) {
- /*
- * This region starts past the end of the
- * requested size, skip it completely.
- */
- e820.nr_map = i;
- } else {
- e820.nr_map = i + 1;
- e820.map[i].size -= current_addr - size;
- }
- print_memory_map("limit_regions endfor");
- return;
- }
-#ifdef CONFIG_XEN
- if (current_addr < size) {
- /*
- * The e820 map finished before our requested size so
- * extend the final entry to the requested address.
- */
- --i;
- if (e820.map[i].type == E820_RAM)
- e820.map[i].size -= current_addr - size;
- else
- add_memory_region(current_addr, size - current_addr, E820_RAM);
- }
-#endif
- print_memory_map("limit_regions endfunc");
-}
-
-/*
- * This function checks if any part of the range <start,end> is mapped
- * with type.
- */
-int
-e820_any_mapped(u64 start, u64 end, unsigned type)
-{
- int i;
-
-#ifndef CONFIG_XEN
- for (i = 0; i < e820.nr_map; i++) {
- const struct e820entry *ei = &e820.map[i];
-#else
- if (!is_initial_xendomain())
- return 0;
- for (i = 0; i < machine_e820.nr_map; ++i) {
- const struct e820entry *ei = &machine_e820.map[i];
-#endif
-
- if (type && ei->type != type)
- continue;
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
-
- /*
- * This function checks if the entire range <start,end> is mapped with type.
- *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
- */
-int __init
-e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
-{
- u64 start = s;
- u64 end = e;
- int i;
-
-#ifndef CONFIG_XEN
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-#else
- if (!is_initial_xendomain())
- return 0;
- for (i = 0; i < machine_e820.nr_map; ++i) {
- const struct e820entry *ei = &machine_e820.map[i];
-#endif
-
- if (type && ei->type != type)
- continue;
- /* is the region (part) in overlap with the current region ?*/
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
- /* if the region is at the beginning of <start,end> we move
- * start to the end of the region since it's ok until there
- */
- if (ei->addr <= start)
- start = ei->addr + ei->size;
- /* if start is now at or beyond end, we're done, full
- * coverage */
- if (start >= end)
- return 1; /* we're done */
- }
- return 0;
-}
-
-static int __init parse_memmap(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp(arg, "exactmap") == 0) {
-#ifdef CONFIG_CRASH_DUMP
- /* If we are doing a crash dump, we
- * still need to know the real mem
- * size before original memory map is
- * reset.
- */
- propagate_e820_map();
- saved_max_pfn = max_pfn;
-#endif
- e820.nr_map = 0;
- user_defined_memmap = 1;
- } else {
- /* If the user specifies memory size, we
- * limit the BIOS-provided memory map to
- * that size. exactmap can be used to specify
- * the exact map. mem=number can be used to
- * trim the existing memory map.
- */
- unsigned long long start_at, mem_size;
-
- mem_size = memparse(arg, &arg);
- if (*arg == '@') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_RAM);
- } else if (*arg == '#') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_ACPI);
- } else if (*arg == '$') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_RESERVED);
- } else {
- limit_regions(mem_size);
- user_defined_memmap = 1;
- }
- }
- return 0;
-}
-early_param("memmap", parse_memmap);
-
-#ifndef CONFIG_XEN
-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
- unsigned new_type)
-{
- int i;
-
- BUG_ON(old_type == new_type);
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 final_start, final_end;
- if (ei->type != old_type)
- continue;
- /* totally covered? */
- if (ei->addr >= start && ei->size <= size) {
- ei->type = new_type;
- continue;
- }
- /* partially covered */
- final_start = max(start, ei->addr);
- final_end = min(start + size, ei->addr + ei->size);
- if (final_start >= final_end)
- continue;
- add_memory_region(final_start, final_end - final_start,
- new_type);
- }
-}
-
-void __init update_e820(void)
-{
- u8 nr_map;
-
- nr_map = e820.nr_map;
- if (sanitize_e820_map(e820.map, &nr_map))
- return;
- e820.nr_map = nr_map;
- printk(KERN_INFO "modified physical RAM map:\n");
- print_memory_map("modified");
-}
-#endif
--- head-2011-03-11.orig/arch/x86/kernel/e820_64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,1052 +0,0 @@
-/*
- * Handle the memory map.
- * The functions here do the job until bootmem takes over.
- *
- * Getting sanitize_e820_map() in sync with i386 version by applying change:
- * - Provisions for empty E820 memory regions (reported by certain BIOSes).
- * Alex Achenbach <xela@slit.de>, December 2002.
- * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- *
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/suspend.h>
-#include <linux/pfn.h>
-
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/proto.h>
-#include <asm/setup.h>
-#include <asm/sections.h>
-#include <asm/kdebug.h>
-#include <xen/interface/memory.h>
-
-struct e820map e820 __initdata;
-#ifdef CONFIG_XEN
-struct e820map machine_e820;
-#endif
-
-/*
- * PFN of last memory page.
- */
-unsigned long end_pfn;
-
-#ifndef CONFIG_XEN
-/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
- */
-unsigned long max_pfn_mapped;
-#endif
-
-/*
- * Last pfn which the user wants to use.
- */
-static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
-
-/*
- * Early reserved memory areas.
- */
-#define MAX_EARLY_RES 20
-
-struct early_res {
- unsigned long start, end;
- char name[16];
-};
-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
-#ifndef CONFIG_XEN
- { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
-#ifdef CONFIG_X86_TRAMPOLINE
- { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
-#endif
-#endif
- {}
-};
-
-void __init reserve_early(unsigned long start, unsigned long end, char *name)
-{
- int i;
- struct early_res *r;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
- if (end > r->start && start < r->end)
- panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
- start, end - 1, name?name:"", r->start, r->end - 1, r->name);
- }
- if (i >= MAX_EARLY_RES)
- panic("Too many early reservations");
- r = &early_res[i];
- r->start = start;
- r->end = end;
- if (name)
- strncpy(r->name, name, sizeof(r->name) - 1);
-}
-
-void __init free_early(unsigned long start, unsigned long end)
-{
- struct early_res *r;
- int i, j;
-
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
- if (start == r->start && end == r->end)
- break;
- }
- if (i >= MAX_EARLY_RES || !early_res[i].end)
- panic("free_early on not reserved area: %lx-%lx!", start, end);
-
- for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
- ;
-
- memmove(&early_res[i], &early_res[i + 1],
- (j - 1 - i) * sizeof(struct early_res));
-
- early_res[j - 1].end = 0;
-}
-
-void __init early_res_to_bootmem(unsigned long start, unsigned long end)
-{
- int i;
- unsigned long final_start, final_end;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- final_start = max(start, r->start);
- final_end = min(end, r->end);
- if (final_start >= final_end)
- continue;
- printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
- final_start, final_end - 1, r->name);
- reserve_bootmem_generic(final_start, final_end - final_start);
- }
-}
-
-/* Check for already reserved areas */
-static inline int __init
-bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
-{
- int i;
- unsigned long addr = *addrp, last;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last >= r->start && addr < r->end) {
- *addrp = addr = round_up(r->end, align);
- changed = 1;
- goto again;
- }
- }
- return changed;
-}
-
-/* Check for already reserved areas */
-static inline int __init
-bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
-{
- int i;
- unsigned long addr = *addrp, last;
- unsigned long size = *sizep;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last > r->start && addr < r->start) {
- size = r->start - addr;
- changed = 1;
- goto again;
- }
- if (last > r->end && addr < r->end) {
- addr = round_up(r->end, align);
- size = last - addr;
- changed = 1;
- goto again;
- }
- if (last <= r->end && addr >= r->start) {
- (*sizep)++;
- return 0;
- }
- }
- if (changed) {
- *addrp = addr;
- *sizep = size;
- }
- return changed;
-}
-/*
- * This function checks if any part of the range <start,end> is mapped
- * with type.
- */
-int
-e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
-{
- int i;
-
-#ifndef CONFIG_XEN
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-#else
- if (!is_initial_xendomain())
- return 0;
- for (i = 0; i < machine_e820.nr_map; i++) {
- const struct e820entry *ei = &machine_e820.map[i];
-#endif
-
- if (type && ei->type != type)
- continue;
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
-
-/*
- * This function checks if the entire range <start,end> is mapped with type.
- *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
- */
-int __init e820_all_mapped(unsigned long start, unsigned long end,
- unsigned type)
-{
- int i;
-
-#ifndef CONFIG_XEN
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-#else
- if (!is_initial_xendomain())
- return 0;
- for (i = 0; i < machine_e820.nr_map; i++) {
- const struct e820entry *ei = &machine_e820.map[i];
-#endif
-
- if (type && ei->type != type)
- continue;
- /* is the region (part) in overlap with the current region ?*/
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
-
- /* if the region is at the beginning of <start,end> we move
- * start to the end of the region since it's ok until there
- */
- if (ei->addr <= start)
- start = ei->addr + ei->size;
- /*
- * if start is now at or beyond end, we're done, full
- * coverage
- */
- if (start >= end)
- return 1;
- }
- return 0;
-}
-
-/*
- * Find a free area with specified alignment in a specific range.
- */
-unsigned long __init find_e820_area(unsigned long start, unsigned long end,
- unsigned long size, unsigned long align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long addr, last;
- unsigned long ei_last;
-
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
- ;
- last = addr + size;
- if (last > ei_last)
- continue;
- if (last > end)
- continue;
- return addr;
- }
- return -1UL;
-}
-
-/*
- * Find next free range after *start
- */
-unsigned long __init find_e820_area_size(unsigned long start,
- unsigned long *sizep,
- unsigned long align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long addr, last;
- unsigned long ei_last;
-
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- *sizep = ei_last - addr;
- while (bad_addr_size(&addr, sizep, align) &&
- addr + *sizep <= ei_last)
- ;
- last = addr + *sizep;
- if (last > ei_last)
- continue;
- return addr;
- }
- return -1UL;
-
-}
-/*
- * Find the highest page frame number we have available
- */
-unsigned long __init e820_end_of_ram(void)
-{
- unsigned long end_pfn;
-
- end_pfn = find_max_pfn_with_active_regions();
-
- if (end_pfn > max_pfn_mapped)
- max_pfn_mapped = end_pfn;
- if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
- max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
- if (end_pfn > end_user_pfn)
- end_pfn = end_user_pfn;
- if (end_pfn > max_pfn_mapped)
- end_pfn = max_pfn_mapped;
-
- printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
- return end_pfn;
-}
-
-/*
- * Mark e820 reserved areas as busy for the resource manager.
- */
-void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
-{
- int i;
- struct resource *res;
-
- res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
- for (i = 0; i < nr_map; i++) {
- switch (e820[i].type) {
- case E820_RAM: res->name = "System RAM"; break;
- case E820_ACPI: res->name = "ACPI Tables"; break;
- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
- default: res->name = "reserved";
- }
- res->start = e820[i].addr;
- res->end = res->start + e820[i].size - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- insert_resource(&iomem_resource, res);
- res++;
- }
-}
-
-#ifndef CONFIG_XEN
-/*
- * Find the ranges of physical addresses that do not correspond to
- * e820 RAM areas and mark the corresponding pages as nosave for software
- * suspend and suspend to RAM.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
- int i;
- unsigned long paddr;
-
- paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
- for (i = 1; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-
- if (paddr < ei->addr)
- register_nosave_region(PFN_DOWN(paddr),
- PFN_UP(ei->addr));
-
- paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (ei->type != E820_RAM)
- register_nosave_region(PFN_UP(ei->addr),
- PFN_DOWN(paddr));
-
- if (paddr >= (end_pfn << PAGE_SHIFT))
- break;
- }
-}
-#endif
-
-/*
- * Finds an active region in the address range from start_pfn to end_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
- */
-static int __init e820_find_active_region(const struct e820entry *ei,
- unsigned long start_pfn,
- unsigned long end_pfn,
- unsigned long *ei_startpfn,
- unsigned long *ei_endpfn)
-{
-#ifdef CONFIG_XEN
- if (end_pfn > xen_start_info->nr_pages)
- end_pfn = xen_start_info->nr_pages;
-#endif
-
- *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
- *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
-
- /* Skip map entries smaller than a page */
- if (*ei_startpfn >= *ei_endpfn)
- return 0;
-
- /* Check if max_pfn_mapped should be updated */
- if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
- max_pfn_mapped = *ei_endpfn;
-
- /* Skip if map is outside the node */
- if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
- *ei_startpfn >= end_pfn)
- return 0;
-
- /* Check for overlaps */
- if (*ei_startpfn < start_pfn)
- *ei_startpfn = start_pfn;
- if (*ei_endpfn > end_pfn)
- *ei_endpfn = end_pfn;
-
- /* Obey end_user_pfn to save on memmap */
- if (*ei_startpfn >= end_user_pfn)
- return 0;
- if (*ei_endpfn > end_user_pfn)
- *ei_endpfn = end_user_pfn;
-
- return 1;
-}
-
-/* Walk the e820 map and register active regions within a node */
-void __init
-e820_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long ei_startpfn;
- unsigned long ei_endpfn;
- int i;
-
- for (i = 0; i < e820.nr_map; i++)
- if (e820_find_active_region(&e820.map[i],
- start_pfn, end_pfn,
- &ei_startpfn, &ei_endpfn))
- add_active_range(nid, ei_startpfn, ei_endpfn);
-#ifdef CONFIG_XEN
- BUG_ON(nid);
- add_active_range(nid, end_pfn, end_pfn);
-#endif
-}
-
-/*
- * Add a memory region to the kernel e820 map.
- */
-void __init add_memory_region(unsigned long start, unsigned long size, int type)
-{
- int x = e820.nr_map;
-
- if (x == E820MAX) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
- return;
- }
-
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
-}
-
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long end_pfn = end >> PAGE_SHIFT;
- unsigned long ei_startpfn, ei_endpfn, ram = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- if (e820_find_active_region(&e820.map[i],
- start_pfn, end_pfn,
- &ei_startpfn, &ei_endpfn))
- ram += ei_endpfn - ei_startpfn;
- }
- return end - start - (ram << PAGE_SHIFT);
-}
-
-static void __init e820_print_map(char *who)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
- (unsigned long long) e820.map[i].addr,
- (unsigned long long)
- (e820.map[i].addr + e820.map[i].size));
- switch (e820.map[i].type) {
- case E820_RAM:
- printk(KERN_CONT "(usable)\n");
- break;
- case E820_RESERVED:
- printk(KERN_CONT "(reserved)\n");
- break;
- case E820_ACPI:
- printk(KERN_CONT "(ACPI data)\n");
- break;
- case E820_NVS:
- printk(KERN_CONT "(ACPI NVS)\n");
- break;
- default:
- printk(KERN_CONT "type %u\n", e820.map[i].type);
- break;
- }
- }
-}
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
-{
- struct change_member {
- struct e820entry *pbios; /* pointer to original bios entry */
- unsigned long long addr; /* address for this change point */
- };
- static struct change_member change_point_list[2*E820MAX] __initdata;
- static struct change_member *change_point[2*E820MAX] __initdata;
- static struct e820entry *overlap_list[E820MAX] __initdata;
- static struct e820entry new_bios[E820MAX] __initdata;
- struct change_member *change_tmp;
- unsigned long current_type, last_type;
- unsigned long long last_addr;
- int chgidx, still_changing;
- int overlap_entries;
- int new_bios_entry;
- int old_nr, new_nr, chg_nr;
- int i;
-
- /*
- Visually we're performing the following
- (1,2,3,4 = memory types)...
-
- Sample memory map (w/overlaps):
- ____22__________________
- ______________________4_
- ____1111________________
- _44_____________________
- 11111111________________
- ____________________33__
- ___________44___________
- __________33333_________
- ______________22________
- ___________________2222_
- _________111111111______
- _____________________11_
- _________________4______
-
- Sanitized equivalent (no overlap):
- 1_______________________
- _44_____________________
- ___1____________________
- ____22__________________
- ______11________________
- _________1______________
- __________3_____________
- ___________44___________
- _____________33_________
- _______________2________
- ________________1_______
- _________________4______
- ___________________2____
- ____________________33__
- ______________________4_
- */
-
- /* if there's only one memory region, don't bother */
- if (*pnr_map < 2)
- return -1;
-
- old_nr = *pnr_map;
-
- /* bail out if we find any unreasonable addresses in bios map */
- for (i = 0; i < old_nr; i++)
- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
- return -1;
-
- /* create pointers for initial change-point information (for sorting) */
- for (i = 0; i < 2 * old_nr; i++)
- change_point[i] = &change_point_list[i];
-
- /* record all known change-points (starting and ending addresses),
- omitting those that are for empty memory regions */
- chgidx = 0;
- for (i = 0; i < old_nr; i++) {
- if (biosmap[i].size != 0) {
- change_point[chgidx]->addr = biosmap[i].addr;
- change_point[chgidx++]->pbios = &biosmap[i];
- change_point[chgidx]->addr = biosmap[i].addr +
- biosmap[i].size;
- change_point[chgidx++]->pbios = &biosmap[i];
- }
- }
- chg_nr = chgidx;
-
- /* sort change-point list by memory addresses (low -> high) */
- still_changing = 1;
- while (still_changing) {
- still_changing = 0;
- for (i = 1; i < chg_nr; i++) {
- unsigned long long curaddr, lastaddr;
- unsigned long long curpbaddr, lastpbaddr;
-
- curaddr = change_point[i]->addr;
- lastaddr = change_point[i - 1]->addr;
- curpbaddr = change_point[i]->pbios->addr;
- lastpbaddr = change_point[i - 1]->pbios->addr;
-
- /*
- * swap entries, when:
- *
- * curaddr > lastaddr or
- * curaddr == lastaddr and curaddr == curpbaddr and
- * lastaddr != lastpbaddr
- */
- if (curaddr < lastaddr ||
- (curaddr == lastaddr && curaddr == curpbaddr &&
- lastaddr != lastpbaddr)) {
- change_tmp = change_point[i];
- change_point[i] = change_point[i-1];
- change_point[i-1] = change_tmp;
- still_changing = 1;
- }
- }
- }
-
- /* create a new bios memory map, removing overlaps */
- overlap_entries = 0; /* number of entries in the overlap table */
- new_bios_entry = 0; /* index for creating new bios map entries */
- last_type = 0; /* start with undefined memory type */
- last_addr = 0; /* start with 0 as last starting address */
-
- /* loop through change-points, determining affect on the new bios map */
- for (chgidx = 0; chgidx < chg_nr; chgidx++) {
- /* keep track of all overlapping bios entries */
- if (change_point[chgidx]->addr ==
- change_point[chgidx]->pbios->addr) {
- /*
- * add map entry to overlap list (> 1 entry
- * implies an overlap)
- */
- overlap_list[overlap_entries++] =
- change_point[chgidx]->pbios;
- } else {
- /*
- * remove entry from list (order independent,
- * so swap with last)
- */
- for (i = 0; i < overlap_entries; i++) {
- if (overlap_list[i] ==
- change_point[chgidx]->pbios)
- overlap_list[i] =
- overlap_list[overlap_entries-1];
- }
- overlap_entries--;
- }
- /*
- * if there are overlapping entries, decide which
- * "type" to use (larger value takes precedence --
- * 1=usable, 2,3,4,4+=unusable)
- */
- current_type = 0;
- for (i = 0; i < overlap_entries; i++)
- if (overlap_list[i]->type > current_type)
- current_type = overlap_list[i]->type;
- /*
- * continue building up new bios map based on this
- * information
- */
- if (current_type != last_type) {
- if (last_type != 0) {
- new_bios[new_bios_entry].size =
- change_point[chgidx]->addr - last_addr;
- /*
- * move forward only if the new size
- * was non-zero
- */
- if (new_bios[new_bios_entry].size != 0)
- /*
- * no more space left for new
- * bios entries ?
- */
- if (++new_bios_entry >= E820MAX)
- break;
- }
- if (current_type != 0) {
- new_bios[new_bios_entry].addr =
- change_point[chgidx]->addr;
- new_bios[new_bios_entry].type = current_type;
- last_addr = change_point[chgidx]->addr;
- }
- last_type = current_type;
- }
- }
- /* retain count for new bios entries */
- new_nr = new_bios_entry;
-
- /* copy new bios mapping into original location */
- memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
- *pnr_map = new_nr;
-
- return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory. If we aren't, we'll fake a memory map.
- */
-static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
-{
-#ifndef CONFIG_XEN
- /* Only one memory region (or negative)? Ignore it */
- if (nr_map < 2)
- return -1;
-#else
- BUG_ON(nr_map < 1);
-#endif
-
- do {
- u64 start = biosmap->addr;
- u64 size = biosmap->size;
- u64 end = start + size;
- u32 type = biosmap->type;
-
- /* Overflow in 64 bits? Ignore the memory map. */
- if (start > end)
- return -1;
-
- add_memory_region(start, size, type);
- } while (biosmap++, --nr_map);
-
-#ifdef CONFIG_XEN
- if (is_initial_xendomain()) {
- struct xen_memory_map memmap;
-
- memmap.nr_entries = E820MAX;
- set_xen_guest_handle(memmap.buffer, machine_e820.map);
-
- if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
- BUG();
- machine_e820.nr_map = memmap.nr_entries;
- } else
- machine_e820 = e820;
-#endif
-
- return 0;
-}
-
-static void early_panic(char *msg)
-{
- early_printk(msg);
- panic(msg);
-}
-
-/* We're not void only for x86 32-bit compat */
-char * __init machine_specific_memory_setup(void)
-{
-#ifndef CONFIG_XEN
- char *who = "BIOS-e820";
- /*
- * Try to copy the BIOS-supplied E820-map.
- *
- * Otherwise fake a memory map; one section from 0k->640k,
- * the next section from 1mb->appropriate_mem_k
- */
- sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
- if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
- early_panic("Cannot find a valid memory map");
-#else /* CONFIG_XEN */
- char *who = "Xen";
- int rc;
- struct xen_memory_map memmap;
- static struct e820entry __initdata map[E820MAX];
-
- memmap.nr_entries = E820MAX;
- set_xen_guest_handle(memmap.buffer, map);
-
- rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
- if ( rc == -ENOSYS ) {
- memmap.nr_entries = 1;
- map[0].addr = 0ULL;
- map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
- /* 8MB slack (to balance backend allocations). */
- map[0].size += 8 << 20;
- map[0].type = E820_RAM;
- rc = 0;
- }
- BUG_ON(rc);
-
- sanitize_e820_map(map, (char *)&memmap.nr_entries);
-
- if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
- early_panic("Cannot find a valid memory map");
-#endif
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- e820_print_map(who);
-
- /* In case someone cares... */
- return who;
-}
-
-static int __init parse_memopt(char *p)
-{
- int i;
- unsigned long current_end;
- unsigned long end;
-
- if (!p)
- return -EINVAL;
- end_user_pfn = memparse(p, &p);
- end_user_pfn >>= PAGE_SHIFT;
-
- end = end_user_pfn<<PAGE_SHIFT;
- i = e820.nr_map-1;
- current_end = e820.map[i].addr + e820.map[i].size;
-
- if (current_end < end) {
- /*
- * The e820 map ends before our requested size so
- * extend the final entry to the requested address.
- */
- if (e820.map[i].type == E820_RAM)
- e820.map[i].size = end - e820.map[i].addr;
- else
- add_memory_region(current_end, end - current_end, E820_RAM);
- }
-
- return 0;
-}
-early_param("mem", parse_memopt);
-
-static int userdef __initdata;
-
-static int __init parse_memmap_opt(char *p)
-{
- char *oldp;
- unsigned long long start_at, mem_size;
-
- if (!strcmp(p, "exactmap")) {
-#ifdef CONFIG_CRASH_DUMP
- /*
- * If we are doing a crash dump, we still need to know
- * the real mem size before original memory map is
- * reset.
- */
- e820_register_active_regions(0, 0, -1UL);
- saved_max_pfn = e820_end_of_ram();
- remove_all_active_ranges();
-#endif
- max_pfn_mapped = 0;
- e820.nr_map = 0;
- userdef = 1;
- return 0;
- }
-
- oldp = p;
- mem_size = memparse(p, &p);
- if (p == oldp)
- return -EINVAL;
-
- userdef = 1;
- if (*p == '@') {
- start_at = memparse(p+1, &p);
- add_memory_region(start_at, mem_size, E820_RAM);
- } else if (*p == '#') {
- start_at = memparse(p+1, &p);
- add_memory_region(start_at, mem_size, E820_ACPI);
- } else if (*p == '$') {
- start_at = memparse(p+1, &p);
- add_memory_region(start_at, mem_size, E820_RESERVED);
- } else {
- end_user_pfn = (mem_size >> PAGE_SHIFT);
- }
- return *p == '\0' ? 0 : -EINVAL;
-}
-early_param("memmap", parse_memmap_opt);
-
-void __init finish_e820_parsing(void)
-{
- if (userdef) {
- char nr = e820.nr_map;
-
- if (sanitize_e820_map(e820.map, &nr) < 0)
- early_panic("Invalid user supplied memory map");
- e820.nr_map = nr;
-
- printk(KERN_INFO "user-defined physical RAM map:\n");
- e820_print_map("user");
- }
-}
-
-#ifndef CONFIG_XEN
-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
- unsigned new_type)
-{
- int i;
-
- BUG_ON(old_type == new_type);
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 final_start, final_end;
- if (ei->type != old_type)
- continue;
- /* totally covered? */
- if (ei->addr >= start && ei->size <= size) {
- ei->type = new_type;
- continue;
- }
- /* partially covered */
- final_start = max(start, ei->addr);
- final_end = min(start + size, ei->addr + ei->size);
- if (final_start >= final_end)
- continue;
- add_memory_region(final_start, final_end - final_start,
- new_type);
- }
-}
-
-void __init update_e820(void)
-{
- u8 nr_map;
-
- nr_map = e820.nr_map;
- if (sanitize_e820_map(e820.map, &nr_map))
- return;
- e820.nr_map = nr_map;
- printk(KERN_INFO "modified physical RAM map:\n");
- e820_print_map("modified");
-}
-#endif
-
-unsigned long pci_mem_start = 0xaeedbabe;
-EXPORT_SYMBOL(pci_mem_start);
-
-/*
- * Search for the biggest gap in the low 32 bits of the e820
- * memory space. We pass this space to PCI to assign MMIO resources
- * for hotplug or unconfigured devices in.
- * Hopefully the BIOS let enough space left.
- */
-__init void e820_setup_gap(struct e820entry *e820, int nr_map)
-{
- unsigned long gapstart, gapsize, round;
- unsigned long last;
- int i;
- int found = 0;
-
- last = 0x100000000ull;
- gapstart = 0x10000000;
- gapsize = 0x400000;
- i = nr_map;
- while (--i >= 0) {
- unsigned long long start = e820[i].addr;
- unsigned long long end = start + e820[i].size;
-
- /*
- * Since "last" is at most 4GB, we know we'll
- * fit in 32 bits if this condition is true
- */
- if (last > end) {
- unsigned long gap = last - end;
-
- if (gap > gapsize) {
- gapsize = gap;
- gapstart = end;
- found = 1;
- }
- }
- if (start < last)
- last = start;
- }
-
- if (!found) {
- gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
- printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
- "address range\n"
- KERN_ERR "PCI: Unassigned devices with 32bit resource "
- "registers may break!\n");
- }
-
- /*
- * See how much we want to round up: start off with
- * rounding to the next 1MB area.
- */
- round = 0x100000;
- while ((gapsize >> 4) > round)
- round += round;
- /* Fun with two's complement */
- pci_mem_start = (gapstart + round) & -round;
-
- printk(KERN_INFO
- "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
- pci_mem_start, gapstart, gapsize);
-}
-
-int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
-{
- int i;
-
- if (slot < 0 || slot >= e820.nr_map)
- return -1;
- for (i = slot; i < e820.nr_map; i++) {
- if (e820.map[i].type != E820_RAM)
- continue;
- break;
- }
- if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
- return -1;
- *addr = e820.map[i].addr;
- *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
- max_pfn << PAGE_SHIFT) - *addr;
- return i + 1;
-}
--- head-2011-03-11.orig/arch/x86/kernel/early_printk-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/early_printk-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -225,7 +225,7 @@ static struct console simnow_console = {
static struct console *early_console = &early_vga_console;
static int early_console_initialized;
-void early_printk(const char *fmt, ...)
+asmlinkage void early_printk(const char *fmt, ...)
{
char buf[512];
int n;
--- head-2011-03-11.orig/arch/x86/kernel/entry_32-xen.S 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/entry_32-xen.S 2011-02-01 14:38:38.000000000 +0100
@@ -51,15 +51,26 @@
#include <asm/percpu.h>
#include <asm/dwarf2.h>
#include <asm/processor-flags.h>
-#include "irq_vectors.h"
+#include <asm/ftrace.h>
+#include <asm/irq_vectors.h>
#include <xen/interface/xen.h>
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE 0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+#define sysenter_audit syscall_trace_entry
+#define sysexit_audit syscall_exit_work
+#endif
+
/*
* We use macros for low-level operations which need to be overridden
* for paravirtualization. The following will never clobber any registers:
* INTERRUPT_RETURN (aka. "iret")
* GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
*
* For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
* specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
@@ -277,11 +288,6 @@ END(resume_kernel)
#endif
CFI_ENDPROC
- .macro test_tif ti_reg # system call tracing in operation / emulation
- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
- testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
- .endm
-
/* SYSENTER_RETURN points to after the "sysenter" instruction in
the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -338,8 +344,9 @@ sysenter_past_esp:
.previous
GET_THREAD_INFO(%ebp)
- test_tif %ebp
- jnz syscall_trace_entry
+ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ jnz sysenter_audit
+sysenter_do_call:
cmpl $(nr_syscalls), %eax
jae syscall_badsys
call *sys_call_table(,%eax,4)
@@ -349,14 +356,54 @@ sysenter_past_esp:
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx
- jne syscall_exit_work
+ jne sysexit_audit
+sysenter_exit:
/* if something modifies registers it must also disable sysexit */
movl PT_EIP(%esp), %edx
movl PT_OLDESP(%esp), %ecx
xorl %ebp,%ebp
TRACE_IRQS_ON
1: mov PT_FS(%esp), %fs
- ENABLE_INTERRUPTS_SYSCALL_RET
+ ENABLE_INTERRUPTS_SYSEXIT
+
+#ifdef CONFIG_AUDITSYSCALL
+sysenter_audit:
+ testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+ jnz syscall_trace_entry
+ addl $4,%esp
+ CFI_ADJUST_CFA_OFFSET -4
+ /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
+ /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
+ /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
+ movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
+ movl %eax,%edx /* 2nd arg: syscall number */
+ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
+ call audit_syscall_entry
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ movl PT_EAX(%esp),%eax /* reload syscall number */
+ jmp sysenter_do_call
+
+sysexit_audit:
+ testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ jne syscall_exit_work
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ movl %eax,%edx /* second arg, syscall return value */
+ cmpl $0,%eax /* is it < 0? */
+ setl %al /* 1 if so, 0 if not */
+ movzbl %al,%eax /* zero-extend that */
+ inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
+ call audit_syscall_exit
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ jne syscall_exit_work
+ movl PT_EAX(%esp),%eax /* reload syscall return value */
+ jmp sysenter_exit
+#endif
+
CFI_ENDPROC
.pushsection .fixup,"ax"
2: movl $0,PT_FS(%esp)
@@ -400,7 +447,7 @@ ENTRY(system_call)
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
- test_tif %ebp
+ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
@@ -413,10 +460,6 @@ syscall_exit:
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
- testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
- jz no_singlestep
- orl $_TIF_SINGLESTEP,TI_flags(%ebp)
-no_singlestep:
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx # current->work
jne syscall_exit_work
@@ -588,12 +631,8 @@ END(work_pending)
syscall_trace_entry:
movl $-ENOSYS,PT_EAX(%esp)
movl %esp, %eax
- xorl %edx,%edx
- call do_syscall_trace
- cmpl $0, %eax
- jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
- # so must skip actual syscall
- movl PT_ORIG_EAX(%esp), %eax
+ call syscall_trace_enter
+ /* What it returned is what we'll actually use. */
cmpl $(nr_syscalls), %eax
jnae syscall_call
jmp syscall_exit
@@ -602,14 +641,13 @@ END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
syscall_exit_work:
- testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
+ testb $_TIF_WORK_SYSCALL_EXIT, %cl
jz work_pending
TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
+ ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
# schedule() instead
movl %esp, %eax
- movl $1, %edx
- call do_syscall_trace
+ call syscall_trace_leave
jmp resume_userspace
END(syscall_exit_work)
CFI_ENDPROC
@@ -1113,10 +1151,10 @@ ENTRY(native_iret)
.previous
END(native_iret)
-ENTRY(native_irq_enable_syscall_ret)
+ENTRY(native_irq_enable_sysexit)
sti
sysexit
-END(native_irq_enable_syscall_ret)
+END(native_irq_enable_sysexit)
#endif
KPROBE_ENTRY(int3)
@@ -1265,6 +1303,77 @@ ENTRY(kernel_thread_helper)
CFI_ENDPROC
ENDPROC(kernel_thread_helper)
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(mcount)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ subl $MCOUNT_INSN_SIZE, %eax
+
+.globl mcount_call
+mcount_call:
+ call ftrace_stub
+
+ popl %edx
+ popl %ecx
+ popl %eax
+
+ ret
+END(mcount)
+
+ENTRY(ftrace_caller)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ movl 0x4(%ebp), %edx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub
+
+ popl %edx
+ popl %ecx
+ popl %eax
+
+.globl ftrace_stub
+ftrace_stub:
+ ret
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(mcount)
+ cmpl $ftrace_stub, ftrace_trace_function
+ jnz trace
+.globl ftrace_stub
+ftrace_stub:
+ ret
+
+ /* taken from glibc */
+trace:
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ movl 0x4(%ebp), %edx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+ call *ftrace_trace_function
+
+ popl %edx
+ popl %ecx
+ popl %eax
+
+ jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
+
#include <asm/alternative-asm.h>
# pv syscall call handler stub
@@ -1290,7 +1399,7 @@ ENTRY(ia32pv_cstar_target)
.previous
SAVE_ALL
GET_THREAD_INFO(%ebp)
- test_tif %ebp
+ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz cstar_trace_entry
cmpl $nr_syscalls,%eax
jae cstar_badsys
@@ -1324,29 +1433,21 @@ cstar_trace_entry:
btl %eax,cstar_special
jc .Lcstar_trace_special
1: movl %esp,%eax
- xorl %edx,%edx
LOCK_PREFIX
orl $_TIF_CSTAR,TI_flags(%ebp)
- call do_syscall_trace
+ call syscall_trace_enter
LOCK_PREFIX
andl $~_TIF_CSTAR,TI_flags(%ebp)
- testl %eax,%eax
- jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
- # so must skip actual syscall
- movl PT_ORIG_EAX(%esp),%eax
+ /* What it returned is what we'll actually use. */
cmpl $nr_syscalls,%eax
jb .Lcstar_call
jmp .Lcstar_exit
.Lcstar_trace_special:
movl PT_ECX(%esp),%ecx
movl %esp,%eax
- xorl %edx,%edx
movl %ecx,PT_EBP(%esp) # put user EBP back in place
- call do_syscall_trace
- testl %eax,%eax
- jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
- # so must skip actual syscall
- movl PT_ORIG_EAX(%esp),%eax
+ call syscall_trace_enter
+ /* What it returned is what we'll actually use. */
cmpl $nr_syscalls,%eax
jb syscall_call
jmp syscall_exit
--- head-2011-03-11.orig/arch/x86/kernel/entry_64.S 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/entry_64.S 2011-02-16 16:02:30.000000000 +0100
@@ -1253,7 +1253,7 @@ ENTRY(arch_unwind_init_running)
END(arch_unwind_init_running)
#endif
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
/*
@@ -1353,7 +1353,7 @@ END(xen_failsafe_callback)
apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
xen_hvm_callback_vector xen_evtchn_do_upcall
-#endif /* CONFIG_XEN */
+#endif /* CONFIG_PARAVIRT_XEN */
/*
* Some functions should be protected against kprobes
--- head-2011-03-11.orig/arch/x86/kernel/entry_64-xen.S 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/entry_64-xen.S 2011-02-01 14:38:38.000000000 +0100
@@ -53,12 +53,124 @@
#include <asm/hw_irq.h>
#include <asm/page.h>
#include <asm/irqflags.h>
+#include <asm/ftrace.h>
#include <asm/errno.h>
#include <xen/interface/xen.h>
#include <xen/interface/features.h>
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_64BIT 0x80000000
+#define __AUDIT_ARCH_LE 0x40000000
+
.code64
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+ENTRY(mcount)
+
+ subq $0x38, %rsp
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+
+ movq 0x38(%rsp), %rdi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+.globl mcount_call
+mcount_call:
+ call ftrace_stub
+
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $0x38, %rsp
+
+ retq
+END(mcount)
+
+ENTRY(ftrace_caller)
+
+ /* taken from glibc */
+ subq $0x38, %rsp
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+
+ movq 0x38(%rsp), %rdi
+ movq 8(%rbp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub
+
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $0x38, %rsp
+
+.globl ftrace_stub
+ftrace_stub:
+ retq
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+ENTRY(mcount)
+ cmpq $ftrace_stub, ftrace_trace_function
+ jnz trace
+.globl ftrace_stub
+ftrace_stub:
+ retq
+
+trace:
+ /* taken from glibc */
+ subq $0x38, %rsp
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+
+ movq 0x38(%rsp), %rdi
+ movq 8(%rbp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+ call *ftrace_trace_function
+
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $0x38, %rsp
+
+ jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
+
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif
@@ -95,7 +207,7 @@ NMI_MASK = 0x80000000
.macro FAKE_STACK_FRAME child_rip
/* push in order ss, rsp, eflags, cs, rip */
xorl %eax, %eax
- pushq %rax /* ss */
+ pushq $__KERNEL_DS /* ss */
CFI_ADJUST_CFA_OFFSET 8
/*CFI_REL_OFFSET ss,0*/
pushq %rax /* rsp */
@@ -190,13 +302,13 @@ ENTRY(ret_from_fork)
CFI_ADJUST_CFA_OFFSET -4
call schedule_tail
GET_THREAD_INFO(%rcx)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
jnz rff_trace
rff_action:
RESTORE_REST
testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
je int_ret_from_sys_call
- testl $_TIF_IA32,threadinfo_flags(%rcx)
+ testl $_TIF_IA32,TI_flags(%rcx)
jnz int_ret_from_sys_call
RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
jmp ret_from_sys_call
@@ -258,8 +370,9 @@ ENTRY(system_call)
SAVE_ARGS -8,0
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
GET_THREAD_INFO(%rcx)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
jnz tracesys
+system_call_fastpath:
cmpq $__NR_syscall_max,%rax
ja badsys
movq %r10,%rcx
@@ -277,7 +390,7 @@ sysret_check:
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl threadinfo_flags(%rcx),%edx
+ movl TI_flags(%rcx),%edx
andl %edi,%edx
jnz sysret_careful
CFI_REMEMBER_STATE
@@ -308,16 +421,16 @@ sysret_careful:
sysret_signal:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- testl $_TIF_DO_NOTIFY_MASK,%edx
- jz 1f
-
- /* Really a signal */
+#ifdef CONFIG_AUDITSYSCALL
+ bt $TIF_SYSCALL_AUDIT,%edx
+ jc sysret_audit
+#endif
/* edx: work flags (arg3) */
leaq do_notify_resume(%rip),%rax
leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
xorl %esi,%esi # oldset -> arg2
call ptregscall_common
-1: movl $_TIF_NEED_RESCHED,%edi
+ movl $_TIF_WORK_MASK,%edi
/* Use IRET because user could have changed frame. This
works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -328,14 +441,56 @@ badsys:
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
jmp ret_from_sys_call
+#ifdef CONFIG_AUDITSYSCALL
+ /*
+ * Fast path for syscall audit without full syscall trace.
+ * We just call audit_syscall_entry() directly, and then
+ * jump back to the normal fast path.
+ */
+auditsys:
+ movq %r10,%r9 /* 6th arg: 4th syscall arg */
+ movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
+ movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
+ movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
+ movq %rax,%rsi /* 2nd arg: syscall number */
+ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
+ call audit_syscall_entry
+ LOAD_ARGS 0 /* reload call-clobbered registers */
+ jmp system_call_fastpath
+
+ /*
+ * Return fast path for syscall audit. Call audit_syscall_exit()
+ * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
+ * masked off.
+ */
+sysret_audit:
+ movq %rax,%rsi /* second arg, syscall return value */
+ cmpq $0,%rax /* is it < 0? */
+ setl %al /* 1 if so, 0 if not */
+ movzbl %al,%edi /* zero-extend that into %edi */
+ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
+ call audit_syscall_exit
+ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
+ jmp sysret_check
+#endif /* CONFIG_AUDITSYSCALL */
+
/* Do syscall tracing */
tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+ jz auditsys
+#endif
SAVE_REST
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
FIXUP_TOP_OF_STACK %rdi
movq %rsp,%rdi
call syscall_trace_enter
- LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
+ /*
+ * Reload arg registers from stack in case ptrace changed them.
+ * We don't reload %rax because syscall_trace_enter() returned
+ * the value it wants us to use in the table lookup.
+ */
+ LOAD_ARGS ARGOFFSET, 1
RESTORE_REST
cmpq $__NR_syscall_max,%rax
ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
@@ -349,6 +504,7 @@ tracesys:
* Has correct top of stack, but partial stack frame.
*/
.globl int_ret_from_sys_call
+ .globl int_with_check
int_ret_from_sys_call:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -363,10 +519,10 @@ int_ret_from_sys_call:
int_with_check:
LOCKDEP_SYS_EXIT_IRQ
GET_THREAD_INFO(%rcx)
- movl threadinfo_flags(%rcx),%edx
+ movl TI_flags(%rcx),%edx
andl %edi,%edx
jnz int_careful
- andl $~TS_COMPAT,threadinfo_status(%rcx)
+ andl $~TS_COMPAT,TI_status(%rcx)
jmp retint_restore_args
/* Either reschedule or signal or syscall exit tracking needed. */
@@ -392,7 +548,7 @@ int_very_careful:
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
/* Check for syscall exit trace */
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
+ testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
@@ -400,7 +556,7 @@ int_very_careful:
call syscall_trace_leave
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
- andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
+ andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
jmp int_restore_rest
int_signal:
@@ -409,7 +565,7 @@ int_signal:
movq %rsp,%rdi # &ptregs -> arg1
xorl %esi,%esi # oldset -> arg2
call do_notify_resume
-1: movl $_TIF_NEED_RESCHED,%edi
+1: movl $_TIF_WORK_MASK,%edi
int_restore_rest:
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -436,7 +592,6 @@ END(\label)
PTREGSCALL stub_clone, sys_clone, %r8
PTREGSCALL stub_fork, sys_fork, %rdi
PTREGSCALL stub_vfork, sys_vfork, %rdi
- PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
PTREGSCALL stub_iopl, sys_iopl, %rsi
@@ -510,10 +665,12 @@ END(stub_rt_sigreturn)
*
*/
-retint_check:
+retint_with_reschedule:
CFI_DEFAULT_STACK adj=1
+ movl $_TIF_WORK_MASK,%edi
+retint_check:
LOCKDEP_SYS_EXIT_IRQ
- movl threadinfo_flags(%rcx),%edx
+ movl TI_flags(%rcx),%edx
andl %edi,%edx
CFI_REMEMBER_STATE
jnz retint_careful
@@ -558,17 +715,16 @@ retint_signal:
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl $_TIF_NEED_RESCHED,%edi
GET_THREAD_INFO(%rcx)
- jmp retint_check
+ jmp retint_with_reschedule
#ifdef CONFIG_PREEMPT
/* Returning to kernel space. Check if we need preemption */
/* rcx: threadinfo. interrupts off. */
ENTRY(retint_kernel)
- cmpl $0,threadinfo_preempt_count(%rcx)
+ cmpl $0,TI_preempt_count(%rcx)
jnz retint_restore_args
- bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
+ bt $TIF_NEED_RESCHED,TI_flags(%rcx)
jnc retint_restore_args
bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
jnc retint_restore_args
@@ -623,6 +779,9 @@ END(invalidate_interrupt\num)
ENTRY(call_function_interrupt)
apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
END(call_function_interrupt)
+ENTRY(call_function_single_interrupt)
+ apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
+END(call_function_single_interrupt)
ENTRY(irq_move_cleanup_interrupt)
apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
END(irq_move_cleanup_interrupt)
@@ -632,6 +791,10 @@ ENTRY(apic_timer_interrupt)
apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
END(apic_timer_interrupt)
+ENTRY(uv_bau_message_intr1)
+ apicinterrupt 220,uv_bau_message_interrupt
+END(uv_bau_message_intr1)
+
ENTRY(error_interrupt)
apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
END(error_interrupt)
@@ -745,7 +908,7 @@ paranoid_restore\trace:
jmp irq_return
paranoid_userspace\trace:
GET_THREAD_INFO(%rcx)
- movl threadinfo_flags(%rcx),%ebx
+ movl TI_flags(%rcx),%ebx
andl $_TIF_WORK_MASK,%ebx
jz paranoid_swapgs\trace
movq %rsp,%rdi /* &pt_regs */
@@ -842,7 +1005,7 @@ error_exit:
testb $3,CS-ARGOFFSET(%rsp)
jz retint_kernel
LOCKDEP_SYS_EXIT_IRQ
- movl threadinfo_flags(%rcx),%edx
+ movl TI_flags(%rcx),%edx
movl $_TIF_WORK_MASK,%edi
andl %edi,%edx
jnz retint_careful
@@ -864,11 +1027,11 @@ error_kernelspace:
iret run with kernel gs again, so don't set the user space flag.
B stepping K8s sometimes report an truncated RIP for IRET
exceptions returning to compat mode. Check for these here too. */
- leaq irq_return(%rip),%rbp
- cmpq %rbp,RIP(%rsp)
+ leaq irq_return(%rip),%rcx
+ cmpq %rcx,RIP(%rsp)
je error_swapgs
- movl %ebp,%ebp /* zero extend */
- cmpq %rbp,RIP(%rsp)
+ movl %ecx,%ecx /* zero extend */
+ cmpq %rcx,RIP(%rsp)
je error_swapgs
cmpq $gs_change,RIP(%rsp)
je error_swapgs
@@ -1114,6 +1277,7 @@ END(device_not_available)
/* runs on exception stack */
KPROBE_ENTRY(debug)
/* INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
CFI_ADJUST_CFA_OFFSET 8 */
zeroentry do_debug
@@ -1141,6 +1305,7 @@ END(do_nmi_callback)
KPROBE_ENTRY(int3)
/* INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
CFI_ADJUST_CFA_OFFSET 8 */
zeroentry do_int3
@@ -1164,14 +1329,11 @@ ENTRY(coprocessor_segment_overrun)
zeroentry do_coprocessor_segment_overrun
END(coprocessor_segment_overrun)
-ENTRY(reserved)
- zeroentry do_reserved
-END(reserved)
-
#if 0
/* runs on exception stack */
ENTRY(double_fault)
XCPT_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
paranoidentry do_double_fault
jmp paranoid_exit1
CFI_ENDPROC
@@ -1189,6 +1351,7 @@ END(segment_not_present)
/* runs on exception stack */
ENTRY(stack_segment)
/* XCPT_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
paranoidentry do_stack_segment */
errorentry do_stack_segment
/* jmp paranoid_exit1
--- head-2011-03-11.orig/arch/x86/kernel/fixup.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/fixup.c 2011-02-01 14:38:38.000000000 +0100
@@ -33,6 +33,7 @@
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/version.h>
+#include <asm/traps.h>
#define DP(_f, _args...) pr_alert(" " _f "\n" , ## _args )
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-11/arch/x86/kernel/head-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -0,0 +1,57 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+#define BIOS_LOWMEM_KILOBYTES 0x413
+
+/*
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
+ */
+void __init reserve_ebda_region(void)
+{
+#ifndef CONFIG_XEN
+ unsigned int lowmem, ebda_addr;
+
+ /* To determine the position of the EBDA and the */
+ /* end of conventional memory, we need to look at */
+ /* the BIOS data area. In a paravirtual environment */
+ /* that area is absent. We'll just have to assume */
+ /* that the paravirt case can handle memory setup */
+ /* correctly, without our help. */
+ if (paravirt_enabled())
+ return;
+
+ /* end of low (conventional) memory */
+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+ lowmem <<= 10;
+
+ /* start of EBDA area */
+ ebda_addr = get_bios_ebda();
+
+ /* Fixup: bios puts an EBDA in the top 64K segment */
+ /* of conventional memory, but does not adjust lowmem. */
+ if ((lowmem - ebda_addr) <= 0x10000)
+ lowmem = ebda_addr;
+
+ /* Fixup: bios does not report an EBDA at all. */
+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+ lowmem = 0x9f000;
+
+ /* Paranoia: should never happen, but... */
+ if ((lowmem == 0) || (lowmem >= 0x100000))
+ lowmem = 0x9f000;
+
+ /* reserve all memory between lowmem and the 1MB mark */
+ reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
+#endif
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-11/arch/x86/kernel/head32-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -0,0 +1,57 @@
+/*
+ * linux/arch/i386/kernel/head32.c -- prepare to run common code
+ *
+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#include <linux/init.h>
+#include <linux/start_kernel.h>
+
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/e820.h>
+#include <asm/bios_ebda.h>
+
+void __init i386_start_kernel(void)
+{
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+
+#ifndef CONFIG_XEN
+#ifdef CONFIG_BLK_DEV_INITRD
+ /* Reserve INITRD */
+ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_end = ramdisk_image + ramdisk_size;
+ reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+ }
+#endif
+ reserve_early(init_pg_tables_start, init_pg_tables_end,
+ "INIT_PG_TABLE");
+#else
+ reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
+ __pa(xen_start_info->pt_base)
+ + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
+ "Xen provided");
+
+ {
+ int max_cmdline;
+
+ if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
+ max_cmdline = COMMAND_LINE_SIZE;
+ memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
+ boot_command_line[max_cmdline-1] = '\0';
+ }
+#endif
+
+ reserve_ebda_region();
+
+ /*
+ * At this point everything still needed from the boot loader
+ * or BIOS or kernel text should be early reserved or marked not
+ * RAM in e820. All other memory is free game.
+ */
+
+ start_kernel();
+}
--- head-2011-03-11.orig/arch/x86/kernel/head64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/head64-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -32,7 +32,26 @@
#include <asm/e820.h>
#include <asm/bios_ebda.h>
-unsigned long start_pfn;
+/* boot cpu pda */
+static struct x8664_pda _boot_cpu_pda __read_mostly;
+
+#ifdef CONFIG_SMP
+/*
+ * We install an empty cpu_pda pointer table to indicate to early users
+ * (numa_set_node) that the cpu_pda pointer table for cpus other than
+ * the boot cpu is not yet setup.
+ */
+static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
+#else
+static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
+#endif
+
+void __init x86_64_init_pda(void)
+{
+ _cpu_pda = __cpu_pda;
+ cpu_pda(0) = &_boot_cpu_pda;
+ pda_init(0);
+}
#ifndef CONFIG_XEN
static void __init zap_identity_mappings(void)
@@ -77,83 +96,10 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
unsigned int machine_to_phys_order;
EXPORT_SYMBOL(machine_to_phys_order);
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
-#ifndef CONFIG_XEN
- unsigned int lowmem, ebda_addr;
-
- /* To determine the position of the EBDA and the */
- /* end of conventional memory, we need to look at */
- /* the BIOS data area. In a paravirtual environment */
- /* that area is absent. We'll just have to assume */
- /* that the paravirt case can handle memory setup */
- /* correctly, without our help. */
- if (paravirt_enabled())
- return;
-
- /* end of low (conventional) memory */
- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
- lowmem <<= 10;
-
- /* start of EBDA area */
- ebda_addr = get_bios_ebda();
-
- /* Fixup: bios puts an EBDA in the top 64K segment */
- /* of conventional memory, but does not adjust lowmem. */
- if ((lowmem - ebda_addr) <= 0x10000)
- lowmem = ebda_addr;
-
- /* Fixup: bios does not report an EBDA at all. */
- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
- lowmem = 0x9f000;
-
- /* Paranoia: should never happen, but... */
- if ((lowmem == 0) || (lowmem >= 0x100000))
- lowmem = 0x9f000;
-
- /* reserve all memory between lowmem and the 1MB mark */
- reserve_early(lowmem, 0x100000, "BIOS reserved");
-#endif
-}
-
-static void __init reserve_setup_data(void)
-{
-#ifndef CONFIG_XEN
- struct setup_data *data;
- unsigned long pa_data;
- char buf[32];
-
- if (boot_params.hdr.version < 0x0209)
- return;
- pa_data = boot_params.hdr.setup_data;
- while (pa_data) {
- data = early_ioremap(pa_data, sizeof(*data));
- sprintf(buf, "setup data %x", data->type);
- reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
- pa_data = data->next;
- early_iounmap(data, sizeof(*data));
- }
-#endif
-}
-
void __init x86_64_start_kernel(char * real_mode_data)
{
struct xen_machphys_mapping mapping;
unsigned long machine_to_phys_nr_ents;
- int i;
/*
* Build-time sanity checks on the kernel image and module
@@ -167,6 +113,7 @@ void __init x86_64_start_kernel(char * r
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
+ BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
xen_setup_features();
@@ -174,8 +121,6 @@ void __init x86_64_start_kernel(char * r
if (!xen_feature(XENFEAT_auto_translated_physmap))
phys_to_machine_mapping =
(unsigned long *)xen_start_info->mfn_list;
- start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
- xen_start_info->nr_pt_frames;
machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
@@ -208,19 +153,23 @@ void __init x86_64_start_kernel(char * r
early_printk("Kernel alive\n");
- for (i = 0; i < NR_CPUS; i++)
- cpu_pda(i) = &boot_cpu_pda[i];
+ x86_64_init_pda();
- pda_init(0);
+ early_printk("Kernel really alive\n");
+
+ x86_64_start_reservations(real_mode_data);
+}
+
+void __init x86_64_start_reservations(char *real_mode_data)
+{
copy_bootdata(__va(real_mode_data));
reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
- start_pfn << PAGE_SHIFT, "Xen provided");
-
- reserve_ebda_region();
- reserve_setup_data();
+ __pa(xen_start_info->pt_base)
+ + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
+ "Xen provided");
/*
* At this point everything still needed from the boot loader
--- head-2011-03-11.orig/arch/x86/kernel/head_32-xen.S 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/head_32-xen.S 2011-03-03 16:19:21.000000000 +0100
@@ -61,8 +61,6 @@ ENTRY(startup_32)
movb %cl,X86_MASK
movl %edx,X86_CAPABILITY
- movb $1,X86_HARD_MATH
-
xorl %eax,%eax # Clear GS
movl %eax,%gs
--- head-2011-03-11.orig/arch/x86/kernel/head_64-xen.S 2011-01-31 17:49:31.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/head_64-xen.S 2011-02-01 14:38:38.000000000 +0100
@@ -92,53 +92,6 @@ NEXT_PAGE(hypercall_page)
#undef NEXT_PAGE
- .data
-
- .align 16
- .globl cpu_gdt_descr
-cpu_gdt_descr:
- .word gdt_end-cpu_gdt_table-1
-gdt:
- .quad cpu_gdt_table
-#ifdef CONFIG_SMP
- .rept NR_CPUS-1
- .word 0
- .quad 0
- .endr
-#endif
-
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types kkeil 2000/10/28
- * Also sysret mandates a special GDT layout
- */
-
- .section .data.page_aligned, "aw"
- .align PAGE_SIZE
-
-/* The TLS descriptors are currently at a different place compared to i386.
- Hopefully nobody expects them at a fixed place (Wine?) */
-
-ENTRY(cpu_gdt_table)
- .quad 0x0000000000000000 /* NULL descriptor */
- .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
- .quad 0x00af9b000000ffff /* __KERNEL_CS */
- .quad 0x00cf93000000ffff /* __KERNEL_DS */
- .quad 0x00cffb000000ffff /* __USER32_CS */
- .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
- .quad 0x00affb000000ffff /* __USER_CS */
- .quad 0x0 /* unused */
- .quad 0,0 /* TSS */
- .quad 0,0 /* LDT */
- .quad 0,0,0 /* three TLS descriptors */
- .quad 0x0000f40000000000 /* node/CPU stored in limit */
-gdt_end:
- /* asm/segment.h:GDT_ENTRIES must match this */
- /* This should be a multiple of the cache line size */
- /* GDTs of other CPUs are now dynamically allocated */
-
- /* zero the remaining page */
- .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
-
.section .bss.page_aligned, "aw", @nobits
.align PAGE_SIZE
ENTRY(empty_zero_page)
--- head-2011-03-11.orig/arch/x86/kernel/io_apic_32-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/io_apic_32-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -25,6 +25,7 @@
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/sched.h>
+#include <linux/bootmem.h>
#include <linux/mc146818rtc.h>
#include <linux/compiler.h>
#include <linux/acpi.h>
@@ -75,7 +76,7 @@ static struct { int pin, apic; } ioapic_
static DEFINE_SPINLOCK(ioapic_lock);
static DEFINE_SPINLOCK(vector_lock);
-int timer_over_8254 __initdata = 1;
+int timer_through_8259 __initdata;
/*
* Is the SiS APIC rmw bug present ?
@@ -89,15 +90,21 @@ int sis_apic_bug = -1;
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
/* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
static int disable_timer_pin_1 __initdata;
/*
@@ -128,7 +135,7 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
}
#endif
@@ -142,7 +149,7 @@ static inline unsigned int io_apic_read(
struct physdev_apic apic_op;
int ret;
- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
apic_op.reg = reg;
ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
if (ret)
@@ -160,7 +167,7 @@ static inline void io_apic_write(unsigne
#else
struct physdev_apic apic_op;
- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
apic_op.reg = reg;
apic_op.value = value;
WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
@@ -288,7 +295,7 @@ static void __init replace_pin_at_irq(un
}
}
-static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
+static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
{
struct irq_pin_list *entry = irq_2_pin + irq;
unsigned int pin, reg;
@@ -308,30 +315,32 @@ static void __modify_IO_APIC_irq (unsign
}
/* mask = 1 */
-static void __mask_IO_APIC_irq (unsigned int irq)
+static void __mask_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0x00010000, 0);
+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
}
/* mask = 0 */
-static void __unmask_IO_APIC_irq (unsigned int irq)
+static void __unmask_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0, 0x00010000);
+ __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
}
/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
+ IO_APIC_REDIR_LEVEL_TRIGGER);
}
/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
+ IO_APIC_REDIR_MASKED);
}
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq(unsigned int irq)
{
unsigned long flags;
@@ -340,7 +349,7 @@ static void mask_IO_APIC_irq (unsigned i
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq(unsigned int irq)
{
unsigned long flags;
@@ -352,7 +361,7 @@ static void unmask_IO_APIC_irq (unsigned
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
-
+
/* Check delivery_mode to be sure we're not clearing an SMI pin */
entry = ioapic_read_entry(apic, pin);
if (entry.delivery_mode == dest_SMI)
@@ -364,7 +373,7 @@ static void clear_IO_APIC_pin(unsigned i
ioapic_mask_entry(apic, pin);
}
-static void clear_IO_APIC (void)
+static void clear_IO_APIC(void)
{
int apic, pin;
@@ -381,7 +390,7 @@ static void set_ioapic_affinity_irq(unsi
struct irq_pin_list *entry = irq_2_pin + irq;
unsigned int apicid_value;
cpumask_t tmp;
-
+
cpus_and(tmp, cpumask, cpu_online_map);
if (cpus_empty(tmp))
tmp = TARGET_CPUS;
@@ -410,7 +419,7 @@ static void set_ioapic_affinity_irq(unsi
# include <linux/kernel_stat.h> /* kstat */
# include <linux/slab.h> /* kmalloc() */
# include <linux/timer.h>
-
+
#define IRQBALANCE_CHECK_ARCH -999
#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
@@ -422,14 +431,14 @@ static int physical_balance __read_mostl
static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
static struct irq_cpu_info {
- unsigned long * last_irq;
- unsigned long * irq_delta;
+ unsigned long *last_irq;
+ unsigned long *irq_delta;
unsigned long irq;
} irq_cpu_data[NR_CPUS];
#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
-#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
-#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
+#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
+#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
#define IDLE_ENOUGH(cpu,now) \
(idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
@@ -468,8 +477,8 @@ inside:
if (cpu == -1)
cpu = NR_CPUS-1;
}
- } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
- (search_idle && !IDLE_ENOUGH(cpu,now)));
+ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
+ (search_idle && !IDLE_ENOUGH(cpu, now)));
return cpu;
}
@@ -479,15 +488,14 @@ static inline void balance_irq(int cpu,
unsigned long now = jiffies;
cpumask_t allowed_mask;
unsigned int new_cpu;
-
+
if (irqbalance_disabled)
- return;
+ return;
cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
new_cpu = move(cpu, allowed_mask, now, 1);
- if (cpu != new_cpu) {
+ if (cpu != new_cpu)
set_pending_irq(irq, cpumask_of_cpu(new_cpu));
- }
}
static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
@@ -499,14 +507,14 @@ static inline void rotate_irqs_among_cpu
if (!irq_desc[j].action)
continue;
/* Is it a significant load ? */
- if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
useful_load_threshold)
continue;
balance_irq(i, j);
}
}
balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
return;
}
@@ -535,22 +543,22 @@ static void do_irq_balance(void)
/* Is this an active IRQ or balancing disabled ? */
if (!irq_desc[j].action || irq_balancing_disabled(j))
continue;
- if ( package_index == i )
- IRQ_DELTA(package_index,j) = 0;
+ if (package_index == i)
+ IRQ_DELTA(package_index, j) = 0;
/* Determine the total count per processor per IRQ */
value_now = (unsigned long) kstat_cpu(i).irqs[j];
/* Determine the activity per processor per IRQ */
- delta = value_now - LAST_CPU_IRQ(i,j);
+ delta = value_now - LAST_CPU_IRQ(i, j);
/* Update last_cpu_irq[][] for the next time */
- LAST_CPU_IRQ(i,j) = value_now;
+ LAST_CPU_IRQ(i, j) = value_now;
/* Ignore IRQs whose rate is less than the clock */
if (delta < useful_load_threshold)
continue;
/* update the load for the processor or package total */
- IRQ_DELTA(package_index,j) += delta;
+ IRQ_DELTA(package_index, j) += delta;
/* Keep track of the higher numbered sibling as well */
if (i != package_index)
@@ -576,7 +584,8 @@ static void do_irq_balance(void)
max_cpu_irq = ULONG_MAX;
tryanothercpu:
- /* Look for heaviest loaded processor.
+ /*
+ * Look for heaviest loaded processor.
* We may come back to get the next heaviest loaded processor.
* Skip processors with trivial loads.
*/
@@ -585,7 +594,7 @@ tryanothercpu:
for_each_online_cpu(i) {
if (i != CPU_TO_PACKAGEINDEX(i))
continue;
- if (max_cpu_irq <= CPU_IRQ(i))
+ if (max_cpu_irq <= CPU_IRQ(i))
continue;
if (tmp_cpu_irq < CPU_IRQ(i)) {
tmp_cpu_irq = CPU_IRQ(i);
@@ -594,8 +603,9 @@ tryanothercpu:
}
if (tmp_loaded == -1) {
- /* In the case of small number of heavy interrupt sources,
- * loading some of the cpus too much. We use Ingo's original
+ /*
+ * In the case of small number of heavy interrupt sources,
+ * loading some of the cpus too much. We use Ingo's original
* approach to rotate them around.
*/
if (!first_attempt && imbalance >= useful_load_threshold) {
@@ -604,13 +614,14 @@ tryanothercpu:
}
goto not_worth_the_effort;
}
-
+
first_attempt = 0; /* heaviest search */
max_cpu_irq = tmp_cpu_irq; /* load */
max_loaded = tmp_loaded; /* processor */
imbalance = (max_cpu_irq - min_cpu_irq) / 2;
-
- /* if imbalance is less than approx 10% of max load, then
+
+ /*
+ * if imbalance is less than approx 10% of max load, then
* observe diminishing returns action. - quit
*/
if (imbalance < (max_cpu_irq >> 3))
@@ -626,26 +637,25 @@ tryanotherirq:
/* Is this an active IRQ? */
if (!irq_desc[j].action)
continue;
- if (imbalance <= IRQ_DELTA(max_loaded,j))
+ if (imbalance <= IRQ_DELTA(max_loaded, j))
continue;
/* Try to find the IRQ that is closest to the imbalance
* without going over.
*/
- if (move_this_load < IRQ_DELTA(max_loaded,j)) {
- move_this_load = IRQ_DELTA(max_loaded,j);
+ if (move_this_load < IRQ_DELTA(max_loaded, j)) {
+ move_this_load = IRQ_DELTA(max_loaded, j);
selected_irq = j;
}
}
- if (selected_irq == -1) {
+ if (selected_irq == -1)
goto tryanothercpu;
- }
imbalance = move_this_load;
-
+
/* For physical_balance case, we accumulated both load
* values in the one of the siblings cpu_irq[],
* to use the same code for physical and logical processors
- * as much as possible.
+ * as much as possible.
*
* NOTE: the cpu_irq[] array holds the sum of the load for
* sibling A and sibling B in the slot for the lowest numbered
@@ -674,11 +684,11 @@ tryanotherirq:
/* mark for change destination */
set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
- /* Since we made a change, come back sooner to
+ /* Since we made a change, come back sooner to
* check for more variation.
*/
balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
return;
}
goto tryanotherirq;
@@ -689,7 +699,7 @@ not_worth_the_effort:
* upward
*/
balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
return;
}
@@ -728,13 +738,13 @@ static int __init balanced_irq_init(void
cpumask_t tmp;
cpus_shift_right(tmp, cpu_online_map, 2);
- c = &boot_cpu_data;
+ c = &boot_cpu_data;
/* When not overwritten by the command line ask subarchitecture. */
if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
irqbalance_disabled = NO_BALANCE_IRQ;
if (irqbalance_disabled)
return 0;
-
+
/* disable irqbalance completely if there is only one processor online */
if (num_online_cpus() < 2) {
irqbalance_disabled = 1;
@@ -748,16 +758,14 @@ static int __init balanced_irq_init(void
physical_balance = 1;
for_each_online_cpu(i) {
- irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
- irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
printk(KERN_ERR "balanced_irq_init: out of memory");
goto failed;
}
- memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
- memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
}
-
+
printk(KERN_INFO "Starting balanced_irq\n");
if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
return 0;
@@ -799,7 +807,7 @@ void send_IPI_self(int vector)
/*
* Send the IPI. The write to APIC_ICR fires this off.
*/
- apic_write_around(APIC_ICR, cfg);
+ apic_write(APIC_ICR, cfg);
#endif
}
#endif /* !CONFIG_SMP */
@@ -853,10 +861,10 @@ static int find_irq_entry(int apic, int
int i;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == type &&
- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mpc_dstirq == pin)
+ if (mp_irqs[i].mp_irqtype == type &&
+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mp_dstirq == pin)
return i;
return -1;
@@ -871,13 +879,13 @@ static int __init find_isa_irq_pin(int i
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
- return mp_irqs[i].mpc_dstirq;
+ return mp_irqs[i].mp_dstirq;
}
return -1;
}
@@ -887,17 +895,17 @@ static int __init find_isa_irq_apic(int
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
break;
}
if (i < mp_irq_entries) {
int apic;
- for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
return apic;
}
}
@@ -918,28 +926,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
"slot:%d, pin:%d.\n", bus, slot, pin);
- if (mp_bus_id_to_pci_bus[bus] == -1) {
+ if (test_bit(bus, mp_bus_not_pci)) {
printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
break;
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mpc_irqtype &&
+ !mp_irqs[i].mp_irqtype &&
(bus == lbus) &&
- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
continue;
- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
return irq;
/*
* Use the first all-but-pin matching entry as a
@@ -954,7 +962,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
/*
- * This function currently is only a helper for the i386 smp boot process where
+ * This function currently is only a helper for the i386 smp boot process where
* we need to reprogram the ioredtbls to cater for the cpus which have come online
* so mask in all cases should simply be TARGET_CPUS
*/
@@ -1008,7 +1016,7 @@ static int EISA_ELCR(unsigned int irq)
* EISA conforming in the MP table, that means its trigger type must
* be read in from the ELCR */
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
#define default_EISA_polarity(idx) default_ISA_polarity(idx)
/* PCI interrupts are always polarity one level triggered,
@@ -1025,118 +1033,115 @@ static int EISA_ELCR(unsigned int irq)
static int MPBIOS_polarity(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].mpc_irqflag & 3)
+ switch (mp_irqs[idx].mp_irqflag & 3) {
+ case 0: /* conforms, ie. bus-type dependent polarity */
{
- case 0: /* conforms, ie. bus-type dependent polarity */
- {
- polarity = test_bit(bus, mp_bus_not_pci)?
- default_ISA_polarity(idx):
- default_PCI_polarity(idx);
- break;
- }
- case 1: /* high active */
- {
- polarity = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- case 3: /* low active */
- {
- polarity = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
+ polarity = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_polarity(idx):
+ default_PCI_polarity(idx);
+ break;
+ }
+ case 1: /* high active */
+ {
+ polarity = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ case 3: /* low active */
+ {
+ polarity = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
}
return polarity;
}
static int MPBIOS_trigger(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int trigger;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
+ case 0: /* conforms, ie. bus-type dependent */
{
- case 0: /* conforms, ie. bus-type dependent */
- {
- trigger = test_bit(bus, mp_bus_not_pci)?
- default_ISA_trigger(idx):
- default_PCI_trigger(idx);
+ trigger = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_trigger(idx):
+ default_PCI_trigger(idx);
#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- trigger = default_EISA_trigger(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- }
-#endif
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ /* set before the switch */
break;
}
- case 1: /* edge */
+ case MP_BUS_EISA: /* EISA pin */
{
- trigger = 0;
+ trigger = default_EISA_trigger(idx);
break;
}
- case 2: /* reserved */
+ case MP_BUS_PCI: /* PCI pin */
{
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
+ /* set before the switch */
break;
}
- case 3: /* level */
+ case MP_BUS_MCA: /* MCA pin */
{
- trigger = 1;
+ trigger = default_MCA_trigger(idx);
break;
}
- default: /* invalid */
+ default:
{
printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 0;
+ trigger = 1;
break;
}
}
+#endif
+ break;
+ }
+ case 1: /* edge */
+ {
+ trigger = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ case 3: /* level */
+ {
+ trigger = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 0;
+ break;
+ }
+ }
return trigger;
}
@@ -1153,16 +1158,16 @@ static inline int irq_trigger(int idx)
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
/*
* Debugging check, we are in big trouble if this message pops up!
*/
- if (mp_irqs[idx].mpc_dstirq != pin)
+ if (mp_irqs[idx].mp_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci))
- irq = mp_irqs[idx].mpc_srcbusirq;
+ irq = mp_irqs[idx].mp_srcbusirq;
else {
/*
* PCI IRQs are mapped in order
@@ -1204,8 +1209,8 @@ static inline int IO_APIC_irq_trigger(in
for (apic = 0; apic < nr_ioapics; apic++) {
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- idx = find_irq_entry(apic,pin,mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
return irq_trigger(idx);
}
}
@@ -1291,25 +1296,25 @@ static void __init setup_IO_APIC_irqs(vo
/*
* add it to the IO-APIC irq-routing table:
*/
- memset(&entry,0,sizeof(entry));
+ memset(&entry, 0, sizeof(entry));
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
entry.mask = 0; /* enable IRQ */
- entry.dest.logical.logical_dest =
+ entry.dest.logical.logical_dest =
cpu_mask_to_apicid(TARGET_CPUS);
- idx = find_irq_entry(apic,pin,mp_INT);
+ idx = find_irq_entry(apic, pin, mp_INT);
if (idx == -1) {
if (first_notcon) {
apic_printk(APIC_VERBOSE, KERN_DEBUG
" IO-APIC (apicid-pin) %d-%d",
- mp_ioapics[apic].mpc_apicid,
+ mp_ioapics[apic].mp_apicid,
pin);
first_notcon = 0;
} else
apic_printk(APIC_VERBOSE, ", %d-%d",
- mp_ioapics[apic].mpc_apicid, pin);
+ mp_ioapics[apic].mp_apicid, pin);
continue;
}
@@ -1343,7 +1348,7 @@ static void __init setup_IO_APIC_irqs(vo
vector = assign_irq_vector(irq);
entry.vector = vector;
ioapic_register_intr(irq, vector, IOAPIC_AUTO);
-
+
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
@@ -1355,27 +1360,23 @@ static void __init setup_IO_APIC_irqs(vo
apic_printk(APIC_VERBOSE, " not connected.\n");
}
+#ifndef CONFIG_XEN
/*
- * Set up the 8259A-master output pin:
+ * Set up the timer pin, possibly with the 8259A-master behind.
*/
-#ifndef CONFIG_XEN
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+ int vector)
{
struct IO_APIC_route_entry entry;
- memset(&entry,0,sizeof(entry));
-
- disable_8259A_irq(0);
-
- /* mask LVT0 */
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ memset(&entry, 0, sizeof(entry));
/*
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
entry.dest_mode = INT_DEST_MODE;
- entry.mask = 0; /* unmask IRQ now */
+ entry.mask = 1; /* mask IRQ now */
entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.delivery_mode = INT_DELIVERY_MODE;
entry.polarity = 0;
@@ -1384,17 +1385,14 @@ static void __init setup_ExtINT_IRQ0_pin
/*
* The timer IRQ doesn't have to know that behind the
- * scene we have a 8259A-master in AEOI mode ...
+ * scene we may have a 8259A-master in AEOI mode ...
*/
- irq_desc[0].chip = &ioapic_chip;
- set_irq_handler(0, handle_edge_irq);
+ ioapic_register_intr(0, vector, IOAPIC_EDGE);
/*
* Add it to the IO-APIC irq-routing table:
*/
ioapic_write_entry(apic, pin, entry);
-
- enable_8259A_irq(0);
}
void __init print_IO_APIC(void)
@@ -1409,10 +1407,10 @@ void __init print_IO_APIC(void)
if (apic_verbosity == APIC_QUIET)
return;
- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
@@ -1431,7 +1429,7 @@ void __init print_IO_APIC(void)
reg_03.raw = io_apic_read(apic, 3);
spin_unlock_irqrestore(&ioapic_lock, flags);
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1512,7 +1510,7 @@ void __init print_IO_APIC(void)
return;
}
-static void print_APIC_bitfield (int base)
+static void print_APIC_bitfield(int base)
{
unsigned int v;
int i, j;
@@ -1533,7 +1531,7 @@ static void print_APIC_bitfield (int bas
}
}
-void /*__init*/ print_local_APIC(void * dummy)
+void /*__init*/ print_local_APIC(void *dummy)
{
unsigned int v, ver, maxlvt;
@@ -1542,6 +1540,7 @@ void /*__init*/ print_local_APIC(void *
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
@@ -1616,9 +1615,9 @@ void /*__init*/ print_local_APIC(void *
printk("\n");
}
-void print_all_local_APICs (void)
+void print_all_local_APICs(void)
{
- on_each_cpu(print_local_APIC, NULL, 1, 1);
+ on_each_cpu(print_local_APIC, NULL, 1);
}
void /*__init*/ print_PIC(void)
@@ -1639,11 +1638,11 @@ void /*__init*/ print_PIC(void)
v = inb(0xa0) << 8 | inb(0x20);
printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
- outb(0x0b,0xa0);
- outb(0x0b,0x20);
+ outb(0x0b, 0xa0);
+ outb(0x0b, 0x20);
v = inb(0xa0) << 8 | inb(0x20);
- outb(0x0a,0xa0);
- outb(0x0a,0x20);
+ outb(0x0a, 0xa0);
+ outb(0x0a, 0x20);
spin_unlock_irqrestore(&i8259A_lock, flags);
@@ -1652,6 +1651,8 @@ void /*__init*/ print_PIC(void)
v = inb(0x4d1) << 8 | inb(0x4d0);
printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
}
+#else
+void __init print_IO_APIC(void) {}
#endif /* !CONFIG_XEN */
static void __init enable_IO_APIC(void)
@@ -1681,7 +1682,7 @@ static void __init enable_IO_APIC(void)
nr_ioapic_registers[apic] = reg_01.bits.entries+1;
}
#ifndef CONFIG_XEN
- for(apic = 0; apic < nr_ioapics; apic++) {
+ for (apic = 0; apic < nr_ioapics; apic++) {
int pin;
/* See if any of the pins is in ExtINT mode */
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
@@ -1774,7 +1775,7 @@ void disable_IO_APIC(void)
* by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
*/
-#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
+#ifndef CONFIG_XEN
static void __init setup_ioapic_ids_from_mpc(void)
{
union IO_APIC_reg_00 reg_00;
@@ -1784,6 +1785,11 @@ static void __init setup_ioapic_ids_from
unsigned char old_id;
unsigned long flags;
+#ifdef CONFIG_X86_NUMAQ
+ if (found_numaq)
+ return;
+#endif
+
/*
* Don't check I/O APIC IDs for xAPIC systems. They have
* no meaning without the serial APIC bus.
@@ -1806,15 +1812,15 @@ static void __init setup_ioapic_ids_from
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
-
- old_id = mp_ioapics[apic].mpc_apicid;
- if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
+ old_id = mp_ioapics[apic].mp_apicid;
+
+ if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic, mp_ioapics[apic].mpc_apicid);
+ apic, mp_ioapics[apic].mp_apicid);
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
reg_00.bits.ID);
- mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
+ mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
}
/*
@@ -1823,9 +1829,9 @@ static void __init setup_ioapic_ids_from
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
if (check_apicid_used(phys_id_present_map,
- mp_ioapics[apic].mpc_apicid)) {
+ mp_ioapics[apic].mp_apicid)) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic, mp_ioapics[apic].mpc_apicid);
+ apic, mp_ioapics[apic].mp_apicid);
for (i = 0; i < get_physical_broadcast(); i++)
if (!physid_isset(i, phys_id_present_map))
break;
@@ -1834,13 +1840,13 @@ static void __init setup_ioapic_ids_from
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
i);
physid_set(i, phys_id_present_map);
- mp_ioapics[apic].mpc_apicid = i;
+ mp_ioapics[apic].mp_apicid = i;
} else {
physid_mask_t tmp;
- tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
- mp_ioapics[apic].mpc_apicid);
+ mp_ioapics[apic].mp_apicid);
physids_or(phys_id_present_map, phys_id_present_map, tmp);
}
@@ -1849,21 +1855,21 @@ static void __init setup_ioapic_ids_from
* We need to adjust the IRQ routing table
* if the ID changed.
*/
- if (old_id != mp_ioapics[apic].mpc_apicid)
+ if (old_id != mp_ioapics[apic].mp_apicid)
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_dstapic == old_id)
- mp_irqs[i].mpc_dstapic
- = mp_ioapics[apic].mpc_apicid;
+ if (mp_irqs[i].mp_dstapic == old_id)
+ mp_irqs[i].mp_dstapic
+ = mp_ioapics[apic].mp_apicid;
/*
* Read the right value from the MPC table and
* write it into the ID register.
- */
+ */
apic_printk(APIC_VERBOSE, KERN_INFO
"...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mpc_apicid);
+ mp_ioapics[apic].mp_apicid);
- reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+ reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0, reg_00.raw);
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1874,17 +1880,13 @@ static void __init setup_ioapic_ids_from
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+ if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
printk("could not set ID!\n");
else
apic_printk(APIC_VERBOSE, " ok.\n");
}
}
-#else
-static void __init setup_ioapic_ids_from_mpc(void) { }
-#endif
-#ifndef CONFIG_XEN
int no_timer_check __initdata;
static int __init notimercheck(char *s)
@@ -2077,45 +2079,53 @@ static inline void init_IO_APIC_traps(vo
* The local APIC irq-chip implementation:
*/
-static void ack_apic(unsigned int irq)
+static void ack_lapic_irq(unsigned int irq)
{
ack_APIC_irq();
}
-static void mask_lapic_irq (unsigned int irq)
+static void mask_lapic_irq(unsigned int irq)
{
unsigned long v;
v = apic_read(APIC_LVT0);
- apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
}
-static void unmask_lapic_irq (unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
{
unsigned long v;
v = apic_read(APIC_LVT0);
- apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
static struct irq_chip lapic_chip __read_mostly = {
- .name = "local-APIC-edge",
+ .name = "local-APIC",
.mask = mask_lapic_irq,
.unmask = unmask_lapic_irq,
- .eoi = ack_apic,
+ .ack = ack_lapic_irq,
};
+static void lapic_register_intr(int irq, int vector)
+{
+ irq_desc[irq].status &= ~IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+ "edge");
+ set_intr_gate(vector, interrupt[irq]);
+}
+
static void __init setup_nmi(void)
{
/*
- * Dirty trick to enable the NMI watchdog ...
+ * Dirty trick to enable the NMI watchdog ...
* We put the 8259A master into AEOI mode and
* unmask on all local APICs LVT0 as NMI.
*
* The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
* is from Maciej W. Rozycki - so we do not have to EOI from
* the NMI handler or the timer interrupt.
- */
+ */
apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
enable_NMI_through_LVT0();
@@ -2191,11 +2201,16 @@ static inline void __init unlock_ExtINT_
static inline void __init check_timer(void)
{
int apic1, pin1, apic2, pin2;
+ int no_pin1 = 0;
int vector;
+ unsigned int ver;
unsigned long flags;
local_irq_save(flags);
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
+
/*
* get/set the timer IRQ vector:
*/
@@ -2204,34 +2219,54 @@ static inline void __init check_timer(vo
set_intr_gate(vector, interrupt[0]);
/*
- * Subtle, code in do_timer_interrupt() expects an AEOI
- * mode for the 8259A whenever interrupts are routed
- * through I/O APICs. Also IRQ0 has to be enabled in
- * the 8259A which implies the virtual wire has to be
- * disabled in the local APIC.
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+ * wire has to be disabled in the local APIC. Also
+ * timer interrupts need to be acknowledged manually in
+ * the 8259A for the i82489DX when using the NMI
+ * watchdog as that APIC treats NMIs as level-triggered.
+ * The AEOI mode will finish them in the 8259A
+ * automatically.
*/
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
- timer_ack = 1;
- if (timer_over_8254 > 0)
- enable_8259A_irq(0);
+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
- printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
- vector, apic1, pin1, apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+ "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ vector, apic1, pin1, apic2, pin2);
+
+ /*
+ * Some BIOS writers are clueless and report the ExtINTA
+ * I/O APIC input from the cascaded 8259A as the timer
+ * interrupt input. So just in case, if only one pin
+ * was found above, try it both directly and through the
+ * 8259A.
+ */
+ if (pin1 == -1) {
+ pin1 = pin2;
+ apic1 = apic2;
+ no_pin1 = 1;
+ } else if (pin2 == -1) {
+ pin2 = pin1;
+ apic2 = apic1;
+ }
if (pin1 != -1) {
/*
* Ok, does IRQ0 through the IOAPIC work?
*/
+ if (no_pin1) {
+ add_pin_to_irq(0, apic1, pin1);
+ setup_timer_IRQ0_pin(apic1, pin1, vector);
+ }
unmask_IO_APIC_irq(0);
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
setup_nmi();
enable_8259A_irq(0);
}
@@ -2240,71 +2275,77 @@ static inline void __init check_timer(vo
goto out;
}
clear_IO_APIC_pin(apic1, pin1);
- printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
- "IO-APIC\n");
- }
-
- printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
- if (pin2 != -1) {
- printk("\n..... (found pin %d) ...", pin2);
+ if (!no_pin1)
+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+ "8254 timer not connected to IO-APIC\n");
+
+ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+ "(IRQ0) through the 8259A ...\n");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "..... (found apic %d pin %d) ...\n", apic2, pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
- setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ setup_timer_IRQ0_pin(apic2, pin2, vector);
+ unmask_IO_APIC_irq(0);
+ enable_8259A_irq(0);
if (timer_irq_works()) {
- printk("works.\n");
- if (pin1 != -1)
- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
- else
- add_pin_to_irq(0, apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+ timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
setup_nmi();
+ enable_8259A_irq(0);
}
goto out;
}
/*
* Cleanup, just in case ...
*/
+ disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
- printk(" failed.\n");
if (nmi_watchdog == NMI_IO_APIC) {
- printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = 0;
+ apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+ "through the IO-APIC - disabling NMI Watchdog!\n");
+ nmi_watchdog = NMI_NONE;
}
+ timer_ack = 0;
- printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as Virtual Wire IRQ...\n");
- disable_8259A_irq(0);
- set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
- "fasteoi");
- apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
+ lapic_register_intr(0, vector);
+ apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
enable_8259A_irq(0);
if (timer_irq_works()) {
- printk(" works.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
- printk(" failed.\n");
+ disable_8259A_irq(0);
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
- printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as ExtINT IRQ...\n");
- timer_ack = 0;
init_8259A(0);
make_8259A_irq(0);
- apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
unlock_ExtINT_logic();
if (timer_irq_works()) {
- printk(" works.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
- printk(" failed :(.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
- "report. Then try booting with the 'noapic' option");
+ "report. Then try booting with the 'noapic' option.\n");
out:
local_irq_restore(flags);
}
@@ -2314,11 +2355,21 @@ int timer_uses_ioapic_pin_0 = 0;
#endif
/*
- *
- * IRQ's that are handled by the PIC in the MPS IOAPIC case.
- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
- * Linux doesn't really care, as it's not actually used
- * for any interrupt handling anyway.
+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+ * to devices. However there may be an I/O APIC pin available for
+ * this interrupt regardless. The pin may be left unconnected, but
+ * typically it will be reused as an ExtINT cascade interrupt for
+ * the master 8259A. In the MPS case such a pin will normally be
+ * reported as an ExtINT interrupt in the MP table. With ACPI
+ * there is no provision for ExtINT interrupts, and in the absence
+ * of an override it would be treated as an ordinary ISA I/O APIC
+ * interrupt, that is edge-triggered and unmasked by default. We
+ * used to do this, but it caused problems on some systems because
+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+ * the same ExtINT cascade interrupt to drive the local APIC of the
+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
+ * the I/O APIC in all cases now. No actual device should request
+ * it anyway. --macro
*/
#define PIC_IRQS (1 << PIC_CASCADE_IR)
@@ -2328,25 +2379,22 @@ void __init setup_IO_APIC(void)
int i;
/* Reserve all the system vectors. */
- for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
+ for (i = first_system_vector; i < NR_VECTORS; i++)
set_bit(i, used_vectors);
#endif
enable_IO_APIC();
- if (acpi_ioapic)
- io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
- else
- io_apic_irqs = ~PIC_IRQS;
+ io_apic_irqs = ~PIC_IRQS;
printk("ENABLING IO-APIC IRQs\n");
+#ifndef CONFIG_XEN
/*
* Set up IO-APIC IRQ routing.
*/
if (!acpi_ioapic)
setup_ioapic_ids_from_mpc();
-#ifndef CONFIG_XEN
sync_Arb_IDs();
#endif
setup_IO_APIC_irqs();
@@ -2356,28 +2404,14 @@ void __init setup_IO_APIC(void)
print_IO_APIC();
}
-static int __init setup_disable_8254_timer(char *s)
-{
- timer_over_8254 = -1;
- return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
- timer_over_8254 = 2;
- return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
/*
* Called after all the initialization is done. If we didnt find any
* APIC bugs then we can allow the modify fast path
*/
-
+
static int __init io_apic_bug_finalize(void)
{
- if(sis_apic_bug == -1)
+ if (sis_apic_bug == -1)
sis_apic_bug = 0;
if (is_initial_xendomain()) {
struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
@@ -2396,17 +2430,17 @@ struct sysfs_ioapic_data {
struct sys_device dev;
struct IO_APIC_route_entry entry[0];
};
-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
{
struct IO_APIC_route_entry *entry;
struct sysfs_ioapic_data *data;
int i;
-
+
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
entry[i] = ioapic_read_entry(dev->id, i);
return 0;
@@ -2419,18 +2453,18 @@ static int ioapic_resume(struct sys_devi
unsigned long flags;
union IO_APIC_reg_00 reg_00;
int i;
-
+
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+ if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
ioapic_write_entry(dev->id, i, entry[i]);
return 0;
@@ -2444,24 +2478,23 @@ static struct sysdev_class ioapic_sysdev
static int __init ioapic_init_sysfs(void)
{
- struct sys_device * dev;
+ struct sys_device *dev;
int i, size, error = 0;
error = sysdev_class_register(&ioapic_sysdev_class);
if (error)
return error;
- for (i = 0; i < nr_ioapics; i++ ) {
- size = sizeof(struct sys_device) + nr_ioapic_registers[i]
+ for (i = 0; i < nr_ioapics; i++) {
+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
* sizeof(struct IO_APIC_route_entry);
- mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+ mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
if (!mp_ioapic_data[i]) {
printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
continue;
}
- memset(mp_ioapic_data[i], 0, size);
dev = &mp_ioapic_data[i]->dev;
- dev->id = i;
+ dev->id = i;
dev->cls = &ioapic_sysdev_class;
error = sysdev_register(dev);
if (error) {
@@ -2538,7 +2571,7 @@ static int msi_compose_msg(struct pci_de
msg->address_lo =
MSI_ADDR_BASE_LO |
((INT_DEST_MODE == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL:
+MSI_ADDR_DEST_MODE_PHYSICAL:
MSI_ADDR_DEST_MODE_LOGICAL) |
((INT_DELIVERY_MODE != dest_LowestPrio) ?
MSI_ADDR_REDIRECTION_CPU:
@@ -2549,7 +2582,7 @@ static int msi_compose_msg(struct pci_de
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
((INT_DELIVERY_MODE != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED:
+MSI_DATA_DELIVERY_FIXED:
MSI_DATA_DELIVERY_LOWPRI) |
MSI_DATA_VECTOR(vector);
}
@@ -2720,13 +2753,13 @@ int arch_setup_ht_irq(unsigned int irq,
#endif /* CONFIG_HT_IRQ */
/* --------------------------------------------------------------------------
- ACPI-based IOAPIC Configuration
+ ACPI-based IOAPIC Configuration
-------------------------------------------------------------------------- */
#ifdef CONFIG_ACPI
#ifndef CONFIG_XEN
-int __init io_apic_get_unique_id (int ioapic, int apic_id)
+int __init io_apic_get_unique_id(int ioapic, int apic_id)
{
union IO_APIC_reg_00 reg_00;
static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -2735,10 +2768,10 @@ int __init io_apic_get_unique_id (int io
int i = 0;
/*
- * The P4 platform supports up to 256 APIC IDs on two separate APIC
- * buses (one for LAPICs, one for IOAPICs), where predecessors only
+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
* supports up to 16 on one shared APIC bus.
- *
+ *
* TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
* advantage of new APIC bus architecture.
*/
@@ -2757,7 +2790,7 @@ int __init io_apic_get_unique_id (int io
}
/*
- * Every APIC in a system must have a unique ID or we get lots of nice
+ * Every APIC in a system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
if (check_apicid_used(apic_id_map, apic_id)) {
@@ -2774,7 +2807,7 @@ int __init io_apic_get_unique_id (int io
"trying %d\n", ioapic, apic_id, i);
apic_id = i;
- }
+ }
tmp = apicid_to_cpu_present(apic_id);
physids_or(apic_id_map, apic_id_map, tmp);
@@ -2802,7 +2835,7 @@ int __init io_apic_get_unique_id (int io
#endif /* !CONFIG_XEN */
-int __init io_apic_get_version (int ioapic)
+int __init io_apic_get_version(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -2815,7 +2848,7 @@ int __init io_apic_get_version (int ioap
}
-int __init io_apic_get_redir_entries (int ioapic)
+int __init io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -2828,7 +2861,7 @@ int __init io_apic_get_redir_entries (in
}
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
{
struct IO_APIC_route_entry entry;
@@ -2844,7 +2877,7 @@ int io_apic_set_pci_routing (int ioapic,
* corresponding device driver registers for this IRQ.
*/
- memset(&entry,0,sizeof(entry));
+ memset(&entry, 0, sizeof(entry));
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
@@ -2863,7 +2896,7 @@ int io_apic_set_pci_routing (int ioapic,
apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
- mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+ mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
edge_level, active_high_low);
ioapic_register_intr(irq, entry.vector, edge_level);
@@ -2884,8 +2917,8 @@ int acpi_get_override_irq(int bus_irq, i
return -1;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == mp_INT &&
- mp_irqs[i].mpc_srcbusirq == bus_irq)
+ if (mp_irqs[i].mp_irqtype == mp_INT &&
+ mp_irqs[i].mp_srcbusirq == bus_irq)
break;
if (i >= mp_irq_entries)
return -1;
@@ -2918,3 +2951,35 @@ static int __init parse_noapic(char *arg
return 0;
}
early_param("noapic", parse_noapic);
+
+#ifndef CONFIG_XEN
+void __init ioapic_init_mappings(void)
+{
+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+ int i;
+
+ for (i = 0; i < nr_ioapics; i++) {
+ if (smp_found_config) {
+ ioapic_phys = mp_ioapics[i].mp_apicaddr;
+ if (!ioapic_phys) {
+ printk(KERN_ERR
+ "WARNING: bogus zero IO-APIC "
+ "address found in MPTABLE, "
+ "disabling IO/APIC support!\n");
+ smp_found_config = 0;
+ skip_ioapic_setup = 1;
+ goto fake_ioapic_page;
+ }
+ } else {
+fake_ioapic_page:
+ ioapic_phys = (unsigned long)
+ alloc_bootmem_pages(PAGE_SIZE);
+ ioapic_phys = __pa(ioapic_phys);
+ }
+ set_fixmap_nocache(idx, ioapic_phys);
+ printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx), ioapic_phys);
+ idx++;
+ }
+}
+#endif
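
The io_apic_32-xen.c hunks above are dominated by the 2.6.27 rename of the MP-table accessor fields from mpc_* to mp_* (mp_irqs[], mp_ioapics[]). The standalone C sketch below models the find_irq_entry() lookup shown in those hunks using the new field names; the struct layout, field types, and sample table are assumptions made only for this illustration and are not the kernel's definitions.

/*
 * Toy model of find_irq_entry() after the mpc_* -> mp_* rename.
 * Not kernel code: struct layout and table contents are made up.
 */
#include <stdio.h>

#define MP_APIC_ALL 0xff

struct mp_config_intsrc {
	int mp_irqtype;   /* interrupt type (e.g. mp_INT) */
	int mp_dstapic;   /* destination I/O APIC id, or MP_APIC_ALL */
	int mp_dstirq;    /* destination I/O APIC pin */
};

static struct mp_config_intsrc mp_irqs[] = {
	{ .mp_irqtype = 0, .mp_dstapic = 2,           .mp_dstirq = 1 },
	{ .mp_irqtype = 0, .mp_dstapic = MP_APIC_ALL, .mp_dstirq = 4 },
};

/* Mirrors the lookup loop in the hunks: match type, APIC id (or wildcard), pin. */
static int find_irq_entry(int apicid, int pin, int type)
{
	int i, n = sizeof(mp_irqs) / sizeof(mp_irqs[0]);

	for (i = 0; i < n; i++)
		if (mp_irqs[i].mp_irqtype == type &&
		    (mp_irqs[i].mp_dstapic == apicid ||
		     mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
		    mp_irqs[i].mp_dstirq == pin)
			return i;
	return -1;
}

int main(void)
{
	printf("entry for apic 2 pin 1: %d\n", find_irq_entry(2, 1, 0));
	printf("entry for apic 5 pin 4: %d\n", find_irq_entry(5, 4, 0));
	return 0;
}
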
--- head-2011-03-11.orig/arch/x86/kernel/io_apic_64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/io_apic_64-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -45,6 +45,7 @@
#include <asm/proto.h>
#include <asm/acpi.h>
#include <asm/dma.h>
+#include <asm/i8259.h>
#include <asm/nmi.h>
#include <asm/msidef.h>
#include <asm/hypertransport.h>
@@ -63,10 +64,16 @@ struct irq_cfg {
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
+static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
static int assign_irq_vector(int irq, cpumask_t mask);
+#ifndef CONFIG_XEN
+int first_system_vector = 0xfe;
+
+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
+#endif
+
#define __apicdebuginit __init
int sis_apic_bug; /* not actually supported, dummy for compile */
@@ -89,14 +96,14 @@ unsigned long io_apic_irqs;
#define clear_IO_APIC() ((void)0)
#else
-int timer_over_8254 __initdata = 1;
+int timer_through_8259 __initdata;
/* Where if anywhere is the i8259 connect in external int mode */
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
#endif
static DEFINE_SPINLOCK(ioapic_lock);
-DEFINE_SPINLOCK(vector_lock);
+static DEFINE_SPINLOCK(vector_lock);
/*
* # of IRQ routing registers
@@ -104,15 +111,17 @@ DEFINE_SPINLOCK(vector_lock);
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
/* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
/*
* Rough estimation of how many shared IRQs there are, can
* be changed anytime.
@@ -141,7 +150,7 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
}
#endif
@@ -155,7 +164,7 @@ static inline unsigned int io_apic_read(
struct physdev_apic apic_op;
int ret;
- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
apic_op.reg = reg;
ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
if (ret)
@@ -173,7 +182,7 @@ static inline void io_apic_write(unsigne
#else
struct physdev_apic apic_op;
- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
apic_op.reg = reg;
apic_op.value = value;
WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
@@ -209,7 +218,7 @@ static bool io_apic_level_ack_pending(un
break;
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
- if ((reg >> 14) & 1) {
+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
spin_unlock_irqrestore(&ioapic_lock, flags);
return true;
}
@@ -308,7 +317,7 @@ static void __target_IO_APIC_irq(unsigne
break;
io_apic_write(apic, 0x11 + pin*2, dest);
reg = io_apic_read(apic, 0x10 + pin*2);
- reg &= ~0x000000ff;
+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
reg |= vector;
io_apic_modify(apic, reg);
if (!entry->next)
@@ -372,6 +381,26 @@ static void add_pin_to_irq(unsigned int
}
#ifndef CONFIG_XEN
+/*
+ * Reroute an IRQ to a different pin.
+ */
+static void __init replace_pin_at_irq(unsigned int irq,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry = irq_2_pin + irq;
+
+ while (1) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ }
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+}
+
#define __DO_ACTION(R, ACTION, FINAL) \
\
{ \
@@ -399,10 +428,11 @@ static void add_pin_to_irq(unsigned int
static void name##_IO_APIC_irq (unsigned int irq) \
__DO_ACTION(R, ACTION, FINAL)
-DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
- /* mask = 1 */
-DO_ACTION( __unmask, 0, &= 0xfffeffff, )
- /* mask = 0 */
+/* mask = 1 */
+DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
+
+/* mask = 0 */
+DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
static void mask_IO_APIC_irq (unsigned int irq)
{
@@ -465,22 +495,6 @@ static int __init disable_timer_pin_setu
}
__setup("disable_timer_pin_1", disable_timer_pin_setup);
-#ifndef CONFIG_XEN
-static int __init setup_disable_8254_timer(char *s)
-{
- timer_over_8254 = -1;
- return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
- timer_over_8254 = 2;
- return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-#endif /* !CONFIG_XEN */
-
/*
* Find the IRQ entry number of a certain pin.
@@ -490,10 +504,10 @@ static int find_irq_entry(int apic, int
int i;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == type &&
- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mpc_dstirq == pin)
+ if (mp_irqs[i].mp_irqtype == type &&
+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mp_dstirq == pin)
return i;
return -1;
@@ -508,13 +522,13 @@ static int __init find_isa_irq_pin(int i
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
- return mp_irqs[i].mpc_dstirq;
+ return mp_irqs[i].mp_dstirq;
}
return -1;
}
@@ -524,17 +538,17 @@ static int __init find_isa_irq_apic(int
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
break;
}
if (i < mp_irq_entries) {
int apic;
for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
return apic;
}
}
@@ -555,28 +569,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
bus, slot, pin);
- if (mp_bus_id_to_pci_bus[bus] == -1) {
+ if (test_bit(bus, mp_bus_not_pci)) {
apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
break;
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mpc_irqtype &&
+ !mp_irqs[i].mp_irqtype &&
(bus == lbus) &&
- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
continue;
- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
return irq;
/*
* Use the first all-but-pin matching entry as a
@@ -604,13 +618,13 @@ int IO_APIC_get_PCI_irq_vector(int bus,
static int MPBIOS_polarity(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].mpc_irqflag & 3)
+ switch (mp_irqs[idx].mp_irqflag & 3)
{
case 0: /* conforms, ie. bus-type dependent polarity */
if (test_bit(bus, mp_bus_not_pci))
@@ -646,13 +660,13 @@ static int MPBIOS_polarity(int idx)
static int MPBIOS_trigger(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int trigger;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
{
case 0: /* conforms, ie. bus-type dependent */
if (test_bit(bus, mp_bus_not_pci))
@@ -699,16 +713,16 @@ static inline int irq_trigger(int idx)
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
/*
* Debugging check, we are in big trouble if this message pops up!
*/
- if (mp_irqs[idx].mpc_dstirq != pin)
+ if (mp_irqs[idx].mp_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci)) {
- irq = mp_irqs[idx].mpc_srcbusirq;
+ irq = mp_irqs[idx].mp_srcbusirq;
} else {
/*
* PCI IRQs are mapped in order
@@ -722,6 +736,19 @@ static int pin_2_irq(int idx, int apic,
return irq;
}
+void lock_vector_lock(void)
+{
+ /* Used to the online set of cpus does not change
+ * during assign_irq_vector.
+ */
+ spin_lock(&vector_lock);
+}
+
+void unlock_vector_lock(void)
+{
+ spin_unlock(&vector_lock);
+}
+
static int __assign_irq_vector(int irq, cpumask_t mask)
{
struct physdev_irq irq_op;
@@ -773,7 +800,7 @@ static void __clear_irq_vector(int irq)
vector = cfg->vector;
cpus_and(mask, cfg->domain, cpu_online_map);
- for_each_cpu_mask(cpu, mask)
+ for_each_cpu_mask_nr(cpu, mask)
per_cpu(vector_irq, cpu)[vector] = -1;
cfg->vector = 0;
@@ -842,7 +869,7 @@ static void setup_IO_APIC_irq(int apic,
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
"IRQ %d Mode:%i Active:%i)\n",
- apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
+ apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
irq, trigger, polarity);
/*
@@ -883,10 +910,10 @@ static void __init setup_IO_APIC_irqs(vo
idx = find_irq_entry(apic,pin,mp_INT);
if (idx == -1) {
if (first_notcon) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
first_notcon = 0;
} else
- apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
continue;
}
if (!first_notcon) {
@@ -908,26 +935,21 @@ static void __init setup_IO_APIC_irqs(vo
#ifndef CONFIG_XEN
/*
- * Set up the 8259A-master output pin as broadcast to all
- * CPUs.
+ * Set up the timer pin, possibly with the 8259A-master behind.
*/
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+ int vector)
{
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
- disable_8259A_irq(0);
-
- /* mask LVT0 */
- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
-
/*
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
entry.dest_mode = INT_DEST_MODE;
- entry.mask = 0; /* unmask IRQ now */
+ entry.mask = 1; /* mask IRQ now */
entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.delivery_mode = INT_DELIVERY_MODE;
entry.polarity = 0;
@@ -936,7 +958,7 @@ static void __init setup_ExtINT_IRQ0_pin
/*
* The timer IRQ doesn't have to know that behind the
- * scene we have a 8259A-master in AEOI mode ...
+ * scene we may have a 8259A-master in AEOI mode ...
*/
set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
@@ -944,8 +966,6 @@ static void __init setup_ExtINT_IRQ0_pin
* Add it to the IO-APIC irq-routing table:
*/
ioapic_write_entry(apic, pin, entry);
-
- enable_8259A_irq(0);
}
void __apicdebuginit print_IO_APIC(void)
@@ -962,7 +982,7 @@ void __apicdebuginit print_IO_APIC(void)
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
@@ -980,7 +1000,7 @@ void __apicdebuginit print_IO_APIC(void)
spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
@@ -1072,6 +1092,7 @@ void __apicdebuginit print_local_APIC(vo
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
printk(KERN_INFO "... APIC VERSION: %08x\n", v);
@@ -1141,7 +1162,7 @@ void __apicdebuginit print_local_APIC(vo
void print_all_local_APICs (void)
{
- on_each_cpu(print_local_APIC, NULL, 1, 1);
+ on_each_cpu(print_local_APIC, NULL, 1);
}
void __apicdebuginit print_PIC(void)
@@ -1175,6 +1196,8 @@ void __apicdebuginit print_PIC(void)
v = inb(0x4d1) << 8 | inb(0x4d0);
printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
}
+#else
+void __apicdebuginit print_IO_APIC(void) {}
#endif /* !CONFIG_XEN */
void __init enable_IO_APIC(void)
@@ -1359,12 +1382,10 @@ static unsigned int startup_ioapic_irq(u
static int ioapic_retrigger_irq(unsigned int irq)
{
struct irq_cfg *cfg = &irq_cfg[irq];
- cpumask_t mask;
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- mask = cpumask_of_cpu(first_cpu(cfg->domain));
- send_IPI_mask(mask, cfg->vector);
+ send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
spin_unlock_irqrestore(&vector_lock, flags);
return 1;
@@ -1545,7 +1566,7 @@ static inline void init_IO_APIC_traps(vo
}
#ifndef CONFIG_XEN
-static void enable_lapic_irq (unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
{
unsigned long v;
@@ -1553,7 +1574,7 @@ static void enable_lapic_irq (unsigned i
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
-static void disable_lapic_irq (unsigned int irq)
+static void mask_lapic_irq(unsigned int irq)
{
unsigned long v;
@@ -1566,19 +1587,20 @@ static void ack_lapic_irq (unsigned int
ack_APIC_irq();
}
-static void end_lapic_irq (unsigned int i) { /* nothing */ }
-
-static struct hw_interrupt_type lapic_irq_type __read_mostly = {
- .name = "local-APIC",
- .typename = "local-APIC-edge",
- .startup = NULL, /* startup_irq() not used for IRQ0 */
- .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
- .enable = enable_lapic_irq,
- .disable = disable_lapic_irq,
- .ack = ack_lapic_irq,
- .end = end_lapic_irq,
+static struct irq_chip lapic_chip __read_mostly = {
+ .name = "local-APIC",
+ .mask = mask_lapic_irq,
+ .unmask = unmask_lapic_irq,
+ .ack = ack_lapic_irq,
};
+static void lapic_register_intr(int irq)
+{
+ irq_desc[irq].status &= ~IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+ "edge");
+}
+
static void __init setup_nmi(void)
{
/*
@@ -1664,6 +1686,7 @@ static inline void __init check_timer(vo
struct irq_cfg *cfg = irq_cfg + 0;
int apic1, pin1, apic2, pin2;
unsigned long flags;
+ int no_pin1 = 0;
local_irq_save(flags);
@@ -1674,34 +1697,48 @@ static inline void __init check_timer(vo
assign_irq_vector(0, TARGET_CPUS);
/*
- * Subtle, code in do_timer_interrupt() expects an AEOI
- * mode for the 8259A whenever interrupts are routed
- * through I/O APICs. Also IRQ0 has to be enabled in
- * the 8259A which implies the virtual wire has to be
- * disabled in the local APIC.
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+ * wire has to be disabled in the local APIC.
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
- if (timer_over_8254 > 0)
- enable_8259A_irq(0);
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
- apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
- cfg->vector, apic1, pin1, apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+ "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ cfg->vector, apic1, pin1, apic2, pin2);
+
+ /*
+ * Some BIOS writers are clueless and report the ExtINTA
+ * I/O APIC input from the cascaded 8259A as the timer
+ * interrupt input. So just in case, if only one pin
+ * was found above, try it both directly and through the
+ * 8259A.
+ */
+ if (pin1 == -1) {
+ pin1 = pin2;
+ apic1 = apic2;
+ no_pin1 = 1;
+ } else if (pin2 == -1) {
+ pin2 = pin1;
+ apic2 = apic1;
+ }
if (pin1 != -1) {
/*
* Ok, does IRQ0 through the IOAPIC work?
*/
+ if (no_pin1) {
+ add_pin_to_irq(0, apic1, pin1);
+ setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ }
unmask_IO_APIC_irq(0);
if (!no_timer_check && timer_irq_works()) {
- nmi_watchdog_default();
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
setup_nmi();
enable_8259A_irq(0);
}
@@ -1710,54 +1747,62 @@ static inline void __init check_timer(vo
goto out;
}
clear_IO_APIC_pin(apic1, pin1);
- apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
- "connected to IO-APIC\n");
- }
-
- apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
- "through the 8259A ... ");
- if (pin2 != -1) {
- apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
- apic2, pin2);
+ if (!no_pin1)
+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+ "8254 timer not connected to IO-APIC\n");
+
+ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+ "(IRQ0) through the 8259A ...\n");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "..... (found apic %d pin %d) ...\n", apic2, pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
- setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+ unmask_IO_APIC_irq(0);
+ enable_8259A_irq(0);
if (timer_irq_works()) {
- apic_printk(APIC_VERBOSE," works.\n");
- nmi_watchdog_default();
+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+ timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
setup_nmi();
+ enable_8259A_irq(0);
}
goto out;
}
/*
* Cleanup, just in case ...
*/
+ disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
- apic_printk(APIC_VERBOSE," failed.\n");
if (nmi_watchdog == NMI_IO_APIC) {
- printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = 0;
+ apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+ "through the IO-APIC - disabling NMI Watchdog!\n");
+ nmi_watchdog = NMI_NONE;
}
- apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as Virtual Wire IRQ...\n");
- disable_8259A_irq(0);
- irq_desc[0].chip = &lapic_irq_type;
+ lapic_register_intr(0);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
enable_8259A_irq(0);
if (timer_irq_works()) {
- apic_printk(APIC_VERBOSE," works.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
+ disable_8259A_irq(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
- apic_printk(APIC_VERBOSE," failed.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
- apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as ExtINT IRQ...\n");
init_8259A(0);
make_8259A_irq(0);
@@ -1766,11 +1811,12 @@ static inline void __init check_timer(vo
unlock_ExtINT_logic();
if (timer_irq_works()) {
- apic_printk(APIC_VERBOSE," works.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
- apic_printk(APIC_VERBOSE," failed :(.\n");
- panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
+ "report. Then try booting with the 'noapic' option.\n");
out:
local_irq_restore(flags);
}
@@ -1788,10 +1834,21 @@ __setup("no_timer_check", notimercheck);
/*
*
- * IRQs that are handled by the PIC in the MPS IOAPIC case.
- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
- * Linux doesn't really care, as it's not actually used
- * for any interrupt handling anyway.
+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+ * to devices. However there may be an I/O APIC pin available for
+ * this interrupt regardless. The pin may be left unconnected, but
+ * typically it will be reused as an ExtINT cascade interrupt for
+ * the master 8259A. In the MPS case such a pin will normally be
+ * reported as an ExtINT interrupt in the MP table. With ACPI
+ * there is no provision for ExtINT interrupts, and in the absence
+ * of an override it would be treated as an ordinary ISA I/O APIC
+ * interrupt, that is edge-triggered and unmasked by default. We
+ * used to do this, but it caused problems on some systems because
+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+ * the same ExtINT cascade interrupt to drive the local APIC of the
+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
+ * the I/O APIC in all cases now. No actual device should request
+ * it anyway. --macro
*/
#define PIC_IRQS (1<<2)
@@ -1799,10 +1856,7 @@ void __init setup_IO_APIC(void)
{
enable_IO_APIC();
- if (acpi_ioapic)
- io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
- else
- io_apic_irqs = ~PIC_IRQS;
+ io_apic_irqs = ~PIC_IRQS;
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
@@ -1851,8 +1905,8 @@ static int ioapic_resume(struct sys_devi
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+ if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2254,8 +2308,8 @@ int acpi_get_override_irq(int bus_irq, i
return -1;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == mp_INT &&
- mp_irqs[i].mpc_srcbusirq == bus_irq)
+ if (mp_irqs[i].mp_irqtype == mp_INT &&
+ mp_irqs[i].mp_srcbusirq == bus_irq)
break;
if (i >= mp_irq_entries)
return -1;
@@ -2349,7 +2403,7 @@ void __init ioapic_init_mappings(void)
ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+ ioapic_phys = mp_ioapics[i].mp_apicaddr;
} else {
ioapic_phys = (unsigned long)
alloc_bootmem_pages(PAGE_SIZE);
--- head-2011-03-11.orig/arch/x86/kernel/ldt-xen.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/ldt-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -20,9 +20,9 @@
#include <asm/mmu_context.h>
#ifdef CONFIG_SMP
-static void flush_ldt(void *null)
+static void flush_ldt(void *current_mm)
{
- if (current->active_mm)
+ if (current->active_mm == current_mm)
load_LDT(&current->active_mm->context);
}
#endif
@@ -62,8 +62,6 @@ static int alloc_ldt(mm_context_t *pc, i
if (reload) {
#ifdef CONFIG_SMP
- cpumask_t mask;
-
preempt_disable();
#endif
make_pages_readonly(newldt,
@@ -71,9 +69,9 @@ static int alloc_ldt(mm_context_t *pc, i
XENFEAT_writable_descriptor_tables);
load_LDT(pc);
#ifdef CONFIG_SMP
- mask = cpumask_of_cpu(smp_processor_id());
- if (!cpus_equal(current->mm->cpu_vm_mask, mask))
- smp_call_function(flush_ldt, NULL, 1, 1);
+ if (!cpus_equal(current->mm->cpu_vm_mask,
+ cpumask_of_cpu(smp_processor_id())))
+ smp_call_function(flush_ldt, current->mm, 1);
preempt_enable();
#endif
}
--- head-2011-03-11.orig/arch/x86/kernel/machine_kexec_32.c 2011-01-31 14:54:00.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/machine_kexec_32.c 2011-02-01 14:38:38.000000000 +0100
@@ -131,6 +131,8 @@ void machine_kexec_setup_load_arg(xen_ke
xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
}
int __init machine_kexec_setup_resources(struct resource *hypervisor,
--- head-2011-03-11.orig/arch/x86/kernel/microcode-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/microcode-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -5,13 +5,14 @@
* 2006 Shaohua Li <shaohua.li@intel.com>
*
* This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
+ * belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
- * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
- * Order Number 245472 or free download from:
- *
- * http://developer.intel.com/design/pentium4/manuals/245472.htm
+ * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
+ * Software Developer's Manual
+ * Order Number 253668 or free download from:
+ *
+ * http://developer.intel.com/design/pentium4/manuals/253668.htm
*
* For more information, go to http://www.urbanmyth.org/microcode
*
@@ -26,6 +27,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/sched.h>
+#include <linux/smp_lock.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -86,6 +88,7 @@ static int do_microcode_update (const vo
static int microcode_open (struct inode *unused1, struct file *unused2)
{
+ cycle_kernel_lock();
return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
}
@@ -162,7 +165,7 @@ static int request_microcode(void)
c->x86, c->x86_model, c->x86_mask);
error = request_firmware(&firmware, name, &microcode_pdev->dev);
if (error) {
- pr_debug("microcode: ucode data file %s load failed\n", name);
+ pr_debug("microcode: data file %s load failed\n", name);
return error;
}
@@ -183,6 +186,9 @@ static int __init microcode_init (void)
{
int error;
+ printk(KERN_INFO
+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
+
error = microcode_dev_init();
if (error)
return error;
@@ -195,8 +201,6 @@ static int __init microcode_init (void)
request_microcode();
- printk(KERN_INFO
- "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
return 0;
}
--- head-2011-03-11.orig/arch/x86/kernel/mpparse-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/mpparse-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -25,6 +25,9 @@
#include <asm/proto.h>
#include <asm/acpi.h>
#include <asm/bios_ebda.h>
+#include <asm/e820.h>
+#include <asm/trampoline.h>
+#include <asm/setup.h>
#include <mach_apic.h>
#ifdef CONFIG_X86_32
@@ -32,27 +35,10 @@
#include <mach_mpparse.h>
#endif
-/* Have we found an MP table */
-int smp_found_config;
-
-/*
- * Various Linux-internal data structures created from the
- * MP-table.
- */
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-int mp_bus_id_to_type[MAX_MP_BUSSES];
-#endif
-
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
-
-static int mp_current_pci_id;
-
-int pic_mode;
-
-/*
- * Intel MP BIOS table parsing routines:
- */
+static void *_bus_to_virt(unsigned long ma)
+{
+ return is_ISA_range(ma, ma) ? isa_bus_to_virt(ma) : bus_to_virt(ma);
+}
/*
* Checksum an MP configuration block.
@@ -68,19 +54,7 @@ static int __init mpf_checksum(unsigned
return sum & 0xFF;
}
-#ifdef CONFIG_X86_NUMAQ
-/*
- * Have to match translation table entries to main table entries by counter
- * hence the mpc_record variable .... can't see a less disgusting way of
- * doing this ....
- */
-
-static int mpc_record;
-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
- __cpuinitdata;
-#endif
-
-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
+static void __init MP_processor_info(struct mpc_config_processor *m)
{
#ifndef CONFIG_XEN
int apicid;
@@ -90,11 +64,12 @@ static void __cpuinit MP_processor_info(
disabled_cpus++;
return;
}
-#ifdef CONFIG_X86_NUMAQ
- apicid = mpc_apic_id(m, translation_table[mpc_record]);
-#else
- apicid = m->mpc_apicid;
-#endif
+
+ if (x86_quirks->mpc_apic_id)
+ apicid = x86_quirks->mpc_apic_id(m);
+ else
+ apicid = m->mpc_apicid;
+
if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
bootup_cpu = " (Bootup-CPU)";
boot_cpu_physical_apicid = m->mpc_apicid;
@@ -107,18 +82,17 @@ static void __cpuinit MP_processor_info(
#endif
}
+#ifdef CONFIG_X86_IO_APIC
static void __init MP_bus_info(struct mpc_config_bus *m)
{
char str[7];
-
memcpy(str, m->mpc_bustype, 6);
str[6] = 0;
-#ifdef CONFIG_X86_NUMAQ
- mpc_oem_bus_info(m, str, translation_table[mpc_record]);
-#else
- Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
-#endif
+ if (x86_quirks->mpc_oem_bus_info)
+ x86_quirks->mpc_oem_bus_info(m, str);
+ else
+ apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
#if MAX_MP_BUSSES < 256
if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -135,12 +109,10 @@ static void __init MP_bus_info(struct mp
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
-#ifdef CONFIG_X86_NUMAQ
- mpc_oem_pci_bus(m, translation_table[mpc_record]);
-#endif
+ if (x86_quirks->mpc_oem_pci_bus)
+ x86_quirks->mpc_oem_pci_bus(m);
+
clear_bit(m->mpc_busid, mp_bus_not_pci);
- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
- mp_current_pci_id++;
#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
@@ -151,6 +123,7 @@ static void __init MP_bus_info(struct mp
} else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
}
+#endif
#ifdef CONFIG_X86_IO_APIC
@@ -180,117 +153,111 @@ static void __init MP_ioapic_info(struct
if (bad_ioapic(m->mpc_apicaddr))
return;
- mp_ioapics[nr_ioapics] = *m;
+ mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
+ mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
+ mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
+ mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
+ mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
nr_ioapics++;
}
-static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
{
- mp_irqs[mp_irq_entries] = *m;
- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
}
-#endif
-
-static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
+static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
{
- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
+ (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
+ mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
}
-#ifdef CONFIG_X86_NUMAQ
-static void __init MP_translation_info(struct mpc_config_translation *m)
+static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
+ struct mp_config_intsrc *mp_irq)
{
- printk(KERN_INFO
- "Translation: record %d, type %d, quad %d, global %d, local %d\n",
- mpc_record, m->trans_type, m->trans_quad, m->trans_global,
- m->trans_local);
+ mp_irq->mp_dstapic = m->mpc_dstapic;
+ mp_irq->mp_type = m->mpc_type;
+ mp_irq->mp_irqtype = m->mpc_irqtype;
+ mp_irq->mp_irqflag = m->mpc_irqflag;
+ mp_irq->mp_srcbus = m->mpc_srcbus;
+ mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
+ mp_irq->mp_dstirq = m->mpc_dstirq;
+}
- if (mpc_record >= MAX_MPC_ENTRY)
- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
- else
- translation_table[mpc_record] = m; /* stash this for later */
- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
- node_set_online(m->trans_quad);
+static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
+ struct mpc_config_intsrc *m)
+{
+ m->mpc_dstapic = mp_irq->mp_dstapic;
+ m->mpc_type = mp_irq->mp_type;
+ m->mpc_irqtype = mp_irq->mp_irqtype;
+ m->mpc_irqflag = mp_irq->mp_irqflag;
+ m->mpc_srcbus = mp_irq->mp_srcbus;
+ m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
+ m->mpc_dstirq = mp_irq->mp_dstirq;
}
-/*
- * Read/parse the MPC oem tables
- */
+static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
+ struct mpc_config_intsrc *m)
+{
+ if (mp_irq->mp_dstapic != m->mpc_dstapic)
+ return 1;
+ if (mp_irq->mp_type != m->mpc_type)
+ return 2;
+ if (mp_irq->mp_irqtype != m->mpc_irqtype)
+ return 3;
+ if (mp_irq->mp_irqflag != m->mpc_irqflag)
+ return 4;
+ if (mp_irq->mp_srcbus != m->mpc_srcbus)
+ return 5;
+ if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
+ return 6;
+ if (mp_irq->mp_dstirq != m->mpc_dstirq)
+ return 7;
+
+ return 0;
+}
-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
- unsigned short oemsize)
+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
{
- int count = sizeof(*oemtable); /* the header size */
- unsigned char *oemptr = ((unsigned char *)oemtable) + count;
+ int i;
- mpc_record = 0;
- printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
- oemtable);
- if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
- printk(KERN_WARNING
- "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
- oemtable->oem_signature[0], oemtable->oem_signature[1],
- oemtable->oem_signature[2], oemtable->oem_signature[3]);
- return;
- }
- if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
- printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
- return;
- }
- while (count < oemtable->oem_length) {
- switch (*oemptr) {
- case MP_TRANSLATION:
- {
- struct mpc_config_translation *m =
- (struct mpc_config_translation *)oemptr;
- MP_translation_info(m);
- oemptr += sizeof(*m);
- count += sizeof(*m);
- ++mpc_record;
- break;
- }
- default:
- {
- printk(KERN_WARNING
- "Unrecognised OEM table entry type! - %d\n",
- (int)*oemptr);
- return;
- }
- }
+ print_MP_intsrc_info(m);
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
+ return;
}
+
+ assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
}
-static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid)
+#endif
+
+static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
{
- if (strncmp(oem, "IBM NUMA", 8))
- printk("Warning! May not be a NUMA-Q system!\n");
- if (mpc->mpc_oemptr)
- smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
- mpc->mpc_oemsize);
+ apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+ m->mpc_irqtype, m->mpc_irqflag & 3,
+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
}
-#endif /* CONFIG_X86_NUMAQ */
/*
* Read/parse the MPC
*/
-static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
+ char *str)
{
- char str[16];
- char oem[10];
- int count = sizeof(*mpc);
- unsigned char *mpt = ((unsigned char *)mpc) + count;
if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
@@ -313,20 +280,44 @@ static int __init smp_read_mpc(struct mp
}
memcpy(oem, mpc->mpc_oem, 8);
oem[8] = 0;
- printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
+ printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
memcpy(str, mpc->mpc_productid, 12);
str[12] = 0;
- printk("Product ID: %s ", str);
-#ifdef CONFIG_X86_32
- mps_oem_check(mpc, oem, str);
-#endif
- printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
+ printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
#ifndef CONFIG_XEN
printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
+#endif
+
+ return 1;
+}
+
+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+{
+ char str[16];
+ char oem[10];
+
+ int count = sizeof(*mpc);
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+ if (!smp_check_mpc(mpc, oem, str))
+ return 0;
+#ifdef CONFIG_X86_32
+ /*
+ * need to make sure summit and es7000's mps_oem_check is safe to be
+ * called early via genericarch's mps_oem_check
+ */
+ if (early) {
+#ifdef CONFIG_X86_NUMAQ
+ numaq_mps_oem_check(mpc, oem, str);
+#endif
+ } else
+ mps_oem_check(mpc, oem, str);
+#endif
+#ifndef CONFIG_XEN
/* save the local APIC address, it might be non-default */
if (!acpi_lapic)
mp_lapic_addr = mpc->mpc_lapic;
@@ -335,12 +326,17 @@ static int __init smp_read_mpc(struct mp
if (early)
return 1;
+ if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
+ struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
+ x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
+ }
+
/*
* Now process the configuration blocks.
*/
-#ifdef CONFIG_X86_NUMAQ
- mpc_record = 0;
-#endif
+ if (x86_quirks->mpc_record)
+ *x86_quirks->mpc_record = 0;
+
while (count < mpc->mpc_length) {
switch (*mpt) {
case MP_PROCESSOR:
@@ -358,7 +354,9 @@ static int __init smp_read_mpc(struct mp
{
struct mpc_config_bus *m =
(struct mpc_config_bus *)mpt;
+#ifdef CONFIG_X86_IO_APIC
MP_bus_info(m);
+#endif
mpt += sizeof(*m);
count += sizeof(*m);
break;
@@ -404,10 +402,14 @@ static int __init smp_read_mpc(struct mp
count = mpc->mpc_length;
break;
}
-#ifdef CONFIG_X86_NUMAQ
- ++mpc_record;
-#endif
+ if (x86_quirks->mpc_record)
+ (*x86_quirks->mpc_record)++;
}
+
+#ifdef CONFIG_X86_GENERICARCH
+ generic_bigsmp_probe();
+#endif
+
setup_apic_routing();
if (!num_processors)
printk(KERN_ERR "MPTABLE: no processors registered!\n");
@@ -433,7 +435,7 @@ static void __init construct_default_ioi
intsrc.mpc_type = MP_INTSRC;
intsrc.mpc_irqflag = 0; /* conforming */
intsrc.mpc_srcbus = 0;
- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+ intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
intsrc.mpc_irqtype = mp_INT;
@@ -494,42 +496,11 @@ static void __init construct_default_ioi
MP_intsrc_info(&intsrc);
}
-#endif
-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+static void __init construct_ioapic_table(int mpc_default_type)
{
- struct mpc_config_processor processor;
- struct mpc_config_bus bus;
-#ifdef CONFIG_X86_IO_APIC
struct mpc_config_ioapic ioapic;
-#endif
- struct mpc_config_lintsrc lintsrc;
- int linttypes[2] = { mp_ExtINT, mp_NMI };
- int i;
-
-#ifndef CONFIG_XEN
- /*
- * local APIC has default address
- */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-#endif
-
- /*
- * 2 CPUs, numbered 0 & 1.
- */
- processor.mpc_type = MP_PROCESSOR;
- /* Either an integrated APIC or a discrete 82489DX. */
- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
- processor.mpc_cpuflag = CPU_ENABLED;
- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
- processor.mpc_reserved[0] = 0;
- processor.mpc_reserved[1] = 0;
- for (i = 0; i < 2; i++) {
- processor.mpc_apicid = i;
- MP_processor_info(&processor);
- }
+ struct mpc_config_bus bus;
bus.mpc_type = MP_BUS;
bus.mpc_busid = 0;
@@ -558,7 +529,6 @@ static inline void __init construct_defa
MP_bus_info(&bus);
}
-#ifdef CONFIG_X86_IO_APIC
ioapic.mpc_type = MP_IOAPIC;
ioapic.mpc_apicid = 2;
ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
@@ -570,7 +540,44 @@ static inline void __init construct_defa
* We set up most of the low 16 IO-APIC pins according to MPS rules.
*/
construct_default_ioirq_mptable(mpc_default_type);
+}
+#else
+static inline void __init construct_ioapic_table(int mpc_default_type) { }
+#endif
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+ struct mpc_config_processor processor;
+ struct mpc_config_lintsrc lintsrc;
+ int linttypes[2] = { mp_ExtINT, mp_NMI };
+ int i;
+
+#ifndef CONFIG_XEN
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
#endif
+
+ /*
+ * 2 CPUs, numbered 0 & 1.
+ */
+ processor.mpc_type = MP_PROCESSOR;
+ /* Either an integrated APIC or a discrete 82489DX. */
+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ processor.mpc_cpuflag = CPU_ENABLED;
+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+ processor.mpc_reserved[0] = 0;
+ processor.mpc_reserved[1] = 0;
+ for (i = 0; i < 2; i++) {
+ processor.mpc_apicid = i;
+ MP_processor_info(&processor);
+ }
+
+ construct_ioapic_table(mpc_default_type);
+
lintsrc.mpc_type = MP_LINTSRC;
lintsrc.mpc_irqflag = 0; /* conforming */
lintsrc.mpc_srcbusid = 0;
@@ -589,7 +596,7 @@ static struct intel_mp_floating *mpf_fou
* Scan the memory blocks for an SMP configuration block.
*/
#ifndef CONFIG_XEN
-static void __init __get_smp_config(unsigned early)
+static void __init __get_smp_config(unsigned int early)
#else
void __init get_smp_config(void)
#define early 0
@@ -597,6 +604,10 @@ void __init get_smp_config(void)
{
struct intel_mp_floating *mpf = mpf_found;
+ if (x86_quirks->mach_get_smp_config) {
+ if (x86_quirks->mach_get_smp_config(early))
+ return;
+ }
if (acpi_lapic && early)
return;
/*
@@ -613,7 +624,7 @@ void __init get_smp_config(void)
printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
mpf->mpf_specification);
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
if (mpf->mpf_feature2 & (1 << 7)) {
printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
pic_mode = 1;
@@ -646,8 +657,10 @@ void __init get_smp_config(void)
* Read the physical hardware table. Anything here will
* override the defaults.
*/
- if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
+ if (!smp_read_mpc(_bus_to_virt(mpf->mpf_physptr), early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 0;
+#endif
printk(KERN_ERR
"BIOS bug, MP table errors detected!...\n");
printk(KERN_ERR "... disabling SMP support. "
@@ -704,10 +717,11 @@ void __init get_smp_config(void)
static int __init smp_scan_config(unsigned long base, unsigned long length,
unsigned reserve)
{
- unsigned int *bp = isa_bus_to_virt(base);
+ unsigned int *bp = _bus_to_virt(base);
struct intel_mp_floating *mpf;
- Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
+ apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
+ bp, length);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
@@ -717,16 +731,22 @@ static int __init smp_scan_config(unsign
!mpf_checksum((unsigned char *)bp, 16) &&
((mpf->mpf_specification == 1)
|| (mpf->mpf_specification == 4))) {
-
+#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 1;
+#endif
mpf_found = mpf;
-#ifdef CONFIG_X86_32
+
#ifndef CONFIG_XEN
printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
mpf, virt_to_phys(mpf));
- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
+
+ if (!reserve)
+ return 1;
+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
BOOTMEM_DEFAULT);
if (mpf->mpf_physptr) {
+ unsigned long size = PAGE_SIZE;
+#ifdef CONFIG_X86_32
/*
* We cannot access to MPC table to compute
* table size yet, as only few megabytes from
@@ -736,27 +756,18 @@ static int __init smp_scan_config(unsign
* PAGE_SIZE from mpg->mpf_physptr yields BUG()
* in reserve_bootmem.
*/
- unsigned long size = PAGE_SIZE;
unsigned long end = max_low_pfn * PAGE_SIZE;
if (mpf->mpf_physptr + size > end)
size = end - mpf->mpf_physptr;
- reserve_bootmem(mpf->mpf_physptr, size,
+#endif
+ reserve_bootmem_generic(mpf->mpf_physptr, size,
BOOTMEM_DEFAULT);
}
#else
printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
- mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
-#endif
-#elif !defined(CONFIG_XEN)
- if (!reserve)
- return 1;
-
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
- if (mpf->mpf_physptr)
- reserve_bootmem_generic(mpf->mpf_physptr,
- PAGE_SIZE);
+ mpf, ((void *)bp - _bus_to_virt(base)) + base);
#endif
- return 1;
+ return 1;
}
bp += 4;
length -= 16;
@@ -764,12 +775,16 @@ static int __init smp_scan_config(unsign
return 0;
}
-static void __init __find_smp_config(unsigned reserve)
+static void __init __find_smp_config(unsigned int reserve)
{
#ifndef CONFIG_XEN
unsigned int address;
#endif
+ if (x86_quirks->mach_find_smp_config) {
+ if (x86_quirks->mach_find_smp_config(reserve))
+ return;
+ }
/*
* FIXME: Linux assumes you have 640K of base ram..
* this continues the error...
@@ -816,302 +831,297 @@ void __init find_smp_config(void)
__find_smp_config(1);
}
-/* --------------------------------------------------------------------------
- ACPI-based MP Configuration
- -------------------------------------------------------------------------- */
-
-/*
- * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
- */
-int es7000_plat;
-
-#ifdef CONFIG_ACPI
+#ifdef CONFIG_X86_IO_APIC
+static u8 __initdata irq_used[MAX_IRQ_SOURCES];
-#ifdef CONFIG_X86_IO_APIC
+static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
+{
+ int i;
-#define MP_ISA_BUS 0
+ if (m->mpc_irqtype != mp_INT)
+ return 0;
-extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
+ if (m->mpc_irqflag != 0x0f)
+ return 0;
-static int mp_find_ioapic(int gsi)
-{
- int i = 0;
+ /* not legacy */
- /* Find the IOAPIC that manages this GSI. */
- for (i = 0; i < nr_ioapics; i++) {
- if ((gsi >= mp_ioapic_routing[i].gsi_base)
- && (gsi <= mp_ioapic_routing[i].gsi_end))
- return i;
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (mp_irqs[i].mp_irqtype != mp_INT)
+ continue;
+
+ if (mp_irqs[i].mp_irqflag != 0x0f)
+ continue;
+
+ if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
+ continue;
+ if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
+ continue;
+ if (irq_used[i]) {
+ /* already claimed */
+ return -2;
+ }
+ irq_used[i] = 1;
+ return i;
}
- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ /* not found */
return -1;
}
-static u8 __init uniq_ioapic_id(u8 id)
-{
-#ifdef CONFIG_X86_32
-#ifndef CONFIG_XEN
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return io_apic_get_unique_id(nr_ioapics, id);
- else
-#endif
- return id;
-#else
- int i;
- DECLARE_BITMAP(used, 256);
- bitmap_zero(used, 256);
- for (i = 0; i < nr_ioapics; i++) {
- struct mpc_config_ioapic *ia = &mp_ioapics[i];
- __set_bit(ia->mpc_apicid, used);
- }
- if (!test_bit(id, used))
- return id;
- return find_first_zero_bit(used, 256);
+#define SPARE_SLOT_NUM 20
+
+static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
#endif
-}
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+static int __init replace_intsrc_all(struct mp_config_table *mpc,
+ unsigned long mpc_new_phys,
+ unsigned long mpc_new_length)
{
- int idx = 0;
-
- if (bad_ioapic(address))
- return;
+#ifdef CONFIG_X86_IO_APIC
+ int i;
+ int nr_m_spare = 0;
+#endif
- idx = nr_ioapics;
+ int count = sizeof(*mpc);
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
- mp_ioapics[idx].mpc_type = MP_IOAPIC;
- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
- mp_ioapics[idx].mpc_apicaddr = address;
+ printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
+ while (count < mpc->mpc_length) {
+ switch (*mpt) {
+ case MP_PROCESSOR:
+ {
+ struct mpc_config_processor *m =
+ (struct mpc_config_processor *)mpt;
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_BUS:
+ {
+ struct mpc_config_bus *m =
+ (struct mpc_config_bus *)mpt;
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_IOAPIC:
+ {
+ mpt += sizeof(struct mpc_config_ioapic);
+ count += sizeof(struct mpc_config_ioapic);
+ break;
+ }
+ case MP_INTSRC:
+ {
+#ifdef CONFIG_X86_IO_APIC
+ struct mpc_config_intsrc *m =
+ (struct mpc_config_intsrc *)mpt;
-#ifndef CONFIG_XEN
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+ apic_printk(APIC_VERBOSE, "OLD ");
+ print_MP_intsrc_info(m);
+ i = get_MP_intsrc_index(m);
+ if (i > 0) {
+ assign_to_mpc_intsrc(&mp_irqs[i], m);
+ apic_printk(APIC_VERBOSE, "NEW ");
+ print_mp_irq_info(&mp_irqs[i]);
+ } else if (!i) {
+ /* legacy, do nothing */
+ } else if (nr_m_spare < SPARE_SLOT_NUM) {
+ /*
+ * not found (-1) or duplicated (-2)
+ * entries are invalid;
+ * remember the slot so it can be reused later
+ */
+ m_spare[nr_m_spare] = m;
+ nr_m_spare++;
+ }
#endif
- mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
- mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
-#else
- mp_ioapics[idx].mpc_apicver = 0;
+ mpt += sizeof(struct mpc_config_intsrc);
+ count += sizeof(struct mpc_config_intsrc);
+ break;
+ }
+ case MP_LINTSRC:
+ {
+ struct mpc_config_lintsrc *m =
+ (struct mpc_config_lintsrc *)mpt;
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ default:
+ /* wrong mptable */
+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
+ printk(KERN_ERR "type %x\n", *mpt);
+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
+ 1, mpc, mpc->mpc_length, 1);
+ goto out;
+ }
+ }
+
+#ifdef CONFIG_X86_IO_APIC
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (irq_used[i])
+ continue;
+
+ if (mp_irqs[i].mp_irqtype != mp_INT)
+ continue;
+
+ if (mp_irqs[i].mp_irqflag != 0x0f)
+ continue;
+
+ if (nr_m_spare > 0) {
+ apic_printk(APIC_VERBOSE, "*NEW* found\n");
+ nr_m_spare--;
+ assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
+ m_spare[nr_m_spare] = NULL;
+ } else {
+ struct mpc_config_intsrc *m =
+ (struct mpc_config_intsrc *)mpt;
+ count += sizeof(struct mpc_config_intsrc);
+ if (!mpc_new_phys) {
+ printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
+ } else {
+ if (count <= mpc_new_length)
+ printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
+ else {
+ printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
+ goto out;
+ }
+ }
+ assign_to_mpc_intsrc(&mp_irqs[i], m);
+ mpc->mpc_length = count;
+ mpt += sizeof(struct mpc_config_intsrc);
+ }
+ print_mp_irq_info(&mp_irqs[i]);
+ }
#endif
- /*
- * Build basic GSI lookup table to facilitate gsi->io_apic lookups
- * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
- */
- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
- mp_ioapic_routing[idx].gsi_base = gsi_base;
- mp_ioapic_routing[idx].gsi_end = gsi_base +
- io_apic_get_redir_entries(idx);
-
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
- mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+out:
+ /* update checksum */
+ mpc->mpc_checksum = 0;
+ mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
+ mpc->mpc_length);
- nr_ioapics++;
+ return 0;
}
-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
- struct mpc_config_intsrc intsrc;
- int ioapic = -1;
- int pin = -1;
-
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return;
- pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+static int __initdata enable_update_mptable;
- /*
- * TBD: This check is for faulty timer entries, where the override
- * erroneously sets the trigger to level, resulting in a HUGE
- * increase of timer interrupts!
- */
- if ((bus_irq == 0) && (trigger == 3))
- trigger = 1;
+static int __init update_mptable_setup(char *str)
+{
+ enable_update_mptable = 1;
+ return 0;
+}
+early_param("update_mptable", update_mptable_setup);
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_irqflag = (trigger << 2) | polarity;
- intsrc.mpc_srcbus = MP_ISA_BUS;
- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
- intsrc.mpc_dstirq = pin; /* INTIN# */
+static unsigned long __initdata mpc_new_phys;
+static unsigned long mpc_new_length __initdata = 4096;
- MP_intsrc_info(&intsrc);
+/* alloc_mptable or alloc_mptable=4k */
+static int __initdata alloc_mptable;
+static int __init parse_alloc_mptable_opt(char *p)
+{
+ enable_update_mptable = 1;
+ alloc_mptable = 1;
+ if (!p)
+ return 0;
+ mpc_new_length = memparse(p, &p);
+ return 0;
}
+early_param("alloc_mptable", parse_alloc_mptable_opt);
-void __init mp_config_acpi_legacy_irqs(void)
+void __init early_reserve_e820_mpc_new(void)
{
- struct mpc_config_intsrc intsrc;
- int i = 0;
- int ioapic = -1;
-
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
- /*
- * Fabricate the legacy ISA bus (bus #31).
- */
- mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
-#endif
- set_bit(MP_ISA_BUS, mp_bus_not_pci);
- Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
-
- /*
- * Older generations of ES7000 have no legacy identity mappings
- */
- if (es7000_plat == 1)
- return;
-
- /*
- * Locate the IOAPIC that manages the ISA IRQs (0-15).
- */
- ioapic = mp_find_ioapic(0);
- if (ioapic < 0)
- return;
-
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* Conforming */
- intsrc.mpc_srcbus = MP_ISA_BUS;
-#ifdef CONFIG_X86_IO_APIC
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+ if (enable_update_mptable && alloc_mptable) {
+ u64 startt = 0;
+#ifdef CONFIG_X86_TRAMPOLINE
+ startt = TRAMPOLINE_BASE;
#endif
- /*
- * Use the default configuration for the IRQs 0-15. Unless
- * overridden by (MADT) interrupt source override entries.
- */
- for (i = 0; i < 16; i++) {
- int idx;
-
- for (idx = 0; idx < mp_irq_entries; idx++) {
- struct mpc_config_intsrc *irq = mp_irqs + idx;
-
- /* Do we already have a mapping for this ISA IRQ? */
- if (irq->mpc_srcbus == MP_ISA_BUS
- && irq->mpc_srcbusirq == i)
- break;
-
- /* Do we already have a mapping for this IOAPIC pin */
- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
- (irq->mpc_dstirq == i))
- break;
- }
-
- if (idx != mp_irq_entries) {
- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
- continue; /* IRQ already used */
- }
-
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_srcbusirq = i; /* Identity mapped */
- intsrc.mpc_dstirq = i;
-
- MP_intsrc_info(&intsrc);
+ mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
}
}
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+static int __init update_mp_table(void)
{
- int ioapic;
- int ioapic_pin;
-#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
-#define MAX_GSI_NUM 4096
-#define IRQ_COMPRESSION_START 64
+ char str[16];
+ char oem[10];
+ struct intel_mp_floating *mpf;
+ struct mp_config_table *mpc;
+ struct mp_config_table *mpc_new;
+
+ if (!enable_update_mptable)
+ return 0;
+
+ mpf = mpf_found;
+ if (!mpf)
+ return 0;
- static int pci_irq = IRQ_COMPRESSION_START;
/*
- * Mapping between Global System Interrupts, which
- * represent all possible interrupts, and IRQs
- * assigned to actual devices.
+ * Now see if we need to go further.
*/
- static int gsi_to_irq[MAX_GSI_NUM];
-#else
+ if (mpf->mpf_feature1 != 0)
+ return 0;
- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
- return gsi;
-#endif
+ if (!mpf->mpf_physptr)
+ return 0;
- /* Don't set up the ACPI SCI because it's already set up */
- if (acpi_gbl_FADT.sci_interrupt == gsi)
- return gsi;
+ mpc = _bus_to_virt(mpf->mpf_physptr);
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0) {
- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
- return gsi;
- }
+ if (!smp_check_mpc(mpc, oem, str))
+ return 0;
- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+ printk(KERN_INFO "mpf: %lx\n", (long)arbitrary_virt_to_machine(mpf));
+ printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
-#ifndef CONFIG_X86_32
- if (ioapic_renumber_irq)
- gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
+ if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
+ mpc_new_phys = 0;
+ printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
+ mpc_new_length);
+ }
+
+ if (!mpc_new_phys) {
+ unsigned char old, new;
+ /* check if we can change the position */
+ mpc->mpc_checksum = 0;
+ old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+ mpc->mpc_checksum = 0xff;
+ new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+ if (old == new) {
+ printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
+ return 0;
+ }
+ printk(KERN_INFO "use in-position replacing\n");
+ } else {
+ maddr_t mpc_new_bus;
- /*
- * Avoid pin reprogramming. PRTs typically include entries
- * with redundant pin->gsi mappings (but unique PCI devices);
- * we only program the IOAPIC on the first.
- */
- if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
- printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
- ioapic_pin);
- return gsi;
- }
- if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
- return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
- return gsi;
-#endif
+ mpc_new_bus = phys_to_machine(mpc_new_phys);
+ mpf->mpf_physptr = mpc_new_bus;
+ mpc_new = phys_to_virt(mpc_new_phys);
+ memcpy(mpc_new, mpc, mpc->mpc_length);
+ mpc = mpc_new;
+ /* check if we can modify that */
+ if (mpc_new_bus - mpf->mpf_physptr) {
+ struct intel_mp_floating *mpf_new;
+ /* steal 16 bytes from [0, 1k) */
+ printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
+ mpf_new = isa_bus_to_virt(0x400 - 16);
+ memcpy(mpf_new, mpf, 16);
+ mpf = mpf_new;
+ mpf->mpf_physptr = mpc_new_bus;
+ }
+ mpf->mpf_checksum = 0;
+ mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
+ printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
}
- set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
/*
- * For GSI >= 64, use IRQ compression
+ * only replace the one with mp_INT and
+ * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
+ * already in mp_irqs, stored by ... and mp_config_acpi_gsi,
+ * may need pci=routeirq for all coverage
*/
- if ((gsi >= IRQ_COMPRESSION_START)
- && (triggering == ACPI_LEVEL_SENSITIVE)) {
- /*
- * For PCI devices assign IRQs in order, avoiding gaps
- * due to unused I/O APIC pins.
- */
- int irq = gsi;
- if (gsi < MAX_GSI_NUM) {
- /*
- * Retain the VIA chipset work-around (gsi > 15), but
- * avoid a problem where the 8254 timer (IRQ0) is setup
- * via an override (so it's not on pin 0 of the ioapic),
- * and at the same time, the pin 0 interrupt is a PCI
- * type. The gsi > 15 test could cause these two pins
- * to be shared as IRQ0, and they are not shareable.
- * So test for this condition, and if necessary, avoid
- * the pin collision.
- */
- gsi = pci_irq++;
- /*
- * Don't assign IRQ used by ACPI SCI
- */
- if (gsi == acpi_gbl_FADT.sci_interrupt)
- gsi = pci_irq++;
- gsi_to_irq[irq] = gsi;
- } else {
- printk(KERN_ERR "GSI %u is too high\n", gsi);
- return gsi;
- }
- }
-#endif
- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- return gsi;
+ replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+
+ return 0;
}
-#endif /* CONFIG_X86_IO_APIC */
-#endif /* CONFIG_ACPI */
+late_initcall(update_mp_table);
--- head-2011-03-11.orig/arch/x86/kernel/pci-dma-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/pci-dma-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -5,13 +5,13 @@
#include <asm/proto.h>
#include <asm/dma.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
#include <asm/calgary.h>
+#include <asm/amd_iommu.h>
-int forbid_dac __read_mostly;
-EXPORT_SYMBOL(forbid_dac);
+static int forbid_dac __read_mostly;
-const struct dma_mapping_ops *dma_ops;
+struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
static int iommu_sac_force __read_mostly;
@@ -74,13 +74,17 @@ early_param("dma32_size", parse_dma32_si
void __init dma32_reserve_bootmem(void)
{
unsigned long size, align;
- if (end_pfn <= MAX_DMA32_PFN)
+ if (max_pfn <= MAX_DMA32_PFN)
return;
+ /*
+ * see aperture_64.c allocate_aperture() for the reason why
+ * 512M is used as the goal
+ */
align = 64ULL<<20;
size = round_up(dma32_bootmem_size, align);
dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
- __pa(MAX_DMA_ADDRESS));
+ 512ULL<<20);
if (dma32_bootmem_ptr)
dma32_bootmem_size = size;
else
@@ -88,17 +92,14 @@ void __init dma32_reserve_bootmem(void)
}
static void __init dma32_free_bootmem(void)
{
- int node;
- if (end_pfn <= MAX_DMA32_PFN)
+ if (max_pfn <= MAX_DMA32_PFN)
return;
if (!dma32_bootmem_ptr)
return;
- for_each_online_node(node)
- free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
- dma32_bootmem_size);
+ free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
dma32_bootmem_ptr = NULL;
dma32_bootmem_size = 0;
@@ -107,7 +108,7 @@ static void __init dma32_free_bootmem(vo
#define dma32_free_bootmem() ((void)0)
#endif
-static const struct dma_mapping_ops swiotlb_dma_ops = {
+static struct dma_mapping_ops swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
.map_single = swiotlb_map_single_phys,
.unmap_single = swiotlb_unmap_single,
@@ -130,25 +131,31 @@ void __init pci_iommu_alloc(void)
* The order of these functions is important for
* fall-back/fail-over reasons
*/
-#ifdef CONFIG_GART_IOMMU
gart_iommu_hole_init();
-#endif
-#ifdef CONFIG_CALGARY_IOMMU
detect_calgary();
-#endif
detect_intel_iommu();
-#ifdef CONFIG_SWIOTLB
+ amd_iommu_detect();
+
swiotlb_init();
if (swiotlb) {
printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
dma_ops = &swiotlb_dma_ops;
}
-#endif
}
+#ifndef CONFIG_XEN
+unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
+{
+ unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
+
+ return size >> PAGE_SHIFT;
+}
+EXPORT_SYMBOL(iommu_num_pages);
+#endif
+
/*
* See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
* documentation.
@@ -201,9 +208,7 @@ static __init int iommu_setup(char *p)
swiotlb = 1;
#endif
-#ifdef CONFIG_GART_IOMMU
gart_parse_options(p);
-#endif
#ifdef CONFIG_CALGARY_IOMMU
if (!strncmp(p, "calgary", 7))
@@ -245,136 +250,19 @@ int range_straddles_page_boundary(paddr_
!check_pages_physically_contiguous(pfn, offset, size));
}
-#ifdef CONFIG_X86_32
-int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
- dma_addr_t device_addr, size_t size, int flags)
-{
- void __iomem *mem_base = NULL;
- int pages = size >> PAGE_SHIFT;
- int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
-
- if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
- goto out;
- if (!size)
- goto out;
- if (dev->dma_mem)
- goto out;
-
- /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
-
- mem_base = ioremap(bus_addr, size);
- if (!mem_base)
- goto out;
-
- dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
- if (!dev->dma_mem)
- goto out;
- dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!dev->dma_mem->bitmap)
- goto free1_out;
-
- dev->dma_mem->virt_base = mem_base;
- dev->dma_mem->device_base = device_addr;
- dev->dma_mem->size = pages;
- dev->dma_mem->flags = flags;
-
- if (flags & DMA_MEMORY_MAP)
- return DMA_MEMORY_MAP;
-
- return DMA_MEMORY_IO;
-
- free1_out:
- kfree(dev->dma_mem);
- out:
- if (mem_base)
- iounmap(mem_base);
- return 0;
-}
-EXPORT_SYMBOL(dma_declare_coherent_memory);
-
-void dma_release_declared_memory(struct device *dev)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
-
- if (!mem)
- return;
- dev->dma_mem = NULL;
- iounmap(mem->virt_base);
- kfree(mem->bitmap);
- kfree(mem);
-}
-EXPORT_SYMBOL(dma_release_declared_memory);
-
-void *dma_mark_declared_memory_occupied(struct device *dev,
- dma_addr_t device_addr, size_t size)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
- int pos, err;
- int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
-
- pages >>= PAGE_SHIFT;
-
- if (!mem)
- return ERR_PTR(-EINVAL);
-
- pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
- err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
- if (err != 0)
- return ERR_PTR(err);
- return mem->virt_base + (pos << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
-
-static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
- dma_addr_t *dma_handle, void **ret)
-{
- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
- int order = get_order(size);
-
- if (mem) {
- int page = bitmap_find_free_region(mem->bitmap, mem->size,
- order);
- if (page >= 0) {
- *dma_handle = mem->device_base + (page << PAGE_SHIFT);
- *ret = mem->virt_base + (page << PAGE_SHIFT);
- memset(*ret, 0, size);
- }
- if (mem->flags & DMA_MEMORY_EXCLUSIVE)
- *ret = NULL;
- }
- return (mem != NULL);
-}
-
-static int dma_release_coherent(struct device *dev, int order, void *vaddr)
-{
- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-
- if (mem && vaddr >= mem->virt_base && vaddr <
- (mem->virt_base + (mem->size << PAGE_SHIFT))) {
- int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
-
- bitmap_release_region(mem->bitmap, page, order);
- return 1;
- }
- return 0;
-}
-#else
-#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
-#define dma_release_coherent(dev, order, vaddr) (0)
-#endif /* CONFIG_X86_32 */
-
int dma_supported(struct device *dev, u64 mask)
{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
#ifdef CONFIG_PCI
if (mask > 0xffffffff && forbid_dac > 0) {
- printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
- dev->bus_id);
+ dev_info(dev, "PCI: Disallowing DAC for device\n");
return 0;
}
#endif
- if (dma_ops->dma_supported)
- return dma_ops->dma_supported(dev, mask);
+ if (ops->dma_supported)
+ return ops->dma_supported(dev, mask);
/* Copied from i386. Doesn't make much sense, because it will
only work for pci_alloc_coherent.
@@ -395,8 +283,7 @@ int dma_supported(struct device *dev, u6
type. Normally this doesn't make any difference, but gives
more gentle handling of IOMMU overflow. */
if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
- printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
- dev->bus_id, mask);
+ dev_info(dev, "Force SAC with mask %Lx\n", mask);
return 0;
}
@@ -422,6 +309,9 @@ void *
dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp)
{
+#ifndef CONFIG_XEN
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+#endif
void *memory = NULL;
struct page *page;
unsigned long dma_mask = 0;
@@ -431,7 +321,7 @@ dma_alloc_coherent(struct device *dev, s
/* ignore region specifiers */
gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
- if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
+ if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
return memory;
if (!dev) {
@@ -491,8 +381,8 @@ dma_alloc_coherent(struct device *dev, s
/* Let low level make its own zone decisions */
gfp &= ~(GFP_DMA32|GFP_DMA);
- if (dma_ops->alloc_coherent)
- return dma_ops->alloc_coherent(dev, size,
+ if (ops->alloc_coherent)
+ return ops->alloc_coherent(dev, size,
dma_handle, gfp);
return NULL;
}
@@ -504,14 +394,14 @@ dma_alloc_coherent(struct device *dev, s
}
}
- if (dma_ops->alloc_coherent) {
+ if (ops->alloc_coherent) {
free_pages((unsigned long)memory, order);
gfp &= ~(GFP_DMA|GFP_DMA32);
- return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
+ return ops->alloc_coherent(dev, size, dma_handle, gfp);
}
- if (dma_ops->map_simple) {
- *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
+ if (ops->map_simple) {
+ *dma_handle = ops->map_simple(dev, virt_to_bus(memory),
size,
PCI_DMA_BIDIRECTIONAL);
if (*dma_handle != bad_dma_address)
@@ -542,13 +432,17 @@ EXPORT_SYMBOL(dma_alloc_coherent);
void dma_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t bus)
{
+#ifndef CONFIG_XEN
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+#endif
+
int order = get_order(size);
WARN_ON(irqs_disabled()); /* for portability */
- if (dma_release_coherent(dev, order, vaddr))
+ if (dma_release_from_coherent(dev, order, vaddr))
return;
#ifndef CONFIG_XEN
- if (dma_ops->unmap_single)
- dma_ops->unmap_single(dev, bus, size, 0);
+ if (ops->unmap_single)
+ ops->unmap_single(dev, bus, size, 0);
#endif
xen_destroy_contiguous_region((unsigned long)vaddr, order);
free_pages((unsigned long)vaddr, order);
@@ -557,15 +451,13 @@ EXPORT_SYMBOL(dma_free_coherent);
static int __init pci_iommu_init(void)
{
-#ifdef CONFIG_CALGARY_IOMMU
calgary_iommu_init();
-#endif
intel_iommu_init();
-#ifdef CONFIG_GART_IOMMU
+ amd_iommu_init();
+
gart_iommu_init();
-#endif
no_iommu_init();
return 0;
--- head-2011-03-11.orig/arch/x86/kernel/pci-nommu-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/pci-nommu-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -84,18 +84,12 @@ static int nommu_dma_supported(struct de
return 1;
}
-static int nommu_mapping_error(dma_addr_t dma_addr)
-{
- return (dma_addr == bad_dma_address);
-}
-
-static const struct dma_mapping_ops nommu_dma_ops = {
+static struct dma_mapping_ops nommu_dma_ops = {
.map_single = gnttab_map_single,
.unmap_single = gnttab_unmap_single,
.map_sg = gnttab_map_sg,
.unmap_sg = gnttab_unmap_sg,
.dma_supported = nommu_dma_supported,
- .mapping_error = nommu_mapping_error
};
void __init no_iommu_init(void)
--- head-2011-03-11.orig/arch/x86/kernel/probe_roms_32.c 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/probe_roms_32.c 2011-02-01 14:38:38.000000000 +0100
@@ -131,7 +131,7 @@ void __init probe_roms(void)
upper = system_rom_resource.start;
/* check for extension rom (ignore length byte!) */
- rom = isa_bus_to_virt(extension_rom_resource.start);
+ rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
if (romsignature(rom)) {
length = extension_rom_resource.end - extension_rom_resource.start + 1;
if (romchecksum(rom, length)) {
--- head-2011-03-11.orig/arch/x86/kernel/process-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/process-xen.c 2011-03-03 15:59:49.000000000 +0100
@@ -6,6 +6,13 @@
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
+#include <linux/clockchips.h>
+#include <asm/system.h>
+
+unsigned long idle_halt;
+EXPORT_SYMBOL(idle_halt);
+unsigned long idle_nomwait;
+EXPORT_SYMBOL(idle_nomwait);
struct kmem_cache *task_xstate_cachep;
@@ -45,6 +52,41 @@ void arch_task_cache_init(void)
SLAB_PANIC, NULL);
}
+/*
+ * Idle related variables and functions
+ */
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+/*
+ * Power management idle function, if any.
+ */
+void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
+
+/*
+ * We use this if we don't have any better
+ * idle routine..
+ */
+void xen_idle(void)
+{
+ current_thread_info()->status &= ~TS_POLLING;
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
+
+ if (!need_resched())
+ safe_halt(); /* enables interrupts racelessly */
+ else
+ local_irq_enable();
+ current_thread_info()->status |= TS_POLLING;
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
+
static void do_nothing(void *unused)
{
}
@@ -61,7 +103,7 @@ void cpu_idle_wait(void)
{
smp_mb();
/* kick all the CPUs so that they exit out of pm_idle */
- smp_call_function(do_nothing, NULL, 0, 1);
+ smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
@@ -125,60 +167,175 @@ static void poll_idle(void)
*
* idle=mwait overrides this decision and forces the usage of mwait.
*/
+static int __cpuinitdata force_mwait;
+
+#define MWAIT_INFO 0x05
+#define MWAIT_ECX_EXTENDED_INFO 0x01
+#define MWAIT_EDX_C1 0xf0
+
static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
{
+ u32 eax, ebx, ecx, edx;
+
if (force_mwait)
return 1;
- if (c->x86_vendor == X86_VENDOR_AMD) {
- switch(c->x86) {
- case 0x10:
- case 0x11:
- return 0;
+ if (c->cpuid_level < MWAIT_INFO)
+ return 0;
+
+ cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
+ /* Check whether EDX has extended info about MWAIT */
+ if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
+ return 1;
+
+ /*
+ * edx enumerates MONITOR/MWAIT extensions. Check whether
+ * C1 supports MWAIT
+ */
+ return (edx & MWAIT_EDX_C1);
+}
+
+/*
+ * Check for AMD CPUs, which have potentially C1E support
+ * Check for AMD CPUs, which potentially have C1E support
+static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor != X86_VENDOR_AMD)
+ return 0;
+
+ if (c->x86 < 0x0F)
+ return 0;
+
+ /* Family 0x0f models < rev F do not have C1E */
+ if (c->x86 == 0x0f && c->x86_model < 0x40)
+ return 0;
+
+ return 1;
+}
+
+static cpumask_t c1e_mask = CPU_MASK_NONE;
+static int c1e_detected;
+
+void c1e_remove_cpu(int cpu)
+{
+ cpu_clear(cpu, c1e_mask);
+}
+
+/*
+ * C1E aware idle routine. We check for C1E active in the interrupt
+ * pending message MSR. If we detect C1E, then we handle it the same
+ * way as C3 power states (local apic timer and TSC stop)
+ */
+static void c1e_idle(void)
+{
+ if (need_resched())
+ return;
+
+ if (!c1e_detected) {
+ u32 lo, hi;
+
+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+ if (lo & K8_INTP_C1E_ACTIVE_MASK) {
+ c1e_detected = 1;
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ mark_tsc_unstable("TSC halt in AMD C1E");
+ printk(KERN_INFO "System has AMD C1E enabled\n");
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
}
}
- return 1;
+
+ if (c1e_detected) {
+ int cpu = smp_processor_id();
+
+ if (!cpu_isset(cpu, c1e_mask)) {
+ cpu_set(cpu, c1e_mask);
+ /*
+ * Force broadcast so ACPI cannot interfere. Needs
+ * to run with interrupts enabled as it uses
+ * smp_call_function().
+ */
+ local_irq_enable();
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
+ &cpu);
+ printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
+ cpu);
+ local_irq_disable();
+ }
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+
+ default_idle();
+
+ /*
+ * The switch back from broadcast mode needs to be
+ * called with interrupts disabled.
+ */
+ local_irq_disable();
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+ local_irq_enable();
+ } else
+ default_idle();
}
#endif
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifndef CONFIG_XEN
- static int selected;
-
- if (selected)
- return;
#ifdef CONFIG_X86_SMP
if (pm_idle == poll_idle && smp_num_siblings > 1) {
printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
" performance may degrade.\n");
}
#endif
+ if (pm_idle)
+ return;
+
if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
/*
- * Skip, if setup has overridden idle.
* One CPU supports mwait => All CPUs supports mwait
*/
- if (!pm_idle) {
- printk(KERN_INFO "using mwait in idle threads.\n");
- pm_idle = mwait_idle;
- }
- }
- selected = 1;
+ printk(KERN_INFO "using mwait in idle threads.\n");
+ pm_idle = mwait_idle;
+ } else if (check_c1e_idle(c)) {
+ printk(KERN_INFO "using C1E aware idle routine\n");
+ pm_idle = c1e_idle;
+ } else
+ pm_idle = default_idle;
#endif
}
static int __init idle_setup(char *str)
{
+ if (!str)
+ return -EINVAL;
+
if (!strcmp(str, "poll")) {
printk("using polling idle threads.\n");
pm_idle = poll_idle;
- }
#ifndef CONFIG_XEN
- else if (!strcmp(str, "mwait"))
+ } else if (!strcmp(str, "mwait"))
force_mwait = 1;
+ else if (!strcmp(str, "halt")) {
+ /*
+ * When the boot option of idle=halt is added, halt is
+ * When the boot option idle=halt is given, halt is
+ * forced to be used for CPU idle. In that case the CPU C2/C3
+ * states won't be entered again.
+ * So that the CPU idle driver can still be loaded, don't touch
+ * boot_option_idle_override.
+ pm_idle = default_idle;
+ idle_halt = 1;
+ return 0;
+ } else if (!strcmp(str, "nomwait")) {
+ /*
+ * If the boot option "idle=nomwait" is given,
+ * mwait is disabled for the CPU C2/C3
+ * states. In that case the variable
+ * boot_option_idle_override is left untouched.
+ */
+ idle_nomwait = 1;
+ return 0;
#endif
- else
+ } else
return -1;
boot_option_idle_override = 1;
--- head-2011-03-11.orig/arch/x86/kernel/process_32-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/process_32-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -59,15 +59,11 @@
#include <asm/tlbflush.h>
#include <asm/cpu.h>
#include <asm/kdebug.h>
+#include <asm/idle.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
-static int hlt_counter;
-
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
@@ -82,46 +78,27 @@ unsigned long thread_saved_pc(struct tas
return ((unsigned long *)tsk->thread.sp)[3];
}
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
+#ifdef CONFIG_HOTPLUG_CPU
+#ifndef CONFIG_XEN
+#include <asm/nmi.h>
-void disable_hlt(void)
+static void cpu_exit_clear(void)
{
- hlt_counter++;
-}
+ int cpu = raw_smp_processor_id();
-EXPORT_SYMBOL(disable_hlt);
-
-void enable_hlt(void)
-{
- hlt_counter--;
-}
+ idle_task_exit();
-EXPORT_SYMBOL(enable_hlt);
+ cpu_uninit();
+ irq_ctx_exit(cpu);
-static void xen_idle(void)
-{
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
+ cpu_clear(cpu, cpu_callout_map);
+ cpu_clear(cpu, cpu_callin_map);
- if (!need_resched())
- safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
+ numa_remove_cpu(cpu);
+ c1e_remove_cpu(cpu);
}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(default_idle);
#endif
-#ifdef CONFIG_HOTPLUG_CPU
static inline void play_dead(void)
{
idle_task_exit();
@@ -152,13 +129,11 @@ void cpu_idle(void)
/* endless idle loop with no priority at all */
while (1) {
- tick_nohz_stop_sched_tick();
+ tick_nohz_stop_sched_tick(1);
while (!need_resched()) {
- void (*idle)(void);
check_pgt_cache();
rmb();
- idle = xen_idle; /* no alternatives */
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, 0);
@@ -168,7 +143,10 @@ void cpu_idle(void)
local_irq_disable();
__get_cpu_var(irq_stat).idle_timestamp = jiffies;
- idle();
+ /* Don't trace irqs off for idle */
+ stop_critical_timings();
+ xen_idle();
+ start_critical_timings();
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
--- head-2011-03-11.orig/arch/x86/kernel/process_64-xen.c 2011-02-02 08:32:46.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/process_64-xen.c 2011-02-02 08:34:01.000000000 +0100
@@ -64,15 +64,6 @@ asmlinkage extern void ret_from_fork(voi
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
-
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
@@ -102,25 +93,13 @@ void exit_idle(void)
__exit_idle();
}
-static void xen_idle(void)
-{
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
- if (!need_resched())
- safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
-}
-
#ifdef CONFIG_HOTPLUG_CPU
static inline void play_dead(void)
{
idle_task_exit();
+#ifndef CONFIG_XEN
+ c1e_remove_cpu(raw_smp_processor_id());
+#endif
local_irq_disable();
cpu_clear(smp_processor_id(), cpu_initialized);
preempt_enable_no_resched();
@@ -145,12 +124,11 @@ void cpu_idle(void)
current_thread_info()->status |= TS_POLLING;
/* endless idle loop with no priority at all */
while (1) {
- tick_nohz_stop_sched_tick();
+ tick_nohz_stop_sched_tick(1);
while (!need_resched()) {
- void (*idle)(void);
rmb();
- idle = xen_idle; /* no alternatives */
+
if (cpu_is_offline(smp_processor_id()))
play_dead();
/*
@@ -160,7 +138,10 @@ void cpu_idle(void)
*/
local_irq_disable();
enter_idle();
- idle();
+ /* Don't trace irqs off for idle */
+ stop_critical_timings();
+ xen_idle();
+ start_critical_timings();
/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
loops can be woken up without interrupt. */
@@ -270,7 +251,7 @@ void exit_thread(void)
}
}
-void load_gs_index(unsigned gs)
+void xen_load_gs_index(unsigned gs)
{
WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
}
@@ -371,10 +352,10 @@ int copy_thread(int nr, unsigned long cl
p->thread.fs = me->thread.fs;
p->thread.gs = me->thread.gs;
- asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
- asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
- asm("mov %%es,%0" : "=m" (p->thread.es));
- asm("mov %%ds,%0" : "=m" (p->thread.ds));
+ savesegment(gs, p->thread.gsindex);
+ savesegment(fs, p->thread.fsindex);
+ savesegment(es, p->thread.es);
+ savesegment(ds, p->thread.ds);
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
@@ -415,7 +396,9 @@ out:
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
+ loadsegment(fs, 0);
+ loadsegment(es, 0);
+ loadsegment(ds, 0);
load_gs_index(0);
regs->ip = new_ip;
regs->sp = new_sp;
@@ -554,8 +537,8 @@ static inline void __switch_to_xtra(stru
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
- struct thread_struct *prev = &prev_p->thread,
- *next = &next_p->thread;
+ struct thread_struct *prev = &prev_p->thread;
+ struct thread_struct *next = &next_p->thread;
int cpu = smp_processor_id();
#ifndef CONFIG_X86_NO_TSS
struct tss_struct *tss = &per_cpu(init_tss, cpu);
@@ -658,12 +641,25 @@ __switch_to(struct task_struct *prev_p,
*/
if (unlikely(next->es))
loadsegment(es, next->es);
-
+
if (unlikely(next->ds))
loadsegment(ds, next->ds);
+ /*
+ * Leave lazy mode, flushing any hypercalls made here.
+ * This must be done before restoring TLS segments so
+ * the GDT and LDT are properly updated, and must be
+ * done before math_state_restore, so the TS bit is up
+ * to date.
+ */
+ arch_leave_lazy_cpu_mode();
+
/*
* Switch FS and GS.
+ *
+ * Segment register != 0 always requires a reload. Also
+ * reload when it has changed. When prev process used 64bit
+ * base always reload to avoid an information leak.
*/
if (unlikely(next->fsindex))
loadsegment(fs, next->fsindex);
@@ -682,7 +678,8 @@ __switch_to(struct task_struct *prev_p,
*/
write_pda(pcurrent, next_p);
write_pda(kernelstack,
- (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
write_pda(stack_canary, next_p->stack_canary);
@@ -843,7 +840,7 @@ long do_arch_prctl(struct task_struct *t
set_32bit_tls(task, FS_TLS, addr);
if (doit) {
load_TLS(&task->thread, cpu);
- asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
+ loadsegment(fs, FS_TLS_SEL);
}
task->thread.fsindex = FS_TLS_SEL;
task->thread.fs = 0;
@@ -853,7 +850,7 @@ long do_arch_prctl(struct task_struct *t
if (doit) {
/* set the selector to 0 to not confuse
__switch_to */
- asm volatile("movl %0,%%fs" :: "r" (0));
+ loadsegment(fs, 0);
ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
addr);
}
@@ -877,7 +874,7 @@ long do_arch_prctl(struct task_struct *t
if (task->thread.gsindex == GS_TLS_SEL)
base = read_32bit_tls(task, GS_TLS);
else if (doit) {
- asm("movl %%gs,%0" : "=r" (gsindex));
+ savesegment(gs, gsindex);
if (gsindex)
rdmsrl(MSR_KERNEL_GS_BASE, base);
else
--- head-2011-03-11.orig/arch/x86/kernel/setup-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/setup-xen.c 2011-03-04 15:09:03.000000000 +0100
@@ -1,143 +1,1132 @@
-#include <linux/kernel.h>
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ * Memory region support
+ * David Parsons <orc@pell.chi.il.us>, July-August 1999
+ *
+ * Added E820 sanitization routine (removes overlapping memory regions);
+ * Brian Moyle <bmoyle@mvista.com>, February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ * Patrick Mochel <mochel@osdl.org>, March 2002
+ *
+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach <xela@slit.de>, December 2002.
+ *
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/screen_info.h>
+#include <linux/ioport.h>
+#include <linux/acpi.h>
+#include <linux/apm_bios.h>
+#include <linux/initrd.h>
+#include <linux/bootmem.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/mca.h>
+#include <linux/root_dev.h>
+#include <linux/highmem.h>
#include <linux/module.h>
+#include <linux/efi.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
+#include <linux/nodemask.h>
+#include <linux/kexec.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+#include <linux/pci.h>
+#include <asm/pci-direct.h>
+#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/delay.h>
+
+#include <linux/kallsyms.h>
+#include <linux/cpufreq.h>
+#include <linux/dma-mapping.h>
+#include <linux/ctype.h>
+#include <linux/uaccess.h>
+
#include <linux/percpu.h>
-#include <asm/smp.h>
-#include <asm/percpu.h>
+#include <linux/crash_dump.h>
+
+#include <video/edid.h>
+
+#include <asm/mtrr.h>
+#include <asm/apic.h>
+#include <asm/e820.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/efi.h>
#include <asm/sections.h>
+#include <asm/dmi.h>
+#include <asm/io_apic.h>
+#include <asm/ist.h>
+#include <asm/vmi.h>
+#include <setup_arch.h>
+#include <asm/bios_ebda.h>
+#include <asm/cacheflush.h>
#include <asm/processor.h>
-#include <asm/setup.h>
+#include <asm/bugs.h>
+
+#include <asm/system.h>
+#include <asm/vsyscall.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/dma.h>
+#include <asm/iommu.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+
+#include <mach_apic.h>
+#include <asm/paravirt.h>
+
+#include <asm/percpu.h>
#include <asm/topology.h>
-#include <asm/mpspec.h>
#include <asm/apicdef.h>
+#ifdef CONFIG_X86_64
+#include <asm/numa_64.h>
+#endif
+
+#ifdef CONFIG_XEN
+#include <asm/hypervisor.h>
+#include <xen/interface/kexec.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/nmi.h>
+#include <xen/interface/physdev.h>
+#include <xen/features.h>
+#include <xen/firmware.h>
+#include <xen/xencons.h>
+
+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+EXPORT_SYMBOL(HYPERVISOR_shared_info);
+
+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xen_panic_block = {
+ xen_panic_event, NULL, 0 /* try to go last */
+};
+
+unsigned long *phys_to_machine_mapping;
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+unsigned long *pfn_to_mfn_frame_list_list,
+#ifdef CONFIG_X86_64
+ *pfn_to_mfn_frame_list[512];
+#else
+ *pfn_to_mfn_frame_list[128];
+#endif
+
+/* Raw start-of-day parameters from the hypervisor. */
+start_info_t *xen_start_info;
+EXPORT_SYMBOL(xen_start_info);
+#endif
+
+#ifndef ARCH_SETUP
+#define ARCH_SETUP
+#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-unsigned int num_processors;
-unsigned disabled_cpus __cpuinitdata;
#ifndef CONFIG_XEN
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
-EXPORT_SYMBOL(boot_cpu_physical_apicid);
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif
+#endif
+
+/*
+ * Machine setup..
+ */
+static struct resource data_resource = {
+ .name = "Kernel data",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource code_resource = {
+ .name = "Kernel code",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource bss_resource = {
+ .name = "Kernel bss",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
-DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map;
+#ifdef CONFIG_X86_32
+#ifndef CONFIG_XEN
+/* This value is set up by the early boot code to point to the value
+ immediately after the boot time page tables. It contains a *physical*
+ address, and must not be in the .bss segment! */
+unsigned long init_pg_tables_start __initdata = ~0UL;
+unsigned long init_pg_tables_end __initdata = ~0UL;
#endif
+
+static struct resource video_ram_resource = {
+ .name = "Video RAM area",
+ .start = 0xa0000,
+ .end = 0xbffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+/* cpu data as detected by the assembly code in head.S */
+struct cpuinfo_x86 new_cpu_data __cpuinitdata = { .wp_works_ok = 1, .hard_math = 1 };
+/* common cpu data for all cpus */
+struct cpuinfo_x86 boot_cpu_data __read_mostly = { .wp_works_ok = 1, .hard_math = 1 };
+EXPORT_SYMBOL(boot_cpu_data);
+#ifndef CONFIG_XEN
+static void set_mca_bus(int x)
+{
+#ifdef CONFIG_MCA
+ MCA_bus = x;
+#endif
+}
+
+unsigned int def_to_bigsmp;
+
+/* for MCA, but anyone else can use it if they want */
+unsigned int machine_id;
+unsigned int machine_submodel_id;
+unsigned int BIOS_revision;
+
+struct apm_info apm_info;
+EXPORT_SYMBOL(apm_info);
+#endif
+
+#if defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+struct ist_info ist_info;
+EXPORT_SYMBOL(ist_info);
+#elif defined(CONFIG_X86_SPEEDSTEP_SMI)
+struct ist_info ist_info;
#endif
-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
+#else
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
+#endif
+
+
+#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+unsigned long mmu_cr4_features;
+#else
+unsigned long mmu_cr4_features = X86_CR4_PAE;
+#endif
+
+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
+int bootloader_type;
+
/*
- * Copy data used in early init routines from the initial arrays to the
- * per cpu data areas. These arrays then become expendable and the
- * *_early_ptr's are zeroed indicating that the static arrays are gone.
+ * Early DMI memory
+ */
+int dmi_alloc_index;
+char dmi_alloc_data[DMI_MAX_DATA];
+
+/*
+ * Setup options
+ */
+struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
+struct edid_info edid_info;
+EXPORT_SYMBOL_GPL(edid_info);
+
+extern int root_mountflags;
+
+unsigned long saved_video_mode;
+
+#define RAMDISK_IMAGE_START_MASK 0x07FF
+#define RAMDISK_PROMPT_FLAG 0x8000
+#define RAMDISK_LOAD_FLAG 0x4000
+
+static char __initdata command_line[COMMAND_LINE_SIZE];
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+#ifndef CONFIG_XEN
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ * from boot_params into a safe place.
+ *
*/
-static void __init setup_per_cpu_maps(void)
+static inline void copy_edd(void)
+{
+ memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
+ sizeof(edd.mbr_signature));
+ memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
+ edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
+ edd.edd_info_nr = boot_params.eddbuf_entries;
+}
+#endif
+#else
+static inline void copy_edd(void)
+{
+}
+#endif
+
+#ifdef CONFIG_BLK_DEV_INITRD
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+
+#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
+static void __init relocate_initrd(void)
+{
+
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ u64 ramdisk_here;
+ unsigned long slop, clen, mapaddr;
+ char *p, *q;
+
+ /* We need to move the initrd down into lowmem */
+ ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
+ PAGE_SIZE);
+
+ if (ramdisk_here == -1ULL)
+ panic("Cannot find place for new RAMDISK of size %lld\n",
+ ramdisk_size);
+
+ /* Note: this includes all the lowmem currently occupied by
+ the initrd, we rely on that fact to keep the data intact. */
+ reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
+ "NEW RAMDISK");
+ initrd_start = ramdisk_here + PAGE_OFFSET;
+ initrd_end = initrd_start + ramdisk_size;
+ printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
+ ramdisk_here, ramdisk_here + ramdisk_size);
+
+ q = (char *)initrd_start;
+
+ /* Copy any lowmem portion of the initrd */
+ if (ramdisk_image < end_of_lowmem) {
+ clen = end_of_lowmem - ramdisk_image;
+ p = (char *)__va(ramdisk_image);
+ memcpy(q, p, clen);
+ q += clen;
+ ramdisk_image += clen;
+ ramdisk_size -= clen;
+ }
+
+ /* Copy the highmem portion of the initrd */
+ while (ramdisk_size) {
+ slop = ramdisk_image & ~PAGE_MASK;
+ clen = ramdisk_size;
+ if (clen > MAX_MAP_CHUNK-slop)
+ clen = MAX_MAP_CHUNK-slop;
+ mapaddr = ramdisk_image & PAGE_MASK;
+ p = early_ioremap(mapaddr, clen+slop);
+ memcpy(q, p+slop, clen);
+ early_iounmap(p, clen+slop);
+ q += clen;
+ ramdisk_image += clen;
+ ramdisk_size -= clen;
+ }
+	/* high pages are not converted by early_res_to_bootmem */
+ ramdisk_image = boot_params.hdr.ramdisk_image;
+ ramdisk_size = boot_params.hdr.ramdisk_size;
+ printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
+ " %08llx - %08llx\n",
+ ramdisk_image, ramdisk_image + ramdisk_size - 1,
+ ramdisk_here, ramdisk_here + ramdisk_size - 1);
+}
+#endif
+
+static void __init reserve_initrd(void)
{
#ifndef CONFIG_XEN
- int cpu;
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_end = ramdisk_image + ramdisk_size;
+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
- for_each_possible_cpu(cpu) {
- per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
- per_cpu(x86_bios_cpu_apicid, cpu) =
- x86_bios_cpu_apicid_init[cpu];
-#ifdef CONFIG_NUMA
- per_cpu(x86_cpu_to_node_map, cpu) =
- x86_cpu_to_node_map_init[cpu];
+ if (!boot_params.hdr.type_of_loader ||
+ !ramdisk_image || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
+#else
+ unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
+ unsigned long ramdisk_size = xen_start_info->mod_len;
+ unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
+ unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+
+ if (!xen_start_info->mod_start || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
#endif
+
+ initrd_start = 0;
+
+ if (ramdisk_size >= (end_of_lowmem>>1)) {
+ free_early(ramdisk_image, ramdisk_end);
+ printk(KERN_ERR "initrd too large to handle, "
+ "disabling initrd\n");
+ return;
}
- /* indicate the early static arrays will soon be gone */
- x86_cpu_to_apicid_early_ptr = NULL;
- x86_bios_cpu_apicid_early_ptr = NULL;
-#ifdef CONFIG_NUMA
- x86_cpu_to_node_map_early_ptr = NULL;
+ printk(KERN_INFO "RAMDISK: %08lx - %08lx\n", ramdisk_image,
+ ramdisk_end);
+
+
+ if (ramdisk_end <= end_of_lowmem) {
+ /* All in lowmem, easy case */
+ /*
+ * don't need to reserve again, already reserved early
+ * in i386_start_kernel
+ */
+ initrd_start = ramdisk_image + PAGE_OFFSET;
+ initrd_end = initrd_start + ramdisk_size;
+#ifdef CONFIG_X86_64_XEN
+ initrd_below_start_ok = 1;
#endif
+ return;
+ }
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+ relocate_initrd();
+#else
+ printk(KERN_ERR "initrd extends beyond end of memory "
+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+ ramdisk_end, end_of_lowmem);
+ initrd_start = 0;
#endif
+ free_early(ramdisk_image, ramdisk_end);
+}
+#else
+static void __init reserve_initrd(void)
+{
}
+#endif /* CONFIG_BLK_DEV_INITRD */
-#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-cpumask_t *cpumask_of_cpu_map __read_mostly;
-EXPORT_SYMBOL(cpumask_of_cpu_map);
+static void __init parse_setup_data(void)
+{
+#ifndef CONFIG_XEN
+ struct setup_data *data;
+ u64 pa_data;
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, PAGE_SIZE);
+ switch (data->type) {
+ case SETUP_E820_EXT:
+ parse_e820_ext(data, pa_data);
+ break;
+ default:
+ break;
+ }
+ pa_data = data->next;
+ early_iounmap(data, PAGE_SIZE);
+ }
+#endif
+}
-/* requires nr_cpu_ids to be initialized */
-static void __init setup_cpumask_of_cpu(void)
+static void __init e820_reserve_setup_data(void)
{
- int i;
+#ifndef CONFIG_XEN
+ struct setup_data *data;
+ u64 pa_data;
+ int found = 0;
- /* alloc_bootmem zeroes memory */
- cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
- for (i = 0; i < nr_cpu_ids; i++)
- cpu_set(i, cpumask_of_cpu_map[i]);
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, sizeof(*data));
+ e820_update_range(pa_data, sizeof(*data)+data->len,
+ E820_RAM, E820_RESERVED_KERN);
+ found = 1;
+ pa_data = data->next;
+ early_iounmap(data, sizeof(*data));
+ }
+ if (!found)
+ return;
+
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ memcpy(&e820_saved, &e820, sizeof(struct e820map));
+ printk(KERN_INFO "extended physical RAM map:\n");
+ e820_print_map("reserve setup_data");
+#endif
}
-#else
-static inline void setup_cpumask_of_cpu(void) { }
+
+static void __init reserve_early_setup_data(void)
+{
+#ifndef CONFIG_XEN
+ struct setup_data *data;
+ u64 pa_data;
+ char buf[32];
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, sizeof(*data));
+ sprintf(buf, "setup data %x", data->type);
+ reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ pa_data = data->next;
+ early_iounmap(data, sizeof(*data));
+ }
#endif
+}
-#ifdef CONFIG_X86_32
/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
+ * --------- Crashkernel reservation ------------------------------
*/
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(__per_cpu_offset);
+
+#ifdef CONFIG_KEXEC
+
+#ifndef CONFIG_XEN
+/**
+ * Reserve @size bytes of crashkernel memory at any suitable offset.
+ *
+ * @size: Size of the crashkernel memory to reserve.
+ * Returns the base address on success, and -1ULL on failure.
+ */
+unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
+{
+ const unsigned long long alignment = 16<<20; /* 16M */
+ unsigned long long start = 0LL;
+
+ while (1) {
+ int ret;
+
+ start = find_e820_area(start, ULONG_MAX, size, alignment);
+ if (start == -1ULL)
+ return start;
+
+ /* try to reserve it */
+ ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
+ if (ret >= 0)
+ return start;
+
+ start += alignment;
+ }
+}
+
+static inline unsigned long long get_total_mem(void)
+{
+ unsigned long long total;
+
+ total = max_low_pfn - min_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ total += highend_pfn - highstart_pfn;
#endif
+ return total << PAGE_SHIFT;
+}
+
+static void __init reserve_crashkernel(void)
+{
+ unsigned long long total_mem;
+ unsigned long long crash_size, crash_base;
+ int ret;
+
+ total_mem = get_total_mem();
+
+ ret = parse_crashkernel(boot_command_line, total_mem,
+ &crash_size, &crash_base);
+ if (ret != 0 || crash_size <= 0)
+ return;
+
+ /* 0 means: find the address automatically */
+ if (crash_base <= 0) {
+ crash_base = find_and_reserve_crashkernel(crash_size);
+ if (crash_base == -1ULL) {
+ pr_info("crashkernel reservation failed. "
+ "No suitable area found.\n");
+ return;
+ }
+ } else {
+ ret = reserve_bootmem_generic(crash_base, crash_size,
+ BOOTMEM_EXCLUSIVE);
+ if (ret < 0) {
+ pr_info("crashkernel reservation failed - "
+ "memory is in use\n");
+ return;
+ }
+ }
+
+ printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+ "for crashkernel (System RAM: %ldMB)\n",
+ (unsigned long)(crash_size >> 20),
+ (unsigned long)(crash_base >> 20),
+ (unsigned long)(total_mem >> 20));
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+ insert_resource(&iomem_resource, &crashk_res);
+}
+#else
+#define reserve_crashkernel xen_machine_kexec_setup_resources
+#endif
+#else
+static void __init reserve_crashkernel(void)
+{
+}
+#endif
+
+static struct resource standard_io_resources[] = {
+ { .name = "dma1", .start = 0x00, .end = 0x1f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic1", .start = 0x20, .end = 0x21,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer0", .start = 0x40, .end = 0x43,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer1", .start = 0x50, .end = 0x53,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x60, .end = 0x60,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x64, .end = 0x64,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma page reg", .start = 0x80, .end = 0x8f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic2", .start = 0xa0, .end = 0xa1,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma2", .start = 0xc0, .end = 0xdf,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "fpu", .start = 0xf0, .end = 0xff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO }
+};
+
+static void __init reserve_standard_io_resources(void)
+{
+ int i;
+
+ /* Nothing to do if not running in dom0. */
+ if (!is_initial_xendomain())
+ return;
+
+ /* request I/O space for devices used on all i[345]86 PCs */
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+ request_resource(&ioport_resource, &standard_io_resources[i]);
+
+}
+
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel. This option will be passed
+ * by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+ char *end;
+ if (!arg)
+ return -EINVAL;
+ elfcorehdr_addr = memparse(arg, &end);
+ return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
+#endif
+
+static struct x86_quirks default_x86_quirks __initdata;
+
+struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
+
/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
+ * Determine if we were loaded by an EFI loader. If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+ * for initialization. Note, the efi init code path is determined by the
+ * global efi_enabled. This allows the same kernel image to be used on existing
+ * systems (with a traditional BIOS) as well as on EFI systems.
*/
-void __init setup_per_cpu_areas(void)
+/*
+ * setup_arch - architecture-specific boot-time initializations
+ *
+ * Note: On x86_64, fixmaps are ready for use even before this is called.
+ */
+
+void __init setup_arch(char **cmdline_p)
{
- int i, highest_cpu = 0;
- unsigned long size;
+#ifdef CONFIG_XEN
+ unsigned int i;
+ unsigned long p2m_pages;
+ struct physdev_set_iopl set_iopl;
-#ifdef CONFIG_HOTPLUG_CPU
- prefill_possible_map();
+#ifdef CONFIG_X86_32
+ /* Force a quick death if the kernel panics (not domain 0). */
+ extern int panic_timeout;
+ if (!panic_timeout && !is_initial_xendomain())
+ panic_timeout = 1;
#endif
- /* Copy section for each CPU (we discard the original) */
- size = PERCPU_ENOUGH_ROOM;
- printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
- size);
-
- for_each_possible_cpu(i) {
- char *ptr;
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- ptr = alloc_bootmem_pages(size);
-#else
- int node = early_cpu_to_node(i);
- if (!node_online(node) || !NODE_DATA(node)) {
- ptr = alloc_bootmem_pages(size);
- printk(KERN_INFO
- "cpu %d has no node or node-local memory\n", i);
- }
- else
- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+ /* Register a call for panic conditions. */
+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+
+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_writable_pagetables));
+#ifdef CONFIG_X86_32
+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_4gb_segments));
+#endif
+ set_iopl.iopl = 1;
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+#endif /* CONFIG_XEN */
+
+#ifdef CONFIG_X86_32
+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+ visws_early_detect();
+ pre_setup_arch_hook();
+#else
+ printk(KERN_INFO "Command line: %s\n", boot_command_line);
+#endif
+
+ early_cpu_init();
+ early_ioremap_init();
+
+#ifndef CONFIG_XEN
+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+ screen_info = boot_params.screen_info;
+ edid_info = boot_params.edid_info;
+#ifdef CONFIG_X86_32
+ apm_info.bios = boot_params.apm_bios_info;
+ ist_info = boot_params.ist_info;
+ if (boot_params.sys_desc_table.length != 0) {
+ set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
+ machine_id = boot_params.sys_desc_table.table[0];
+ machine_submodel_id = boot_params.sys_desc_table.table[1];
+ BIOS_revision = boot_params.sys_desc_table.table[2];
+ }
+#endif
+ saved_video_mode = boot_params.hdr.vid_mode;
+ bootloader_type = boot_params.hdr.type_of_loader;
+
+#ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+ rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
+ rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
+#endif
+#ifdef CONFIG_EFI
+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+#ifdef CONFIG_X86_32
+ "EL32",
+#else
+ "EL64",
#endif
- if (!ptr)
- panic("Cannot allocate cpu data for CPU %d\n", i);
+ 4)) {
+ efi_enabled = 1;
+ efi_reserve_early();
+ }
+#endif
+#else /* CONFIG_XEN */
+#ifdef CONFIG_X86_32
+ /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
+ properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
+ */
+ ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
+#else
+ ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
+#endif
+ if (is_initial_xendomain()) {
+ const struct dom0_vga_console_info *info =
+ (void *)((char *)xen_start_info +
+ xen_start_info->console.dom0.info_off);
+
+ dom0_init_screen_info(info,
+ xen_start_info->console.dom0.info_size);
+ xen_start_info->console.domU.mfn = 0;
+ xen_start_info->console.domU.evtchn = 0;
+ } else
+ screen_info.orig_video_isVGA = 0;
+ copy_edid();
+#endif /* CONFIG_XEN */
+
+ ARCH_SETUP
+
+ setup_memory_map();
+ parse_setup_data();
+ /* update the e820_saved too */
+ e820_reserve_setup_data();
+
+ copy_edd();
+
+#ifndef CONFIG_XEN
+ if (!boot_params.hdr.root_flags)
+ root_mountflags &= ~MS_RDONLY;
+#endif
+ init_mm.start_code = (unsigned long) _text;
+ init_mm.end_code = (unsigned long) _etext;
+ init_mm.end_data = (unsigned long) _edata;
+#ifdef CONFIG_X86_32
+#ifndef CONFIG_XEN
+ init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
+#else
+ init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
+ xen_start_info->nr_pt_frames) << PAGE_SHIFT;
+#endif
+#else
+ init_mm.brk = (unsigned long) &_end;
+#endif
+
+ code_resource.start = virt_to_phys(_text);
+ code_resource.end = virt_to_phys(_etext)-1;
+ data_resource.start = virt_to_phys(_etext);
+ data_resource.end = virt_to_phys(_edata)-1;
+ bss_resource.start = virt_to_phys(&__bss_start);
+ bss_resource.end = virt_to_phys(&__bss_stop)-1;
+
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ parse_early_param();
+
#ifdef CONFIG_X86_64
- cpu_pda(i)->data_offset = ptr - __per_cpu_start;
+ check_efer();
+#endif
+
+#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
+ /*
+ * Must be before kernel pagetables are setup
+ * or fixmap area is touched.
+ */
+ vmi_init();
+#endif
+
+ /* after early param, so could get panic from serial */
+ reserve_early_setup_data();
+
+ if (acpi_mps_check()) {
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
+ disable_apic = 1;
+#endif
+ setup_clear_cpu_cap(X86_FEATURE_APIC);
+ }
+
+#ifdef CONFIG_PCI
+ if (pci_early_dump_regs)
+ early_dump_pci_devices();
+#endif
+
+ finish_e820_parsing();
+
+#ifdef CONFIG_X86_32
+ if (is_initial_xendomain())
+ probe_roms();
+#endif
+
+#ifndef CONFIG_XEN
+ /* after parse_early_param, so could debug it */
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
+
+ if (efi_enabled)
+ efi_init();
+
+#ifdef CONFIG_X86_32
+ if (ppro_with_ram_bug()) {
+ e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
+ E820_RESERVED);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ printk(KERN_INFO "fixed physical RAM map:\n");
+ e820_print_map("bad_ppro");
+ }
#else
- __per_cpu_offset[i] = ptr - __per_cpu_start;
+ early_gart_iommu_check();
#endif
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+#endif /* CONFIG_XEN */
- highest_cpu = i;
+ /*
+ * partially used pages are not usable - thus
+ * we are rounding upwards:
+ */
+ max_pfn = e820_end_of_ram_pfn();
+
+ /* preallocate 4k for mptable mpc */
+ early_reserve_e820_mpc_new();
+ /* update e820 for memory not covered by WB MTRRs */
+ mtrr_bp_init();
+#ifndef CONFIG_XEN
+ if (mtrr_trim_uncached_memory(max_pfn))
+ max_pfn = e820_end_of_ram_pfn();
+#endif
+
+#ifdef CONFIG_X86_32
+ /* max_low_pfn get updated here */
+ find_low_pfn_range();
+#else
+ num_physpages = max_pfn;
+ max_mapnr = max_pfn;
+
+
+ /* How many end-of-memory variables you have, grandma! */
+ /* need this before calling reserve_initrd */
+ if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
+ max_low_pfn = e820_end_of_low_ram_pfn();
+ else
+ max_low_pfn = max_pfn;
+
+ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+#endif
+
+ /* max_pfn_mapped is updated here */
+ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
+ max_pfn_mapped = max_low_pfn_mapped;
+
+#ifdef CONFIG_X86_64
+ if (max_pfn > max_low_pfn) {
+ max_pfn_mapped = init_memory_mapping(1UL<<32,
+ max_pfn<<PAGE_SHIFT);
+		/* can we preserve max_low_pfn? */
+ max_low_pfn = max_pfn;
}
+#endif
- nr_cpu_ids = highest_cpu + 1;
- printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
+ /*
+ * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
+ */
- /* Setup percpu data maps */
- setup_per_cpu_maps();
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ if (init_ohci1394_dma_early)
+ init_ohci1394_dma_on_all_controllers();
+#endif
- /* Setup cpumask_of_cpu map */
- setup_cpumask_of_cpu();
-}
+ reserve_initrd();
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+ vsmp_init();
+#endif
+
+ if (is_initial_xendomain())
+ dmi_scan_machine();
+
+ io_delay_init();
+
+#ifdef CONFIG_ACPI
+ if (!is_initial_xendomain()) {
+ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+ disable_acpi();
+ }
+#endif
+
+ /*
+ * Parse the ACPI tables for possible boot-time SMP configuration.
+ */
+ acpi_boot_table_init();
+
+#ifdef CONFIG_ACPI_NUMA
+ /*
+ * Parse SRAT to discover nodes.
+ */
+ acpi_numa_init();
+#endif
+
+ initmem_init(0, max_pfn);
+
+#ifdef CONFIG_ACPI_SLEEP
+ /*
+ * Reserve low memory region for sleep support.
+ */
+ acpi_reserve_bootmem();
+#endif
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
+ /*
+ * Find and reserve possible boot-time SMP configuration:
+ */
+ find_smp_config();
+#endif
+ reserve_crashkernel();
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+ /*
+ * dma32_reserve_bootmem() allocates bootmem which may conflict
+ * with the crashkernel command line, so do that after
+ * reserve_crashkernel()
+ */
+ dma32_reserve_bootmem();
+#endif
+
+ reserve_ibft_region();
+
+#ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+#endif
+
+ xen_pagetable_setup_start(swapper_pg_dir);
+ paging_init();
+ xen_pagetable_setup_done(swapper_pg_dir);
+ paravirt_post_allocator_init();
+
+#ifdef CONFIG_X86_64
+ map_vsyscall();
+#endif
+
+#ifdef CONFIG_XEN
+ p2m_pages = max_pfn;
+ if (xen_start_info->nr_pages > max_pfn) {
+ /*
+ * the max_pfn was shrunk (probably by mem= or highmem=
+ * kernel parameter); shrink reservation with the HV
+ */
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ unsigned int difference;
+ int ret;
+
+ difference = xen_start_info->nr_pages - max_pfn;
+ set_xen_guest_handle(reservation.extent_start,
+ ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
+ reservation.nr_extents = difference;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ BUG_ON(ret != difference);
+ }
+ else if (max_pfn > xen_start_info->nr_pages)
+ p2m_pages = xen_start_info->nr_pages;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ unsigned long i, j;
+ unsigned int k, fpp;
+
+ /* Make sure we have a large enough P->M table. */
+ phys_to_machine_mapping = alloc_bootmem_pages(
+ max_pfn * sizeof(unsigned long));
+ memset(phys_to_machine_mapping, ~0,
+ max_pfn * sizeof(unsigned long));
+ memcpy(phys_to_machine_mapping,
+ (unsigned long *)xen_start_info->mfn_list,
+ p2m_pages * sizeof(unsigned long));
+ free_bootmem(
+ __pa(xen_start_info->mfn_list),
+ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+ sizeof(unsigned long))));
+
+ /*
+ * Initialise the list of the frames that specify the list of
+ * frames that make up the p2m table. Used by save/restore.
+ */
+ pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
+
+ fpp = PAGE_SIZE/sizeof(unsigned long);
+ for (i = j = 0, k = -1; i < max_pfn; i += fpp, j++) {
+ if (j == fpp)
+ j = 0;
+ if (j == 0) {
+ k++;
+ BUG_ON(k>=ARRAY_SIZE(pfn_to_mfn_frame_list));
+ pfn_to_mfn_frame_list[k] =
+ alloc_bootmem_pages(PAGE_SIZE);
+ pfn_to_mfn_frame_list_list[k] =
+ virt_to_mfn(pfn_to_mfn_frame_list[k]);
+ }
+ pfn_to_mfn_frame_list[k][j] =
+ virt_to_mfn(&phys_to_machine_mapping[i]);
+ }
+ HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(pfn_to_mfn_frame_list_list);
+ }
+
+ /* Mark all ISA DMA channels in-use - using them wouldn't work. */
+ for (i = 0; i < MAX_DMA_CHANNELS; ++i)
+ if (i != 4 && request_dma(i, "xen") != 0)
+ BUG();
+#endif /* CONFIG_XEN */
+
+#ifdef CONFIG_X86_GENERICARCH
+ generic_apic_probe();
#endif
+
+#ifndef CONFIG_XEN
+ early_quirks();
+#endif
+
+ /*
+ * Read APIC and some other early information from ACPI tables.
+ */
+ acpi_boot_init();
+
+#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
+ /*
+ * get boot-time SMP configuration:
+ */
+ if (smp_found_config)
+ get_smp_config();
+#endif
+
+ prefill_possible_map();
+#ifdef CONFIG_X86_64
+ init_cpu_to_node();
+#endif
+
+#ifndef CONFIG_XEN
+ init_apic_mappings();
+ ioapic_init_mappings();
+
+ kvm_guest_init();
+
+ e820_reserve_resources();
+ e820_mark_nosave_regions(max_low_pfn);
+#else
+ if (is_initial_xendomain())
+ e820_reserve_resources();
+#endif
+
+#ifdef CONFIG_X86_32
+ if (is_initial_xendomain())
+ request_resource(&iomem_resource, &video_ram_resource);
+#endif
+ reserve_standard_io_resources();
+
+#ifndef CONFIG_XEN
+ e820_setup_gap();
+
+#ifdef CONFIG_VT
+#if defined(CONFIG_VGA_CONSOLE)
+ if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+ conswitchp = &dummy_con;
+#endif
+#endif
+#else /* CONFIG_XEN */
+ if (is_initial_xendomain())
+ e820_setup_gap();
+
+#ifdef CONFIG_VT
+#ifdef CONFIG_DUMMY_CONSOLE
+ conswitchp = &dummy_con;
+#endif
+#ifdef CONFIG_VGA_CONSOLE
+ if (is_initial_xendomain())
+ conswitchp = &vga_con;
+#endif
+#endif
+#endif /* CONFIG_XEN */
+}
+
+#ifdef CONFIG_XEN
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ HYPERVISOR_shutdown(SHUTDOWN_crash);
+ /* we're never actually going to get here... */
+ return NOTIFY_DONE;
+}
+#endif /* CONFIG_XEN */
--- head-2011-03-11.orig/arch/x86/kernel/setup64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,370 +0,0 @@
-/*
- * X86-64 specific CPU setup.
- * Copyright (C) 1995 Linus Torvalds
- * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
- * See setup.c for older changelog.
- *
- * Jun Nakajima <jun.nakajima@intel.com>
- * Modified for Xen
- *
- */
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/bootmem.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/kgdb.h>
-#include <asm/pda.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/mmu_context.h>
-#include <asm/smp.h>
-#include <asm/i387.h>
-#include <asm/percpu.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/genapic.h>
-#ifdef CONFIG_XEN
-#include <asm/hypervisor.h>
-#endif
-
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
-struct boot_params boot_params;
-#endif
-
-cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
-
-struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
-
-#ifndef CONFIG_X86_NO_IDT
-struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-#endif
-
-char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
-
-unsigned long __supported_pte_mask __read_mostly = ~0UL;
-EXPORT_SYMBOL(__supported_pte_mask);
-
-static int do_not_nx __cpuinitdata = 0;
-
-/* noexec=on|off
-Control non executable mappings for 64bit processes.
-
-on Enable(default)
-off Disable
-*/
-static int __init nonx_setup(char *str)
-{
- if (!str)
- return -EINVAL;
- if (!strncmp(str, "on", 2)) {
- __supported_pte_mask |= _PAGE_NX;
- do_not_nx = 0;
- } else if (!strncmp(str, "off", 3)) {
- do_not_nx = 1;
- __supported_pte_mask &= ~_PAGE_NX;
- }
- return 0;
-}
-early_param("noexec", nonx_setup);
-
-int force_personality32 = 0;
-
-/* noexec32=on|off
-Control non executable heap for 32bit processes.
-To control the stack too use noexec=off
-
-on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
-off PROT_READ implies PROT_EXEC
-*/
-static int __init nonx32_setup(char *str)
-{
- if (!strcmp(str, "on"))
- force_personality32 &= ~READ_IMPLIES_EXEC;
- else if (!strcmp(str, "off"))
- force_personality32 |= READ_IMPLIES_EXEC;
- return 1;
-}
-__setup("noexec32=", nonx32_setup);
-
-#ifdef CONFIG_XEN
-static void __init_refok switch_pt(int cpu)
-{
- if (cpu == 0)
- xen_init_pt();
- xen_pt_switch(__pa_symbol(init_level4_pgt));
- xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
-}
-#define switch_pt() switch_pt(cpu)
-
-static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
-{
- unsigned long frames[16];
- unsigned long va;
- int f;
-
- for (va = gdt_descr->address, f = 0;
- va < gdt_descr->address + gdt_descr->size;
- va += PAGE_SIZE, f++) {
- frames[f] = virt_to_mfn(va);
- make_page_readonly(
- (void *)va, XENFEAT_writable_descriptor_tables);
- }
- if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
- sizeof (struct desc_struct)))
- BUG();
-}
-#else
-static void switch_pt(void)
-{
- asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
-}
-
-static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
-{
- load_gdt(gdt_descr);
- load_idt(idt_descr);
-}
-#endif
-
-void pda_init(int cpu)
-{
- struct x8664_pda *pda = cpu_pda(cpu);
-
- /* Setup up data that may be needed in __get_free_pages early */
- asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
-#ifndef CONFIG_XEN
- /* Memory clobbers used to order PDA accessed */
- mb();
- wrmsrl(MSR_GS_BASE, pda);
- mb();
-#else
- if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
- (unsigned long)pda))
- BUG();
-#endif
- pda->cpunumber = cpu;
- pda->irqcount = -1;
- pda->kernelstack =
- (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
- pda->active_mm = &init_mm;
- pda->mmu_state = 0;
-
- if (cpu == 0) {
- /* others are initialized in smpboot.c */
- pda->pcurrent = &init_task;
- pda->irqstackptr = boot_cpu_stack;
- } else {
- pda->irqstackptr = (char *)
- __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
- if (!pda->irqstackptr)
- panic("cannot allocate irqstack for cpu %d", cpu);
- }
-
- switch_pt();
-
- pda->irqstackptr += IRQSTACKSIZE-64;
-}
-
-#ifndef CONFIG_X86_NO_TSS
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
-__attribute__((section(".bss.page_aligned")));
-#endif
-
-extern asmlinkage void ignore_sysret(void);
-
-/* May not be marked __init: used by software suspend */
-void syscall_init(void)
-{
-#ifndef CONFIG_XEN
- /*
- * LSTAR and STAR live in a bit strange symbiosis.
- * They both write to the same internal register. STAR allows to set CS/DS
- * but only a 32bit target. LSTAR sets the 64bit rip.
- */
- wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
- wrmsrl(MSR_LSTAR, system_call);
- wrmsrl(MSR_CSTAR, ignore_sysret);
-
- /* Flags to clear on syscall */
- wrmsrl(MSR_SYSCALL_MASK,
- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
-#endif
-#ifdef CONFIG_IA32_EMULATION
- syscall32_cpu_init ();
-#else
- {
- static const struct callback_register cstar = {
- .type = CALLBACKTYPE_syscall32,
- .address = (unsigned long)ignore_sysret
- };
- if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
- printk(KERN_WARNING "Unable to register CSTAR callback\n");
- }
-#endif
-}
-
-void __cpuinit check_efer(void)
-{
- unsigned long efer;
-
- rdmsrl(MSR_EFER, efer);
- if (!(efer & EFER_NX) || do_not_nx) {
- __supported_pte_mask &= ~_PAGE_NX;
- }
-}
-
-unsigned long kernel_eflags;
-
-#ifndef CONFIG_X86_NO_TSS
-/*
- * Copies of the original ist values from the tss are only accessed during
- * debugging, no special alignment required.
- */
-DEFINE_PER_CPU(struct orig_ist, orig_ist);
-#endif
-
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- * A lot of state is already set up in PDA init.
- */
-void __cpuinit cpu_init (void)
-{
- int cpu = stack_smp_processor_id();
-#ifndef CONFIG_X86_NO_TSS
- struct tss_struct *t = &per_cpu(init_tss, cpu);
- struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
- unsigned long v;
- char *estacks = NULL;
- unsigned i;
-#endif
- struct task_struct *me;
-
- /* CPU 0 is initialised in head64.c */
- if (cpu != 0) {
- pda_init(cpu);
- }
-#ifndef CONFIG_X86_NO_TSS
- else
- estacks = boot_exception_stacks;
-#endif
-
- me = current;
-
- if (cpu_test_and_set(cpu, cpu_initialized))
- panic("CPU#%d already initialized!\n", cpu);
-
- printk("Initializing CPU#%d\n", cpu);
-
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-
- /*
- * Initialize the per-CPU GDT with the boot GDT,
- * and set up the GDT descriptor:
- */
-#ifndef CONFIG_XEN
- if (cpu)
- memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
-#endif
-
- cpu_gdt_descr[cpu].size = GDT_SIZE;
- cpu_gdt_init(&cpu_gdt_descr[cpu]);
-
- memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
- syscall_init();
-
- wrmsrl(MSR_FS_BASE, 0);
- wrmsrl(MSR_KERNEL_GS_BASE, 0);
- barrier();
-
- check_efer();
-
-#ifndef CONFIG_X86_NO_TSS
- /*
- * set up and load the per-CPU TSS
- */
- for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- static const unsigned int order[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
- };
- if (cpu) {
- estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
- if (!estacks)
- panic("Cannot allocate exception stack %ld %d\n",
- v, cpu);
- }
- estacks += PAGE_SIZE << order[v];
- orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
- }
-
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
- /*
- * <= is required because the CPU will access up to
- * 8 bits beyond the end of the IO permission bitmap.
- */
- for (i = 0; i <= IO_BITMAP_LONGS; i++)
- t->io_bitmap[i] = ~0UL;
-#endif
-
- atomic_inc(&init_mm.mm_count);
- me->active_mm = &init_mm;
- if (me->mm)
- BUG();
- enter_lazy_tlb(&init_mm, me);
-
-#ifndef CONFIG_X86_NO_TSS
- set_tss_desc(cpu, t);
-#endif
-#ifndef CONFIG_XEN
- load_TR_desc();
-#endif
- load_LDT(&init_mm.context);
-
-#ifdef CONFIG_KGDB
- /*
- * If the kgdb is connected no debug regs should be altered. This
- * is only applicable when KGDB and a KGDB I/O module are built
- * into the kernel and you are using early debugging with
- * kgdbwait. KGDB will control the kernel HW breakpoint registers.
- */
- if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
- arch_kgdb_ops.correct_hw_break();
- else {
-#endif
- /*
- * Clear all 6 debug registers:
- */
-
- set_debugreg(0UL, 0);
- set_debugreg(0UL, 1);
- set_debugreg(0UL, 2);
- set_debugreg(0UL, 3);
- set_debugreg(0UL, 6);
- set_debugreg(0UL, 7);
-#ifdef CONFIG_KGDB
- /* If the kgdb is connected no debug regs should be altered. */
- }
-#endif
-
- fpu_init();
-
- asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
- if (raw_irqs_disabled())
- kernel_eflags &= ~X86_EFLAGS_IF;
-
- if (is_uv_system())
- uv_cpu_init();
-}
--- head-2011-03-11.orig/arch/x86/kernel/setup_32-xen.c 2011-03-04 15:07:31.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,1153 +0,0 @@
-/*
- * Copyright (C) 1995 Linus Torvalds
- *
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- *
- * Memory region support
- * David Parsons <orc@pell.chi.il.us>, July-August 1999
- *
- * Added E820 sanitization routine (removes overlapping memory regions);
- * Brian Moyle <bmoyle@mvista.com>, February 2001
- *
- * Moved CPU detection code to cpu/${cpu}.c
- * Patrick Mochel <mochel@osdl.org>, March 2002
- *
- * Provisions for empty E820 memory regions (reported by certain BIOSes).
- * Alex Achenbach <xela@slit.de>, December 2002.
- *
- */
-
-/*
- * This file handles the architecture-dependent parts of initialization
- */
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
-#include <linux/acpi.h>
-#include <linux/apm_bios.h>
-#include <linux/initrd.h>
-#include <linux/bootmem.h>
-#include <linux/seq_file.h>
-#include <linux/console.h>
-#include <linux/mca.h>
-#include <linux/root_dev.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/efi.h>
-#include <linux/init.h>
-#include <linux/edd.h>
-#include <linux/iscsi_ibft.h>
-#include <linux/nodemask.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/kexec.h>
-#include <linux/crash_dump.h>
-#include <linux/dmi.h>
-#include <linux/pfn.h>
-#include <linux/pci.h>
-#include <linux/init_ohci1394_dma.h>
-#include <linux/kvm_para.h>
-
-#include <video/edid.h>
-
-#include <asm/mtrr.h>
-#include <asm/apic.h>
-#include <asm/e820.h>
-#include <asm/mpspec.h>
-#include <asm/mmzone.h>
-#include <asm/setup.h>
-#include <asm/arch_hooks.h>
-#include <asm/sections.h>
-#include <asm/io_apic.h>
-#include <asm/ist.h>
-#include <asm/io.h>
-#include <asm/hypervisor.h>
-#include <xen/interface/physdev.h>
-#include <xen/interface/memory.h>
-#include <xen/features.h>
-#include <xen/firmware.h>
-#include <xen/xencons.h>
-#include <setup_arch.h>
-#include <asm/bios_ebda.h>
-#include <asm/cacheflush.h>
-#include <asm/processor.h>
-
-#ifdef CONFIG_XEN
-#include <xen/interface/kexec.h>
-#endif
-
-static int xen_panic_event(struct notifier_block *, unsigned long, void *);
-static struct notifier_block xen_panic_block = {
- xen_panic_event, NULL, 0 /* try to go last */
-};
-
-/*
- * Machine setup..
- */
-static struct resource data_resource = {
- .name = "Kernel data",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource code_resource = {
- .name = "Kernel code",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource bss_resource = {
- .name = "Kernel bss",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource video_ram_resource = {
- .name = "Video RAM area",
- .start = 0xa0000,
- .end = 0xbffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource standard_io_resources[] = { {
- .name = "dma1",
- .start = 0x0000,
- .end = 0x001f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "pic1",
- .start = 0x0020,
- .end = 0x0021,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "timer0",
- .start = 0x0040,
- .end = 0x0043,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "timer1",
- .start = 0x0050,
- .end = 0x0053,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "keyboard",
- .start = 0x0060,
- .end = 0x0060,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "keyboard",
- .start = 0x0064,
- .end = 0x0064,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "dma page reg",
- .start = 0x0080,
- .end = 0x008f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "pic2",
- .start = 0x00a0,
- .end = 0x00a1,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "dma2",
- .start = 0x00c0,
- .end = 0x00df,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "fpu",
- .start = 0x00f0,
- .end = 0x00ff,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-} };
-
-/* cpu data as detected by the assembly code in head.S */
-struct cpuinfo_x86 new_cpu_data __cpuinitdata = { .wp_works_ok = 1 };
-/* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data __read_mostly = { .wp_works_ok = 1 };
-EXPORT_SYMBOL(boot_cpu_data);
-
-unsigned int def_to_bigsmp;
-
-#ifndef CONFIG_X86_PAE
-unsigned long mmu_cr4_features;
-#else
-unsigned long mmu_cr4_features = X86_CR4_PAE;
-#endif
-
-/* for MCA, but anyone else can use it if they want */
-unsigned int machine_id;
-unsigned int machine_submodel_id;
-unsigned int BIOS_revision;
-
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
-
-/* user-defined highmem size */
-static unsigned int highmem_pages = -1;
-
-/*
- * Setup options
- */
-struct screen_info screen_info;
-EXPORT_SYMBOL(screen_info);
-struct apm_info apm_info;
-EXPORT_SYMBOL(apm_info);
-struct edid_info edid_info;
-EXPORT_SYMBOL_GPL(edid_info);
-#ifndef CONFIG_XEN
-#define copy_edid() (edid_info = boot_params.edid_info)
-#endif
-struct ist_info ist_info;
-#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
- defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
-EXPORT_SYMBOL(ist_info);
-#endif
-
-extern void early_cpu_init(void);
-extern int root_mountflags;
-
-unsigned long saved_video_mode;
-
-#define RAMDISK_IMAGE_START_MASK 0x07FF
-#define RAMDISK_PROMPT_FLAG 0x8000
-#define RAMDISK_LOAD_FLAG 0x4000
-
-static char __initdata command_line[COMMAND_LINE_SIZE];
-
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
-struct boot_params boot_params;
-#endif
-
-/*
- * Point at the empty zero page to start with. We map the real shared_info
- * page as soon as fixmap is up and running.
- */
-shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
-EXPORT_SYMBOL(HYPERVISOR_shared_info);
-
-unsigned long *phys_to_machine_mapping;
-unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
-EXPORT_SYMBOL(phys_to_machine_mapping);
-
-/* Raw start-of-day parameters from the hypervisor. */
-start_info_t *xen_start_info;
-EXPORT_SYMBOL(xen_start_info);
-
-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
-struct edd edd;
-#ifdef CONFIG_EDD_MODULE
-EXPORT_SYMBOL(edd);
-#endif
-#ifndef CONFIG_XEN
-/**
- * copy_edd() - Copy the BIOS EDD information
- * from boot_params into a safe place.
- *
- */
-static inline void copy_edd(void)
-{
- memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
- sizeof(edd.mbr_signature));
- memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
- edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
- edd.edd_info_nr = boot_params.eddbuf_entries;
-}
-#endif
-#else
-static inline void copy_edd(void)
-{
-}
-#endif
-
-int __initdata user_defined_memmap;
-
-/*
- * "mem=nopentium" disables the 4MB page tables.
- * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
- * to <mem>, overriding the bios size.
- * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
- * <start> to <start>+<mem>, overriding the bios size.
- *
- * HPA tells me bootloaders need to parse mem=, so no new
- * option should be mem= [also see Documentation/i386/boot.txt]
- */
-static int __init parse_mem(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp(arg, "nopentium") == 0) {
- setup_clear_cpu_cap(X86_FEATURE_PSE);
- } else {
- /* If the user specifies memory size, we
- * limit the BIOS-provided memory map to
- * that size. exactmap can be used to specify
- * the exact map. mem=number can be used to
- * trim the existing memory map.
- */
- unsigned long long mem_size;
-
- mem_size = memparse(arg, &arg);
- limit_regions(mem_size);
- user_defined_memmap = 1;
- }
- return 0;
-}
-early_param("mem", parse_mem);
-
-#ifdef CONFIG_PROC_VMCORE
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel.
- */
-static int __init parse_elfcorehdr(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- elfcorehdr_addr = memparse(arg, &arg);
- return 0;
-}
-early_param("elfcorehdr", parse_elfcorehdr);
-#endif /* CONFIG_PROC_VMCORE */
-
-/*
- * highmem=size forces highmem to be exactly 'size' bytes.
- * This works even on boxes that have no highmem otherwise.
- * This also works to reduce highmem size on bigger boxes.
- */
-static int __init parse_highmem(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
- return 0;
-}
-early_param("highmem", parse_highmem);
-
-/*
- * vmalloc=size forces the vmalloc area to be exactly 'size'
- * bytes. This can be used to increase (or decrease) the
- * vmalloc area - the default is 128m.
- */
-static int __init parse_vmalloc(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- __VMALLOC_RESERVE = memparse(arg, &arg);
- return 0;
-}
-early_param("vmalloc", parse_vmalloc);
-
-#ifndef CONFIG_XEN
-/*
- * reservetop=size reserves a hole at the top of the kernel address space which
- * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
- * so relocating the fixmap can be done before paging initialization.
- */
-static int __init parse_reservetop(char *arg)
-{
- unsigned long address;
-
- if (!arg)
- return -EINVAL;
-
- address = memparse(arg, &arg);
- reserve_top_address(address);
- return 0;
-}
-early_param("reservetop", parse_reservetop);
-#endif
-
-/*
- * Determine low and high memory ranges:
- */
-unsigned long __init find_max_low_pfn(void)
-{
- unsigned long max_low_pfn;
-
- max_low_pfn = max_pfn;
- if (max_low_pfn > MAXMEM_PFN) {
- if (highmem_pages == -1)
- highmem_pages = max_pfn - MAXMEM_PFN;
- if (highmem_pages + MAXMEM_PFN < max_pfn)
- max_pfn = MAXMEM_PFN + highmem_pages;
- if (highmem_pages + MAXMEM_PFN > max_pfn) {
- printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
- highmem_pages = 0;
- }
- max_low_pfn = MAXMEM_PFN;
-#ifndef CONFIG_HIGHMEM
- /* Maximum memory usable is what is directly addressable */
- printk(KERN_WARNING "Warning only %ldMB will be used.\n",
- MAXMEM>>20);
- if (max_pfn > MAX_NONPAE_PFN)
- printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
- else
- printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
- max_pfn = MAXMEM_PFN;
-#else /* !CONFIG_HIGHMEM */
-#ifndef CONFIG_HIGHMEM64G
- if (max_pfn > MAX_NONPAE_PFN) {
- max_pfn = MAX_NONPAE_PFN;
- printk(KERN_WARNING "Warning only 4GB will be used.\n");
- printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
- }
-#endif /* !CONFIG_HIGHMEM64G */
-#endif /* !CONFIG_HIGHMEM */
- } else {
- if (highmem_pages == -1)
- highmem_pages = 0;
-#ifdef CONFIG_HIGHMEM
- if (highmem_pages >= max_pfn) {
- printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
- highmem_pages = 0;
- }
- if (highmem_pages) {
- if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
- printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
- highmem_pages = 0;
- }
- max_low_pfn -= highmem_pages;
- }
-#else
- if (highmem_pages)
- printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
-#endif
- }
- return max_low_pfn;
-}
-
-#ifndef CONFIG_XEN
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
- unsigned int lowmem, ebda_addr;
-
- /* To determine the position of the EBDA and the */
- /* end of conventional memory, we need to look at */
- /* the BIOS data area. In a paravirtual environment */
- /* that area is absent. We'll just have to assume */
- /* that the paravirt case can handle memory setup */
- /* correctly, without our help. */
- if (paravirt_enabled())
- return;
-
- /* end of low (conventional) memory */
- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
- lowmem <<= 10;
-
- /* start of EBDA area */
- ebda_addr = get_bios_ebda();
-
- /* Fixup: bios puts an EBDA in the top 64K segment */
- /* of conventional memory, but does not adjust lowmem. */
- if ((lowmem - ebda_addr) <= 0x10000)
- lowmem = ebda_addr;
-
- /* Fixup: bios does not report an EBDA at all. */
- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
- lowmem = 0x9f000;
-
- /* Paranoia: should never happen, but... */
- if ((lowmem == 0) || (lowmem >= 0x100000))
- lowmem = 0x9f000;
-
- /* reserve all memory between lowmem and the 1MB mark */
- reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
-}
-#endif
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-static void __init setup_bootmem_allocator(void);
-static unsigned long __init setup_memory(void)
-{
- /*
- * partially used pages are not usable - thus
- * we are rounding upwards:
- */
- min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
- xen_start_info->nr_pt_frames;
-
- max_low_pfn = find_max_low_pfn();
-
-#ifdef CONFIG_HIGHMEM
- highstart_pfn = highend_pfn = max_pfn;
- if (max_pfn > max_low_pfn) {
- highstart_pfn = max_low_pfn;
- }
- printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
- pages_to_mb(highend_pfn - highstart_pfn));
- num_physpages = highend_pfn;
- high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
-#else
- num_physpages = max_low_pfn;
- high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
-#endif
-#ifdef CONFIG_FLATMEM
- max_mapnr = num_physpages;
-#endif
- printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
- pages_to_mb(max_low_pfn));
-
- setup_bootmem_allocator();
-
- return max_low_pfn;
-}
-
-static void __init zone_sizes_init(void)
-{
- unsigned long max_zone_pfns[MAX_NR_ZONES];
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
- max_zone_pfns[ZONE_DMA] =
- virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
- max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
- add_active_range(0, 0, min(xen_start_info->nr_pages, highend_pfn));
- add_active_range(0, highend_pfn, highend_pfn);
-#else
- add_active_range(0, 0, min(xen_start_info->nr_pages, max_low_pfn));
- add_active_range(0, max_low_pfn, max_low_pfn);
-#endif
-
- free_area_init_nodes(max_zone_pfns);
-}
-#else
-extern unsigned long __init setup_memory(void);
-extern void zone_sizes_init(void);
-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
-
-static inline unsigned long long get_total_mem(void)
-{
- unsigned long long total;
-
- total = max_low_pfn - min_low_pfn;
-#ifdef CONFIG_HIGHMEM
- total += highend_pfn - highstart_pfn;
-#endif
-
- return total << PAGE_SHIFT;
-}
-
-#ifdef CONFIG_KEXEC
-#ifndef CONFIG_XEN
-static void __init reserve_crashkernel(void)
-{
- unsigned long long total_mem;
- unsigned long long crash_size, crash_base;
- int ret;
-
- total_mem = get_total_mem();
-
- ret = parse_crashkernel(boot_command_line, total_mem,
- &crash_size, &crash_base);
- if (ret == 0 && crash_size > 0) {
- if (crash_base > 0) {
- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
- "for crashkernel (System RAM: %ldMB)\n",
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
-
- if (reserve_bootmem(crash_base, crash_size,
- BOOTMEM_EXCLUSIVE) < 0) {
- printk(KERN_INFO "crashkernel reservation "
- "failed - memory is in use\n");
- return;
- }
-
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
- } else
- printk(KERN_INFO "crashkernel reservation failed - "
- "you have to specify a base address\n");
- }
-}
-#else
-#define reserve_crashkernel xen_machine_kexec_setup_resources
-#endif
-#else
-static inline void __init reserve_crashkernel(void)
-{}
-#endif
-
-#ifdef CONFIG_BLK_DEV_INITRD
-
-static bool do_relocate_initrd = false;
-
-static void __init reserve_initrd(void)
-{
- unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
- unsigned long ramdisk_size = xen_start_info->mod_len;
- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
- unsigned long ramdisk_here;
-
- initrd_start = 0;
-
- if (!xen_start_info->mod_start || !ramdisk_size)
- return; /* No initrd provided by bootloader */
-
- if (ramdisk_end < ramdisk_image) {
- printk(KERN_ERR "initrd wraps around end of memory, "
- "disabling initrd\n");
- return;
- }
- if (ramdisk_size >= end_of_lowmem/2) {
- printk(KERN_ERR "initrd too large to handle, "
- "disabling initrd\n");
- return;
- }
- if (ramdisk_end <= end_of_lowmem) {
- /* All in lowmem, easy case */
- reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
- initrd_start = ramdisk_image + PAGE_OFFSET;
- initrd_end = initrd_start+ramdisk_size;
- return;
- }
-
- /* We need to move the initrd down into lowmem */
- ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
-
- /* Note: this includes all the lowmem currently occupied by
- the initrd, we rely on that fact to keep the data intact. */
- reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
- initrd_start = ramdisk_here + PAGE_OFFSET;
- initrd_end = initrd_start + ramdisk_size;
-
- do_relocate_initrd = true;
-}
-
-#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
-
-static void __init relocate_initrd(void)
-{
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
- unsigned long ramdisk_here;
- unsigned long slop, clen, mapaddr;
- char *p, *q;
-
- if (!do_relocate_initrd)
- return;
-
- ramdisk_here = initrd_start - PAGE_OFFSET;
-
- q = (char *)initrd_start;
-
- /* Copy any lowmem portion of the initrd */
- if (ramdisk_image < end_of_lowmem) {
- clen = end_of_lowmem - ramdisk_image;
- p = (char *)__va(ramdisk_image);
- memcpy(q, p, clen);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
-
- /* Copy the highmem portion of the initrd */
- while (ramdisk_size) {
- slop = ramdisk_image & ~PAGE_MASK;
- clen = ramdisk_size;
- if (clen > MAX_MAP_CHUNK-slop)
- clen = MAX_MAP_CHUNK-slop;
- mapaddr = ramdisk_image & PAGE_MASK;
- p = early_ioremap(mapaddr, clen+slop);
- memcpy(q, p+slop, clen);
- early_iounmap(p, clen+slop);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
-}
-
-#endif /* CONFIG_BLK_DEV_INITRD */
-
-void __init setup_bootmem_allocator(void)
-{
- unsigned long bootmap_size;
- /*
- * Initialize the boot-time allocator (with low memory only):
- */
- bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
-
- register_bootmem_low_pages(max_low_pfn);
-
- /*
- * Reserve the bootmem bitmap itself as well. We do this in two
- * steps (first step was init_bootmem()) because this catches
- * the (very unlikely) case of us accidentally initializing the
- * bootmem allocator with an invalid RAM area.
- */
- reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
- bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
- BOOTMEM_DEFAULT);
-
-#ifndef CONFIG_XEN
- /*
- * reserve physical page 0 - it's a special BIOS page on many boxes,
- * enabling clean reboots, SMP operation, laptop functions.
- */
- reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
-
- /* reserve EBDA region */
- reserve_ebda_region();
-
-#ifdef CONFIG_SMP
- /*
- * But first pinch a few for the stack/trampoline stuff
- * FIXME: Don't need the extra page at 4K, but need to fix
- * trampoline before removing it. (see the GDT stuff)
- */
- reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
-#endif
-#ifdef CONFIG_ACPI_SLEEP
- /*
- * Reserve low memory region for sleep support.
- */
- acpi_reserve_bootmem();
-#endif
-#endif /* !CONFIG_XEN */
-
-#ifdef CONFIG_BLK_DEV_INITRD
- reserve_initrd();
-#endif
- numa_kva_reserve();
- reserve_crashkernel();
-
- reserve_ibft_region();
-}
-
-/*
- * The node 0 pgdat is initialized before all of these because
- * it's needed for bootmem. node>0 pgdats have their virtual
- * space allocated before the pagetables are in place to access
- * them, so they can't be cleared then.
- *
- * This should all compile down to nothing when NUMA is off.
- */
-static void __init remapped_pgdat_init(void)
-{
- int nid;
-
- for_each_online_node(nid) {
- if (nid != 0)
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
- }
-}
-
-#ifdef CONFIG_MCA
-static void set_mca_bus(int x)
-{
- MCA_bus = x;
-}
-#else
-static void set_mca_bus(int x) { }
-#endif
-
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-char * __init __attribute__((weak)) memory_setup(void)
-{
- return machine_specific_memory_setup();
-}
-
-#ifdef CONFIG_NUMA
-/*
- * In the golden day, when everything among i386 and x86_64 will be
- * integrated, this will not live here
- */
-void *x86_cpu_to_node_map_early_ptr;
-int x86_cpu_to_node_map_init[NR_CPUS] = {
- [0 ... NR_CPUS-1] = NUMA_NO_NODE
-};
-DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
-#endif
-
-/*
- * Determine if we were loaded by an EFI loader. If so, then we have also been
- * passed the efi memmap, systab, etc., so we should use these data structures
- * for initialization. Note, the efi init code path is determined by the
- * global efi_enabled. This allows the same kernel image to be used on existing
- * systems (with a traditional BIOS) as well as on EFI systems.
- */
-void __init setup_arch(char **cmdline_p)
-{
- int i, j, k, fpp;
- struct physdev_set_iopl set_iopl;
- unsigned long max_low_pfn;
- unsigned long p2m_pages;
-
- /* Force a quick death if the kernel panics (not domain 0). */
- extern int panic_timeout;
- if (!panic_timeout && !is_initial_xendomain())
- panic_timeout = 1;
-
- /* Register a call for panic conditions. */
- atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
-
- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_4gb_segments));
- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_writable_pagetables));
-
- memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
- pre_setup_arch_hook();
- early_cpu_init();
- early_ioremap_init();
-#ifdef CONFIG_SMP
- prefill_possible_map();
-#endif
-
-#ifdef CONFIG_EFI
- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL32", 4))
- efi_enabled = 1;
-#endif
-
- /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
- properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
- */
- ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
- screen_info = boot_params.screen_info;
- copy_edid();
- apm_info.bios = boot_params.apm_bios_info;
- ist_info = boot_params.ist_info;
- saved_video_mode = boot_params.hdr.vid_mode;
- if( boot_params.sys_desc_table.length != 0 ) {
- set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
- machine_id = boot_params.sys_desc_table.table[0];
- machine_submodel_id = boot_params.sys_desc_table.table[1];
- BIOS_revision = boot_params.sys_desc_table.table[2];
- }
- bootloader_type = boot_params.hdr.type_of_loader;
-
- if (is_initial_xendomain()) {
- const struct dom0_vga_console_info *info =
- (void *)((char *)xen_start_info +
- xen_start_info->console.dom0.info_off);
-
- dom0_init_screen_info(info,
- xen_start_info->console.dom0.info_size);
- xen_start_info->console.domU.mfn = 0;
- xen_start_info->console.domU.evtchn = 0;
- } else
- screen_info.orig_video_isVGA = 0;
-
-#ifdef CONFIG_BLK_DEV_RAM
- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
- rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
- rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
-#endif
-
- ARCH_SETUP
-
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- print_memory_map(memory_setup());
-
- copy_edd();
-
- if (!boot_params.hdr.root_flags)
- root_mountflags &= ~MS_RDONLY;
- init_mm.start_code = (unsigned long) _text;
- init_mm.end_code = (unsigned long) _etext;
- init_mm.end_data = (unsigned long) _edata;
- init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
- xen_start_info->nr_pt_frames) << PAGE_SHIFT;
-
- code_resource.start = virt_to_phys(_text);
- code_resource.end = virt_to_phys(_etext)-1;
- data_resource.start = virt_to_phys(_etext);
- data_resource.end = virt_to_phys(_edata)-1;
- bss_resource.start = virt_to_phys(&__bss_start);
- bss_resource.end = virt_to_phys(&__bss_stop)-1;
-
- if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
- i = COMMAND_LINE_SIZE;
- memcpy(boot_command_line, xen_start_info->cmd_line, i);
- boot_command_line[i - 1] = '\0';
- parse_early_param();
-
- if (user_defined_memmap) {
- printk(KERN_INFO "user-defined physical RAM map:\n");
- print_memory_map("user");
- }
-
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
- *cmdline_p = command_line;
-
- if (efi_enabled)
- efi_init();
-
- /* update e820 for memory not covered by WB MTRRs */
- propagate_e820_map();
- mtrr_bp_init();
-#ifndef CONFIG_XEN
- if (mtrr_trim_uncached_memory(max_pfn))
- propagate_e820_map();
-#endif
-
- max_low_pfn = setup_memory();
-
-#ifdef CONFIG_KVM_CLOCK
- kvmclock_init();
-#endif
-
-#ifdef CONFIG_VMI
- /*
- * Must be after max_low_pfn is determined, and before kernel
- * pagetables are setup.
- */
- vmi_init();
-#endif
- kvm_guest_init();
-
- /*
- * NOTE: before this point _nobody_ is allowed to allocate
- * any memory using the bootmem allocator. Although the
- * allocator is now initialised only the first 8Mb of the kernel
- * virtual address space has been mapped. All allocations before
- * paging_init() has completed must use the alloc_bootmem_low_pages()
- * variant (which allocates DMA'able memory) and care must be taken
- * not to exceed the 8Mb limit.
- */
-
-#ifdef CONFIG_SMP
- smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
-#endif
- paging_init();
-
- /*
- * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
- */
-
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- if (init_ohci1394_dma_early)
- init_ohci1394_dma_on_all_controllers();
-#endif
-
- remapped_pgdat_init();
- sparse_init();
- zone_sizes_init();
-
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
- /*
- * Find and reserve possible boot-time SMP configuration:
- */
- find_smp_config();
-#endif
-
- p2m_pages = max_pfn;
- if (xen_start_info->nr_pages > max_pfn) {
- /*
- * the max_pfn was shrunk (probably by mem= or highmem=
- * kernel parameter); shrink reservation with the HV
- */
- struct xen_memory_reservation reservation = {
- .address_bits = 0,
- .extent_order = 0,
- .domid = DOMID_SELF
- };
- unsigned int difference;
- int ret;
-
- difference = xen_start_info->nr_pages - max_pfn;
-
- set_xen_guest_handle(reservation.extent_start,
- ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
- reservation.nr_extents = difference;
- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
- &reservation);
- BUG_ON (ret != difference);
- }
- else if (max_pfn > xen_start_info->nr_pages)
- p2m_pages = xen_start_info->nr_pages;
-
- /* Make sure we have a correctly sized P->M table. */
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- phys_to_machine_mapping = alloc_bootmem_low_pages(
- max_pfn * sizeof(unsigned long));
- memset(phys_to_machine_mapping, ~0,
- max_pfn * sizeof(unsigned long));
- memcpy(phys_to_machine_mapping,
- (unsigned long *)xen_start_info->mfn_list,
- p2m_pages * sizeof(unsigned long));
- free_bootmem(
- __pa(xen_start_info->mfn_list),
- PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
- sizeof(unsigned long))));
-
- /*
- * Initialise the list of the frames that specify the list of
- * frames that make up the p2m table. Used by save/restore
- */
- pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
-
- fpp = PAGE_SIZE/sizeof(unsigned long);
- for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
- if ((j % fpp) == 0) {
- k++;
- BUG_ON(k>=16);
- pfn_to_mfn_frame_list[k] =
- alloc_bootmem_low_pages(PAGE_SIZE);
- pfn_to_mfn_frame_list_list[k] =
- virt_to_mfn(pfn_to_mfn_frame_list[k]);
- j=0;
- }
- pfn_to_mfn_frame_list[k][j] =
- virt_to_mfn(&phys_to_machine_mapping[i]);
- }
- HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(pfn_to_mfn_frame_list_list);
- }
-
- /* Mark all ISA DMA channels in-use - using them wouldn't work. */
- for (i = 0; i < MAX_DMA_CHANNELS; ++i)
- if (i != 4 && request_dma(i, "xen") != 0)
- BUG();
-
- /*
- * NOTE: at this point the bootmem allocator is fully available.
- */
-
-#ifdef CONFIG_BLK_DEV_INITRD
- relocate_initrd();
-#endif
-
- paravirt_post_allocator_init();
-
- if (is_initial_xendomain())
- dmi_scan_machine();
-
- io_delay_init();
-
-#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
- /*
- * setup to use the early static init tables during kernel startup
- * X86_SMP will exclude sub-arches that don't deal well with it.
- */
- x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
- x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
-#ifdef CONFIG_NUMA
- x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
-#endif
-#endif
-
-#ifdef CONFIG_X86_GENERICARCH
- generic_apic_probe();
-#endif
-
- set_iopl.iopl = 1;
- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
-
-#ifdef CONFIG_ACPI
- if (!is_initial_xendomain()) {
- printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
- acpi_disabled = 1;
- acpi_ht = 0;
- }
-
- /*
- * Parse the ACPI tables for possible boot-time SMP configuration.
- */
- acpi_boot_table_init();
-#endif
-
-#ifndef CONFIG_XEN
- early_quirks();
-#endif
-
-#ifdef CONFIG_ACPI
- acpi_boot_init();
-
-#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
- if (def_to_bigsmp)
- printk(KERN_WARNING "More than 8 CPUs detected and "
- "CONFIG_X86_PC cannot handle it.\nUse "
- "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
-#endif
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- if (smp_found_config)
- get_smp_config();
-#endif
-
- e820_register_memory();
- e820_mark_nosave_regions();
-
- if (is_initial_xendomain()) {
-#ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
- if (!efi_enabled ||
- (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
- conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
-#endif
-#endif
- } else {
-#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
-#endif
- }
-}
-
-static int
-xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
- HYPERVISOR_shutdown(SHUTDOWN_crash);
- /* we're never actually going to get here... */
- return NOTIFY_DONE;
-}
-
-/*
- * Request address space for all standard resources
- *
- * This is called just before pcibios_init(), which is also a
- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
- */
-static int __init request_standard_resources(void)
-{
- int i;
-
- /* Nothing to do if not running in dom0. */
- if (!is_initial_xendomain())
- return 0;
-
- printk(KERN_INFO "Setting up standard PCI resources\n");
- init_iomem_resources(&code_resource, &data_resource, &bss_resource);
-
- request_resource(&iomem_resource, &video_ram_resource);
-
- /* request I/O space for devices used on all i[345]86 PCs */
- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
- request_resource(&ioport_resource, &standard_io_resources[i]);
- return 0;
-}
-
-subsys_initcall(request_standard_resources);
--- head-2011-03-11.orig/arch/x86/kernel/setup_64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,1442 +0,0 @@
-/*
- * Copyright (C) 1995 Linus Torvalds
- */
-
-/*
- * This file handles the architecture-dependent parts of initialization
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/user.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/initrd.h>
-#include <linux/highmem.h>
-#include <linux/bootmem.h>
-#include <linux/module.h>
-#include <asm/processor.h>
-#include <linux/console.h>
-#include <linux/seq_file.h>
-#include <linux/crash_dump.h>
-#include <linux/root_dev.h>
-#include <linux/pci.h>
-#include <asm/pci-direct.h>
-#include <linux/efi.h>
-#include <linux/acpi.h>
-#include <linux/kallsyms.h>
-#include <linux/edd.h>
-#include <linux/iscsi_ibft.h>
-#include <linux/mmzone.h>
-#include <linux/kexec.h>
-#include <linux/cpufreq.h>
-#include <linux/dmi.h>
-#include <linux/dma-mapping.h>
-#include <linux/ctype.h>
-#include <linux/sort.h>
-#include <linux/uaccess.h>
-#include <linux/init_ohci1394_dma.h>
-#include <linux/kvm_para.h>
-
-#include <asm/mtrr.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/vsyscall.h>
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/msr.h>
-#include <asm/desc.h>
-#include <video/edid.h>
-#include <asm/e820.h>
-#include <asm/dma.h>
-#include <asm/gart.h>
-#include <asm/mpspec.h>
-#include <asm/mmu_context.h>
-#include <asm/proto.h>
-#include <asm/setup.h>
-#include <asm/numa.h>
-#include <asm/sections.h>
-#include <asm/dmi.h>
-#include <asm/cacheflush.h>
-#include <asm/mce.h>
-#include <asm/ds.h>
-#include <asm/topology.h>
-#include <asm/pat.h>
-
-#include <mach_apic.h>
-#ifdef CONFIG_XEN
-#include <linux/percpu.h>
-#include <xen/interface/physdev.h>
-#include "setup_arch_pre.h"
-#include <asm/hypervisor.h>
-#include <xen/interface/nmi.h>
-#include <xen/features.h>
-#include <xen/firmware.h>
-#include <xen/xencons.h>
-#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
-#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
-#include <asm/mach-xen/setup_arch_post.h>
-#include <xen/interface/memory.h>
-
-#ifdef CONFIG_XEN
-#include <xen/interface/kexec.h>
-#endif
-
-extern unsigned long start_pfn;
-extern struct edid_info edid_info;
-
-shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
-EXPORT_SYMBOL(HYPERVISOR_shared_info);
-
-static int xen_panic_event(struct notifier_block *, unsigned long, void *);
-static struct notifier_block xen_panic_block = {
- xen_panic_event, NULL, 0 /* try to go last */
-};
-
-unsigned long *phys_to_machine_mapping;
-unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
-
-EXPORT_SYMBOL(phys_to_machine_mapping);
-
-DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
-DEFINE_PER_CPU(int, nr_multicall_ents);
-
-/* Raw start-of-day parameters from the hypervisor. */
-start_info_t *xen_start_info;
-EXPORT_SYMBOL(xen_start_info);
-#endif
-
-/*
- * Machine setup..
- */
-
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
-EXPORT_SYMBOL(boot_cpu_data);
-
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
-
-unsigned long mmu_cr4_features;
-
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
-
-unsigned long saved_video_mode;
-
-int force_mwait __cpuinitdata;
-
-/*
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
-/*
- * Setup options
- */
-struct screen_info screen_info;
-EXPORT_SYMBOL(screen_info);
-struct sys_desc_table_struct {
- unsigned short length;
- unsigned char table[0];
-};
-
-struct edid_info edid_info;
-EXPORT_SYMBOL_GPL(edid_info);
-
-extern int root_mountflags;
-
-char __initdata command_line[COMMAND_LINE_SIZE];
-
-static struct resource standard_io_resources[] = {
- { .name = "dma1", .start = 0x00, .end = 0x1f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "pic1", .start = 0x20, .end = 0x21,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "timer0", .start = 0x40, .end = 0x43,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "timer1", .start = 0x50, .end = 0x53,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "keyboard", .start = 0x60, .end = 0x60,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "keyboard", .start = 0x64, .end = 0x64,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "dma page reg", .start = 0x80, .end = 0x8f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "pic2", .start = 0xa0, .end = 0xa1,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "dma2", .start = 0xc0, .end = 0xdf,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "fpu", .start = 0xf0, .end = 0xff,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO }
-};
-
-#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
-
-static struct resource data_resource = {
- .name = "Kernel data",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_RAM,
-};
-static struct resource code_resource = {
- .name = "Kernel code",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_RAM,
-};
-static struct resource bss_resource = {
- .name = "Kernel bss",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_RAM,
-};
-
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
-
-#ifdef CONFIG_PROC_VMCORE
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel. This option will be passed
- * by kexec loader to the capture kernel.
- */
-static int __init setup_elfcorehdr(char *arg)
-{
- char *end;
- if (!arg)
- return -EINVAL;
- elfcorehdr_addr = memparse(arg, &end);
- return end > arg ? 0 : -EINVAL;
-}
-early_param("elfcorehdr", setup_elfcorehdr);
-#endif
-
-#ifndef CONFIG_NUMA
-static void __init
-contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long bootmap_size, bootmap;
-
- bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
- bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
- PAGE_SIZE);
- if (bootmap == -1L)
- panic("Cannot find bootmem map of size %ld\n", bootmap_size);
- bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
- e820_register_active_regions(0, start_pfn, end_pfn);
-#ifdef CONFIG_XEN
- if (xen_start_info->nr_pages < end_pfn)
- end_pfn = xen_start_info->nr_pages;
-#endif
- free_bootmem_with_active_regions(0, end_pfn);
- early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
- reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
-}
-#endif
-
-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
-struct edd edd;
-#ifdef CONFIG_EDD_MODULE
-EXPORT_SYMBOL(edd);
-#endif
-#ifndef CONFIG_XEN
-/**
- * copy_edd() - Copy the BIOS EDD information
- * from boot_params into a safe place.
- *
- */
-static inline void copy_edd(void)
-{
- memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
- sizeof(edd.mbr_signature));
- memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
- edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
- edd.edd_info_nr = boot_params.eddbuf_entries;
-}
-#endif
-#else
-static inline void copy_edd(void)
-{
-}
-#endif
-
-#ifdef CONFIG_KEXEC
-#ifndef CONFIG_XEN
-static void __init reserve_crashkernel(void)
-{
- unsigned long long total_mem;
- unsigned long long crash_size, crash_base;
- int ret;
-
- total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
-
- ret = parse_crashkernel(boot_command_line, total_mem,
- &crash_size, &crash_base);
- if (ret == 0 && crash_size) {
- if (crash_base <= 0) {
- printk(KERN_INFO "crashkernel reservation failed - "
- "you have to specify a base address\n");
- return;
- }
-
- if (reserve_bootmem(crash_base, crash_size,
- BOOTMEM_EXCLUSIVE) < 0) {
- printk(KERN_INFO "crashkernel reservation failed - "
- "memory is in use\n");
- return;
- }
-
- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
- "for crashkernel (System RAM: %ldMB)\n",
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
- insert_resource(&iomem_resource, &crashk_res);
- }
-}
-#else
-#define reserve_crashkernel xen_machine_kexec_setup_resources
-#endif
-#else
-static inline void __init reserve_crashkernel(void)
-{}
-#endif
-
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-void __attribute__((weak)) __init memory_setup(void)
-{
- machine_specific_memory_setup();
-}
-
-static void __init parse_setup_data(void)
-{
- struct setup_data *data;
- unsigned long pa_data;
-
- if (boot_params.hdr.version < 0x0209)
- return;
- pa_data = boot_params.hdr.setup_data;
- while (pa_data) {
- data = early_ioremap(pa_data, PAGE_SIZE);
- switch (data->type) {
- default:
- break;
- }
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
- free_early(pa_data, pa_data+sizeof(*data)+data->len);
-#endif
- pa_data = data->next;
- early_iounmap(data, PAGE_SIZE);
- }
-}
-
-#ifdef CONFIG_PCI_MMCONFIG
-extern void __cpuinit fam10h_check_enable_mmcfg(void);
-extern void __init check_enable_amd_mmconf_dmi(void);
-#else
-void __cpuinit fam10h_check_enable_mmcfg(void)
-{
-}
-void __init check_enable_amd_mmconf_dmi(void)
-{
-}
-#endif
-
-/*
- * setup_arch - architecture-specific boot-time initializations
- *
- * Note: On x86_64, fixmaps are ready for use even before this is called.
- */
-void __init setup_arch(char **cmdline_p)
-{
- unsigned i;
-
-#ifdef CONFIG_XEN
- extern struct e820map machine_e820;
-
- printk(KERN_INFO "Command line: %s\n", boot_command_line);
-
- /* Register a call for panic conditions. */
- atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
-
- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_writable_pagetables));
-
- early_ioremap_init();
-
- ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
- screen_info = boot_params.screen_info;
-
- if (is_initial_xendomain()) {
- const struct dom0_vga_console_info *info =
- (void *)((char *)xen_start_info +
- xen_start_info->console.dom0.info_off);
-
- dom0_init_screen_info(info,
- xen_start_info->console.dom0.info_size);
- xen_start_info->console.domU.mfn = 0;
- xen_start_info->console.domU.evtchn = 0;
- } else
- screen_info.orig_video_isVGA = 0;
-
- copy_edid();
-#else
- printk(KERN_INFO "Command line: %s\n", boot_command_line);
-
- ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
- screen_info = boot_params.screen_info;
- edid_info = boot_params.edid_info;
-#endif /* !CONFIG_XEN */
- saved_video_mode = boot_params.hdr.vid_mode;
- bootloader_type = boot_params.hdr.type_of_loader;
-
-#ifdef CONFIG_BLK_DEV_RAM
- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
- rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
- rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
-#endif
-#ifdef CONFIG_EFI
- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL64", 4))
- efi_enabled = 1;
-#endif
-
- ARCH_SETUP
-
- memory_setup();
- copy_edd();
-
- if (!boot_params.hdr.root_flags)
- root_mountflags &= ~MS_RDONLY;
- init_mm.start_code = (unsigned long) &_text;
- init_mm.end_code = (unsigned long) &_etext;
- init_mm.end_data = (unsigned long) &_edata;
- init_mm.brk = (unsigned long) &_end;
-
- code_resource.start = virt_to_phys(&_text);
- code_resource.end = virt_to_phys(&_etext)-1;
- data_resource.start = virt_to_phys(&_etext);
- data_resource.end = virt_to_phys(&_edata)-1;
- bss_resource.start = virt_to_phys(&__bss_start);
- bss_resource.end = virt_to_phys(&__bss_stop)-1;
-
- early_identify_cpu(&boot_cpu_data);
-
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
- *cmdline_p = command_line;
-
- parse_setup_data();
-
- parse_early_param();
-
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- if (init_ohci1394_dma_early)
- init_ohci1394_dma_on_all_controllers();
-#endif
-
- finish_e820_parsing();
-
-#ifndef CONFIG_XEN
- /* after parse_early_param, so could debug it */
- insert_resource(&iomem_resource, &code_resource);
- insert_resource(&iomem_resource, &data_resource);
- insert_resource(&iomem_resource, &bss_resource);
-#endif
-
- early_gart_iommu_check();
-
- e820_register_active_regions(0, 0, -1UL);
- /*
- * partially used pages are not usable - thus
- * we are rounding upwards:
- */
- end_pfn = e820_end_of_ram();
- /* update e820 for memory not covered by WB MTRRs */
- mtrr_bp_init();
-#ifndef CONFIG_XEN
- if (mtrr_trim_uncached_memory(end_pfn)) {
- e820_register_active_regions(0, 0, -1UL);
- end_pfn = e820_end_of_ram();
- }
-#endif
-
- num_physpages = end_pfn;
- max_mapnr = end_pfn;
-
- check_efer();
-
- max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
- if (efi_enabled)
- efi_init();
-
-#ifndef CONFIG_XEN
- vsmp_init();
-#endif
-
- if (is_initial_xendomain())
- dmi_scan_machine();
-
- io_delay_init();
-
-#ifdef CONFIG_KVM_CLOCK
- kvmclock_init();
-#endif
-
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- /* setup to use the early static init tables during kernel startup */
- x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
- x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
-#ifdef CONFIG_NUMA
- x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
-#endif
-#endif
-
- /* How many end-of-memory variables you have, grandma! */
- max_low_pfn = end_pfn;
- max_pfn = end_pfn;
- high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
-
- /* Remove active ranges so rediscovery with NUMA-awareness happens */
- remove_all_active_ranges();
-
-#ifdef CONFIG_ACPI_NUMA
- /*
- * Parse SRAT to discover nodes.
- */
- acpi_numa_init();
-#endif
-
-#ifdef CONFIG_NUMA
- numa_initmem_init(0, end_pfn);
-#else
- contig_initmem_init(0, end_pfn);
-#endif
-
-#ifndef CONFIG_XEN
- dma32_reserve_bootmem();
-
-#ifdef CONFIG_ACPI_SLEEP
- /*
- * Reserve low memory region for sleep support.
- */
- acpi_reserve_bootmem();
-#endif
-
- if (efi_enabled)
- efi_reserve_bootmem();
-#endif
-
-#ifdef CONFIG_BLK_DEV_INITRD
-#ifdef CONFIG_XEN
- if (xen_start_info->mod_start) {
- unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
- unsigned long ramdisk_size = xen_start_info->mod_len;
-#else
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
-#endif
- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
- unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
-
- if (ramdisk_end <= end_of_mem) {
- /*
- * don't need to reserve again, already reserved early
- * in x86_64_start_kernel, and early_res_to_bootmem
- * convert that to reserved in bootmem
- */
- initrd_start = ramdisk_image + PAGE_OFFSET;
- initrd_end = initrd_start+ramdisk_size;
-#ifdef CONFIG_XEN
- initrd_below_start_ok = 1;
-#endif
- } else {
- free_bootmem(ramdisk_image, ramdisk_size);
- printk(KERN_ERR "initrd extends beyond end of memory "
- "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
- ramdisk_end, end_of_mem);
- initrd_start = 0;
- }
- }
-#endif
- reserve_crashkernel();
-
- reserve_ibft_region();
-
- paging_init();
- map_vsyscall();
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Find and reserve possible boot-time SMP configuration:
- */
- find_smp_config();
-#endif
-#ifdef CONFIG_XEN
- {
- int i, j, k, fpp;
- unsigned long p2m_pages;
-
- p2m_pages = end_pfn;
- if (xen_start_info->nr_pages > end_pfn) {
- /*
- * the end_pfn was shrunk (probably by mem= or highmem=
- * kernel parameter); shrink reservation with the HV
- */
- struct xen_memory_reservation reservation = {
- .address_bits = 0,
- .extent_order = 0,
- .domid = DOMID_SELF
- };
- unsigned int difference;
- int ret;
-
- difference = xen_start_info->nr_pages - end_pfn;
-
- set_xen_guest_handle(reservation.extent_start,
- ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
- reservation.nr_extents = difference;
- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
- &reservation);
- BUG_ON (ret != difference);
- }
- else if (end_pfn > xen_start_info->nr_pages)
- p2m_pages = xen_start_info->nr_pages;
-
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- /* Make sure we have a large enough P->M table. */
- phys_to_machine_mapping = alloc_bootmem_pages(
- end_pfn * sizeof(unsigned long));
- memset(phys_to_machine_mapping, ~0,
- end_pfn * sizeof(unsigned long));
- memcpy(phys_to_machine_mapping,
- (unsigned long *)xen_start_info->mfn_list,
- p2m_pages * sizeof(unsigned long));
- free_bootmem(
- __pa(xen_start_info->mfn_list),
- PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
- sizeof(unsigned long))));
-
- /*
- * Initialise the list of the frames that specify the
- * list of frames that make up the p2m table. Used by
- * save/restore.
- */
- pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
-
- fpp = PAGE_SIZE/sizeof(unsigned long);
- for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
- if ((j % fpp) == 0) {
- k++;
- BUG_ON(k>=fpp);
- pfn_to_mfn_frame_list[k] =
- alloc_bootmem_pages(PAGE_SIZE);
- pfn_to_mfn_frame_list_list[k] =
- virt_to_mfn(pfn_to_mfn_frame_list[k]);
- j=0;
- }
- pfn_to_mfn_frame_list[k][j] =
- virt_to_mfn(&phys_to_machine_mapping[i]);
- }
- HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(pfn_to_mfn_frame_list_list);
- }
-
- /* Mark all ISA DMA channels in-use - using them wouldn't work. */
- for (i = 0; i < MAX_DMA_CHANNELS; ++i)
- if (i != 4 && request_dma(i, "xen") != 0)
- BUG();
- }
-
-#ifdef CONFIG_ACPI
- if (!is_initial_xendomain()) {
- acpi_disabled = 1;
- acpi_ht = 0;
- }
-#endif
-#endif
-
-#ifndef CONFIG_XEN
- early_quirks();
-#endif
-
-#ifdef CONFIG_ACPI
- /*
- * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
- * Call this early for SRAT node setup.
- */
- acpi_boot_table_init();
-
- /*
- * Read APIC and some other early information from ACPI tables.
- */
- acpi_boot_init();
-#endif
-
- init_cpu_to_node();
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- get_smp_config();
-#ifndef CONFIG_XEN
- init_apic_mappings();
- ioapic_init_mappings();
-#endif
-#endif
-#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
- prefill_possible_map();
-#endif
-
- kvm_guest_init();
-
- /*
- * We trust e820 completely. No explicit ROM probing in memory.
- */
-#ifdef CONFIG_XEN
- if (is_initial_xendomain())
- e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
-#else
- e820_reserve_resources(e820.map, e820.nr_map);
- e820_mark_nosave_regions();
-#endif
-
- /* request I/O space for devices used on all i[345]86 PCs */
- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
- request_resource(&ioport_resource, &standard_io_resources[i]);
-
-#ifdef CONFIG_XEN
- if (is_initial_xendomain())
- e820_setup_gap(machine_e820.map, machine_e820.nr_map);
-#else
- e820_setup_gap(e820.map, e820.nr_map);
-#endif
-
-#ifdef CONFIG_XEN
- {
- struct physdev_set_iopl set_iopl;
-
- set_iopl.iopl = 1;
- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
-
- if (is_initial_xendomain()) {
-#ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
- conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
-#endif
-#endif
- } else {
-#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
-#endif
- }
- }
-#else /* CONFIG_XEN */
-
-#ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
- if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
- conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
-#endif
-#endif
-
-#endif /* !CONFIG_XEN */
-
- /* do this before identify_cpu for boot cpu */
- check_enable_amd_mmconf_dmi();
-}
-
-#ifdef CONFIG_XEN
-static int
-xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
- HYPERVISOR_shutdown(SHUTDOWN_crash);
- /* we're never actually going to get here... */
- return NOTIFY_DONE;
-}
-#endif /* !CONFIG_XEN */
-
-
-static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
-{
- unsigned int *v;
-
- if (c->extended_cpuid_level < 0x80000004)
- return 0;
-
- v = (unsigned int *) c->x86_model_id;
- cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
- cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
- cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
- c->x86_model_id[48] = 0;
- return 1;
-}
-
-
-static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
-{
- unsigned int n, dummy, eax, ebx, ecx, edx;
-
- n = c->extended_cpuid_level;
-
- if (n >= 0x80000005) {
- cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
- printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
- "D cache %dK (%d bytes/line)\n",
- edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
- c->x86_cache_size = (ecx>>24) + (edx>>24);
- /* On K8 L1 TLB is inclusive, so don't count it */
- c->x86_tlbsize = 0;
- }
-
- if (n >= 0x80000006) {
- cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
- ecx = cpuid_ecx(0x80000006);
- c->x86_cache_size = ecx >> 16;
- c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
-
- printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
- c->x86_cache_size, ecx & 0xFF);
- }
- if (n >= 0x80000008) {
- cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- }
-}
-
-#ifdef CONFIG_NUMA
-static int __cpuinit nearby_node(int apicid)
-{
- int i, node;
-
- for (i = apicid - 1; i >= 0; i--) {
- node = apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
- node = apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- return first_node(node_online_map); /* Shouldn't happen */
-}
-#endif
-
-/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
- * Assumes number of cores is a power of two.
- */
-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
-{
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- unsigned bits;
-#ifdef CONFIG_NUMA
- int cpu = smp_processor_id();
- int node = 0;
- unsigned apicid = hard_smp_processor_id();
-#endif
- bits = c->x86_coreid_bits;
-
- /* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
- /* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
-
-#ifdef CONFIG_NUMA
- node = c->phys_proc_id;
- if (apicid_to_node[apicid] != NUMA_NO_NODE)
- node = apicid_to_node[apicid];
- if (!node_online(node)) {
- /* Two possibilities here:
- - The CPU is missing memory and no node was created.
- In that case try picking one from a nearby CPU
- - The APIC IDs differ from the HyperTransport node IDs
- which the K8 northbridge parsing fills in.
- Assume they are all increased by a constant offset,
- but in the same order as the HT nodeids.
- If that doesn't result in a usable node fall back to the
- path for the previous case. */
-
- int ht_nodeid = c->initial_apicid;
-
- if (ht_nodeid >= 0 &&
- apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
- node = apicid_to_node[ht_nodeid];
- /* Pick a nearby node */
- if (!node_online(node))
- node = nearby_node(apicid);
- }
- numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-#endif
-}
-
-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
-{
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- unsigned bits, ecx;
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level < 0x80000008)
- return;
-
- ecx = cpuid_ecx(0x80000008);
-
- c->x86_max_cores = (ecx & 0xff) + 1;
-
- /* CPU telling us the core id bits shift? */
- bits = (ecx >> 12) & 0xF;
-
- /* Otherwise recompute */
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
- }
-
- c->x86_coreid_bits = bits;
-
-#endif
-}
-
-#define ENABLE_C1E_MASK 0x18000000
-#define CPUID_PROCESSOR_SIGNATURE 1
-#define CPUID_XFAM 0x0ff00000
-#define CPUID_XFAM_K8 0x00000000
-#define CPUID_XFAM_10H 0x00100000
-#define CPUID_XFAM_11H 0x00200000
-#define CPUID_XMOD 0x000f0000
-#define CPUID_XMOD_REV_F 0x00040000
-
-#ifndef CONFIG_XEN
-/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
-static __cpuinit int amd_apic_timer_broken(void)
-{
- u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
-
- switch (eax & CPUID_XFAM) {
- case CPUID_XFAM_K8:
- if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
- break;
- case CPUID_XFAM_10H:
- case CPUID_XFAM_11H:
- rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
- if (lo & ENABLE_C1E_MASK)
- return 1;
- break;
- default:
- /* err on the side of caution */
- return 1;
- }
- return 0;
-}
-#endif
-
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
-{
- early_init_amd_mc(c);
-
- /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
- if (c->x86_power & (1<<8))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
-{
- unsigned level;
-
-#ifdef CONFIG_SMP
- unsigned long value;
-
- /*
- * Disable TLB flush filter by setting HWCR.FFDIS on K8
- * bit 6 of msr C001_0015
- *
- * Errata 63 for SH-B3 steppings
- * Errata 122 for all steppings (F+ have it disabled by default)
- */
- if (c->x86 == 15) {
- rdmsrl(MSR_K8_HWCR, value);
- value |= 1 << 6;
- wrmsrl(MSR_K8_HWCR, value);
- }
-#endif
-
- /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
- clear_cpu_cap(c, 0*32+31);
-
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- level = cpuid_eax(1);
- if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
- level >= 0x0f58))
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- if (c->x86 == 0x10 || c->x86 == 0x11)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
- /* Enable workaround for FXSAVE leak */
- if (c->x86 >= 6)
- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
- level = get_model_name(c);
- if (!level) {
- switch (c->x86) {
- case 15:
- /* Should distinguish Models here, but this is only
- a fallback anyways. */
- strcpy(c->x86_model_id, "Hammer");
- break;
- }
- }
- display_cacheinfo(c);
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level >= 0x80000008)
- amd_detect_cmp(c);
-
- if (c->extended_cpuid_level >= 0x80000006 &&
- (cpuid_edx(0x80000006) & 0xf000))
- num_cache_leaves = 4;
- else
- num_cache_leaves = 3;
-
- if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
- set_cpu_cap(c, X86_FEATURE_K8);
-
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
-
- if (c->x86 == 0x10)
- fam10h_check_enable_mmcfg();
-
-#ifndef CONFIG_XEN
- if (amd_apic_timer_broken())
- disable_apic_timer = 1;
-
- if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
- unsigned long long tseg;
-
- /*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
- */
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
- (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
- set_memory_4k((unsigned long)__va(tseg), 1);
- }
-#endif
-}
-
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-{
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- u32 eax, ebx, ecx, edx;
- int index_msb, core_bits;
-
- cpuid(1, &eax, &ebx, &ecx, &edx);
-
-
- if (!cpu_has(c, X86_FEATURE_HT))
- return;
- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
- goto out;
-
- smp_num_siblings = (ebx & 0xff0000) >> 16;
-
- if (smp_num_siblings == 1) {
- printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1) {
-
- if (smp_num_siblings > NR_CPUS) {
- printk(KERN_WARNING "CPU: Unsupported number of "
- "siblings %d", smp_num_siblings);
- smp_num_siblings = 1;
- return;
- }
-
- index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = phys_pkg_id(index_msb);
-
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
- index_msb = get_count_order(smp_num_siblings);
-
- core_bits = get_count_order(c->x86_max_cores);
-
- c->cpu_core_id = phys_pkg_id(index_msb) &
- ((1 << core_bits) - 1);
- }
-out:
- if ((c->x86_max_cores * smp_num_siblings) > 1) {
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- }
-
-#endif
-}
-
-#ifndef CONFIG_XEN
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
-{
- unsigned int eax, t;
-
- if (c->cpuid_level < 4)
- return 1;
-
- cpuid_count(4, 0, &eax, &t, &t, &t);
-
- if (eax & 0x1f)
- return ((eax >> 26) + 1);
- else
- return 1;
-}
-#endif
-
-static void __cpuinit srat_detect_node(void)
-{
-#ifdef CONFIG_NUMA
- unsigned node;
- int cpu = smp_processor_id();
- int apicid = hard_smp_processor_id();
-
- /* Don't do the funky fallback heuristics the AMD version employs
- for now. */
- node = apicid_to_node[apicid];
- if (node == NUMA_NO_NODE || !node_online(node))
- node = first_node(node_online_map);
- numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-}
-
-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
-{
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
-{
- /* Cache sizes */
- unsigned n;
-
- init_intel_cacheinfo(c);
- if (c->cpuid_level > 9) {
- unsigned eax = cpuid_eax(10);
- /* Check for version and the number of counters */
- if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
- set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
- }
-
- if (cpu_has_ds) {
- unsigned int l1, l2;
- rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
- if (!(l1 & (1<<11)))
- set_cpu_cap(c, X86_FEATURE_BTS);
- if (!(l1 & (1<<12)))
- set_cpu_cap(c, X86_FEATURE_PEBS);
- }
-
-
- if (cpu_has_bts)
- ds_init_intel(c);
-
- n = c->extended_cpuid_level;
- if (n >= 0x80000008) {
- unsigned eax = cpuid_eax(0x80000008);
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- /* CPUID workaround for Intel 0F34 CPU */
- if (c->x86_vendor == X86_VENDOR_INTEL &&
- c->x86 == 0xF && c->x86_model == 0x3 &&
- c->x86_mask == 0x4)
- c->x86_phys_bits = 36;
- }
-
- if (c->x86 == 15)
- c->x86_cache_alignment = c->x86_clflush_size * 2;
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-#ifndef CONFIG_XEN
- c->x86_max_cores = intel_num_cpu_cores(c);
-#endif
-
- srat_detect_node();
-}
-
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
-{
- if (c->x86 == 0x6 && c->x86_model >= 0xf)
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
-{
- /* Cache sizes */
- unsigned n;
-
- n = c->extended_cpuid_level;
- if (n >= 0x80000008) {
- unsigned eax = cpuid_eax(0x80000008);
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- }
-
- if (c->x86 == 0x6 && c->x86_model >= 0xf) {
- c->x86_cache_alignment = c->x86_clflush_size * 2;
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- }
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-}
-
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
-{
- char *v = c->x86_vendor_id;
-
- if (!strcmp(v, "AuthenticAMD"))
- c->x86_vendor = X86_VENDOR_AMD;
- else if (!strcmp(v, "GenuineIntel"))
- c->x86_vendor = X86_VENDOR_INTEL;
- else if (!strcmp(v, "CentaurHauls"))
- c->x86_vendor = X86_VENDOR_CENTAUR;
- else
- c->x86_vendor = X86_VENDOR_UNKNOWN;
-}
-
-/* Do some early cpuid on the boot CPU to get some parameter that are
- needed before check_bugs. Everything advanced is in identify_cpu
- below. */
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
-{
- u32 tfms, xlvl;
-
- c->loops_per_jiffy = loops_per_jiffy;
- c->x86_cache_size = -1;
- c->x86_vendor = X86_VENDOR_UNKNOWN;
- c->x86_model = c->x86_mask = 0; /* So far unknown... */
- c->x86_vendor_id[0] = '\0'; /* Unset */
- c->x86_model_id[0] = '\0'; /* Unset */
- c->x86_clflush_size = 64;
- c->x86_cache_alignment = c->x86_clflush_size;
-#ifndef CONFIG_XEN
- c->x86_max_cores = 1;
- c->x86_coreid_bits = 0;
-#endif
- c->extended_cpuid_level = 0;
- memset(&c->x86_capability, 0, sizeof c->x86_capability);
-
- /* Get vendor name */
- cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
- (unsigned int *)&c->x86_vendor_id[0],
- (unsigned int *)&c->x86_vendor_id[8],
- (unsigned int *)&c->x86_vendor_id[4]);
-
- get_cpu_vendor(c);
-
- /* Initialize the standard set of capabilities */
- /* Note that the vendor-specific code below might override */
-
- /* Intel-defined flags: level 0x00000001 */
- if (c->cpuid_level >= 0x00000001) {
- __u32 misc;
- cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
- &c->x86_capability[0]);
- c->x86 = (tfms >> 8) & 0xf;
- c->x86_model = (tfms >> 4) & 0xf;
- c->x86_mask = tfms & 0xf;
- if (c->x86 == 0xf)
- c->x86 += (tfms >> 20) & 0xff;
- if (c->x86 >= 0x6)
- c->x86_model += ((tfms >> 16) & 0xF) << 4;
- if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
- c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
- } else {
- /* Have CPUID level 0 only - unheard of */
- c->x86 = 4;
- }
-
-#ifndef CONFIG_XEN
- c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
-#ifdef CONFIG_SMP
- c->phys_proc_id = c->initial_apicid;
-#endif
-#endif
- /* AMD-defined flags: level 0x80000001 */
- xlvl = cpuid_eax(0x80000000);
- c->extended_cpuid_level = xlvl;
- if ((xlvl & 0xffff0000) == 0x80000000) {
- if (xlvl >= 0x80000001) {
- c->x86_capability[1] = cpuid_edx(0x80000001);
- c->x86_capability[6] = cpuid_ecx(0x80000001);
- }
- if (xlvl >= 0x80000004)
- get_model_name(c); /* Default name */
- }
-
- /* Transmeta-defined flags: level 0x80860001 */
- xlvl = cpuid_eax(0x80860000);
- if ((xlvl & 0xffff0000) == 0x80860000) {
- /* Don't set x86_cpuid_level here for now to not confuse. */
- if (xlvl >= 0x80860001)
- c->x86_capability[2] = cpuid_edx(0x80860001);
- }
-
- c->extended_cpuid_level = cpuid_eax(0x80000000);
- if (c->extended_cpuid_level >= 0x80000007)
- c->x86_power = cpuid_edx(0x80000007);
-
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- early_init_amd(c);
- break;
- case X86_VENDOR_INTEL:
- early_init_intel(c);
- break;
- case X86_VENDOR_CENTAUR:
- early_init_centaur(c);
- break;
- }
-
- validate_pat_support(c);
-}
-
-/*
- * This does the hard work of actually picking apart the CPU stuff...
- */
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
-{
- int i;
-
- early_identify_cpu(c);
-
- init_scattered_cpuid_features(c);
-
-#ifndef CONFIG_XEN
- c->apicid = phys_pkg_id(0);
-#endif
-
- /*
- * Vendor-specific initialization. In this section we
- * canonicalize the feature flags, meaning if there are
- * features a certain CPU supports which CPUID doesn't
- * tell us, CPUID claiming incorrect flags, or other bugs,
- * we handle them here.
- *
- * At the end of this section, c->x86_capability better
- * indicate the features this CPU genuinely supports!
- */
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- init_amd(c);
- break;
-
- case X86_VENDOR_INTEL:
- init_intel(c);
- break;
-
- case X86_VENDOR_CENTAUR:
- init_centaur(c);
- break;
-
- case X86_VENDOR_UNKNOWN:
- default:
- display_cacheinfo(c);
- break;
- }
-
- detect_ht(c);
-
- /*
- * On SMP, boot_cpu_data holds the common feature set between
- * all CPUs; so make sure that we indicate which features are
- * common between the CPUs. The first time this routine gets
- * executed, c == &boot_cpu_data.
- */
- if (c != &boot_cpu_data) {
- /* AND the already accumulated flags with these */
- for (i = 0; i < NCAPINTS; i++)
- boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
- }
-
- /* Clear all flags overriden by options */
- for (i = 0; i < NCAPINTS; i++)
- c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
-#ifdef CONFIG_X86_MCE
- mcheck_init(c);
-#endif
- select_idle_routine(c);
-
-#ifdef CONFIG_NUMA
- numa_add_cpu(smp_processor_id());
-#endif
-
-}
-
-void __cpuinit identify_boot_cpu(void)
-{
- identify_cpu(&boot_cpu_data);
-}
-
-void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
-{
- BUG_ON(c == &boot_cpu_data);
- identify_cpu(c);
- mtrr_ap_init();
-}
-
-static __init int setup_noclflush(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
- return 1;
-}
-__setup("noclflush", setup_noclflush);
-
-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
-{
- if (c->x86_model_id[0])
- printk(KERN_CONT "%s", c->x86_model_id);
-
- if (c->x86_mask || c->cpuid_level >= 0)
- printk(KERN_CONT " stepping %02x\n", c->x86_mask);
- else
- printk(KERN_CONT "\n");
-}
-
-static __init int setup_disablecpuid(char *arg)
-{
- int bit;
- if (get_option(&arg, &bit) && bit < NCAPINTS*32)
- setup_clear_cpu_cap(bit);
- else
- return 0;
- return 1;
-}
-__setup("clearcpuid=", setup_disablecpuid);
--- head-2011-03-11.orig/arch/x86/kernel/smp-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/smp-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -121,132 +121,14 @@ void xen_smp_send_reschedule(int cpu)
send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);
-
-struct call_data_struct {
- void (*func) (void *info);
- void *info;
- atomic_t started;
- atomic_t finished;
- int wait;
-};
-
-void lock_ipi_call_lock(void)
+void xen_send_call_func_single_ipi(int cpu)
{
- spin_lock_irq(&call_lock);
+ send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_VECTOR);
}
-void unlock_ipi_call_lock(void)
+void xen_send_call_func_ipi(cpumask_t mask)
{
- spin_unlock_irq(&call_lock);
-}
-
-static struct call_data_struct *call_data;
-
-static void __smp_call_function(void (*func) (void *info), void *info,
- int nonatomic, int wait)
-{
- struct call_data_struct data;
- int cpus = num_online_cpus() - 1;
-
- if (!cpus)
- return;
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- mb();
-
- /* Send a message to all other CPUs and wait for them to respond */
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- cpu_relax();
-
- if (wait)
- while (atomic_read(&data.finished) != cpus)
- cpu_relax();
-}
-
-
-/**
- * smp_call_function_mask(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on. Must not include the current cpu.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-int
-xen_smp_call_function_mask(cpumask_t mask,
- void (*func)(void *), void *info,
- int wait)
-{
- struct call_data_struct data;
- cpumask_t allbutself;
- int cpus;
-
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
- /* Holding any lock stops cpus from going down. */
- spin_lock(&call_lock);
-
- allbutself = cpu_online_map;
- cpu_clear(smp_processor_id(), allbutself);
-
- cpus_and(mask, mask, allbutself);
- cpus = cpus_weight(mask);
-
- if (!cpus) {
- spin_unlock(&call_lock);
- return 0;
- }
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- wmb();
-
- /* Send a message to other CPUs */
- if (cpus_equal(mask, allbutself) &&
- cpus_equal(cpu_online_map, cpu_callout_map))
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
- else
- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- cpu_relax();
-
- if (wait)
- while (atomic_read(&data.finished) != cpus)
- cpu_relax();
- spin_unlock(&call_lock);
-
- return 0;
+ send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
}
static void stop_this_cpu(void *dummy)
@@ -268,15 +150,10 @@ static void stop_this_cpu(void *dummy)
void xen_smp_send_stop(void)
{
- int nolock;
unsigned long flags;
- /* Don't deadlock on the call lock in panic */
- nolock = !spin_trylock(&call_lock);
+ smp_call_function(stop_this_cpu, NULL, 0);
local_irq_save(flags);
- __smp_call_function(stop_this_cpu, NULL, 0, 0);
- if (!nolock)
- spin_unlock(&call_lock);
disable_all_local_evtchn();
local_irq_restore(flags);
}
@@ -298,30 +175,24 @@ irqreturn_t smp_reschedule_interrupt(int
irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
{
- void (*func) (void *info) = call_data->func;
- void *info = call_data->info;
- int wait = call_data->wait;
-
- /*
- * Notify initiating CPU that I've grabbed the data and am
- * about to execute the function
- */
- mb();
- atomic_inc(&call_data->started);
- /*
- * At this point the info structure may be out of scope unless wait==1
- */
- (*func)(info);
+ generic_smp_call_function_interrupt();
#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
#else
add_pda(irq_call_count, 1);
#endif
- if (wait) {
- mb();
- atomic_inc(&call_data->finished);
- }
+ return IRQ_HANDLED;
+}
+
+irqreturn_t smp_call_function_single_interrupt(int irq, void *dev_id)
+{
+ generic_smp_call_function_single_interrupt();
+#ifdef CONFIG_X86_32
+ __get_cpu_var(irq_stat).irq_call_count++;
+#else
+ add_pda(irq_call_count, 1);
+#endif
return IRQ_HANDLED;
}
--- head-2011-03-11.orig/arch/x86/kernel/time-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/time-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -460,7 +460,7 @@ irqreturn_t timer_interrupt(int irq, voi
/* Keep nmi watchdog up to date */
#ifdef __i386__
- per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
+ x86_add_percpu(irq_stat.irq0_irqs, 1);
#else
add_pda(irq0_irqs, 1);
#endif
@@ -750,9 +750,7 @@ void __init time_init(void)
update_wallclock();
-#ifndef CONFIG_X86_64
use_tsc_delay();
-#endif
/* Cannot request_irq() until kmem is initialised. */
late_time_init = setup_cpu0_timer_irq;
@@ -809,7 +807,8 @@ static void stop_hz_timer(void)
/* Leave ourselves in tick mode if rcu or softirq or timer pending. */
if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
- (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
+ (j = get_next_timer_interrupt(jiffies),
+ time_before_eq(j, jiffies))) {
cpu_clear(cpu, nohz_cpu_mask);
j = jiffies + 1;
}
--- head-2011-03-11.orig/arch/x86/kernel/traps_32-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/traps_32-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -1,5 +1,6 @@
/*
* Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
@@ -57,11 +58,10 @@
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/io.h>
+#include <asm/traps.h>
#include "mach_traps.h"
-int panic_on_unrecovered_nmi;
-
#ifndef CONFIG_XEN
DECLARE_BITMAP(used_vectors, NR_VECTORS);
EXPORT_SYMBOL_GPL(used_vectors);
@@ -82,43 +82,22 @@ gate_desc idt_table[256]
__attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
#endif
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void alignment_check(void);
-#ifndef CONFIG_XEN
-asmlinkage void spurious_interrupt_bug(void);
-#else
-asmlinkage void fixup_4gb_segment(void);
-#endif
-asmlinkage void machine_check(void);
-
+int panic_on_unrecovered_nmi;
int kstack_depth_to_print = 24;
static unsigned int code_bytes = 64;
+static int ignore_nmis;
+static int die_counter;
void printk_address(unsigned long address, int reliable)
{
#ifdef CONFIG_KALLSYMS
- char namebuf[KSYM_NAME_LEN];
unsigned long offset = 0;
unsigned long symsize;
const char *symname;
- char reliab[4] = "";
- char *delim = ":";
char *modname;
+ char *delim = ":";
+ char namebuf[KSYM_NAME_LEN];
+ char reliab[4] = "";
symname = kallsyms_lookup(address, &symsize, &offset,
&modname, namebuf);
@@ -138,22 +117,23 @@ void printk_address(unsigned long addres
#endif
}
-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+ void *p, unsigned int size)
{
- return p > (void *)tinfo &&
- p <= (void *)tinfo + THREAD_SIZE - size;
+ void *t = tinfo;
+ return p > t && p <= t + THREAD_SIZE - size;
}
/* The form of the top of the frame on the stack */
struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
+ struct stack_frame *next_frame;
+ unsigned long return_address;
};
static inline unsigned long
print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data)
{
struct stack_frame *frame = (struct stack_frame *)bp;
@@ -175,8 +155,6 @@ print_context_stack(struct thread_info *
return bp;
}
-#define MSG(msg) ops->warning(data, msg)
-
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
@@ -186,7 +164,6 @@ void dump_trace(struct task_struct *task
if (!stack) {
unsigned long dummy;
-
stack = &dummy;
if (task != current)
stack = (unsigned long *)task->thread.sp;
@@ -204,7 +181,7 @@ void dump_trace(struct task_struct *task
}
#endif
- while (1) {
+ for (;;) {
struct thread_info *context;
context = (struct thread_info *)
@@ -256,15 +233,15 @@ static void print_trace_address(void *da
}
static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
};
static void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
+ unsigned long *stack, unsigned long bp, char *log_lvl)
{
dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
printk("%s =======================\n", log_lvl);
@@ -359,15 +336,14 @@ void show_registers(struct pt_regs *regs
printk(KERN_EMERG "Code: ");
ip = (u8 *)regs->ip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
+ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
/* try starting at EIP */
ip = (u8 *)regs->ip;
code_len = code_len - code_prologue + 1;
}
for (i = 0; i < code_len; i++, ip++) {
if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
+ probe_kernel_address(ip, c)) {
printk(" Bad EIP value.");
break;
}
@@ -392,7 +368,53 @@ int is_valid_bugaddr(unsigned long ip)
return ud2 == 0x0b0f;
}
-static int die_counter;
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+ unsigned long flags;
+
+ oops_enter();
+
+ if (die_owner != raw_smp_processor_id()) {
+ console_verbose();
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&die_lock);
+ die_owner = smp_processor_id();
+ die_nest_count = 0;
+ bust_spinlocks(1);
+ } else {
+ raw_local_irq_save(flags);
+ }
+ die_nest_count++;
+ return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+ bust_spinlocks(0);
+ die_owner = -1;
+ add_taint(TAINT_DIE);
+ __raw_spin_unlock(&die_lock);
+ raw_local_irq_restore(flags);
+
+ if (!regs)
+ return;
+
+ if (kexec_should_crash(current))
+ crash_kexec(regs);
+
+ if (in_interrupt())
+ panic("Fatal exception in interrupt");
+
+ if (panic_on_oops)
+ panic("Fatal exception");
+
+ oops_exit();
+ do_exit(signr);
+}
int __kprobes __die(const char *str, struct pt_regs *regs, long err)
{
@@ -410,26 +432,22 @@ int __kprobes __die(const char *str, str
printk("DEBUG_PAGEALLOC");
#endif
printk("\n");
-
if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ return 1;
- show_registers(regs);
- /* Executive summary in case the oops scrolled away */
- sp = (unsigned long) (&regs->sp);
- savesegment(ss, ss);
- if (user_mode(regs)) {
- sp = regs->sp;
- ss = regs->ss & 0xffff;
- }
- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
- print_symbol("%s", regs->ip);
- printk(" SS:ESP %04x:%08lx\n", ss, sp);
-
- return 0;
- }
-
- return 1;
+ show_registers(regs);
+ /* Executive summary in case the oops scrolled away */
+ sp = (unsigned long) (&regs->sp);
+ savesegment(ss, ss);
+ if (user_mode(regs)) {
+ sp = regs->sp;
+ ss = regs->ss & 0xffff;
+ }
+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+ print_symbol("%s", regs->ip);
+ printk(" SS:ESP %04x:%08lx\n", ss, sp);
+ return 0;
}
/*
@@ -438,31 +456,9 @@ int __kprobes __die(const char *str, str
*/
void die(const char *str, struct pt_regs *regs, long err)
{
- static struct {
- raw_spinlock_t lock;
- u32 lock_owner;
- int lock_owner_depth;
- } die = {
- .lock = __RAW_SPIN_LOCK_UNLOCKED,
- .lock_owner = -1,
- .lock_owner_depth = 0
- };
- unsigned long flags;
-
- oops_enter();
+ unsigned long flags = oops_begin();
- if (die.lock_owner != raw_smp_processor_id()) {
- console_verbose();
- raw_local_irq_save(flags);
- __raw_spin_lock(&die.lock);
- die.lock_owner = smp_processor_id();
- die.lock_owner_depth = 0;
- bust_spinlocks(1);
- } else {
- raw_local_irq_save(flags);
- }
-
- if (++die.lock_owner_depth < 3) {
+ if (die_nest_count < 3) {
report_bug(regs->ip, regs);
if (__die(str, regs, err))
@@ -471,26 +467,7 @@ void die(const char *str, struct pt_regs
printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
}
- bust_spinlocks(0);
- die.lock_owner = -1;
- add_taint(TAINT_DIE);
- __raw_spin_unlock(&die.lock);
- raw_local_irq_restore(flags);
-
- if (!regs)
- return;
-
- if (kexec_should_crash(current))
- crash_kexec(regs);
-
- if (in_interrupt())
- panic("Fatal exception in interrupt");
-
- if (panic_on_oops)
- panic("Fatal exception");
-
- oops_exit();
- do_exit(SIGSEGV);
+ oops_end(flags, regs, SIGSEGV);
}
static inline void
@@ -554,7 +531,7 @@ void do_##name(struct pt_regs *regs, lon
{ \
trace_hardirqs_fixup(); \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ == NOTIFY_STOP) \
return; \
do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
}
@@ -570,7 +547,7 @@ void do_##name(struct pt_regs *regs, lon
info.si_code = sicode; \
info.si_addr = (void __user *)siaddr; \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ == NOTIFY_STOP) \
return; \
do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
}
@@ -579,7 +556,7 @@ void do_##name(struct pt_regs *regs, lon
void do_##name(struct pt_regs *regs, long error_code) \
{ \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ == NOTIFY_STOP) \
return; \
do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
}
@@ -594,28 +571,29 @@ void do_##name(struct pt_regs *regs, lon
info.si_addr = (void __user *)siaddr; \
trace_hardirqs_fixup(); \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ == NOTIFY_STOP) \
return; \
do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
}
-DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
#ifndef CONFIG_KPROBES
DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
#endif
DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
-DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
-DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
-void __kprobes do_general_protection(struct pt_regs * regs,
- long error_code)
+void __kprobes
+do_general_protection(struct pt_regs *regs, long error_code)
{
+ struct task_struct *tsk;
struct thread_struct *thread;
thread = &current->thread;
@@ -623,23 +601,24 @@ void __kprobes do_general_protection(str
if (regs->flags & X86_VM_MASK)
goto gp_in_vm86;
+ tsk = current;
if (!user_mode(regs))
goto gp_in_kernel;
- current->thread.error_code = error_code;
- current->thread.trap_no = 13;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 13;
- if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
- printk_ratelimit()) {
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
printk(KERN_INFO
- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
- current->comm, task_pid_nr(current),
- regs->ip, regs->sp, error_code);
+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ tsk->comm, task_pid_nr(tsk),
+ regs->ip, regs->sp, error_code);
print_vma_addr(" in ", regs->ip);
printk("\n");
}
- force_sig(SIGSEGV, current);
+ force_sig(SIGSEGV, tsk);
return;
gp_in_vm86:
@@ -648,14 +627,15 @@ gp_in_vm86:
return;
gp_in_kernel:
- if (!fixup_exception(regs)) {
- current->thread.error_code = error_code;
- current->thread.trap_no = 13;
- if (notify_die(DIE_GPF, "general protection fault", regs,
+ if (fixup_exception(regs))
+ return;
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 13;
+ if (notify_die(DIE_GPF, "general protection fault", regs,
error_code, 13, SIGSEGV) == NOTIFY_STOP)
- return;
- die("general protection fault", regs, error_code);
- }
+ return;
+ die("general protection fault", regs, error_code);
}
static notrace __kprobes void
@@ -722,9 +702,9 @@ unknown_nmi_error(unsigned char reason,
static DEFINE_SPINLOCK(nmi_print_lock);
-void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
{
- if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
return;
spin_lock(&nmi_print_lock);
@@ -733,10 +713,12 @@ void notrace __kprobes die_nmi(struct pt
* to get a message out:
*/
bust_spinlocks(1);
- printk(KERN_EMERG "%s", msg);
+ printk(KERN_EMERG "%s", str);
printk(" on CPU%d, ip %08lx, registers:\n",
smp_processor_id(), regs->ip);
show_registers(regs);
+ if (do_panic)
+ panic("Non maskable interrupt");
console_silent();
spin_unlock(&nmi_print_lock);
bust_spinlocks(0);
@@ -756,14 +738,17 @@ void notrace __kprobes die_nmi(struct pt
static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
+ int cpu;
- /* Only the BSP gets external NMIs from the system: */
- if (!smp_processor_id())
+ cpu = smp_processor_id();
+
+ /* Only the BSP gets external NMIs from the system. */
+ if (!cpu)
reason = get_nmi_reason();
if (!(reason & 0xc0)) {
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
+ == NOTIFY_STOP)
return;
#ifdef CONFIG_X86_LOCAL_APIC
/*
@@ -772,7 +757,7 @@ static notrace __kprobes void default_do
*/
if (nmi_watchdog_tick(regs, reason))
return;
- if (!do_nmi_callback(regs, smp_processor_id()))
+ if (!do_nmi_callback(regs, cpu))
unknown_nmi_error(reason, regs);
#else
unknown_nmi_error(reason, regs);
@@ -782,6 +767,8 @@ static notrace __kprobes void default_do
}
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
return;
+
+ /* AK: following checks seem to be broken on modern chipsets. FIXME */
if (reason & 0x80)
mem_parity_error(reason, regs);
if (reason & 0x40)
@@ -793,8 +780,6 @@ static notrace __kprobes void default_do
reassert_nmi();
}
-static int ignore_nmis;
-
notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
{
int cpu;
@@ -879,7 +864,7 @@ void __kprobes do_debug(struct pt_regs *
tsk->thread.debugctlmsr = 0;
if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
- SIGTRAP) == NOTIFY_STOP)
+ SIGTRAP) == NOTIFY_STOP)
return;
/* It's safe to allow irq's after DR6 has been saved */
if (regs->flags & X86_EFLAGS_IF)
@@ -940,9 +925,8 @@ clear_TF_reenable:
void math_error(void __user *ip)
{
struct task_struct *task;
- unsigned short cwd;
- unsigned short swd;
siginfo_t info;
+ unsigned short cwd, swd;
/*
* Save the info for the exception handler and clear the error.
@@ -961,7 +945,7 @@ void math_error(void __user *ip)
* C1 reg you need in case of a stack fault, 0x040 is the stack
* fault bit. We should only be taking one exception at a time,
* so if this combination doesn't produce any single exception,
- * then we have a bad program that isn't syncronizing its FPU usage
+ * then we have a bad program that isn't synchronizing its FPU usage
* and it will suffer the consequences since we won't be able to
* fully reproduce the context of the exception
*/
@@ -970,7 +954,7 @@ void math_error(void __user *ip)
switch (swd & ~cwd & 0x3f) {
case 0x000: /* No unmasked exception */
return;
- default: /* Multiple exceptions */
+ default: /* Multiple exceptions */
break;
case 0x001: /* Invalid Op */
/*
@@ -1006,8 +990,8 @@ void do_coprocessor_error(struct pt_regs
static void simd_math_error(void __user *ip)
{
struct task_struct *task;
- unsigned short mxcsr;
siginfo_t info;
+ unsigned short mxcsr;
/*
* Save the info for the exception handler and clear the error.
@@ -1084,7 +1068,7 @@ void do_spurious_interrupt_bug(struct pt
unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
{
- struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
+ struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
unsigned long base = (kesp - uesp) & -THREAD_SIZE;
unsigned long new_kesp = kesp - base;
unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
--- head-2011-03-11.orig/arch/x86/kernel/traps_64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/traps_64-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -10,73 +10,56 @@
* 'Traps.c' handles hardware traps and faults after we have saved some
* state in 'entry.S'.
*/
-#include <linux/sched.h>
+#include <linux/moduleparam.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/spinlock.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/kdebug.h>
#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
#include <linux/string.h>
+#include <linux/unwind.h>
+#include <linux/delay.h>
#include <linux/errno.h>
-#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
#include <linux/timer.h>
-#include <linux/mm.h>
#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/nmi.h>
-#include <linux/kprobes.h>
-#include <linux/kexec.h>
-#include <linux/unwind.h>
-#include <linux/uaccess.h>
#include <linux/bug.h>
-#include <linux/kdebug.h>
-#include <linux/utsname.h>
-
-#include <mach_traps.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/atomic.h>
+#include <asm/stacktrace.h>
+#include <asm/processor.h>
#include <asm/debugreg.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/unwind.h>
#include <asm/desc.h>
#include <asm/i387.h>
-#include <asm/processor.h>
-#include <asm/unwind.h>
+#include <asm/nmi.h>
#include <asm/smp.h>
+#include <asm/io.h>
#include <asm/pgalloc.h>
-#include <asm/pda.h>
#include <asm/proto.h>
-#include <asm/nmi.h>
-#include <asm/stacktrace.h>
+#include <asm/pda.h>
+#include <asm/traps.h>
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void double_fault(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void reserved(void);
-asmlinkage void alignment_check(void);
-asmlinkage void machine_check(void);
-asmlinkage void spurious_interrupt_bug(void);
+#include <mach_traps.h>
+int panic_on_unrecovered_nmi;
+int kstack_depth_to_print = 12;
static unsigned int code_bytes = 64;
+static int ignore_nmis;
+static int die_counter;
static inline void conditional_sti(struct pt_regs *regs)
{
@@ -100,34 +83,9 @@ static inline void preempt_conditional_c
dec_preempt_count();
}
-int kstack_depth_to_print = 12;
-
void printk_address(unsigned long address, int reliable)
{
-#ifdef CONFIG_KALLSYMS
- unsigned long offset = 0, symsize;
- const char *symname;
- char *modname;
- char *delim = ":";
- char namebuf[KSYM_NAME_LEN];
- char reliab[4] = "";
-
- symname = kallsyms_lookup(address, &symsize, &offset,
- &modname, namebuf);
- if (!symname) {
- printk(" [<%016lx>]\n", address);
- return;
- }
- if (!reliable)
- strcpy(reliab, "? ");
-
- if (!modname)
- modname = delim = "";
- printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
- address, reliab, delim, modname, delim, symname, offset, symsize);
-#else
- printk(" [<%016lx>]\n", address);
-#endif
+ printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
}
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -206,8 +164,6 @@ static unsigned long *in_exception_stack
return NULL;
}
-#define MSG(txt) ops->warning(data, txt)
-
/*
* x86-64 can have up to three kernel stacks:
* process stack
@@ -234,11 +190,11 @@ struct stack_frame {
unsigned long return_address;
};
-
-static inline unsigned long print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end)
+static inline unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end)
{
struct stack_frame *frame = (struct stack_frame *)bp;
@@ -260,7 +216,7 @@ static inline unsigned long print_contex
return bp;
}
-void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
{
@@ -269,36 +225,34 @@ void dump_trace(struct task_struct *tsk,
unsigned used = 0;
struct thread_info *tinfo;
- if (!tsk)
- tsk = current;
- tinfo = task_thread_info(tsk);
+ if (!task)
+ task = current;
if (!stack) {
unsigned long dummy;
stack = &dummy;
- if (tsk && tsk != current)
- stack = (unsigned long *)tsk->thread.sp;
+ if (task && task != current)
+ stack = (unsigned long *)task->thread.sp;
}
#ifdef CONFIG_FRAME_POINTER
if (!bp) {
- if (tsk == current) {
+ if (task == current) {
/* Grab bp right from our regs */
- asm("movq %%rbp, %0" : "=r" (bp):);
+ asm("movq %%rbp, %0" : "=r" (bp) :);
} else {
/* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) tsk->thread.sp;
+ bp = *(unsigned long *) task->thread.sp;
}
}
#endif
-
-
/*
* Print function call entries in all stacks, starting at the
* current stack address. If the stacks consist of nested
* exceptions
*/
+ tinfo = task_thread_info(task);
for (;;) {
char *id;
unsigned long *estack_end;
@@ -383,18 +337,24 @@ static const struct stacktrace_ops print
.address = print_trace_address,
};
-void
-show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
- unsigned long bp)
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl)
{
printk("\nCall Trace:\n");
- dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
printk("\n");
}
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp)
+{
+ show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
static void
-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
- unsigned long bp)
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp, char *log_lvl)
{
unsigned long *stack;
int i;
@@ -406,14 +366,14 @@ _show_stack(struct task_struct *tsk, str
// back trace for this cpu.
if (sp == NULL) {
- if (tsk)
- sp = (unsigned long *)tsk->thread.sp;
+ if (task)
+ sp = (unsigned long *)task->thread.sp;
else
sp = (unsigned long *)&sp;
}
stack = sp;
- for(i=0; i < kstack_depth_to_print; i++) {
+ for (i = 0; i < kstack_depth_to_print; i++) {
if (stack >= irqstack && stack <= irqstack_end) {
if (stack == irqstack_end) {
stack = (unsigned long *) (irqstack_end[-1]);
@@ -428,12 +388,12 @@ _show_stack(struct task_struct *tsk, str
printk(" %016lx", *stack++);
touch_nmi_watchdog();
}
- show_trace(tsk, regs, sp, bp);
+ show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
-void show_stack(struct task_struct *tsk, unsigned long * sp)
+void show_stack(struct task_struct *task, unsigned long *sp)
{
- _show_stack(tsk, NULL, sp, 0);
+ show_stack_log_lvl(task, NULL, sp, 0, "");
}
/*
@@ -441,8 +401,8 @@ void show_stack(struct task_struct *tsk,
*/
void dump_stack(void)
{
- unsigned long dummy;
unsigned long bp = 0;
+ unsigned long stack;
#ifdef CONFIG_FRAME_POINTER
if (!bp)
@@ -454,7 +414,7 @@ void dump_stack(void)
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- show_trace(NULL, NULL, &dummy, bp);
+ show_trace(NULL, NULL, &stack, bp);
}
EXPORT_SYMBOL(dump_stack);
@@ -465,12 +425,8 @@ void show_registers(struct pt_regs *regs
unsigned long sp;
const int cpu = smp_processor_id();
struct task_struct *cur = cpu_pda(cpu)->pcurrent;
- u8 *ip;
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
sp = regs->sp;
- ip = (u8 *) regs->ip - code_prologue;
printk("CPU %d ", cpu);
__show_regs(regs);
printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
@@ -481,15 +437,22 @@ void show_registers(struct pt_regs *regs
* time of the fault..
*/
if (!user_mode(regs)) {
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
unsigned char c;
+ u8 *ip;
+
printk("Stack: ");
- _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
+ show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
+ regs->bp, "");
printk("\n");
printk(KERN_EMERG "Code: ");
+
+ ip = (u8 *)regs->ip - code_prologue;
if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
/* try starting at RIP */
- ip = (u8 *) regs->ip;
+ ip = (u8 *)regs->ip;
code_len = code_len - code_prologue + 1;
}
for (i = 0; i < code_len; i++, ip++) {
@@ -505,7 +468,7 @@ void show_registers(struct pt_regs *regs
}
}
printk("\n");
-}
+}
int is_valid_bugaddr(unsigned long ip)
{
@@ -545,7 +508,7 @@ unsigned __kprobes long oops_begin(void)
}
void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
+{
die_owner = -1;
bust_spinlocks(0);
die_nest_count--;
@@ -563,10 +526,9 @@ void __kprobes oops_end(unsigned long fl
do_exit(signr);
}
-int __kprobes __die(const char * str, struct pt_regs * regs, long err)
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
{
- static int die_counter;
- printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
+ printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
#ifdef CONFIG_PREEMPT
printk("PREEMPT ");
#endif
@@ -577,8 +539,10 @@ int __kprobes __die(const char * str, st
printk("DEBUG_PAGEALLOC");
#endif
printk("\n");
- if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ if (notify_die(DIE_OOPS, str, regs, err,
+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
return 1;
+
show_registers(regs);
add_taint(TAINT_DIE);
/* Executive summary in case the oops scrolled away */
@@ -590,7 +554,7 @@ int __kprobes __die(const char * str, st
return 0;
}
-void die(const char * str, struct pt_regs * regs, long err)
+void die(const char *str, struct pt_regs *regs, long err)
{
unsigned long flags = oops_begin();
@@ -608,8 +572,7 @@ die_nmi(char *str, struct pt_regs *regs,
{
unsigned long flags;
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
- NOTIFY_STOP)
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
return;
flags = oops_begin();
@@ -617,7 +580,9 @@ die_nmi(char *str, struct pt_regs *regs,
* We are in trouble anyway, lets at least try
* to get a message out.
*/
- printk(str, smp_processor_id());
+ printk(KERN_EMERG "%s", str);
+ printk(" on CPU%d, ip %08lx, registers:\n",
+ smp_processor_id(), regs->ip);
show_registers(regs);
if (kexec_should_crash(current))
crash_kexec(regs);
@@ -630,44 +595,44 @@ die_nmi(char *str, struct pt_regs *regs,
}
#endif
-static void __kprobes do_trap(int trapnr, int signr, char *str,
- struct pt_regs * regs, long error_code,
- siginfo_t *info)
+static void __kprobes
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+ long error_code, siginfo_t *info)
{
struct task_struct *tsk = current;
- if (user_mode(regs)) {
- /*
- * We want error_code and trap_no set for userspace
- * faults and kernelspace faults which result in
- * die(), but not kernelspace faults which are fixed
- * up. die() gives the process no chance to handle
- * the signal and notice the kernel fault information,
- * so that won't result in polluting the information
- * about previously queued, but not yet delivered,
- * faults. See also do_general_protection below.
- */
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
+ if (!user_mode(regs))
+ goto kernel_trap;
- if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
- printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid, str,
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
- }
+ /*
+ * We want error_code and trap_no set for userspace faults and
+ * kernelspace faults which result in die(), but not
+ * kernelspace faults which are fixed up. die() gives the
+ * process no chance to handle the signal and notice the
+ * kernel fault information, so that won't result in polluting
+ * the information about previously queued, but not yet
+ * delivered, faults. See also do_general_protection below.
+ */
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
- if (info)
- force_sig_info(signr, info, tsk);
- else
- force_sig(signr, tsk);
- return;
+ if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+ printk_ratelimit()) {
+ printk(KERN_INFO
+ "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid, str,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
}
+ if (info)
+ force_sig_info(signr, info, tsk);
+ else
+ force_sig(signr, tsk);
+ return;
+kernel_trap:
if (!fixup_exception(regs)) {
tsk->thread.error_code = error_code;
tsk->thread.trap_no = trapnr;
@@ -677,41 +642,39 @@ static void __kprobes do_trap(int trapnr
}
#define DO_ERROR(trapnr, signr, str, name) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, NULL); \
+ do_trap(trapnr, signr, str, regs, error_code, NULL); \
}
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- siginfo_t info; \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- trace_hardirqs_fixup(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+ siginfo_t info; \
+ info.si_signo = signr; \
+ info.si_errno = 0; \
+ info.si_code = sicode; \
+ info.si_addr = (void __user *)siaddr; \
+ trace_hardirqs_fixup(); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, &info); \
+ do_trap(trapnr, signr, str, regs, error_code, &info); \
}
-DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-DO_ERROR( 4, SIGSEGV, "overflow", overflow)
-DO_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
-DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_ERROR(4, SIGSEGV, "overflow", overflow)
+DO_ERROR(5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
-DO_ERROR(18, SIGSEGV, "reserved", reserved)
/* Runs on IST stack */
asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
@@ -741,31 +704,34 @@ asmlinkage void do_double_fault(struct p
die(str, regs, error_code);
}
-asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
- long error_code)
+asmlinkage void __kprobes
+do_general_protection(struct pt_regs *regs, long error_code)
{
- struct task_struct *tsk = current;
+ struct task_struct *tsk;
conditional_sti(regs);
- if (user_mode(regs)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
+ tsk = current;
+ if (!user_mode(regs))
+ goto gp_in_kernel;
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid,
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
- }
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 13;
- force_sig(SIGSEGV, tsk);
- return;
- }
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ printk(KERN_INFO
+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
+ }
+ force_sig(SIGSEGV, tsk);
+ return;
+
+gp_in_kernel:
if (fixup_exception(regs))
return;
@@ -778,14 +744,14 @@ asmlinkage void __kprobes do_general_pro
}
static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs * regs)
+mem_parity_error(unsigned char reason, struct pt_regs *regs)
{
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
reason);
printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
#if defined(CONFIG_EDAC)
- if(edac_handler_set()) {
+ if (edac_handler_set()) {
edac_atomic_assert_error();
return;
}
@@ -801,7 +767,7 @@ mem_parity_error(unsigned char reason, s
}
static notrace __kprobes void
-io_check_error(unsigned char reason, struct pt_regs * regs)
+io_check_error(unsigned char reason, struct pt_regs *regs)
{
printk("NMI: IOCK error (debug interrupt?)\n");
show_registers(regs);
@@ -827,14 +793,14 @@ unknown_nmi_error(unsigned char reason,
/* Runs on IST stack. This code must keep interrupts off all the time.
Nested NMIs are prevented by the CPU. */
-asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int cpu;
cpu = smp_processor_id();
- /* Only the BSP gets external NMIs from the system. */
+ /* Only the BSP gets external NMIs from the system. */
if (!cpu)
reason = get_nmi_reason();
@@ -847,33 +813,58 @@ asmlinkage notrace __kprobes void defau
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
*/
- if (nmi_watchdog_tick(regs,reason))
+ if (nmi_watchdog_tick(regs, reason))
return;
#endif
- if (!do_nmi_callback(regs,cpu))
+ if (!do_nmi_callback(regs, cpu))
unknown_nmi_error(reason, regs);
return;
}
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
- return;
+ return;
/* AK: following checks seem to be broken on modern chipsets. FIXME */
-
if (reason & 0x80)
mem_parity_error(reason, regs);
if (reason & 0x40)
io_check_error(reason, regs);
}
+asmlinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
+{
+ nmi_enter();
+
+ add_pda(__nmi_count, 1);
+
+ if (!ignore_nmis)
+ default_do_nmi(regs);
+
+ nmi_exit();
+}
+
+void stop_nmi(void)
+{
+ acpi_nmi_disable();
+ ignore_nmis++;
+}
+
+void restart_nmi(void)
+{
+ ignore_nmis--;
+ acpi_nmi_enable();
+}
+
/* runs on IST stack. */
asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
{
trace_hardirqs_fixup();
- if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
+ == NOTIFY_STOP)
return;
- }
+
preempt_conditional_sti(regs);
do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
preempt_conditional_cli(regs);
@@ -904,8 +895,8 @@ asmlinkage __kprobes struct pt_regs *syn
asmlinkage void __kprobes do_debug(struct pt_regs * regs,
unsigned long error_code)
{
- unsigned long condition;
struct task_struct *tsk = current;
+ unsigned long condition;
siginfo_t info;
trace_hardirqs_fixup();
@@ -926,21 +917,19 @@ asmlinkage void __kprobes do_debug(struc
/* Mask out spurious debug traps due to lazy DR7 setting */
if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
- if (!tsk->thread.debugreg7) {
+ if (!tsk->thread.debugreg7)
goto clear_dr7;
- }
}
tsk->thread.debugreg6 = condition;
-
/*
* Single-stepping through TF: make sure we ignore any events in
* kernel space (but re-enable TF when returning to user mode).
*/
if (condition & DR_STEP) {
- if (!user_mode(regs))
- goto clear_TF_reenable;
+ if (!user_mode(regs))
+ goto clear_TF_reenable;
}
/* Ok, finally something we can handle */
@@ -953,7 +942,7 @@ asmlinkage void __kprobes do_debug(struc
force_sig_info(SIGTRAP, &info, tsk);
clear_dr7:
- set_debugreg(0UL, 7);
+ set_debugreg(0, 7);
preempt_conditional_cli(regs);
return;
@@ -961,6 +950,7 @@ clear_TF_reenable:
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF;
preempt_conditional_cli(regs);
+ return;
}
static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
@@ -983,7 +973,7 @@ static int kernel_math_error(struct pt_r
asmlinkage void do_coprocessor_error(struct pt_regs *regs)
{
void __user *ip = (void __user *)(regs->ip);
- struct task_struct * task;
+ struct task_struct *task;
siginfo_t info;
unsigned short cwd, swd;
@@ -1016,30 +1006,30 @@ asmlinkage void do_coprocessor_error(str
cwd = get_fpu_cwd(task);
swd = get_fpu_swd(task);
switch (swd & ~cwd & 0x3f) {
- case 0x000:
- default:
- break;
- case 0x001: /* Invalid Op */
- /*
- * swd & 0x240 == 0x040: Stack Underflow
- * swd & 0x240 == 0x240: Stack Overflow
- * User must clear the SF bit (0x40) if set
- */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
+ case 0x000: /* No unmasked exception */
+ default: /* Multiple exceptions */
+ break;
+ case 0x001: /* Invalid Op */
+ /*
+ * swd & 0x240 == 0x040: Stack Underflow
+ * swd & 0x240 == 0x240: Stack Overflow
+ * User must clear the SF bit (0x40) if set
+ */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
}
force_sig_info(SIGFPE, &info, task);
}
@@ -1052,7 +1042,7 @@ asmlinkage void bad_intr(void)
asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
{
void __user *ip = (void __user *)(regs->ip);
- struct task_struct * task;
+ struct task_struct *task;
siginfo_t info;
unsigned short mxcsr;
@@ -1080,25 +1070,25 @@ asmlinkage void do_simd_coprocessor_erro
*/
mxcsr = get_fpu_mxcsr(task);
switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
- case 0x000:
- default:
- break;
- case 0x001: /* Invalid Op */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
+ case 0x000:
+ default:
+ break;
+ case 0x001: /* Invalid Op */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
}
force_sig_info(SIGFPE, &info, task);
}
@@ -1118,7 +1108,7 @@ asmlinkage void __attribute__((weak)) mc
}
/*
- * 'math_state_restore()' saves the current math information in the
+ * 'math_state_restore()' saves the current math information in the
* old math state array, and gets the new ones from the current task
*
* Careful.. There are problems with IBM-designed IRQ13 behaviour.
@@ -1145,7 +1135,14 @@ asmlinkage void math_state_restore(void)
/* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
- restore_fpu_checking(&me->thread.xstate->fxsave);
+ /*
+ * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+ */
+ if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
+ stts();
+ force_sig(SIGSEGV, me);
+ return;
+ }
task_thread_info(me)->status |= TS_USEDFPU;
me->fpu_counter++;
}
@@ -1190,13 +1187,12 @@ void __init trap_init(void)
ret = HYPERVISOR_set_trap_table(trap_table);
if (ret)
printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
-
/*
* initialize the per thread extended state:
*/
- init_thread_xstate();
+ init_thread_xstate();
/*
- * Should be a barrier for any external CPU state.
+ * Should be a barrier for any external CPU state:
*/
cpu_init();
}
@@ -1212,27 +1208,25 @@ void __cpuinit smp_trap_init(trap_info_t
}
}
-
static int __init oops_setup(char *s)
-{
+{
if (!s)
return -EINVAL;
if (!strcmp(s, "panic"))
panic_on_oops = 1;
return 0;
-}
+}
early_param("oops", oops_setup);
static int __init kstack_setup(char *s)
{
if (!s)
return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s,NULL,0);
+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
return 0;
}
early_param("kstack", kstack_setup);
-
static int __init code_bytes_setup(char *s)
{
code_bytes = simple_strtoul(s, NULL, 0);
--- head-2011-03-11.orig/arch/x86/kernel/vsyscall_64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/vsyscall_64-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -42,7 +42,8 @@
#include <asm/topology.h>
#include <asm/vgtod.h>
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __vsyscall(nr) \
+ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
#define __syscall_clobber "r11","cx","memory"
/*
@@ -264,10 +265,7 @@ static void __cpuinit vsyscall_set_cpu(i
d |= cpu;
d |= (node & 0xf) << 12;
d |= (node >> 4) << 48;
- if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
- + GDT_ENTRY_PER_CPU),
- d))
- BUG();
+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
static void __cpuinit cpu_vsyscall_init(void *arg)
@@ -281,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_bl
{
long cpu = (long)arg;
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
- smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
+ smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
return NOTIFY_DONE;
}
@@ -311,7 +309,7 @@ static int __init vsyscall_init(void)
#ifdef CONFIG_SYSCTL
register_sysctl_table(kernel_root_table2);
#endif
- on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
+ on_each_cpu(cpu_vsyscall_init, NULL, 1);
hotcpu_notifier(cpu_vsyscall_notifier, 0);
return 0;
}
--- head-2011-03-11.orig/arch/x86/mach-xen/setup.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mach-xen/setup.c 2011-02-03 14:23:14.000000000 +0100
@@ -17,6 +17,8 @@
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
+#ifdef CONFIG_X86_32
+
#ifdef CONFIG_HOTPLUG_CPU
#define DEFAULT_SEND_IPI (1)
#else
@@ -44,47 +46,6 @@ static int __init print_ipi_mode(void)
late_initcall(print_ipi_mode);
-/**
- * machine_specific_memory_setup - Hook for machine specific memory setup.
- *
- * Description:
- * This is included late in kernel/setup.c so that it can make
- * use of all of the static functions.
- **/
-
-char * __init machine_specific_memory_setup(void)
-{
- int rc;
- struct xen_memory_map memmap;
- static struct e820entry __initdata map[E820MAX];
-
- memmap.nr_entries = E820MAX;
- set_xen_guest_handle(memmap.buffer, map);
-
- rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
- if ( rc == -ENOSYS ) {
- memmap.nr_entries = 1;
- map[0].addr = 0ULL;
- map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
- /* 8MB slack (to balance backend allocations). */
- map[0].size += 8ULL << 20;
- map[0].type = E820_RAM;
- rc = 0;
- }
- BUG_ON(rc);
-
- sanitize_e820_map(map, (char *)&memmap.nr_entries);
-
- BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
-
- return "Xen";
-}
-
-
-extern void hypervisor_callback(void);
-extern void failsafe_callback(void);
-extern void nmi(void);
-
unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
EXPORT_SYMBOL(machine_to_phys_mapping);
unsigned int machine_to_phys_order;
@@ -117,30 +78,60 @@ void __init pre_setup_arch_hook(void)
(unsigned long *)xen_start_info->mfn_list;
}
+#endif /* CONFIG_X86_32 */
+
+extern void hypervisor_callback(void);
+extern void failsafe_callback(void);
+extern void nmi(void);
+
+#ifdef CONFIG_X86_64
+#include <asm/proto.h>
+#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
+#else
+#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
+#endif
+
void __init machine_specific_arch_setup(void)
{
int ret;
static struct callback_register __initdata event = {
.type = CALLBACKTYPE_event,
- .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
+ .address = CALLBACK_ADDR(hypervisor_callback)
};
static struct callback_register __initdata failsafe = {
.type = CALLBACKTYPE_failsafe,
- .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
+ .address = CALLBACK_ADDR(failsafe_callback)
};
+#ifdef CONFIG_X86_64
+ static struct callback_register __initdata syscall = {
+ .type = CALLBACKTYPE_syscall,
+ .address = CALLBACK_ADDR(system_call)
+ };
+#endif
static struct callback_register __initdata nmi_cb = {
.type = CALLBACKTYPE_nmi,
- .address = { __KERNEL_CS, (unsigned long)nmi },
+ .address = CALLBACK_ADDR(nmi)
};
ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
if (ret == 0)
ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
+#ifdef CONFIG_X86_64
+ if (ret == 0)
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
+#endif
#if CONFIG_XEN_COMPAT <= 0x030002
+#ifdef CONFIG_X86_32
if (ret == -ENOSYS)
ret = HYPERVISOR_set_callbacks(
event.address.cs, event.address.eip,
failsafe.address.cs, failsafe.address.eip);
+#else
+ ret = HYPERVISOR_set_callbacks(
+ event.address,
+ failsafe.address,
+ syscall.address);
+#endif
#endif
BUG_ON(ret);
@@ -155,14 +146,41 @@ void __init machine_specific_arch_setup(
}
#endif
+#ifdef CONFIG_X86_32
/* Do an early initialization of the fixmap area */
{
extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
pmd_t *pmd = pmd_offset(pud, addr);
+ unsigned int i;
make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
+
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
+ != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
+ FIX_BUG_ON(SHARED_INFO);
+ FIX_BUG_ON(ISAMAP_BEGIN);
+ FIX_BUG_ON(ISAMAP_END);
+#undef __FIXADDR_TOP
+ BUG_ON(pte_index(hypervisor_virt_start));
+
+ /* Switch to the real shared_info page, and clear the
+ * dummy page. */
+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+ memset(empty_zero_page, 0, sizeof(empty_zero_page));
+
+ /* Setup mapping of lower 1st MB */
+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
+ if (is_initial_xendomain())
+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
+ else
+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
+ virt_to_machine(empty_zero_page),
+ PAGE_KERNEL_RO);
}
+#endif
}
--- head-2011-03-11.orig/arch/x86/mm/Makefile 2011-01-31 14:53:50.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/Makefile 2011-02-01 14:38:38.000000000 +0100
@@ -27,6 +27,7 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology_6
obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
obj-$(CONFIG_XEN) += hypervisor.o
+disabled-obj-$(CONFIG_XEN) := gup.o
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
--- head-2011-03-11.orig/arch/x86/mm/dump_pagetables-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/dump_pagetables-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -45,7 +45,7 @@ static struct addr_marker address_marker
{ 0, "User Space" },
#ifdef CONFIG_X86_64
{ HYPERVISOR_VIRT_START, "Hypervisor Space" },
- { HYPERVISOR_VIRT_END, "Low Kernel Mapping" },
+ { PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
{ __START_KERNEL_map, "High Kernel Mapping" },
@@ -160,8 +160,8 @@ static void note_page(struct seq_file *m
* we have now. "break" is either changing perms, levels or
* address space marker.
*/
- prot = pgprot_val(new_prot) & ~(PTE_MASK);
- cur = pgprot_val(st->current_prot) & ~(PTE_MASK);
+ prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
+ cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
if (!st->level) {
/* First entry */
@@ -234,7 +234,7 @@ static void walk_pmd_level(struct seq_fi
st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!hypervisor_space(st->current_address)
&& !pmd_none(*start)) {
- pgprotval_t prot = __pmd_val(*start) & ~PTE_MASK;
+ pgprotval_t prot = __pmd_val(*start) & PTE_FLAGS_MASK;
if (pmd_large(*start) || !pmd_present(*start))
note_page(m, st, __pgprot(prot), 3);
@@ -267,7 +267,7 @@ static void walk_pud_level(struct seq_fi
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
if (!hypervisor_space(st->current_address)
&& !pud_none(*start)) {
- pgprotval_t prot = __pud_val(*start) & ~PTE_MASK;
+ pgprotval_t prot = __pud_val(*start) & PTE_FLAGS_MASK;
if (pud_large(*start) || !pud_present(*start))
note_page(m, st, __pgprot(prot), 2);
@@ -303,7 +303,7 @@ static void walk_pgd_level(struct seq_fi
for (i = 0; i < PTRS_PER_PGD; i++) {
st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
if (!pgd_none(*start)) {
- pgprotval_t prot = __pgd_val(*start) & ~PTE_MASK;
+ pgprotval_t prot = __pgd_val(*start) & PTE_FLAGS_MASK;
if (pgd_large(*start) || !pgd_present(*start))
note_page(m, &st, __pgprot(prot), 1);
--- head-2011-03-11.orig/arch/x86/mm/fault-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/fault-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -10,6 +10,7 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
+#include <linux/mmiotrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
@@ -49,17 +50,23 @@
#define PF_RSVD (1<<3)
#define PF_INSTR (1<<4)
+static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+{
+#ifdef CONFIG_MMIOTRACE_HOOKS
+ if (unlikely(is_kmmio_active()))
+ if (kmmio_handler(regs, addr) == 1)
+ return -1;
+#endif
+ return 0;
+}
+
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
int ret = 0;
/* kprobe_running() needs smp_processor_id() */
-#ifdef CONFIG_X86_32
if (!user_mode_vm(regs)) {
-#else
- if (!user_mode(regs)) {
-#endif
preempt_disable();
if (kprobe_running() && kprobe_fault_handler(regs, 14))
ret = 1;
@@ -409,11 +416,7 @@ static void show_fault_oops(struct pt_re
printk(KERN_CONT "NULL pointer dereference");
else
printk(KERN_CONT "paging request");
-#ifdef CONFIG_X86_32
- printk(KERN_CONT " at %08lx\n", address);
-#else
- printk(KERN_CONT " at %016lx\n", address);
-#endif
+ printk(KERN_CONT " at %p\n", (void *) address);
printk(KERN_ALERT "IP:");
printk_address(regs->ip, 1);
dump_pagetable(address);
@@ -628,6 +631,8 @@ void __kprobes do_page_fault(struct pt_r
if (notify_page_fault(regs))
return;
+ if (unlikely(kmmio_fault(regs, address)))
+ return;
/*
* We fault-in kernel-space virtual memory on-demand. The
@@ -831,14 +836,10 @@ bad_area_nosemaphore:
if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
printk_ratelimit()) {
printk(
-#ifdef CONFIG_X86_32
- "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
-#else
- "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
-#endif
+ "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address, regs->ip,
- regs->sp, error_code);
+ tsk->comm, task_pid_nr(tsk), address,
+ (void *) regs->ip, (void *) regs->sp, error_code);
print_vma_addr(" in ", regs->ip);
printk("\n");
}
@@ -949,89 +950,52 @@ LIST_HEAD(pgd_list);
void vmalloc_sync_all(void)
{
#ifdef CONFIG_X86_32
- /*
- * Note that races in the updates of insync and start aren't
- * problematic: insync can only get set bits added, and updates to
- * start are only improving performance (without affecting correctness
- * if undone).
- * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
- * This change works just fine with 2-level paging too.
- */
-#define sync_index(a) ((a) >> PMD_SHIFT)
- static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
- static unsigned long start = TASK_SIZE;
- unsigned long address;
+ unsigned long address = VMALLOC_START & PGDIR_MASK;
if (SHARED_KERNEL_PMD)
return;
BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
- for (address = start;
- address < hypervisor_virt_start;
- address += PMD_SIZE) {
- if (!test_bit(sync_index(address), insync)) {
- unsigned long flags;
- struct page *page;
-
- spin_lock_irqsave(&pgd_lock, flags);
- /* XEN: failure path assumes non-empty pgd_list. */
- if (unlikely(list_empty(&pgd_list))) {
- spin_unlock_irqrestore(&pgd_lock, flags);
- return;
- }
- list_for_each_entry(page, &pgd_list, lru) {
- pmd_t *pmd;
-
- pgd_page_table(lock, page);
- pmd = vmalloc_sync_one(page_address(page),
- address);
- pgd_page_table(unlock, page);
-
- if (!pmd)
- break;
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
- if (!page)
- set_bit(sync_index(address), insync);
+ for (; address < hypervisor_virt_start; address += PMD_SIZE) {
+ unsigned long flags;
+ struct page *page;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pmd_t *pmd;
+
+ pgd_page_table(lock, page);
+ pmd = vmalloc_sync_one(page_address(page), address);
+ pgd_page_table(unlock, page);
+
+ if (!pmd)
+ break;
}
- if (address == start && test_bit(sync_index(address), insync))
- start = address + PMD_SIZE;
+ spin_unlock_irqrestore(&pgd_lock, flags);
}
#else /* CONFIG_X86_64 */
- /*
- * Note that races in the updates of insync and start aren't
- * problematic: insync can only get set bits added, and updates to
- * start are only improving performance (without affecting correctness
- * if undone).
- */
- static DECLARE_BITMAP(insync, PTRS_PER_PGD);
- static unsigned long start = VMALLOC_START & PGDIR_MASK;
+ unsigned long start = VMALLOC_START & PGDIR_MASK;
unsigned long address;
for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
- if (!test_bit(pgd_index(address), insync)) {
- const pgd_t *pgd_ref = pgd_offset_k(address);
- unsigned long flags;
- struct page *page;
-
- if (pgd_none(*pgd_ref))
- continue;
- spin_lock_irqsave(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
- pgd_t *pgd;
- pgd = (pgd_t *)page_address(page) + pgd_index(address);
- pgd_page_table(lock, page);
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- else
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
- pgd_page_table(unlock, page);
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
- set_bit(pgd_index(address), insync);
+ const pgd_t *pgd_ref = pgd_offset_k(address);
+ unsigned long flags;
+ struct page *page;
+
+ if (pgd_none(*pgd_ref))
+ continue;
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ pgd_page_table(lock, page);
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+ pgd_page_table(unlock, page);
}
- if (address == start)
- start = address + PGDIR_SIZE;
+ spin_unlock_irqrestore(&pgd_lock, flags);
}
#endif
}
--- head-2011-03-11.orig/arch/x86/mm/hypervisor.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/hypervisor.c 2011-02-01 14:38:38.000000000 +0100
@@ -711,6 +711,72 @@ void xen_destroy_contiguous_region(unsig
}
EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+int __init early_create_contiguous_region(unsigned long pfn,
+ unsigned int order,
+ unsigned int address_bits)
+{
+ unsigned long *in_frames = discontig_frames, out_frame = pfn;
+ unsigned int i;
+ int rc, success;
+ struct xen_memory_exchange exchange = {
+ .in = {
+ .nr_extents = 1UL << order,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ },
+ .out = {
+ .nr_extents = 1,
+ .extent_order = order,
+ .address_bits = address_bits,
+ .domid = DOMID_SELF
+ }
+ };
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return -ENOMEM;
+
+ for (i = 0; i < (1U << order); ++i) {
+ in_frames[i] = pfn_to_mfn(pfn + i);
+ set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY);
+ }
+
+ set_xen_guest_handle(exchange.in.extent_start, in_frames);
+ set_xen_guest_handle(exchange.out.extent_start, &out_frame);
+
+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+ success = (exchange.nr_exchanged == (1UL << order));
+ BUG_ON(!success && (exchange.nr_exchanged || !rc));
+ BUG_ON(success && rc);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOSYS)) {
+ /* Compatibility when XENMEM_exchange is unavailable. */
+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &exchange.in) != (1UL << order))
+ BUG();
+ success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+ &exchange.out) == 1);
+ if (!success) {
+ for (i = 0; i < (1U << order); ++i)
+ in_frames[i] = pfn + i;
+ if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+ &exchange.in) != (1UL << order))
+ BUG();
+ }
+ }
+#endif
+
+ for (i = 0; i < (1U << order); ++i, ++out_frame) {
+ if (!success)
+ out_frame = in_frames[i];
+ set_phys_to_machine(pfn + i, out_frame);
+ }
+
+ return success ? 0 : -ENOMEM;
+}
+
static void undo_limit_pages(struct page *pages, unsigned int order)
{
BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
@@ -877,42 +943,9 @@ int write_ldt_entry(struct desc_struct *
return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
}
-#define MAX_BATCHED_FULL_PTES 32
-
-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
+ int type)
{
- int rc = 0, i = 0;
- mmu_update_t u[MAX_BATCHED_FULL_PTES];
- pte_t *pte;
- spinlock_t *ptl;
-
- if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
- return 0;
-
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
- do {
- if (pte_present(*pte)) {
- pte_t ptent = pte_modify(*pte, newprot);
-
- if (dirty_accountable && pte_dirty(ptent))
- ptent = pte_mkwrite(ptent);
- u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
- | ((unsigned long)pte & ~PAGE_MASK)
- | MMU_PT_UPDATE_PRESERVE_AD;
- u[i].val = __pte_val(ptent);
- if (++i == MAX_BATCHED_FULL_PTES) {
- if ((rc = HYPERVISOR_mmu_update(
- &u[0], i, NULL, DOMID_SELF)) != 0)
- break;
- i = 0;
- }
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
- if (i)
- rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
- pte_unmap_unlock(pte - 1, ptl);
- BUG_ON(rc && rc != -ENOSYS);
- return !rc;
+ maddr_t mach_gp = virt_to_machine(gdt + entry);
+ return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc);
}
--- head-2011-03-11.orig/arch/x86/mm/init_32-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/init_32-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -54,6 +54,7 @@
unsigned int __VMALLOC_RESERVE = 128 << 20;
+unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -61,6 +62,27 @@ unsigned long highstart_pfn, highend_pfn
static noinline int do_test_wp_bit(void);
+
+static unsigned long __initdata table_start;
+static unsigned long __initdata table_end;
+static unsigned long __initdata table_top;
+
+static int __initdata after_init_bootmem;
+
+static __init void *alloc_low_page(unsigned long *phys)
+{
+ unsigned long pfn = table_end++;
+ void *adr;
+
+ if (pfn >= table_top)
+ panic("alloc_low_page: ran out of memory");
+
+ adr = __va(pfn * PAGE_SIZE);
+ memset(adr, 0, PAGE_SIZE);
+ *phys = pfn * PAGE_SIZE;
+ return adr;
+}
+
/*
* Creates a middle page table and puts a pointer to it in the
* given global directory entry. This only returns the gd entry
@@ -72,9 +94,12 @@ static pmd_t * __init one_md_table_init(
pmd_t *pmd_table;
#ifdef CONFIG_X86_PAE
+ unsigned long phys;
if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
- pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-
+ if (after_init_bootmem)
+ pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ else
+ pmd_table = (pmd_t *)alloc_low_page(&phys);
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
@@ -101,12 +126,16 @@ static pte_t * __init one_page_table_ini
#endif
pte_t *page_table = NULL;
+ if (after_init_bootmem) {
#ifdef CONFIG_DEBUG_PAGEALLOC
- page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
+ page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
- if (!page_table) {
- page_table =
+ if (!page_table)
+ page_table =
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ } else {
+ unsigned long phys;
+ page_table = (pte_t *)alloc_low_page(&phys);
}
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
@@ -167,24 +196,24 @@ static inline int is_kernel_text(unsigne
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET:
*/
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
+static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ int use_pse)
{
int pgd_idx, pmd_idx, pte_ofs;
unsigned long pfn;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
+ unsigned pages_2m = 0, pages_4k = 0;
- unsigned long max_ram_pfn = xen_start_info->nr_pages;
- if (max_ram_pfn > max_low_pfn)
- max_ram_pfn = max_low_pfn;
+ if (!cpu_has_pse)
+ use_pse = 0;
- pgd_idx = pgd_index(PAGE_OFFSET);
+ pfn = start_pfn;
+ pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
pgd = pgd_base + pgd_idx;
- pfn = 0;
- pmd_idx = pmd_index(PAGE_OFFSET);
- pte_ofs = pte_index(PAGE_OFFSET);
-
for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
#ifdef CONFIG_XEN
/*
@@ -198,10 +227,16 @@ static void __init kernel_physical_mappi
#else
pmd = one_md_table_init(pgd);
#endif
- if (pfn >= max_low_pfn)
+
+ if (pfn >= end_pfn)
continue;
+#ifdef CONFIG_X86_PAE
+ pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
pmd += pmd_idx;
- for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+#else
+ pmd_idx = 0;
+#endif
+ for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
pmd++, pmd_idx++) {
unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
@@ -211,13 +246,8 @@ static void __init kernel_physical_mappi
/*
* Map with big pages if possible, otherwise
* create normal page tables:
- *
- * Don't use a large page for the first 2/4MB of memory
- * because there are often fixed size MTRRs in there
- * and overlapping MTRRs into large pages can cause
- * slowdowns.
*/
- if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
+ if (use_pse) {
unsigned int addr2;
pgprot_t prot = PAGE_KERNEL_LARGE;
@@ -228,49 +258,35 @@ static void __init kernel_physical_mappi
is_kernel_text(addr2))
prot = PAGE_KERNEL_LARGE_EXEC;
+ pages_2m++;
set_pmd(pmd, pfn_pmd(pfn, prot));
pfn += PTRS_PER_PTE;
- max_pfn_mapped = pfn;
continue;
}
pte = one_page_table_init(pmd);
- for (pte += pte_ofs;
- pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+ pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pte += pte_ofs;
+ for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
pgprot_t prot = PAGE_KERNEL;
/* XEN: Only map initial RAM allocation. */
- if ((pfn >= max_ram_pfn) || pte_present(*pte))
+ if (pfn >= xen_start_info->nr_pages || pte_present(*pte))
continue;
if (is_kernel_text(addr))
prot = PAGE_KERNEL_EXEC;
+ pages_4k++;
set_pte(pte, pfn_pte(pfn, prot));
}
- max_pfn_mapped = pfn;
- pte_ofs = 0;
}
- pmd_idx = 0;
}
+ update_page_count(PG_LEVEL_2M, pages_2m);
+ update_page_count(PG_LEVEL_4K, pages_4k);
}
-#ifndef CONFIG_XEN
-
-static inline int page_kills_ppro(unsigned long pagenr)
-{
- if (pagenr >= 0x70000 && pagenr <= 0x7003F)
- return 1;
- return 0;
-}
-
-#else
-
-#define page_kills_ppro(p) 0
-
-#endif
-
/*
* devmem_is_allowed() checks to see if /dev/mem access to a certain address
* is valid. The argument is a physical page number.
@@ -331,33 +347,66 @@ static void __init permanent_kmaps_init(
pkmap_page_table = pte;
}
-void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
+static void __init add_one_highpage_init(struct page *page, int pfn)
{
- if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
- } else
- SetPageReserved(page);
+ ClearPageReserved(page);
+ init_page_count(page);
+ __free_page(page);
+ totalhigh_pages++;
+}
+
+struct add_highpages_data {
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+};
+
+static int __init add_highpages_work_fn(unsigned long start_pfn,
+ unsigned long end_pfn, void *datax)
+{
+ int node_pfn;
+ struct page *page;
+ unsigned long final_start_pfn, final_end_pfn;
+ struct add_highpages_data *data;
+
+ data = (struct add_highpages_data *)datax;
+
+ final_start_pfn = max(start_pfn, data->start_pfn);
+ final_end_pfn = min(end_pfn, data->end_pfn);
+ if (final_start_pfn >= final_end_pfn)
+ return 0;
+
+ for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
+ node_pfn++) {
+ if (!pfn_valid(node_pfn))
+ continue;
+ page = pfn_to_page(node_pfn);
+ add_one_highpage_init(page, node_pfn);
+ }
+
+ return 0;
+
+}
+
+void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ struct add_highpages_data data;
+
+ data.start_pfn = start_pfn;
+ data.end_pfn = end_pfn;
+
+ work_with_active_regions(nid, add_highpages_work_fn, &data);
}
#ifndef CONFIG_NUMA
-static void __init set_highmem_pages_init(int bad_ppro)
+static void __init set_highmem_pages_init(void)
{
int pfn;
- for (pfn = highstart_pfn; pfn < highend_pfn
- && pfn < xen_start_info->nr_pages; pfn++) {
- /*
- * Holes under sparsemem might not have no mem_map[]:
- */
- if (pfn_valid(pfn))
- add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
- }
+ add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
/* XEN: init high-mem pages outside initial allocation. */
- for (; pfn < highend_pfn; pfn++) {
+ for (pfn = xen_start_info->nr_pages; pfn < highend_pfn; pfn++) {
ClearPageReserved(pfn_to_page(pfn));
init_page_count(pfn_to_page(pfn));
}
@@ -369,24 +418,11 @@ static void __init set_highmem_pages_ini
#else
# define kmap_init() do { } while (0)
# define permanent_kmaps_init(pgd_base) do { } while (0)
-# define set_highmem_pages_init(bad_ppro) do { } while (0)
+# define set_highmem_pages_init() do { } while (0)
#endif /* CONFIG_HIGHMEM */
-pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
-EXPORT_SYMBOL(__PAGE_KERNEL);
-
-pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
-
pgd_t *swapper_pg_dir;
-static void __init xen_pagetable_setup_start(pgd_t *base)
-{
-}
-
-static void __init xen_pagetable_setup_done(pgd_t *base)
-{
-}
-
/*
* Build a proper pagetable for the kernel mappings. Up until this
* point, we've been running on some set of pagetables constructed by
@@ -406,27 +442,10 @@ static void __init xen_pagetable_setup_d
* be partially populated, and so it avoids stomping on any existing
* mappings.
*/
-static void __init pagetable_init(void)
+static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
{
- pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
unsigned long vaddr, end;
- xen_pagetable_setup_start(pgd_base);
-
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
-
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __PAGE_KERNEL |= _PAGE_GLOBAL;
- __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
- }
-
- kernel_physical_mapping_init(pgd_base);
- remap_numa_kva();
-
/*
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
@@ -436,10 +455,13 @@ static void __init pagetable_init(void)
end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
page_table_range_init(vaddr, end, pgd_base);
early_ioremap_reset();
+}
- permanent_kmaps_init(pgd_base);
+static void __init pagetable_init(void)
+{
+ pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
- xen_pagetable_setup_done(pgd_base);
+ permanent_kmaps_init(pgd_base);
}
#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
@@ -482,7 +504,7 @@ void zap_low_mappings(void)
int nx_enabled;
-pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
EXPORT_SYMBOL_GPL(__supported_pte_mask);
#ifdef CONFIG_X86_PAE
@@ -535,42 +557,364 @@ static void __init set_nx(void)
}
#endif
+/* user-defined highmem size */
+static unsigned int highmem_pages = -1;
+
/*
- * paging_init() sets up the page tables - note that the first 8MB are
- * already mapped by head.S.
- *
- * This routines also unmaps the page at virtual kernel address 0, so
- * that we can trap those pesky NULL-reference errors in the kernel.
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
*/
-void __init paging_init(void)
+static int __init parse_highmem(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+ return 0;
+}
+early_param("highmem", parse_highmem);
+
+/*
+ * Determine low and high memory ranges:
+ */
+void __init find_low_pfn_range(void)
+{
+ /* it could update max_pfn */
+
+ /* max_low_pfn is 0, we already have early_res support */
+
+ max_low_pfn = max_pfn;
+ if (max_low_pfn > MAXMEM_PFN) {
+ if (highmem_pages == -1)
+ highmem_pages = max_pfn - MAXMEM_PFN;
+ if (highmem_pages + MAXMEM_PFN < max_pfn)
+ max_pfn = MAXMEM_PFN + highmem_pages;
+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
+ printk(KERN_WARNING "only %luMB highmem pages "
+ "available, ignoring highmem size of %uMB.\n",
+ pages_to_mb(max_pfn - MAXMEM_PFN),
+ pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+ max_low_pfn = MAXMEM_PFN;
+#ifndef CONFIG_HIGHMEM
+ /* Maximum memory usable is what is directly addressable */
+ printk(KERN_WARNING "Warning only %ldMB will be used.\n",
+ MAXMEM>>20);
+ if (max_pfn > MAX_NONPAE_PFN)
+ printk(KERN_WARNING
+ "Use a HIGHMEM64G enabled kernel.\n");
+ else
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ max_pfn = MAXMEM_PFN;
+#else /* !CONFIG_HIGHMEM */
+#ifndef CONFIG_HIGHMEM64G
+ if (max_pfn > MAX_NONPAE_PFN) {
+ max_pfn = MAX_NONPAE_PFN;
+ printk(KERN_WARNING "Warning only 4GB will be used."
+ "Use a HIGHMEM64G enabled kernel.\n");
+ }
+#endif /* !CONFIG_HIGHMEM64G */
+#endif /* !CONFIG_HIGHMEM */
+ } else {
+ if (highmem_pages == -1)
+ highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+ if (highmem_pages >= max_pfn) {
+ printk(KERN_ERR "highmem size specified (%uMB) is "
+ "bigger than pages available (%luMB)!.\n",
+ pages_to_mb(highmem_pages),
+ pages_to_mb(max_pfn));
+ highmem_pages = 0;
+ }
+ if (highmem_pages) {
+ if (max_low_pfn - highmem_pages <
+ 64*1024*1024/PAGE_SIZE){
+ printk(KERN_ERR "highmem size %uMB results in "
+ "smaller than 64MB lowmem, ignoring it.\n"
+ , pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+ max_low_pfn -= highmem_pages;
+ }
+#else
+ if (highmem_pages)
+ printk(KERN_ERR "ignoring highmem size on non-highmem"
+ " kernel!\n");
+#endif
+ }
+}
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+void __init initmem_init(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+#ifdef CONFIG_HIGHMEM
+ highstart_pfn = highend_pfn = max_pfn;
+ if (max_pfn > max_low_pfn)
+ highstart_pfn = max_low_pfn;
+ memory_present(0, 0, highend_pfn);
+ e820_register_active_regions(0, 0, highend_pfn);
+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+ pages_to_mb(highend_pfn - highstart_pfn));
+ num_physpages = highend_pfn;
+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+ memory_present(0, 0, max_low_pfn);
+ e820_register_active_regions(0, 0, max_low_pfn);
+ num_physpages = max_low_pfn;
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+#ifdef CONFIG_FLATMEM
+ max_mapnr = num_physpages;
+#endif
+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+ pages_to_mb(max_low_pfn));
+
+ setup_bootmem_allocator();
+}
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+static void __init zone_sizes_init(void)
+{
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+ max_zone_pfns[ZONE_DMA] =
+ virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+#endif
+
+ free_area_init_nodes(max_zone_pfns);
+}
+
+void __init setup_bootmem_allocator(void)
{
int i;
+ unsigned long bootmap_size, bootmap;
+ unsigned long end_pfn = min(max_low_pfn, xen_start_info->nr_pages);
+
+ /*
+ * Initialize the boot-time allocator (with low memory only):
+ */
+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
+ bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+ min(max_pfn_mapped, xen_start_info->nr_pages)<<PAGE_SHIFT,
+ bootmap_size, PAGE_SIZE);
+ if (bootmap == -1L)
+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
+
+ /* don't touch min_low_pfn */
+ bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
+ min_low_pfn, end_pfn);
+ printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
+ max_pfn_mapped<<PAGE_SHIFT);
+ printk(KERN_INFO " low ram: %08lx - %08lx\n",
+ min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
+ printk(KERN_INFO " bootmap %08lx - %08lx\n",
+ bootmap, bootmap + bootmap_size);
+ for_each_online_node(i)
+ free_bootmem_with_active_regions(i, end_pfn);
+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
+
+ after_init_bootmem = 1;
+}
+
+static unsigned long __init extend_init_mapping(unsigned long tables_space)
+{
+ unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
+ + xen_start_info->nr_pt_frames;
+ unsigned long start = start_pfn, va = (unsigned long)&_text;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ /* Ensure init mappings cover kernel text/data and initial tables. */
+ while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) {
+ pgd = pgd_offset_k(va);
+ pud = pud_offset(pgd, va);
+ pmd = pmd_offset(pud, va);
+ if (pmd_none(*pmd)) {
+ unsigned long pa = start_pfn++ << PAGE_SHIFT;
+
+ memset(__va(pa), 0, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pa),
+ XENFEAT_writable_page_tables);
+ xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE));
+ }
+ pte = pte_offset_kernel(pmd, va);
+ if (pte_none(*pte)) {
+ pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE);
+
+ if (HYPERVISOR_update_va_mapping(va, new_pte, 0))
+ BUG();
+ }
+ va += PAGE_SIZE;
+ }
+
+ /* Finally, blow away any spurious initial mappings. */
+ while (1) {
+ pgd = pgd_offset_k(va);
+ pud = pud_offset(pgd, va);
+ pmd = pmd_offset(pud, va);
+ if (pmd_none(*pmd))
+ break;
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
+ }
+
+ if (start_pfn > start)
+ reserve_early(start << PAGE_SHIFT,
+ start_pfn << PAGE_SHIFT, "INITMAP");
+
+ return start_pfn;
+}
+
+static void __init find_early_table_space(unsigned long end)
+{
+ unsigned long puds, pmds, ptes, tables;
+
+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+ tables = PAGE_ALIGN(puds * sizeof(pud_t));
+
+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+ tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
+
+ if (cpu_has_pse) {
+ unsigned long extra;
+
+ extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+ extra += PMD_SIZE;
+ ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else
+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ tables += PAGE_ALIGN(ptes * sizeof(pte_t));
+
+ /* for fixmap */
+ tables += PAGE_SIZE
+ * ((((FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK)
+ - (__fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK))
+ >> PMD_SHIFT);
+
+ table_start = extend_init_mapping(tables);
+
+ table_end = table_start;
+ table_top = table_start + (tables>>PAGE_SHIFT);
+
+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, table_start << PAGE_SHIFT,
+ (table_start << PAGE_SHIFT) + tables);
+}
+
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ pgd_t *pgd_base = swapper_pg_dir;
+ unsigned long start_pfn, end_pfn;
+ unsigned long big_page_start;
+
+ /*
+ * Find space for the kernel direct mapping tables.
+ */
+ if (!after_init_bootmem)
+ find_early_table_space(end);
#ifdef CONFIG_X86_PAE
set_nx();
if (nx_enabled)
printk(KERN_INFO "NX (Execute Disable) protection: active\n");
#endif
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+
+ /*
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
+ */
+ big_page_start = PMD_SIZE;
+
+ if (start < big_page_start) {
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
+ } else {
+ /* head is not big page alignment ? */
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ }
+ if (start_pfn < end_pfn)
+ kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
+
+ /* big page range */
+ start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < (big_page_start >> PAGE_SHIFT))
+ start_pfn = big_page_start >> PAGE_SHIFT;
+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn)
+ kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
+ cpu_has_pse);
+
+ /* tail is not big page alignment ? */
+ start_pfn = end_pfn;
+ if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
+ end_pfn = end >> PAGE_SHIFT;
+ if (start_pfn < end_pfn)
+ kernel_physical_mapping_init(pgd_base, start_pfn,
+ end_pfn, 0);
+ }
+
+ early_ioremap_page_table_range_init(pgd_base);
+
+ __flush_tlb_all();
+
+ if (!after_init_bootmem)
+ reserve_early(table_start << PAGE_SHIFT,
+ table_end << PAGE_SHIFT, "PGTABLE");
+
+ if (!after_init_bootmem)
+ early_memtest(start, end);
+
+ return end >> PAGE_SHIFT;
+}
+
+
+/*
+ * paging_init() sets up the page tables - note that the first 8MB are
+ * already mapped by head.S.
+ *
+ * This routines also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ */
+void __init paging_init(void)
+{
pagetable_init();
__flush_tlb_all();
kmap_init();
- /* Switch to the real shared_info page, and clear the
- * dummy page. */
- set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
- HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
- memset(empty_zero_page, 0, sizeof(empty_zero_page));
-
- /* Setup mapping of lower 1st MB */
- for (i = 0; i < NR_FIX_ISAMAPS; i++)
- if (is_initial_xendomain())
- set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
- else
- __set_fixmap(FIX_ISAMAP_BEGIN - i,
- virt_to_machine(empty_zero_page),
- PAGE_KERNEL_RO);
+ /*
+ * NOTE: at this point the bootmem allocator is fully available.
+ */
+ sparse_init();
+ zone_sizes_init();
}
/*
@@ -605,7 +949,7 @@ static struct kcore_list kcore_mem, kcor
void __init mem_init(void)
{
int codesize, reservedpages, datasize, initsize;
- int tmp, bad_ppro;
+ int tmp;
unsigned long pfn;
pci_iommu_alloc();
@@ -613,19 +957,6 @@ void __init mem_init(void)
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
- bad_ppro = ppro_with_ram_bug();
-
-#ifdef CONFIG_HIGHMEM
- /* check that fixmap and pkmap do not overlap */
- if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
- printk(KERN_ERR
- "fixmap and kmap areas overlap - this will crash\n");
- printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
- PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
- FIXADDR_START);
- BUG();
- }
-#endif
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
/* XEN: init low-mem pages outside initial allocation. */
@@ -642,7 +973,7 @@ void __init mem_init(void)
if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
reservedpages++;
- set_highmem_pages_init(bad_ppro);
+ set_highmem_pages_init();
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
@@ -663,7 +994,6 @@ void __init mem_init(void)
(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
);
-#if 1 /* double-sanity-check paranoia */
printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
@@ -704,7 +1034,6 @@ void __init mem_init(void)
#endif
BUG_ON(VMALLOC_START > VMALLOC_END);
BUG_ON((unsigned long)high_memory > VMALLOC_START);
-#endif /* double-sanity-check paranoia */
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
@@ -761,6 +1090,8 @@ void mark_rodata_ro(void)
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
+#ifndef CONFIG_DYNAMIC_FTRACE
+ /* Dynamic tracing modifies the kernel text section */
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel text: %luk\n",
size >> 10);
@@ -773,6 +1104,8 @@ void mark_rodata_ro(void)
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
start += size;
size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
@@ -835,3 +1168,9 @@ void free_initrd_mem(unsigned long start
free_init_pages("initrd memory", start, end);
}
#endif
+
+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+ int flags)
+{
+ return reserve_bootmem(phys, len, flags);
+}
--- head-2011-03-11.orig/arch/x86/mm/init_64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/init_64-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -21,6 +21,7 @@
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
+#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
@@ -52,6 +53,14 @@
#include <xen/features.h>
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_low_pfn_mapped;
+unsigned long max_pfn_mapped;
+
#if CONFIG_XEN_COMPAT <= 0x030002
unsigned int __kernel_page_user;
EXPORT_SYMBOL(__kernel_page_user);
@@ -60,7 +69,6 @@ EXPORT_SYMBOL(__kernel_page_user);
int after_bootmem;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-extern unsigned long start_pfn;
extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
@@ -118,7 +126,7 @@ void __meminit early_make_page_readonly(
}
#ifndef CONFIG_XEN
-int direct_gbpages __meminitdata
+int direct_gbpages
#ifdef CONFIG_DIRECT_GBPAGES
= 1
#endif
@@ -145,55 +153,23 @@ early_param("gbpages", parse_direct_gbpa
* around without checking the pgd every time.
*/
-void show_mem(void)
-{
- long i, total = 0, reserved = 0;
- long shared = 0, cached = 0;
- struct page *page;
- pg_data_t *pgdat;
-
- printk(KERN_INFO "Mem-info:\n");
- show_free_areas();
- for_each_online_pgdat(pgdat) {
- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
- /*
- * This loop can take a while with 256 GB and
- * 4k pages so defer the NMI watchdog:
- */
- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
- touch_nmi_watchdog();
-
- if (!pfn_valid(pgdat->node_start_pfn + i))
- continue;
-
- page = pfn_to_page(pgdat->node_start_pfn + i);
- total++;
- if (PageReserved(page))
- reserved++;
- else if (PageSwapCache(page))
- cached++;
- else if (page_count(page))
- shared += page_count(page) - 1;
- }
- }
- printk(KERN_INFO "%lu pages of RAM\n", total);
- printk(KERN_INFO "%lu reserved pages\n", reserved);
- printk(KERN_INFO "%lu pages shared\n", shared);
- printk(KERN_INFO "%lu pages swap cached\n", cached);
-}
-
static unsigned long __meminitdata table_start;
-static unsigned long __meminitdata table_end;
+static unsigned long __meminitdata table_cur;
+static unsigned long __meminitdata table_top;
-static __init void *spp_getpage(void)
+/*
+ * NOTE: This function is marked __ref because it calls __init function
+ * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
+ */
+static __ref void *spp_getpage(void)
{
void *ptr;
if (after_bootmem)
ptr = (void *) get_zeroed_page(GFP_ATOMIC);
- else if (start_pfn < table_end) {
- ptr = __va(start_pfn << PAGE_SHIFT);
- start_pfn++;
+ else if (table_cur < table_top) {
+ ptr = __va(table_cur << PAGE_SHIFT);
+ table_cur++;
memset(ptr, 0, PAGE_SIZE);
} else
ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -208,30 +184,18 @@ static __init void *spp_getpage(void)
return ptr;
}
-#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
-#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
-
-static __init void
-set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
+void
+set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
- pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte, new_pte;
-
- pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
+ pte_t *pte;
- pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
- if (pgd_none(*pgd)) {
- printk(KERN_ERR
- "PGD FIXMAP MISSING, it should be setup in head.S!\n");
- return;
- }
- pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
+ pud = pud_page + pud_index(vaddr);
if (pud_none(*pud)) {
pmd = (pmd_t *) spp_getpage();
make_page_readonly(pmd, XENFEAT_writable_page_tables);
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
+ pud_populate(&init_mm, pud, pmd);
if (pmd != pmd_offset(pud, 0)) {
printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
pmd, pmd_offset(pud, 0));
@@ -242,19 +206,20 @@ set_pte_phys(unsigned long vaddr, unsign
if (pmd_none(*pmd)) {
pte = (pte_t *) spp_getpage();
make_page_readonly(pte, XENFEAT_writable_page_tables);
- set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
+ pmd_populate_kernel(&init_mm, pmd, pte);
if (pte != pte_offset_kernel(pmd, 0)) {
printk(KERN_ERR "PAGETABLE BUG #02!\n");
return;
}
}
- if (pgprot_val(prot))
- new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
- else
- new_pte = __pte(0);
pte = pte_offset_kernel(pmd, vaddr);
if (!pte_none(*pte) && __pte_val(new_pte) &&
+#ifdef CONFIG_ACPI
+ /* __acpi_map_table() fails to properly call clear_fixmap() */
+ (vaddr < __fix_to_virt(FIX_ACPI_END) ||
+ vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
+#endif
__pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
pte_ERROR(*pte);
set_pte(pte, new_pte);
@@ -266,15 +231,13 @@ set_pte_phys(unsigned long vaddr, unsign
__flush_tlb_one(vaddr);
}
-static __init void
-set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
+void
+set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte, new_pte;
+ pud_t *pud_page;
- pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
+ pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval));
pgd = pgd_offset_k(vaddr);
if (pgd_none(*pgd)) {
@@ -282,47 +245,51 @@ set_pte_phys_ma(unsigned long vaddr, uns
"PGD FIXMAP MISSING, it should be setup in head.S!\n");
return;
}
- pud = pud_offset(pgd, vaddr);
- if (pud_none(*pud)) {
- pmd = (pmd_t *) spp_getpage();
- make_page_readonly(pmd, XENFEAT_writable_page_tables);
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
- if (pmd != pmd_offset(pud, 0)) {
- printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
- pmd, pmd_offset(pud, 0));
+ pud_page = (pud_t*)pgd_page_vaddr(*pgd);
+ set_pte_vaddr_pud(pud_page, vaddr, pteval);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Create large page table mappings for a range of physical addresses.
+ */
+static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
+ pgprot_t prot)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
+ for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
+ pgd = pgd_offset_k((unsigned long)__va(phys));
+ if (pgd_none(*pgd)) {
+ pud = (pud_t *) spp_getpage();
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
+ _PAGE_USER));
}
- }
- pmd = pmd_offset(pud, vaddr);
- if (pmd_none(*pmd)) {
- pte = (pte_t *) spp_getpage();
- make_page_readonly(pte, XENFEAT_writable_page_tables);
- set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
- if (pte != pte_offset_kernel(pmd, 0)) {
- printk(KERN_ERR "PAGETABLE BUG #02!\n");
- return;
+ pud = pud_offset(pgd, (unsigned long)__va(phys));
+ if (pud_none(*pud)) {
+ pmd = (pmd_t *) spp_getpage();
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
+ _PAGE_USER));
}
+ pmd = pmd_offset(pud, phys);
+ BUG_ON(!pmd_none(*pmd));
+ set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
}
- new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
+}
- pte = pte_offset_kernel(pmd, vaddr);
- if (!pte_none(*pte) && __pte_val(new_pte) &&
-#ifdef CONFIG_ACPI
- /* __acpi_map_table() fails to properly call clear_fixmap() */
- (vaddr < __fix_to_virt(FIX_ACPI_END) ||
- vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
-#endif
- __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
- pte_ERROR(*pte);
- set_pte(pte, new_pte);
+void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
+{
+ __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
+}
- /*
- * It's enough to flush this one mapping.
- * (PGE mappings get flushed as well)
- */
- __flush_tlb_one(vaddr);
+void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
+{
+ __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}
-#ifndef CONFIG_XEN
/*
* The head.S code sets up the kernel high mapping:
*
@@ -352,63 +319,52 @@ void __init cleanup_highmap(void)
}
#endif
-/* NOTE: this is meant to be run only at boot */
-void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+static __ref void *alloc_low_page(unsigned long *phys)
{
- unsigned long address = __fix_to_virt(idx);
-
- if (idx >= __end_of_fixed_addresses) {
- printk(KERN_ERR "Invalid __set_fixmap\n");
- return;
- }
- switch (idx) {
- case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
- set_pte_phys(address, phys, prot, 0);
- set_pte_phys(address, phys, prot, 1);
- break;
- case FIX_EARLYCON_MEM_BASE:
- xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
- pfn_pte_ma(phys >> PAGE_SHIFT, prot));
- break;
- default:
- set_pte_phys_ma(address, phys, prot);
- break;
- }
-}
-
-static __meminit void *alloc_static_page(unsigned long *phys)
-{
- unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
+ unsigned long pfn;
+ void *adr;
if (after_bootmem) {
- void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
+ adr = (void *)get_zeroed_page(GFP_ATOMIC);
*phys = __pa(adr);
return adr;
}
- *phys = start_pfn << PAGE_SHIFT;
- start_pfn++;
- memset((void *)va, 0, PAGE_SIZE);
- return (void *)va;
+ BUG_ON(!table_cur);
+ pfn = table_cur++;
+ if (pfn >= table_top)
+ panic("alloc_low_page: ran out of memory");
+
+ adr = early_ioremap(pfn_to_mfn(pfn) * PAGE_SIZE, PAGE_SIZE);
+ memset(adr, 0, PAGE_SIZE);
+ *phys = pfn * PAGE_SIZE;
+ return adr;
}
-#define PTE_SIZE PAGE_SIZE
+static __ref void unmap_low_page(void *adr)
+{
+ if (after_bootmem)
+ return;
+
+ early_iounmap(adr, PAGE_SIZE);
+}
static inline int __meminit make_readonly(unsigned long paddr)
{
extern char __vsyscall_0;
int readonly = 0;
- /* Make new page tables read-only. */
+ /* Make new page tables read-only on the first pass. */
if (!xen_feature(XENFEAT_writable_page_tables)
+ && !max_pfn_mapped
&& (paddr >= (table_start << PAGE_SHIFT))
- && (paddr < (table_end << PAGE_SHIFT)))
+ && (paddr < (table_top << PAGE_SHIFT)))
readonly = 1;
/* Make old page tables read-only. */
if (!xen_feature(XENFEAT_writable_page_tables)
&& (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
- && (paddr < (start_pfn << PAGE_SHIFT)))
+ && (paddr < (table_cur << PAGE_SHIFT)))
readonly = 1;
/*
@@ -425,118 +381,131 @@ static inline int __meminit make_readonl
return readonly;
}
-#ifndef CONFIG_XEN
-/* Must run before zap_low_mappings */
-__meminit void *early_ioremap(unsigned long addr, unsigned long size)
+static unsigned long __meminit
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
{
- pmd_t *pmd, *last_pmd;
- unsigned long vaddr;
- int i, pmds;
-
- pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
- vaddr = __START_KERNEL_map;
- pmd = level2_kernel_pgt;
- last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
-
- for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
- for (i = 0; i < pmds; i++) {
- if (pmd_present(pmd[i]))
- goto continue_outer_loop;
- }
- vaddr += addr & ~PMD_MASK;
- addr &= PMD_MASK;
+ unsigned pages = 0;
+ unsigned long last_map_addr = end;
+ int i;
+
+ pte_t *pte = pte_page + pte_index(addr);
+
+ for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
+ unsigned long pteval = addr | __PAGE_KERNEL;
+
+ if (addr >= end ||
+ (!after_bootmem &&
+ (addr >> PAGE_SHIFT) >= xen_start_info->nr_pages))
+ break;
+
+ if (__pte_val(*pte))
+ continue;
- for (i = 0; i < pmds; i++, addr += PMD_SIZE)
- set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- __flush_tlb_all();
-
- return (void *)vaddr;
-continue_outer_loop:
- ;
+ if (make_readonly(addr))
+ pteval &= ~_PAGE_RW;
+ if (0)
+ printk(" pte=%p addr=%lx pte=%016lx\n",
+ pte, addr, pteval);
+ if (!after_bootmem)
+ *pte = __pte(pteval & __supported_pte_mask);
+ else
+ set_pte(pte, __pte(pteval & __supported_pte_mask));
+ last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
+ pages++;
}
- printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
- return NULL;
+ update_page_count(PG_LEVEL_4K, pages);
+
+ return last_map_addr;
}
-/*
- * To avoid virtual aliases later:
- */
-__meminit void early_iounmap(void *addr, unsigned long size)
+static unsigned long __meminit
+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
{
- unsigned long vaddr;
- pmd_t *pmd;
- int i, pmds;
+ pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
- vaddr = (unsigned long)addr;
- pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
- pmd = level2_kernel_pgt + pmd_index(vaddr);
-
- for (i = 0; i < pmds; i++)
- pmd_clear(pmd + i);
-
- __flush_tlb_all();
+ BUG_ON(!max_pfn_mapped);
+ return phys_pte_init(pte, address, end);
}
-#endif
static unsigned long __meminit
-phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
+ unsigned long page_size_mask)
{
+ unsigned long pages = 0;
+ unsigned long last_map_addr = end;
+ unsigned long start = address;
+
int i = pmd_index(address);
- for (; i < PTRS_PER_PMD; i++) {
+ for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
unsigned long pte_phys;
- pmd_t *pmd = pmd_page + i;
- pte_t *pte, *pte_save;
- int k;
+ pmd_t *pmd = pmd_page + pmd_index(address);
+ pte_t *pte;
if (address >= end)
break;
if (__pmd_val(*pmd)) {
- address += PMD_SIZE;
+ if (!pmd_large(*pmd)) {
+ spin_lock(&init_mm.page_table_lock);
+ last_map_addr = phys_pte_update(pmd, address,
+ end);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ /* Count entries we're using from level2_ident_pgt */
+ if (start == 0)
+ pages++;
continue;
}
- pte = alloc_static_page(&pte_phys);
- pte_save = pte;
- for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
- unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
-
- if (address >= end ||
- (!after_bootmem &&
- (address >> PAGE_SHIFT) >= xen_start_info->nr_pages))
- pteval = 0;
- else if (make_readonly(address))
- pteval &= ~_PAGE_RW;
- set_pte(pte, __pte(pteval & __supported_pte_mask));
+ if (page_size_mask & (1<<PG_LEVEL_2M)) {
+ pages++;
+ spin_lock(&init_mm.page_table_lock);
+ set_pte((pte_t *)pmd,
+ pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ spin_unlock(&init_mm.page_table_lock);
+ last_map_addr = (address & PMD_MASK) + PMD_SIZE;
+ continue;
}
+
+ pte = alloc_low_page(&pte_phys);
+ last_map_addr = phys_pte_init(pte, address, end);
+ unmap_low_page(pte);
+
if (!after_bootmem) {
- early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
- *pmd = __pmd(pte_phys | _KERNPG_TABLE);
+ if (max_pfn_mapped)
+ make_page_readonly(__va(pte_phys),
+ XENFEAT_writable_page_tables);
+ *pmd = __pmd(pte_phys | _PAGE_TABLE);
} else {
- make_page_readonly(pte_save, XENFEAT_writable_page_tables);
- set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
+ make_page_readonly(pte, XENFEAT_writable_page_tables);
+ spin_lock(&init_mm.page_table_lock);
+ pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+ spin_unlock(&init_mm.page_table_lock);
}
}
- return address;
+ update_page_count(PG_LEVEL_2M, pages);
+ return last_map_addr;
}
static unsigned long __meminit
-phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
+ unsigned long page_size_mask)
{
pmd_t *pmd = pmd_offset(pud, 0);
unsigned long last_map_addr;
- spin_lock(&init_mm.page_table_lock);
- last_map_addr = phys_pmd_init(pmd, address, end);
- spin_unlock(&init_mm.page_table_lock);
+ BUG_ON(!max_pfn_mapped);
+ last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
__flush_tlb_all();
return last_map_addr;
}
static unsigned long __meminit
-phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
+ unsigned long page_size_mask)
{
+ unsigned long pages = 0;
unsigned long last_map_addr = end;
int i = pud_index(addr);
@@ -550,29 +519,55 @@ phys_pud_init(pud_t *pud_page, unsigned
if (__pud_val(*pud)) {
if (!pud_large(*pud))
- last_map_addr = phys_pmd_update(pud, addr, end);
+ last_map_addr = phys_pmd_update(pud, addr, end,
+ page_size_mask);
continue;
}
- if (direct_gbpages) {
+ if (page_size_mask & (1<<PG_LEVEL_1G)) {
+ pages++;
+ spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pud,
pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ spin_unlock(&init_mm.page_table_lock);
last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
continue;
}
- pmd = alloc_static_page(&pmd_phys);
+ pmd = alloc_low_page(&pmd_phys);
+ last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
+ unmap_low_page(pmd);
- spin_lock(&init_mm.page_table_lock);
- *pud = __pud(pmd_phys | _KERNPG_TABLE);
- last_map_addr = phys_pmd_init(pmd, addr, end);
- spin_unlock(&init_mm.page_table_lock);
-
- early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
+ if (!after_bootmem) {
+ if (max_pfn_mapped)
+ make_page_readonly(__va(pmd_phys),
+ XENFEAT_writable_page_tables);
+ if (page_size_mask & (1 << PG_LEVEL_NUM))
+ xen_l3_entry_update(pud, __pud(pmd_phys | _PAGE_TABLE));
+ else
+ *pud = __pud(pmd_phys | _PAGE_TABLE);
+ } else {
+ make_page_readonly(pmd, XENFEAT_writable_page_tables);
+ spin_lock(&init_mm.page_table_lock);
+ pud_populate(&init_mm, pud, __va(pmd_phys));
+ spin_unlock(&init_mm.page_table_lock);
+ }
}
__flush_tlb_all();
+ update_page_count(PG_LEVEL_1G, pages);
- return last_map_addr >> PAGE_SHIFT;
+ return last_map_addr;
+}
+
+static unsigned long __meminit
+phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
+ unsigned long page_size_mask)
+{
+ pud_t *pud;
+
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
+
+ return phys_pud_init(pud, addr, end, page_size_mask | (1 << PG_LEVEL_NUM));
}
void __init xen_init_pt(void)
@@ -656,112 +651,36 @@ void __init xen_init_pt(void)
}
}
-static void __init extend_init_mapping(unsigned long tables_space)
-{
- unsigned long va = __START_KERNEL_map;
- unsigned long start = start_pfn;
- unsigned long phys, addr, *pte_page;
- pmd_t *pmd;
- pte_t *pte, new_pte;
- unsigned long *page = (unsigned long *)init_level4_pgt;
-
- addr = page[pgd_index(va)];
- addr_to_page(addr, page);
- addr = page[pud_index(va)];
- addr_to_page(addr, page);
-
- /* Kill mapping of low 1MB. */
- while (va < (unsigned long)&_text) {
- if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
- BUG();
- va += PAGE_SIZE;
- }
-
- /* Ensure init mappings cover kernel text/data and initial tables. */
- while (va < (__START_KERNEL_map
- + (start_pfn << PAGE_SHIFT)
- + tables_space)) {
- if (!(pmd_index(va) | pte_index(va))) {
- pud_t *pud;
-
- page = (unsigned long *)init_level4_pgt;
- addr = page[pgd_index(va)];
- addr_to_page(addr, page);
- pud = (pud_t *)&page[pud_index(va)];
- if (pud_none(*pud)) {
- page = alloc_static_page(&phys);
- early_make_page_readonly(
- page, XENFEAT_writable_page_tables);
- set_pud(pud, __pud(phys | _KERNPG_TABLE));
- } else {
- addr = page[pud_index(va)];
- addr_to_page(addr, page);
- }
- }
- pmd = (pmd_t *)&page[pmd_index(va)];
- if (pmd_none(*pmd)) {
- pte_page = alloc_static_page(&phys);
- early_make_page_readonly(
- pte_page, XENFEAT_writable_page_tables);
- set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
- } else {
- addr = page[pmd_index(va)];
- addr_to_page(addr, pte_page);
- }
- pte = (pte_t *)&pte_page[pte_index(va)];
- if (pte_none(*pte)) {
- new_pte = pfn_pte(
- (va - __START_KERNEL_map) >> PAGE_SHIFT,
- __pgprot(_KERNPG_TABLE));
- xen_l1_entry_update(pte, new_pte);
- }
- va += PAGE_SIZE;
- }
-
- /* Finally, blow away any spurious initial mappings. */
- while (1) {
- if (!(pmd_index(va) | pte_index(va))) {
- page = (unsigned long *)init_level4_pgt;
- addr = page[pgd_index(va)];
- addr_to_page(addr, page);
- if (pud_none(((pud_t *)page)[pud_index(va)]))
- break;
- addr = page[pud_index(va)];
- addr_to_page(addr, page);
- }
- pmd = (pmd_t *)&page[pmd_index(va)];
- if (pmd_none(*pmd))
- break;
- if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
- BUG();
- va += PAGE_SIZE;
- }
-
- if (start_pfn > start)
- reserve_early(start << PAGE_SHIFT,
- start_pfn << PAGE_SHIFT, "INITMAP");
-}
-
static void __init find_early_table_space(unsigned long end)
{
unsigned long puds, pmds, ptes, tables;
puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+ tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
+ tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
- tables = round_up(puds * 8, PAGE_SIZE) +
- round_up(pmds * 8, PAGE_SIZE) +
- round_up(ptes * 8, PAGE_SIZE);
+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
- extend_init_mapping(tables);
+ if (!table_top) {
+ table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
+ xen_start_info->nr_pt_frames;
+ table_cur = table_start;
+ } else {
+ /*
+ * [table_start, table_top) gets passed to reserve_early(),
+ * so we must not use table_cur here, despite continuing
+ * to allocate from there. table_cur possibly being below
+ * table_start is otoh not a problem.
+ */
+ table_start = table_top;
+ }
- table_start = start_pfn;
- table_end = table_start + (tables>>PAGE_SHIFT);
+ table_top = table_cur + (tables >> PAGE_SHIFT);
- early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_start << PAGE_SHIFT,
- (table_start << PAGE_SHIFT) + tables);
+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, table_cur << PAGE_SHIFT, table_top << PAGE_SHIFT);
}
static void __init xen_finish_init_mapping(void)
@@ -783,18 +702,18 @@ static void __init xen_finish_init_mappi
xen_start_info->mod_start = (unsigned long)
__va(__pa(xen_start_info->mod_start));
- /* Destroy the Xen-created mappings beyond the kernel image as
- * well as the temporary mappings created above. Prevents
- * overlap with modules area (if init mapping is very big).
- */
+ /* Destroy the Xen-created mappings beyond the kernel image. */
start = PAGE_ALIGN((unsigned long)_end);
- end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
+ end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
for (; start < end; start += PAGE_SIZE)
if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
BUG();
- /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
- table_end = ~0UL;
+ /* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
+ start = table_top;
+ WARN(table_cur != start, "start=%lx cur=%lx top=%lx\n",
+ table_start, table_cur, start);
+ table_top = ~0UL;
/* Switch to the real shared_info page, and clear the dummy page. */
set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
@@ -811,8 +730,7 @@ static void __init xen_finish_init_mappi
<< PAGE_SHIFT,
PAGE_KERNEL_RO);
- /* Disable the 'start_pfn' allocator. */
- table_end = start_pfn;
+ table_top = max(table_cur, start);
}
static void __init init_gbpages(void)
@@ -825,126 +743,91 @@ static void __init init_gbpages(void)
#endif
}
-#ifdef CONFIG_MEMTEST_BOOTPARAM
-
-static void __init memtest(unsigned long start_phys, unsigned long size,
- unsigned pattern)
+static unsigned long __init kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
{
- unsigned long i;
- unsigned long *start;
- unsigned long start_bad;
- unsigned long last_bad;
- unsigned long val;
- unsigned long start_phys_aligned;
- unsigned long count;
- unsigned long incr;
-
- switch (pattern) {
- case 0:
- val = 0UL;
- break;
- case 1:
- val = -1UL;
- break;
- case 2:
- val = 0x5555555555555555UL;
- break;
- case 3:
- val = 0xaaaaaaaaaaaaaaaaUL;
- break;
- default:
- return;
- }
- incr = sizeof(unsigned long);
- start_phys_aligned = ALIGN(start_phys, incr);
- count = (size - (start_phys_aligned - start_phys))/incr;
- start = __va(start_phys_aligned);
- start_bad = 0;
- last_bad = 0;
-
- for (i = 0; i < count; i++)
- start[i] = val;
- for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
- if (*start != val) {
- if (start_phys_aligned == last_bad + incr) {
- last_bad += incr;
- } else {
- if (start_bad) {
- printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
- val, start_bad, last_bad + incr);
- reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
- }
- start_bad = last_bad = start_phys_aligned;
- }
- }
- }
- if (start_bad) {
- printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
- val, start_bad, last_bad + incr);
- reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
- }
+ unsigned long next, last_map_addr = end;
-}
+ start = (unsigned long)__va(start);
+ end = (unsigned long)__va(end);
-static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
+ for (; start < end; start = next) {
+ pgd_t *pgd = pgd_offset_k(start);
+ unsigned long pud_phys;
+ pud_t *pud;
-static int __init parse_memtest(char *arg)
-{
- if (arg)
- memtest_pattern = simple_strtoul(arg, NULL, 0);
- return 0;
-}
+ next = (start + PGDIR_SIZE) & PGDIR_MASK;
+ if (next > end)
+ next = end;
-early_param("memtest", parse_memtest);
+ if (__pgd_val(*pgd)) {
+ last_map_addr = phys_pud_update(pgd, __pa(start),
+ __pa(end), page_size_mask);
+ continue;
+ }
-static void __init early_memtest(unsigned long start, unsigned long end)
-{
- u64 t_start, t_size;
- unsigned pattern;
+ pud = alloc_low_page(&pud_phys);
+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
+ page_size_mask);
+ unmap_low_page(pud);
- if (!memtest_pattern)
- return;
+ if (!after_bootmem) {
+ if (max_pfn_mapped)
+ make_page_readonly(__va(pud_phys),
+ XENFEAT_writable_page_tables);
+ xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
+ } else {
+ make_page_readonly(pud, XENFEAT_writable_page_tables);
+ spin_lock(&init_mm.page_table_lock);
+ pgd_populate(&init_mm, pgd, __va(pud_phys));
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
- printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
- for (pattern = 0; pattern < memtest_pattern; pattern++) {
- t_start = start;
- t_size = 0;
- while (t_start < end) {
- t_start = find_e820_area_size(t_start, &t_size, 1);
+ return last_map_addr;
+}
- /* done ? */
- if (t_start >= end)
- break;
- if (t_start + t_size > end)
- t_size = end - t_start;
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
- printk(KERN_CONT "\n %016llx - %016llx pattern %d",
- (unsigned long long)t_start,
- (unsigned long long)t_start + t_size, pattern);
+#define NR_RANGE_MR 5
- memtest(t_start, t_size, pattern);
+static int save_mr(struct map_range *mr, int nr_range,
+ unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long page_size_mask)
+{
- t_start += t_size;
- }
+ if (start_pfn < end_pfn) {
+ if (nr_range >= NR_RANGE_MR)
+ panic("run out of range for init_memory_mapping\n");
+ mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+ mr[nr_range].end = end_pfn<<PAGE_SHIFT;
+ mr[nr_range].page_size_mask = page_size_mask;
+ nr_range++;
}
- printk(KERN_CONT "\n");
-}
-#else
-static void __init early_memtest(unsigned long start, unsigned long end)
-{
+
+ return nr_range;
}
-#endif
/*
* Setup the direct mapping of the physical memory at PAGE_OFFSET.
* This runs before bootmem is initialized and gets pages directly from
* the physical memory. To access them they are temporarily mapped.
*/
-unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
{
- unsigned long next, last_map_addr = end;
- unsigned long start_phys = start, end_phys = end;
+ unsigned long last_map_addr = 0;
+ unsigned long page_size_mask = 0;
+ unsigned long start_pfn, end_pfn;
+
+ struct map_range mr[NR_RANGE_MR];
+ int nr_range, i;
printk(KERN_INFO "init_memory_mapping\n");
@@ -955,51 +838,150 @@ unsigned long __init_refok init_memory_m
* memory mapped. Unfortunately this is done currently before the
* nodes are discovered.
*/
- if (!after_bootmem) {
+ if (!after_bootmem)
init_gbpages();
- find_early_table_space(end);
+
+ if (direct_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (cpu_has_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = 0;
+
+ /* head if not big page alignment ?*/
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+ /* big page (2M) range*/
+ start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
+ end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+
+ /* big page (1G) range */
+ start_pfn = end_pfn;
+ end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask &
+ ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+
+ /* tail is not big page (1G) alignment */
+ start_pfn = end_pfn;
+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+
+ /* tail is not big page (2M) alignment */
+ start_pfn = end_pfn;
+ end_pfn = end>>PAGE_SHIFT;
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+ /* try to merge same page size and continuous */
+ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+ unsigned long old_start;
+ if (mr[i].end != mr[i+1].start ||
+ mr[i].page_size_mask != mr[i+1].page_size_mask)
+ continue;
+ /* move it */
+ old_start = mr[i].start;
+ memmove(&mr[i], &mr[i+1],
+ (nr_range - 1 - i) * sizeof (struct map_range));
+ mr[i--].start = old_start;
+ nr_range--;
}
- start = (unsigned long)__va(start);
- end = (unsigned long)__va(end);
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+ mr[i].start, mr[i].end,
+ (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+ (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
- for (; start < end; start = next) {
- pgd_t *pgd = pgd_offset_k(start);
- unsigned long pud_phys;
- pud_t *pud;
+ if (!after_bootmem)
+ find_early_table_space(end);
- if (after_bootmem)
- pud = pud_offset(pgd, start & PGDIR_MASK);
- else
- pud = alloc_static_page(&pud_phys);
- next = start + PGDIR_SIZE;
- if (next > end)
- next = end;
- last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
- if (!after_bootmem) {
- early_make_page_readonly(pud, XENFEAT_writable_page_tables);
- set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+ if (!start) {
+ unsigned long addr, va = __START_KERNEL_map;
+ unsigned long *page = (unsigned long *)init_level4_pgt;
+
+ /* Kill mapping of memory below _text. */
+ while (va < (unsigned long)&_text) {
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
+ }
+
+ /* Blow away any spurious initial mappings. */
+ va = __START_KERNEL_map + (table_start << PAGE_SHIFT);
+ addr = page[pgd_index(va)];
+ addr_to_page(addr, page);
+ addr = page[pud_index(va)];
+ addr_to_page(addr, page);
+ while (pmd_index(va) | pte_index(va)) {
+ if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
+ break;
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
}
}
- if (!after_bootmem) {
- BUG_ON(start_pfn != table_end);
+ for (i = 0; i < nr_range; i++)
+ last_map_addr = kernel_physical_mapping_init(
+ mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+
+ BUG_ON(table_cur > table_top);
+ if (!start)
xen_finish_init_mapping();
- }
+ else if (table_cur < table_top)
+ /* Disable the 'table_cur' allocator. */
+ table_top = table_cur;
__flush_tlb_all();
- if (!after_bootmem)
+ if (!after_bootmem && table_top > table_start)
reserve_early(table_start << PAGE_SHIFT,
- table_end << PAGE_SHIFT, "PGTABLE");
+ table_top << PAGE_SHIFT, "PGTABLE");
+
+ printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
+ last_map_addr, end);
if (!after_bootmem)
- early_memtest(start_phys, end_phys);
+ early_memtest(start, end);
- return last_map_addr;
+ return last_map_addr >> PAGE_SHIFT;
}
#ifndef CONFIG_NUMA
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long bootmap_size, bootmap;
+
+ e820_register_active_regions(0, start_pfn, end_pfn);
+#ifdef CONFIG_XEN
+ if (end_pfn > xen_start_info->nr_pages)
+ end_pfn = xen_start_info->nr_pages;
+#endif
+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
+ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
+ PAGE_SIZE);
+ if (bootmap == -1L)
+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ /* don't touch min_low_pfn */
+ bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
+ 0, end_pfn);
+ free_bootmem_with_active_regions(0, end_pfn);
+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
+ reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
+}
+
void __init paging_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -1007,9 +989,9 @@ void __init paging_init(void)
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
- max_zone_pfns[ZONE_NORMAL] = end_pfn;
+ max_zone_pfns[ZONE_NORMAL] = max_pfn;
- memory_present(0, 0, end_pfn);
+ memory_present(0, 0, max_pfn);
sparse_init();
free_area_init_nodes(max_zone_pfns);
@@ -1099,8 +1081,8 @@ void __init mem_init(void)
ClearPageReserved(pfn_to_page(pfn));
init_page_count(pfn_to_page(pfn));
}
- reservedpages = end_pfn - totalram_pages -
- absent_pages_in_range(0, end_pfn);
+ reservedpages = max_pfn - totalram_pages -
+ absent_pages_in_range(0, max_pfn);
after_bootmem = 1;
codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -1119,7 +1101,7 @@ void __init mem_init(void)
printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
"%ldk reserved, %ldk data, %ldk init)\n",
(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
- end_pfn << (PAGE_SHIFT-10),
+ max_pfn << (PAGE_SHIFT-10),
codesize >> 10,
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
@@ -1182,6 +1164,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
+ unsigned long rodata_start =
+ ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /* Dynamic tracing modifies the kernel text section */
+ start = rodata_start;
+#endif
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -1191,8 +1180,7 @@ void mark_rodata_ro(void)
* The rodata section (but not the kernel text!) should also be
* not-executable.
*/
- start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
- set_memory_nx(start, (end - start) >> PAGE_SHIFT);
+ set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
rodata_test();
@@ -1214,24 +1202,26 @@ void free_initrd_mem(unsigned long start
}
#endif
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+ int flags)
{
#ifdef CONFIG_NUMA
int nid, next_nid;
+ int ret;
#endif
unsigned long pfn = phys >> PAGE_SHIFT;
- if (pfn >= end_pfn) {
+ if (pfn >= max_pfn) {
/*
* This can happen with kdump kernels when accessing
* firmware tables:
*/
if (pfn < max_pfn_mapped)
- return;
+ return -EFAULT;
- printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
+ printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
phys, len);
- return;
+ return -EFAULT;
}
/* Should check here against the e820 map to avoid double free */
@@ -1239,9 +1229,13 @@ void __init reserve_bootmem_generic(unsi
nid = phys_to_nid(phys);
next_nid = phys_to_nid(phys + len - 1);
if (nid == next_nid)
- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
else
- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem(phys, len, flags);
+
+ if (ret != 0)
+ return ret;
+
#else
reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif
@@ -1252,6 +1246,8 @@ void __init reserve_bootmem_generic(unsi
set_dma_reserve(dma_reserve);
}
#endif
+
+ return 0;
}
int kern_addr_valid(unsigned long addr)
@@ -1369,7 +1365,7 @@ vmemmap_populate(struct page *start_page
pmd_t *pmd;
for (; addr < end; addr = next) {
- next = pmd_addr_end(addr, end);
+ void *p = NULL;
pgd = vmemmap_pgd_populate(addr, node);
if (!pgd)
@@ -1379,33 +1375,51 @@ vmemmap_populate(struct page *start_page
if (!pud)
return -ENOMEM;
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd)) {
- pte_t entry;
- void *p;
+ if (!cpu_has_pse) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ pmd = vmemmap_pmd_populate(pud, addr, node);
+
+ if (!pmd)
+ return -ENOMEM;
+
+ p = vmemmap_pte_populate(pmd, addr, node);
- p = vmemmap_alloc_block(PMD_SIZE, node);
if (!p)
return -ENOMEM;
- entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
- PAGE_KERNEL_LARGE);
- set_pmd(pmd, __pmd_ma(__pte_val(entry)));
-
- /* check to see if we have contiguous blocks */
- if (p_end != p || node_start != node) {
- if (p_start)
- printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
- addr_start, addr_end-1, p_start, p_end-1, node_start);
- addr_start = addr;
- node_start = node;
- p_start = p;
- }
- addr_end = addr + PMD_SIZE;
- p_end = p + PMD_SIZE;
+ addr_end = addr + PAGE_SIZE;
+ p_end = p + PAGE_SIZE;
} else {
- vmemmap_verify((pte_t *)pmd, node, addr, next);
+ next = pmd_addr_end(addr, end);
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ pte_t entry;
+
+ p = vmemmap_alloc_block(PMD_SIZE, node);
+ if (!p)
+ return -ENOMEM;
+
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE);
+ set_pmd(pmd, __pmd_ma(__pte_val(entry)));
+
+ /* check to see if we have contiguous blocks */
+ if (p_end != p || node_start != node) {
+ if (p_start)
+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ addr_start = addr;
+ node_start = node;
+ p_start = p;
+ }
+
+ addr_end = addr + PMD_SIZE;
+ p_end = p + PMD_SIZE;
+ } else
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
}
+
}
return 0;
}
--- head-2011-03-11.orig/arch/x86/mm/ioremap-xen.c 2011-02-07 15:39:13.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/ioremap-xen.c 2011-02-07 15:40:39.000000000 +0100
@@ -13,6 +13,7 @@
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/mmiotrace.h>
#include <asm/cacheflush.h>
#include <asm/e820.h>
@@ -255,7 +256,8 @@ int ioremap_check_change_attr(unsigned l
for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
unsigned long pfn = mfn_to_local_pfn(mfn);
- if (pfn >= max_pfn_mapped)
+ if (pfn >= max_low_pfn_mapped &&
+ (pfn < (1UL<<(32 - PAGE_SHIFT)) || pfn >= max_pfn_mapped))
continue;
rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
PAGE_SIZE, prot_val);
@@ -278,11 +280,14 @@ static void __iomem *__ioremap_caller(re
{
unsigned long offset, vaddr;
phys_addr_t mfn, last_addr;
+ const resource_size_t unaligned_phys_addr = phys_addr;
+ const unsigned long unaligned_size = size;
struct vm_struct *area;
unsigned long new_prot_val;
pgprot_t prot;
int retval;
domid_t domid = DOMID_IO;
+ void __iomem *ret_addr;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
@@ -299,7 +304,7 @@ static void __iomem *__ioremap_caller(re
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
- if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
+ if (is_initial_xendomain() && is_ISA_range(phys_addr, last_addr))
return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
/*
@@ -323,7 +328,7 @@ static void __iomem *__ioremap_caller(re
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
- retval = reserve_memtype(phys_addr, phys_addr + size,
+ retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
prot_val, &new_prot_val);
if (retval) {
pr_debug("Warning: reserve_memtype returned %d\n", retval);
@@ -391,7 +396,10 @@ static void __iomem *__ioremap_caller(re
return NULL;
}
- return (void __iomem *) (vaddr + offset);
+ ret_addr = (void __iomem *) (vaddr + offset);
+ mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
+
+ return ret_addr;
}
/**
@@ -419,7 +427,7 @@ void __iomem *ioremap_nocache(resource_s
{
/*
* Ideally, this should be:
- * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
+ * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
*
* Till we fix all X drivers to use ioremap_wc(), we will use
* UC MINUS.
@@ -443,7 +451,7 @@ EXPORT_SYMBOL(ioremap_nocache);
*/
void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
{
- if (pat_wc_enabled)
+ if (pat_enabled)
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
__builtin_return_address(0));
else
@@ -483,6 +491,14 @@ static void __iomem *ioremap_default(res
}
#endif
+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long prot_val)
+{
+ return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_prot);
+
/**
* iounmap - Free a IO remapping
* @addr: virtual address from ioremap_*
@@ -507,6 +523,8 @@ void iounmap(volatile void __iomem *addr
addr = (volatile void __iomem *)
(PAGE_MASK & (unsigned long __force)addr);
+ mmiotrace_iounmap(addr);
+
/* Use the vm area unlocked, assuming the caller
ensures there isn't another iounmap for the same address
in parallel. Reuse of the virtual address is prevented by
@@ -514,7 +532,7 @@ void iounmap(volatile void __iomem *addr
cpa takes care of the direct mappings. */
read_lock(&vmlist_lock);
for (p = vmlist; p; p = p->next) {
- if (p->addr == addr)
+ if (p->addr == (void __force *)addr)
break;
}
read_unlock(&vmlist_lock);
@@ -528,7 +546,7 @@ void iounmap(volatile void __iomem *addr
free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
/* Finally remove it */
- o = remove_vm_area((void *)addr);
+ o = remove_vm_area((void __force *)addr);
BUG_ON(p != o || o == NULL);
kfree(p);
}
@@ -548,7 +566,7 @@ void *xlate_dev_mem_ptr(unsigned long ph
if (page_is_ram(start >> PAGE_SHIFT))
return __va(phys);
- addr = (void *)ioremap_default(start, PAGE_SIZE);
+ addr = (void __force *)ioremap_default(start, PAGE_SIZE);
if (addr)
addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
@@ -576,8 +594,7 @@ static int __init early_ioremap_debug_se
early_param("early_ioremap_debug", early_ioremap_debug_setup);
static __initdata int after_paging_init;
-static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
- __section(.bss.page_aligned);
+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
#ifdef CONFIG_X86_32
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
@@ -676,10 +693,11 @@ static void __init __early_set_fixmap(en
return;
}
pte = early_ioremap_pte(addr);
+
if (pgprot_val(flags))
set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
else
- pte_clear(NULL, addr, pte);
+ pte_clear(&init_mm, addr, pte);
__flush_tlb_one(addr);
}
@@ -707,13 +725,11 @@ static int __init check_early_ioremap_le
{
if (!early_ioremap_nested)
return 0;
-
- printk(KERN_WARNING
+ WARN(1, KERN_WARNING
"Debug warning: early ioremap leak of %d areas detected.\n",
- early_ioremap_nested);
+ early_ioremap_nested);
printk(KERN_WARNING
- "please boot with early_ioremap_debug and report the dmesg.\n");
- WARN_ON(1);
+ "please boot with early_ioremap_debug and report the dmesg.\n");
return 1;
}
--- head-2011-03-11.orig/arch/x86/mm/pageattr-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/pageattr-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -34,6 +34,47 @@ struct cpa_data {
unsigned force_split : 1;
};
+#ifdef CONFIG_PROC_FS
+static unsigned long direct_pages_count[PG_LEVEL_NUM];
+
+void update_page_count(int level, unsigned long pages)
+{
+ unsigned long flags;
+
+ /* Protect against CPA */
+ spin_lock_irqsave(&pgd_lock, flags);
+ direct_pages_count[level] += pages;
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+static void split_page_count(int level)
+{
+ direct_pages_count[level]--;
+ direct_pages_count[level - 1] += PTRS_PER_PTE;
+}
+
+int arch_report_meminfo(char *page)
+{
+ int n = sprintf(page, "DirectMap4k: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_4K] << 2);
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+ n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_2M] << 11);
+#else
+ n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_2M] << 12);
+#endif
+#ifdef CONFIG_X86_64
+ if (direct_gbpages)
+ n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_1G] << 20);
+#endif
+ return n;
+}
+#else
+static inline void split_page_count(int level) { }
+#endif
+
#ifdef CONFIG_X86_64
static inline unsigned long highmap_start_pfn(void)
@@ -106,7 +147,7 @@ static void cpa_flush_all(unsigned long
{
BUG_ON(irqs_disabled());
- on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
+ on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
static void __cpa_flush_range(void *arg)
@@ -127,7 +168,7 @@ static void cpa_flush_range(unsigned lon
BUG_ON(irqs_disabled());
WARN_ON(PAGE_ALIGN(start) != start);
- on_each_cpu(__cpa_flush_range, NULL, 1, 1);
+ on_each_cpu(__cpa_flush_range, NULL, 1);
if (!cache)
return;
@@ -229,6 +270,7 @@ pte_t *lookup_address(unsigned long addr
return pte_offset_kernel(pmd, address);
}
+EXPORT_SYMBOL_GPL(lookup_address);
/*
* Set the new pmd in all the pgds we know about:
@@ -509,6 +551,16 @@ static int split_large_page(pte_t *kpte,
}
#endif
+ if (address >= (unsigned long)__va(0) &&
+ address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+ split_page_count(level);
+
+#ifdef CONFIG_X86_64
+ if (address >= (unsigned long)__va(1UL<<32) &&
+ address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
+ split_page_count(level);
+#endif
+
/*
* Get the target mfn from the original entry:
*/
@@ -566,10 +618,9 @@ repeat:
if (!__pte_val(old_pte)) {
if (!primary)
return 0;
- printk(KERN_WARNING "CPA: called for zero pte. "
+ WARN(1, KERN_WARNING "CPA: called for zero pte. "
"vaddr = %lx cpa->vaddr = %lx\n", address,
cpa->vaddr);
- WARN_ON(1);
return -EINVAL;
}
@@ -634,15 +685,24 @@ static int cpa_process_alias(struct cpa_
struct cpa_data alias_cpa;
int ret = 0;
- if (cpa->pfn > max_pfn_mapped)
+ if (cpa->pfn >= max_pfn_mapped)
return 0;
+#ifdef CONFIG_X86_64
+ if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
+ return 0;
+#endif
/*
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (!within(cpa->vaddr, PAGE_OFFSET,
- PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+ if (!(within(cpa->vaddr, PAGE_OFFSET,
+ PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
+#ifdef CONFIG_X86_64
+ || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
+#endif
+ )) {
alias_cpa = *cpa;
alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
@@ -796,6 +856,51 @@ static inline int change_page_attr_clear
return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
}
+#ifdef CONFIG_XEN
+static void _free_memtype(u64 pstart, u64 pend)
+{
+ u64 pa = pstart &= __PHYSICAL_MASK;
+ u64 ma = phys_to_machine(pa);
+
+ while ((pa += PAGE_SIZE) < pend) {
+ if (phys_to_machine(pa) != ma + (pa - pstart)) {
+ free_memtype(ma, ma + (pa - pstart));
+ pstart = pa;
+ ma = phys_to_machine(pa);
+ }
+ }
+ free_memtype(ma, ma + (pend - pstart));
+}
+#define free_memtype _free_memtype
+
+static int _reserve_memtype(u64 pstart, u64 pend, unsigned long req_type)
+{
+ u64 pcur = pstart &= __PHYSICAL_MASK, pa = pcur;
+ u64 ma = phys_to_machine(pa);
+ int rc = 0;
+
+ while ((pa += PAGE_SIZE) < pend) {
+ if (phys_to_machine(pa) != ma + (pa - pcur)) {
+ rc = reserve_memtype(ma, ma + (pa - pcur),
+ req_type, NULL);
+ if (rc)
+ break;
+ pcur = pa;
+ ma = phys_to_machine(pa);
+ }
+ }
+ if (likely(!rc))
+ rc = reserve_memtype(ma, ma + (pend - pcur), req_type, NULL);
+
+ if (unlikely(!rc) && pstart < pcur)
+ _free_memtype(pstart, pcur);
+
+ return rc;
+}
+#define reserve_memtype(s, e, r, n) \
+ _reserve_memtype(s, e, BUILD_BUG_ON_ZERO(n) ?: (r))
+#endif
+
int _set_memory_uc(unsigned long addr, int numpages)
{
/*
@@ -810,7 +915,7 @@ int set_memory_uc(unsigned long addr, in
/*
* for now UC MINUS. see comments in ioremap_nocache()
*/
- if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+ if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
_PAGE_CACHE_UC_MINUS, NULL))
return -EINVAL;
@@ -826,10 +931,10 @@ int _set_memory_wc(unsigned long addr, i
int set_memory_wc(unsigned long addr, int numpages)
{
- if (!pat_wc_enabled)
+ if (!pat_enabled)
return set_memory_uc(addr, numpages);
- if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+ if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
_PAGE_CACHE_WC, NULL))
return -EINVAL;
@@ -845,7 +950,7 @@ int _set_memory_wb(unsigned long addr, i
int set_memory_wb(unsigned long addr, int numpages)
{
- free_memtype(addr, addr + numpages * PAGE_SIZE);
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
return _set_memory_wb(addr, numpages);
}
--- head-2011-03-11.orig/arch/x86/mm/pat-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/pat-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -12,6 +12,8 @@
#include <linux/gfp.h>
#include <linux/fs.h>
#include <linux/bootmem.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <asm/msr.h>
#include <asm/tlbflush.h>
@@ -26,11 +28,11 @@
#include <asm/io.h>
#ifdef CONFIG_X86_PAT
-int __read_mostly pat_wc_enabled = 1;
+int __read_mostly pat_enabled = 1;
void __cpuinit pat_disable(char *reason)
{
- pat_wc_enabled = 0;
+ pat_enabled = 0;
printk(KERN_INFO "%s\n", reason);
}
@@ -42,6 +44,19 @@ static int __init nopat(char *str)
early_param("nopat", nopat);
#endif
+
+static int debug_enable;
+static int __init pat_debug_setup(char *str)
+{
+ debug_enable = 1;
+ return 0;
+}
+__setup("debugpat", pat_debug_setup);
+
+#define dprintk(fmt, arg...) \
+ do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
+
+
static u64 __read_mostly boot_pat_state;
enum {
@@ -53,24 +68,25 @@ enum {
PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
};
-#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
+#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
void pat_init(void)
{
u64 pat;
- if (!pat_wc_enabled)
+ if (!pat_enabled)
return;
/* Paranoia check. */
- if (!cpu_has_pat) {
- printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
+ if (!cpu_has_pat && boot_pat_state) {
/*
- * Panic if this happens on the secondary CPU, and we
+ * If this happens we are on a secondary CPU, but
* switched to PAT on the boot CPU. We have no way to
* undo PAT.
- */
- BUG_ON(boot_pat_state);
+ */
+ printk(KERN_ERR "PAT enabled, "
+ "but not supported by secondary CPU\n");
+ BUG();
}
#ifndef CONFIG_XEN
@@ -87,8 +103,8 @@ void pat_init(void)
* 011 UC _PAGE_CACHE_UC
* PAT bit unused
*/
- pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
- PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
+ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
/* Boot CPU check */
if (!boot_pat_state)
@@ -113,13 +129,13 @@ void pat_init(void)
static char *cattr_name(unsigned long flags)
{
switch (flags & _PAGE_CACHE_MASK) {
- case _PAGE_CACHE_UC: return "uncached";
- case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
- case _PAGE_CACHE_WB: return "write-back";
- case _PAGE_CACHE_WC: return "write-combining";
- case _PAGE_CACHE_WP: return "write-protected";
- case _PAGE_CACHE_WT: return "write-through";
- default: return "broken";
+ case _PAGE_CACHE_UC: return "uncached";
+ case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
+ case _PAGE_CACHE_WB: return "write-back";
+ case _PAGE_CACHE_WC: return "write-combining";
+ case _PAGE_CACHE_WP: return "write-protected";
+ case _PAGE_CACHE_WT: return "write-through";
+ default: return "broken";
}
}
@@ -157,49 +173,55 @@ static DEFINE_SPINLOCK(memtype_lock); /
* The intersection is based on "Effective Memory Type" tables in IA-32
* SDM vol 3a
*/
-static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
- unsigned long *ret_prot)
+static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
{
- unsigned long pat_type;
- u8 mtrr_type;
-
- pat_type = prot & _PAGE_CACHE_MASK;
- prot &= (~_PAGE_CACHE_MASK);
-
- /*
- * We return the PAT request directly for types where PAT takes
- * precedence with respect to MTRR and for UC_MINUS.
- * Consistency checks with other PAT requests is done later
- * while going through memtype list.
- */
- if (pat_type == _PAGE_CACHE_WC) {
- *ret_prot = prot | _PAGE_CACHE_WC;
- return 0;
- } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
- *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
- return 0;
- } else if (pat_type == _PAGE_CACHE_UC) {
- *ret_prot = prot | _PAGE_CACHE_UC;
- return 0;
- }
-
/*
* Look for MTRR hint to get the effective type in case where PAT
* request is for WB.
*/
- mtrr_type = mtrr_type_lookup(start, end);
+ if (req_type == _PAGE_CACHE_WB) {
+ u8 mtrr_type;
- if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
- *ret_prot = prot | _PAGE_CACHE_UC;
- } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
- *ret_prot = prot | _PAGE_CACHE_WC;
- } else {
- *ret_prot = prot | _PAGE_CACHE_WB;
+ mtrr_type = mtrr_type_lookup(start, end);
+ if (mtrr_type == MTRR_TYPE_UNCACHABLE)
+ return _PAGE_CACHE_UC;
+ if (mtrr_type == MTRR_TYPE_WRCOMB)
+ return _PAGE_CACHE_WC;
+ }
+
+ return req_type;
+}
+
+static int chk_conflict(struct memtype *new, struct memtype *entry,
+ unsigned long *type)
+{
+ if (new->type != entry->type) {
+ if (type) {
+ new->type = entry->type;
+ *type = entry->type;
+ } else
+ goto conflict;
}
+ /* check overlaps with more than one entry in the list */
+ list_for_each_entry_continue(entry, &memtype_list, nd) {
+ if (new->end <= entry->start)
+ break;
+ else if (new->type != entry->type)
+ goto conflict;
+ }
return 0;
+
+ conflict:
+ printk(KERN_INFO "%s:%d conflicting memory types "
+ "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
+ new->end, cattr_name(new->type), cattr_name(entry->type));
+ return -EBUSY;
}
+static struct memtype *cached_entry;
+static u64 cached_start;
+
/*
* req_type typically has one of the:
* - _PAGE_CACHE_WB
@@ -210,37 +232,36 @@ static int pat_x_mtrr_type(u64 start, u6
* req_type will have a special case value '-1', when requester want to inherit
* the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
*
- * If ret_type is NULL, function will return an error if it cannot reserve the
- * region with req_type. If ret_type is non-null, function will return
- * available type in ret_type in case of no error. In case of any error
+ * If new_type is NULL, function will return an error if it cannot reserve the
+ * region with req_type. If new_type is non-NULL, function will return
+ * available type in new_type in case of no error. In case of any error
* it will return a negative return value.
*/
int reserve_memtype(u64 start, u64 end, unsigned long req_type,
- unsigned long *ret_type)
+ unsigned long *new_type)
{
- struct memtype *new_entry = NULL;
- struct memtype *parse;
+ struct memtype *new, *entry;
unsigned long actual_type;
+ struct list_head *where;
int err = 0;
- /* Only track when pat_wc_enabled */
- if (!pat_wc_enabled) {
+ BUG_ON(start >= end); /* end is exclusive */
+
+ if (!pat_enabled) {
/* This is identical to page table setting without PAT */
- if (ret_type) {
- if (req_type == -1) {
- *ret_type = _PAGE_CACHE_WB;
- } else {
- *ret_type = req_type;
- }
+ if (new_type) {
+ if (req_type == -1)
+ *new_type = _PAGE_CACHE_WB;
+ else
+ *new_type = req_type & _PAGE_CACHE_MASK;
}
return 0;
}
/* Low ISA region is always mapped WB in page table. No need to track */
- if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
- if (ret_type)
- *ret_type = _PAGE_CACHE_WB;
-
+ if (is_ISA_range(start, end - 1)) {
+ if (new_type)
+ *new_type = _PAGE_CACHE_WB;
return 0;
}
@@ -253,206 +274,118 @@ int reserve_memtype(u64 start, u64 end,
*/
u8 mtrr_type = mtrr_type_lookup(start, end);
- if (mtrr_type == MTRR_TYPE_WRBACK) {
- req_type = _PAGE_CACHE_WB;
+ if (mtrr_type == MTRR_TYPE_WRBACK)
actual_type = _PAGE_CACHE_WB;
- } else {
- req_type = _PAGE_CACHE_UC_MINUS;
+ else
actual_type = _PAGE_CACHE_UC_MINUS;
- }
- } else {
- req_type &= _PAGE_CACHE_MASK;
- err = pat_x_mtrr_type(start, end, req_type, &actual_type);
- }
-
- if (err) {
- if (ret_type)
- *ret_type = actual_type;
+ } else
+ actual_type = pat_x_mtrr_type(start, end,
+ req_type & _PAGE_CACHE_MASK);
- return -EINVAL;
- }
-
- new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
- if (!new_entry)
+ new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!new)
return -ENOMEM;
- new_entry->start = start;
- new_entry->end = end;
- new_entry->type = actual_type;
+ new->start = start;
+ new->end = end;
+ new->type = actual_type;
- if (ret_type)
- *ret_type = actual_type;
+ if (new_type)
+ *new_type = actual_type;
spin_lock(&memtype_lock);
- /* Search for existing mapping that overlaps the current range */
- list_for_each_entry(parse, &memtype_list, nd) {
- struct memtype *saved_ptr;
+ if (cached_entry && start >= cached_start)
+ entry = cached_entry;
+ else
+ entry = list_entry(&memtype_list, struct memtype, nd);
- if (parse->start >= end) {
- pr_debug("New Entry\n");
- list_add(&new_entry->nd, parse->nd.prev);
- new_entry = NULL;
+ /* Search for existing mapping that overlaps the current range */
+ where = NULL;
+ list_for_each_entry_continue(entry, &memtype_list, nd) {
+ if (end <= entry->start) {
+ where = entry->nd.prev;
+ cached_entry = list_entry(where, struct memtype, nd);
break;
- }
-
- if (start <= parse->start && end >= parse->start) {
- if (actual_type != parse->type && ret_type) {
- actual_type = parse->type;
- *ret_type = actual_type;
- new_entry->type = actual_type;
- }
-
- if (actual_type != parse->type) {
- printk(
- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid,
- start, end,
- cattr_name(actual_type),
- cattr_name(parse->type));
- err = -EBUSY;
- break;
- }
-
- saved_ptr = parse;
- /*
- * Check to see whether the request overlaps more
- * than one entry in the list
- */
- list_for_each_entry_continue(parse, &memtype_list, nd) {
- if (end <= parse->start) {
- break;
- }
-
- if (actual_type != parse->type) {
- printk(
- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid,
- start, end,
- cattr_name(actual_type),
- cattr_name(parse->type));
- err = -EBUSY;
- break;
- }
- }
-
- if (err) {
- break;
+ } else if (start <= entry->start) { /* end > entry->start */
+ err = chk_conflict(new, entry, new_type);
+ if (!err) {
+ dprintk("Overlap at 0x%Lx-0x%Lx\n",
+ entry->start, entry->end);
+ where = entry->nd.prev;
+ cached_entry = list_entry(where,
+ struct memtype, nd);
}
-
- pr_debug("Overlap at 0x%Lx-0x%Lx\n",
- saved_ptr->start, saved_ptr->end);
- /* No conflict. Go ahead and add this new entry */
- list_add(&new_entry->nd, saved_ptr->nd.prev);
- new_entry = NULL;
break;
- }
-
- if (start < parse->end) {
- if (actual_type != parse->type && ret_type) {
- actual_type = parse->type;
- *ret_type = actual_type;
- new_entry->type = actual_type;
- }
-
- if (actual_type != parse->type) {
- printk(
- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid,
- start, end,
- cattr_name(actual_type),
- cattr_name(parse->type));
- err = -EBUSY;
- break;
- }
-
- saved_ptr = parse;
- /*
- * Check to see whether the request overlaps more
- * than one entry in the list
- */
- list_for_each_entry_continue(parse, &memtype_list, nd) {
- if (end <= parse->start) {
- break;
- }
-
- if (actual_type != parse->type) {
- printk(
- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid,
- start, end,
- cattr_name(actual_type),
- cattr_name(parse->type));
- err = -EBUSY;
- break;
+ } else if (start < entry->end) { /* start > entry->start */
+ err = chk_conflict(new, entry, new_type);
+ if (!err) {
+ dprintk("Overlap at 0x%Lx-0x%Lx\n",
+ entry->start, entry->end);
+ cached_entry = list_entry(entry->nd.prev,
+ struct memtype, nd);
+
+ /*
+ * Move to right position in the linked
+ * list to add this new entry
+ */
+ list_for_each_entry_continue(entry,
+ &memtype_list, nd) {
+ if (start <= entry->start) {
+ where = entry->nd.prev;
+ break;
+ }
}
}
-
- if (err) {
- break;
- }
-
- pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
- saved_ptr->start, saved_ptr->end);
- /* No conflict. Go ahead and add this new entry */
- list_add(&new_entry->nd, &saved_ptr->nd);
- new_entry = NULL;
break;
}
}
if (err) {
- printk(KERN_INFO
- "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
- start, end, cattr_name(new_entry->type),
- cattr_name(req_type));
- kfree(new_entry);
+ printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
+ "track %s, req %s\n",
+ start, end, cattr_name(new->type), cattr_name(req_type));
+ kfree(new);
spin_unlock(&memtype_lock);
return err;
}
- if (new_entry) {
- /* No conflict. Not yet added to the list. Add to the tail */
- list_add_tail(&new_entry->nd, &memtype_list);
- pr_debug("New Entry\n");
- }
+ cached_start = start;
- if (ret_type) {
- pr_debug(
- "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
- start, end, cattr_name(actual_type),
- cattr_name(req_type), cattr_name(*ret_type));
- } else {
- pr_debug(
- "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
- start, end, cattr_name(actual_type),
- cattr_name(req_type));
- }
+ if (where)
+ list_add(&new->nd, where);
+ else
+ list_add_tail(&new->nd, &memtype_list);
spin_unlock(&memtype_lock);
+
+ dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
+ start, end, cattr_name(new->type), cattr_name(req_type),
+ new_type ? cattr_name(*new_type) : "-");
+
return err;
}
int free_memtype(u64 start, u64 end)
{
- struct memtype *ml;
+ struct memtype *entry;
int err = -EINVAL;
- /* Only track when pat_wc_enabled */
- if (!pat_wc_enabled) {
+ if (!pat_enabled)
return 0;
- }
/* Low ISA region is always mapped WB. No need to track */
- if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
+ if (is_ISA_range(start, end - 1))
return 0;
- }
spin_lock(&memtype_lock);
- list_for_each_entry(ml, &memtype_list, nd) {
- if (ml->start == start && ml->end == end) {
- list_del(&ml->nd);
- kfree(ml);
+ list_for_each_entry(entry, &memtype_list, nd) {
+ if (entry->start == start && entry->end == end) {
+ if (cached_entry == entry || cached_start == start)
+ cached_entry = NULL;
+
+ list_del(&entry->nd);
+ kfree(entry);
err = 0;
break;
}
@@ -464,27 +397,19 @@ int free_memtype(u64 start, u64 end)
current->comm, current->pid, start, end);
}
- pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+ dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
return err;
}
-/*
- * /dev/mem mmap interface. The memtype used for mapping varies:
- * - Use UC for mappings with O_SYNC flag
- * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
- * inherit the memtype from existing mapping.
- * - Else use UC_MINUS memtype (for backward compatibility with existing
- * X drivers.
- */
pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
unsigned long size, pgprot_t vma_prot)
{
return vma_prot;
}
-#ifdef CONFIG_NONPROMISC_DEVMEM
-/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
+#ifdef CONFIG_STRICT_DEVMEM
+/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
static inline int range_is_allowed(unsigned long mfn, unsigned long size)
{
return 1;
@@ -508,20 +433,20 @@ static inline int range_is_allowed(unsig
}
return 1;
}
-#endif /* CONFIG_NONPROMISC_DEVMEM */
+#endif /* CONFIG_STRICT_DEVMEM */
int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
unsigned long size, pgprot_t *vma_prot)
{
u64 addr = (u64)mfn << PAGE_SHIFT;
- unsigned long flags = _PAGE_CACHE_UC_MINUS;
+ unsigned long flags = -1;
int retval;
if (!range_is_allowed(mfn, size))
return 0;
if (file->f_flags & O_SYNC) {
- flags = _PAGE_CACHE_UC;
+ flags = _PAGE_CACHE_UC_MINUS;
}
#ifndef CONFIG_X86_32
@@ -534,25 +459,26 @@ int phys_mem_access_prot_allowed(struct
* caching for the high addresses through the KEN pin, but
* we maintain the tradition of paranoia in this code.
*/
- if (!pat_wc_enabled &&
- ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
- test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
- test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
- test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
- (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
+ if (!pat_enabled &&
+ !(boot_cpu_has(X86_FEATURE_MTRR) ||
+ boot_cpu_has(X86_FEATURE_K6_MTRR) ||
+ boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
+ boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
+ (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
flags = _PAGE_CACHE_UC;
}
#endif
#endif
/*
- * With O_SYNC, we can only take UC mapping. Fail if we cannot.
+ * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
+ *
* Without O_SYNC, we want to get
* - WB for WB-able memory and no other conflicting mappings
* - UC_MINUS for non-WB-able memory with no other conflicting mappings
* - Inherit from confliting mappings otherwise
*/
- if (flags != _PAGE_CACHE_UC_MINUS) {
+ if (flags != -1) {
retval = reserve_memtype(addr, addr + size, flags, NULL);
} else {
retval = reserve_memtype(addr, addr + size, -1, &flags);
@@ -600,3 +526,88 @@ void unmap_devmem(unsigned long mfn, uns
free_memtype(addr, addr + size);
}
+#if defined(CONFIG_DEBUG_FS)
+
+/* get Nth element of the linked list */
+static struct memtype *memtype_get_idx(loff_t pos)
+{
+ struct memtype *list_node, *print_entry;
+ int i = 1;
+
+ print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!print_entry)
+ return NULL;
+
+ spin_lock(&memtype_lock);
+ list_for_each_entry(list_node, &memtype_list, nd) {
+ if (pos == i) {
+ *print_entry = *list_node;
+ spin_unlock(&memtype_lock);
+ return print_entry;
+ }
+ ++i;
+ }
+ spin_unlock(&memtype_lock);
+ kfree(print_entry);
+ return NULL;
+}
+
+static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos == 0) {
+ ++*pos;
+ seq_printf(seq, "PAT memtype list:\n");
+ }
+
+ return memtype_get_idx(*pos);
+}
+
+static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return memtype_get_idx(*pos);
+}
+
+static void memtype_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int memtype_seq_show(struct seq_file *seq, void *v)
+{
+ struct memtype *print_entry = (struct memtype *)v;
+
+ seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
+ print_entry->start, print_entry->end);
+ kfree(print_entry);
+ return 0;
+}
+
+static struct seq_operations memtype_seq_ops = {
+ .start = memtype_seq_start,
+ .next = memtype_seq_next,
+ .stop = memtype_seq_stop,
+ .show = memtype_seq_show,
+};
+
+static int memtype_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &memtype_seq_ops);
+}
+
+static const struct file_operations memtype_fops = {
+ .open = memtype_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init pat_memtype_list_init(void)
+{
+ debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
+ NULL, &memtype_fops);
+ return 0;
+}
+
+late_initcall(pat_memtype_list_init);
+
+#endif /* CONFIG_DEBUG_FS */
--- head-2011-03-11.orig/arch/x86/mm/pgtable-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/pgtable-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -4,6 +4,7 @@
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
+#include <asm/fixmap.h>
#include <asm/hypervisor.h>
#include <asm/mmu_context.h>
@@ -410,15 +411,9 @@ static inline void pgd_list_del(pgd_t *p
static void pgd_ctor(void *p)
{
pgd_t *pgd = p;
- unsigned long flags;
pgd_test_and_unpin(pgd);
- /* Clear usermode parts of PGD */
- memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
-
- spin_lock_irqsave(&pgd_lock, flags);
-
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
@@ -440,13 +435,9 @@ static void pgd_ctor(void *p)
__pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
#endif
-#ifndef CONFIG_X86_PAE
/* list required to sync kernel mapping updates */
if (!SHARED_KERNEL_PMD)
pgd_list_add(pgd);
-#endif
-
- spin_unlock_irqrestore(&pgd_lock, flags);
}
static void pgd_dtor(void *pgd)
@@ -475,33 +466,6 @@ static void pgd_dtor(void *pgd)
#ifdef CONFIG_X86_PAE
/*
- * Mop up any pmd pages which may still be attached to the pgd.
- * Normally they will be freed by munmap/exit_mmap, but any pmd we
- * preallocate which never got a corresponding vma will need to be
- * freed manually.
- */
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
- int i;
-
- for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
- pgd_t pgd = pgdp[i];
-
- if (__pgd_val(pgd) != 0) {
- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
-
- pgdp[i] = xen_make_pgd(0);
-
- paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
- pmd_free(mm, pmd);
- }
- }
-
- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
- xen_destroy_contiguous_region((unsigned long)pgdp, 0);
-}
-
-/*
* In PAE mode, we need to do a cr3 reload (=tlb flush) when
* updating the top-level pagetable entries to guarantee the
* processor notices the update. Since this is expensive, and
@@ -512,61 +476,7 @@ static void pgd_mop_up_pmds(struct mm_st
* not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
* and initialize the kernel pmds here.
*/
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
- pud_t *pud;
- pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
- unsigned long addr, flags;
- int i;
-
- /*
- * We can race save/restore (if we sleep during a GFP_KERNEL memory
- * allocation). We therefore store virtual addresses of pmds as they
- * do not change across save/restore, and poke the machine addresses
- * into the pgdir under the pgd_lock.
- */
- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
- pmds[i] = pmd_alloc_one(mm, addr);
- if (!pmds[i])
- goto out_oom;
- }
-
- spin_lock_irqsave(&pgd_lock, flags);
-
- /* Protect against save/restore: move below 4GB under pgd_lock. */
- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
- && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
- spin_unlock_irqrestore(&pgd_lock, flags);
-out_oom:
- while (i--)
- pmd_free(mm, pmds[i]);
- return 0;
- }
-
- /* Copy kernel pmd contents and write-protect the new pmds. */
- pud = pud_offset(pgd, 0);
- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
- i++, pud++, addr += PUD_SIZE) {
- if (i >= KERNEL_PGD_BOUNDARY) {
- memcpy(pmds[i],
- (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
- sizeof(pmd_t) * PTRS_PER_PMD);
- make_lowmem_page_readonly(
- pmds[i], XENFEAT_writable_page_tables);
- }
-
- /* It is safe to poke machine addresses of pmds under the pgd_lock. */
- pud_populate(mm, pud, pmds[i]);
- }
-
- /* List required to sync kernel mapping updates and
- * to pin/unpin on save/restore. */
- pgd_list_add(pgd);
-
- spin_unlock_irqrestore(&pgd_lock, flags);
-
- return 1;
-}
+#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
@@ -596,16 +506,101 @@ void pud_populate(struct mm_struct *mm,
xen_tlb_flush();
}
#else /* !CONFIG_X86_PAE */
+
/* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+#define PREALLOCATED_PMDS 0
+
+#endif /* CONFIG_X86_PAE */
+
+static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
{
- return 1;
+ int i;
+
+#ifdef CONFIG_X86_PAE
+ if (contig)
+ xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
+#endif
+
+ for(i = 0; i < PREALLOCATED_PMDS; i++)
+ if (pmds[i])
+ pmd_free(mm, pmds[i]);
}
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
+static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
{
+ int i;
+ bool failed = false;
+
+ for(i = 0; i < PREALLOCATED_PMDS; i++) {
+ pmd_t *pmd = pmd_alloc_one(mm, i << PUD_SHIFT);
+ if (pmd == NULL)
+ failed = true;
+ pmds[i] = pmd;
+ }
+
+ if (failed) {
+ free_pmds(pmds, mm, false);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+ int i;
+
+ for(i = 0; i < PREALLOCATED_PMDS; i++) {
+ pgd_t pgd = pgdp[i];
+
+ if (__pgd_val(pgd) != 0) {
+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+ pgdp[i] = xen_make_pgd(0);
+
+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+ pmd_free(mm, pmd);
+ }
+ }
+
+#ifdef CONFIG_X86_PAE
+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
+ xen_destroy_contiguous_region((unsigned long)pgdp, 0);
+#endif
+}
+
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
+{
+ pud_t *pud;
+ unsigned long addr;
+ int i;
+
+ if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
+ return;
+
+ pud = pud_offset(pgd, 0);
+ for (addr = i = 0; i < PREALLOCATED_PMDS;
+ i++, pud++, addr += PUD_SIZE) {
+ pmd_t *pmd = pmds[i];
+
+ if (i >= KERNEL_PGD_BOUNDARY) {
+ memcpy(pmd,
+ (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+ make_lowmem_page_readonly(
+ pmd, XENFEAT_writable_page_tables);
+ }
+
+ /* It is safe to poke machine addresses of pmds under the pgd_lock. */
+ pud_populate(mm, pud, pmd);
+ }
}
-#endif /* CONFIG_X86_PAE */
#ifdef CONFIG_X86_64
/* We allocate two contiguous pages for kernel and user. */
@@ -616,22 +611,55 @@ static void pgd_mop_up_pmds(struct mm_st
pgd_t *pgd_alloc(struct mm_struct *mm)
{
- pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
+ pgd_t *pgd;
+ pmd_t *pmds[PREALLOCATED_PMDS];
+ unsigned long flags;
+
+ pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
+
+ if (pgd == NULL)
+ goto out;
- /* so that alloc_pd can use it */
mm->pgd = pgd;
- if (pgd) {
- /* Store a back link for vmalloc_sync_all(). */
- set_page_private(virt_to_page(pgd), (unsigned long)mm);
- pgd_ctor(pgd);
- }
- if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
- free_pages((unsigned long)pgd, PGD_ORDER);
- pgd = NULL;
+ if (preallocate_pmds(pmds, mm) != 0)
+ goto out_free_pgd;
+
+ if (paravirt_pgd_alloc(mm) != 0)
+ goto out_free_pmds;
+
+ /*
+ * Make sure that pre-populating the pmds is atomic with
+ * respect to anything walking the pgd_list, so that they
+ * never see a partially populated pgd.
+ */
+ spin_lock_irqsave(&pgd_lock, flags);
+
+#ifdef CONFIG_X86_PAE
+ /* Protect against save/restore: move below 4GB under pgd_lock. */
+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
+ && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ goto out_free_pmds;
}
+#endif
+
+ pgd_ctor(pgd);
+ pgd_prepopulate_pmd(mm, pgd, pmds);
+
+ /* Store a back link for vmalloc_sync_all(). */
+ set_page_private(virt_to_page(pgd), (unsigned long)mm);
+
+ spin_unlock_irqrestore(&pgd_lock, flags);
return pgd;
+
+out_free_pmds:
+ free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
+out_free_pgd:
+ free_pages((unsigned long)pgd, PGD_ORDER);
+out:
+ return NULL;
}
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
@@ -647,6 +675,7 @@ void pgd_free(struct mm_struct *mm, pgd_
pgd_dtor(pgd);
pgd_mop_up_pmds(mm, pgd);
+ paravirt_pgd_free(mm, pgd);
free_pages((unsigned long)pgd, PGD_ORDER);
}
@@ -689,7 +718,7 @@ int ptep_test_and_clear_young(struct vm_
if (pte_young(*ptep))
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
- &ptep->pte);
+ (unsigned long *) &ptep->pte);
if (ret)
pte_update(vma->vm_mm, addr, ptep);
@@ -711,3 +740,42 @@ int ptep_clear_flush_young(struct vm_are
return young;
}
+
+int fixmaps_set;
+
+void xen_set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
+{
+ unsigned long address = __fix_to_virt(idx);
+ pte_t pte;
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+
+ switch (idx) {
+#ifdef CONFIG_X86_64
+ extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
+
+ case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+ pte = pfn_pte(phys >> PAGE_SHIFT, flags);
+ set_pte_vaddr_pud(level3_user_pgt, address, pte);
+ break;
+ case FIX_EARLYCON_MEM_BASE:
+ xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
+ pfn_pte_ma(phys >> PAGE_SHIFT, flags));
+ fixmaps_set++;
+ return;
+#else
+ case FIX_WP_TEST:
+ case FIX_VDSO:
+ pte = pfn_pte(phys >> PAGE_SHIFT, flags);
+ break;
+#endif
+ default:
+ pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
+ break;
+ }
+ set_pte_vaddr(address, pte);
+ fixmaps_set++;
+}
--- head-2011-03-11.orig/arch/x86/mm/pgtable_32-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/pgtable_32-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -25,51 +25,49 @@
#include <xen/features.h>
#include <asm/hypervisor.h>
-void show_mem(void)
+/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ */
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
- int total = 0, reserved = 0;
- int shared = 0, cached = 0;
- int highmem = 0;
- struct page *page;
- pg_data_t *pgdat;
- unsigned long i;
- unsigned long flags;
-
- printk(KERN_INFO "Mem-info:\n");
- show_free_areas();
- for_each_online_pgdat(pgdat) {
- pgdat_resize_lock(pgdat, &flags);
- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
- touch_nmi_watchdog();
- page = pgdat_page_nr(pgdat, i);
- total++;
- if (PageHighMem(page))
- highmem++;
- if (PageReserved(page))
- reserved++;
- else if (PageSwapCache(page))
- cached++;
- else if (page_count(page))
- shared += page_count(page) - 1;
- }
- pgdat_resize_unlock(pgdat, &flags);
- }
- printk(KERN_INFO "%d pages of RAM\n", total);
- printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
- printk(KERN_INFO "%d reserved pages\n", reserved);
- printk(KERN_INFO "%d pages shared\n", shared);
- printk(KERN_INFO "%d pages swap cached\n", cached);
-
- printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
- printk(KERN_INFO "%lu pages writeback\n",
- global_page_state(NR_WRITEBACK));
- printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
- printk(KERN_INFO "%lu pages slab\n",
- global_page_state(NR_SLAB_RECLAIMABLE) +
- global_page_state(NR_SLAB_UNRECLAIMABLE));
- printk(KERN_INFO "%lu pages pagetables\n",
- global_page_state(NR_PAGETABLE));
+#ifndef CONFIG_XEN
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ BUG();
+ return;
+ }
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ BUG();
+ return;
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ BUG();
+ return;
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+ if (pte_val(pteval))
+ set_pte_present(&init_mm, vaddr, pte, pteval);
+ else
+ pte_clear(&init_mm, vaddr, pte);
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+#else
+ if (HYPERVISOR_update_va_mapping(vaddr, pteval,
+ UVMF_INVLPG|UVMF_ALL))
+ BUG();
+#endif
}
/*
@@ -107,35 +105,10 @@ void set_pmd_pfn(unsigned long vaddr, un
__flush_tlb_one(vaddr);
}
-static int fixmaps;
unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
EXPORT_SYMBOL(__FIXADDR_TOP);
-void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
-{
- unsigned long address = __fix_to_virt(idx);
- pte_t pte;
-
- if (idx >= __end_of_fixed_addresses) {
- BUG();
- return;
- }
- switch (idx) {
- case FIX_WP_TEST:
- case FIX_VDSO:
- pte = pfn_pte(phys >> PAGE_SHIFT, flags);
- break;
- default:
- pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
- break;
- }
- if (HYPERVISOR_update_va_mapping(address, pte,
- UVMF_INVLPG|UVMF_ALL))
- BUG();
- fixmaps++;
-}
-
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
@@ -145,13 +118,48 @@ void __set_fixmap (enum fixed_addresses
*/
void __init reserve_top_address(unsigned long reserve)
{
- BUG_ON(fixmaps > 0);
+ BUG_ON(fixmaps_set > 0);
printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
(int)-reserve);
__FIXADDR_TOP = -reserve - PAGE_SIZE;
__VMALLOC_RESERVE += reserve;
}
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ __VMALLOC_RESERVE = memparse(arg, &arg);
+ return 0;
+}
+early_param("vmalloc", parse_vmalloc);
+
+#ifndef CONFIG_XEN
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+ unsigned long address;
+
+ if (!arg)
+ return -EINVAL;
+
+ address = memparse(arg, &arg);
+ reserve_top_address(address);
+ return 0;
+}
+early_param("reservetop", parse_reservetop);
+#endif
+
void make_lowmem_page_readonly(void *va, unsigned int feature)
{
pte_t *pte;
--- head-2011-03-11.orig/arch/x86/pci/amd_bus.c 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/pci/amd_bus.c 2011-02-01 14:38:38.000000000 +0100
@@ -350,6 +350,7 @@ static int __init early_fill_mp_bus_info
#define ENABLE_CF8_EXT_CFG (1ULL << 46)
+#ifndef CONFIG_XEN
static void enable_pci_io_ecs(void *unused)
{
u64 reg;
@@ -378,6 +379,7 @@ static int __cpuinit amd_cpu_notify(stru
static struct notifier_block __cpuinitdata amd_cpu_notifier = {
.notifier_call = amd_cpu_notify,
};
+#endif /* CONFIG_XEN */
static void __init pci_enable_pci_io_ecs(void)
{
@@ -419,10 +421,19 @@ static int __init pci_io_ecs_init(void)
if (early_pci_allowed())
pci_enable_pci_io_ecs();
+#ifndef CONFIG_XEN
register_cpu_notifier(&amd_cpu_notifier);
for_each_online_cpu(cpu)
amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
(void *)(long)cpu);
+#else
+ if (cpu = 1, cpu) {
+ u64 reg;
+ rdmsrl(MSR_AMD64_NB_CFG, reg);
+ if (!(reg & ENABLE_CF8_EXT_CFG))
+ return 0;
+ }
+#endif
pci_probe |= PCI_HAS_IO_ECS;
return 0;
@@ -430,6 +441,10 @@ static int __init pci_io_ecs_init(void)
static int __init amd_postcore_init(void)
{
+#ifdef CONFIG_XEN
+ if (!is_initial_xendomain())
+ return 0;
+#endif
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return 0;
--- head-2011-03-11.orig/arch/x86/pci/irq-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/pci/irq-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -11,8 +11,8 @@
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/dmi.h>
-#include <asm/io.h>
-#include <asm/smp.h>
+#include <linux/io.h>
+#include <linux/smp.h>
#include <asm/io_apic.h>
#include <linux/irq.h>
#include <linux/acpi.h>
@@ -45,7 +45,8 @@ struct irq_router {
char *name;
u16 vendor, device;
int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
- int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
+ int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
+ int new);
};
struct irq_router_handler {
@@ -61,7 +62,7 @@ void (*pcibios_disable_irq)(struct pci_d
* and perform checksum verification.
*/
-static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
+static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
{
struct irq_routing_table *rt;
int i;
@@ -74,10 +75,11 @@ static inline struct irq_routing_table *
rt->size < sizeof(struct irq_routing_table))
return NULL;
sum = 0;
- for (i=0; i < rt->size; i++)
+ for (i = 0; i < rt->size; i++)
sum += addr[i];
if (!sum) {
- DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
+ DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
+ rt);
return rt;
}
return NULL;
@@ -104,7 +106,9 @@ static struct irq_routing_table * __init
return rt;
printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
}
- for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
+ for (addr = (u8 *) isa_bus_to_virt(0xf0000);
+ addr < (u8 *) isa_bus_to_virt(0x100000);
+ addr += 16) {
rt = pirq_check_routing_table(addr);
if (rt)
return rt;
@@ -126,20 +130,20 @@ static void __init pirq_peer_trick(void)
struct irq_info *e;
memset(busmap, 0, sizeof(busmap));
- for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
+ for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
e = &rt->slots[i];
#ifdef DEBUG
{
int j;
DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
- for(j=0; j<4; j++)
+ for (j = 0; j < 4; j++)
DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
DBG("\n");
}
#endif
busmap[e->bus] = 1;
}
- for(i = 1; i < 256; i++) {
+ for (i = 1; i < 256; i++) {
int node;
if (!busmap[i] || pci_find_bus(0, i))
continue;
@@ -187,7 +191,8 @@ static unsigned int read_config_nybble(s
return (nr & 1) ? (x >> 4) : (x & 0xf);
}
-static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
+static void write_config_nybble(struct pci_dev *router, unsigned offset,
+ unsigned nr, unsigned int val)
{
u8 x;
unsigned reg = offset + (nr >> 1);
@@ -289,7 +294,7 @@ static int pirq_ite_get(struct pci_dev *
static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
WARN_ON_ONCE(pirq > 4);
- return read_config_nybble(router,0x43, pirqmap[pirq-1]);
+ return read_config_nybble(router, 0x43, pirqmap[pirq-1]);
}
static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
@@ -318,7 +323,7 @@ static int pirq_opti_set(struct pci_dev
/*
* Cyrix: nibble offset 0x5C
- * 0x5C bits 7:4 is INTB bits 3:0 is INTA
+ * 0x5C bits 7:4 is INTB bits 3:0 is INTA
* 0x5D bits 7:4 is INTD bits 3:0 is INTC
*/
static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
@@ -354,7 +359,7 @@ static int pirq_cyrix_set(struct pci_dev
* Apparently there are systems implementing PCI routing table using
* link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
* We try our best to handle both link mappings.
- *
+ *
* Currently (2003-05-21) it appears most SiS chipsets follow the
* definition of routing registers from the SiS-5595 southbridge.
* According to the SiS 5595 datasheets the revision id's of the
@@ -374,7 +379,7 @@ static int pirq_cyrix_set(struct pci_dev
*
* 0x62: USBIRQ:
* bit 6 OHCI function disabled (0), enabled (1)
- *
+ *
* 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
*
* 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
@@ -437,7 +442,7 @@ static int pirq_vlsi_get(struct pci_dev
{
WARN_ON_ONCE(pirq >= 9);
if (pirq > 8) {
- printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
+ dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
return 0;
}
return read_config_nybble(router, 0x74, pirq-1);
@@ -447,7 +452,7 @@ static int pirq_vlsi_set(struct pci_dev
{
WARN_ON_ONCE(pirq >= 9);
if (pirq > 8) {
- printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
+ dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
return 0;
}
write_config_nybble(router, 0x74, pirq-1, irq);
@@ -471,7 +476,8 @@ static int pirq_serverworks_get(struct p
return inb(0xc01) & 0xf;
}
-static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
+ int pirq, int irq)
{
outb(pirq, 0xc00);
outb(irq, 0xc01);
@@ -491,22 +497,20 @@ static int pirq_amd756_get(struct pci_de
u8 irq;
irq = 0;
if (pirq <= 4)
- {
irq = read_config_nybble(router, 0x56, pirq - 1);
- }
- printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
- dev->vendor, dev->device, pirq, irq);
+ dev_info(&dev->dev,
+ "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
+ dev->vendor, dev->device, pirq, irq);
return irq;
}
static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
{
- printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
- dev->vendor, dev->device, pirq, irq);
+ dev_info(&dev->dev,
+ "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
+ dev->vendor, dev->device, pirq, irq);
if (pirq <= 4)
- {
write_config_nybble(router, 0x56, pirq - 1, irq);
- }
return 1;
}
@@ -553,50 +557,51 @@ static __init int intel_router_probe(str
if (pci_dev_present(pirq_440gx))
return 0;
- switch(device)
- {
- case PCI_DEVICE_ID_INTEL_82371FB_0:
- case PCI_DEVICE_ID_INTEL_82371SB_0:
- case PCI_DEVICE_ID_INTEL_82371AB_0:
- case PCI_DEVICE_ID_INTEL_82371MX:
- case PCI_DEVICE_ID_INTEL_82443MX_0:
- case PCI_DEVICE_ID_INTEL_82801AA_0:
- case PCI_DEVICE_ID_INTEL_82801AB_0:
- case PCI_DEVICE_ID_INTEL_82801BA_0:
- case PCI_DEVICE_ID_INTEL_82801BA_10:
- case PCI_DEVICE_ID_INTEL_82801CA_0:
- case PCI_DEVICE_ID_INTEL_82801CA_12:
- case PCI_DEVICE_ID_INTEL_82801DB_0:
- case PCI_DEVICE_ID_INTEL_82801E_0:
- case PCI_DEVICE_ID_INTEL_82801EB_0:
- case PCI_DEVICE_ID_INTEL_ESB_1:
- case PCI_DEVICE_ID_INTEL_ICH6_0:
- case PCI_DEVICE_ID_INTEL_ICH6_1:
- case PCI_DEVICE_ID_INTEL_ICH7_0:
- case PCI_DEVICE_ID_INTEL_ICH7_1:
- case PCI_DEVICE_ID_INTEL_ICH7_30:
- case PCI_DEVICE_ID_INTEL_ICH7_31:
- case PCI_DEVICE_ID_INTEL_ESB2_0:
- case PCI_DEVICE_ID_INTEL_ICH8_0:
- case PCI_DEVICE_ID_INTEL_ICH8_1:
- case PCI_DEVICE_ID_INTEL_ICH8_2:
- case PCI_DEVICE_ID_INTEL_ICH8_3:
- case PCI_DEVICE_ID_INTEL_ICH8_4:
- case PCI_DEVICE_ID_INTEL_ICH9_0:
- case PCI_DEVICE_ID_INTEL_ICH9_1:
- case PCI_DEVICE_ID_INTEL_ICH9_2:
- case PCI_DEVICE_ID_INTEL_ICH9_3:
- case PCI_DEVICE_ID_INTEL_ICH9_4:
- case PCI_DEVICE_ID_INTEL_ICH9_5:
- case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
- case PCI_DEVICE_ID_INTEL_ICH10_0:
- case PCI_DEVICE_ID_INTEL_ICH10_1:
- case PCI_DEVICE_ID_INTEL_ICH10_2:
- case PCI_DEVICE_ID_INTEL_ICH10_3:
- r->name = "PIIX/ICH";
- r->get = pirq_piix_get;
- r->set = pirq_piix_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_INTEL_82371FB_0:
+ case PCI_DEVICE_ID_INTEL_82371SB_0:
+ case PCI_DEVICE_ID_INTEL_82371AB_0:
+ case PCI_DEVICE_ID_INTEL_82371MX:
+ case PCI_DEVICE_ID_INTEL_82443MX_0:
+ case PCI_DEVICE_ID_INTEL_82801AA_0:
+ case PCI_DEVICE_ID_INTEL_82801AB_0:
+ case PCI_DEVICE_ID_INTEL_82801BA_0:
+ case PCI_DEVICE_ID_INTEL_82801BA_10:
+ case PCI_DEVICE_ID_INTEL_82801CA_0:
+ case PCI_DEVICE_ID_INTEL_82801CA_12:
+ case PCI_DEVICE_ID_INTEL_82801DB_0:
+ case PCI_DEVICE_ID_INTEL_82801E_0:
+ case PCI_DEVICE_ID_INTEL_82801EB_0:
+ case PCI_DEVICE_ID_INTEL_ESB_1:
+ case PCI_DEVICE_ID_INTEL_ICH6_0:
+ case PCI_DEVICE_ID_INTEL_ICH6_1:
+ case PCI_DEVICE_ID_INTEL_ICH7_0:
+ case PCI_DEVICE_ID_INTEL_ICH7_1:
+ case PCI_DEVICE_ID_INTEL_ICH7_30:
+ case PCI_DEVICE_ID_INTEL_ICH7_31:
+ case PCI_DEVICE_ID_INTEL_ESB2_0:
+ case PCI_DEVICE_ID_INTEL_ICH8_0:
+ case PCI_DEVICE_ID_INTEL_ICH8_1:
+ case PCI_DEVICE_ID_INTEL_ICH8_2:
+ case PCI_DEVICE_ID_INTEL_ICH8_3:
+ case PCI_DEVICE_ID_INTEL_ICH8_4:
+ case PCI_DEVICE_ID_INTEL_ICH9_0:
+ case PCI_DEVICE_ID_INTEL_ICH9_1:
+ case PCI_DEVICE_ID_INTEL_ICH9_2:
+ case PCI_DEVICE_ID_INTEL_ICH9_3:
+ case PCI_DEVICE_ID_INTEL_ICH9_4:
+ case PCI_DEVICE_ID_INTEL_ICH9_5:
+ case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
+ case PCI_DEVICE_ID_INTEL_ICH10_0:
+ case PCI_DEVICE_ID_INTEL_ICH10_1:
+ case PCI_DEVICE_ID_INTEL_ICH10_2:
+ case PCI_DEVICE_ID_INTEL_ICH10_3:
+ case PCI_DEVICE_ID_INTEL_PCH_0:
+ case PCI_DEVICE_ID_INTEL_PCH_1:
+ r->name = "PIIX/ICH";
+ r->get = pirq_piix_get;
+ r->set = pirq_piix_set;
+ return 1;
}
return 0;
}
@@ -610,7 +615,7 @@ static __init int via_router_probe(struc
* workarounds for some buggy BIOSes
*/
if (device == PCI_DEVICE_ID_VIA_82C586_0) {
- switch(router->device) {
+ switch (router->device) {
case PCI_DEVICE_ID_VIA_82C686:
/*
* Asus k7m bios wrongly reports 82C686A
@@ -635,7 +640,7 @@ static __init int via_router_probe(struc
}
}
- switch(device) {
+ switch (device) {
case PCI_DEVICE_ID_VIA_82C586_0:
r->name = "VIA";
r->get = pirq_via586_get;
@@ -658,28 +663,27 @@ static __init int via_router_probe(struc
static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- switch(device)
- {
- case PCI_DEVICE_ID_VLSI_82C534:
- r->name = "VLSI 82C534";
- r->get = pirq_vlsi_get;
- r->set = pirq_vlsi_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_VLSI_82C534:
+ r->name = "VLSI 82C534";
+ r->get = pirq_vlsi_get;
+ r->set = pirq_vlsi_set;
+ return 1;
}
return 0;
}
-static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+static __init int serverworks_router_probe(struct irq_router *r,
+ struct pci_dev *router, u16 device)
{
- switch(device)
- {
- case PCI_DEVICE_ID_SERVERWORKS_OSB4:
- case PCI_DEVICE_ID_SERVERWORKS_CSB5:
- r->name = "ServerWorks";
- r->get = pirq_serverworks_get;
- r->set = pirq_serverworks_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_SERVERWORKS_OSB4:
+ case PCI_DEVICE_ID_SERVERWORKS_CSB5:
+ r->name = "ServerWorks";
+ r->get = pirq_serverworks_get;
+ r->set = pirq_serverworks_set;
+ return 1;
}
return 0;
}
@@ -688,7 +692,7 @@ static __init int sis_router_probe(struc
{
if (device != PCI_DEVICE_ID_SI_503)
return 0;
-
+
r->name = "SIS";
r->get = pirq_sis_get;
r->set = pirq_sis_set;
@@ -697,50 +701,45 @@ static __init int sis_router_probe(struc
static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- switch(device)
- {
- case PCI_DEVICE_ID_CYRIX_5520:
- r->name = "NatSemi";
- r->get = pirq_cyrix_get;
- r->set = pirq_cyrix_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_CYRIX_5520:
+ r->name = "NatSemi";
+ r->get = pirq_cyrix_get;
+ r->set = pirq_cyrix_set;
+ return 1;
}
return 0;
}
static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- switch(device)
- {
- case PCI_DEVICE_ID_OPTI_82C700:
- r->name = "OPTI";
- r->get = pirq_opti_get;
- r->set = pirq_opti_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_OPTI_82C700:
+ r->name = "OPTI";
+ r->get = pirq_opti_get;
+ r->set = pirq_opti_set;
+ return 1;
}
return 0;
}
static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- switch(device)
- {
- case PCI_DEVICE_ID_ITE_IT8330G_0:
- r->name = "ITE";
- r->get = pirq_ite_get;
- r->set = pirq_ite_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_ITE_IT8330G_0:
+ r->name = "ITE";
+ r->get = pirq_ite_get;
+ r->set = pirq_ite_set;
+ return 1;
}
return 0;
}
static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- switch(device)
- {
+ switch (device) {
case PCI_DEVICE_ID_AL_M1533:
case PCI_DEVICE_ID_AL_M1563:
- printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
r->name = "ALI";
r->get = pirq_ali_get;
r->set = pirq_ali_set;
@@ -751,25 +750,24 @@ static __init int ali_router_probe(struc
static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- switch(device)
- {
- case PCI_DEVICE_ID_AMD_VIPER_740B:
- r->name = "AMD756";
- break;
- case PCI_DEVICE_ID_AMD_VIPER_7413:
- r->name = "AMD766";
- break;
- case PCI_DEVICE_ID_AMD_VIPER_7443:
- r->name = "AMD768";
- break;
- default:
- return 0;
+ switch (device) {
+ case PCI_DEVICE_ID_AMD_VIPER_740B:
+ r->name = "AMD756";
+ break;
+ case PCI_DEVICE_ID_AMD_VIPER_7413:
+ r->name = "AMD766";
+ break;
+ case PCI_DEVICE_ID_AMD_VIPER_7443:
+ r->name = "AMD768";
+ break;
+ default:
+ return 0;
}
r->get = pirq_amd756_get;
r->set = pirq_amd756_set;
return 1;
}
-
+
static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
switch (device) {
@@ -811,7 +809,7 @@ static struct pci_dev *pirq_router_dev;
* FIXME: should we have an option to say "generic for
* chipset" ?
*/
-
+
static void __init pirq_find_router(struct irq_router *r)
{
struct irq_routing_table *rt = pirq_table;
@@ -830,7 +828,7 @@ static void __init pirq_find_router(stru
r->name = "default";
r->get = NULL;
r->set = NULL;
-
+
DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
rt->rtr_vendor, rt->rtr_device);
@@ -841,19 +839,19 @@ static void __init pirq_find_router(stru
return;
}
- for( h = pirq_routers; h->vendor; h++) {
+ for (h = pirq_routers; h->vendor; h++) {
/* First look for a router match */
- if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
+ if (rt->rtr_vendor == h->vendor &&
+ h->probe(r, pirq_router_dev, rt->rtr_device))
break;
/* Fall back to a device match */
- if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
+ if (pirq_router_dev->vendor == h->vendor &&
+ h->probe(r, pirq_router_dev, pirq_router_dev->device))
break;
}
- printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
- pirq_router.name,
- pirq_router_dev->vendor,
- pirq_router_dev->device,
- pci_name(pirq_router_dev));
+ dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
+ pirq_router.name,
+ pirq_router_dev->vendor, pirq_router_dev->device);
/* The device remains referenced for the kernel lifetime */
}
@@ -861,11 +859,13 @@ static void __init pirq_find_router(stru
static struct irq_info *pirq_get_info(struct pci_dev *dev)
{
struct irq_routing_table *rt = pirq_table;
- int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
+ int entries = (rt->size - sizeof(struct irq_routing_table)) /
+ sizeof(struct irq_info);
struct irq_info *info;
for (info = rt->slots; entries--; info++)
- if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
+ if (info->bus == dev->bus->number &&
+ PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
return info;
return NULL;
}
@@ -884,7 +884,7 @@ static int pcibios_lookup_irq(struct pci
/* Find IRQ pin */
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (!pin) {
- DBG(KERN_DEBUG " -> no interrupt pin\n");
+ dev_dbg(&dev->dev, "no interrupt pin\n");
return 0;
}
pin = pin - 1;
@@ -893,20 +893,21 @@ static int pcibios_lookup_irq(struct pci
if (!pirq_table)
return 0;
-
- DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
+
info = pirq_get_info(dev);
if (!info) {
- DBG(" -> not found in routing table\n" KERN_DEBUG);
+ dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
+ 'A' + pin);
return 0;
}
pirq = info->irq[pin].link;
mask = info->irq[pin].bitmap;
if (!pirq) {
- DBG(" -> not routed\n" KERN_DEBUG);
+ dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
return 0;
}
- DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
+ dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
+ 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
mask &= pcibios_irq_mask;
/* Work around broken HP Pavilion Notebooks which assign USB to
@@ -919,7 +920,8 @@ static int pcibios_lookup_irq(struct pci
}
/* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
- if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
+ if (acer_tm360_irqrouting && dev->irq == 11 &&
+ dev->vendor == PCI_VENDOR_ID_O2) {
pirq = 0x68;
mask = 0x400;
dev->irq = r->get(pirq_router_dev, dev, pirq);
@@ -932,51 +934,50 @@ static int pcibios_lookup_irq(struct pci
*/
newirq = dev->irq;
if (newirq && !((1 << newirq) & mask)) {
- if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
- else printk("\n" KERN_WARNING
- "PCI: IRQ %i for device %s doesn't match PIRQ mask "
- "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
- pci_name(dev));
+ if (pci_probe & PCI_USE_PIRQ_MASK)
+ newirq = 0;
+ else
+ dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
+ "%#x; try pci=usepirqmask\n", newirq, mask);
}
if (!newirq && assign) {
for (i = 0; i < 16; i++) {
if (!(mask & (1 << i)))
continue;
- if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
+ if (pirq_penalty[i] < pirq_penalty[newirq] &&
+ can_request_irq(i, IRQF_SHARED))
newirq = i;
}
}
- DBG(" -> newirq=%d", newirq);
+ dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
/* Check if it is hardcoded */
if ((pirq & 0xf0) == 0xf0) {
irq = pirq & 0xf;
- DBG(" -> hardcoded IRQ %d\n", irq);
- msg = "Hardcoded";
- } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
- ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
- DBG(" -> got IRQ %d\n", irq);
- msg = "Found";
+ msg = "hardcoded";
+ } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
+ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
+ msg = "found";
eisa_set_level_irq(irq);
- } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
- DBG(" -> assigning IRQ %d", newirq);
+ } else if (newirq && r->set &&
+ (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
if (r->set(pirq_router_dev, dev, pirq, newirq)) {
eisa_set_level_irq(newirq);
- DBG(" ... OK\n");
- msg = "Assigned";
+ msg = "assigned";
irq = newirq;
}
}
if (!irq) {
- DBG(" ... failed\n");
if (newirq && mask == (1 << newirq)) {
- msg = "Guessed";
+ msg = "guessed";
irq = newirq;
- } else
+ } else {
+ dev_dbg(&dev->dev, "can't route interrupt\n");
return 0;
+ }
}
- printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
+ dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
/* Update IRQ for all devices with the same pirq value */
while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
@@ -988,20 +989,25 @@ static int pcibios_lookup_irq(struct pci
if (!info)
continue;
if (info->irq[pin].link == pirq) {
- /* We refuse to override the dev->irq information. Give a warning! */
- if ( dev2->irq && dev2->irq != irq && \
+ /*
+ * We refuse to override the dev->irq
+ * information. Give a warning!
+ */
+ if (dev2->irq && dev2->irq != irq && \
(!(pci_probe & PCI_USE_PIRQ_MASK) || \
- ((1 << dev2->irq) & mask)) ) {
+ ((1 << dev2->irq) & mask))) {
#ifndef CONFIG_PCI_MSI
- printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
- pci_name(dev2), dev2->irq, irq);
+ dev_info(&dev2->dev, "IRQ routing conflict: "
+ "have IRQ %d, want IRQ %d\n",
+ dev2->irq, irq);
#endif
- continue;
- }
+ continue;
+ }
dev2->irq = irq;
pirq_penalty[irq]++;
if (dev != dev2)
- printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
+ dev_info(&dev->dev, "sharing IRQ %d with %s\n",
+ irq, pci_name(dev2));
}
}
return 1;
@@ -1015,15 +1021,20 @@ static void __init pcibios_fixup_irqs(vo
DBG(KERN_DEBUG "PCI: IRQ fixup\n");
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
/*
- * If the BIOS has set an out of range IRQ number, just ignore it.
- * Also keep track of which IRQ's are already in use.
+ * If the BIOS has set an out of range IRQ number, just
+ * ignore it. Also keep track of which IRQ's are
+ * already in use.
*/
if (dev->irq >= 16) {
- DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
+ dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
dev->irq = 0;
}
- /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
- if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
+ /*
+ * If the IRQ is already assigned to a PCI device,
+ * ignore its ISA use penalty
+ */
+ if (pirq_penalty[dev->irq] >= 100 &&
+ pirq_penalty[dev->irq] < 100000)
pirq_penalty[dev->irq] = 0;
pirq_penalty[dev->irq]++;
}
@@ -1035,13 +1046,17 @@ static void __init pcibios_fixup_irqs(vo
/*
* Recalculate IRQ numbers if we use the I/O APIC.
*/
- if (io_apic_assign_pci_irqs)
- {
+ if (io_apic_assign_pci_irqs) {
int irq;
if (pin) {
- pin--; /* interrupt pins are numbered starting from 1 */
- irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
+ /*
+ * interrupt pins are numbered starting
+ * from 1
+ */
+ pin--;
+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
+ PCI_SLOT(dev->devfn), pin);
/*
* Busses behind bridges are typically not listed in the MP-table.
* In this case we have to look up the IRQ based on the parent bus,
@@ -1049,18 +1064,18 @@ static void __init pcibios_fixup_irqs(vo
* busses itself so we should get into this branch reliably.
*/
if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
- struct pci_dev * bridge = dev->bus->self;
+ struct pci_dev *bridge = dev->bus->self;
pin = (pin + PCI_SLOT(dev->devfn)) % 4;
- irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
PCI_SLOT(bridge->devfn), pin);
if (irq >= 0)
- printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
- pci_name(bridge), 'A' + pin, irq);
+ dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
+ pci_name(bridge),
+ 'A' + pin, irq);
}
if (irq >= 0) {
- printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
- pci_name(dev), 'A' + pin, irq);
+ dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
dev->irq = irq;
}
}
@@ -1082,7 +1097,8 @@ static int __init fix_broken_hp_bios_irq
{
if (!broken_hp_bios_irq9) {
broken_hp_bios_irq9 = 1;
- printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
+ d->ident);
}
return 0;
}
@@ -1095,7 +1111,8 @@ static int __init fix_acer_tm360_irqrout
{
if (!acer_tm360_irqrouting) {
acer_tm360_irqrouting = 1;
- printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
+ d->ident);
}
return 0;
}
@@ -1107,7 +1124,8 @@ static struct dmi_system_id __initdata p
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
- DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
+ DMI_MATCH(DMI_PRODUCT_VERSION,
+ "HP Pavilion Notebook Model GE"),
DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
},
},
@@ -1122,7 +1140,7 @@ static struct dmi_system_id __initdata p
{ }
};
-static int __init pcibios_irq_init(void)
+int __init pcibios_irq_init(void)
{
DBG(KERN_DEBUG "PCI: IRQ init\n");
@@ -1142,11 +1160,14 @@ static int __init pcibios_irq_init(void)
pirq_find_router(&pirq_router);
if (pirq_table->exclusive_irqs) {
int i;
- for (i=0; i<16; i++)
+ for (i = 0; i < 16; i++)
if (!(pirq_table->exclusive_irqs & (1 << i)))
pirq_penalty[i] += 100;
}
- /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
+ /*
+ * If we're using the I/O APIC, avoid using the PCI IRQ
+ * routing table
+ */
if (io_apic_assign_pci_irqs)
pirq_table = NULL;
}
@@ -1157,9 +1178,6 @@ static int __init pcibios_irq_init(void)
return 0;
}
-subsys_initcall(pcibios_irq_init);
-
-
static void pirq_penalize_isa_irq(int irq, int active)
{
/*
@@ -1193,7 +1211,7 @@ static int pirq_enable_irq(struct pci_de
if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
char *msg = "";
- pin--; /* interrupt pins are numbered starting from 1 */
+ pin--; /* interrupt pins are numbered starting from 1 */
if (io_apic_assign_pci_irqs) {
int irq;
@@ -1207,35 +1225,41 @@ static int pirq_enable_irq(struct pci_de
*/
temp_dev = dev;
while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
- struct pci_dev * bridge = dev->bus->self;
+ struct pci_dev *bridge = dev->bus->self;
pin = (pin + PCI_SLOT(dev->devfn)) % 4;
- irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
PCI_SLOT(bridge->devfn), pin);
if (irq >= 0)
- printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
- pci_name(bridge), 'A' + pin, irq);
+ dev_warn(&dev->dev, "using bridge %s "
+ "INT %c to get IRQ %d\n",
+ pci_name(bridge), 'A' + pin,
+ irq);
dev = bridge;
}
dev = temp_dev;
if (irq >= 0) {
- printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
- pci_name(dev), 'A' + pin, irq);
+ dev_info(&dev->dev, "PCI->APIC IRQ transform: "
+ "INT %c -> IRQ %d\n", 'A' + pin, irq);
dev->irq = irq;
return 0;
} else
- msg = " Probably buggy MP table.";
+ msg = "; probably buggy MP table";
} else if (pci_probe & PCI_BIOS_IRQ_SCAN)
msg = "";
else
- msg = " Please try using pci=biosirq.";
+ msg = "; please try using pci=biosirq";
- /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
- if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
+ /*
+ * With IDE legacy devices the IRQ lookup failure is not
+ * a problem..
+ */
+ if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
+ !(dev->class & 0x5))
return 0;
- printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
- 'A' + pin, pci_name(dev), msg);
+ dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
+ 'A' + pin, msg);
}
return 0;
}
--- head-2011-03-11.orig/arch/x86/vdso/Makefile 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/vdso/Makefile 2011-02-01 14:38:38.000000000 +0100
@@ -65,9 +65,7 @@ obj-$(VDSO32-y) += vdso32-syms.lds
vdso32.so-$(VDSO32-y) += int80
vdso32.so-$(CONFIG_COMPAT) += syscall
vdso32.so-$(VDSO32-y) += sysenter
-xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
-xen-vdso32-$(CONFIG_X86_32) += syscall
-vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
+vdso32.so-$(CONFIG_X86_XEN) += syscall
vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
--- head-2011-03-11.orig/arch/x86/vdso/vdso32.S 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/vdso/vdso32.S 2011-02-01 14:38:38.000000000 +0100
@@ -9,7 +9,7 @@ vdso32_int80_end:
.globl vdso32_syscall_start, vdso32_syscall_end
vdso32_syscall_start:
-#ifdef CONFIG_COMPAT
+#if defined(CONFIG_COMPAT) || defined(CONFIG_X86_XEN)
.incbin "arch/x86/vdso/vdso32-syscall.so"
#endif
vdso32_syscall_end:
@@ -19,16 +19,4 @@ vdso32_sysenter_start:
.incbin "arch/x86/vdso/vdso32-sysenter.so"
vdso32_sysenter_end:
-#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
- .globl vdso32_int80_start, vdso32_int80_end
-vdso32_int80_start:
- .incbin "arch/x86/vdso/vdso32-int80.so"
-vdso32_int80_end:
-#elif defined(CONFIG_X86_XEN)
- .globl vdso32_syscall_start, vdso32_syscall_end
-vdso32_syscall_start:
- .incbin "arch/x86/vdso/vdso32-syscall.so"
-vdso32_syscall_end:
-#endif
-
__FINIT
--- head-2011-03-11.orig/arch/x86/vdso/vdso32-setup-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/vdso/vdso32-setup-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -195,50 +195,28 @@ static __init void relocate_vdso(Elf32_E
}
}
-/*
- * These symbols are defined by vdso32.S to mark the bounds
- * of the ELF DSO images included therein.
- */
-extern const char vdso32_default_start, vdso32_default_end;
-extern const char vdso32_sysenter_start, vdso32_sysenter_end;
static struct page *vdso32_pages[1];
#ifdef CONFIG_X86_64
-#if CONFIG_XEN_COMPAT < 0x030200
-static int use_int80 = 1;
-#endif
-static int use_sysenter __read_mostly = -1;
-
-#define vdso32_sysenter() (use_sysenter > 0)
+#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
+#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
-/* May not be __init: called during resume */
-void syscall32_cpu_init(void)
+void __cpuinit syscall32_cpu_init(void)
{
- static const struct callback_register cstar = {
+ static const struct callback_register __cpuinitconst cstar = {
.type = CALLBACKTYPE_syscall32,
.address = (unsigned long)ia32_cstar_target
};
- static const struct callback_register sysenter = {
+ static const struct callback_register __cpuinitconst sysenter = {
.type = CALLBACKTYPE_sysenter,
.address = (unsigned long)ia32_sysenter_target
};
- if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
- (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
-#if CONFIG_XEN_COMPAT < 0x030200
- return;
- use_int80 = 0;
-#else
- BUG();
-#endif
-
- if (use_sysenter < 0) {
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- use_sysenter = 1;
- if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
- use_sysenter = 1;
- }
+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
}
#define compat_uses_vma 1
@@ -250,6 +228,7 @@ static inline void map_compat_vdso(int m
#else /* CONFIG_X86_32 */
#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
+#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
extern asmlinkage void ia32pv_cstar_target(void);
static const struct callback_register __cpuinitconst cstar = {
@@ -265,13 +244,13 @@ void __cpuinit enable_sep_cpu(void)
.address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
};
- if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
+ if (vdso32_syscall()) {
if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
BUG();
return;
}
- if (!boot_cpu_has(X86_FEATURE_SEP))
+ if (!vdso32_sysenter())
return;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
@@ -341,34 +320,26 @@ int __init sysenter_setup(void)
#ifdef CONFIG_X86_32
gate_vma_init();
-#endif
-#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
- if (use_int80) {
- extern const char vdso32_int80_start, vdso32_int80_end;
-
- vsyscall = &vdso32_int80_start;
- vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
- } else
-#elif defined(CONFIG_X86_32)
- if (boot_cpu_has(X86_FEATURE_SYSCALL)
- && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
- || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
- setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
- barrier(); /* until clear_bit()'s constraints are correct ... */
if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
- extern const char vdso32_syscall_start, vdso32_syscall_end;
-
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
+ && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
+ setup_force_cpu_cap(X86_FEATURE_SYSCALL32);
+ else {
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+ }
+ }
+#endif
+ if (vdso32_syscall()) {
vsyscall = &vdso32_syscall_start;
vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
- } else
-#endif
- if (!vdso32_sysenter()) {
- vsyscall = &vdso32_default_start;
- vsyscall_len = &vdso32_default_end - &vdso32_default_start;
- } else {
+	} else if (vdso32_sysenter()) {
vsyscall = &vdso32_sysenter_start;
vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
+ } else {
+ vsyscall = &vdso32_int80_start;
+ vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
}
memcpy(syscall_page, vsyscall, vsyscall_len);
--- head-2011-03-11.orig/arch/x86/xen/Kconfig 2011-01-31 17:49:31.000000000 +0100
+++ head-2011-03-11/arch/x86/xen/Kconfig 2011-02-01 14:38:38.000000000 +0100
@@ -31,14 +31,14 @@ config XEN_PVHVM
config XEN_MAX_DOMAIN_MEMORY
int
default 128
- depends on XEN
+ depends on PARAVIRT_XEN
help
This only affects the sizing of some bss arrays, the unused
portions of which are freed.
config XEN_SAVE_RESTORE
bool
- depends on XEN && PM
+ depends on PARAVIRT_XEN && PM
default y
config XEN_DEBUG_FS
--- head-2011-03-11.orig/drivers/acpi/processor_driver.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/acpi/processor_driver.c 2011-02-01 14:38:38.000000000 +0100
@@ -512,10 +512,12 @@ static int __cpuinit acpi_processor_add(
per_cpu(processors, pr->id) = pr;
#endif
- sysdev = get_cpu_sysdev(pr->id);
- if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev")) {
- result = -EFAULT;
- goto err_free_cpumask;
+ if (pr->id != -1) {
+ sysdev = get_cpu_sysdev(pr->id);
+ if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev")) {
+ result = -EFAULT;
+ goto err_free_cpumask;
+ }
}
#if defined(CONFIG_CPU_FREQ) || defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
@@ -599,7 +601,8 @@ static int acpi_processor_remove(struct
acpi_processor_power_exit(pr, device);
- sysfs_remove_link(&device->dev.kobj, "sysdev");
+ if (pr->id != -1)
+ sysfs_remove_link(&device->dev.kobj, "sysdev");
if (pr->cdev) {
sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
--- head-2011-03-11.orig/drivers/acpi/processor_perflib.c 2011-01-31 17:02:29.000000000 +0100
+++ head-2011-03-11/drivers/acpi/processor_perflib.c 2011-02-01 14:38:38.000000000 +0100
@@ -187,6 +187,12 @@ int acpi_processor_ppc_has_changed(struc
{
int ret;
+#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
+ /* Xen hypervisor can handle cpufreq _PPC event */
+ if (ignore_ppc < 0 && processor_pmperf_external())
+ ignore_ppc = 0;
+#endif
+
if (ignore_ppc) {
/*
* Only when it is notification event, the _OST object
--- head-2011-03-11.orig/drivers/char/tpm/tpm_vtpm.c 2011-01-31 14:53:38.000000000 +0100
+++ head-2011-03-11/drivers/char/tpm/tpm_vtpm.c 2011-02-01 14:38:38.000000000 +0100
@@ -347,7 +347,7 @@ static int _vtpm_send_queued(struct tpm_
{
int rc;
int error = 0;
- long flags;
+ unsigned long flags;
unsigned char buffer[1];
struct vtpm_state *vtpms;
vtpms = (struct vtpm_state *)chip_get_private(chip);
--- head-2011-03-11.orig/drivers/dma/ioat/dma.h 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/dma/ioat/dma.h 2011-02-01 14:38:38.000000000 +0100
@@ -363,6 +363,7 @@ __ioat_dca_init(struct pci_dev *pdev, vo
}
#define ioat_dca_init __ioat_dca_init
#define ioat2_dca_init __ioat_dca_init
+#define ioat3_dca_init __ioat_dca_init
#endif
#endif /* IOATDMA_H */
--- head-2011-03-11.orig/drivers/hwmon/coretemp-xen.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/drivers/hwmon/coretemp-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -360,10 +360,11 @@ static int coretemp_device_add(unsigned
if (err)
goto exit_entry_free;
- /* check if family 6, models 0xe, 0xf, 0x16, 0x17 */
+ /* check if family 6, models 0xe, 0xf, 0x16, 0x17, 0x1A */
if (info.x86 != 0x6 ||
!((pdev_entry->x86_model == 0xe) || (pdev_entry->x86_model == 0xf) ||
- (pdev_entry->x86_model == 0x16) || (pdev_entry->x86_model == 0x17))) {
+ (pdev_entry->x86_model == 0x16) || (pdev_entry->x86_model == 0x17) ||
+ (pdev_entry->x86_model == 0x1A))) {
/* supported CPU not found, but report the unknown
family 6 CPU */
--- head-2011-03-11.orig/drivers/pci/msi-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/pci/msi-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -53,12 +53,10 @@ arch_msi_check_device(struct pci_dev *de
return 0;
}
-static void msi_set_enable(struct pci_dev *dev, int enable)
+static void __msi_set_enable(struct pci_dev *dev, int pos, int enable)
{
- int pos;
u16 control;
- pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
if (pos) {
pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
control &= ~PCI_MSI_FLAGS_ENABLE;
@@ -68,6 +66,11 @@ static void msi_set_enable(struct pci_de
}
}
+static void msi_set_enable(struct pci_dev *dev, int enable)
+{
+ __msi_set_enable(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), enable);
+}
+
static void msix_set_enable(struct pci_dev *dev, int enable)
{
int pos;
@@ -180,8 +183,7 @@ static int msi_get_dev_owner(struct pci_
BUG_ON(!is_initial_xendomain());
if (get_owner && (owner = get_owner(dev)) >= 0) {
- printk(KERN_INFO "get owner for dev %x get %x \n",
- dev->devfn, owner);
+ dev_info(&dev->dev, "get owner: %x \n", owner);
return owner;
}
@@ -201,7 +203,7 @@ static int msi_unmap_pirq(struct pci_dev
? pirq : evtchn_get_xen_pirq(pirq);
if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap)))
- printk(KERN_WARNING "unmap irq %x failed\n", pirq);
+ dev_warn(&dev->dev, "unmap irq %d failed\n", pirq);
if (rc < 0)
return rc;
@@ -249,7 +251,7 @@ static int msi_map_vector(struct pci_dev
map_irq.table_base = table_base;
if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq)))
- printk(KERN_WARNING "map irq failed\n");
+ dev_warn(&dev->dev, "map irq failed\n");
if (rc < 0)
return rc;
@@ -360,10 +362,9 @@ static int msix_capability_init(struct p
mapped = 0;
list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
if (pirq_entry->entry_nr == entries[i].entry) {
- printk(KERN_WARNING "msix entry %d for dev %02x:%02x:%01x are \
- not freed before acquire again.\n", entries[i].entry,
- dev->bus->number, PCI_SLOT(dev->devfn),
- PCI_FUNC(dev->devfn));
+ dev_warn(&dev->dev,
+ "msix entry %d was not freed\n",
+ entries[i].entry);
(entries + i)->vector = pirq_entry->pirq;
mapped = 1;
break;
@@ -489,9 +490,8 @@ int pci_enable_msi(struct pci_dev* dev)
/* Check whether driver already requested for MSI-X irqs */
if (dev->msix_enabled) {
- printk(KERN_INFO "PCI: %s: Can't enable MSI. "
- "Device already has MSI-X enabled\n",
- pci_name(dev));
+ dev_info(&dev->dev, "can't enable MSI "
+ "(MSI-X already enabled)\n");
return -EINVAL;
}
@@ -573,7 +573,8 @@ int pci_enable_msix(struct pci_dev* dev,
temp = dev->irq;
ret = pci_frontend_enable_msix(dev, entries, nvec);
if (ret) {
- printk("get %x from pci_frontend_enable_msix\n", ret);
+ dev_warn(&dev->dev,
+ "got %x from frontend_enable_msix\n", ret);
return ret;
}
dev->msix_enabled = 1;
@@ -624,9 +625,8 @@ int pci_enable_msix(struct pci_dev* dev,
temp = dev->irq;
/* Check whether driver already requested for MSI vector */
if (dev->msi_enabled) {
- printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
- "Device already has an MSI irq assigned\n",
- pci_name(dev));
+ dev_info(&dev->dev, "can't enable MSI-X "
+ "(MSI IRQ already assigned)\n");
return -EINVAL;
}
--- head-2011-03-11.orig/drivers/xen/Makefile 2011-02-28 15:13:33.000000000 +0100
+++ head-2011-03-11/drivers/xen/Makefile 2011-02-01 14:38:38.000000000 +0100
@@ -1,4 +1,4 @@
-obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
+obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o manage.o
xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
xen-balloon-$(CONFIG_XEN) := balloon/
--- head-2011-03-11.orig/drivers/xen/balloon/balloon.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/xen/balloon/balloon.c 2011-02-01 14:38:38.000000000 +0100
@@ -82,7 +82,7 @@ struct balloon_stats balloon_stats;
/* We increase/decrease in batches which fit in a page */
static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
-#if !defined(MODULE) && defined(CONFIG_HIGHMEM)
+#ifdef CONFIG_HIGHMEM
#define inc_totalhigh_pages() (totalhigh_pages++)
#define dec_totalhigh_pages() (totalhigh_pages--)
#else
--- head-2011-03-11.orig/drivers/xen/balloon/sysfs.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/drivers/xen/balloon/sysfs.c 2011-02-01 14:38:38.000000000 +0100
@@ -45,6 +45,7 @@
#define BALLOON_SHOW(name, format, args...) \
static ssize_t show_##name(struct sys_device *dev, \
+ struct sysdev_attribute *attr, \
char *buf) \
{ \
return sprintf(buf, format, ##args); \
@@ -56,14 +57,15 @@ BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(b
BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high));
BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
-static ssize_t show_target_kb(struct sys_device *dev, char *buf)
+static ssize_t show_target_kb(struct sys_device *dev,
+ struct sysdev_attribute *attr, char *buf)
{
return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
}
static ssize_t store_target_kb(struct sys_device *dev,
- const char *buf,
- size_t count)
+ struct sysdev_attribute *attr,
+ const char *buf, size_t count)
{
char memstring[64], *endchar;
unsigned long long target_bytes;
--- head-2011-03-11.orig/drivers/xen/blktap/blktap.c 2011-02-17 10:11:08.000000000 +0100
+++ head-2011-03-11/drivers/xen/blktap/blktap.c 2011-02-17 10:11:18.000000000 +0100
@@ -54,6 +54,7 @@
#include <linux/gfp.h>
#include <linux/poll.h>
#include <linux/delay.h>
+#include <linux/nsproxy.h>
#include <asm/tlbflush.h>
#define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
@@ -502,7 +503,7 @@ found:
if ((class = get_xen_class()) != NULL)
device_create(class, NULL, MKDEV(blktap_major, minor),
- "blktap%d", minor);
+ NULL, "blktap%d", minor);
}
out:
@@ -1743,7 +1744,8 @@ static int __init blkif_init(void)
* We only create the device when a request of a new device is
* made.
*/
- device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
+ device_create(class, NULL, MKDEV(blktap_major, 0), NULL,
+ "blktap0");
} else {
/* this is bad, but not fatal */
WPRINTK("blktap: sysfs xen_class not created\n");
--- head-2011-03-11.orig/drivers/xen/blktap2/device.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/blktap2/device.c 2011-02-01 14:38:38.000000000 +0100
@@ -3,6 +3,7 @@
#include <linux/cdrom.h>
#include <linux/hdreg.h>
#include <linux/module.h>
+#include <linux/version.h>
#include <asm/tlbflush.h>
#include <scsi/scsi.h>
--- head-2011-03-11.orig/drivers/xen/blktap2/sysfs.c 2011-03-11 10:58:58.000000000 +0100
+++ head-2011-03-11/drivers/xen/blktap2/sysfs.c 2011-02-01 14:38:38.000000000 +0100
@@ -307,8 +307,8 @@ blktap_sysfs_create(struct blktap *tap)
ring = &tap->ring;
- dev = device_create_drvdata(class, NULL, ring->devno, tap,
- "blktap%d", tap->minor);
+ dev = device_create(class, NULL, ring->devno, tap,
+ "blktap%d", tap->minor);
if (IS_ERR(dev))
return PTR_ERR(dev);
--- head-2011-03-11.orig/drivers/xen/char/mem.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/char/mem.c 2011-02-01 14:38:38.000000000 +0100
@@ -35,7 +35,7 @@ static inline int uncached_access(struct
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
-#ifdef CONFIG_NONPROMISC_DEVMEM
+#ifdef CONFIG_STRICT_DEVMEM
u64 from = ((u64)pfn) << PAGE_SHIFT;
u64 to = from + size;
u64 cursor = from;
@@ -172,7 +172,10 @@ static void mmap_mem_close(struct vm_are
static struct vm_operations_struct mmap_mem_ops = {
.open = mmap_mem_open,
- .close = mmap_mem_close
+ .close = mmap_mem_close,
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+ .access = generic_access_phys
+#endif
};
static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
--- head-2011-03-11.orig/drivers/xen/console/console.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/console/console.c 2011-02-01 14:38:38.000000000 +0100
@@ -431,9 +431,7 @@ static void __xencons_tx_flush(void)
if (work_done && (xencons_tty != NULL)) {
wake_up_interruptible(&xencons_tty->write_wait);
- if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
- (xencons_tty->ldisc.write_wakeup != NULL))
- (xencons_tty->ldisc.write_wakeup)(xencons_tty);
+ tty_wakeup(xencons_tty);
}
}
@@ -634,8 +632,8 @@ static void xencons_close(struct tty_str
tty->closing = 1;
tty_wait_until_sent(tty, 0);
tty_driver_flush_buffer(tty);
- if (tty->ldisc.flush_buffer != NULL)
- tty->ldisc.flush_buffer(tty);
+ if (tty->ldisc.ops->flush_buffer != NULL)
+ tty->ldisc.ops->flush_buffer(tty);
tty->closing = 0;
spin_lock_irqsave(&xencons_lock, flags);
xencons_tty = NULL;
--- head-2011-03-11.orig/drivers/xen/core/evtchn.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/evtchn.c 2011-02-01 14:38:38.000000000 +0100
@@ -126,7 +126,11 @@ static int irq_bindcount[NR_IRQS];
#ifdef CONFIG_SMP
+#if CONFIG_NR_CPUS <= 256
static u8 cpu_evtchn[NR_EVENT_CHANNELS];
+#else
+static u16 cpu_evtchn[NR_EVENT_CHANNELS];
+#endif
static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
@@ -767,8 +771,9 @@ static struct irq_chip dynirq_chip = {
};
/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
-static int pirq_eoi_does_unmask;
+static bool pirq_eoi_does_unmask;
static unsigned long *pirq_needs_eoi;
+static DECLARE_BITMAP(probing_pirq, NR_PIRQS);
static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq)
{
@@ -815,25 +820,31 @@ static inline void pirq_query_unmask(int
set_bit(irq - PIRQ_BASE, pirq_needs_eoi);
}
-/*
- * On startup, if there is no action associated with the IRQ then we are
- * probing. In this case we should not share with others as it will confuse us.
- */
-#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
+static int set_type_pirq(unsigned int irq, unsigned int type)
+{
+ if (type != IRQ_TYPE_PROBE)
+ return -EINVAL;
+ set_bit(irq - PIRQ_BASE, probing_pirq);
+ return 0;
+}
static void enable_pirq(unsigned int irq)
{
struct evtchn_bind_pirq bind_pirq;
int evtchn = evtchn_from_irq(irq);
- if (VALID_EVTCHN(evtchn))
+ if (VALID_EVTCHN(evtchn)) {
+ clear_bit(irq - PIRQ_BASE, probing_pirq);
goto out;
+ }
bind_pirq.pirq = evtchn_get_xen_pirq(irq);
/* NB. We are happy to share unless we are probing. */
- bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
+ bind_pirq.flags = test_and_clear_bit(irq - PIRQ_BASE, probing_pirq)
+ || (irq_desc[irq].status & IRQ_AUTODETECT)
+ ? 0 : BIND_PIRQ__WILL_SHARE;
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
- if (!probing_irq(irq))
+ if (bind_pirq.flags)
pr_info("Failed to obtain physical IRQ %d\n", irq);
return;
}
@@ -910,6 +921,7 @@ static struct irq_chip pirq_chip = {
.ack = ack_pirq,
.end = end_pirq,
.eoi = end_pirq,
+ .set_type = set_type_pirq,
#ifdef CONFIG_SMP
.set_affinity = set_affinity_irq,
#endif
@@ -985,6 +997,7 @@ void disable_all_local_evtchn(void)
synch_set_bit(i, &s->evtchn_mask[0]);
}
+#ifdef CONFIG_PM_SLEEP
static void restore_cpu_virqs(unsigned int cpu)
{
struct evtchn_bind_virq bind_virq;
@@ -1077,6 +1090,7 @@ void irq_resume(void)
}
}
+#endif
#if defined(CONFIG_X86_IO_APIC)
#define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE))
@@ -1159,7 +1173,7 @@ void __init xen_init_IRQ(void)
* BITS_TO_LONGS(ALIGN(NR_PIRQS, PAGE_SIZE * 8)));
eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT;
if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
- pirq_eoi_does_unmask = 1;
+ pirq_eoi_does_unmask = true;
/* No event channels are 'live' right now. */
for (i = 0; i < NR_EVENT_CHANNELS; i++)
--- head-2011-03-11.orig/drivers/xen/core/gnttab.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/gnttab.c 2011-02-01 14:38:38.000000000 +0100
@@ -448,6 +448,7 @@ static int map_pte_fn(pte_t *pte, struct
return 0;
}
+#ifdef CONFIG_PM_SLEEP
static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
unsigned long addr, void *data)
{
@@ -455,6 +456,7 @@ static int unmap_pte_fn(pte_t *pte, stru
set_pte_at(&init_mm, addr, pte, __pte(0));
return 0;
}
+#endif
void *arch_gnttab_alloc_shared(unsigned long *frames)
{
@@ -635,6 +637,75 @@ void __gnttab_dma_map_page(struct page *
} while (unlikely(read_seqretry(&gnttab_dma_lock, seq)));
}
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+
+static unsigned int GNTMAP_pte_special;
+
+bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *map,
+ unsigned int count)
+{
+ unsigned int i;
+
+ if (unlikely(cmd != GNTTABOP_map_grant_ref))
+ count = 0;
+
+ for (i = 0; i < count; ++i, ++map) {
+ if (!(map->flags & GNTMAP_host_map)
+ || !(map->flags & GNTMAP_application_map))
+ continue;
+ if (GNTMAP_pte_special)
+ map->flags |= GNTMAP_pte_special;
+ else {
+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
+ return true;
+ }
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(gnttab_pre_map_adjust);
+
+#if CONFIG_XEN_COMPAT < 0x030400
+int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *map, unsigned int count)
+{
+ unsigned int i;
+ int rc = 0;
+
+ for (i = 0; i < count && rc == 0; ++i, ++map) {
+ pte_t pte;
+
+ if (!(map->flags & GNTMAP_host_map)
+ || !(map->flags & GNTMAP_application_map))
+ continue;
+
+#ifdef CONFIG_X86
+ pte = __pte_ma((map->dev_bus_addr | _PAGE_PRESENT | _PAGE_USER
+ | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX
+ | _PAGE_SPECIAL)
+ & __supported_pte_mask);
+#else
+#error Architecture not yet supported.
+#endif
+ if (!(map->flags & GNTMAP_readonly))
+ pte = pte_mkwrite(pte);
+
+ if (map->flags & GNTMAP_contains_pte) {
+ mmu_update_t u;
+
+ u.ptr = map->host_addr;
+ u.val = __pte_val(pte);
+ rc = HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
+ } else
+ rc = HYPERVISOR_update_va_mapping(map->host_addr, pte, 0);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(gnttab_post_map_adjust);
+#endif
+
+#endif /* __HAVE_ARCH_PTE_SPECIAL */
+
int gnttab_resume(void)
{
if (max_nr_grant_frames() < nr_grant_frames)
@@ -642,6 +713,7 @@ int gnttab_resume(void)
return gnttab_map(0, nr_grant_frames - 1);
}
+#ifdef CONFIG_PM_SLEEP
int gnttab_suspend(void)
{
#ifdef CONFIG_X86
@@ -651,6 +723,7 @@ int gnttab_suspend(void)
#endif
return 0;
}
+#endif
#else /* !CONFIG_XEN */
@@ -761,6 +834,18 @@ int __devinit gnttab_init(void)
gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
gnttab_free_head = NR_RESERVED_ENTRIES;
+#if defined(CONFIG_XEN) && defined(__HAVE_ARCH_PTE_SPECIAL)
+ if (!xen_feature(XENFEAT_auto_translated_physmap)
+ && xen_feature(XENFEAT_gnttab_map_avail_bits)) {
+#ifdef CONFIG_X86
+ GNTMAP_pte_special = (__pte_val(pte_mkspecial(__pte_ma(0)))
+ >> _PAGE_BIT_UNUSED1) << _GNTMAP_guest_avail0;
+#else
+#error Architecture not yet supported.
+#endif
+ }
+#endif
+
return 0;
ini_nomem:
--- head-2011-03-11.orig/drivers/xen/core/machine_kexec.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/machine_kexec.c 2011-02-01 14:38:38.000000000 +0100
@@ -57,8 +57,7 @@ void __init xen_machine_kexec_setup_reso
/* allocate xen_phys_cpus */
- xen_phys_cpus = alloc_bootmem_low(k * sizeof(struct resource));
- BUG_ON(xen_phys_cpus == NULL);
+ xen_phys_cpus = alloc_bootmem(k * sizeof(struct resource));
/* fill in xen_phys_cpus with per-cpu crash note information */
@@ -91,7 +90,7 @@ void __init xen_machine_kexec_setup_reso
xen_hypervisor_res.start = range.start;
xen_hypervisor_res.end = range.start + range.size - 1;
xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86
insert_resource(&iomem_resource, &xen_hypervisor_res);
#endif
@@ -106,7 +105,7 @@ void __init xen_machine_kexec_setup_reso
if (range.size) {
crashk_res.start = range.start;
crashk_res.end = range.start + range.size - 1;
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86
insert_resource(&iomem_resource, &crashk_res);
#endif
}
@@ -141,15 +140,13 @@ void __init xen_machine_kexec_setup_reso
xen_max_nr_phys_cpus))
goto err;
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86
for (k = 0; k < xen_max_nr_phys_cpus; k++) {
res = xen_phys_cpus + k;
if (!res->parent) /* outside of xen_hypervisor_res range */
insert_resource(&iomem_resource, res);
}
-#endif
-#ifdef CONFIG_X86
if (xen_create_contiguous_region((unsigned long)&vmcoreinfo_note,
get_order(sizeof(vmcoreinfo_note)),
BITS_PER_LONG))
@@ -168,7 +165,7 @@ void __init xen_machine_kexec_setup_reso
return;
}
-#ifndef CONFIG_X86_64
+#ifndef CONFIG_X86
void __init xen_machine_kexec_register_resources(struct resource *res)
{
int k;
--- head-2011-03-11.orig/drivers/xen/core/machine_reboot.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/machine_reboot.c 2011-02-01 14:38:38.000000000 +0100
@@ -52,6 +52,7 @@ void machine_power_off(void)
HYPERVISOR_shutdown(SHUTDOWN_poweroff);
}
+#ifdef CONFIG_PM_SLEEP
static void pre_suspend(void)
{
HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
@@ -108,6 +109,7 @@ static void post_suspend(int suspend_can
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
virt_to_mfn(pfn_to_mfn_frame_list_list);
}
+#endif
#else /* !(defined(__i386__) || defined(__x86_64__)) */
@@ -126,6 +128,7 @@ static void post_suspend(int suspend_can
#endif
+#ifdef CONFIG_PM_SLEEP
struct suspend {
int fast_suspend;
void (*resume_notifier)(int);
@@ -221,7 +224,8 @@ int __xen_suspend(int fast_suspend, void
if (fast_suspend) {
xenbus_suspend();
- err = stop_machine_run(take_machine_down, &suspend, 0);
+ err = stop_machine(take_machine_down, &suspend,
+ &cpumask_of_cpu(0));
if (err < 0)
xenbus_suspend_cancel();
} else {
@@ -244,3 +248,4 @@ int __xen_suspend(int fast_suspend, void
return 0;
}
+#endif
--- head-2011-03-11.orig/drivers/xen/core/reboot.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/reboot.c 2011-02-01 14:38:38.000000000 +0100
@@ -28,17 +28,12 @@ MODULE_LICENSE("Dual BSD/GPL");
/* Ignore multiple shutdown requests. */
static int shutting_down = SHUTDOWN_INVALID;
-/* Was last suspend request cancelled? */
-static int suspend_cancelled;
-
/* Can we leave APs online when we suspend? */
static int fast_suspend;
static void __shutdown_handler(struct work_struct *unused);
static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
-static int setup_suspend_evtchn(void);
-
int __xen_suspend(int fast_suspend, void (*resume_notifier)(int));
static int shutdown_process(void *__unused)
@@ -68,6 +63,13 @@ static int shutdown_process(void *__unus
return 0;
}
+#ifdef CONFIG_PM_SLEEP
+
+static int setup_suspend_evtchn(void);
+
+/* Was last suspend request cancelled? */
+static int suspend_cancelled;
+
static void xen_resume_notifier(int _suspend_cancelled)
{
int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING);
@@ -117,6 +119,10 @@ static int xen_suspend(void *__unused)
return 0;
}
+#else
+# define xen_suspend NULL
+#endif
+
static void switch_shutdown_state(int new_state)
{
int prev_state, old_state = SHUTDOWN_INVALID;
@@ -193,8 +199,10 @@ static void shutdown_handler(struct xenb
new_state = SHUTDOWN_POWEROFF;
else if (strcmp(str, "reboot") == 0)
ctrl_alt_del();
+#ifdef CONFIG_PM_SLEEP
else if (strcmp(str, "suspend") == 0)
new_state = SHUTDOWN_SUSPEND;
+#endif
else if (strcmp(str, "halt") == 0)
new_state = SHUTDOWN_HALT;
else
@@ -245,6 +253,7 @@ static struct xenbus_watch sysrq_watch =
.callback = sysrq_handler
};
+#ifdef CONFIG_PM_SLEEP
static irqreturn_t suspend_int(int irq, void* dev_id)
{
switch_shutdown_state(SHUTDOWN_SUSPEND);
@@ -272,6 +281,9 @@ static int setup_suspend_evtchn(void)
return 0;
}
+#else
+#define setup_suspend_evtchn() 0
+#endif
static int setup_shutdown_watcher(void)
{
--- head-2011-03-11.orig/drivers/xen/core/smpboot.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/smpboot.c 2011-02-01 14:38:38.000000000 +0100
@@ -27,6 +27,7 @@
extern irqreturn_t smp_reschedule_interrupt(int, void *);
extern irqreturn_t smp_call_function_interrupt(int, void *);
+extern irqreturn_t smp_call_function_single_interrupt(int, void *);
extern int local_setup_timer(unsigned int cpu);
extern void local_teardown_timer(unsigned int cpu);
@@ -47,8 +48,10 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
static DEFINE_PER_CPU(int, resched_irq);
static DEFINE_PER_CPU(int, callfunc_irq);
+static DEFINE_PER_CPU(int, call1func_irq);
static char resched_name[NR_CPUS][15];
static char callfunc_name[NR_CPUS][15];
+static char call1func_name[NR_CPUS][15];
void __init prefill_possible_map(void)
{
@@ -64,20 +67,19 @@ void __init prefill_possible_map(void)
break;
#endif
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
- if (rc >= 0)
+ if (rc >= 0) {
cpu_set(i, cpu_possible_map);
+ nr_cpu_ids = i + 1;
+ }
}
}
-void __init smp_alloc_memory(void)
-{
-}
-
static int __cpuinit xen_smp_intr_init(unsigned int cpu)
{
int rc;
- per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
+ per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) =
+ per_cpu(call1func_irq, cpu) = -1;
sprintf(resched_name[cpu], "resched%u", cpu);
rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
@@ -101,6 +103,17 @@ static int __cpuinit xen_smp_intr_init(u
goto fail;
per_cpu(callfunc_irq, cpu) = rc;
+ sprintf(call1func_name[cpu], "call1func%u", cpu);
+ rc = bind_ipi_to_irqhandler(CALL_FUNC_SINGLE_VECTOR,
+ cpu,
+ smp_call_function_single_interrupt,
+ IRQF_DISABLED|IRQF_NOBALANCING,
+ call1func_name[cpu],
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(call1func_irq, cpu) = rc;
+
rc = xen_spinlock_init(cpu);
if (rc < 0)
goto fail;
@@ -115,6 +128,8 @@ static int __cpuinit xen_smp_intr_init(u
unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
if (per_cpu(callfunc_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+ if (per_cpu(call1func_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
xen_spinlock_cleanup(cpu);
return rc;
}
@@ -127,6 +142,7 @@ static void __cpuinit xen_smp_intr_exit(
unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
xen_spinlock_cleanup(cpu);
}
#endif
@@ -134,11 +150,7 @@ static void __cpuinit xen_smp_intr_exit(
void __cpuinit cpu_bringup(void)
{
cpu_init();
-#ifdef __i386__
identify_secondary_cpu(&current_cpu_data);
-#else
- identify_cpu(&current_cpu_data);
-#endif
touch_softlockup_watchdog();
preempt_disable();
local_irq_enable();
@@ -218,9 +230,6 @@ void __init smp_prepare_cpus(unsigned in
struct task_struct *idle;
int apicid;
struct vcpu_get_physid cpu_id;
-#ifdef __x86_64__
- struct desc_ptr *gdt_descr;
-#endif
void *gdt_addr;
apicid = 0;
@@ -249,20 +258,10 @@ void __init smp_prepare_cpus(unsigned in
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
-#ifdef __x86_64__
- gdt_descr = &cpu_gdt_descr[cpu];
- gdt_descr->address = get_zeroed_page(GFP_KERNEL);
- if (unlikely(!gdt_descr->address)) {
- pr_crit("CPU%d failed to allocate GDT\n", cpu);
- continue;
- }
- gdt_descr->size = GDT_SIZE;
- memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
- gdt_addr = (void *)gdt_descr->address;
-#else
+#ifdef __i386__
init_gdt(cpu);
- gdt_addr = get_cpu_gdt_table(cpu);
#endif
+ gdt_addr = get_cpu_gdt_table(cpu);
make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
apicid = cpu;
@@ -305,8 +304,8 @@ void __init smp_prepare_boot_cpu(void)
{
#ifdef __i386__
init_gdt(smp_processor_id());
- switch_to_new_gdt();
#endif
+ switch_to_new_gdt();
prefill_possible_map();
}
--- head-2011-03-11.orig/drivers/xen/core/spinlock.c 2011-03-15 16:17:10.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/spinlock.c 2011-03-15 16:51:35.000000000 +0100
@@ -5,6 +5,10 @@
* portions of this file.
*/
#define XEN_SPINLOCK_SOURCE
+#include <linux/spinlock_types.h>
+
+#ifdef TICKET_SHIFT
+
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -55,6 +59,7 @@ void __cpuinit xen_spinlock_cleanup(unsi
WARN_ON(HYPERVISOR_event_channel_op(EVTCHNOP_close, &close));
}
+#ifdef CONFIG_PM_SLEEP
void __cpuinit spinlock_resume(void)
{
unsigned int cpu;
@@ -64,6 +69,7 @@ void __cpuinit spinlock_resume(void)
xen_spinlock_init(cpu);
}
}
+#endif
static unsigned int spin_adjust(struct spinning *spinning,
const raw_spinlock_t *lock,
@@ -86,7 +92,7 @@ static unsigned int spin_adjust(struct s
unsigned int xen_spin_adjust(const raw_spinlock_t *lock, unsigned int token)
{
- return spin_adjust(__get_cpu_var(spinning), lock, token);
+ return spin_adjust(x86_read_percpu(spinning), lock, token);
}
bool xen_spin_wait(raw_spinlock_t *lock, unsigned int *ptok,
@@ -99,21 +105,21 @@ bool xen_spin_wait(raw_spinlock_t *lock,
/* If kicker interrupt not initialized yet, just spin. */
if (unlikely(!cpu_online(raw_smp_processor_id()))
- || unlikely(!__get_cpu_var(poll_evtchn)))
+ || unlikely(!x86_read_percpu(poll_evtchn)))
return false;
/* announce we're spinning */
spinning.ticket = *ptok >> TICKET_SHIFT;
spinning.lock = lock;
- spinning.prev = __get_cpu_var(spinning);
+ spinning.prev = x86_read_percpu(spinning);
smp_wmb();
- __get_cpu_var(spinning) = &spinning;
+ x86_write_percpu(spinning, &spinning);
upcall_mask = current_vcpu_info()->evtchn_upcall_mask;
do {
bool nested = false;
- clear_evtchn(__get_cpu_var(poll_evtchn));
+ clear_evtchn(x86_read_percpu(poll_evtchn));
/*
* Check again to make sure it didn't become free while
@@ -126,7 +132,7 @@ bool xen_spin_wait(raw_spinlock_t *lock,
* without rechecking the lock.
*/
if (spinning.prev)
- set_evtchn(__get_cpu_var(poll_evtchn));
+ set_evtchn(x86_read_percpu(poll_evtchn));
rc = true;
break;
}
@@ -153,11 +159,11 @@ bool xen_spin_wait(raw_spinlock_t *lock,
bool kick, free;
other->ticket = -1;
- __raw_spin_unlock_body;
+ __ticket_spin_unlock_body;
if (!kick)
break;
xen_spin_kick(lock, token);
- __raw_spin_lock_preamble;
+ __ticket_spin_lock_preamble;
if (!free)
token = spin_adjust(
other->prev, lock,
@@ -181,7 +187,7 @@ bool xen_spin_wait(raw_spinlock_t *lock,
current_vcpu_info()->evtchn_upcall_mask = upcall_mask;
- rc = !test_evtchn(__get_cpu_var(poll_evtchn));
+ rc = !test_evtchn(x86_read_percpu(poll_evtchn));
if (!rc)
inc_irq_stat(irq_lock_count);
} while (spinning.prev || rc);
@@ -192,11 +198,12 @@ bool xen_spin_wait(raw_spinlock_t *lock,
*/
/* announce we're done */
- __get_cpu_var(spinning) = other = spinning.prev;
+ other = spinning.prev;
+ x86_write_percpu(spinning, other);
raw_local_irq_disable();
- rm_idx = __get_cpu_var(rm_seq.idx);
+ rm_idx = x86_read_percpu(rm_seq.idx);
smp_wmb();
- __get_cpu_var(rm_seq.idx) = rm_idx + 1;
+ x86_write_percpu(rm_seq.idx, rm_idx + 1);
mb();
/*
@@ -211,7 +218,7 @@ bool xen_spin_wait(raw_spinlock_t *lock,
if (other->ticket + 1)
continue;
lock = other->lock;
- __raw_spin_lock_preamble;
+ __ticket_spin_lock_preamble;
if (!free)
token = spin_adjust(other->prev, lock, token);
other->ticket = token >> TICKET_SHIFT;
@@ -220,7 +227,7 @@ bool xen_spin_wait(raw_spinlock_t *lock,
}
rm_idx &= 1;
- while (__get_cpu_var(rm_seq.ctr[rm_idx].counter))
+ while (x86_read_percpu(rm_seq.ctr[rm_idx].counter))
cpu_relax();
raw_local_irq_restore(upcall_mask);
*ptok = lock->cur | (spinning.ticket << TICKET_SHIFT);
@@ -283,3 +290,5 @@ void xen_spin_kick(raw_spinlock_t *lock,
}
}
EXPORT_SYMBOL(xen_spin_kick);
+
+#endif /* TICKET_SHIFT */
--- head-2011-03-11.orig/drivers/xen/fbfront/xenfb.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/fbfront/xenfb.c 2011-02-17 10:11:23.000000000 +0100
@@ -18,6 +18,7 @@
* frame buffer.
*/
+#include <linux/console.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fb.h>
@@ -545,6 +546,28 @@ static unsigned long vmalloc_to_mfn(void
return pfn_to_mfn(vmalloc_to_pfn(address));
}
+static __devinit void
+xenfb_make_preferred_console(void)
+{
+ struct console *c;
+
+ if (console_set_on_cmdline)
+ return;
+
+ acquire_console_sem();
+ for (c = console_drivers; c; c = c->next) {
+ if (!strcmp(c->name, "tty") && c->index == 0)
+ break;
+ }
+ release_console_sem();
+ if (c) {
+ unregister_console(c);
+ c->flags |= CON_CONSDEV;
+ c->flags &= ~CON_PRINTBUFFER; /* don't print again */
+ register_console(c);
+ }
+}
+
static int __devinit xenfb_probe(struct xenbus_device *dev,
const struct xenbus_device_id *id)
{
@@ -665,6 +688,7 @@ static int __devinit xenfb_probe(struct
if (ret < 0)
goto error;
+ xenfb_make_preferred_console();
return 0;
error_nomem:
@@ -884,4 +908,5 @@ static void __exit xenfb_cleanup(void)
module_init(xenfb_init);
module_exit(xenfb_cleanup);
+MODULE_DESCRIPTION("Xen virtual framebuffer device frontend");
MODULE_LICENSE("GPL");
--- head-2011-03-11.orig/drivers/xen/fbfront/xenkbd.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/xen/fbfront/xenkbd.c 2011-02-01 14:38:38.000000000 +0100
@@ -350,4 +350,5 @@ static void __exit xenkbd_cleanup(void)
module_init(xenkbd_init);
module_exit(xenkbd_cleanup);
+MODULE_DESCRIPTION("Xen virtual keyboard/pointer device frontend");
MODULE_LICENSE("GPL");
--- head-2011-03-11.orig/drivers/xen/gntdev/gntdev.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/gntdev/gntdev.c 2011-02-01 14:38:38.000000000 +0100
@@ -400,7 +400,7 @@ static int __init gntdev_init(void)
}
device = device_create(class, NULL, MKDEV(gntdev_major, 0),
- GNTDEV_NAME);
+ NULL, GNTDEV_NAME);
if (IS_ERR(device)) {
pr_err("Error creating gntdev device in xen_class\n");
pr_err("gntdev created, major number = %d\n", gntdev_major);
--- head-2011-03-11.orig/drivers/xen/netback/netback.c 2011-02-09 15:55:20.000000000 +0100
+++ head-2011-03-11/drivers/xen/netback/netback.c 2011-02-01 14:38:38.000000000 +0100
@@ -36,7 +36,7 @@
#include "common.h"
#include <linux/if_vlan.h>
-#include <linux/tcp.h>
+#include <net/tcp.h>
#include <xen/balloon.h>
#include <xen/interface/memory.h>
#include <xen/net-util.h>
@@ -115,7 +115,7 @@ static inline int netif_page_index(struc
*/
#define PKT_PROT_LEN (ETH_HLEN + VLAN_HLEN + \
sizeof(struct iphdr) + MAX_IPOPTLEN + \
- sizeof(struct tcphdr) + 40 /* MAX_TCP_OPTION_SPACE */)
+ sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE)
static struct pending_tx_info {
netif_tx_request_t req;
--- head-2011-03-11.orig/drivers/xen/netfront/accel.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/xen/netfront/accel.c 2011-02-01 14:38:38.000000000 +0100
@@ -28,6 +28,7 @@
* IN THE SOFTWARE.
*/
+#include <linux/version.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/list.h>
--- head-2011-03-11.orig/drivers/xen/netfront/netfront.c 2011-02-09 16:04:02.000000000 +0100
+++ head-2011-03-11/drivers/xen/netfront/netfront.c 2011-02-01 14:38:38.000000000 +0100
@@ -637,7 +637,7 @@ static int network_open(struct net_devic
}
spin_unlock_bh(&np->rx_lock);
- network_maybe_wake_tx(dev);
+ netif_start_queue(dev);
return 0;
}
--- head-2011-03-11.orig/drivers/xen/sfc_netback/accel.h 2010-01-18 15:23:12.000000000 +0100
+++ head-2011-03-11/drivers/xen/sfc_netback/accel.h 2011-02-01 14:38:38.000000000 +0100
@@ -25,6 +25,7 @@
#ifndef NETBACK_ACCEL_H
#define NETBACK_ACCEL_H
+#include <linux/version.h>
#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/tcp.h>
--- head-2011-03-11.orig/drivers/xen/sfc_netfront/accel.h 2011-01-31 17:29:16.000000000 +0100
+++ head-2011-03-11/drivers/xen/sfc_netfront/accel.h 2011-02-01 14:38:38.000000000 +0100
@@ -35,6 +35,7 @@
#include <xen/evtchn.h>
#include <linux/kernel.h>
+#include <linux/version.h>
#include <linux/list.h>
enum netfront_accel_post_status {
--- head-2011-03-11.orig/drivers/xen/xenbus/xenbus_client.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/xenbus/xenbus_client.c 2011-02-01 14:38:38.000000000 +0100
@@ -149,7 +149,7 @@ int xenbus_watch_pathfmt(struct xenbus_d
char *path;
va_start(ap, pathfmt);
- path = kvasprintf(GFP_KERNEL, pathfmt, ap);
+ path = kvasprintf(GFP_NOIO | __GFP_HIGH, pathfmt, ap);
va_end(ap);
if (!path) {
--- head-2011-03-11.orig/drivers/xen/xenbus/xenbus_comms.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/xen/xenbus/xenbus_comms.c 2011-02-01 14:38:38.000000000 +0100
@@ -248,14 +248,11 @@ int xb_init_comms(void)
intf->rsp_cons = intf->rsp_prod;
}
+#if defined(CONFIG_XEN) || defined(MODULE)
if (xenbus_irq)
unbind_from_irqhandler(xenbus_irq, &xb_waitq);
-#if defined(CONFIG_XEN) || defined(MODULE)
err = bind_caller_port_to_irqhandler(
-#else
- err = bind_evtchn_to_irqhandler(
-#endif
xen_store_evtchn, wake_waiting,
0, "xenbus", &xb_waitq);
if (err <= 0) {
@@ -264,6 +261,20 @@ int xb_init_comms(void)
}
xenbus_irq = err;
+#else
+ if (xenbus_irq) {
+ /* Already have an irq; assume we're resuming */
+ rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
+ } else {
+ err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
+ 0, "xenbus", &xb_waitq);
+ if (err <= 0) {
+ pr_err("XENBUS request irq failed %i\n", err);
+ return err;
+ }
+ xenbus_irq = err;
+ }
+#endif
return 0;
}
--- head-2011-03-11.orig/drivers/xen/xenbus/xenbus_probe.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/xenbus/xenbus_probe.c 2011-02-01 14:38:38.000000000 +0100
@@ -36,6 +36,7 @@
__FUNCTION__, __LINE__, ##args)
#include <linux/kernel.h>
+#include <linux/version.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/ctype.h>
--- head-2011-03-11.orig/fs/aio.c 2011-03-11 10:58:46.000000000 +0100
+++ head-2011-03-11/fs/aio.c 2011-03-11 10:59:16.000000000 +0100
@@ -1307,7 +1307,7 @@ static int make_aio_fd(struct kioctx *io
int fd;
struct file *file;
- fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
+ fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, 0);
if (fd < 0)
return fd;
--- head-2011-03-11.orig/include/Kbuild 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/include/Kbuild 2011-02-01 14:38:38.000000000 +0100
@@ -8,5 +8,6 @@ header-y += mtd/
header-y += rdma/
header-y += video/
header-y += drm/
+header-y += xen/public/
header-y += xen/
header-y += scsi/
--- head-2011-03-11.orig/include/asm-generic/pgtable.h 2011-03-11 10:54:24.000000000 +0100
+++ head-2011-03-11/include/asm-generic/pgtable.h 2011-03-11 10:59:22.000000000 +0100
@@ -156,10 +156,6 @@ static inline void pmdp_set_wrprotect(st
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
-#ifndef arch_change_pte_range
-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
-#endif
-
#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
extern pmd_t pmdp_splitting_flush(struct vm_area_struct *vma,
unsigned long address,
--- head-2011-03-11.orig/arch/x86/include/asm/kexec.h 2011-01-31 14:53:50.000000000 +0100
+++ head-2011-03-11/arch/x86/include/asm/kexec.h 2011-02-01 14:38:38.000000000 +0100
@@ -5,8 +5,21 @@
# define PA_CONTROL_PAGE 0
# define VA_CONTROL_PAGE 1
# define PA_PGD 2
+# ifndef CONFIG_XEN
# define PA_SWAP_PAGE 3
# define PAGES_NR 4
+# else /* CONFIG_XEN */
+/*
+ * The hypervisor interface implicitly requires that all entries (except
+ * for possibly the final one) are arranged in matching PA_/VA_ pairs.
+ */
+# define PA_PMD_0 8
+# define VA_PMD_0 9
+# define PA_PMD_1 10
+# define VA_PMD_1 11
+# define PA_SWAP_PAGE 12
+# define PAGES_NR 13
+# endif /* CONFIG_XEN */
#else
# define PA_CONTROL_PAGE 0
# define VA_CONTROL_PAGE 1
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/desc.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/desc.h 2011-02-01 14:38:38.000000000 +0100
@@ -31,11 +31,17 @@ extern struct desc_ptr idt_descr;
extern gate_desc idt_table[];
#endif
+struct gdt_page {
+ struct desc_struct gdt[GDT_ENTRIES];
+} __attribute__((aligned(PAGE_SIZE)));
+DECLARE_PER_CPU(struct gdt_page, gdt_page);
+
+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+{
+ return per_cpu(gdt_page, cpu).gdt;
+}
+
#ifdef CONFIG_X86_64
-extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
-extern struct desc_ptr cpu_gdt_descr[];
-/* the cpu gdt accessor */
-#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
unsigned dpl, unsigned ist, unsigned seg)
@@ -53,16 +59,6 @@ static inline void pack_gate(gate_desc *
}
#else
-struct gdt_page {
- struct desc_struct gdt[GDT_ENTRIES];
-} __attribute__((aligned(PAGE_SIZE)));
-DECLARE_PER_CPU(struct gdt_page, gdt_page);
-
-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
-{
- return per_cpu(gdt_page, cpu).gdt;
-}
-
static inline void pack_gate(gate_desc *gate, unsigned char type,
unsigned long base, unsigned dpl, unsigned flags,
unsigned short seg)
@@ -333,6 +329,28 @@ static inline void set_intr_gate(unsigne
_set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
}
+#define SYS_VECTOR_FREE 0
+#define SYS_VECTOR_ALLOCED 1
+
+extern int first_system_vector;
+extern char system_vectors[];
+
+static inline void alloc_system_vector(int vector)
+{
+ if (system_vectors[vector] == SYS_VECTOR_FREE) {
+ system_vectors[vector] = SYS_VECTOR_ALLOCED;
+ if (first_system_vector > vector)
+ first_system_vector = vector;
+ } else
+ BUG();
+}
+
+static inline void alloc_intr_gate(unsigned int n, void *addr)
+{
+ alloc_system_vector(n);
+ set_intr_gate(n, addr);
+}
+
/*
* This routine sets up an interrupt gate at directory privilege level 3.
*/
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/fixmap.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/fixmap.h 2011-02-01 14:38:38.000000000 +0100
@@ -7,7 +7,58 @@
# include "fixmap_64.h"
#endif
+extern int fixmaps_set;
+
+void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t);
+
+static inline void __set_fixmap(enum fixed_addresses idx,
+ maddr_t phys, pgprot_t flags)
+{
+ xen_set_fixmap(idx, phys, flags);
+}
+
+#define set_fixmap(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL)
+
+/*
+ * Some hardware wants to get fixmapped without caching.
+ */
+#define set_fixmap_nocache(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+
#define clear_fixmap(idx) \
__set_fixmap(idx, 0, __pgprot(0))
+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+extern void __this_fixmap_does_not_exist(void);
+
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-dereference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+{
+ /*
+ * this branch gets completely eliminated after inlining,
+ * except when someone tries to use fixaddr indices in an
+ * illegal way. (such as mixing up address types or using
+ * out-of-range indices).
+ *
+ * If it doesn't get removed, the linker will complain
+ * loudly with a reasonably clear error message..
+ */
+ if (idx >= __end_of_fixed_addresses)
+ __this_fixmap_does_not_exist();
+
+ return __fix_to_virt(idx);
+}
+
+static inline unsigned long virt_to_fix(const unsigned long vaddr)
+{
+ BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+ return __virt_to_fix(vaddr);
+}
#endif
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/fixmap_32.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/fixmap_32.h 2011-02-01 14:38:38.000000000 +0100
@@ -58,10 +58,17 @@ enum fixed_addresses {
#ifdef CONFIG_X86_LOCAL_APIC
FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
-#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
+#ifndef CONFIG_XEN
+#ifdef CONFIG_X86_IO_APIC
FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
#endif
+#else
+ FIX_SHARED_INFO,
+#define NR_FIX_ISAMAPS 256
+ FIX_ISAMAP_END,
+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
+#endif
#ifdef CONFIG_X86_VISWS_APIC
FIX_CO_CPU, /* Cobalt timer */
FIX_CO_APIC, /* Cobalt APIC Redirection Table */
@@ -78,51 +85,38 @@ enum fixed_addresses {
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
#endif
-#ifdef CONFIG_ACPI
- FIX_ACPI_BEGIN,
- FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
-#endif
#ifdef CONFIG_PCI_MMCONFIG
FIX_PCIE_MCFG,
#endif
#ifdef CONFIG_PARAVIRT
FIX_PARAVIRT_BOOTMAP,
#endif
- FIX_SHARED_INFO,
-#define NR_FIX_ISAMAPS 256
- FIX_ISAMAP_END,
- FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
__end_of_permanent_fixed_addresses,
/*
* 256 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
*
- * We round it up to the next 512 pages boundary so that we
+ * We round it up to the next 256 pages boundary so that we
* can have a single pgd entry and a single pte table:
*/
#define NR_FIX_BTMAPS 64
#define FIX_BTMAPS_NESTING 4
- FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
- (__end_of_permanent_fixed_addresses & 511),
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
+ (__end_of_permanent_fixed_addresses & 255),
FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
FIX_WP_TEST,
+#ifdef CONFIG_ACPI
+ FIX_ACPI_BEGIN,
+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
+#endif
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
FIX_OHCI1394_BASE,
#endif
__end_of_fixed_addresses
};
-extern void __set_fixmap(enum fixed_addresses idx,
- maddr_t phys, pgprot_t flags);
extern void reserve_top_address(unsigned long reserve);
-#define set_fixmap(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL)
-/*
- * Some hardware wants to get fixmapped without caching.
- */
-#define set_fixmap_nocache(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
@@ -131,38 +125,5 @@ extern void reserve_top_address(unsigned
#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without tranlation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * this branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
-
- return __fix_to_virt(idx);
-}
-
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
-
#endif /* !__ASSEMBLY__ */
#endif
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/fixmap_64.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/fixmap_64.h 2011-02-01 14:38:38.000000000 +0100
@@ -12,6 +12,7 @@
#define _ASM_FIXMAP_64_H
#include <linux/kernel.h>
+#include <asm/acpi.h>
#include <asm/apicdef.h>
#include <asm/page.h>
#include <asm/vsyscall.h>
@@ -40,7 +41,6 @@ enum fixed_addresses {
VSYSCALL_HPET,
FIX_DBGP_BASE,
FIX_EARLYCON_MEM_BASE,
- FIX_HPET_BASE,
#ifdef CONFIG_X86_LOCAL_APIC
FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
@@ -53,14 +53,21 @@ enum fixed_addresses {
FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
+ MAX_EFI_IO_PAGES - 1,
#endif
+#ifdef CONFIG_PARAVIRT
+ FIX_PARAVIRT_BOOTMAP,
+#else
+ FIX_SHARED_INFO,
+#endif
#ifdef CONFIG_ACPI
FIX_ACPI_BEGIN,
FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
#endif
- FIX_SHARED_INFO,
#define NR_FIX_ISAMAPS 256
FIX_ISAMAP_END,
FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
__end_of_permanent_fixed_addresses,
/*
* 256 temporary boot-time mappings, used by early_ioremap(),
@@ -71,27 +78,12 @@ enum fixed_addresses {
*/
#define NR_FIX_BTMAPS 64
#define FIX_BTMAPS_NESTING 4
- FIX_BTMAP_END =
- __end_of_permanent_fixed_addresses + 512 -
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
(__end_of_permanent_fixed_addresses & 511),
FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- FIX_OHCI1394_BASE,
-#endif
__end_of_fixed_addresses
};
-extern void __set_fixmap(enum fixed_addresses idx,
- unsigned long phys, pgprot_t flags);
-
-#define set_fixmap(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL)
-/*
- * Some hardware wants to get fixmapped without caching.
- */
-#define set_fixmap_nocache(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
-
#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
@@ -100,30 +92,4 @@ extern void __set_fixmap(enum fixed_addr
#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without translation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * this branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
-
- return __fix_to_virt(idx);
-}
-
#endif
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/highmem.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/highmem.h 2011-02-01 14:38:38.000000000 +0100
@@ -73,6 +73,9 @@ struct page *kmap_atomic_to_page(void *p
#define flush_cache_kmaps() do { } while (0)
+extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn);
+
void clear_highpage(struct page *);
static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
{
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/hypercall.h 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/hypercall.h 2011-02-01 14:38:38.000000000 +0100
@@ -332,9 +332,19 @@ static inline int __must_check
HYPERVISOR_grant_table_op(
unsigned int cmd, void *uop, unsigned int count)
{
+ bool fixup = false;
+ int rc;
+
if (arch_use_lazy_mmu_mode())
xen_multicall_flush(false);
- return _hypercall3(int, grant_table_op, cmd, uop, count);
+#ifdef GNTTABOP_map_grant_ref
+ if (cmd == GNTTABOP_map_grant_ref)
+#endif
+ fixup = gnttab_pre_map_adjust(cmd, uop, count);
+ rc = _hypercall3(int, grant_table_op, cmd, uop, count);
+ if (rc == 0 && fixup)
+ rc = gnttab_post_map_adjust(uop, count);
+ return rc;
}
static inline int __must_check
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/hypervisor.h 2011-02-01 14:38:38.000000000 +0100
@@ -35,7 +35,6 @@
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/version.h>
#include <linux/errno.h>
#include <xen/interface/xen.h>
#include <xen/interface/platform.h>
@@ -119,6 +118,8 @@ int xen_create_contiguous_region(
unsigned long vstart, unsigned int order, unsigned int address_bits);
void xen_destroy_contiguous_region(
unsigned long vstart, unsigned int order);
+int early_create_contiguous_region(unsigned long pfn, unsigned int order,
+ unsigned int address_bits);
struct page;
@@ -188,6 +189,29 @@ static inline void xen_multicall_flush(b
#endif /* CONFIG_XEN && !MODULE */
+#ifdef CONFIG_XEN
+
+struct gnttab_map_grant_ref;
+bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
+ unsigned int count);
+#if CONFIG_XEN_COMPAT < 0x030400
+int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
+#else
+static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
+ unsigned int count)
+{
+ BUG();
+ return -ENOSYS;
+}
+#endif
+
+#else /* !CONFIG_XEN */
+
+#define gnttab_pre_map_adjust(...) false
+#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; })
+
+#endif /* CONFIG_XEN */
+
#if defined(CONFIG_X86_64)
#define MULTI_UVMFLAGS_INDEX 2
#define MULTI_UVMDOMID_INDEX 3
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/io.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/io.h 2011-02-01 14:38:38.000000000 +0100
@@ -3,20 +3,140 @@
#define ARCH_HAS_IOREMAP_WC
+#include <linux/compiler.h>
+
+/*
+ * early_ioremap() and early_iounmap() are for temporary early boot-time
+ * mappings, before the real ioremap() is functional.
+ * A boot-time mapping is currently limited to at most 16 pages.
+ */
+#ifndef __ASSEMBLY__
+extern void early_ioremap_init(void);
+extern void early_ioremap_clear(void);
+extern void early_ioremap_reset(void);
+extern void *early_ioremap(unsigned long offset, unsigned long size);
+extern void early_iounmap(void *addr, unsigned long size);
+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+#endif
+
+#define build_mmio_read(name, size, type, reg, barrier) \
+static inline type name(const volatile void __iomem *addr) \
+{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
+:"m" (*(volatile type __force *)addr) barrier); return ret; }
+
+#define build_mmio_write(name, size, type, reg, barrier) \
+static inline void name(type val, volatile void __iomem *addr) \
+{ asm volatile("mov" size " %0,%1": :reg (val), \
+"m" (*(volatile type __force *)addr) barrier); }
+
+build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
+build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
+build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
+
+build_mmio_read(__readb, "b", unsigned char, "=q", )
+build_mmio_read(__readw, "w", unsigned short, "=r", )
+build_mmio_read(__readl, "l", unsigned int, "=r", )
+
+build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
+build_mmio_write(writew, "w", unsigned short, "r", :"memory")
+build_mmio_write(writel, "l", unsigned int, "r", :"memory")
+
+build_mmio_write(__writeb, "b", unsigned char, "q", )
+build_mmio_write(__writew, "w", unsigned short, "r", )
+build_mmio_write(__writel, "l", unsigned int, "r", )
+
+#define readb_relaxed(a) __readb(a)
+#define readw_relaxed(a) __readw(a)
+#define readl_relaxed(a) __readl(a)
+#define __raw_readb __readb
+#define __raw_readw __readw
+#define __raw_readl __readl
+
+#define __raw_writeb __writeb
+#define __raw_writew __writew
+#define __raw_writel __writel
+
+#define mmiowb() barrier()
+
+#ifdef CONFIG_X86_64
+build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
+build_mmio_read(__readq, "q", unsigned long, "=r", )
+build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
+build_mmio_write(__writeq, "q", unsigned long, "r", )
+
+#define readq_relaxed(a) __readq(a)
+#define __raw_readq __readq
+#define __raw_writeq writeq
+
+/* Let people know we have them */
+#define readq readq
+#define writeq writeq
+#endif
+
+#define native_io_delay xen_io_delay
+
#ifdef CONFIG_X86_32
-# include "io_32.h"
+# include "../../io_32.h"
#else
-# include "io_64.h"
+# include "../../io_64.h"
+#endif
+
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+
+/* We will be supplying our own /dev/mem implementation */
+#define ARCH_HAS_DEV_MEM
+
+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+#undef page_to_phys
+#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
+#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
+
+#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
+ (unsigned long)(bv)->bv_offset)
+
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+ (bvec_to_phys(vec1) + (vec1)->bv_len == bvec_to_phys(vec2) \
+ && bvec_to_pseudophys(vec1) + (vec1)->bv_len \
+ == bvec_to_pseudophys(vec2))
+
+#undef virt_to_bus
+#undef bus_to_virt
+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
+#define bus_to_virt(_x) __va(machine_to_phys(_x))
+
+#include <asm/fixmap.h>
+
+#undef __ISA_IO_base
+#undef isa_virt_to_bus
+#undef isa_page_to_bus
+#undef isa_bus_to_virt
+#define isa_virt_to_bus(_x) ({ \
+ unsigned long _va_ = (unsigned long)(_x); \
+ _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \
+ ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \
+ : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); })
+#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
+
#endif
extern void *xlate_dev_mem_ptr(unsigned long phys);
extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
-extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
-extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
-
extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
unsigned long prot_val);
extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
+/*
+ * early_ioremap() and early_iounmap() are for temporary early boot-time
+ * mappings, before the real ioremap() is functional.
+ * A boot-time mapping is currently limited to at most 16 pages.
+ */
+extern void early_ioremap_init(void);
+extern void early_ioremap_clear(void);
+extern void early_ioremap_reset(void);
+extern void *early_ioremap(unsigned long offset, unsigned long size);
+extern void early_iounmap(void *addr, unsigned long size);
+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+
+
#endif /* _ASM_X86_IO_H */
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/irq_vectors.h 2011-02-15 17:27:18.000000000 +0100
@@ -0,0 +1,52 @@
+#ifndef _ASM_IRQ_VECTORS_H
+#define _ASM_IRQ_VECTORS_H
+
+#ifdef CONFIG_X86_32
+# define SYSCALL_VECTOR 0x80
+#else
+# define IA32_SYSCALL_VECTOR 0x80
+#endif
+
+#define RESCHEDULE_VECTOR 0
+#define CALL_FUNCTION_VECTOR 1
+#define NMI_VECTOR 0x02
+#define CALL_FUNC_SINGLE_VECTOR 3
+#define NR_IPIS 4
+
+/*
+ * The maximum number of vectors supported by i386 processors
+ * is limited to 256. For processors other than i386, NR_VECTORS
+ * should be changed accordingly.
+ */
+#define NR_VECTORS 256
+
+#define FIRST_VM86_IRQ 3
+#define LAST_VM86_IRQ 15
+#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
+
+/*
+ * The flat IRQ space is divided into two regions:
+ * 1. A one-to-one mapping of real physical IRQs. This space is only used
+ * if we have physical device-access privilege. This region is at the
+ * start of the IRQ space so that existing device drivers do not need
+ * to be modified to translate physical IRQ numbers into our IRQ space.
+ * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
+ * are bound using the provided bind/unbind functions.
+ */
+
+#define PIRQ_BASE 0
+#if defined(NR_CPUS) && defined(MAX_IO_APICS)
+# if NR_CPUS < MAX_IO_APICS
+# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
+# else
+# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
+# endif
+#endif
+
+#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
+#define NR_DYNIRQS 256
+
+#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
+#define NR_IRQ_VECTORS NR_IRQS
+
+#endif /* _ASM_IRQ_VECTORS_H */
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/irqflags.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/irqflags.h 2011-02-01 14:38:38.000000000 +0100
@@ -118,7 +118,7 @@ static inline void halt(void)
#ifndef CONFIG_X86_64
#define INTERRUPT_RETURN iret
-#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
+#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
__TEST_PENDING ; \
jnz 14f /* process more events if necessary... */ ; \
@@ -177,18 +177,6 @@ static inline void trace_hardirqs_fixup_
#else
#ifdef CONFIG_X86_64
-/*
- * Currently paravirt can't handle swapgs nicely when we
- * don't have a stack we can rely on (such as a user space
- * stack). So we either find a way around these or just fault
- * and emulate if a guest tries to call swapgs directly.
- *
- * Either way, this is a good way to document that we don't
- * have a reliable stack. x86_64 only.
- */
-#define SWAPGS_UNSAFE_STACK swapgs
-#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
-#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
TRACE_IRQS_ON; \
@@ -200,24 +188,6 @@ static inline void trace_hardirqs_fixup_
TRACE_IRQS_OFF;
#else
-#define ARCH_TRACE_IRQS_ON \
- pushl %eax; \
- pushl %ecx; \
- pushl %edx; \
- call trace_hardirqs_on; \
- popl %edx; \
- popl %ecx; \
- popl %eax;
-
-#define ARCH_TRACE_IRQS_OFF \
- pushl %eax; \
- pushl %ecx; \
- pushl %edx; \
- call trace_hardirqs_off; \
- popl %edx; \
- popl %ecx; \
- popl %eax;
-
#define ARCH_LOCKDEP_SYS_EXIT \
pushl %eax; \
pushl %ecx; \
@@ -231,8 +201,8 @@ static inline void trace_hardirqs_fixup_
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
-# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
-# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
+# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
+# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
#else
# define TRACE_IRQS_ON
# define TRACE_IRQS_OFF
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/mmu_context.h 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/mmu_context.h 2011-02-01 14:38:38.000000000 +0100
@@ -1,5 +1,42 @@
+#ifndef __ASM_X86_MMU_CONTEXT_H
+#define __ASM_X86_MMU_CONTEXT_H
+
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+void arch_exit_mmap(struct mm_struct *mm);
+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+
+void mm_pin(struct mm_struct *mm);
+void mm_unpin(struct mm_struct *mm);
+void mm_pin_all(void);
+
+static inline void xen_activate_mm(struct mm_struct *prev,
+ struct mm_struct *next)
+{
+ if (!PagePinned(virt_to_page(next->pgd)))
+ mm_pin(next);
+}
+
+/*
+ * Used for LDT copy/destruction.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+
#ifdef CONFIG_X86_32
# include "mmu_context_32.h"
#else
# include "mmu_context_64.h"
#endif
+
+#define activate_mm(prev, next) \
+do { \
+ xen_activate_mm(prev, next); \
+ switch_mm((prev), (next), NULL); \
+} while (0);
+
+
+#endif /* __ASM_X86_MMU_CONTEXT_H */
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/mmu_context_32.h 2011-02-01 14:38:38.000000000 +0100
@@ -1,32 +1,6 @@
#ifndef __I386_SCHED_H
#define __I386_SCHED_H
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-
-void arch_exit_mmap(struct mm_struct *mm);
-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
-
-void mm_pin(struct mm_struct *mm);
-void mm_unpin(struct mm_struct *mm);
-void mm_pin_all(void);
-
-static inline void xen_activate_mm(struct mm_struct *prev,
- struct mm_struct *next)
-{
- if (!PagePinned(virt_to_page(next->pgd)))
- mm_pin(next);
-}
-
-/*
- * Used for LDT copy/destruction.
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void destroy_context(struct mm_struct *mm);
-
-
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#if 0 /* XEN: no lazy tlb */
@@ -107,10 +81,4 @@ static inline void switch_mm(struct mm_s
#define deactivate_mm(tsk, mm) \
asm("movl %0,%%gs": :"r" (0));
-#define activate_mm(prev, next) \
-do { \
- xen_activate_mm(prev, next); \
- switch_mm((prev), (next), NULL); \
-} while (0)
-
#endif
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/mmu_context_64.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/mmu_context_64.h 2011-02-01 14:38:38.000000000 +0100
@@ -1,23 +1,6 @@
#ifndef __X86_64_MMU_CONTEXT_H
#define __X86_64_MMU_CONTEXT_H
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/pgalloc.h>
-#include <asm/page.h>
-#include <asm/pda.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-
-void arch_exit_mmap(struct mm_struct *mm);
-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
-
-/*
- * possibly do the LDT unload here?
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void destroy_context(struct mm_struct *mm);
-
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
@@ -58,10 +41,6 @@ static inline void __prepare_arch_switch
}
}
-extern void mm_pin(struct mm_struct *mm);
-extern void mm_unpin(struct mm_struct *mm);
-void mm_pin_all(void);
-
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -124,11 +103,4 @@ do { \
asm volatile("movl %0,%%fs"::"r"(0)); \
} while (0)
-static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
-{
- if (!PagePinned(virt_to_page(next->pgd)))
- mm_pin(next);
- switch_mm(prev, next, NULL);
-}
-
#endif
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pci.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/pci.h 2011-02-01 14:38:38.000000000 +0100
@@ -21,6 +21,8 @@ struct pci_sysdata {
#endif
};
+extern int pci_routeirq;
+
/* scan a bus after allocating a pci_sysdata for it */
extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
int node);
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgalloc.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgalloc.h 2011-02-01 14:38:38.000000000 +0100
@@ -7,6 +7,9 @@
#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
+static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
+
static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable.h 2011-02-07 15:40:30.000000000 +0100
@@ -13,11 +13,12 @@
#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT 7 /* on 4KB pages */
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
-#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
+#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
+#define _PAGE_BIT_UNUSED2 10
+#define _PAGE_BIT_IO 11 /* Mapped page is I/O or foreign and
* has no associated page struct. */
-#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
-#define _PAGE_BIT_UNUSED3 11
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
/* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -28,34 +29,31 @@
/* if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
-/*
- * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
- * sign-extended value on 32-bit with all 1's in the upper word,
- * which preserves the upper pte values on 64-bit ptes:
- */
-#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
-#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
-#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
-#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
-#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
-#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
-#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
-#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
-#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
-#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
-#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
-#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
-#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
-#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
+#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
+#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
+#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
+#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
+#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
+#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+#define _PAGE_UNUSED2 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED2)
+#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
+#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
+#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
+#define __HAVE_ARCH_PTE_SPECIAL
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
+#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#else
-#define _PAGE_NX 0
+#define _PAGE_NX (_AT(pteval_t, 0))
#endif
-#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
-#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
+#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#ifndef __ASSEMBLY__
#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
@@ -71,8 +69,8 @@ extern unsigned int __kernel_page_user;
_PAGE_DIRTY | __kernel_page_user)
/* Set of bits not changed in pte_modify */
-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
- _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
/*
* PAT settings are part of the hypervisor interface, which sets the
@@ -102,19 +100,9 @@ extern unsigned int __kernel_page_user;
#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
_PAGE_ACCESSED)
-#ifdef CONFIG_X86_32
-#define _PAGE_KERNEL_EXEC \
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
-#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
-
-#ifndef __ASSEMBLY__
-extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
-#endif /* __ASSEMBLY__ */
-#else
#define __PAGE_KERNEL_EXEC \
(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
-#endif
#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
@@ -125,25 +113,22 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
-/*
- * We don't support GLOBAL page in xenolinux64
- */
-#define MAKE_GLOBAL(x) __pgprot((x))
-
-#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
-#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
-#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
-#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
-#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
-#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
+#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
+#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
+#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
+#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
+#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
/* xwr */
#define __P000 PAGE_NONE
@@ -182,27 +167,27 @@ extern struct list_head pgd_list;
*/
static inline int pte_dirty(pte_t pte)
{
- return __pte_val(pte) & _PAGE_DIRTY;
+ return pte_flags(pte) & _PAGE_DIRTY;
}
static inline int pte_young(pte_t pte)
{
- return __pte_val(pte) & _PAGE_ACCESSED;
+ return pte_flags(pte) & _PAGE_ACCESSED;
}
static inline int pte_write(pte_t pte)
{
- return __pte_val(pte) & _PAGE_RW;
+ return pte_flags(pte) & _PAGE_RW;
}
static inline int pte_file(pte_t pte)
{
- return __pte_val(pte) & _PAGE_FILE;
+ return pte_flags(pte) & _PAGE_FILE;
}
static inline int pte_huge(pte_t pte)
{
- return __pte_val(pte) & _PAGE_PSE;
+ return pte_flags(pte) & _PAGE_PSE;
}
static inline int pte_global(pte_t pte)
@@ -212,12 +197,12 @@ static inline int pte_global(pte_t pte)
static inline int pte_exec(pte_t pte)
{
- return !(__pte_val(pte) & _PAGE_NX);
+ return !(pte_flags(pte) & _PAGE_NX);
}
static inline int pte_special(pte_t pte)
{
- return 0;
+ return pte_flags(pte) & _PAGE_SPECIAL;
}
static inline int pmd_large(pmd_t pte)
@@ -228,22 +213,22 @@ static inline int pmd_large(pmd_t pte)
static inline pte_t pte_mkclean(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
+ return __pte_ma(__pte_val(pte) & ~_PAGE_DIRTY);
}
static inline pte_t pte_mkold(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
+ return __pte_ma(__pte_val(pte) & ~_PAGE_ACCESSED);
}
static inline pte_t pte_wrprotect(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
+ return __pte_ma(__pte_val(pte) & ~_PAGE_RW);
}
static inline pte_t pte_mkexec(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
+ return __pte_ma(__pte_val(pte) & ~_PAGE_NX);
}
static inline pte_t pte_mkdirty(pte_t pte)
@@ -268,7 +253,7 @@ static inline pte_t pte_mkhuge(pte_t pte
static inline pte_t pte_clrhuge(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
+ return __pte_ma(__pte_val(pte) & ~_PAGE_PSE);
}
static inline pte_t pte_mkglobal(pte_t pte)
@@ -283,35 +268,46 @@ static inline pte_t pte_clrglobal(pte_t
static inline pte_t pte_mkspecial(pte_t pte)
{
- return pte;
+ return __pte_ma(__pte_val(pte) | _PAGE_SPECIAL);
}
extern pteval_t __supported_pte_mask;
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
- return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
- pgprot_val(pgprot)) & __supported_pte_mask);
+ pgprotval_t prot = pgprot_val(pgprot);
+
+ if (prot & _PAGE_PRESENT)
+ prot &= __supported_pte_mask;
+ return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
}
static inline pte_t pfn_pte_ma(phys_addr_t page_nr, pgprot_t pgprot)
{
- return __pte_ma(((page_nr << PAGE_SHIFT) |
- pgprot_val(pgprot)) & __supported_pte_mask);
+ pgprotval_t prot = pgprot_val(pgprot);
+
+ if (prot & _PAGE_PRESENT)
+ prot &= __supported_pte_mask;
+ return __pte_ma((page_nr << PAGE_SHIFT) | prot);
}
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
- return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
- pgprot_val(pgprot)) & __supported_pte_mask);
+ pgprotval_t prot = pgprot_val(pgprot);
+
+ if (prot & _PAGE_PRESENT)
+ prot &= __supported_pte_mask;
+ return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
}
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
- pteval_t val = pte_val(pte);
+ pgprotval_t prot = pgprot_val(newprot);
+ pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
- val &= _PAGE_CHG_MASK;
- val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
+ if (prot & _PAGE_PRESENT)
+ prot &= __supported_pte_mask;
+ val |= prot & ~_PAGE_CHG_MASK;
return __pte(val);
}
@@ -325,9 +321,11 @@ static inline pgprot_t pgprot_modify(pgp
return __pgprot(preservebits | addbits);
}
-#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
+#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
-#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
+#define canon_pgprot(p) __pgprot(pgprot_val(p) & _PAGE_PRESENT \
+ ? pgprot_val(p) & __supported_pte_mask \
+ : pgprot_val(p))
#ifndef __ASSEMBLY__
#define __HAVE_PHYS_MEM_ACCESS_PROT
@@ -338,6 +336,17 @@ int phys_mem_access_prot_allowed(struct
unsigned long size, pgprot_t *vma_prot);
#endif
+/* Install a pte for a particular vaddr in kernel space. */
+void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+
+#ifndef CONFIG_XEN
+extern void native_pagetable_setup_start(pgd_t *base);
+extern void native_pagetable_setup_done(pgd_t *base);
+#else
+static inline void xen_pagetable_setup_start(pgd_t *base) {}
+static inline void xen_pagetable_setup_done(pgd_t *base) {}
+#endif
+
#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
@@ -373,6 +382,26 @@ int phys_mem_access_prot_allowed(struct
# include "pgtable_64.h"
#endif
+/*
+ * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
+ *
+ * this macro returns the index of the entry in the pgd page which would
+ * control the given virtual address
+ */
+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+
+/*
+ * pgd_offset() returns a (pgd_t *)
+ * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
+ */
+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+/*
+ * a shortcut which implies the use of the kernel's pgd, instead
+ * of a process's
+ */
+#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
+
+
#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
@@ -383,8 +412,15 @@ enum {
PG_LEVEL_4K,
PG_LEVEL_2M,
PG_LEVEL_1G,
+ PG_LEVEL_NUM
};
+#ifdef CONFIG_PROC_FS
+extern void update_page_count(int level, unsigned long pages);
+#else
+static inline void update_page_count(int level, unsigned long pages) { }
+#endif
+
/*
* Helper function that returns the kernel pagetable entry controlling
* the virtual address 'address'. NULL means no pagetable entry present.
@@ -441,6 +477,8 @@ static inline void xen_pte_clear(struct
* race with other CPU's that might be updating the dirty
* bit at the same time.
*/
+struct vm_area_struct;
+
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
@@ -523,9 +561,6 @@ static inline void clone_pgd_range(pgd_t
memcpy(dst, src, count * sizeof(pgd_t));
}
-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
- xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
-
#define arbitrary_virt_to_machine(va) \
({ \
unsigned int __lvl; \
@@ -548,6 +583,34 @@ struct page *kmap_atomic_to_page(void *)
#define ptep_to_machine(ptep) virt_to_machine(ptep)
#endif
+#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
+static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+#if CONFIG_XEN_COMPAT < 0x030300
+ if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)))
+ return ptep_get_and_clear(mm, addr, ptep);
+#endif
+ return *ptep;
+}
+
+static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ mmu_update_t u;
+
+#if CONFIG_XEN_COMPAT < 0x030300
+ if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) {
+ set_pte_at(mm, addr, ptep, pte);
+ return;
+ }
+#endif
+ u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD;
+ u.val = __pte_val(pte);
+ if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
#include <asm-generic/pgtable.h>
#include <xen/features.h>
@@ -573,10 +636,6 @@ int create_lookup_pte_addr(struct mm_str
unsigned long address,
uint64_t *ptep);
-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable);
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_H */
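The ptep_modify_prot_start()/ptep_modify_prot_commit() pair added above brackets a read-modify-write of a live PTE: on hypervisors advertising XENFEAT_mmu_pt_update_preserve_ad the old entry is left in place and the new value is handed to Xen with MMU_PT_UPDATE_PRESERVE_AD, so the accessed/dirty bits are not lost to a clear-and-rewrite; on older Xen it falls back to get-and-clear plus set_pte_at(). A minimal usage sketch, not part of the patch (change_one_pte() is a hypothetical helper; the caller is assumed to hold the page table lock):

	static void change_one_pte(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, pgprot_t newprot)
	{
		/* snapshot the old entry (cleared on Xen lacking preserve-AD) */
		pte_t old_pte = ptep_modify_prot_start(mm, addr, ptep);
		pte_t new_pte = pte_modify(old_pte, newprot);

		/* write back in one hypercall, preserving accessed/dirty */
		ptep_modify_prot_commit(mm, addr, ptep, new_pte);
	}
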
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable-3level.h 2011-02-01 14:38:38.000000000 +0100
@@ -14,11 +14,11 @@
#define pmd_ERROR(e) \
printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
__FILE__, __LINE__, &(e), __pmd_val(e), \
- (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
+ (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
#define pgd_ERROR(e) \
printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
__FILE__, __LINE__, &(e), __pgd_val(e), \
- (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
+ (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
static inline int pud_none(pud_t pud)
{
@@ -27,7 +27,7 @@ static inline int pud_none(pud_t pud)
}
static inline int pud_bad(pud_t pud)
{
- return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
+ return (__pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
}
static inline int pud_present(pud_t pud)
@@ -102,9 +102,9 @@ static inline void pud_clear(pud_t *pudp
xen_tlb_flush();
}
-#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
+#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK))
-#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
+#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
/* Find an entry in the second-level page table.. */
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-01 14:38:38.000000000 +0100
@@ -89,10 +89,10 @@ extern unsigned long pg0[];
/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
can temporarily clear it. */
#define pmd_present(x) (__pmd_val(x))
-#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
+#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
#else
#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
-#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
#endif
@@ -119,26 +119,6 @@ extern unsigned long pg0[];
*/
#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-/*
- * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
- *
- * this macro returns the index of the entry in the pgd page which would
- * control the given virtual address
- */
-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
-#define pgd_index_k(addr) pgd_index((addr))
-
-/*
- * pgd_offset() returns a (pgd_t *)
- * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
- */
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
-
-/*
- * a shortcut which implies the use of the kernel's pgd, instead
- * of a process's
- */
-#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
static inline int pud_large(pud_t pud) { return 0; }
@@ -165,7 +145,7 @@ static inline int pud_large(pud_t pud) {
#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
#define pmd_page_vaddr(pmd) \
- ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
+ ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
#if defined(CONFIG_HIGHPTE)
#define pte_offset_map(dir, address) \
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-01 14:38:38.000000000 +0100
@@ -23,6 +23,8 @@ extern void xen_init_pt(void);
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
+extern pmd_t level2_fixmap_pgt[512];
+extern pmd_t level2_ident_pgt[512];
extern pgd_t init_level4_pgt[];
#define swapper_pg_dir init_level4_pgt
@@ -79,6 +81,9 @@ extern void paging_init(void);
struct mm_struct;
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
+
+
#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
static inline void xen_set_pte(pte_t *ptep, pte_t pte)
@@ -145,7 +150,7 @@ static inline void xen_pgd_clear(pgd_t *
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-#define MAXMEM _AC(0x0000006fffffffff, UL)
+#define MAXMEM _AC(0x000004ffffffffff, UL)
#define VMALLOC_START _AC(0xffffc20000000000, UL)
#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
#define VMEMMAP_START _AC(0xffffe20000000000, UL)
@@ -157,17 +162,17 @@ static inline void xen_pgd_clear(pgd_t *
static inline int pgd_bad(pgd_t pgd)
{
- return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+ return (__pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
}
static inline int pud_bad(pud_t pud)
{
- return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+ return (__pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
}
static inline int pmd_bad(pmd_t pmd)
{
- return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+ return (__pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
}
#define pte_none(x) (!(x).pte)
@@ -175,7 +180,7 @@ static inline int pmd_bad(pmd_t pmd)
#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
-#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
+#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
#define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
__pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
@@ -200,11 +205,8 @@ static inline int pmd_bad(pmd_t pmd)
* Level 4 access.
*/
#define pgd_page_vaddr(pgd) \
- ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
+ ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
-#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
#define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
static inline int pgd_large(pgd_t pgd) { return 0; }
#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
@@ -226,7 +228,7 @@ static inline int pud_large(pud_t pte)
}
/* PMD - Level 2 access */
-#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:42:13.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:44:23.000000000 +0100
@@ -144,7 +144,7 @@ extern __u32 cleared_cpu_caps[NCAPINTS
#ifdef CONFIG_SMP
DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
#define cpu_data(cpu) per_cpu(cpu_info, cpu)
-#define current_cpu_data cpu_data(smp_processor_id())
+#define current_cpu_data __get_cpu_var(cpu_info)
#else
#define cpu_data(cpu) boot_cpu_data
#define current_cpu_data boot_cpu_data
@@ -163,7 +163,7 @@ static inline int hlt_works(int cpu)
extern void cpu_detect(struct cpuinfo_x86 *c);
-extern void identify_cpu(struct cpuinfo_x86 *);
+extern void early_cpu_init(void);
extern void identify_boot_cpu(void);
extern void identify_secondary_cpu(struct cpuinfo_x86 *);
extern void print_cpu_info(struct cpuinfo_x86 *);
@@ -277,15 +277,11 @@ struct tss_struct {
struct thread_struct *io_bitmap_owner;
/*
- * Pad the TSS to be cacheline-aligned (size is 0x100):
- */
- unsigned long __cacheline_filler[35];
- /*
* .. and then another 0x100 bytes for the emergency kernel stack:
*/
unsigned long stack[64];
-} __attribute__((packed));
+} ____cacheline_aligned;
DECLARE_PER_CPU(struct tss_struct, init_tss);
@@ -677,11 +673,36 @@ static inline void __sti_mwait(unsigned
extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
-extern int force_mwait;
-
extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern unsigned long boot_option_idle_override;
+extern unsigned long idle_halt;
+extern unsigned long idle_nomwait;
+
+#ifndef CONFIG_XEN
+/*
+ * on systems with caches, caches must be flushed as the absolute
+ * last instruction before going into a suspended halt. Otherwise,
+ * dirty data can linger in the cache and become stale on resume,
+ * leading to strange errors.
+ *
+ * perform a variety of operations to guarantee that the compiler
+ * will not reorder instructions. wbinvd itself is serializing
+ * so the processor will not reorder.
+ *
+ * Systems without cache can just go into halt.
+ */
+static inline void wbinvd_halt(void)
+{
+ mb();
+ /* check for clflush to determine if wbinvd is legal */
+ if (cpu_has_clflush)
+ asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
+ else
+ while (1)
+ halt();
+}
+#endif
extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/smp.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/smp.h 2011-02-01 14:38:38.000000000 +0100
@@ -25,25 +25,18 @@ extern cpumask_t cpu_initialized;
extern void (*mtrr_hook)(void);
extern void zap_low_mappings(void);
+extern int __cpuinit get_local_pda(int cpu);
+
extern unsigned int num_processors;
extern cpumask_t cpu_initialized;
#ifndef CONFIG_XEN
-#ifdef CONFIG_SMP
-extern u16 x86_cpu_to_apicid_init[];
-extern u16 x86_bios_cpu_apicid_init[];
-extern void *x86_cpu_to_apicid_early_ptr;
-extern void *x86_bios_cpu_apicid_early_ptr;
-#else
-#define x86_cpu_to_apicid_early_ptr NULL
-#define x86_bios_cpu_apicid_early_ptr NULL
-#endif
-
DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
DECLARE_PER_CPU(cpumask_t, cpu_core_map);
DECLARE_PER_CPU(u16, cpu_llc_id);
-DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
-DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
+
+DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
+DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
#endif
#ifdef CONFIG_SMP
@@ -64,9 +57,9 @@ struct smp_ops {
void (*smp_send_stop)(void);
void (*smp_send_reschedule)(int cpu);
- int (*smp_call_function_mask)(cpumask_t mask,
- void (*func)(void *info), void *info,
- int wait);
+
+ void (*send_call_func_ipi)(cpumask_t mask);
+ void (*send_call_func_single_ipi)(int cpu);
};
/* Globals due to paravirt */
@@ -104,11 +97,14 @@ static inline void smp_send_reschedule(i
smp_ops.smp_send_reschedule(cpu);
}
-static inline int smp_call_function_mask(cpumask_t mask,
- void (*func) (void *info), void *info,
- int wait)
+static inline void arch_send_call_function_single_ipi(int cpu)
+{
+ smp_ops.send_call_func_single_ipi(cpu);
+}
+
+static inline void arch_send_call_function_ipi(cpumask_t mask)
{
- return smp_ops.smp_call_function_mask(mask, func, info, wait);
+ smp_ops.send_call_func_ipi(mask);
}
void native_smp_prepare_boot_cpu(void);
@@ -120,23 +116,19 @@ int native_cpu_up(unsigned int cpunum);
void xen_smp_send_stop(void);
void xen_smp_send_reschedule(int cpu);
-int xen_smp_call_function_mask(cpumask_t mask,
- void (*func) (void *info), void *info,
- int wait);
+void xen_send_call_func_ipi(cpumask_t mask);
+void xen_send_call_func_single_ipi(int cpu);
#define smp_send_stop xen_smp_send_stop
#define smp_send_reschedule xen_smp_send_reschedule
-#define smp_call_function_mask xen_smp_call_function_mask
-
-extern void prefill_possible_map(void);
+#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi
+#define arch_send_call_function_ipi xen_send_call_func_ipi
#endif /* CONFIG_XEN */
extern int __cpu_disable(void);
extern void __cpu_die(unsigned int cpu);
-extern void prefill_possible_map(void);
-
void smp_store_cpu_info(int id);
#define cpu_physical_id(cpu) (cpu)
@@ -147,6 +139,14 @@ static inline int num_booting_cpus(void)
}
#endif /* CONFIG_SMP */
+#if defined(CONFIG_SMP) && (defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_XEN))
+extern void prefill_possible_map(void);
+#else
+static inline void prefill_possible_map(void)
+{
+}
+#endif
+
extern unsigned disabled_cpus __cpuinitdata;
#ifdef CONFIG_X86_32_SMP
@@ -214,12 +214,8 @@ static inline int hard_smp_processor_id(
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_HOTPLUG_CPU
-extern void cpu_exit_clear(void);
extern void cpu_uninit(void);
#endif
-extern void smp_alloc_memory(void);
-extern void lock_ipi_call_lock(void);
-extern void unlock_ipi_call_lock(void);
#endif /* __ASSEMBLY__ */
#endif
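The smp_ops change above follows the 2.6.27 generic IPI rework: the architecture no longer implements smp_call_function_mask() itself; kernel/smp.c queues the call data and only asks the architecture to raise the IPI through arch_send_call_function_ipi() / arch_send_call_function_single_ipi(). Callers keep using the generic entry points; a sketch of what ends up in these hooks (illustrative only, do_remote_work() and kick_cpu() are hypothetical):

	static void do_remote_work(void *info)
	{
		/* runs on the target CPU in IPI context */
	}

	static void kick_cpu(int cpu)
	{
		/* run on one remote CPU and wait for it to finish */
		smp_call_function_single(cpu, do_remote_work, NULL, 1);

		/* run on all other online CPUs */
		smp_call_function(do_remote_work, NULL, 1);
	}
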
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/spinlock.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/spinlock.h 2011-02-01 14:38:38.000000000 +0100
@@ -38,6 +38,8 @@
# define UNLOCK_LOCK_PREFIX
#endif
+#ifdef TICKET_SHIFT
+
#include <asm/irqflags.h>
int xen_spinlock_init(unsigned int cpu);
@@ -65,14 +67,14 @@ void xen_spin_kick(raw_spinlock_t *, uns
* much between them in performance though, especially as locks are out of line.
*/
#if TICKET_SHIFT == 8
-#define __raw_spin_lock_preamble \
+#define __ticket_spin_lock_preamble \
asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
"cmpb %h0, %b0\n\t" \
"sete %1" \
: "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
: "0" (0x0100) \
: "memory", "cc")
-#define __raw_spin_lock_body \
+#define __ticket_spin_lock_body \
asm("1:\t" \
"cmpb %h0, %b0\n\t" \
"je 2f\n\t" \
@@ -86,7 +88,7 @@ void xen_spin_kick(raw_spinlock_t *, uns
: "+Q" (token), "+g" (count) \
: "m" (lock->slock) \
: "memory", "cc")
-#define __raw_spin_unlock_body \
+#define __ticket_spin_unlock_body \
asm(UNLOCK_LOCK_PREFIX "incb %2\n\t" \
"movzwl %2, %0\n\t" \
"cmpb %h0, %b0\n\t" \
@@ -95,7 +97,7 @@ void xen_spin_kick(raw_spinlock_t *, uns
: \
: "memory", "cc")
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
{
int tmp, new;
@@ -114,7 +116,7 @@ static __always_inline int __raw_spin_tr
return tmp;
}
#elif TICKET_SHIFT == 16
-#define __raw_spin_lock_preamble \
+#define __ticket_spin_lock_preamble \
do { \
unsigned int tmp; \
asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
@@ -126,7 +128,7 @@ static __always_inline int __raw_spin_tr
: "0" (0x00010000) \
: "memory", "cc"); \
} while (0)
-#define __raw_spin_lock_body \
+#define __ticket_spin_lock_body \
do { \
unsigned int tmp; \
asm("shldl $16, %0, %2\n" \
@@ -144,7 +146,7 @@ static __always_inline int __raw_spin_tr
: "m" (lock->slock) \
: "memory", "cc"); \
} while (0)
-#define __raw_spin_unlock_body \
+#define __ticket_spin_unlock_body \
do { \
unsigned int tmp; \
asm(UNLOCK_LOCK_PREFIX "incw %2\n\t" \
@@ -158,7 +160,7 @@ static __always_inline int __raw_spin_tr
: "memory", "cc"); \
} while (0)
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
{
int tmp;
int new;
@@ -181,27 +183,27 @@ static __always_inline int __raw_spin_tr
}
#endif
-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
{
int tmp = ACCESS_ONCE(lock->slock);
return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
}
-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
{
int tmp = ACCESS_ONCE(lock->slock);
return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
}
-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
{
unsigned int token, count;
unsigned int flags = __raw_local_irq_save();
bool free;
- __raw_spin_lock_preamble;
+ __ticket_spin_lock_preamble;
if (likely(free)) {
raw_local_irq_restore(flags);
return;
@@ -210,41 +212,154 @@ static __always_inline void __raw_spin_l
raw_local_irq_restore(flags);
do {
count = 1 << 10;
- __raw_spin_lock_body;
+ __ticket_spin_lock_body;
} while (unlikely(!count) && !xen_spin_wait(lock, &token, flags));
}
-static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
- unsigned long flags)
+static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock,
+ unsigned long flags)
{
unsigned int token, count;
bool free;
- __raw_spin_lock_preamble;
+ __ticket_spin_lock_preamble;
if (likely(free))
return;
token = xen_spin_adjust(lock, token);
do {
count = 1 << 10;
- __raw_spin_lock_body;
+ __ticket_spin_lock_body;
} while (unlikely(!count) && !xen_spin_wait(lock, &token, flags));
}
-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
{
unsigned int token;
bool kick;
- __raw_spin_unlock_body;
+ __ticket_spin_unlock_body;
if (kick)
xen_spin_kick(lock, token);
}
#ifndef XEN_SPINLOCK_SOURCE
-#undef __raw_spin_lock_preamble
-#undef __raw_spin_lock_body
-#undef __raw_spin_unlock_body
+#undef __ticket_spin_lock_preamble
+#undef __ticket_spin_lock_body
+#undef __ticket_spin_unlock_body
+#endif
+
+#define __raw_spin(n) __ticket_spin_##n
+
+#else /* TICKET_SHIFT */
+
+static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
+static inline void xen_spinlock_cleanup(unsigned int cpu) {}
+
+/*
+ * Define virtualization-friendly old-style lock byte lock, for use in
+ * pv_lock_ops if desired.
+ *
+ * This differs from the pre-2.6.24 spinlock by always using xchgb
+ * rather than decb to take the lock; this allows it to use a
+ * zero-initialized lock structure. It also maintains a 1-byte
+ * contention counter, so that we can implement
+ * __byte_spin_is_contended.
+ */
+struct __byte_spinlock {
+ s8 lock;
+#if NR_CPUS < 256
+ s8 spinners;
+#else
+#error NR_CPUS >= 256 support not implemented
#endif
+};
+
+static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ return bl->lock != 0;
+}
+
+static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ return bl->spinners != 0;
+}
+
+static inline void __byte_spin_lock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ s8 val = 1;
+
+ asm("1: xchgb %1, %0\n"
+ " test %1,%1\n"
+ " jz 3f\n"
+ " " LOCK_PREFIX "incb %2\n"
+ "2: rep;nop\n"
+ " cmpb $1, %0\n"
+ " je 2b\n"
+ " " LOCK_PREFIX "decb %2\n"
+ " jmp 1b\n"
+ "3:"
+ : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
+}
+
+#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock)
+
+static inline int __byte_spin_trylock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ u8 old = 1;
+
+ asm("xchgb %1,%0"
+ : "+m" (bl->lock), "+q" (old) : : "memory");
+
+ return old == 0;
+}
+
+static inline void __byte_spin_unlock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ smp_wmb();
+ bl->lock = 0;
+}
+
+#define __raw_spin(n) __byte_spin_##n
+
+#endif /* TICKET_SHIFT */
+
+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+{
+ return __raw_spin(is_locked)(lock);
+}
+
+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+{
+ return __raw_spin(is_contended)(lock);
+}
+
+static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+ __raw_spin(lock)(lock);
+}
+
+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+ return __raw_spin(trylock)(lock);
+}
+
+static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+ __raw_spin(unlock)(lock);
+}
+
+static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
+ unsigned long flags)
+{
+ __raw_spin(lock_flags)(lock, flags);
+}
+
+#undef __raw_spin
static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
{
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/spinlock_types.h 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/spinlock_types.h 2011-02-01 14:38:38.000000000 +0100
@@ -11,6 +11,10 @@ typedef union {
unsigned int slock;
struct {
/*
+ * Xen versions prior to 3.2.x have a race condition with HYPERVISOR_poll().
+ */
+#if CONFIG_XEN_COMPAT >= 0x030200
+/*
* On Xen we support a single level of interrupt re-enabling per lock. Hence
* we can have twice as many outstanding tickets. Thus the cut-off for using
* byte register pairs must be at half the number of CPUs.
@@ -22,6 +26,7 @@ typedef union {
# define TICKET_SHIFT 16
u16 cur, seq;
#endif
+#endif
};
} raw_spinlock_t;
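For reference, the __ticket_spin_*() helpers renamed in the spinlock.h hunk above all work on this encoding: the low TICKET_SHIFT bits of slock hold the ticket currently being served (cur) and the bits above them hold the next ticket to hand out (seq); the lock is free when both halves match. A small decoding example (illustrative only, assuming the TICKET_SHIFT == 8 layout):

	unsigned int tmp = 0x0502;	/* cur = 0x02, seq = 0x05 */
	int locked    = !!(((tmp >> 8) ^ tmp) & 0xff);		/* 1: lock is held */
	int contended = ((((tmp >> 8) - tmp) & 0xff) > 1);	/* 1: holder plus two waiters */
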
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/system.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/system.h 2011-03-03 15:58:55.000000000 +0100
@@ -68,10 +68,12 @@ do { \
[next] "d" (next)); \
} while (0)
+#ifndef CONFIG_XEN
/*
* disable hlt during certain critical i/o operations
*/
#define HAVE_DISABLE_HLT
+#endif
#else
#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
@@ -137,7 +139,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
-extern void load_gs_index(unsigned);
+extern void xen_load_gs_index(unsigned);
/*
* Load a segment. Fall back on loading the zero
@@ -154,14 +156,14 @@ extern void load_gs_index(unsigned);
"jmp 2b\n" \
".previous\n" \
_ASM_EXTABLE(1b,3b) \
- : :"r" (value), "r" (0))
+ : :"r" (value), "r" (0) : "memory")
/*
* Save a segment register away
*/
#define savesegment(seg, value) \
- asm volatile("mov %%" #seg ",%0":"=rm" (value))
+ asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
static inline unsigned long get_limit(unsigned long segment)
{
@@ -269,6 +271,7 @@ static inline void xen_wbinvd(void)
#ifdef CONFIG_X86_64
#define read_cr8() (xen_read_cr8())
#define write_cr8(x) (xen_write_cr8(x))
+#define load_gs_index xen_load_gs_index
#endif
/* Clear the 'TS' bit */
@@ -287,13 +290,12 @@ static inline void clflush(volatile void
void disable_hlt(void);
void enable_hlt(void);
-extern int es7000_plat;
void cpu_idle_wait(void);
extern unsigned long arch_align_stack(unsigned long sp);
extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
-void default_idle(void);
+void xen_idle(void);
/*
* Force strict CPU ordering.
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/xor_64.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/xor_64.h 2011-02-01 14:38:38.000000000 +0100
@@ -1,3 +1,6 @@
+#ifndef ASM_X86__XOR_64_H
+#define ASM_X86__XOR_64_H
+
/*
* x86-64 changes / gcc fixes from Andi Kleen.
* Copyright 2002 Andi Kleen, SuSE Labs.
@@ -330,3 +333,5 @@ do { \
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+
+#endif /* ASM_X86__XOR_64_H */
--- head-2011-03-11.orig/arch/x86/include/mach-xen/irq_vectors.h 2008-09-25 13:55:32.000000000 +0200
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,125 +0,0 @@
-/*
- * This file should contain #defines for all of the interrupt vector
- * numbers used by this architecture.
- *
- * In addition, there are some standard defines:
- *
- * FIRST_EXTERNAL_VECTOR:
- * The first free place for external interrupts
- *
- * SYSCALL_VECTOR:
- * The IRQ vector a syscall makes the user to kernel transition
- * under.
- *
- * TIMER_IRQ:
- * The IRQ number the timer interrupt comes in at.
- *
- * NR_IRQS:
- * The total number of interrupt vectors (including all the
- * architecture specific interrupts) needed.
- *
- */
-#ifndef _ASM_IRQ_VECTORS_H
-#define _ASM_IRQ_VECTORS_H
-
-/*
- * IDT vectors usable for external interrupt sources start
- * at 0x20:
- */
-#define FIRST_EXTERNAL_VECTOR 0x20
-
-#define SYSCALL_VECTOR 0x80
-
-/*
- * Vectors 0x20-0x2f are used for ISA interrupts.
- */
-
-#if 0
-/*
- * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
- *
- * some of the following vectors are 'rare', they are merged
- * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
- * TLB, reschedule and local APIC vectors are performance-critical.
- *
- * Vectors 0xf0-0xfa are free (reserved for future Linux use).
- */
-#define SPURIOUS_APIC_VECTOR 0xff
-#define ERROR_APIC_VECTOR 0xfe
-#define INVALIDATE_TLB_VECTOR 0xfd
-#define RESCHEDULE_VECTOR 0xfc
-#define CALL_FUNCTION_VECTOR 0xfb
-
-#define THERMAL_APIC_VECTOR 0xf0
-/*
- * Local APIC timer IRQ vector is on a different priority level,
- * to work around the 'lost local interrupt if more than 2 IRQ
- * sources per level' errata.
- */
-#define LOCAL_TIMER_VECTOR 0xef
-#endif
-
-#define SPURIOUS_APIC_VECTOR 0xff
-#define ERROR_APIC_VECTOR 0xfe
-
-/*
- * First APIC vector available to drivers: (vectors 0x30-0xee)
- * we start at 0x31 to spread out vectors evenly between priority
- * levels. (0x80 is the syscall vector)
- */
-#define FIRST_DEVICE_VECTOR 0x31
-#define FIRST_SYSTEM_VECTOR 0xef
-
-/*
- * 16 8259A IRQ's, 208 potential APIC interrupt sources.
- * Right now the APIC is mostly only used for SMP.
- * 256 vectors is an architectural limit. (we can have
- * more than 256 devices theoretically, but they will
- * have to use shared interrupts)
- * Since vectors 0x00-0x1f are used/reserved for the CPU,
- * the usable vector space is 0x20-0xff (224 vectors)
- */
-
-#define RESCHEDULE_VECTOR 0
-#define CALL_FUNCTION_VECTOR 1
-#define NR_IPIS 2
-
-/*
- * The maximum number of vectors supported by i386 processors
- * is limited to 256. For processors other than i386, NR_VECTORS
- * should be changed accordingly.
- */
-#define NR_VECTORS 256
-
-#define FPU_IRQ 13
-
-#define FIRST_VM86_IRQ 3
-#define LAST_VM86_IRQ 15
-#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
-
-/*
- * The flat IRQ space is divided into two regions:
- * 1. A one-to-one mapping of real physical IRQs. This space is only used
- * if we have physical device-access privilege. This region is at the
- * start of the IRQ space so that existing device drivers do not need
- * to be modified to translate physical IRQ numbers into our IRQ space.
- * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
- * are bound using the provided bind/unbind functions.
- */
-
-#define PIRQ_BASE 0
-#if !defined(MAX_IO_APICS)
-# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
-#elif NR_CPUS < MAX_IO_APICS
-# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
-#else
-# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
-#endif
-
-#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
-#define NR_DYNIRQS 256
-
-#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
-#define NR_IRQ_VECTORS NR_IRQS
-
-#endif /* _ASM_IRQ_VECTORS_H */
--- head-2011-03-11.orig/arch/x86/include/mach-xen/setup_arch_post.h 2007-06-12 13:14:13.000000000 +0200
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,63 +0,0 @@
-/**
- * machine_specific_* - Hooks for machine specific setup.
- *
- * Description:
- * This is included late in kernel/setup.c so that it can make
- * use of all of the static functions.
- **/
-
-#include <xen/interface/callback.h>
-
-extern void hypervisor_callback(void);
-extern void failsafe_callback(void);
-extern void nmi(void);
-
-static void __init machine_specific_arch_setup(void)
-{
- int ret;
- static struct callback_register __initdata event = {
- .type = CALLBACKTYPE_event,
- .address = (unsigned long) hypervisor_callback,
- };
- static struct callback_register __initdata failsafe = {
- .type = CALLBACKTYPE_failsafe,
- .address = (unsigned long)failsafe_callback,
- };
- static struct callback_register __initdata syscall = {
- .type = CALLBACKTYPE_syscall,
- .address = (unsigned long)system_call,
- };
-#ifdef CONFIG_X86_LOCAL_APIC
- static struct callback_register __initdata nmi_cb = {
- .type = CALLBACKTYPE_nmi,
- .address = (unsigned long)nmi,
- };
-#endif
-
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
- if (ret == 0)
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
- if (ret == 0)
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
-#if CONFIG_XEN_COMPAT <= 0x030002
- if (ret == -ENOSYS)
- ret = HYPERVISOR_set_callbacks(
- event.address,
- failsafe.address,
- syscall.address);
-#endif
- BUG_ON(ret);
-
-#ifdef CONFIG_X86_LOCAL_APIC
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
-#if CONFIG_XEN_COMPAT <= 0x030002
- if (ret == -ENOSYS) {
- static struct xennmi_callback __initdata cb = {
- .handler_address = (unsigned long)nmi
- };
-
- HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
- }
-#endif
-#endif
-}
--- head-2011-03-11.orig/arch/x86/include/mach-xen/setup_arch_pre.h 2007-06-12 13:14:13.000000000 +0200
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,5 +0,0 @@
-/* Hook to call BIOS initialisation function */
-
-#define ARCH_SETUP machine_specific_arch_setup();
-
-static void __init machine_specific_arch_setup(void);
--- head-2011-03-11.orig/arch/x86/include/asm/traps.h 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/include/asm/traps.h 2011-02-01 14:38:38.000000000 +0100
@@ -38,6 +38,9 @@ asmlinkage void alignment_check(void);
asmlinkage void machine_check(void);
#endif /* CONFIG_X86_MCE */
asmlinkage void simd_coprocessor_error(void);
+#ifdef CONFIG_X86_XEN
+asmlinkage void fixup_4gb_segment(void);
+#endif
dotraplinkage void do_divide_error(struct pt_regs *, long);
dotraplinkage void do_debug(struct pt_regs *, long);
@@ -66,6 +69,9 @@ dotraplinkage void do_machine_check(stru
dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *, long);
+#ifdef CONFIG_XEN
+void do_fixup_4gb_segment(struct pt_regs *, long);
+#endif
#endif
static inline int get_si_code(unsigned long condition)
--- head-2011-03-11.orig/include/linux/page-flags.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/include/linux/page-flags.h 2011-02-01 14:38:38.000000000 +0100
@@ -125,12 +125,12 @@ enum pageflags {
PG_fscache = PG_private_2, /* page backed by cache */
/* XEN */
-#ifdef CONFIG_XEN
+#if defined(CONFIG_XEN)
PG_pinned = PG_locked, /* Cannot alias with PG_owner_priv_1 since
* bad_page() checks should include this bit.
* Should not use PG_arch_1 as that may have
* a different purpose elsewhere. */
-#else
+#elif defined(CONFIG_PARAVIRT_XEN)
PG_pinned = PG_owner_priv_1,
PG_savepinned = PG_dirty,
#endif
@@ -225,8 +225,12 @@ PAGEFLAG(Active, active) __CLEARPAGEFLAG
TESTCLEARFLAG(Active, active)
__PAGEFLAG(Slab, slab)
PAGEFLAG(Checked, checked) /* Used by some filesystems */
+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
+#endif
+#ifdef CONFIG_PARAVIRT_XEN
PAGEFLAG(SavePinned, savepinned); /* Xen */
+#endif
PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
--- head-2011-03-11.orig/include/xen/interface/memory.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/include/xen/interface/memory.h 2011-02-01 14:38:38.000000000 +0100
@@ -88,6 +88,7 @@ struct xen_memory_reservation {
*/
domid_t domid;
};
+DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);
typedef struct xen_memory_reservation xen_memory_reservation_t;
DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
@@ -173,11 +174,7 @@ struct xen_machphys_mfn_list {
* any large discontiguities in the machine address space, 2MB gaps in
* the machphys table will be represented by an MFN base of zero.
*/
-#ifndef CONFIG_PARAVIRT_XEN
XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
-#else
- ulong extent_start;
-#endif
/*
* Number of extents written to the above array. This will be smaller
@@ -185,6 +182,7 @@ struct xen_machphys_mfn_list {
*/
unsigned int nr_extents;
};
+DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
@@ -226,6 +224,7 @@ struct xen_add_to_physmap {
/* GPFN where the source mapping page should appear. */
xen_pfn_t gpfn;
};
+DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
typedef struct xen_add_to_physmap xen_add_to_physmap_t;
DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
--- head-2011-03-11.orig/include/xen/public/Kbuild 2011-01-31 14:31:28.000000000 +0100
+++ head-2011-03-11/include/xen/public/Kbuild 2011-02-01 14:38:38.000000000 +0100
@@ -1 +1,5 @@
+header-y += evtchn.h
+header-y += gntdev.h
header-y += iomulti.h
+header-y += privcmd.h
+header-y += xenbus.h
--- head-2011-03-11.orig/include/xen/public/privcmd.h 2010-01-18 15:23:12.000000000 +0100
+++ head-2011-03-11/include/xen/public/privcmd.h 2011-02-01 14:38:38.000000000 +0100
@@ -35,10 +35,6 @@
#include <linux/types.h>
-#ifndef __user
-#define __user
-#endif
-
typedef struct privcmd_hypercall
{
__u64 op;
--- head-2011-03-11.orig/include/xen/public/xenbus.h 2009-05-29 10:25:53.000000000 +0200
+++ head-2011-03-11/include/xen/public/xenbus.h 2011-02-01 14:38:38.000000000 +0100
@@ -35,10 +35,6 @@
#include <linux/types.h>
-#ifndef __user
-#define __user
-#endif
-
typedef struct xenbus_alloc {
domid_t dom;
__u32 port;
--- head-2011-03-11.orig/kernel/kexec.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/kernel/kexec.c 2011-02-01 14:38:38.000000000 +0100
@@ -49,7 +49,7 @@ note_buf_t __percpu *crash_notes;
static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
u32
#if defined(CONFIG_XEN) && defined(CONFIG_X86)
-__attribute__((__section__(".bss.page_aligned"), __aligned__(PAGE_SIZE)))
+__page_aligned_bss
#endif
vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
size_t vmcoreinfo_size;
--- head-2011-03-11.orig/lib/swiotlb-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/lib/swiotlb-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -788,7 +788,7 @@ swiotlb_sync_sg_for_device(struct device
}
int
-swiotlb_dma_mapping_error(dma_addr_t dma_addr)
+swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
{
return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
}
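The added struct device argument matches the 2.6.27 DMA API, where dma_mapping_error() is dispatched through the device's DMA ops and therefore needs to know which device performed the mapping. A caller-side sketch (illustrative only; dev, buf and len are assumed to come from the driver):

	dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, handle)) {
		/* e.g. the swiotlb overflow buffer came back; give up */
		return -ENOMEM;
	}
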
--- head-2011-03-11.orig/mm/mprotect.c 2011-01-31 17:29:16.000000000 +0100
+++ head-2011-03-11/mm/mprotect.c 2011-02-01 14:38:38.000000000 +0100
@@ -97,8 +97,6 @@ static inline void change_pmd_range(stru
}
if (pmd_none_or_clear_bad(pmd))
continue;
- if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
- continue;
change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
dirty_accountable);
} while (pmd++, addr = next, addr != end);