qubes-linux-kernel/patches.xen/xen3-patch-2.6.26

From: kernel.org
Subject: 2.6.26
Patch-mainline: 2.6.26
Acked-by: Jeff Mahoney <jeffm@suse.com>
Automatically created from "patches.kernel.org/patch-2.6.26" by xen-port-patches.py
--- head-2010-05-25.orig/arch/x86/Kconfig 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/Kconfig 2010-03-24 15:12:36.000000000 +0100
@@ -41,7 +41,7 @@ config X86
select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_KVM if !XEN
- select HAVE_ARCH_KGDB
+ select HAVE_ARCH_KGDB if !XEN
select HAVE_ARCH_TRACEHOOK
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -681,6 +681,7 @@ config NO_BOOTMEM
config MEMTEST
bool "Memtest"
+ depends on !XEN
---help---
This option adds a kernel parameter 'memtest', which allows memtest
to be set.
@@ -1255,7 +1256,7 @@ config ARCH_PHYS_ADDR_T_64BIT
config DIRECT_GBPAGES
bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
default y
- depends on X86_64
+ depends on X86_64 && !XEN
---help---
Allow the kernel linear mapping to use 1GB pages on CPUs that
support it. This can improve the kernel's performance a tiny bit by
@@ -2279,6 +2280,4 @@ source "crypto/Kconfig"
source "arch/x86/kvm/Kconfig"
-source "drivers/xen/Kconfig"
-
source "lib/Kconfig"
--- head-2010-05-25.orig/arch/x86/ia32/ia32entry-xen.S 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/ia32/ia32entry-xen.S 2010-03-24 15:12:36.000000000 +0100
@@ -129,12 +129,14 @@ sysenter_tracesys:
SAVE_REST
CLEAR_RREGS
movq %r9,R9(%rsp)
- movq $-ENOSYS,RAX(%rsp) /* really needed? */
+ movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
xchgl %ebp,%r9d
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
jmp sysenter_do_call
CFI_ENDPROC
ENDPROC(ia32_sysenter_target)
@@ -200,13 +202,15 @@ cstar_tracesys:
SAVE_REST
CLEAR_RREGS
movq %r9,R9(%rsp)
- movq $-ENOSYS,RAX(%rsp) /* really needed? */
+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
xchgl %ebp,%r9d
movl RSP-ARGOFFSET(%rsp), %r8d
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
jmp cstar_do_call
END(ia32_cstar_target)
@@ -264,7 +268,7 @@ ENTRY(ia32_syscall)
jnz ia32_tracesys
ia32_do_syscall:
cmpl $(IA32_NR_syscalls-1),%eax
- ja ia32_badsys
+ ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
IA32_ARG_FIXUP
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
@@ -274,7 +278,7 @@ ia32_sysret:
ia32_tracesys:
SAVE_REST
CLEAR_RREGS
- movq $-ENOSYS,RAX(%rsp) /* really needed? */
+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
@@ -365,7 +369,7 @@ ia32_sys_call_table:
.quad sys_setuid16
.quad sys_getuid16
.quad compat_sys_stime /* stime */ /* 25 */
- .quad sys32_ptrace /* ptrace */
+ .quad compat_sys_ptrace /* ptrace */
.quad sys_alarm
.quad sys_fstat /* (old)fstat */
.quad sys_pause
--- head-2010-05-25.orig/arch/x86/kernel/Makefile 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/Makefile 2010-03-24 15:12:36.000000000 +0100
@@ -136,8 +136,7 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_XEN) += nmi_64.o
time_64-$(CONFIG_XEN) += time_32.o
- pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
endif
-disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
- smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
+disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
+ pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
--- head-2010-05-25.orig/arch/x86/kernel/acpi/Makefile 2010-03-24 15:01:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/acpi/Makefile 2010-03-24 15:12:36.000000000 +0100
@@ -15,4 +15,4 @@ $(obj)/wakeup_rm.o: $(obj)/realmode/w
$(obj)/realmode/wakeup.bin: FORCE
$(Q)$(MAKE) $(build)=$(obj)/realmode
-disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_$(BITS).o
+disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_%.o
--- head-2010-05-25.orig/arch/x86/kernel/acpi/boot.c 2010-04-15 09:56:18.000000000 +0200
+++ head-2010-05-25/arch/x86/kernel/acpi/boot.c 2010-04-15 10:03:01.000000000 +0200
@@ -159,6 +159,7 @@ static int __init acpi_parse_madt(struct
static void __cpuinit acpi_register_lapic(int id, u8 enabled)
{
+#ifndef CONFIG_XEN
unsigned int ver = 0;
if (!enabled) {
@@ -170,6 +171,7 @@ static void __cpuinit acpi_register_lapi
ver = apic_version[boot_cpu_physical_apicid];
generic_processor_info(id, ver);
+#endif
}
static int __init
@@ -780,6 +782,7 @@ static int __init acpi_parse_fadt(struct
* returns 0 on success, < 0 on error
*/
+#ifndef CONFIG_XEN
static void __init acpi_register_lapic_address(unsigned long address)
{
mp_lapic_addr = address;
@@ -791,6 +794,9 @@ static void __init acpi_register_lapic_a
GET_APIC_VERSION(apic_read(APIC_LVR));
}
}
+#else
+#define acpi_register_lapic_address(address)
+#endif
static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
{
--- head-2010-05-25.orig/arch/x86/kernel/acpi/sleep-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/acpi/sleep-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -10,15 +10,19 @@
#include <linux/dmi.h>
#include <linux/cpumask.h>
-#include <asm/smp.h>
+#include "realmode/wakeup.h"
+#include "sleep.h"
#ifndef CONFIG_ACPI_PV_SLEEP
-/* address in low memory of the wakeup routine. */
-unsigned long acpi_wakeup_address = 0;
+unsigned long acpi_wakeup_address;
unsigned long acpi_realmode_flags;
-extern char wakeup_start, wakeup_end;
-extern unsigned long acpi_copy_wakeup_routine(unsigned long);
+/* address in low memory of the wakeup routine. */
+static unsigned long acpi_realmode;
+
+#ifdef CONFIG_64BIT
+static char temp_stack[10240];
+#endif
#endif
/**
@@ -26,17 +30,69 @@ extern unsigned long acpi_copy_wakeup_ro
*
* Create an identity mapped page table and copy the wakeup routine to
* low memory.
+ *
+ * Note that this is too late to change acpi_wakeup_address.
*/
int acpi_save_state_mem(void)
{
#ifndef CONFIG_ACPI_PV_SLEEP
- if (!acpi_wakeup_address) {
- printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
+ struct wakeup_header *header;
+
+ if (!acpi_realmode) {
+ printk(KERN_ERR "Could not allocate memory during boot, "
+ "S3 disabled\n");
return -ENOMEM;
}
- memcpy((void *)acpi_wakeup_address, &wakeup_start,
- &wakeup_end - &wakeup_start);
- acpi_copy_wakeup_routine(acpi_wakeup_address);
+ memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
+
+ header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET);
+ if (header->signature != 0x51ee1111) {
+ printk(KERN_ERR "wakeup header does not match\n");
+ return -EINVAL;
+ }
+
+ header->video_mode = saved_video_mode;
+
+ header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
+ /* GDT[0]: GDT self-pointer */
+ header->wakeup_gdt[0] =
+ (u64)(sizeof(header->wakeup_gdt) - 1) +
+ ((u64)(acpi_wakeup_address +
+ ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
+ << 16);
+ /* GDT[1]: real-mode-like code segment */
+ header->wakeup_gdt[1] = (0x009bULL << 40) +
+ ((u64)acpi_wakeup_address << 16) + 0xffff;
+ /* GDT[2]: real-mode-like data segment */
+ header->wakeup_gdt[2] = (0x0093ULL << 40) +
+ ((u64)acpi_wakeup_address << 16) + 0xffff;
+
+#ifndef CONFIG_64BIT
+ store_gdt((struct desc_ptr *)&header->pmode_gdt);
+
+ header->pmode_efer_low = nx_enabled;
+ if (header->pmode_efer_low & 1) {
+ /* This is strange, why not save efer, always? */
+ rdmsr(MSR_EFER, header->pmode_efer_low,
+ header->pmode_efer_high);
+ }
+#endif /* !CONFIG_64BIT */
+
+ header->pmode_cr0 = read_cr0();
+ header->pmode_cr4 = read_cr4();
+ header->realmode_flags = acpi_realmode_flags;
+ header->real_magic = 0x12345678;
+
+#ifndef CONFIG_64BIT
+ header->pmode_entry = (u32)&wakeup_pmode_return;
+ header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET);
+ saved_magic = 0x12345678;
+#else /* CONFIG_64BIT */
+ header->trampoline_segment = setup_trampoline() >> 4;
+ init_rsp = (unsigned long)temp_stack + 4096;
+ initial_code = (unsigned long)wakeup_long64;
+ saved_magic = 0x123456789abcdef0;
+#endif /* CONFIG_64BIT */
#endif
return 0;
@@ -61,15 +117,20 @@ void acpi_restore_state_mem(void)
void __init acpi_reserve_bootmem(void)
{
#ifndef CONFIG_ACPI_PV_SLEEP
- if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
+ if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
printk(KERN_ERR
"ACPI: Wakeup code way too big, S3 disabled.\n");
return;
}
- acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
- if (!acpi_wakeup_address)
+ acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE);
+
+ if (!acpi_realmode) {
printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
+ return;
+ }
+
+ acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
#endif
}
--- head-2010-05-25.orig/arch/x86/kernel/cpu/common-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/cpu/common-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -5,7 +5,6 @@
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/bootmem.h>
-#include <asm/semaphore.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/msr.h>
@@ -13,6 +12,7 @@
#include <asm/mmu_context.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
+#include <asm/pat.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/mpspec.h>
#include <asm/apic.h>
@@ -69,9 +69,9 @@ __u32 cleared_cpu_caps[NCAPINTS] __cpuin
static int cachesize_override __cpuinitdata = -1;
static int disable_x86_serial_nr __cpuinitdata = 1;
-struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
-static void __cpuinit default_init(struct cpuinfo_x86 * c)
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
{
/* Not much we can do here... */
/* Check if at least it has cpuid */
@@ -88,11 +88,11 @@ static struct cpu_dev __cpuinitdata defa
.c_init = default_init,
.c_vendor = "Unknown",
};
-static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu;
+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
static int __init cachesize_setup(char *str)
{
- get_option (&str, &cachesize_override);
+ get_option(&str, &cachesize_override);
return 1;
}
__setup("cachesize=", cachesize_setup);
@@ -114,12 +114,12 @@ int __cpuinit get_model_name(struct cpui
/* Intel chips right-justify this string for some dumb reason;
undo that brain damage */
p = q = &c->x86_model_id[0];
- while ( *p == ' ' )
+ while (*p == ' ')
p++;
- if ( p != q ) {
- while ( *p )
+ if (p != q) {
+ while (*p)
*q++ = *p++;
- while ( q <= &c->x86_model_id[48] )
+ while (q <= &c->x86_model_id[48])
*q++ = '\0'; /* Zero-pad the rest */
}
@@ -137,7 +137,7 @@ void __cpuinit display_cacheinfo(struct
cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
- c->x86_cache_size=(ecx>>24)+(edx>>24);
+ c->x86_cache_size = (ecx>>24)+(edx>>24);
}
if (n < 0x80000006) /* Some chips just has a large L1. */
@@ -145,16 +145,16 @@ void __cpuinit display_cacheinfo(struct
ecx = cpuid_ecx(0x80000006);
l2size = ecx >> 16;
-
+
/* do processor-specific cache resizing */
if (this_cpu->c_size_cache)
- l2size = this_cpu->c_size_cache(c,l2size);
+ l2size = this_cpu->c_size_cache(c, l2size);
/* Allow user to override all this if necessary. */
if (cachesize_override != -1)
l2size = cachesize_override;
- if ( l2size == 0 )
+ if (l2size == 0)
return; /* Again, no L2 cache is possible */
c->x86_cache_size = l2size;
@@ -163,16 +163,19 @@ void __cpuinit display_cacheinfo(struct
l2size, ecx & 0xFF);
}
-/* Naming convention should be: <Name> [(<Codename>)] */
-/* This table only is used unless init_<vendor>() below doesn't set it; */
-/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
+/*
+ * Naming convention should be: <Name> [(<Codename>)]
+ * This table only is used unless init_<vendor>() below doesn't set it;
+ * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
+ *
+ */
/* Look up CPU names by table lookup. */
static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
{
struct cpu_model_info *info;
- if ( c->x86_model >= 16 )
+ if (c->x86_model >= 16)
return NULL; /* Range check */
if (!this_cpu)
@@ -197,9 +200,9 @@ static void __cpuinit get_cpu_vendor(str
for (i = 0; i < X86_VENDOR_NUM; i++) {
if (cpu_devs[i]) {
- if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
- (cpu_devs[i]->c_ident[1] &&
- !strcmp(v,cpu_devs[i]->c_ident[1]))) {
+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+ (cpu_devs[i]->c_ident[1] &&
+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
c->x86_vendor = i;
if (!early)
this_cpu = cpu_devs[i];
@@ -217,7 +220,7 @@ static void __cpuinit get_cpu_vendor(str
}
-static int __init x86_fxsr_setup(char * s)
+static int __init x86_fxsr_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_FXSR);
setup_clear_cpu_cap(X86_FEATURE_XMM);
@@ -226,7 +229,7 @@ static int __init x86_fxsr_setup(char *
__setup("nofxsr", x86_fxsr_setup);
-static int __init x86_sep_setup(char * s)
+static int __init x86_sep_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_SEP);
return 1;
@@ -315,12 +318,15 @@ static void __cpuinit early_get_cap(stru
}
-/* Do minimum CPU detection early.
- Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
- The others are not touched to avoid unwanted side effects.
-
- WARNING: this function is only called on the BP. Don't add code here
- that is supposed to run on all CPUs. */
+/*
+ * Do minimum CPU detection early.
+ * Fields really needed: vendor, cpuid_level, family, model, mask,
+ * cache alignment.
+ * The others are not touched to avoid unwanted side effects.
+ *
+ * WARNING: this function is only called on the BP. Don't add code here
+ * that is supposed to run on all CPUs.
+ */
static void __init early_cpu_detect(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -335,19 +341,14 @@ static void __init early_cpu_detect(void
get_cpu_vendor(c, 1);
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- early_init_amd(c);
- break;
- case X86_VENDOR_INTEL:
- early_init_intel(c);
- break;
- }
+ if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
+ cpu_devs[c->x86_vendor]->c_early_init)
+ cpu_devs[c->x86_vendor]->c_early_init(c);
early_get_cap(c);
}
-static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
+static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
{
u32 tfms, xlvl;
unsigned int ebx;
@@ -358,13 +359,12 @@ static void __cpuinit generic_identify(s
(unsigned int *)&c->x86_vendor_id[0],
(unsigned int *)&c->x86_vendor_id[8],
(unsigned int *)&c->x86_vendor_id[4]);
-
+
get_cpu_vendor(c, 0);
/* Initialize the standard set of capabilities */
/* Note that the vendor-specific code below might override */
-
/* Intel-defined flags: level 0x00000001 */
- if ( c->cpuid_level >= 0x00000001 ) {
+ if (c->cpuid_level >= 0x00000001) {
u32 capability, excap;
cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
c->x86_capability[0] = capability;
@@ -376,14 +376,16 @@ static void __cpuinit generic_identify(s
if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xF) << 4;
c->x86_mask = tfms & 15;
+ c->initial_apicid = (ebx >> 24) & 0xFF;
#ifndef CONFIG_XEN
#ifdef CONFIG_X86_HT
- c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
+ c->apicid = phys_pkg_id(c->initial_apicid, 0);
+ c->phys_proc_id = c->initial_apicid;
#else
- c->apicid = (ebx >> 24) & 0xFF;
+ c->apicid = c->initial_apicid;
#endif
#endif
- if (c->x86_capability[0] & (1<<19))
+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
} else {
/* Have CPUID level 0 only - unheard of */
@@ -392,33 +394,30 @@ static void __cpuinit generic_identify(s
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
- if ( (xlvl & 0xffff0000) == 0x80000000 ) {
- if ( xlvl >= 0x80000001 ) {
+ if ((xlvl & 0xffff0000) == 0x80000000) {
+ if (xlvl >= 0x80000001) {
c->x86_capability[1] = cpuid_edx(0x80000001);
c->x86_capability[6] = cpuid_ecx(0x80000001);
}
- if ( xlvl >= 0x80000004 )
+ if (xlvl >= 0x80000004)
get_model_name(c); /* Default name */
}
init_scattered_cpuid_features(c);
}
-#ifdef CONFIG_X86_HT
- c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
-#endif
}
static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
- if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
+ if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
/* Disable processor serial number */
- unsigned long lo,hi;
- rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
+ unsigned long lo, hi;
+ rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
lo |= 0x200000;
- wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
+ wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
printk(KERN_NOTICE "CPU serial number disabled.\n");
- clear_bit(X86_FEATURE_PN, c->x86_capability);
+ clear_cpu_cap(c, X86_FEATURE_PN);
/* Disabling the serial number may affect the cpuid level */
c->cpuid_level = cpuid_eax(0);
@@ -453,9 +452,11 @@ void __cpuinit identify_cpu(struct cpuin
memset(&c->x86_capability, 0, sizeof c->x86_capability);
if (!have_cpuid_p()) {
- /* First of all, decide if this is a 486 or higher */
- /* It's a 486 if we can modify the AC flag */
- if ( flag_is_changeable_p(X86_EFLAGS_AC) )
+ /*
+ * First of all, decide if this is a 486 or higher
+ * It's a 486 if we can modify the AC flag
+ */
+ if (flag_is_changeable_p(X86_EFLAGS_AC))
c->x86 = 4;
else
c->x86 = 3;
@@ -488,10 +489,10 @@ void __cpuinit identify_cpu(struct cpuin
*/
/* If the model name is still unset, do table lookup. */
- if ( !c->x86_model_id[0] ) {
+ if (!c->x86_model_id[0]) {
char *p;
p = table_lookup_model(c);
- if ( p )
+ if (p)
strcpy(c->x86_model_id, p);
else
/* Last resort... */
@@ -505,9 +506,9 @@ void __cpuinit identify_cpu(struct cpuin
* common between the CPUs. The first time this routine gets
* executed, c == &boot_cpu_data.
*/
- if ( c != &boot_cpu_data ) {
+ if (c != &boot_cpu_data) {
/* AND the already accumulated flags with these */
- for ( i = 0 ; i < NCAPINTS ; i++ )
+ for (i = 0 ; i < NCAPINTS ; i++)
boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
}
@@ -551,7 +552,7 @@ void __cpuinit detect_ht(struct cpuinfo_
if (smp_num_siblings == 1) {
printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1 ) {
+ } else if (smp_num_siblings > 1) {
if (smp_num_siblings > NR_CPUS) {
printk(KERN_WARNING "CPU: Unsupported number of the "
@@ -561,7 +562,7 @@ void __cpuinit detect_ht(struct cpuinfo_
}
index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
+ c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
c->phys_proc_id);
@@ -572,7 +573,7 @@ void __cpuinit detect_ht(struct cpuinfo_
core_bits = get_count_order(c->x86_max_cores);
- c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
+ c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
((1 << core_bits) - 1);
if (c->x86_max_cores > 1)
@@ -606,7 +607,7 @@ void __cpuinit print_cpu_info(struct cpu
else
printk("%s", c->x86_model_id);
- if (c->x86_mask || c->cpuid_level >= 0)
+ if (c->x86_mask || c->cpuid_level >= 0)
printk(" stepping %02x\n", c->x86_mask);
else
printk("\n");
@@ -625,24 +626,17 @@ __setup("clearcpuid=", setup_disablecpui
cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
-/* This is hacky. :)
- * We're emulating future behavior.
- * In the future, the cpu-specific init functions will be called implicitly
- * via the magic of initcalls.
- * They will insert themselves into the cpu_devs structure.
- * Then, when cpu_init() is called, we can just iterate over that array.
- */
void __init early_cpu_init(void)
{
- intel_cpu_init();
- cyrix_init_cpu();
- nsc_init_cpu();
- amd_init_cpu();
- centaur_init_cpu();
- transmeta_init_cpu();
- nexgen_init_cpu();
- umc_init_cpu();
+ struct cpu_vendor_dev *cvdev;
+
+ for (cvdev = __x86cpuvendor_start ;
+ cvdev < __x86cpuvendor_end ;
+ cvdev++)
+ cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+
early_cpu_detect();
+ validate_pat_support(&boot_cpu_data);
}
/* Make sure %fs is initialized properly in idle threads */
@@ -687,7 +681,7 @@ void __cpuinit cpu_init(void)
int cpu = smp_processor_id();
struct task_struct *curr = current;
#ifndef CONFIG_X86_NO_TSS
- struct tss_struct * t = &per_cpu(init_tss, cpu);
+ struct tss_struct *t = &per_cpu(init_tss, cpu);
#endif
struct thread_struct *thread = &curr->thread;
@@ -740,7 +734,7 @@ void __cpuinit cpu_init(void)
mxcsr_feature_mask_init();
}
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
void __cpuinit cpu_uninit(void)
{
int cpu = raw_smp_processor_id();
--- head-2010-05-25.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/cpu/mtrr/main-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -35,6 +35,8 @@ struct mtrr_ops *mtrr_if = &generic_mtrr
unsigned int num_var_ranges;
unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+static u64 tom2;
+
static void __init set_num_var_ranges(void)
{
struct xen_platform_op op;
@@ -162,8 +164,144 @@ mtrr_del(int reg, unsigned long base, un
EXPORT_SYMBOL(mtrr_add);
EXPORT_SYMBOL(mtrr_del);
+/*
+ * Returns the effective MTRR type for the region
+ * Error returns:
+ * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
+ * - 0xFF - when MTRR is not enabled
+ */
+u8 mtrr_type_lookup(u64 start, u64 end)
+{
+ int i, error;
+ u64 start_mfn, end_mfn, base_mfn, top_mfn;
+ u8 prev_match, curr_match;
+ struct xen_platform_op op;
+
+ if (!is_initial_xendomain())
+ return MTRR_TYPE_WRBACK;
+
+ if (!num_var_ranges)
+ return 0xFF;
+
+ start_mfn = start >> PAGE_SHIFT;
+ /* Make end inclusive end, instead of exclusive */
+ end_mfn = --end >> PAGE_SHIFT;
+
+ /* Look in fixed ranges. Just return the type as per start */
+ if (start_mfn < 0x100) {
+#if 0//todo
+ op.cmd = XENPF_read_memtype;
+ op.u.read_memtype.reg = ???;
+ error = HYPERVISOR_platform_op(&op);
+ if (!error)
+ return op.u.read_memtype.type;
+#endif
+ return MTRR_TYPE_UNCACHABLE;
+ }
+
+ /*
+ * Look in variable ranges
+ * Look of multiple ranges matching this address and pick type
+ * as per MTRR precedence
+ */
+ prev_match = 0xFF;
+ for (i = 0; i < num_var_ranges; ++i) {
+ op.cmd = XENPF_read_memtype;
+ op.u.read_memtype.reg = i;
+ error = HYPERVISOR_platform_op(&op);
+
+ if (error || !op.u.read_memtype.nr_mfns)
+ continue;
+
+ base_mfn = op.u.read_memtype.mfn;
+ top_mfn = base_mfn + op.u.read_memtype.nr_mfns - 1;
+
+ if (base_mfn > end_mfn || start_mfn > top_mfn) {
+ continue;
+ }
+
+ if (base_mfn > start_mfn || end_mfn > top_mfn) {
+ return 0xFE;
+ }
+
+ curr_match = op.u.read_memtype.type;
+ if (prev_match == 0xFF) {
+ prev_match = curr_match;
+ continue;
+ }
+
+ if (prev_match == MTRR_TYPE_UNCACHABLE ||
+ curr_match == MTRR_TYPE_UNCACHABLE) {
+ return MTRR_TYPE_UNCACHABLE;
+ }
+
+ if ((prev_match == MTRR_TYPE_WRBACK &&
+ curr_match == MTRR_TYPE_WRTHROUGH) ||
+ (prev_match == MTRR_TYPE_WRTHROUGH &&
+ curr_match == MTRR_TYPE_WRBACK)) {
+ prev_match = MTRR_TYPE_WRTHROUGH;
+ curr_match = MTRR_TYPE_WRTHROUGH;
+ }
+
+ if (prev_match != curr_match) {
+ return MTRR_TYPE_UNCACHABLE;
+ }
+ }
+
+ if (tom2) {
+ if (start >= (1ULL<<32) && (end < tom2))
+ return MTRR_TYPE_WRBACK;
+ }
+
+ if (prev_match != 0xFF)
+ return prev_match;
+
+#if 0//todo
+ op.cmd = XENPF_read_def_memtype;
+ error = HYPERVISOR_platform_op(&op);
+ if (!error)
+ return op.u.read_def_memtype.type;
+#endif
+ return MTRR_TYPE_UNCACHABLE;
+}
+
+/*
+ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
+ * for memory >4GB. Check for that here.
+ * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
+ * apply to are wrong, but so far we don't know of any such case in the wild.
+ */
+#define Tom2Enabled (1U << 21)
+#define Tom2ForceMemTypeWB (1U << 22)
+
+int __init amd_special_default_mtrr(void)
+{
+ u32 l, h;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return 0;
+ if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+ return 0;
+ /* In case some hypervisor doesn't pass SYSCFG through */
+ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+ return 0;
+ /*
+ * Memory between 4GB and top of mem is forced WB by this magic bit.
+ * Reserved before K8RevF, but should be zero there.
+ */
+ if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
+ (Tom2Enabled | Tom2ForceMemTypeWB))
+ return 1;
+ return 0;
+}
+
void __init mtrr_bp_init(void)
{
+ if (amd_special_default_mtrr()) {
+ /* TOP_MEM2 */
+ rdmsrl(MSR_K8_TOP_MEM2, tom2);
+ tom2 &= 0xffffff8000000ULL;
+ }
}
void mtrr_ap_init(void)
--- head-2010-05-25.orig/arch/x86/kernel/e820_32-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/e820_32-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -469,7 +469,7 @@ int __init sanitize_e820_map(struct e820
* thinkpad 560x, for example, does not cooperate with the memory
* detection code.)
*/
-int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
+int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
{
#ifndef CONFIG_XEN
/* Only one memory region (or negative)? Ignore it */
@@ -480,33 +480,17 @@ int __init copy_e820_map(struct e820entr
#endif
do {
- unsigned long long start = biosmap->addr;
- unsigned long long size = biosmap->size;
- unsigned long long end = start + size;
- unsigned long type = biosmap->type;
+ u64 start = biosmap->addr;
+ u64 size = biosmap->size;
+ u64 end = start + size;
+ u32 type = biosmap->type;
/* Overflow in 64 bits? Ignore the memory map. */
if (start > end)
return -1;
-#ifndef CONFIG_XEN
- /*
- * Some BIOSes claim RAM in the 640k - 1M region.
- * Not right. Fix it up.
- */
- if (type == E820_RAM) {
- if (start < 0x100000ULL && end > 0xA0000ULL) {
- if (start < 0xA0000ULL)
- add_memory_region(start, 0xA0000ULL-start, type);
- if (end <= 0x100000ULL)
- continue;
- start = 0x100000ULL;
- size = end - start;
- }
- }
-#endif
add_memory_region(start, size, type);
- } while (biosmap++,--nr_map);
+ } while (biosmap++, --nr_map);
#ifdef CONFIG_XEN
if (is_initial_xendomain()) {
@@ -528,7 +512,7 @@ int __init copy_e820_map(struct e820entr
/*
* Find the highest page frame number we have available
*/
-void __init find_max_pfn(void)
+void __init propagate_e820_map(void)
{
int i;
@@ -801,7 +785,7 @@ static int __init parse_memmap(char *arg
* size before original memory map is
* reset.
*/
- find_max_pfn();
+ propagate_e820_map();
saved_max_pfn = max_pfn;
#endif
e820.nr_map = 0;
--- head-2010-05-25.orig/arch/x86/kernel/e820_64-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/e820_64-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -41,11 +41,11 @@ unsigned long end_pfn;
#ifndef CONFIG_XEN
/*
- * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
- * The direct mapping extends to end_pfn_map, so that we can directly access
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
* apertures, ACPI and other tables without having to play with fixmaps.
*/
-unsigned long end_pfn_map;
+unsigned long max_pfn_mapped;
#endif
/*
@@ -65,8 +65,8 @@ struct early_res {
static struct early_res early_res[MAX_EARLY_RES] __initdata = {
#ifndef CONFIG_XEN
{ 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
-#ifdef CONFIG_SMP
- { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
+#ifdef CONFIG_X86_TRAMPOLINE
+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
#endif
#endif
{}
@@ -91,19 +91,47 @@ void __init reserve_early(unsigned long
strncpy(r->name, name, sizeof(r->name) - 1);
}
-void __init early_res_to_bootmem(void)
+void __init free_early(unsigned long start, unsigned long end)
+{
+ struct early_res *r;
+ int i, j;
+
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ r = &early_res[i];
+ if (start == r->start && end == r->end)
+ break;
+ }
+ if (i >= MAX_EARLY_RES || !early_res[i].end)
+ panic("free_early on not reserved area: %lx-%lx!", start, end);
+
+ for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
+ ;
+
+ memmove(&early_res[i], &early_res[i + 1],
+ (j - 1 - i) * sizeof(struct early_res));
+
+ early_res[j - 1].end = 0;
+}
+
+void __init early_res_to_bootmem(unsigned long start, unsigned long end)
{
int i;
+ unsigned long final_start, final_end;
for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
struct early_res *r = &early_res[i];
- printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
- r->start, r->end - 1, r->name);
- reserve_bootmem_generic(r->start, r->end - r->start);
+ final_start = max(start, r->start);
+ final_end = min(end, r->end);
+ if (final_start >= final_end)
+ continue;
+ printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
+ final_start, final_end - 1, r->name);
+ reserve_bootmem_generic(final_start, final_end - final_start);
}
}
/* Check for already reserved areas */
-static inline int bad_addr(unsigned long *addrp, unsigned long size)
+static inline int __init
+bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
{
int i;
unsigned long addr = *addrp, last;
@@ -113,7 +141,7 @@ again:
for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
struct early_res *r = &early_res[i];
if (last >= r->start && addr < r->end) {
- *addrp = addr = r->end;
+ *addrp = addr = round_up(r->end, align);
changed = 1;
goto again;
}
@@ -121,6 +149,40 @@ again:
return changed;
}
+/* Check for already reserved areas */
+static inline int __init
+bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
+{
+ int i;
+ unsigned long addr = *addrp, last;
+ unsigned long size = *sizep;
+ int changed = 0;
+again:
+ last = addr + size;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ struct early_res *r = &early_res[i];
+ if (last > r->start && addr < r->start) {
+ size = r->start - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last > r->end && addr < r->end) {
+ addr = round_up(r->end, align);
+ size = last - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last <= r->end && addr >= r->start) {
+ (*sizep)++;
+ return 0;
+ }
+ }
+ if (changed) {
+ *addrp = addr;
+ *sizep = size;
+ }
+ return changed;
+}
/*
* This function checks if any part of the range <start,end> is mapped
* with type.
@@ -196,26 +258,27 @@ int __init e820_all_mapped(unsigned long
* Find a free area with specified alignment in a specific range.
*/
unsigned long __init find_e820_area(unsigned long start, unsigned long end,
- unsigned size, unsigned long align)
+ unsigned long size, unsigned long align)
{
int i;
- unsigned long mask = ~(align - 1);
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
- unsigned long addr = ei->addr, last;
+ unsigned long addr, last;
+ unsigned long ei_last;
if (ei->type != E820_RAM)
continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
if (addr < start)
- addr = start;
- if (addr > ei->addr + ei->size)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
continue;
- while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
+ while (bad_addr(&addr, size, align) && addr+size <= ei_last)
;
- addr = (addr + align - 1) & mask;
last = addr + size;
- if (last > ei->addr + ei->size)
+ if (last > ei_last)
continue;
if (last > end)
continue;
@@ -225,6 +288,40 @@ unsigned long __init find_e820_area(unsi
}
/*
+ * Find next free range after *start
+ */
+unsigned long __init find_e820_area_size(unsigned long start,
+ unsigned long *sizep,
+ unsigned long align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ unsigned long addr, last;
+ unsigned long ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ *sizep = ei_last - addr;
+ while (bad_addr_size(&addr, sizep, align) &&
+ addr + *sizep <= ei_last)
+ ;
+ last = addr + *sizep;
+ if (last > ei_last)
+ continue;
+ return addr;
+ }
+ return -1UL;
+
+}
+/*
* Find the highest page frame number we have available
*/
unsigned long __init e820_end_of_ram(void)
@@ -233,31 +330,29 @@ unsigned long __init e820_end_of_ram(voi
end_pfn = find_max_pfn_with_active_regions();
- if (end_pfn > end_pfn_map)
- end_pfn_map = end_pfn;
- if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
- end_pfn_map = MAXMEM>>PAGE_SHIFT;
+ if (end_pfn > max_pfn_mapped)
+ max_pfn_mapped = end_pfn;
+ if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
+ max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
if (end_pfn > end_user_pfn)
end_pfn = end_user_pfn;
- if (end_pfn > end_pfn_map)
- end_pfn = end_pfn_map;
+ if (end_pfn > max_pfn_mapped)
+ end_pfn = max_pfn_mapped;
- printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
+ printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
return end_pfn;
}
/*
* Mark e820 reserved areas as busy for the resource manager.
*/
-void __init e820_reserve_resources(struct e820entry *e820, int nr_map,
- struct resource *code_resource,
- struct resource *data_resource,
- struct resource *bss_resource)
+void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
{
int i;
+ struct resource *res;
+
+ res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
for (i = 0; i < nr_map; i++) {
- struct resource *res;
- res = alloc_bootmem_low(sizeof(struct resource));
switch (e820[i].type) {
case E820_RAM: res->name = "System RAM"; break;
case E820_ACPI: res->name = "ACPI Tables"; break;
@@ -267,26 +362,8 @@ void __init e820_reserve_resources(struc
res->start = e820[i].addr;
res->end = res->start + e820[i].size - 1;
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- request_resource(&iomem_resource, res);
- if (e820[i].type == E820_RAM) {
- /*
- * We don't know which RAM region contains kernel data,
- * so we try it repeatedly and let the resource manager
- * test it.
- */
-#ifndef CONFIG_XEN
- request_resource(res, code_resource);
- request_resource(res, data_resource);
- request_resource(res, bss_resource);
-#endif
-#ifdef CONFIG_KEXEC
- if (crashk_res.start != crashk_res.end)
- request_resource(res, &crashk_res);
-#ifdef CONFIG_XEN
- xen_machine_kexec_register_resources(res);
-#endif
-#endif
- }
+ insert_resource(&iomem_resource, res);
+ res++;
}
}
@@ -345,9 +422,9 @@ static int __init e820_find_active_regio
if (*ei_startpfn >= *ei_endpfn)
return 0;
- /* Check if end_pfn_map should be updated */
- if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map)
- end_pfn_map = *ei_endpfn;
+ /* Check if max_pfn_mapped should be updated */
+ if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
+ max_pfn_mapped = *ei_endpfn;
/* Skip if map is outside the node */
if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
@@ -678,10 +755,10 @@ static int __init copy_e820_map(struct e
#endif
do {
- unsigned long start = biosmap->addr;
- unsigned long size = biosmap->size;
- unsigned long end = start + size;
- unsigned long type = biosmap->type;
+ u64 start = biosmap->addr;
+ u64 size = biosmap->size;
+ u64 end = start + size;
+ u32 type = biosmap->type;
/* Overflow in 64 bits? Ignore the memory map. */
if (start > end)
@@ -731,11 +808,7 @@ char * __init machine_specific_memory_se
char *who = "Xen";
int rc;
struct xen_memory_map memmap;
- /*
- * This is rather large for a stack variable but this early in
- * the boot process we know we have plenty slack space.
- */
- struct e820entry map[E820MAX];
+ static struct e820entry __initdata map[E820MAX];
memmap.nr_entries = E820MAX;
set_xen_guest_handle(memmap.buffer, map);
@@ -812,7 +885,7 @@ static int __init parse_memmap_opt(char
saved_max_pfn = e820_end_of_ram();
remove_all_active_ranges();
#endif
- end_pfn_map = 0;
+ max_pfn_mapped = 0;
e820.nr_map = 0;
userdef = 1;
return 0;
--- head-2010-05-25.orig/arch/x86/kernel/early_printk-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/early_printk-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -13,7 +13,7 @@
#define VGABASE (__ISA_IO_base + 0xb8000)
static int max_ypos = 25, max_xpos = 80;
-static int current_ypos = 25, current_xpos = 0;
+static int current_ypos = 25, current_xpos;
static void early_vga_write(struct console *con, const char *str, unsigned n)
{
@@ -108,12 +108,12 @@ static __init void early_serial_init(cha
if (*s) {
unsigned port;
- if (!strncmp(s,"0x",2)) {
+ if (!strncmp(s, "0x", 2)) {
early_serial_base = simple_strtoul(s, &e, 16);
} else {
static int bases[] = { 0x3f8, 0x2f8 };
- if (!strncmp(s,"ttyS",4))
+ if (!strncmp(s, "ttyS", 4))
s += 4;
port = simple_strtoul(s, &e, 10);
if (port > 1 || s == e)
@@ -223,7 +223,7 @@ static struct console simnow_console = {
/* Direct interface for emergencies */
static struct console *early_console = &early_vga_console;
-static int early_console_initialized = 0;
+static int early_console_initialized;
void early_printk(const char *fmt, ...)
{
@@ -231,9 +231,9 @@ void early_printk(const char *fmt, ...)
int n;
va_list ap;
- va_start(ap,fmt);
- n = vscnprintf(buf,512,fmt,ap);
- early_console->write(early_console,buf,n);
+ va_start(ap, fmt);
+ n = vscnprintf(buf, 512, fmt, ap);
+ early_console->write(early_console, buf, n);
va_end(ap);
}
@@ -259,16 +259,16 @@ static int __init setup_early_printk(cha
early_console = &early_serial_console;
} else if (!strncmp(buf, "vga", 3)) {
#ifndef CONFIG_XEN
- && boot_params.screen_info.orig_video_isVGA == 1) {
+ && boot_params.screen_info.orig_video_isVGA == 1) {
max_xpos = boot_params.screen_info.orig_video_cols;
max_ypos = boot_params.screen_info.orig_video_lines;
current_ypos = boot_params.screen_info.orig_y;
#endif
early_console = &early_vga_console;
- } else if (!strncmp(buf, "simnow", 6)) {
- simnow_init(buf + 6);
- early_console = &simnow_console;
- keep_early = 1;
+ } else if (!strncmp(buf, "simnow", 6)) {
+ simnow_init(buf + 6);
+ early_console = &simnow_console;
+ keep_early = 1;
#ifdef CONFIG_XEN
} else if (!strncmp(buf, "xen", 3)) {
early_console = &xenboot_console;
--- head-2010-05-25.orig/arch/x86/kernel/entry_32-xen.S 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/entry_32-xen.S 2010-03-24 15:12:36.000000000 +0100
@@ -1,5 +1,4 @@
/*
- * linux/arch/i386/entry.S
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
@@ -51,6 +50,7 @@
#include <asm/desc.h>
#include <asm/percpu.h>
#include <asm/dwarf2.h>
+#include <asm/processor-flags.h>
#include "irq_vectors.h"
#include <xen/interface/xen.h>
@@ -69,12 +69,6 @@
#define nr_syscalls ((syscall_table_size)/4)
-CF_MASK = 0x00000001
-TF_MASK = 0x00000100
-IF_MASK = 0x00000200
-DF_MASK = 0x00000400
-NT_MASK = 0x00004000
-VM_MASK = 0x00020000
/* Pseudo-eflags. */
NMI_MASK = 0x80000000
@@ -87,7 +81,7 @@ NMI_MASK = 0x80000000
.macro TRACE_IRQS_IRET
#ifdef CONFIG_TRACE_IRQFLAGS
- testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
+ testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
jz 1f
TRACE_IRQS_ON
1:
@@ -249,7 +243,7 @@ ret_from_intr:
check_userspace:
movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
movb PT_CS(%esp), %al
- andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
cmpl $USER_RPL, %eax
jb resume_kernel # not returning to v8086 or userspace
@@ -258,6 +252,7 @@ ENTRY(resume_userspace)
DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
+ TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
# int/exception return?
@@ -274,7 +269,7 @@ need_resched:
movl TI_flags(%ebp), %ecx # need_resched set ?
testb $_TIF_NEED_RESCHED, %cl
jz restore_all
- testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
+ testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
jz restore_all
call preempt_schedule_irq
jmp need_resched
@@ -299,10 +294,10 @@ ENTRY(ia32_sysenter_target)
movl SYSENTER_stack_sp0(%esp),%esp
sysenter_past_esp:
/*
- * No need to follow this irqs on/off section: the syscall
- * disabled irqs and here we enable it straight after entry:
+ * Interrupts are disabled here, but we can't trace it until
+ * enough kernel state to call TRACE_IRQS_OFF can be called - but
+ * we immediately enable interrupts at that point anyway.
*/
- ENABLE_INTERRUPTS(CLBR_NONE)
pushl $(__USER_DS)
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET ss, 0*/
@@ -310,6 +305,7 @@ sysenter_past_esp:
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET esp, 0
pushfl
+ orl $X86_EFLAGS_IF, (%esp)
CFI_ADJUST_CFA_OFFSET 4
pushl $(__USER_CS)
CFI_ADJUST_CFA_OFFSET 4
@@ -323,6 +319,11 @@ sysenter_past_esp:
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET eip, 0
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
/*
* Load the potential sixth argument from user stack.
* Careful about security.
@@ -330,14 +331,12 @@ sysenter_past_esp:
cmpl $__PAGE_OFFSET-3,%ebp
jae syscall_fault
1: movl (%ebp),%ebp
+ movl %ebp,PT_EBP(%esp)
.section __ex_table,"a"
.align 4
.long 1b,syscall_fault
.previous
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
GET_THREAD_INFO(%ebp)
test_tif %ebp
jnz syscall_trace_entry
@@ -414,7 +413,7 @@ syscall_exit:
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
- testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
+ testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
jz no_singlestep
orl $_TIF_SINGLESTEP,TI_flags(%ebp)
no_singlestep:
@@ -430,7 +429,7 @@ restore_all:
# See comments in process.c:copy_thread() for details.
movb PT_OLDSS(%esp), %ah
movb PT_CS(%esp), %al
- andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
@@ -438,7 +437,7 @@ restore_nocheck:
#else
restore_nocheck:
movl PT_EFLAGS(%esp), %eax
- testl $(VM_MASK|NMI_MASK), %eax
+ testl $(X86_EFLAGS_VM|NMI_MASK), %eax
CFI_REMEMBER_STATE
jnz hypervisor_iret
shr $9, %eax # EAX[0] == IRET_EFLAGS.IF
@@ -456,7 +455,7 @@ restore_nocheck_notrace:
irq_return:
INTERRUPT_RETURN
.section .fixup,"ax"
-iret_exc:
+ENTRY(iret_exc)
pushl $0 # no error code
pushl $do_iret_error
jmp error_code
@@ -560,7 +559,7 @@ work_resched:
work_notifysig: # deal with pending signals and
# notify-resume requests
#ifdef CONFIG_VM86
- testl $VM_MASK, PT_EFLAGS(%esp)
+ testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
movl %esp, %eax
jne work_notifysig_v86 # returning to kernel-space or
# vm86-space
@@ -617,9 +616,6 @@ END(syscall_exit_work)
RING0_INT_FRAME # can't unwind into user space anyway
syscall_fault:
- pushl %eax # save orig_eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
GET_THREAD_INFO(%ebp)
movl $-EFAULT,PT_EAX(%esp)
jmp resume_userspace
--- head-2010-05-25.orig/arch/x86/kernel/entry_64-xen.S 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/entry_64-xen.S 2010-03-24 15:12:36.000000000 +0100
@@ -331,19 +331,17 @@ badsys:
/* Do syscall tracing */
tracesys:
SAVE_REST
- movq $-ENOSYS,RAX(%rsp)
+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
FIXUP_TOP_OF_STACK %rdi
movq %rsp,%rdi
call syscall_trace_enter
LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
cmpq $__NR_syscall_max,%rax
- movq $-ENOSYS,%rcx
- cmova %rcx,%rax
- ja 1f
+ ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
-1: movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX-ARGOFFSET(%rsp)
/* Use IRET because user could have changed frame */
/*
--- head-2010-05-25.orig/arch/x86/kernel/head64-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/head64-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -17,6 +17,7 @@
#include <linux/string.h>
#include <linux/percpu.h>
#include <linux/start_kernel.h>
+#include <linux/io.h>
#include <linux/module.h>
#include <asm/processor.h>
@@ -29,6 +30,7 @@
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/e820.h>
+#include <asm/bios_ebda.h>
unsigned long start_pfn;
@@ -75,34 +77,75 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
unsigned int machine_to_phys_order;
EXPORT_SYMBOL(machine_to_phys_order);
-#define EBDA_ADDR_POINTER 0x40E
+#define BIOS_LOWMEM_KILOBYTES 0x413
-static __init void reserve_ebda(void)
+/*
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
+ */
+static void __init reserve_ebda_region(void)
{
#ifndef CONFIG_XEN
- unsigned ebda_addr, ebda_size;
+ unsigned int lowmem, ebda_addr;
- /*
- * there is a real-mode segmented pointer pointing to the
- * 4K EBDA area at 0x40E
- */
- ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
- ebda_addr <<= 4;
-
- if (!ebda_addr)
+ /* To determine the position of the EBDA and the */
+ /* end of conventional memory, we need to look at */
+ /* the BIOS data area. In a paravirtual environment */
+ /* that area is absent. We'll just have to assume */
+ /* that the paravirt case can handle memory setup */
+ /* correctly, without our help. */
+ if (paravirt_enabled())
return;
- ebda_size = *(unsigned short *)__va(ebda_addr);
+ /* end of low (conventional) memory */
+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+ lowmem <<= 10;
+
+ /* start of EBDA area */
+ ebda_addr = get_bios_ebda();
+
+ /* Fixup: bios puts an EBDA in the top 64K segment */
+ /* of conventional memory, but does not adjust lowmem. */
+ if ((lowmem - ebda_addr) <= 0x10000)
+ lowmem = ebda_addr;
+
+ /* Fixup: bios does not report an EBDA at all. */
+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+ lowmem = 0x9f000;
+
+ /* Paranoia: should never happen, but... */
+ if ((lowmem == 0) || (lowmem >= 0x100000))
+ lowmem = 0x9f000;
- /* Round EBDA up to pages */
- if (ebda_size == 0)
- ebda_size = 1;
- ebda_size <<= 10;
- ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
- if (ebda_size > 64*1024)
- ebda_size = 64*1024;
+ /* reserve all memory between lowmem and the 1MB mark */
+ reserve_early(lowmem, 0x100000, "BIOS reserved");
+#endif
+}
- reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
+static void __init reserve_setup_data(void)
+{
+#ifndef CONFIG_XEN
+ struct setup_data *data;
+ unsigned long pa_data;
+ char buf[32];
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, sizeof(*data));
+ sprintf(buf, "setup data %x", data->type);
+ reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ pa_data = data->next;
+ early_iounmap(data, sizeof(*data));
+ }
#endif
}
@@ -112,6 +155,19 @@ void __init x86_64_start_kernel(char * r
unsigned long machine_to_phys_nr_ents;
int i;
+ /*
+ * Build-time sanity checks on the kernel image and module
+ * area mappings. (these are purely build-time and produce no code)
+ */
+ BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START);
+ BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE);
+ BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
+ BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0);
+ BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
+ BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+ BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+ (__START_KERNEL & PGDIR_MASK)));
+
xen_setup_features();
xen_start_info = (struct start_info *)real_mode_data;
@@ -140,7 +196,7 @@ void __init x86_64_start_kernel(char * r
/* Cleanup the over mapped high alias */
cleanup_highmap();
- for (i = 0; i < IDT_ENTRIES; i++) {
+ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
#ifdef CONFIG_EARLY_PRINTK
set_intr_gate(i, &early_idt_handlers[i]);
#else
@@ -163,7 +219,8 @@ void __init x86_64_start_kernel(char * r
reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
start_pfn << PAGE_SHIFT, "Xen provided");
- reserve_ebda();
+ reserve_ebda_region();
+ reserve_setup_data();
/*
* At this point everything still needed from the boot loader
--- head-2010-05-25.orig/arch/x86/kernel/head_32-xen.S 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/head_32-xen.S 2010-03-24 15:12:36.000000000 +0100
@@ -69,7 +69,7 @@ ENTRY(startup_32)
cld # gcc2 wants the direction flag cleared at all times
pushl $0 # fake return address for unwinder
- jmp start_kernel
+ jmp i386_start_kernel
#define HYPERCALL_PAGE_OFFSET 0x1000
.org HYPERCALL_PAGE_OFFSET
--- head-2010-05-25.orig/arch/x86/kernel/io_apic_32-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/io_apic_32-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -88,6 +88,16 @@ int sis_apic_bug = -1;
*/
int nr_ioapic_registers[MAX_IO_APICS];
+/* I/O APIC entries */
+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+int nr_ioapics;
+
+/* MP IRQ source entries */
+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
static int disable_timer_pin_1 __initdata;
/*
@@ -863,10 +873,7 @@ static int __init find_isa_irq_pin(int i
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].mpc_srcbus;
- if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_MCA
- ) &&
+ if (test_bit(lbus, mp_bus_not_pci) &&
(mp_irqs[i].mpc_irqtype == type) &&
(mp_irqs[i].mpc_srcbusirq == irq))
@@ -882,10 +889,7 @@ static int __init find_isa_irq_apic(int
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].mpc_srcbus;
- if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_MCA
- ) &&
+ if (test_bit(lbus, mp_bus_not_pci) &&
(mp_irqs[i].mpc_irqtype == type) &&
(mp_irqs[i].mpc_srcbusirq == irq))
break;
@@ -926,7 +930,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
break;
- if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
+ if (!test_bit(lbus, mp_bus_not_pci) &&
!mp_irqs[i].mpc_irqtype &&
(bus == lbus) &&
(slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
@@ -977,6 +981,7 @@ void __init setup_ioapic_dest(void)
#endif /* !CONFIG_XEN */
#endif
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
/*
* EISA Edge/Level control register, ELCR
*/
@@ -990,6 +995,13 @@ static int EISA_ELCR(unsigned int irq)
"Broken MPtable reports ISA irq %d\n", irq);
return 0;
}
+#endif
+
+/* ISA interrupts are always polarity zero edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx) (0)
+#define default_ISA_polarity(idx) (0)
/* EISA interrupts are always polarity zero and can be edge or level
* trigger depending on the ELCR value. If an interrupt is listed as
@@ -997,13 +1009,7 @@ static int EISA_ELCR(unsigned int irq)
* be read in from the ELCR */
#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
-#define default_EISA_polarity(idx) (0)
-
-/* ISA interrupts are always polarity zero edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx) (0)
-#define default_ISA_polarity(idx) (0)
+#define default_EISA_polarity(idx) default_ISA_polarity(idx)
/* PCI interrupts are always polarity one level triggered,
* when listed as conforming in the MP table. */
@@ -1015,7 +1021,7 @@ static int EISA_ELCR(unsigned int irq)
* when listed as conforming in the MP table. */
#define default_MCA_trigger(idx) (1)
-#define default_MCA_polarity(idx) (0)
+#define default_MCA_polarity(idx) default_ISA_polarity(idx)
static int MPBIOS_polarity(int idx)
{
@@ -1029,35 +1035,9 @@ static int MPBIOS_polarity(int idx)
{
case 0: /* conforms, ie. bus-type dependent polarity */
{
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- {
- polarity = default_ISA_polarity(idx);
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- polarity = default_EISA_polarity(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- polarity = default_PCI_polarity(idx);
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- polarity = default_MCA_polarity(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- }
+ polarity = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_polarity(idx):
+ default_PCI_polarity(idx);
break;
}
case 1: /* high active */
@@ -1098,11 +1078,15 @@ static int MPBIOS_trigger(int idx)
{
case 0: /* conforms, ie. bus-type dependent */
{
+ trigger = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_trigger(idx):
+ default_PCI_trigger(idx);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
switch (mp_bus_id_to_type[bus])
{
case MP_BUS_ISA: /* ISA pin */
{
- trigger = default_ISA_trigger(idx);
+ /* set before the switch */
break;
}
case MP_BUS_EISA: /* EISA pin */
@@ -1112,7 +1096,7 @@ static int MPBIOS_trigger(int idx)
}
case MP_BUS_PCI: /* PCI pin */
{
- trigger = default_PCI_trigger(idx);
+ /* set before the switch */
break;
}
case MP_BUS_MCA: /* MCA pin */
@@ -1127,6 +1111,7 @@ static int MPBIOS_trigger(int idx)
break;
}
}
+#endif
break;
}
case 1: /* edge */
@@ -1176,39 +1161,22 @@ static int pin_2_irq(int idx, int apic,
if (mp_irqs[idx].mpc_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- case MP_BUS_EISA:
- case MP_BUS_MCA:
- {
- irq = mp_irqs[idx].mpc_srcbusirq;
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /*
- * PCI IRQs are mapped in order
- */
- i = irq = 0;
- while (i < apic)
- irq += nr_ioapic_registers[i++];
- irq += pin;
-
- /*
- * For MPS mode, so far only needed by ES7000 platform
- */
- if (ioapic_renumber_irq)
- irq = ioapic_renumber_irq(apic, irq);
+ if (test_bit(bus, mp_bus_not_pci))
+ irq = mp_irqs[idx].mpc_srcbusirq;
+ else {
+ /*
+ * PCI IRQs are mapped in order
+ */
+ i = irq = 0;
+ while (i < apic)
+ irq += nr_ioapic_registers[i++];
+ irq += pin;
- break;
- }
- default:
- {
- printk(KERN_ERR "unknown bus type %d.\n",bus);
- irq = 0;
- break;
- }
+ /*
+ * For MPS mode, so far only needed by ES7000 platform
+ */
+ if (ioapic_renumber_irq)
+ irq = ioapic_renumber_irq(apic, irq);
}
/*
@@ -1314,7 +1282,6 @@ static void __init setup_IO_APIC_irqs(vo
{
struct IO_APIC_route_entry entry;
int apic, pin, idx, irq, first_notcon = 1, vector;
- unsigned long flags;
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
@@ -1380,9 +1347,7 @@ static void __init setup_IO_APIC_irqs(vo
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
- spin_lock_irqsave(&ioapic_lock, flags);
- __ioapic_write_entry(apic, pin, entry);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry);
}
}
@@ -1577,8 +1542,8 @@ void /*__init*/ print_local_APIC(void *
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
- v = apic_read(APIC_ID);
- printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
+ GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
printk(KERN_INFO "... APIC VERSION: %08x\n", v);
ver = GET_APIC_VERSION(v);
@@ -1791,7 +1756,7 @@ void disable_IO_APIC(void)
entry.delivery_mode = dest_ExtINT; /* ExtInt */
entry.vector = 0;
entry.dest.physical.physical_dest =
- GET_APIC_ID(apic_read(APIC_ID));
+ GET_APIC_ID(read_apic_id());
/*
* Add it to the IO-APIC irq-routing table:
@@ -2090,8 +2055,7 @@ static inline void init_IO_APIC_traps(vo
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
for (irq = 0; irq < NR_IRQS ; irq++) {
- int tmp = irq;
- if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
+ if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
@@ -2166,7 +2130,7 @@ static void __init setup_nmi(void)
* cycles as some i82489DX-based boards have glue logic that keeps the
* 8259A interrupt line asserted until INTA. --macro
*/
-static inline void unlock_ExtINT_logic(void)
+static inline void __init unlock_ExtINT_logic(void)
{
int apic, pin, i;
struct IO_APIC_route_entry entry0, entry1;
@@ -2218,8 +2182,6 @@ static inline void unlock_ExtINT_logic(v
ioapic_write_entry(apic, pin, entry0);
}
-int timer_uses_ioapic_pin_0;
-
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
@@ -2259,9 +2221,6 @@ static inline void __init check_timer(vo
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
- if (pin1 == 0)
- timer_uses_ioapic_pin_0 = 1;
-
printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
vector, apic1, pin1, apic2, pin2);
@@ -2555,6 +2514,7 @@ void destroy_irq(unsigned int irq)
dynamic_irq_cleanup(irq);
spin_lock_irqsave(&vector_lock, flags);
+ clear_bit(irq_vector[irq], used_vectors);
irq_vector[irq] = 0;
spin_unlock_irqrestore(&vector_lock, flags);
}
@@ -2871,7 +2831,6 @@ int __init io_apic_get_redir_entries (in
int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
{
struct IO_APIC_route_entry entry;
- unsigned long flags;
if (!IO_APIC_IRQ(irq)) {
printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
@@ -2912,9 +2871,7 @@ int io_apic_set_pci_routing (int ioapic,
if (!ioapic && (irq < 16))
disable_8259A_irq(irq);
- spin_lock_irqsave(&ioapic_lock, flags);
- __ioapic_write_entry(ioapic, pin, entry);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(ioapic, pin, entry);
return 0;
}
--- head-2010-05-25.orig/arch/x86/kernel/io_apic_64-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/io_apic_64-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -43,13 +43,15 @@
#include <asm/smp.h>
#include <asm/desc.h>
#include <asm/proto.h>
-#include <asm/mach_apic.h>
#include <asm/acpi.h>
#include <asm/dma.h>
#include <asm/nmi.h>
#include <asm/msidef.h>
#include <asm/hypertransport.h>
+#include <mach_ipi.h>
+#include <mach_apic.h>
+
struct irq_cfg {
#ifndef CONFIG_XEN
cpumask_t domain;
@@ -101,6 +103,16 @@ DEFINE_SPINLOCK(vector_lock);
*/
int nr_ioapic_registers[MAX_IO_APICS];
+/* I/O APIC entries */
+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+int nr_ioapics;
+
+/* MP IRQ source entries */
+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
/*
* Rough estimation of how many shared IRQs there are, can
* be changed anytime.
@@ -181,11 +193,10 @@ static inline void io_apic_modify(unsign
writel(value, &io_apic->data);
}
-static int io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(unsigned int irq)
{
struct irq_pin_list *entry;
unsigned long flags;
- int pending = 0;
spin_lock_irqsave(&ioapic_lock, flags);
entry = irq_2_pin + irq;
@@ -198,13 +209,17 @@ static int io_apic_level_ack_pending(uns
break;
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
- pending |= (reg >> 14) & 1;
+ if ((reg >> 14) & 1) {
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ return true;
+ }
if (!entry->next)
break;
entry = irq_2_pin + entry->next;
}
spin_unlock_irqrestore(&ioapic_lock, flags);
- return pending;
+
+ return false;
}
#endif
@@ -762,7 +777,7 @@ static void __clear_irq_vector(int irq)
per_cpu(vector_irq, cpu)[vector] = -1;
cfg->vector = 0;
- cfg->domain = CPU_MASK_NONE;
+ cpus_clear(cfg->domain);
}
void __setup_vector_irq(int cpu)
@@ -899,9 +914,8 @@ static void __init setup_IO_APIC_irqs(vo
static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
{
struct IO_APIC_route_entry entry;
- unsigned long flags;
- memset(&entry,0,sizeof(entry));
+ memset(&entry, 0, sizeof(entry));
disable_8259A_irq(0);
@@ -929,10 +943,7 @@ static void __init setup_ExtINT_IRQ0_pin
/*
* Add it to the IO-APIC irq-routing table:
*/
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
- io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry);
enable_8259A_irq(0);
}
@@ -1061,8 +1072,7 @@ void __apicdebuginit print_local_APIC(vo
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
- v = apic_read(APIC_ID);
- printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
printk(KERN_INFO "... APIC VERSION: %08x\n", v);
ver = GET_APIC_VERSION(v);
@@ -1260,7 +1270,7 @@ void disable_IO_APIC(void)
entry.dest_mode = 0; /* Physical */
entry.delivery_mode = dest_ExtINT; /* ExtInt */
entry.vector = 0;
- entry.dest = GET_APIC_ID(apic_read(APIC_ID));
+ entry.dest = GET_APIC_ID(read_apic_id());
/*
* Add it to the IO-APIC irq-routing table:
@@ -1353,9 +1363,7 @@ static int ioapic_retrigger_irq(unsigned
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- cpus_clear(mask);
- cpu_set(first_cpu(cfg->domain), mask);
-
+ mask = cpumask_of_cpu(first_cpu(cfg->domain));
send_IPI_mask(mask, cfg->vector);
spin_unlock_irqrestore(&vector_lock, flags);
@@ -1519,8 +1527,7 @@ static inline void init_IO_APIC_traps(vo
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
for (irq = 0; irq < NR_IRQS ; irq++) {
- int tmp = irq;
- if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) {
+ if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
@@ -1597,22 +1604,19 @@ static void __init setup_nmi(void)
* cycles as some i82489DX-based boards have glue logic that keeps the
* 8259A interrupt line asserted until INTA. --macro
*/
-static inline void unlock_ExtINT_logic(void)
+static inline void __init unlock_ExtINT_logic(void)
{
int apic, pin, i;
struct IO_APIC_route_entry entry0, entry1;
unsigned char save_control, save_freq_select;
- unsigned long flags;
pin = find_isa_irq_pin(8, mp_INT);
apic = find_isa_irq_apic(8, mp_INT);
if (pin == -1)
return;
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
- *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry0 = ioapic_read_entry(apic, pin);
+
clear_IO_APIC_pin(apic, pin);
memset(&entry1, 0, sizeof(entry1));
@@ -1625,10 +1629,7 @@ static inline void unlock_ExtINT_logic(v
entry1.trigger = 0;
entry1.vector = 0;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry1);
save_control = CMOS_READ(RTC_CONTROL);
save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
@@ -1647,10 +1648,7 @@ static inline void unlock_ExtINT_logic(v
CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
clear_IO_APIC_pin(apic, pin);
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry0);
}
/*
@@ -2327,7 +2325,6 @@ static struct resource * __init ioapic_s
res = (void *)mem;
if (mem != NULL) {
- memset(mem, 0, n);
mem += sizeof(struct resource) * nr_ioapics;
for (i = 0; i < nr_ioapics; i++) {
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/kernel/apic/ipi-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -0,0 +1,232 @@
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/proto.h>
+
+#ifdef CONFIG_X86_32
+#ifndef CONFIG_XEN
+#include <mach_apic.h>
+/*
+ * the following functions deal with sending IPIs between CPUs.
+ *
+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
+ */
+
+static inline int __prepare_ICR(unsigned int shortcut, int vector)
+{
+ unsigned int icr = shortcut | APIC_DEST_LOGICAL;
+
+ switch (vector) {
+ default:
+ icr |= APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr |= APIC_DM_NMI;
+ break;
+ }
+ return icr;
+}
+
+static inline int __prepare_ICR2(unsigned int mask)
+{
+ return SET_APIC_DEST_FIELD(mask);
+}
+#else
+#include <xen/evtchn.h>
+
+DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
+
+static inline void __send_IPI_one(unsigned int cpu, int vector)
+{
+ int irq = per_cpu(ipi_to_irq, cpu)[vector];
+ BUG_ON(irq < 0);
+ notify_remote_via_irq(irq);
+}
+#endif
+
+void __send_IPI_shortcut(unsigned int shortcut, int vector)
+{
+#ifndef CONFIG_XEN
+ /*
+ * Subtle. In the case of the 'never do double writes' workaround
+ * we have to lock out interrupts to be safe. As we don't care
+ * of the value read we use an atomic rmw access to avoid costly
+ * cli/sti. Otherwise we use an even cheaper single atomic write
+ * to the APIC.
+ */
+ unsigned int cfg;
+
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+
+ /*
+ * No need to touch the target chip field
+ */
+ cfg = __prepare_ICR(shortcut, vector);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write_around(APIC_ICR, cfg);
+#else
+ int cpu;
+
+ switch (shortcut) {
+ case APIC_DEST_SELF:
+ __send_IPI_one(smp_processor_id(), vector);
+ break;
+ case APIC_DEST_ALLBUT:
+ for_each_online_cpu(cpu)
+ if (cpu != smp_processor_id())
+ __send_IPI_one(cpu, vector);
+ break;
+ default:
+ printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
+ vector);
+ break;
+ }
+#endif
+}
+
+void send_IPI_self(int vector)
+{
+ __send_IPI_shortcut(APIC_DEST_SELF, vector);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+static inline void __send_IPI_dest_field(unsigned long mask, int vector)
+{
+ unsigned long cfg;
+
+ /*
+ * Wait for idle.
+ */
+ if (unlikely(vector == NMI_VECTOR))
+ safe_apic_wait_icr_idle();
+ else
+ apic_wait_icr_idle();
+
+ /*
+ * prepare target chip field
+ */
+ cfg = __prepare_ICR2(mask);
+ apic_write_around(APIC_ICR2, cfg);
+
+ /*
+ * program the ICR
+ */
+ cfg = __prepare_ICR(0, vector);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write_around(APIC_ICR, cfg);
+}
+#endif
+
+/*
+ * This is only used on smaller machines.
+ */
+void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+{
+#ifndef CONFIG_XEN
+ unsigned long mask = cpus_addr(cpumask)[0];
+#else
+ cpumask_t mask;
+ unsigned int cpu;
+#endif
+ unsigned long flags;
+
+ local_irq_save(flags);
+#ifndef CONFIG_XEN
+ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
+ __send_IPI_dest_field(mask, vector);
+#else
+ cpus_andnot(mask, cpumask, cpu_online_map);
+ WARN_ON(!cpus_empty(mask));
+ for_each_online_cpu(cpu)
+ if (cpu_isset(cpu, cpumask))
+ __send_IPI_one(cpu, vector);
+#endif
+ local_irq_restore(flags);
+}
+
+void send_IPI_mask_sequence(cpumask_t mask, int vector)
+{
+#ifndef CONFIG_XEN
+ unsigned long flags;
+ unsigned int query_cpu;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicasts to each CPU instead. This
+ * should be modified to do 1 message per cluster ID - mbligh
+ */
+
+ local_irq_save(flags);
+ for_each_possible_cpu(query_cpu) {
+ if (cpu_isset(query_cpu, mask)) {
+ __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
+ vector);
+ }
+ }
+ local_irq_restore(flags);
+#else
+ send_IPI_mask_bitmask(mask, vector);
+#endif
+}
+
+/* must come after the send_IPI functions above for inlining */
+#include <mach_ipi.h>
+
+#ifndef CONFIG_XEN
+static int convert_apicid_to_cpu(int apic_id)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
+ return i;
+ }
+ return -1;
+}
+
+int safe_smp_processor_id(void)
+{
+ int apicid, cpuid;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return 0;
+
+ apicid = hard_smp_processor_id();
+ if (apicid == BAD_APICID)
+ return 0;
+
+ cpuid = convert_apicid_to_cpu(apicid);
+
+ return cpuid >= 0 ? cpuid : 0;
+}
+#endif
+#endif
--- head-2010-05-25.orig/arch/x86/kernel/machine_kexec_64.c 2010-04-15 09:56:14.000000000 +0200
+++ head-2010-05-25/arch/x86/kernel/machine_kexec_64.c 2010-04-15 10:03:05.000000000 +0200
@@ -114,8 +114,6 @@ int __init machine_kexec_setup_resources
return 0;
}
-void machine_kexec_register_resources(struct resource *res) { ; }
-
#else /* CONFIG_XEN */
#define x__pmd(x) __pmd(x)
--- head-2010-05-25.orig/arch/x86/kernel/microcode-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/microcode-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -162,7 +162,7 @@ static int request_microcode(void)
c->x86, c->x86_model, c->x86_mask);
error = request_firmware(&firmware, name, &microcode_pdev->dev);
if (error) {
- pr_debug("ucode data file %s load failed\n", name);
+ pr_debug("microcode: ucode data file %s load failed\n", name);
return error;
}
--- head-2010-05-25.orig/arch/x86/kernel/mmconf-fam10h_64.c 2010-05-25 09:12:09.000000000 +0200
+++ head-2010-05-25/arch/x86/kernel/mmconf-fam10h_64.c 2010-03-24 15:12:36.000000000 +0100
@@ -215,6 +215,16 @@ void __cpuinit fam10h_check_enable_mmcfg
val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
FAM10H_MMIO_CONF_ENABLE;
wrmsrl(address, val);
+
+#ifdef CONFIG_XEN
+ {
+ u64 val2;
+
+ rdmsrl(address, val2);
+ if (val2 != val)
+ pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
+ }
+#endif
}
static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/kernel/mpparse-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -0,0 +1,1101 @@
+/*
+ * Intel Multiprocessor Specification 1.1 and 1.4
+ * compliant MP-table parsing routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/module.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/pgalloc.h>
+#include <asm/io_apic.h>
+#include <asm/proto.h>
+#include <asm/acpi.h>
+#include <asm/bios_ebda.h>
+
+#include <mach_apic.h>
+#ifdef CONFIG_X86_32
+#include <mach_apicdef.h>
+#include <mach_mpparse.h>
+#endif
+
+/* Have we found an MP table */
+int smp_found_config;
+
+/*
+ * Various Linux-internal data structures created from the
+ * MP-table.
+ */
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
+
+static int mp_current_pci_id;
+
+int pic_mode;
+
+/*
+ * Intel MP BIOS table parsing routines:
+ */
+
+/*
+ * Checksum an MP configuration block.
+ */
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+ int sum = 0;
+
+ while (len--)
+ sum += *mp++;
+
+ return sum & 0xFF;
+}
+
+#ifdef CONFIG_X86_NUMAQ
+/*
+ * Have to match translation table entries to main table entries by counter
+ * hence the mpc_record variable .... can't see a less disgusting way of
+ * doing this ....
+ */
+
+static int mpc_record;
+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
+ __cpuinitdata;
+#endif
+
+static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
+{
+#ifndef CONFIG_XEN
+ int apicid;
+ char *bootup_cpu = "";
+
+ if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+ disabled_cpus++;
+ return;
+ }
+#ifdef CONFIG_X86_NUMAQ
+ apicid = mpc_apic_id(m, translation_table[mpc_record]);
+#else
+ apicid = m->mpc_apicid;
+#endif
+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+ bootup_cpu = " (Bootup-CPU)";
+ boot_cpu_physical_apicid = m->mpc_apicid;
+ }
+
+ printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
+ generic_processor_info(apicid, m->mpc_apicver);
+#else /* CONFIG_XEN */
+ num_processors++;
+#endif
+}
+
+static void __init MP_bus_info(struct mpc_config_bus *m)
+{
+ char str[7];
+
+ memcpy(str, m->mpc_bustype, 6);
+ str[6] = 0;
+
+#ifdef CONFIG_X86_NUMAQ
+ mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+#else
+ Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
+#endif
+
+#if MAX_MP_BUSSES < 256
+ if (m->mpc_busid >= MAX_MP_BUSSES) {
+ printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
+ " is too large, max. supported is %d\n",
+ m->mpc_busid, str, MAX_MP_BUSSES - 1);
+ return;
+ }
+#endif
+
+ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
+ set_bit(m->mpc_busid, mp_bus_not_pci);
+#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
+#endif
+ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
+#ifdef CONFIG_X86_NUMAQ
+ mpc_oem_pci_bus(m, translation_table[mpc_record]);
+#endif
+ clear_bit(m->mpc_busid, mp_bus_not_pci);
+ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
+ mp_current_pci_id++;
+#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+ } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
+#endif
+ } else
+ printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
+}
+
+#ifdef CONFIG_X86_IO_APIC
+
+static int bad_ioapic(unsigned long address)
+{
+ if (nr_ioapics >= MAX_IO_APICS) {
+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+ }
+ if (!address) {
+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+ " found in table, skipping!\n");
+ return 1;
+ }
+ return 0;
+}
+
+static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
+{
+ if (!(m->mpc_flags & MPC_APIC_USABLE))
+ return;
+
+ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
+ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
+
+ if (bad_ioapic(m->mpc_apicaddr))
+ return;
+
+ mp_ioapics[nr_ioapics] = *m;
+ nr_ioapics++;
+}
+
+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+{
+ mp_irqs[mp_irq_entries] = *m;
+ Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ m->mpc_irqtype, m->mpc_irqflag & 3,
+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
+ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
+}
+
+#endif
+
+static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
+{
+ Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+ m->mpc_irqtype, m->mpc_irqflag & 3,
+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+}
+
+#ifdef CONFIG_X86_NUMAQ
+static void __init MP_translation_info(struct mpc_config_translation *m)
+{
+ printk(KERN_INFO
+ "Translation: record %d, type %d, quad %d, global %d, local %d\n",
+ mpc_record, m->trans_type, m->trans_quad, m->trans_global,
+ m->trans_local);
+
+ if (mpc_record >= MAX_MPC_ENTRY)
+ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
+ else
+ translation_table[mpc_record] = m; /* stash this for later */
+ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
+ node_set_online(m->trans_quad);
+}
+
+/*
+ * Read/parse the MPC oem tables
+ */
+
+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
+ unsigned short oemsize)
+{
+ int count = sizeof(*oemtable); /* the header size */
+ unsigned char *oemptr = ((unsigned char *)oemtable) + count;
+
+ mpc_record = 0;
+ printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
+ oemtable);
+ if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
+ printk(KERN_WARNING
+ "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+ oemtable->oem_signature[0], oemtable->oem_signature[1],
+ oemtable->oem_signature[2], oemtable->oem_signature[3]);
+ return;
+ }
+ if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
+ printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
+ return;
+ }
+ while (count < oemtable->oem_length) {
+ switch (*oemptr) {
+ case MP_TRANSLATION:
+ {
+ struct mpc_config_translation *m =
+ (struct mpc_config_translation *)oemptr;
+ MP_translation_info(m);
+ oemptr += sizeof(*m);
+ count += sizeof(*m);
+ ++mpc_record;
+ break;
+ }
+ default:
+ {
+ printk(KERN_WARNING
+ "Unrecognised OEM table entry type! - %d\n",
+ (int)*oemptr);
+ return;
+ }
+ }
+ }
+}
+
+static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid)
+{
+ if (strncmp(oem, "IBM NUMA", 8))
+ printk("Warning! May not be a NUMA-Q system!\n");
+ if (mpc->mpc_oemptr)
+ smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
+ mpc->mpc_oemsize);
+}
+#endif /* CONFIG_X86_NUMAQ */
+
+/*
+ * Read/parse the MPC
+ */
+
+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+{
+ char str[16];
+ char oem[10];
+ int count = sizeof(*mpc);
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+ if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
+ printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
+ mpc->mpc_signature[0], mpc->mpc_signature[1],
+ mpc->mpc_signature[2], mpc->mpc_signature[3]);
+ return 0;
+ }
+ if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) {
+ printk(KERN_ERR "MPTABLE: checksum error!\n");
+ return 0;
+ }
+ if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) {
+ printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
+ mpc->mpc_spec);
+ return 0;
+ }
+ if (!mpc->mpc_lapic) {
+ printk(KERN_ERR "MPTABLE: null local APIC address!\n");
+ return 0;
+ }
+ memcpy(oem, mpc->mpc_oem, 8);
+ oem[8] = 0;
+ printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
+
+ memcpy(str, mpc->mpc_productid, 12);
+ str[12] = 0;
+ printk("Product ID: %s ", str);
+
+#ifdef CONFIG_X86_32
+ mps_oem_check(mpc, oem, str);
+#endif
+ printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
+
+ printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
+
+ /* save the local APIC address, it might be non-default */
+ if (!acpi_lapic)
+ mp_lapic_addr = mpc->mpc_lapic;
+
+ if (early)
+ return 1;
+
+ /*
+ * Now process the configuration blocks.
+ */
+#ifdef CONFIG_X86_NUMAQ
+ mpc_record = 0;
+#endif
+ while (count < mpc->mpc_length) {
+ switch (*mpt) {
+ case MP_PROCESSOR:
+ {
+ struct mpc_config_processor *m =
+ (struct mpc_config_processor *)mpt;
+ /* ACPI may have already provided this data */
+ if (!acpi_lapic)
+ MP_processor_info(m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_BUS:
+ {
+ struct mpc_config_bus *m =
+ (struct mpc_config_bus *)mpt;
+ MP_bus_info(m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_IOAPIC:
+ {
+#ifdef CONFIG_X86_IO_APIC
+ struct mpc_config_ioapic *m =
+ (struct mpc_config_ioapic *)mpt;
+ MP_ioapic_info(m);
+#endif
+ mpt += sizeof(struct mpc_config_ioapic);
+ count += sizeof(struct mpc_config_ioapic);
+ break;
+ }
+ case MP_INTSRC:
+ {
+#ifdef CONFIG_X86_IO_APIC
+ struct mpc_config_intsrc *m =
+ (struct mpc_config_intsrc *)mpt;
+
+ MP_intsrc_info(m);
+#endif
+ mpt += sizeof(struct mpc_config_intsrc);
+ count += sizeof(struct mpc_config_intsrc);
+ break;
+ }
+ case MP_LINTSRC:
+ {
+ struct mpc_config_lintsrc *m =
+ (struct mpc_config_lintsrc *)mpt;
+ MP_lintsrc_info(m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ default:
+ /* wrong mptable */
+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
+ printk(KERN_ERR "type %x\n", *mpt);
+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
+ 1, mpc, mpc->mpc_length, 1);
+ count = mpc->mpc_length;
+ break;
+ }
+#ifdef CONFIG_X86_NUMAQ
+ ++mpc_record;
+#endif
+ }
+ setup_apic_routing();
+ if (!num_processors)
+ printk(KERN_ERR "MPTABLE: no processors registered!\n");
+ return num_processors;
+}
+
+#ifdef CONFIG_X86_IO_APIC
+
+static int __init ELCR_trigger(unsigned int irq)
+{
+ unsigned int port;
+
+ port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+}
+
+static void __init construct_default_ioirq_mptable(int mpc_default_type)
+{
+ struct mpc_config_intsrc intsrc;
+ int i;
+ int ELCR_fallback = 0;
+
+ intsrc.mpc_type = MP_INTSRC;
+ intsrc.mpc_irqflag = 0; /* conforming */
+ intsrc.mpc_srcbus = 0;
+ intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+
+ intsrc.mpc_irqtype = mp_INT;
+
+ /*
+ * If true, we have an ISA/PCI system with no IRQ entries
+ * in the MP table. To prevent the PCI interrupts from being set up
+ * incorrectly, we try to use the ELCR. The sanity check to see if
+ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
+ * never be level sensitive, so we simply see if the ELCR agrees.
+ * If it does, we assume it's valid.
+ */
+ if (mpc_default_type == 5) {
+ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
+ "falling back to ELCR\n");
+
+ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
+ ELCR_trigger(13))
+ printk(KERN_ERR "ELCR contains invalid data... "
+ "not using ELCR\n");
+ else {
+ printk(KERN_INFO
+ "Using ELCR to identify PCI interrupts\n");
+ ELCR_fallback = 1;
+ }
+ }
+
+ for (i = 0; i < 16; i++) {
+ switch (mpc_default_type) {
+ case 2:
+ if (i == 0 || i == 13)
+ continue; /* IRQ0 & IRQ13 not connected */
+ /* fall through */
+ default:
+ if (i == 2)
+ continue; /* IRQ2 is never connected */
+ }
+
+ if (ELCR_fallback) {
+ /*
+ * If the ELCR indicates a level-sensitive interrupt, we
+ * copy that information over to the MP table in the
+ * irqflag field (level sensitive, active high polarity).
+ */
+ if (ELCR_trigger(i))
+ intsrc.mpc_irqflag = 13;
+ else
+ intsrc.mpc_irqflag = 0;
+ }
+
+ intsrc.mpc_srcbusirq = i;
+ intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
+ MP_intsrc_info(&intsrc);
+ }
+
+ intsrc.mpc_irqtype = mp_ExtINT;
+ intsrc.mpc_srcbusirq = 0;
+ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
+ MP_intsrc_info(&intsrc);
+}
+
+#endif
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+ struct mpc_config_processor processor;
+ struct mpc_config_bus bus;
+#ifdef CONFIG_X86_IO_APIC
+ struct mpc_config_ioapic ioapic;
+#endif
+ struct mpc_config_lintsrc lintsrc;
+ int linttypes[2] = { mp_ExtINT, mp_NMI };
+ int i;
+
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+ /*
+ * 2 CPUs, numbered 0 & 1.
+ */
+ processor.mpc_type = MP_PROCESSOR;
+ /* Either an integrated APIC or a discrete 82489DX. */
+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ processor.mpc_cpuflag = CPU_ENABLED;
+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+ processor.mpc_reserved[0] = 0;
+ processor.mpc_reserved[1] = 0;
+ for (i = 0; i < 2; i++) {
+ processor.mpc_apicid = i;
+ MP_processor_info(&processor);
+ }
+
+ bus.mpc_type = MP_BUS;
+ bus.mpc_busid = 0;
+ switch (mpc_default_type) {
+ default:
+ printk(KERN_ERR "???\nUnknown standard configuration %d\n",
+ mpc_default_type);
+ /* fall through */
+ case 1:
+ case 5:
+ memcpy(bus.mpc_bustype, "ISA ", 6);
+ break;
+ case 2:
+ case 6:
+ case 3:
+ memcpy(bus.mpc_bustype, "EISA ", 6);
+ break;
+ case 4:
+ case 7:
+ memcpy(bus.mpc_bustype, "MCA ", 6);
+ }
+ MP_bus_info(&bus);
+ if (mpc_default_type > 4) {
+ bus.mpc_busid = 1;
+ memcpy(bus.mpc_bustype, "PCI ", 6);
+ MP_bus_info(&bus);
+ }
+
+#ifdef CONFIG_X86_IO_APIC
+ ioapic.mpc_type = MP_IOAPIC;
+ ioapic.mpc_apicid = 2;
+ ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ ioapic.mpc_flags = MPC_APIC_USABLE;
+ ioapic.mpc_apicaddr = 0xFEC00000;
+ MP_ioapic_info(&ioapic);
+
+ /*
+ * We set up most of the low 16 IO-APIC pins according to MPS rules.
+ */
+ construct_default_ioirq_mptable(mpc_default_type);
+#endif
+ lintsrc.mpc_type = MP_LINTSRC;
+ lintsrc.mpc_irqflag = 0; /* conforming */
+ lintsrc.mpc_srcbusid = 0;
+ lintsrc.mpc_srcbusirq = 0;
+ lintsrc.mpc_destapic = MP_APIC_ALL;
+ for (i = 0; i < 2; i++) {
+ lintsrc.mpc_irqtype = linttypes[i];
+ lintsrc.mpc_destapiclint = i;
+ MP_lintsrc_info(&lintsrc);
+ }
+}
+
+static struct intel_mp_floating *mpf_found;
+
+/*
+ * Scan the memory blocks for an SMP configuration block.
+ */
+static void __init __get_smp_config(unsigned early)
+{
+ struct intel_mp_floating *mpf = mpf_found;
+
+ if (acpi_lapic && early)
+ return;
+ /*
+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
+ * processors, where MPS only supports physical.
+ */
+ if (acpi_lapic && acpi_ioapic) {
+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
+ "information\n");
+ return;
+ } else if (acpi_lapic)
+ printk(KERN_INFO "Using ACPI for processor (LAPIC) "
+ "configuration information\n");
+
+ printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
+ mpf->mpf_specification);
+#ifdef CONFIG_X86_32
+ if (mpf->mpf_feature2 & (1 << 7)) {
+ printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
+ pic_mode = 1;
+ } else {
+ printk(KERN_INFO " Virtual Wire compatibility mode.\n");
+ pic_mode = 0;
+ }
+#endif
+ /*
+ * Now see if we need to read further.
+ */
+ if (mpf->mpf_feature1 != 0) {
+ if (early) {
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+ return;
+ }
+
+ printk(KERN_INFO "Default MP configuration #%d\n",
+ mpf->mpf_feature1);
+ construct_default_ISA_mptable(mpf->mpf_feature1);
+
+ } else if (mpf->mpf_physptr) {
+
+ /*
+ * Read the physical hardware table. Anything here will
+ * override the defaults.
+ */
+ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
+ smp_found_config = 0;
+ printk(KERN_ERR
+ "BIOS bug, MP table errors detected!...\n");
+ printk(KERN_ERR "... disabling SMP support. "
+ "(tell your hw vendor)\n");
+ return;
+ }
+
+ if (early)
+ return;
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * If there are no explicit MP IRQ entries, then we are
+ * broken. We set up most of the low 16 IO-APIC pins to
+ * ISA defaults and hope it will work.
+ */
+ if (!mp_irq_entries) {
+ struct mpc_config_bus bus;
+
+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
+ "using default mptable. "
+ "(tell your hw vendor)\n");
+
+ bus.mpc_type = MP_BUS;
+ bus.mpc_busid = 0;
+ memcpy(bus.mpc_bustype, "ISA ", 6);
+ MP_bus_info(&bus);
+
+ construct_default_ioirq_mptable(0);
+ }
+#endif
+ } else
+ BUG();
+
+ if (!early)
+ printk(KERN_INFO "Processors: %d\n", num_processors);
+ /*
+ * Only use the first configuration found.
+ */
+}
+
+void __init early_get_smp_config(void)
+{
+ __get_smp_config(1);
+}
+
+void __init get_smp_config(void)
+{
+ __get_smp_config(0);
+}
+
+static int __init smp_scan_config(unsigned long base, unsigned long length,
+ unsigned reserve)
+{
+ unsigned int *bp = isa_bus_to_virt(base);
+ struct intel_mp_floating *mpf;
+
+ Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
+ BUILD_BUG_ON(sizeof(*mpf) != 16);
+
+ while (length > 0) {
+ mpf = (struct intel_mp_floating *)bp;
+ if ((*bp == SMP_MAGIC_IDENT) &&
+ (mpf->mpf_length == 1) &&
+ !mpf_checksum((unsigned char *)bp, 16) &&
+ ((mpf->mpf_specification == 1)
+ || (mpf->mpf_specification == 4))) {
+
+ smp_found_config = 1;
+ mpf_found = mpf;
+#ifdef CONFIG_X86_32
+#ifndef CONFIG_XEN
+ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
+ mpf, virt_to_phys(mpf));
+ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
+ BOOTMEM_DEFAULT);
+ if (mpf->mpf_physptr) {
+ /*
+ * We cannot access to MPC table to compute
+ * table size yet, as only few megabytes from
+ * the bottom is mapped now.
+ * PC-9800's MPC table places on the very last
+ * of physical memory; so that simply reserving
+ * PAGE_SIZE from mpg->mpf_physptr yields BUG()
+ * in reserve_bootmem.
+ */
+ unsigned long size = PAGE_SIZE;
+ unsigned long end = max_low_pfn * PAGE_SIZE;
+ if (mpf->mpf_physptr + size > end)
+ size = end - mpf->mpf_physptr;
+ reserve_bootmem(mpf->mpf_physptr, size,
+ BOOTMEM_DEFAULT);
+ }
+#else
+ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
+ mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
+#endif
+#elif !defined(CONFIG_XEN)
+ if (!reserve)
+ return 1;
+
+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
+ if (mpf->mpf_physptr)
+ reserve_bootmem_generic(mpf->mpf_physptr,
+ PAGE_SIZE);
+#endif
+ return 1;
+ }
+ bp += 4;
+ length -= 16;
+ }
+ return 0;
+}
+
+static void __init __find_smp_config(unsigned reserve)
+{
+#ifndef CONFIG_XEN
+ unsigned int address;
+#endif
+
+ /*
+ * FIXME: Linux assumes you have 640K of base ram..
+ * this continues the error...
+ *
+ * 1) Scan the bottom 1K for a signature
+ * 2) Scan the top 1K of base RAM
+ * 3) Scan the 64K of bios
+ */
+ if (smp_scan_config(0x0, 0x400, reserve) ||
+ smp_scan_config(639 * 0x400, 0x400, reserve) ||
+ smp_scan_config(0xF0000, 0x10000, reserve))
+ return;
+ /*
+ * If it is an SMP machine we should know now, unless the
+ * configuration is in an EISA/MCA bus machine with an
+ * extended bios data area.
+ *
+ * there is a real-mode segmented pointer pointing to the
+ * 4K EBDA area at 0x40E, calculate and scan it here.
+ *
+ * NOTE! There are Linux loaders that will corrupt the EBDA
+ * area, and as such this kind of SMP config may be less
+ * trustworthy, simply because the SMP table may have been
+ * stomped on during early boot. These loaders are buggy and
+ * should be fixed.
+ *
+ * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
+ */
+
+#ifndef CONFIG_XEN
+ address = get_bios_ebda();
+ if (address)
+ smp_scan_config(address, 0x400, reserve);
+#endif
+}
+
+void __init early_find_smp_config(void)
+{
+ __find_smp_config(0);
+}
+
+void __init find_smp_config(void)
+{
+ __find_smp_config(1);
+}
+
+/* --------------------------------------------------------------------------
+ ACPI-based MP Configuration
+ -------------------------------------------------------------------------- */
+
+/*
+ * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
+ */
+int es7000_plat;
+
+#ifdef CONFIG_ACPI
+
+#ifdef CONFIG_X86_IO_APIC
+
+#define MP_ISA_BUS 0
+
+extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
+
+static int mp_find_ioapic(int gsi)
+{
+ int i = 0;
+
+ /* Find the IOAPIC that manages this GSI. */
+ for (i = 0; i < nr_ioapics; i++) {
+ if ((gsi >= mp_ioapic_routing[i].gsi_base)
+ && (gsi <= mp_ioapic_routing[i].gsi_end))
+ return i;
+ }
+
+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ return -1;
+}
+
+static u8 __init uniq_ioapic_id(u8 id)
+{
+#ifdef CONFIG_X86_32
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return io_apic_get_unique_id(nr_ioapics, id);
+ else
+ return id;
+#else
+ int i;
+ DECLARE_BITMAP(used, 256);
+ bitmap_zero(used, 256);
+ for (i = 0; i < nr_ioapics; i++) {
+ struct mpc_config_ioapic *ia = &mp_ioapics[i];
+ __set_bit(ia->mpc_apicid, used);
+ }
+ if (!test_bit(id, used))
+ return id;
+ return find_first_zero_bit(used, 256);
+#endif
+}
+
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
+ int idx = 0;
+
+ if (bad_ioapic(address))
+ return;
+
+ idx = nr_ioapics;
+
+ mp_ioapics[idx].mpc_type = MP_IOAPIC;
+ mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
+ mp_ioapics[idx].mpc_apicaddr = address;
+
+#ifndef CONFIG_XEN
+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+#endif
+ mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
+#ifdef CONFIG_X86_32
+ mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
+#else
+ mp_ioapics[idx].mpc_apicver = 0;
+#endif
+ /*
+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+ */
+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
+ mp_ioapic_routing[idx].gsi_base = gsi_base;
+ mp_ioapic_routing[idx].gsi_end = gsi_base +
+ io_apic_get_redir_entries(idx);
+
+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
+ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+ mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+
+ nr_ioapics++;
+}
+
+void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
+ struct mpc_config_intsrc intsrc;
+ int ioapic = -1;
+ int pin = -1;
+
+ /*
+ * Convert 'gsi' to 'ioapic.pin'.
+ */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return;
+ pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+ /*
+ * TBD: This check is for faulty timer entries, where the override
+ * erroneously sets the trigger to level, resulting in a HUGE
+ * increase of timer interrupts!
+ */
+ if ((bus_irq == 0) && (trigger == 3))
+ trigger = 1;
+
+ intsrc.mpc_type = MP_INTSRC;
+ intsrc.mpc_irqtype = mp_INT;
+ intsrc.mpc_irqflag = (trigger << 2) | polarity;
+ intsrc.mpc_srcbus = MP_ISA_BUS;
+ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
+ intsrc.mpc_dstirq = pin; /* INTIN# */
+
+ MP_intsrc_info(&intsrc);
+}
+
+void __init mp_config_acpi_legacy_irqs(void)
+{
+ struct mpc_config_intsrc intsrc;
+ int i = 0;
+ int ioapic = -1;
+
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+ /*
+ * Fabricate the legacy ISA bus (bus #31).
+ */
+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+#endif
+ set_bit(MP_ISA_BUS, mp_bus_not_pci);
+ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+
+ /*
+ * Older generations of ES7000 have no legacy identity mappings
+ */
+ if (es7000_plat == 1)
+ return;
+
+ /*
+ * Locate the IOAPIC that manages the ISA IRQs (0-15).
+ */
+ ioapic = mp_find_ioapic(0);
+ if (ioapic < 0)
+ return;
+
+ intsrc.mpc_type = MP_INTSRC;
+ intsrc.mpc_irqflag = 0; /* Conforming */
+ intsrc.mpc_srcbus = MP_ISA_BUS;
+#ifdef CONFIG_X86_IO_APIC
+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+#endif
+ /*
+ * Use the default configuration for the IRQs 0-15. Unless
+ * overridden by (MADT) interrupt source override entries.
+ */
+ for (i = 0; i < 16; i++) {
+ int idx;
+
+ for (idx = 0; idx < mp_irq_entries; idx++) {
+ struct mpc_config_intsrc *irq = mp_irqs + idx;
+
+ /* Do we already have a mapping for this ISA IRQ? */
+ if (irq->mpc_srcbus == MP_ISA_BUS
+ && irq->mpc_srcbusirq == i)
+ break;
+
+ /* Do we already have a mapping for this IOAPIC pin */
+ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
+ (irq->mpc_dstirq == i))
+ break;
+ }
+
+ if (idx != mp_irq_entries) {
+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+ continue; /* IRQ already used */
+ }
+
+ intsrc.mpc_irqtype = mp_INT;
+ intsrc.mpc_srcbusirq = i; /* Identity mapped */
+ intsrc.mpc_dstirq = i;
+
+ MP_intsrc_info(&intsrc);
+ }
+}
+
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
+ int ioapic;
+ int ioapic_pin;
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+#define MAX_GSI_NUM 4096
+#define IRQ_COMPRESSION_START 64
+
+ static int pci_irq = IRQ_COMPRESSION_START;
+ /*
+ * Mapping between Global System Interrupts, which
+ * represent all possible interrupts, and IRQs
+ * assigned to actual devices.
+ */
+ static int gsi_to_irq[MAX_GSI_NUM];
+#else
+
+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+ return gsi;
+#endif
+
+ /* Don't set up the ACPI SCI because it's already set up */
+ if (acpi_gbl_FADT.sci_interrupt == gsi)
+ return gsi;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0) {
+ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+ return gsi;
+ }
+
+ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+#ifndef CONFIG_X86_32
+ if (ioapic_renumber_irq)
+ gsi = ioapic_renumber_irq(ioapic, gsi);
+#endif
+
+ /*
+ * Avoid pin reprogramming. PRTs typically include entries
+ * with redundant pin->gsi mappings (but unique PCI devices);
+ * we only program the IOAPIC on the first.
+ */
+ if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
+ printk(KERN_ERR "Invalid reference to IOAPIC pin "
+ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+ ioapic_pin);
+ return gsi;
+ }
+ if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+ return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
+#else
+ return gsi;
+#endif
+ }
+
+ set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+ /*
+ * For GSI >= 64, use IRQ compression
+ */
+ if ((gsi >= IRQ_COMPRESSION_START)
+ && (triggering == ACPI_LEVEL_SENSITIVE)) {
+ /*
+ * For PCI devices assign IRQs in order, avoiding gaps
+ * due to unused I/O APIC pins.
+ */
+ int irq = gsi;
+ if (gsi < MAX_GSI_NUM) {
+ /*
+ * Retain the VIA chipset work-around (gsi > 15), but
+ * avoid a problem where the 8254 timer (IRQ0) is setup
+ * via an override (so it's not on pin 0 of the ioapic),
+ * and at the same time, the pin 0 interrupt is a PCI
+ * type. The gsi > 15 test could cause these two pins
+ * to be shared as IRQ0, and they are not shareable.
+ * So test for this condition, and if necessary, avoid
+ * the pin collision.
+ */
+ gsi = pci_irq++;
+ /*
+ * Don't assign IRQ used by ACPI SCI
+ */
+ if (gsi == acpi_gbl_FADT.sci_interrupt)
+ gsi = pci_irq++;
+ gsi_to_irq[irq] = gsi;
+ } else {
+ printk(KERN_ERR "GSI %u is too high\n", gsi);
+ return gsi;
+ }
+ }
+#endif
+ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+ return gsi;
+}
+
+#endif /* CONFIG_X86_IO_APIC */
+#endif /* CONFIG_ACPI */
--- head-2010-05-25.orig/arch/x86/kernel/mpparse_32-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,1161 +0,0 @@
-/*
- * Intel Multiprocessor Specification 1.1 and 1.4
- * compliant MP-table parsing routines.
- *
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- * Fixes
- * Erich Boleyn : MP v1.4 and additional changes.
- * Alan Cox : Added EBDA scanning
- * Ingo Molnar : various cleanups and rewrites
- * Maciej W. Rozycki: Bits for default MP configurations
- * Paul Diefenbaugh: Added full ACPI support
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/acpi.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/bitops.h>
-
-#include <asm/smp.h>
-#include <asm/acpi.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
-#include <asm/io_apic.h>
-
-#include <mach_apic.h>
-#include <mach_apicdef.h>
-#include <mach_mpparse.h>
-#include <bios_ebda.h>
-
-/* Have we found an MP table */
-int smp_found_config;
-unsigned int __cpuinitdata maxcpus = NR_CPUS;
-
-/*
- * Various Linux-internal data structures created from the
- * MP-table.
- */
-int apic_version [MAX_APICS];
-int mp_bus_id_to_type [MAX_MP_BUSSES];
-int mp_bus_id_to_node [MAX_MP_BUSSES];
-int mp_bus_id_to_local [MAX_MP_BUSSES];
-int quad_local_to_mp_bus_id [NR_CPUS/4][4];
-int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
-static int mp_current_pci_id;
-
-/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
-
-/* # of MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
-
-/* MP IRQ source entries */
-int mp_irq_entries;
-
-int nr_ioapics;
-
-int pic_mode;
-unsigned long mp_lapic_addr;
-
-unsigned int def_to_bigsmp = 0;
-
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
-/* Internal processor count */
-unsigned int num_processors;
-
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map;
-
-u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
-
-/*
- * Intel MP BIOS table parsing routines:
- */
-
-
-/*
- * Checksum an MP configuration block.
- */
-
-static int __init mpf_checksum(unsigned char *mp, int len)
-{
- int sum = 0;
-
- while (len--)
- sum += *mp++;
-
- return sum & 0xFF;
-}
-
-/*
- * Have to match translation table entries to main table entries by counter
- * hence the mpc_record variable .... can't see a less disgusting way of
- * doing this ....
- */
-
-static int mpc_record;
-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
-
-#ifndef CONFIG_XEN
-static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
-{
- int ver, apicid;
- physid_mask_t phys_cpu;
-
- if (!(m->mpc_cpuflag & CPU_ENABLED))
- return;
-
- apicid = mpc_apic_id(m, translation_table[mpc_record]);
-
- if (m->mpc_featureflag&(1<<0))
- Dprintk(" Floating point unit present.\n");
- if (m->mpc_featureflag&(1<<7))
- Dprintk(" Machine Exception supported.\n");
- if (m->mpc_featureflag&(1<<8))
- Dprintk(" 64 bit compare & exchange supported.\n");
- if (m->mpc_featureflag&(1<<9))
- Dprintk(" Internal APIC present.\n");
- if (m->mpc_featureflag&(1<<11))
- Dprintk(" SEP present.\n");
- if (m->mpc_featureflag&(1<<12))
- Dprintk(" MTRR present.\n");
- if (m->mpc_featureflag&(1<<13))
- Dprintk(" PGE present.\n");
- if (m->mpc_featureflag&(1<<14))
- Dprintk(" MCA present.\n");
- if (m->mpc_featureflag&(1<<15))
- Dprintk(" CMOV present.\n");
- if (m->mpc_featureflag&(1<<16))
- Dprintk(" PAT present.\n");
- if (m->mpc_featureflag&(1<<17))
- Dprintk(" PSE present.\n");
- if (m->mpc_featureflag&(1<<18))
- Dprintk(" PSN present.\n");
- if (m->mpc_featureflag&(1<<19))
- Dprintk(" Cache Line Flush Instruction present.\n");
- /* 20 Reserved */
- if (m->mpc_featureflag&(1<<21))
- Dprintk(" Debug Trace and EMON Store present.\n");
- if (m->mpc_featureflag&(1<<22))
- Dprintk(" ACPI Thermal Throttle Registers present.\n");
- if (m->mpc_featureflag&(1<<23))
- Dprintk(" MMX present.\n");
- if (m->mpc_featureflag&(1<<24))
- Dprintk(" FXSR present.\n");
- if (m->mpc_featureflag&(1<<25))
- Dprintk(" XMM present.\n");
- if (m->mpc_featureflag&(1<<26))
- Dprintk(" Willamette New Instructions present.\n");
- if (m->mpc_featureflag&(1<<27))
- Dprintk(" Self Snoop present.\n");
- if (m->mpc_featureflag&(1<<28))
- Dprintk(" HT present.\n");
- if (m->mpc_featureflag&(1<<29))
- Dprintk(" Thermal Monitor present.\n");
- /* 30, 31 Reserved */
-
-
- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
- Dprintk(" Bootup CPU\n");
- boot_cpu_physical_apicid = m->mpc_apicid;
- }
-
- ver = m->mpc_apicver;
-
- /*
- * Validate version
- */
- if (ver == 0x0) {
- printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
- "fixing up to 0x10. (tell your hw vendor)\n",
- m->mpc_apicid);
- ver = 0x10;
- }
- apic_version[m->mpc_apicid] = ver;
-
- phys_cpu = apicid_to_cpu_present(apicid);
- physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
-
- if (num_processors >= NR_CPUS) {
- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
- " Processor ignored.\n", NR_CPUS);
- return;
- }
-
- if (num_processors >= maxcpus) {
- printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
- " Processor ignored.\n", maxcpus);
- return;
- }
-
- cpu_set(num_processors, cpu_possible_map);
- num_processors++;
-
- /*
- * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
- * but we need to work other dependencies like SMP_SUSPEND etc
- * before this can be done without some confusion.
- * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
- * - Ashok Raj <ashok.raj@intel.com>
- */
- if (num_processors > 8) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (!APIC_XAPIC(ver)) {
- def_to_bigsmp = 0;
- break;
- }
- /* If P4 and above fall through */
- case X86_VENDOR_AMD:
- def_to_bigsmp = 1;
- }
- }
- bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
-}
-#else
-static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
-{
- num_processors++;
-}
-#endif /* CONFIG_XEN */
-
-static void __init MP_bus_info (struct mpc_config_bus *m)
-{
- char str[7];
-
- memcpy(str, m->mpc_bustype, 6);
- str[6] = 0;
-
- mpc_oem_bus_info(m, str, translation_table[mpc_record]);
-
-#if MAX_MP_BUSSES < 256
- if (m->mpc_busid >= MAX_MP_BUSSES) {
- printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
- " is too large, max. supported is %d\n",
- m->mpc_busid, str, MAX_MP_BUSSES - 1);
- return;
- }
-#endif
-
- if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
- } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
- } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
- mpc_oem_pci_bus(m, translation_table[mpc_record]);
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
- mp_current_pci_id++;
- } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
- } else {
- printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
- }
-}
-
-static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
-{
- if (!(m->mpc_flags & MPC_APIC_USABLE))
- return;
-
- printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
- m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
- MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
- }
- if (!m->mpc_apicaddr) {
- printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
- " found in MP table, skipping!\n");
- return;
- }
- mp_ioapics[nr_ioapics] = *m;
- nr_ioapics++;
-}
-
-static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
-{
- mp_irqs [mp_irq_entries] = *m;
- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
- m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
-}
-
-static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
-{
- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
-}
-
-#ifdef CONFIG_X86_NUMAQ
-static void __init MP_translation_info (struct mpc_config_translation *m)
-{
- printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
-
- if (mpc_record >= MAX_MPC_ENTRY)
- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
- else
- translation_table[mpc_record] = m; /* stash this for later */
- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
- node_set_online(m->trans_quad);
-}
-
-/*
- * Read/parse the MPC oem tables
- */
-
-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
- unsigned short oemsize)
-{
- int count = sizeof (*oemtable); /* the header size */
- unsigned char *oemptr = ((unsigned char *)oemtable)+count;
-
- mpc_record = 0;
- printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
- if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
- {
- printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
- oemtable->oem_signature[0],
- oemtable->oem_signature[1],
- oemtable->oem_signature[2],
- oemtable->oem_signature[3]);
- return;
- }
- if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
- {
- printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
- return;
- }
- while (count < oemtable->oem_length) {
- switch (*oemptr) {
- case MP_TRANSLATION:
- {
- struct mpc_config_translation *m=
- (struct mpc_config_translation *)oemptr;
- MP_translation_info(m);
- oemptr += sizeof(*m);
- count += sizeof(*m);
- ++mpc_record;
- break;
- }
- default:
- {
- printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
- return;
- }
- }
- }
-}
-
-static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid)
-{
- if (strncmp(oem, "IBM NUMA", 8))
- printk("Warning! May not be a NUMA-Q system!\n");
- if (mpc->mpc_oemptr)
- smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
- mpc->mpc_oemsize);
-}
-#endif /* CONFIG_X86_NUMAQ */
-
-/*
- * Read/parse the MPC
- */
-
-static int __init smp_read_mpc(struct mp_config_table *mpc)
-{
- char str[16];
- char oem[10];
- int count=sizeof(*mpc);
- unsigned char *mpt=((unsigned char *)mpc)+count;
-
- if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
- printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
- *(u32 *)mpc->mpc_signature);
- return 0;
- }
- if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
- printk(KERN_ERR "SMP mptable: checksum error!\n");
- return 0;
- }
- if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
- printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
- mpc->mpc_spec);
- return 0;
- }
- if (!mpc->mpc_lapic) {
- printk(KERN_ERR "SMP mptable: null local APIC address!\n");
- return 0;
- }
- memcpy(oem,mpc->mpc_oem,8);
- oem[8]=0;
- printk(KERN_INFO "OEM ID: %s ",oem);
-
- memcpy(str,mpc->mpc_productid,12);
- str[12]=0;
- printk("Product ID: %s ",str);
-
- mps_oem_check(mpc, oem, str);
-
- printk("APIC at: 0x%X\n", mpc->mpc_lapic);
-
- /*
- * Save the local APIC address (it might be non-default) -- but only
- * if we're not using ACPI.
- */
- if (!acpi_lapic)
- mp_lapic_addr = mpc->mpc_lapic;
-
- /*
- * Now process the configuration blocks.
- */
- mpc_record = 0;
- while (count < mpc->mpc_length) {
- switch(*mpt) {
- case MP_PROCESSOR:
- {
- struct mpc_config_processor *m=
- (struct mpc_config_processor *)mpt;
- /* ACPI may have already provided this data */
- if (!acpi_lapic)
- MP_processor_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
- case MP_BUS:
- {
- struct mpc_config_bus *m=
- (struct mpc_config_bus *)mpt;
- MP_bus_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
- case MP_IOAPIC:
- {
- struct mpc_config_ioapic *m=
- (struct mpc_config_ioapic *)mpt;
- MP_ioapic_info(m);
- mpt+=sizeof(*m);
- count+=sizeof(*m);
- break;
- }
- case MP_INTSRC:
- {
- struct mpc_config_intsrc *m=
- (struct mpc_config_intsrc *)mpt;
-
- MP_intsrc_info(m);
- mpt+=sizeof(*m);
- count+=sizeof(*m);
- break;
- }
- case MP_LINTSRC:
- {
- struct mpc_config_lintsrc *m=
- (struct mpc_config_lintsrc *)mpt;
- MP_lintsrc_info(m);
- mpt+=sizeof(*m);
- count+=sizeof(*m);
- break;
- }
- default:
- {
- count = mpc->mpc_length;
- break;
- }
- }
- ++mpc_record;
- }
- setup_apic_routing();
- if (!num_processors)
- printk(KERN_ERR "SMP mptable: no processors registered!\n");
- return num_processors;
-}
-
-static int __init ELCR_trigger(unsigned int irq)
-{
- unsigned int port;
-
- port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
-}
-
-static void __init construct_default_ioirq_mptable(int mpc_default_type)
-{
- struct mpc_config_intsrc intsrc;
- int i;
- int ELCR_fallback = 0;
-
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* conforming */
- intsrc.mpc_srcbus = 0;
- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
-
- intsrc.mpc_irqtype = mp_INT;
-
- /*
- * If true, we have an ISA/PCI system with no IRQ entries
- * in the MP table. To prevent the PCI interrupts from being set up
- * incorrectly, we try to use the ELCR. The sanity check to see if
- * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
- * never be level sensitive, so we simply see if the ELCR agrees.
- * If it does, we assume it's valid.
- */
- if (mpc_default_type == 5) {
- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
-
- if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
- printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
- else {
- printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
- ELCR_fallback = 1;
- }
- }
-
- for (i = 0; i < 16; i++) {
- switch (mpc_default_type) {
- case 2:
- if (i == 0 || i == 13)
- continue; /* IRQ0 & IRQ13 not connected */
- /* fall through */
- default:
- if (i == 2)
- continue; /* IRQ2 is never connected */
- }
-
- if (ELCR_fallback) {
- /*
- * If the ELCR indicates a level-sensitive interrupt, we
- * copy that information over to the MP table in the
- * irqflag field (level sensitive, active high polarity).
- */
- if (ELCR_trigger(i))
- intsrc.mpc_irqflag = 13;
- else
- intsrc.mpc_irqflag = 0;
- }
-
- intsrc.mpc_srcbusirq = i;
- intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
- MP_intsrc_info(&intsrc);
- }
-
- intsrc.mpc_irqtype = mp_ExtINT;
- intsrc.mpc_srcbusirq = 0;
- intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
- MP_intsrc_info(&intsrc);
-}
-
-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
-{
- struct mpc_config_processor processor;
- struct mpc_config_bus bus;
- struct mpc_config_ioapic ioapic;
- struct mpc_config_lintsrc lintsrc;
- int linttypes[2] = { mp_ExtINT, mp_NMI };
- int i;
-
- /*
- * local APIC has default address
- */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /*
- * 2 CPUs, numbered 0 & 1.
- */
- processor.mpc_type = MP_PROCESSOR;
- /* Either an integrated APIC or a discrete 82489DX. */
- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
- processor.mpc_cpuflag = CPU_ENABLED;
- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) |
- boot_cpu_data.x86_mask;
- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
- processor.mpc_reserved[0] = 0;
- processor.mpc_reserved[1] = 0;
- for (i = 0; i < 2; i++) {
- processor.mpc_apicid = i;
- MP_processor_info(&processor);
- }
-
- bus.mpc_type = MP_BUS;
- bus.mpc_busid = 0;
- switch (mpc_default_type) {
- default:
- printk("???\n");
- printk(KERN_ERR "Unknown standard configuration %d\n",
- mpc_default_type);
- /* fall through */
- case 1:
- case 5:
- memcpy(bus.mpc_bustype, "ISA ", 6);
- break;
- case 2:
- case 6:
- case 3:
- memcpy(bus.mpc_bustype, "EISA ", 6);
- break;
- case 4:
- case 7:
- memcpy(bus.mpc_bustype, "MCA ", 6);
- }
- MP_bus_info(&bus);
- if (mpc_default_type > 4) {
- bus.mpc_busid = 1;
- memcpy(bus.mpc_bustype, "PCI ", 6);
- MP_bus_info(&bus);
- }
-
- ioapic.mpc_type = MP_IOAPIC;
- ioapic.mpc_apicid = 2;
- ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
- ioapic.mpc_flags = MPC_APIC_USABLE;
- ioapic.mpc_apicaddr = 0xFEC00000;
- MP_ioapic_info(&ioapic);
-
- /*
- * We set up most of the low 16 IO-APIC pins according to MPS rules.
- */
- construct_default_ioirq_mptable(mpc_default_type);
-
- lintsrc.mpc_type = MP_LINTSRC;
- lintsrc.mpc_irqflag = 0; /* conforming */
- lintsrc.mpc_srcbusid = 0;
- lintsrc.mpc_srcbusirq = 0;
- lintsrc.mpc_destapic = MP_APIC_ALL;
- for (i = 0; i < 2; i++) {
- lintsrc.mpc_irqtype = linttypes[i];
- lintsrc.mpc_destapiclint = i;
- MP_lintsrc_info(&lintsrc);
- }
-}
-
-static struct intel_mp_floating *mpf_found;
-
-/*
- * Scan the memory blocks for an SMP configuration block.
- */
-void __init get_smp_config (void)
-{
- struct intel_mp_floating *mpf = mpf_found;
-
- /*
- * ACPI supports both logical (e.g. Hyper-Threading) and physical
- * processors, where MPS only supports physical.
- */
- if (acpi_lapic && acpi_ioapic) {
- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
- return;
- }
- else if (acpi_lapic)
- printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
-
- printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
- if (mpf->mpf_feature2 & (1<<7)) {
- printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
- pic_mode = 1;
- } else {
- printk(KERN_INFO " Virtual Wire compatibility mode.\n");
- pic_mode = 0;
- }
-
- /*
- * Now see if we need to read further.
- */
- if (mpf->mpf_feature1 != 0) {
-
- printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
- construct_default_ISA_mptable(mpf->mpf_feature1);
-
- } else if (mpf->mpf_physptr) {
-
- /*
- * Read the physical hardware table. Anything here will
- * override the defaults.
- */
- if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
- smp_found_config = 0;
- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
- printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
- return;
- }
- /*
- * If there are no explicit MP IRQ entries, then we are
- * broken. We set up most of the low 16 IO-APIC pins to
- * ISA defaults and hope it will work.
- */
- if (!mp_irq_entries) {
- struct mpc_config_bus bus;
-
- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
-
- bus.mpc_type = MP_BUS;
- bus.mpc_busid = 0;
- memcpy(bus.mpc_bustype, "ISA ", 6);
- MP_bus_info(&bus);
-
- construct_default_ioirq_mptable(0);
- }
-
- } else
- BUG();
-
- printk(KERN_INFO "Processors: %d\n", num_processors);
- /*
- * Only use the first configuration found.
- */
-}
-
-static int __init smp_scan_config (unsigned long base, unsigned long length)
-{
- unsigned long *bp = isa_bus_to_virt(base);
- struct intel_mp_floating *mpf;
-
- printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
- if (sizeof(*mpf) != 16)
- printk("Error: MPF size\n");
-
- while (length > 0) {
- mpf = (struct intel_mp_floating *)bp;
- if ((*bp == SMP_MAGIC_IDENT) &&
- (mpf->mpf_length == 1) &&
- !mpf_checksum((unsigned char *)bp, 16) &&
- ((mpf->mpf_specification == 1)
- || (mpf->mpf_specification == 4)) ) {
-
- smp_found_config = 1;
-#ifndef CONFIG_XEN
- printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
- mpf, virt_to_phys(mpf));
- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
- BOOTMEM_DEFAULT);
- if (mpf->mpf_physptr) {
- /*
- * We cannot access to MPC table to compute
- * table size yet, as only few megabytes from
- * the bottom is mapped now.
- * PC-9800's MPC table places on the very last
- * of physical memory; so that simply reserving
- * PAGE_SIZE from mpg->mpf_physptr yields BUG()
- * in reserve_bootmem.
- */
- unsigned long size = PAGE_SIZE;
- unsigned long end = max_low_pfn * PAGE_SIZE;
- if (mpf->mpf_physptr + size > end)
- size = end - mpf->mpf_physptr;
- reserve_bootmem(mpf->mpf_physptr, size,
- BOOTMEM_DEFAULT);
- }
-#else
- printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
- mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
-#endif
-
- mpf_found = mpf;
- return 1;
- }
- bp += 4;
- length -= 16;
- }
- return 0;
-}
-
-void __init find_smp_config (void)
-{
-#ifndef CONFIG_XEN
- unsigned int address;
-#endif
-
- /*
- * FIXME: Linux assumes you have 640K of base ram..
- * this continues the error...
- *
- * 1) Scan the bottom 1K for a signature
- * 2) Scan the top 1K of base RAM
- * 3) Scan the 64K of bios
- */
- if (smp_scan_config(0x0,0x400) ||
- smp_scan_config(639*0x400,0x400) ||
- smp_scan_config(0xF0000,0x10000))
- return;
- /*
- * If it is an SMP machine we should know now, unless the
- * configuration is in an EISA/MCA bus machine with an
- * extended bios data area.
- *
- * there is a real-mode segmented pointer pointing to the
- * 4K EBDA area at 0x40E, calculate and scan it here.
- *
- * NOTE! There are Linux loaders that will corrupt the EBDA
- * area, and as such this kind of SMP config may be less
- * trustworthy, simply because the SMP table may have been
- * stomped on during early boot. These loaders are buggy and
- * should be fixed.
- *
- * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
- */
-
-#ifndef CONFIG_XEN
- address = get_bios_ebda();
- if (address)
- smp_scan_config(address, 0x400);
-#endif
-}
-
-int es7000_plat;
-
-/* --------------------------------------------------------------------------
- ACPI-based MP Configuration
- -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-void __init mp_register_lapic_address(u64 address)
-{
-#ifndef CONFIG_XEN
- mp_lapic_addr = (unsigned long) address;
-
- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
-
- if (boot_cpu_physical_apicid == -1U)
- boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
-
- Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
-#endif
-}
-
-void __cpuinit mp_register_lapic (u8 id, u8 enabled)
-{
- struct mpc_config_processor processor;
- int boot_cpu = 0;
-
- if (MAX_APICS - id <= 0) {
- printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
- id, MAX_APICS);
- return;
- }
-
- if (id == boot_cpu_physical_apicid)
- boot_cpu = 1;
-
-#ifndef CONFIG_XEN
- processor.mpc_type = MP_PROCESSOR;
- processor.mpc_apicid = id;
- processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
- processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
- processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
- processor.mpc_reserved[0] = 0;
- processor.mpc_reserved[1] = 0;
-#endif
-
- MP_processor_info(&processor);
-}
-
-#ifdef CONFIG_X86_IO_APIC
-
-#define MP_ISA_BUS 0
-#define MP_MAX_IOAPIC_PIN 127
-
-static struct mp_ioapic_routing {
- int apic_id;
- int gsi_base;
- int gsi_end;
- u32 pin_programmed[4];
-} mp_ioapic_routing[MAX_IO_APICS];
-
-static int mp_find_ioapic (int gsi)
-{
- int i = 0;
-
- /* Find the IOAPIC that manages this GSI. */
- for (i = 0; i < nr_ioapics; i++) {
- if ((gsi >= mp_ioapic_routing[i].gsi_base)
- && (gsi <= mp_ioapic_routing[i].gsi_end))
- return i;
- }
-
- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
-
- return -1;
-}
-
-void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
-{
- int idx = 0;
- int tmpid;
-
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
- }
- if (!address) {
- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
- " found in MADT table, skipping!\n");
- return;
- }
-
- idx = nr_ioapics++;
-
- mp_ioapics[idx].mpc_type = MP_IOAPIC;
- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
- mp_ioapics[idx].mpc_apicaddr = address;
-
-#ifndef CONFIG_XEN
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-#endif
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- tmpid = io_apic_get_unique_id(idx, id);
- else
- tmpid = id;
- if (tmpid == -1) {
- nr_ioapics--;
- return;
- }
- mp_ioapics[idx].mpc_apicid = tmpid;
- mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
-
- /*
- * Build basic GSI lookup table to facilitate gsi->io_apic lookups
- * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
- */
- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
- mp_ioapic_routing[idx].gsi_base = gsi_base;
- mp_ioapic_routing[idx].gsi_end = gsi_base +
- io_apic_get_redir_entries(idx);
-
- printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
- mp_ioapic_routing[idx].gsi_base,
- mp_ioapic_routing[idx].gsi_end);
-}
-
-void __init
-mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
- struct mpc_config_intsrc intsrc;
- int ioapic = -1;
- int pin = -1;
-
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return;
- pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
-
- /*
- * TBD: This check is for faulty timer entries, where the override
- * erroneously sets the trigger to level, resulting in a HUGE
- * increase of timer interrupts!
- */
- if ((bus_irq == 0) && (trigger == 3))
- trigger = 1;
-
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_irqflag = (trigger << 2) | polarity;
- intsrc.mpc_srcbus = MP_ISA_BUS;
- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
- intsrc.mpc_dstirq = pin; /* INTIN# */
-
- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
- intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
-
- mp_irqs[mp_irq_entries] = intsrc;
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!\n");
-}
-
-void __init mp_config_acpi_legacy_irqs (void)
-{
- struct mpc_config_intsrc intsrc;
- int i = 0;
- int ioapic = -1;
-
- /*
- * Fabricate the legacy ISA bus (bus #31).
- */
- mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
- Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
-
- /*
- * Older generations of ES7000 have no legacy identity mappings
- */
- if (es7000_plat == 1)
- return;
-
- /*
- * Locate the IOAPIC that manages the ISA IRQs (0-15).
- */
- ioapic = mp_find_ioapic(0);
- if (ioapic < 0)
- return;
-
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* Conforming */
- intsrc.mpc_srcbus = MP_ISA_BUS;
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
-
- /*
- * Use the default configuration for the IRQs 0-15. Unless
- * overridden by (MADT) interrupt source override entries.
- */
- for (i = 0; i < 16; i++) {
- int idx;
-
- for (idx = 0; idx < mp_irq_entries; idx++) {
- struct mpc_config_intsrc *irq = mp_irqs + idx;
-
- /* Do we already have a mapping for this ISA IRQ? */
- if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
- break;
-
- /* Do we already have a mapping for this IOAPIC pin */
- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
- (irq->mpc_dstirq == i))
- break;
- }
-
- if (idx != mp_irq_entries) {
- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
- continue; /* IRQ already used */
- }
-
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_srcbusirq = i; /* Identity mapped */
- intsrc.mpc_dstirq = i;
-
- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
- "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
- intsrc.mpc_dstirq);
-
- mp_irqs[mp_irq_entries] = intsrc;
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!\n");
- }
-}
-
-#define MAX_GSI_NUM 4096
-#define IRQ_COMPRESSION_START 64
-
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
-{
- int ioapic = -1;
- int ioapic_pin = 0;
- int idx, bit = 0;
- static int pci_irq = IRQ_COMPRESSION_START;
- /*
- * Mapping between Global System Interrupts, which
- * represent all possible interrupts, and IRQs
- * assigned to actual devices.
- */
- static int gsi_to_irq[MAX_GSI_NUM];
-
- /* Don't set up the ACPI SCI because it's already set up */
- if (acpi_gbl_FADT.sci_interrupt == gsi)
- return gsi;
-
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0) {
- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
- return gsi;
- }
-
- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
-
- if (ioapic_renumber_irq)
- gsi = ioapic_renumber_irq(ioapic, gsi);
-
- /*
- * Avoid pin reprogramming. PRTs typically include entries
- * with redundant pin->gsi mappings (but unique PCI devices);
- * we only program the IOAPIC on the first.
- */
- bit = ioapic_pin % 32;
- idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
- if (idx > 3) {
- printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
- ioapic_pin);
- return gsi;
- }
- if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
- return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
- }
-
- mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
-
- /*
- * For GSI >= 64, use IRQ compression
- */
- if ((gsi >= IRQ_COMPRESSION_START)
- && (triggering == ACPI_LEVEL_SENSITIVE)) {
- /*
- * For PCI devices assign IRQs in order, avoiding gaps
- * due to unused I/O APIC pins.
- */
- int irq = gsi;
- if (gsi < MAX_GSI_NUM) {
- /*
- * Retain the VIA chipset work-around (gsi > 15), but
- * avoid a problem where the 8254 timer (IRQ0) is setup
- * via an override (so it's not on pin 0 of the ioapic),
- * and at the same time, the pin 0 interrupt is a PCI
- * type. The gsi > 15 test could cause these two pins
- * to be shared as IRQ0, and they are not shareable.
- * So test for this condition, and if necessary, avoid
- * the pin collision.
- */
- if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
- gsi = pci_irq++;
- /*
- * Don't assign IRQ used by ACPI SCI
- */
- if (gsi == acpi_gbl_FADT.sci_interrupt)
- gsi = pci_irq++;
- gsi_to_irq[irq] = gsi;
- } else {
- printk(KERN_ERR "GSI %u is too high\n", gsi);
- return gsi;
- }
- }
-
- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- return gsi;
-}
-
-#endif /* CONFIG_X86_IO_APIC */
-#endif /* CONFIG_ACPI */
--- head-2010-05-25.orig/arch/x86/kernel/mpparse_64-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,879 +0,0 @@
-/*
- * Intel Multiprocessor Specification 1.1 and 1.4
- * compliant MP-table parsing routines.
- *
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- * Fixes
- * Erich Boleyn : MP v1.4 and additional changes.
- * Alan Cox : Added EBDA scanning
- * Ingo Molnar : various cleanups and rewrites
- * Maciej W. Rozycki: Bits for default MP configurations
- * Paul Diefenbaugh: Added full ACPI support
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/acpi.h>
-#include <linux/module.h>
-
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
-#include <asm/pgalloc.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/acpi.h>
-
-/* Have we found an MP table */
-int smp_found_config;
-
-/*
- * Various Linux-internal data structures created from the
- * MP-table.
- */
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
-
-static int mp_current_pci_id = 0;
-/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
-
-/* # of MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
-
-/* MP IRQ source entries */
-int mp_irq_entries;
-
-int nr_ioapics;
-unsigned long mp_lapic_addr = 0;
-
-
-
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_id = -1U;
-EXPORT_SYMBOL(boot_cpu_id);
-
-/* Internal processor count */
-unsigned int num_processors;
-
-unsigned disabled_cpus __cpuinitdata;
-
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
-
-#ifndef CONFIG_XEN
-u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
- = { [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_bios_cpu_apicid_early_ptr;
-#endif
-DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-
-
-/*
- * Intel MP BIOS table parsing routines:
- */
-
-/*
- * Checksum an MP configuration block.
- */
-
-static int __init mpf_checksum(unsigned char *mp, int len)
-{
- int sum = 0;
-
- while (len--)
- sum += *mp++;
-
- return sum & 0xFF;
-}
-
-#ifndef CONFIG_XEN
-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
-{
- int cpu;
- cpumask_t tmp_map;
- char *bootup_cpu = "";
-
- if (!(m->mpc_cpuflag & CPU_ENABLED)) {
- disabled_cpus++;
- return;
- }
- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
- bootup_cpu = " (Bootup-CPU)";
- boot_cpu_id = m->mpc_apicid;
- }
-
- printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
-
- if (num_processors >= NR_CPUS) {
- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
- " Processor ignored.\n", NR_CPUS);
- return;
- }
-
- num_processors++;
- cpus_complement(tmp_map, cpu_present_map);
- cpu = first_cpu(tmp_map);
-
- physid_set(m->mpc_apicid, phys_cpu_present_map);
- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
- /*
- * x86_bios_cpu_apicid is required to have processors listed
- * in same order as logical cpu numbers. Hence the first
- * entry is BSP, and so on.
- */
- cpu = 0;
- }
- /* are we being called early in kernel startup? */
- if (x86_cpu_to_apicid_early_ptr) {
- u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
- u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
-
- cpu_to_apicid[cpu] = m->mpc_apicid;
- bios_cpu_apicid[cpu] = m->mpc_apicid;
- } else {
- per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
- per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
- }
-
- cpu_set(cpu, cpu_possible_map);
- cpu_set(cpu, cpu_present_map);
-}
-#else
-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
-{
- num_processors++;
-}
-#endif /* CONFIG_XEN */
-
-static void __init MP_bus_info (struct mpc_config_bus *m)
-{
- char str[7];
-
- memcpy(str, m->mpc_bustype, 6);
- str[6] = 0;
- Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
-
- if (strncmp(str, "ISA", 3) == 0) {
- set_bit(m->mpc_busid, mp_bus_not_pci);
- } else if (strncmp(str, "PCI", 3) == 0) {
- clear_bit(m->mpc_busid, mp_bus_not_pci);
- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
- mp_current_pci_id++;
- } else {
- printk(KERN_ERR "Unknown bustype %s\n", str);
- }
-}
-
-static int bad_ioapic(unsigned long address)
-{
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
- }
- if (!address) {
- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
- " found in table, skipping!\n");
- return 1;
- }
- return 0;
-}
-
-static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
-{
- if (!(m->mpc_flags & MPC_APIC_USABLE))
- return;
-
- printk("I/O APIC #%d at 0x%X.\n",
- m->mpc_apicid, m->mpc_apicaddr);
-
- if (bad_ioapic(m->mpc_apicaddr))
- return;
-
- mp_ioapics[nr_ioapics] = *m;
- nr_ioapics++;
-}
-
-static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
-{
- mp_irqs [mp_irq_entries] = *m;
- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
- m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
- if (++mp_irq_entries >= MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
-}
-
-static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
-{
- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
-}
-
-/*
- * Read/parse the MPC
- */
-
-static int __init smp_read_mpc(struct mp_config_table *mpc)
-{
- char str[16];
- int count=sizeof(*mpc);
- unsigned char *mpt=((unsigned char *)mpc)+count;
-
- if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
- printk("MPTABLE: bad signature [%c%c%c%c]!\n",
- mpc->mpc_signature[0],
- mpc->mpc_signature[1],
- mpc->mpc_signature[2],
- mpc->mpc_signature[3]);
- return 0;
- }
- if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
- printk("MPTABLE: checksum error!\n");
- return 0;
- }
- if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
- printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
- mpc->mpc_spec);
- return 0;
- }
- if (!mpc->mpc_lapic) {
- printk(KERN_ERR "MPTABLE: null local APIC address!\n");
- return 0;
- }
- memcpy(str,mpc->mpc_oem,8);
- str[8] = 0;
- printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
-
- memcpy(str,mpc->mpc_productid,12);
- str[12] = 0;
- printk("MPTABLE: Product ID: %s ",str);
-
- printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
-
- /* save the local APIC address, it might be non-default */
- if (!acpi_lapic)
- mp_lapic_addr = mpc->mpc_lapic;
-
- /*
- * Now process the configuration blocks.
- */
- while (count < mpc->mpc_length) {
- switch(*mpt) {
- case MP_PROCESSOR:
- {
- struct mpc_config_processor *m=
- (struct mpc_config_processor *)mpt;
- if (!acpi_lapic)
- MP_processor_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
- case MP_BUS:
- {
- struct mpc_config_bus *m=
- (struct mpc_config_bus *)mpt;
- MP_bus_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
- case MP_IOAPIC:
- {
- struct mpc_config_ioapic *m=
- (struct mpc_config_ioapic *)mpt;
- MP_ioapic_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
- case MP_INTSRC:
- {
- struct mpc_config_intsrc *m=
- (struct mpc_config_intsrc *)mpt;
-
- MP_intsrc_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
- case MP_LINTSRC:
- {
- struct mpc_config_lintsrc *m=
- (struct mpc_config_lintsrc *)mpt;
- MP_lintsrc_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
- }
- }
- setup_apic_routing();
- if (!num_processors)
- printk(KERN_ERR "MPTABLE: no processors registered!\n");
- return num_processors;
-}
-
-static int __init ELCR_trigger(unsigned int irq)
-{
- unsigned int port;
-
- port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
-}
-
-static void __init construct_default_ioirq_mptable(int mpc_default_type)
-{
- struct mpc_config_intsrc intsrc;
- int i;
- int ELCR_fallback = 0;
-
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* conforming */
- intsrc.mpc_srcbus = 0;
- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
-
- intsrc.mpc_irqtype = mp_INT;
-
- /*
- * If true, we have an ISA/PCI system with no IRQ entries
- * in the MP table. To prevent the PCI interrupts from being set up
- * incorrectly, we try to use the ELCR. The sanity check to see if
- * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
- * never be level sensitive, so we simply see if the ELCR agrees.
- * If it does, we assume it's valid.
- */
- if (mpc_default_type == 5) {
- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
-
- if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
- printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
- else {
- printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
- ELCR_fallback = 1;
- }
- }
-
- for (i = 0; i < 16; i++) {
- switch (mpc_default_type) {
- case 2:
- if (i == 0 || i == 13)
- continue; /* IRQ0 & IRQ13 not connected */
- /* fall through */
- default:
- if (i == 2)
- continue; /* IRQ2 is never connected */
- }
-
- if (ELCR_fallback) {
- /*
- * If the ELCR indicates a level-sensitive interrupt, we
- * copy that information over to the MP table in the
- * irqflag field (level sensitive, active high polarity).
- */
- if (ELCR_trigger(i))
- intsrc.mpc_irqflag = 13;
- else
- intsrc.mpc_irqflag = 0;
- }
-
- intsrc.mpc_srcbusirq = i;
- intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
- MP_intsrc_info(&intsrc);
- }
-
- intsrc.mpc_irqtype = mp_ExtINT;
- intsrc.mpc_srcbusirq = 0;
- intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
- MP_intsrc_info(&intsrc);
-}
-
-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
-{
- struct mpc_config_processor processor;
- struct mpc_config_bus bus;
- struct mpc_config_ioapic ioapic;
- struct mpc_config_lintsrc lintsrc;
- int linttypes[2] = { mp_ExtINT, mp_NMI };
- int i;
-
- /*
- * local APIC has default address
- */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /*
- * 2 CPUs, numbered 0 & 1.
- */
- processor.mpc_type = MP_PROCESSOR;
- processor.mpc_apicver = 0;
- processor.mpc_cpuflag = CPU_ENABLED;
- processor.mpc_cpufeature = 0;
- processor.mpc_featureflag = 0;
- processor.mpc_reserved[0] = 0;
- processor.mpc_reserved[1] = 0;
- for (i = 0; i < 2; i++) {
- processor.mpc_apicid = i;
- MP_processor_info(&processor);
- }
-
- bus.mpc_type = MP_BUS;
- bus.mpc_busid = 0;
- switch (mpc_default_type) {
- default:
- printk(KERN_ERR "???\nUnknown standard configuration %d\n",
- mpc_default_type);
- /* fall through */
- case 1:
- case 5:
- memcpy(bus.mpc_bustype, "ISA ", 6);
- break;
- }
- MP_bus_info(&bus);
- if (mpc_default_type > 4) {
- bus.mpc_busid = 1;
- memcpy(bus.mpc_bustype, "PCI ", 6);
- MP_bus_info(&bus);
- }
-
- ioapic.mpc_type = MP_IOAPIC;
- ioapic.mpc_apicid = 2;
- ioapic.mpc_apicver = 0;
- ioapic.mpc_flags = MPC_APIC_USABLE;
- ioapic.mpc_apicaddr = 0xFEC00000;
- MP_ioapic_info(&ioapic);
-
- /*
- * We set up most of the low 16 IO-APIC pins according to MPS rules.
- */
- construct_default_ioirq_mptable(mpc_default_type);
-
- lintsrc.mpc_type = MP_LINTSRC;
- lintsrc.mpc_irqflag = 0; /* conforming */
- lintsrc.mpc_srcbusid = 0;
- lintsrc.mpc_srcbusirq = 0;
- lintsrc.mpc_destapic = MP_APIC_ALL;
- for (i = 0; i < 2; i++) {
- lintsrc.mpc_irqtype = linttypes[i];
- lintsrc.mpc_destapiclint = i;
- MP_lintsrc_info(&lintsrc);
- }
-}
-
-static struct intel_mp_floating *mpf_found;
-
-/*
- * Scan the memory blocks for an SMP configuration block.
- */
-void __init get_smp_config (void)
-{
- struct intel_mp_floating *mpf = mpf_found;
-
- /*
- * ACPI supports both logical (e.g. Hyper-Threading) and physical
- * processors, where MPS only supports physical.
- */
- if (acpi_lapic && acpi_ioapic) {
- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
- return;
- }
- else if (acpi_lapic)
- printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
-
- printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
-
- /*
- * Now see if we need to read further.
- */
- if (mpf->mpf_feature1 != 0) {
-
- printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
- construct_default_ISA_mptable(mpf->mpf_feature1);
-
- } else if (mpf->mpf_physptr) {
-
- /*
- * Read the physical hardware table. Anything here will
- * override the defaults.
- */
- if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
- smp_found_config = 0;
- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
- printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
- return;
- }
- /*
- * If there are no explicit MP IRQ entries, then we are
- * broken. We set up most of the low 16 IO-APIC pins to
- * ISA defaults and hope it will work.
- */
- if (!mp_irq_entries) {
- struct mpc_config_bus bus;
-
- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
-
- bus.mpc_type = MP_BUS;
- bus.mpc_busid = 0;
- memcpy(bus.mpc_bustype, "ISA ", 6);
- MP_bus_info(&bus);
-
- construct_default_ioirq_mptable(0);
- }
-
- } else
- BUG();
-
- printk(KERN_INFO "Processors: %d\n", num_processors);
- /*
- * Only use the first configuration found.
- */
-}
-
-static int __init smp_scan_config (unsigned long base, unsigned long length)
-{
- extern void __bad_mpf_size(void);
- unsigned int *bp = isa_bus_to_virt(base);
- struct intel_mp_floating *mpf;
-
- Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
- if (sizeof(*mpf) != 16)
- __bad_mpf_size();
-
- while (length > 0) {
- mpf = (struct intel_mp_floating *)bp;
- if ((*bp == SMP_MAGIC_IDENT) &&
- (mpf->mpf_length == 1) &&
- !mpf_checksum((unsigned char *)bp, 16) &&
- ((mpf->mpf_specification == 1)
- || (mpf->mpf_specification == 4)) ) {
-
- smp_found_config = 1;
- mpf_found = mpf;
- return 1;
- }
- bp += 4;
- length -= 16;
- }
- return 0;
-}
-
-void __init find_smp_config(void)
-{
- unsigned int address;
-
- /*
- * FIXME: Linux assumes you have 640K of base ram..
- * this continues the error...
- *
- * 1) Scan the bottom 1K for a signature
- * 2) Scan the top 1K of base RAM
- * 3) Scan the 64K of bios
- */
- if (smp_scan_config(0x0,0x400) ||
- smp_scan_config(639*0x400,0x400) ||
- smp_scan_config(0xF0000,0x10000))
- return;
- /*
- * If it is an SMP machine we should know now.
- *
- * there is a real-mode segmented pointer pointing to the
- * 4K EBDA area at 0x40E, calculate and scan it here.
- *
- * NOTE! There are Linux loaders that will corrupt the EBDA
- * area, and as such this kind of SMP config may be less
- * trustworthy, simply because the SMP table may have been
- * stomped on during early boot. These loaders are buggy and
- * should be fixed.
- */
-
- address = *(unsigned short *)phys_to_virt(0x40E);
- address <<= 4;
- if (smp_scan_config(address, 0x1000))
- return;
-
- /* If we have come this far, we did not find an MP table */
- printk(KERN_INFO "No mptable found.\n");
-}
-
-/* --------------------------------------------------------------------------
- ACPI-based MP Configuration
- -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-void __init mp_register_lapic_address(u64 address)
-{
-#ifndef CONFIG_XEN
- mp_lapic_addr = (unsigned long) address;
- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
- if (boot_cpu_id == -1U)
- boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
-#endif
-}
-
-void __cpuinit mp_register_lapic (u8 id, u8 enabled)
-{
- struct mpc_config_processor processor;
- int boot_cpu = 0;
-
- if (id == boot_cpu_id)
- boot_cpu = 1;
-
-#ifndef CONFIG_XEN
- processor.mpc_type = MP_PROCESSOR;
- processor.mpc_apicid = id;
- processor.mpc_apicver = 0;
- processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
- processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
- processor.mpc_cpufeature = 0;
- processor.mpc_featureflag = 0;
- processor.mpc_reserved[0] = 0;
- processor.mpc_reserved[1] = 0;
-#endif
-
- MP_processor_info(&processor);
-}
-
-#define MP_ISA_BUS 0
-#define MP_MAX_IOAPIC_PIN 127
-
-static struct mp_ioapic_routing {
- int apic_id;
- int gsi_start;
- int gsi_end;
- u32 pin_programmed[4];
-} mp_ioapic_routing[MAX_IO_APICS];
-
-static int mp_find_ioapic(int gsi)
-{
- int i = 0;
-
- /* Find the IOAPIC that manages this GSI. */
- for (i = 0; i < nr_ioapics; i++) {
- if ((gsi >= mp_ioapic_routing[i].gsi_start)
- && (gsi <= mp_ioapic_routing[i].gsi_end))
- return i;
- }
-
- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
- return -1;
-}
-
-static u8 uniq_ioapic_id(u8 id)
-{
- int i;
- DECLARE_BITMAP(used, 256);
- bitmap_zero(used, 256);
- for (i = 0; i < nr_ioapics; i++) {
- struct mpc_config_ioapic *ia = &mp_ioapics[i];
- __set_bit(ia->mpc_apicid, used);
- }
- if (!test_bit(id, used))
- return id;
- return find_first_zero_bit(used, 256);
-}
-
-void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
-{
- int idx = 0;
-
- if (bad_ioapic(address))
- return;
-
- idx = nr_ioapics;
-
- mp_ioapics[idx].mpc_type = MP_IOAPIC;
- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
- mp_ioapics[idx].mpc_apicaddr = address;
-
-#ifndef CONFIG_XEN
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-#endif
- mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
- mp_ioapics[idx].mpc_apicver = 0;
-
- /*
- * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
- * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
- */
- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
- mp_ioapic_routing[idx].gsi_start = gsi_base;
- mp_ioapic_routing[idx].gsi_end = gsi_base +
- io_apic_get_redir_entries(idx);
-
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
- mp_ioapics[idx].mpc_apicaddr,
- mp_ioapic_routing[idx].gsi_start,
- mp_ioapic_routing[idx].gsi_end);
-
- nr_ioapics++;
-}
-
-void __init
-mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
- struct mpc_config_intsrc intsrc;
- int ioapic = -1;
- int pin = -1;
-
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return;
- pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
-
- /*
- * TBD: This check is for faulty timer entries, where the override
- * erroneously sets the trigger to level, resulting in a HUGE
- * increase of timer interrupts!
- */
- if ((bus_irq == 0) && (trigger == 3))
- trigger = 1;
-
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_irqflag = (trigger << 2) | polarity;
- intsrc.mpc_srcbus = MP_ISA_BUS;
- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
- intsrc.mpc_dstirq = pin; /* INTIN# */
-
- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
- intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
-
- mp_irqs[mp_irq_entries] = intsrc;
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!\n");
-}
-
-void __init mp_config_acpi_legacy_irqs(void)
-{
- struct mpc_config_intsrc intsrc;
- int i = 0;
- int ioapic = -1;
-
- /*
- * Fabricate the legacy ISA bus (bus #31).
- */
- set_bit(MP_ISA_BUS, mp_bus_not_pci);
-
- /*
- * Locate the IOAPIC that manages the ISA IRQs (0-15).
- */
- ioapic = mp_find_ioapic(0);
- if (ioapic < 0)
- return;
-
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* Conforming */
- intsrc.mpc_srcbus = MP_ISA_BUS;
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
-
- /*
- * Use the default configuration for the IRQs 0-15. Unless
- * overridden by (MADT) interrupt source override entries.
- */
- for (i = 0; i < 16; i++) {
- int idx;
-
- for (idx = 0; idx < mp_irq_entries; idx++) {
- struct mpc_config_intsrc *irq = mp_irqs + idx;
-
- /* Do we already have a mapping for this ISA IRQ? */
- if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
- break;
-
- /* Do we already have a mapping for this IOAPIC pin */
- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
- (irq->mpc_dstirq == i))
- break;
- }
-
- if (idx != mp_irq_entries) {
- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
- continue; /* IRQ already used */
- }
-
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_srcbusirq = i; /* Identity mapped */
- intsrc.mpc_dstirq = i;
-
- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
- "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
- intsrc.mpc_dstirq);
-
- mp_irqs[mp_irq_entries] = intsrc;
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!\n");
- }
-}
-
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
-{
- int ioapic = -1;
- int ioapic_pin = 0;
- int idx, bit = 0;
-
- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
- return gsi;
-
- /* Don't set up the ACPI SCI because it's already set up */
- if (acpi_gbl_FADT.sci_interrupt == gsi)
- return gsi;
-
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0) {
- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
- return gsi;
- }
-
- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
-
- /*
- * Avoid pin reprogramming. PRTs typically include entries
- * with redundant pin->gsi mappings (but unique PCI devices);
- * we only program the IOAPIC on the first.
- */
- bit = ioapic_pin % 32;
- idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
- if (idx > 3) {
- printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
- ioapic_pin);
- return gsi;
- }
- if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
- return gsi;
- }
-
- mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
-
- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- return gsi;
-}
-#endif /*CONFIG_ACPI*/
--- head-2010-05-25.orig/arch/x86/kernel/pci-dma-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/pci-dma-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -1,280 +1,251 @@
-/*
- * Dynamic DMA mapping support.
- *
- * On i386 there is no hardware dynamic DMA address translation,
- * so consistent alloc/free are merely page allocation/freeing.
- * The rest of the dynamic DMA mapping interface is implemented
- * in asm/pci.h.
- */
-
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/string.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmar.h>
+#include <linux/bootmem.h>
#include <linux/pci.h>
-#include <linux/module.h>
-#include <linux/version.h>
-#include <asm/io.h>
-#include <xen/balloon.h>
-#include <xen/gnttab.h>
-#include <asm/swiotlb.h>
-#include <asm/tlbflush.h>
-#include <asm/swiotlb_32.h>
-#include <asm/gnttab_dma.h>
-#include <asm/bug.h>
-#ifdef __x86_64__
-#include <asm/iommu.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/gart.h>
+#include <asm/calgary.h>
+
+int forbid_dac __read_mostly;
+EXPORT_SYMBOL(forbid_dac);
+
+const struct dma_mapping_ops *dma_ops;
+EXPORT_SYMBOL(dma_ops);
+
+static int iommu_sac_force __read_mostly;
+
+#ifdef CONFIG_IOMMU_DEBUG
+int panic_on_overflow __read_mostly = 1;
+int force_iommu __read_mostly = 1;
+#else
+int panic_on_overflow __read_mostly = 0;
+int force_iommu __read_mostly = 0;
+#endif
int iommu_merge __read_mostly = 0;
-EXPORT_SYMBOL(iommu_merge);
-dma_addr_t bad_dma_address __read_mostly;
-EXPORT_SYMBOL(bad_dma_address);
+int no_iommu __read_mostly;
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly = 0;
/* This tells the BIO block layer to assume merging. Default to off
because we cannot guarantee merging later. */
int iommu_bio_merge __read_mostly = 0;
EXPORT_SYMBOL(iommu_bio_merge);
-int force_iommu __read_mostly= 0;
+dma_addr_t bad_dma_address __read_mostly = 0;
+EXPORT_SYMBOL(bad_dma_address);
-__init int iommu_setup(char *p)
-{
- return 1;
-}
+/* Dummy device used for NULL arguments (normally ISA). Better would
+ be probably a smaller DMA mask, but this is bug-to-bug compatible
+ to older i386. */
+struct device fallback_dev = {
+ .bus_id = "fallback device",
+ .coherent_dma_mask = DMA_32BIT_MASK,
+ .dma_mask = &fallback_dev.coherent_dma_mask,
+};
-void __init pci_iommu_alloc(void)
+int dma_set_mask(struct device *dev, u64 mask)
{
-#ifdef CONFIG_SWIOTLB
- pci_swiotlb_init();
-#endif
-}
+ if (!dev->dma_mask || !dma_supported(dev, mask))
+ return -EIO;
+
+ *dev->dma_mask = mask;
-static int __init pci_iommu_init(void)
-{
- no_iommu_init();
return 0;
}
+EXPORT_SYMBOL(dma_set_mask);
-/* Must execute after PCI subsystem */
-fs_initcall(pci_iommu_init);
-#endif
-
-struct dma_coherent_mem {
- void *virt_base;
- u32 device_base;
- int size;
- int flags;
- unsigned long *bitmap;
-};
-
-#define IOMMU_BUG_ON(test) \
-do { \
- if (unlikely(test)) { \
- printk(KERN_ALERT "Fatal DMA error! " \
- "Please use 'swiotlb=force'\n"); \
- BUG(); \
- } \
-} while (0)
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+static __initdata void *dma32_bootmem_ptr;
+static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
-static int check_pages_physically_contiguous(unsigned long pfn,
- unsigned int offset,
- size_t length)
+static int __init parse_dma32_size_opt(char *p)
{
- unsigned long next_mfn;
- int i;
- int nr_pages;
-
- next_mfn = pfn_to_mfn(pfn);
- nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
-
- for (i = 1; i < nr_pages; i++) {
- if (pfn_to_mfn(++pfn) != ++next_mfn)
- return 0;
- }
- return 1;
+ if (!p)
+ return -EINVAL;
+ dma32_bootmem_size = memparse(p, &p);
+ return 0;
}
+early_param("dma32_size", parse_dma32_size_opt);
-int range_straddles_page_boundary(paddr_t p, size_t size)
+void __init dma32_reserve_bootmem(void)
{
- unsigned long pfn = p >> PAGE_SHIFT;
- unsigned int offset = p & ~PAGE_MASK;
+ unsigned long size, align;
+ if (end_pfn <= MAX_DMA32_PFN)
+ return;
- return ((offset + size > PAGE_SIZE) &&
- !check_pages_physically_contiguous(pfn, offset, size));
+ align = 64ULL<<20;
+ size = round_up(dma32_bootmem_size, align);
+ dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
+ __pa(MAX_DMA_ADDRESS));
+ if (dma32_bootmem_ptr)
+ dma32_bootmem_size = size;
+ else
+ dma32_bootmem_size = 0;
}
-
-int
-dma_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
- enum dma_data_direction direction)
+static void __init dma32_free_bootmem(void)
{
- int i, rc;
+ int node;
+
+ if (end_pfn <= MAX_DMA32_PFN)
+ return;
- BUG_ON(!valid_dma_direction(direction));
- WARN_ON(nents == 0 || sgl->length == 0);
+ if (!dma32_bootmem_ptr)
+ return;
- if (swiotlb) {
- rc = swiotlb_map_sg(hwdev, sgl, nents, direction);
- } else {
- struct scatterlist *sg;
-
- for_each_sg(sgl, sg, nents, i) {
- BUG_ON(!sg_page(sg));
- sg->dma_address =
- gnttab_dma_map_page(sg_page(sg)) + sg->offset;
- sg->dma_length = sg->length;
- IOMMU_BUG_ON(address_needs_mapping(
- hwdev, sg->dma_address));
- IOMMU_BUG_ON(range_straddles_page_boundary(
- page_to_pseudophys(sg_page(sg)) + sg->offset,
- sg->length));
- }
- rc = nents;
- }
+ for_each_online_node(node)
+ free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
+ dma32_bootmem_size);
- flush_write_buffers();
- return rc;
+ dma32_bootmem_ptr = NULL;
+ dma32_bootmem_size = 0;
}
-EXPORT_SYMBOL(dma_map_sg);
+#else
+#define dma32_free_bootmem() ((void)0)
+#endif
-void
-dma_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
- enum dma_data_direction direction)
-{
- int i;
+static const struct dma_mapping_ops swiotlb_dma_ops = {
+ .mapping_error = swiotlb_dma_mapping_error,
+ .map_single = swiotlb_map_single_phys,
+ .unmap_single = swiotlb_unmap_single,
+ .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+ .sync_single_for_device = swiotlb_sync_single_for_device,
+ .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
+ .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
+ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+ .sync_sg_for_device = swiotlb_sync_sg_for_device,
+ .map_sg = swiotlb_map_sg,
+ .unmap_sg = swiotlb_unmap_sg,
+ .dma_supported = swiotlb_dma_supported
+};
- BUG_ON(!valid_dma_direction(direction));
- if (swiotlb)
- swiotlb_unmap_sg(hwdev, sgl, nents, direction);
- else {
- struct scatterlist *sg;
+void __init pci_iommu_alloc(void)
+{
+ /* free the range so iommu could get some range less than 4G */
+ dma32_free_bootmem();
+ /*
+ * The order of these functions is important for
+ * fall-back/fail-over reasons
+ */
+#ifdef CONFIG_GART_IOMMU
+ gart_iommu_hole_init();
+#endif
- for_each_sg(sgl, sg, nents, i)
- gnttab_dma_unmap_page(sg->dma_address);
- }
-}
-EXPORT_SYMBOL(dma_unmap_sg);
+#ifdef CONFIG_CALGARY_IOMMU
+ detect_calgary();
+#endif
-#ifdef CONFIG_HIGHMEM
-dma_addr_t
-dma_map_page(struct device *dev, struct page *page, unsigned long offset,
- size_t size, enum dma_data_direction direction)
-{
- dma_addr_t dma_addr;
+ detect_intel_iommu();
- BUG_ON(!valid_dma_direction(direction));
+#ifdef CONFIG_SWIOTLB
+ swiotlb_init();
if (swiotlb) {
- dma_addr = swiotlb_map_page(
- dev, page, offset, size, direction);
- } else {
- dma_addr = gnttab_dma_map_page(page) + offset;
- IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
+ printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
+ dma_ops = &swiotlb_dma_ops;
}
-
- return dma_addr;
+#endif
}
-EXPORT_SYMBOL(dma_map_page);
-void
-dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
- enum dma_data_direction direction)
+/*
+ * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
+ * documentation.
+ */
+static __init int iommu_setup(char *p)
{
- BUG_ON(!valid_dma_direction(direction));
- if (swiotlb)
- swiotlb_unmap_page(dev, dma_address, size, direction);
- else
- gnttab_dma_unmap_page(dma_address);
-}
-EXPORT_SYMBOL(dma_unmap_page);
-#endif /* CONFIG_HIGHMEM */
+ iommu_merge = 1;
-int
-dma_mapping_error(dma_addr_t dma_addr)
-{
- if (swiotlb)
- return swiotlb_dma_mapping_error(dma_addr);
- return 0;
-}
-EXPORT_SYMBOL(dma_mapping_error);
+ if (!p)
+ return -EINVAL;
-int
-dma_supported(struct device *dev, u64 mask)
-{
- if (swiotlb)
- return swiotlb_dma_supported(dev, mask);
- /*
- * By default we'll BUG when an infeasible DMA is requested, and
- * request swiotlb=force (see IOMMU_BUG_ON).
- */
- return 1;
-}
-EXPORT_SYMBOL(dma_supported);
+ while (*p) {
+ if (!strncmp(p, "off", 3))
+ no_iommu = 1;
+ /* gart_parse_options has more force support */
+ if (!strncmp(p, "force", 5))
+ force_iommu = 1;
+ if (!strncmp(p, "noforce", 7)) {
+ iommu_merge = 0;
+ force_iommu = 0;
+ }
-void *dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t gfp)
-{
- void *ret;
- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
- unsigned int order = get_order(size);
- unsigned long vstart;
- u64 mask;
+ if (!strncmp(p, "biomerge", 8)) {
+ iommu_bio_merge = 4096;
+ iommu_merge = 1;
+ force_iommu = 1;
+ }
+ if (!strncmp(p, "panic", 5))
+ panic_on_overflow = 1;
+ if (!strncmp(p, "nopanic", 7))
+ panic_on_overflow = 0;
+ if (!strncmp(p, "merge", 5)) {
+ iommu_merge = 1;
+ force_iommu = 1;
+ }
+ if (!strncmp(p, "nomerge", 7))
+ iommu_merge = 0;
+ if (!strncmp(p, "forcesac", 8))
+ iommu_sac_force = 1;
+ if (!strncmp(p, "allowdac", 8))
+ forbid_dac = 0;
+ if (!strncmp(p, "nodac", 5))
+ forbid_dac = -1;
+ if (!strncmp(p, "usedac", 6)) {
+ forbid_dac = -1;
+ return 1;
+ }
+#ifdef CONFIG_SWIOTLB
+ if (!strncmp(p, "soft", 4))
+ swiotlb = 1;
+#endif
- /* ignore region specifiers */
- gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+#ifdef CONFIG_GART_IOMMU
+ gart_parse_options(p);
+#endif
- if (mem) {
- int page = bitmap_find_free_region(mem->bitmap, mem->size,
- order);
- if (page >= 0) {
- *dma_handle = mem->device_base + (page << PAGE_SHIFT);
- ret = mem->virt_base + (page << PAGE_SHIFT);
- memset(ret, 0, size);
- return ret;
- }
- if (mem->flags & DMA_MEMORY_EXCLUSIVE)
- return NULL;
+#ifdef CONFIG_CALGARY_IOMMU
+ if (!strncmp(p, "calgary", 7))
+ use_calgary = 1;
+#endif /* CONFIG_CALGARY_IOMMU */
+
+ p += strcspn(p, ",");
+ if (*p == ',')
+ ++p;
}
+ return 0;
+}
+early_param("iommu", iommu_setup);
- vstart = __get_free_pages(gfp, order);
- ret = (void *)vstart;
+static int check_pages_physically_contiguous(unsigned long pfn,
+ unsigned int offset,
+ size_t length)
+{
+ unsigned long next_mfn;
+ int i;
+ int nr_pages;
- if (dev != NULL && dev->coherent_dma_mask)
- mask = dev->coherent_dma_mask;
- else
- mask = 0xffffffff;
+ next_mfn = pfn_to_mfn(pfn);
+ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
- if (ret != NULL) {
- if (xen_create_contiguous_region(vstart, order,
- fls64(mask)) != 0) {
- free_pages(vstart, order);
- return NULL;
- }
- memset(ret, 0, size);
- *dma_handle = virt_to_bus(ret);
+ for (i = 1; i < nr_pages; i++) {
+ if (pfn_to_mfn(++pfn) != ++next_mfn)
+ return 0;
}
- return ret;
+ return 1;
}
-EXPORT_SYMBOL(dma_alloc_coherent);
-void dma_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t dma_handle)
+int range_straddles_page_boundary(paddr_t p, size_t size)
{
- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
- int order = get_order(size);
-
- WARN_ON(irqs_disabled()); /* for portability */
- if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
- int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+ unsigned long pfn = p >> PAGE_SHIFT;
+ unsigned int offset = p & ~PAGE_MASK;
- bitmap_release_region(mem->bitmap, page, order);
- } else {
- xen_destroy_contiguous_region((unsigned long)vaddr, order);
- free_pages((unsigned long)vaddr, order);
- }
+ return ((offset + size > PAGE_SIZE) &&
+ !check_pages_physically_contiguous(pfn, offset, size));
}
-EXPORT_SYMBOL(dma_free_coherent);
-#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
+#ifdef CONFIG_X86_32
int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
dma_addr_t device_addr, size_t size, int flags)
{
@@ -324,8 +295,8 @@ EXPORT_SYMBOL(dma_declare_coherent_memor
void dma_release_declared_memory(struct device *dev)
{
struct dma_coherent_mem *mem = dev->dma_mem;
-
- if(!mem)
+
+ if (!mem)
return;
dev->dma_mem = NULL;
iounmap(mem->virt_base);
@@ -338,8 +309,10 @@ void *dma_mark_declared_memory_occupied(
dma_addr_t device_addr, size_t size)
{
struct dma_coherent_mem *mem = dev->dma_mem;
- int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
int pos, err;
+ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
+
+ pages >>= PAGE_SHIFT;
if (!mem)
return ERR_PTR(-EINVAL);
@@ -351,103 +324,270 @@ void *dma_mark_declared_memory_occupied(
return mem->virt_base + (pos << PAGE_SHIFT);
}
EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
-#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
-
-#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
-/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-
-int forbid_dac;
-EXPORT_SYMBOL(forbid_dac);
-static __devinit void via_no_dac(struct pci_dev *dev)
+static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle, void **ret)
{
- if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
- printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
- forbid_dac = 1;
+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+ int order = get_order(size);
+
+ if (mem) {
+ int page = bitmap_find_free_region(mem->bitmap, mem->size,
+ order);
+ if (page >= 0) {
+ *dma_handle = mem->device_base + (page << PAGE_SHIFT);
+ *ret = mem->virt_base + (page << PAGE_SHIFT);
+ memset(*ret, 0, size);
+ }
+ if (mem->flags & DMA_MEMORY_EXCLUSIVE)
+ *ret = NULL;
}
+ return (mem != NULL);
}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
-static int check_iommu(char *s)
+static int dma_release_coherent(struct device *dev, int order, void *vaddr)
{
- if (!strcmp(s, "usedac")) {
- forbid_dac = -1;
+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+
+ if (mem && vaddr >= mem->virt_base && vaddr <
+ (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+
+ bitmap_release_region(mem->bitmap, page, order);
return 1;
}
return 0;
}
-__setup("iommu=", check_iommu);
+#else
+#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
+#define dma_release_coherent(dev, order, vaddr) (0)
+#endif /* CONFIG_X86_32 */
+
+int dma_supported(struct device *dev, u64 mask)
+{
+#ifdef CONFIG_PCI
+ if (mask > 0xffffffff && forbid_dac > 0) {
+ printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
+ dev->bus_id);
+ return 0;
+ }
#endif
-dma_addr_t
-dma_map_single(struct device *dev, void *ptr, size_t size,
- enum dma_data_direction direction)
+ if (dma_ops->dma_supported)
+ return dma_ops->dma_supported(dev, mask);
+
+ /* Copied from i386. Doesn't make much sense, because it will
+ only work for pci_alloc_coherent.
+ The caller just has to use GFP_DMA in this case. */
+ if (mask < DMA_24BIT_MASK)
+ return 0;
+
+ /* Tell the device to use SAC when IOMMU force is on. This
+ allows the driver to use cheaper accesses in some cases.
+
+ Problem with this is that if we overflow the IOMMU area and
+ return DAC as fallback address the device may not handle it
+ correctly.
+
+ As a special case some controllers have a 39bit address
+ mode that is as efficient as 32bit (aic79xx). Don't force
+ SAC for these. Assume all masks <= 40 bits are of this
+ type. Normally this doesn't make any difference, but gives
+ more gentle handling of IOMMU overflow. */
+ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
+ printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
+ dev->bus_id, mask);
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(dma_supported);
+
+/* Allocate DMA memory on node near device */
+static struct page *
+dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
{
- dma_addr_t dma;
+ int node;
- BUG_ON(!valid_dma_direction(direction));
- WARN_ON(size == 0);
+ node = dev_to_node(dev);
- if (swiotlb) {
- dma = swiotlb_map_single(dev, ptr, size, direction);
- } else {
- dma = gnttab_dma_map_page(virt_to_page(ptr)) +
- offset_in_page(ptr);
- IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size));
- IOMMU_BUG_ON(address_needs_mapping(dev, dma));
- }
-
- flush_write_buffers();
- return dma;
-}
-EXPORT_SYMBOL(dma_map_single);
-
-void
-dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
- enum dma_data_direction direction)
-{
- BUG_ON(!valid_dma_direction(direction));
- if (swiotlb)
- swiotlb_unmap_single(dev, dma_addr, size, direction);
- else
- gnttab_dma_unmap_page(dma_addr);
+ return alloc_pages_node(node, gfp, order);
}
-EXPORT_SYMBOL(dma_unmap_single);
-void
-dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
- enum dma_data_direction direction)
+/*
+ * Allocate memory for a coherent mapping.
+ */
+void *
+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t gfp)
+{
+ void *memory = NULL;
+ struct page *page;
+ unsigned long dma_mask = 0;
+ int noretry = 0;
+ unsigned int order = get_order(size);
+
+ /* ignore region specifiers */
+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
+ if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
+ return memory;
+
+ if (!dev) {
+ dev = &fallback_dev;
+ gfp |= GFP_DMA;
+ }
+ dma_mask = dev->coherent_dma_mask;
+ if (dma_mask == 0)
+ dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
+
+ /* Device not DMA able */
+ if (dev->dma_mask == NULL)
+ return NULL;
+
+#ifdef CONFIG_XEN
+ gfp &= ~(__GFP_DMA | __GFP_DMA32);
+#else
+ /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
+ if (gfp & __GFP_DMA)
+ noretry = 1;
+
+#ifdef CONFIG_X86_64
+ /* Why <=? Even when the mask is smaller than 4GB it is often
+ larger than 16MB and in this case we have a chance of
+ finding fitting memory in the next higher zone first. If
+ not retry with true GFP_DMA. -AK */
+ if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
+ gfp |= GFP_DMA32;
+#endif
+
+ again:
+#endif
+ page = dma_alloc_pages(dev,
+ noretry ? gfp | __GFP_NORETRY : gfp, order);
+ if (page == NULL)
+ return NULL;
+
+#ifndef CONFIG_XEN
+ {
+ int high, mmu;
+ dma_addr_t bus = page_to_phys(page);
+ memory = page_address(page);
+ high = (bus + size) >= dma_mask;
+ mmu = high;
+ if (force_iommu && !(gfp & GFP_DMA))
+ mmu = 1;
+ else if (high) {
+ free_pages((unsigned long)memory, order);
+
+ /* Don't use the 16MB ZONE_DMA unless absolutely
+ needed. It's better to use remapping first. */
+ if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
+ gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
+ goto again;
+ }
+
+ /* Let low level make its own zone decisions */
+ gfp &= ~(GFP_DMA32|GFP_DMA);
+
+ if (dma_ops->alloc_coherent)
+ return dma_ops->alloc_coherent(dev, size,
+ dma_handle, gfp);
+ return NULL;
+ }
+
+ memset(memory, 0, size);
+ if (!mmu) {
+ *dma_handle = bus;
+ return memory;
+ }
+ }
+
+ if (dma_ops->alloc_coherent) {
+ free_pages((unsigned long)memory, order);
+ gfp &= ~(GFP_DMA|GFP_DMA32);
+ return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
+ }
+
+ if (dma_ops->map_simple) {
+ *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
+ size,
+ PCI_DMA_BIDIRECTIONAL);
+ if (*dma_handle != bad_dma_address)
+ return memory;
+ }
+#else
+ memory = page_address(page);
+ if (xen_create_contiguous_region((unsigned long)memory, order,
+ fls64(dma_mask)) == 0) {
+ memset(memory, 0, size);
+ *dma_handle = virt_to_bus(memory);
+ return memory;
+ }
+#endif
+
+ if (panic_on_overflow)
+ panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
+ (unsigned long)size);
+ free_pages((unsigned long)memory, order);
+ return NULL;
+}
+EXPORT_SYMBOL(dma_alloc_coherent);
+
+/*
+ * Unmap coherent memory.
+ * The caller must ensure that the device has finished accessing the mapping.
+ */
+void dma_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t bus)
{
- if (swiotlb)
- swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
+ int order = get_order(size);
+ WARN_ON(irqs_disabled()); /* for portability */
+ if (dma_release_coherent(dev, order, vaddr))
+ return;
+#ifndef CONFIG_XEN
+ if (dma_ops->unmap_single)
+ dma_ops->unmap_single(dev, bus, size, 0);
+#endif
+ xen_destroy_contiguous_region((unsigned long)vaddr, order);
+ free_pages((unsigned long)vaddr, order);
}
-EXPORT_SYMBOL(dma_sync_single_for_cpu);
+EXPORT_SYMBOL(dma_free_coherent);
-void
-dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
- enum dma_data_direction direction)
+static int __init pci_iommu_init(void)
{
- if (swiotlb)
- swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
+#ifdef CONFIG_CALGARY_IOMMU
+ calgary_iommu_init();
+#endif
+
+ intel_iommu_init();
+
+#ifdef CONFIG_GART_IOMMU
+ gart_iommu_init();
+#endif
+
+ no_iommu_init();
+ return 0;
}
-EXPORT_SYMBOL(dma_sync_single_for_device);
-void
-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
- enum dma_data_direction direction)
+void pci_iommu_shutdown(void)
{
- if (swiotlb)
- swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
- flush_write_buffers();
+ gart_iommu_shutdown();
}
-EXPORT_SYMBOL(dma_sync_sg_for_cpu);
+/* Must execute after PCI subsystem */
+fs_initcall(pci_iommu_init);
+
+#ifdef CONFIG_PCI
+/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
- enum dma_data_direction direction)
+static __devinit void via_no_dac(struct pci_dev *dev)
{
- if (swiotlb)
- swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
- flush_write_buffers();
+ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+ printk(KERN_INFO "PCI: VIA PCI bridge detected."
+ "Disabling DAC.\n");
+ forbid_dac = 1;
+ }
}
-EXPORT_SYMBOL(dma_sync_sg_for_device);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+#endif
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/kernel/pci-nommu-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -0,0 +1,108 @@
+#include <linux/dma-mapping.h>
+#include <linux/dmar.h>
+#include <linux/bootmem.h>
+#include <linux/pci.h>
+
+#include <xen/gnttab.h>
+
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/swiotlb.h>
+#include <asm/tlbflush.h>
+#include <asm/gnttab_dma.h>
+#include <asm/bug.h>
+
+#define IOMMU_BUG_ON(test) \
+do { \
+ if (unlikely(test)) { \
+ printk(KERN_ALERT "Fatal DMA error! " \
+ "Please use 'swiotlb=force'\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+static int
+gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
+ int direction)
+{
+ unsigned int i;
+ struct scatterlist *sg;
+
+ WARN_ON(nents == 0 || sgl->length == 0);
+
+ for_each_sg(sgl, sg, nents, i) {
+ BUG_ON(!sg_page(sg));
+ sg->dma_address =
+ gnttab_dma_map_page(sg_page(sg)) + sg->offset;
+ sg->dma_length = sg->length;
+ IOMMU_BUG_ON(address_needs_mapping(
+ hwdev, sg->dma_address));
+ IOMMU_BUG_ON(range_straddles_page_boundary(
+ page_to_pseudophys(sg_page(sg)) + sg->offset,
+ sg->length));
+ }
+
+ return nents;
+}
+
+static void
+gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
+ int direction)
+{
+ unsigned int i;
+ struct scatterlist *sg;
+
+ for_each_sg(sgl, sg, nents, i)
+ gnttab_dma_unmap_page(sg->dma_address);
+}
+
+static dma_addr_t
+gnttab_map_single(struct device *dev, phys_addr_t paddr, size_t size,
+ int direction)
+{
+ dma_addr_t dma;
+
+ WARN_ON(size == 0);
+
+ dma = gnttab_dma_map_page(pfn_to_page(paddr >> PAGE_SHIFT)) +
+ offset_in_page(paddr);
+ IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size));
+ IOMMU_BUG_ON(address_needs_mapping(dev, dma));
+
+ return dma;
+}
+
+static void
+gnttab_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+ int direction)
+{
+ gnttab_dma_unmap_page(dma_addr);
+}
+
+static int nommu_dma_supported(struct device *hwdev, u64 mask)
+{
+ return 1;
+}
+
+static int nommu_mapping_error(dma_addr_t dma_addr)
+{
+ return (dma_addr == bad_dma_address);
+}
+
+static const struct dma_mapping_ops nommu_dma_ops = {
+ .map_single = gnttab_map_single,
+ .unmap_single = gnttab_unmap_single,
+ .map_sg = gnttab_map_sg,
+ .unmap_sg = gnttab_unmap_sg,
+ .dma_supported = nommu_dma_supported,
+ .mapping_error = nommu_mapping_error
+};
+
+void __init no_iommu_init(void)
+{
+ if (dma_ops)
+ return;
+
+ force_iommu = 0; /* no HW IOMMU */
+ dma_ops = &nommu_dma_ops;
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/kernel/process-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -0,0 +1,188 @@
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/pm.h>
+
+struct kmem_cache *task_xstate_cachep;
+
+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
+{
+ *dst = *src;
+ if (src->thread.xstate) {
+ dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
+ GFP_KERNEL);
+ if (!dst->thread.xstate)
+ return -ENOMEM;
+ WARN_ON((unsigned long)dst->thread.xstate & 15);
+ memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
+ }
+ return 0;
+}
+
+void free_thread_xstate(struct task_struct *tsk)
+{
+ if (tsk->thread.xstate) {
+ kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
+ tsk->thread.xstate = NULL;
+ }
+}
+
+void free_thread_info(struct thread_info *ti)
+{
+ free_thread_xstate(ti->task);
+ free_pages((unsigned long)ti, get_order(THREAD_SIZE));
+}
+
+void arch_task_cache_init(void)
+{
+ task_xstate_cachep =
+ kmem_cache_create("task_xstate", xstate_size,
+ __alignof__(union thread_xstate),
+ SLAB_PANIC, NULL);
+}
+
+static void do_nothing(void *unused)
+{
+}
+
+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
+ * handler on SMP systems.
+ *
+ * Caller must have changed pm_idle to the new value before the call. Old
+ * pm_idle value will not be used by any CPU after the return of this function.
+ */
+void cpu_idle_wait(void)
+{
+ smp_mb();
+ /* kick all the CPUs so that they exit out of pm_idle */
+ smp_call_function(do_nothing, NULL, 0, 1);
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+#ifndef CONFIG_XEN
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+{
+ if (!need_resched()) {
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __mwait(ax, cx);
+ }
+}
+
+/* Default MONITOR/MWAIT with no hints, used for default C1 state */
+static void mwait_idle(void)
+{
+ if (!need_resched()) {
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __sti_mwait(0, 0);
+ else
+ local_irq_enable();
+ } else
+ local_irq_enable();
+}
+#endif
+
+/*
+ * On SMP it's slightly faster (but much more power-consuming!)
+ * to poll the ->work.need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+static void poll_idle(void)
+{
+ local_irq_enable();
+ cpu_relax();
+}
+
+#ifndef CONFIG_XEN
+/*
+ * mwait selection logic:
+ *
+ * It depends on the CPU. For AMD CPUs that support MWAIT this is
+ * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
+ * then depend on a clock divisor and current Pstate of the core. If
+ * all cores of a processor are in halt state (C1) the processor can
+ * enter the C1E (C1 enhanced) state. If mwait is used this will never
+ * happen.
+ *
+ * idle=mwait overrides this decision and forces the usage of mwait.
+ */
+static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
+{
+ if (force_mwait)
+ return 1;
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ switch(c->x86) {
+ case 0x10:
+ case 0x11:
+ return 0;
+ }
+ }
+ return 1;
+}
+#endif
+
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+{
+#ifndef CONFIG_XEN
+ static int selected;
+
+ if (selected)
+ return;
+#ifdef CONFIG_X86_SMP
+ if (pm_idle == poll_idle && smp_num_siblings > 1) {
+ printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
+ " performance may degrade.\n");
+ }
+#endif
+ if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
+ /*
+ * Skip, if setup has overridden idle.
+		 * One CPU supports mwait => All CPUs support mwait
+ */
+ if (!pm_idle) {
+ printk(KERN_INFO "using mwait in idle threads.\n");
+ pm_idle = mwait_idle;
+ }
+ }
+ selected = 1;
+#endif
+}
+
+static int __init idle_setup(char *str)
+{
+ if (!strcmp(str, "poll")) {
+ printk("using polling idle threads.\n");
+ pm_idle = poll_idle;
+ }
+#ifndef CONFIG_XEN
+ else if (!strcmp(str, "mwait"))
+ force_mwait = 1;
+#endif
+ else
+ return -1;
+
+ boot_option_idle_override = 1;
+ return 0;
+}
+early_param("idle", idle_setup);
+
--- head-2010-05-25.orig/arch/x86/kernel/process_32-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/process_32-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -36,6 +36,7 @@
#include <linux/personality.h>
#include <linux/tick.h>
#include <linux/percpu.h>
+#include <linux/prctl.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -45,7 +46,6 @@
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/desc.h>
-#include <asm/vm86.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
#endif
@@ -102,16 +102,6 @@ void enable_hlt(void)
EXPORT_SYMBOL(enable_hlt);
-/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->work.need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
- */
-static void poll_idle(void)
-{
- cpu_relax();
-}
-
static void xen_idle(void)
{
current_thread_info()->status &= ~TS_POLLING;
@@ -121,20 +111,10 @@ static void xen_idle(void)
*/
smp_mb();
- local_irq_disable();
- if (!need_resched()) {
- ktime_t t0, t1;
- u64 t0n, t1n;
-
- t0 = ktime_get();
- t0n = ktime_to_ns(t0);
+ if (!need_resched())
safe_halt(); /* enables interrupts racelessly */
- local_irq_disable();
- t1 = ktime_get();
- t1n = ktime_to_ns(t1);
- sched_clock_idle_wakeup_event(t1n - t0n);
- }
- local_irq_enable();
+ else
+ local_irq_enable();
current_thread_info()->status |= TS_POLLING;
}
#ifdef CONFIG_APM_MODULE
@@ -142,7 +122,6 @@ EXPORT_SYMBOL(default_idle);
#endif
#ifdef CONFIG_HOTPLUG_CPU
-extern cpumask_t cpu_initialized;
static inline void play_dead(void)
{
idle_task_exit();
@@ -187,6 +166,7 @@ void cpu_idle(void)
if (cpu_is_offline(cpu))
play_dead();
+ local_irq_disable();
__get_cpu_var(irq_stat).idle_timestamp = jiffies;
idle();
}
@@ -197,44 +177,6 @@ void cpu_idle(void)
}
}
-static void do_nothing(void *unused)
-{
-}
-
-/*
- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
- * pm_idle and update to new pm_idle value. Required while changing pm_idle
- * handler on SMP systems.
- *
- * Caller must have changed pm_idle to the new value before the call. Old
- * pm_idle value will not be used by any CPU after the return of this function.
- */
-void cpu_idle_wait(void)
-{
- smp_mb();
- /* kick all the CPUs so that they exit out of pm_idle */
- smp_call_function(do_nothing, NULL, 0, 1);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
-{
-}
-
-static int __init idle_setup(char *str)
-{
- if (!strcmp(str, "poll")) {
- printk("using polling idle threads.\n");
- pm_idle = poll_idle;
- }
- else
- return -1;
-
- boot_option_idle_override = 1;
- return 0;
-}
-early_param("idle", idle_setup);
-
void __show_registers(struct pt_regs *regs, int all)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
@@ -260,7 +202,7 @@ void __show_registers(struct pt_regs *re
init_utsname()->version);
printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
- 0xffff & regs->cs, regs->ip, regs->flags,
+ (u16)regs->cs, regs->ip, regs->flags,
smp_processor_id());
print_symbol("EIP is at %s\n", regs->ip);
@@ -269,8 +211,7 @@ void __show_registers(struct pt_regs *re
printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
regs->si, regs->di, regs->bp, sp);
printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
- regs->ds & 0xffff, regs->es & 0xffff,
- regs->fs & 0xffff, gs, ss);
+ (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
if (!all)
return;
@@ -367,6 +308,7 @@ void flush_thread(void)
/*
* Forget coprocessor state..
*/
+ tsk->fpu_counter = 0;
clear_fpu(tsk);
clear_used_math();
}
@@ -437,11 +379,30 @@ int copy_thread(int nr, unsigned long cl
return err;
}
-#ifdef CONFIG_SECCOMP
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+ __asm__("movl %0, %%gs" :: "r"(0));
+ regs->fs = 0;
+ set_fs(USER_DS);
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+ regs->ss = __USER_DS;
+ regs->cs = __USER_CS;
+ regs->ip = new_ip;
+ regs->sp = new_sp;
+ /*
+ * Free the old FP and other extended state
+ */
+ free_thread_xstate(current);
+}
+EXPORT_SYMBOL_GPL(start_thread);
+
static void hard_disable_TSC(void)
{
write_cr4(read_cr4() | X86_CR4_TSD);
}
+
void disable_TSC(void)
{
preempt_disable();
@@ -453,11 +414,47 @@ void disable_TSC(void)
hard_disable_TSC();
preempt_enable();
}
+
static void hard_enable_TSC(void)
{
write_cr4(read_cr4() & ~X86_CR4_TSD);
}
-#endif /* CONFIG_SECCOMP */
+
+static void enable_TSC(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_enable_TSC();
+ preempt_enable();
+}
+
+int get_tsc_mode(unsigned long adr)
+{
+ unsigned int val;
+
+ if (test_thread_flag(TIF_NOTSC))
+ val = PR_TSC_SIGSEGV;
+ else
+ val = PR_TSC_ENABLE;
+
+ return put_user(val, (unsigned int __user *)adr);
+}
+
+int set_tsc_mode(unsigned int val)
+{
+ if (val == PR_TSC_SIGSEGV)
+ disable_TSC();
+ else if (val == PR_TSC_ENABLE)
+ enable_TSC();
+ else
+ return -EINVAL;
+
+ return 0;
+}
static noinline void
__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
@@ -473,12 +470,12 @@ __switch_to_xtra(struct task_struct *pre
/* we clear debugctl to make sure DS
* is not in use when we change it */
debugctl = 0;
- wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
+ update_debugctlmsr(0);
wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
}
if (next->debugctlmsr != debugctl)
- wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
+ update_debugctlmsr(next->debugctlmsr);
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
set_debugreg(next->debugreg0, 0);
@@ -490,7 +487,6 @@ __switch_to_xtra(struct task_struct *pre
set_debugreg(next->debugreg7, 7);
}
-#ifdef CONFIG_SECCOMP
if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
test_tsk_thread_flag(next_p, TIF_NOTSC)) {
/* prev and next are different */
@@ -499,7 +495,6 @@ __switch_to_xtra(struct task_struct *pre
else
hard_enable_TSC();
}
-#endif
#ifdef X86_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
@@ -637,7 +632,7 @@ struct task_struct * __switch_to(struct
/* we're going to use this soon, after a few expensive things */
if (next_p->fpu_counter > 5)
- prefetch(&next->i387.fxsave);
+ prefetch(next->xstate);
/*
* Now maybe handle debug registers
@@ -658,8 +653,11 @@ struct task_struct * __switch_to(struct
/* If the task has used fpu the last 5 timeslices, just do a full
* restore of the math state immediately to avoid the trap; the
* chances of needing FPU soon are obviously high now
+ *
+ * tsk_used_math() checks prevent calling math_state_restore(),
+ * which can sleep in the case of !tsk_used_math()
*/
- if (next_p->fpu_counter > 5)
+ if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
math_state_restore();
/*
--- head-2010-05-25.orig/arch/x86/kernel/process_64-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/process_64-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -39,6 +39,7 @@
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
+#include <linux/prctl.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -102,17 +103,6 @@ void exit_idle(void)
__exit_idle();
}
-/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
- */
-static void poll_idle(void)
-{
- local_irq_enable();
- cpu_relax();
-}
-
static void xen_idle(void)
{
current_thread_info()->status &= ~TS_POLLING;
@@ -121,20 +111,10 @@ static void xen_idle(void)
* test NEED_RESCHED:
*/
smp_mb();
- local_irq_disable();
- if (!need_resched()) {
- ktime_t t0, t1;
- u64 t0n, t1n;
-
- t0 = ktime_get();
- t0n = ktime_to_ns(t0);
+ if (!need_resched())
safe_halt(); /* enables interrupts racelessly */
- local_irq_disable();
- t1 = ktime_get();
- t1n = ktime_to_ns(t1);
- sched_clock_idle_wakeup_event(t1n - t0n);
- }
- local_irq_enable();
+ else
+ local_irq_enable();
current_thread_info()->status |= TS_POLLING;
}
@@ -195,45 +175,6 @@ void cpu_idle(void)
}
}
-static void do_nothing(void *unused)
-{
-}
-
-/*
- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
- * pm_idle and update to new pm_idle value. Required while changing pm_idle
- * handler on SMP systems.
- *
- * Caller must have changed pm_idle to the new value before the call. Old
- * pm_idle value will not be used by any CPU after the return of this function.
- */
-void cpu_idle_wait(void)
-{
- smp_mb();
- /* kick all the CPUs so that they exit out of pm_idle */
- smp_call_function(do_nothing, NULL, 0, 1);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
-{
-}
-
-static int __init idle_setup(char *str)
-{
- if (!strcmp(str, "poll")) {
- printk("using polling idle threads.\n");
- pm_idle = poll_idle;
- } else if (!strcmp(str, "mwait"))
- force_mwait = 1;
- else
- return -1;
-
- boot_option_idle_override = 1;
- return 0;
-}
-early_param("idle", idle_setup);
-
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
@@ -360,6 +301,7 @@ void flush_thread(void)
/*
* Forget coprocessor state..
*/
+ tsk->fpu_counter = 0;
clear_fpu(tsk);
clear_used_math();
}
@@ -471,6 +413,82 @@ out:
return err;
}
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+ asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
+ load_gs_index(0);
+ regs->ip = new_ip;
+ regs->sp = new_sp;
+ regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
+ regs->flags = 0x200;
+ set_fs(USER_DS);
+ /*
+ * Free the old FP and other extended state
+ */
+ free_thread_xstate(current);
+}
+EXPORT_SYMBOL_GPL(start_thread);
+
+static void hard_disable_TSC(void)
+{
+ write_cr4(read_cr4() | X86_CR4_TSD);
+}
+
+void disable_TSC(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_disable_TSC();
+ preempt_enable();
+}
+
+static void hard_enable_TSC(void)
+{
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+
+static void enable_TSC(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_enable_TSC();
+ preempt_enable();
+}
+
+int get_tsc_mode(unsigned long adr)
+{
+ unsigned int val;
+
+ if (test_thread_flag(TIF_NOTSC))
+ val = PR_TSC_SIGSEGV;
+ else
+ val = PR_TSC_ENABLE;
+
+ return put_user(val, (unsigned int __user *)adr);
+}
+
+int set_tsc_mode(unsigned int val)
+{
+ if (val == PR_TSC_SIGSEGV)
+ disable_TSC();
+ else if (val == PR_TSC_ENABLE)
+ enable_TSC();
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
/*
* This special macro can be used to load a debugging register
*/
@@ -490,12 +508,12 @@ static inline void __switch_to_xtra(stru
/* we clear debugctl to make sure DS
* is not in use when we change it */
debugctl = 0;
- wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
+ update_debugctlmsr(0);
wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
}
if (next->debugctlmsr != debugctl)
- wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
+ update_debugctlmsr(next->debugctlmsr);
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
loaddebug(next, 0);
@@ -507,6 +525,15 @@ static inline void __switch_to_xtra(stru
loaddebug(next, 7);
}
+ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+ /* prev and next are different */
+ if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+ hard_disable_TSC();
+ else
+ hard_enable_TSC();
+ }
+
#ifdef X86_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
@@ -546,7 +573,7 @@ __switch_to(struct task_struct *prev_p,
/* we're going to use this soon, after a few expensive things */
if (next_p->fpu_counter>5)
- prefetch(&next->i387.fxsave);
+ prefetch(next->xstate);
/*
* This is basically '__unlazy_fpu', except that we queue a
@@ -677,8 +704,11 @@ __switch_to(struct task_struct *prev_p,
/* If the task has used fpu the last 5 timeslices, just do a full
* restore of the math state immediately to avoid the trap; the
* chances of needing FPU soon are obviously high now
+ *
+ * tsk_used_math() checks prevent calling math_state_restore(),
+ * which can sleep in the case of !tsk_used_math()
*/
- if (next_p->fpu_counter>5)
+ if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
math_state_restore();
return prev_p;
}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/kernel/setup-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -0,0 +1,141 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/percpu.h>
+#include <asm/smp.h>
+#include <asm/percpu.h>
+#include <asm/sections.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/topology.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+unsigned int num_processors;
+unsigned disabled_cpus __cpuinitdata;
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+EXPORT_SYMBOL(boot_cpu_physical_apicid);
+
+DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
+EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+
+/* Bitmask of physically existing CPUs */
+physid_mask_t phys_cpu_present_map;
+#endif
+
+#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
+/*
+ * Copy data used in early init routines from the initial arrays to the
+ * per cpu data areas. These arrays then become expendable and the
+ * *_early_ptr's are zeroed indicating that the static arrays are gone.
+ */
+static void __init setup_per_cpu_maps(void)
+{
+#ifndef CONFIG_XEN
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
+ per_cpu(x86_bios_cpu_apicid, cpu) =
+ x86_bios_cpu_apicid_init[cpu];
+#ifdef CONFIG_NUMA
+ per_cpu(x86_cpu_to_node_map, cpu) =
+ x86_cpu_to_node_map_init[cpu];
+#endif
+ }
+
+ /* indicate the early static arrays will soon be gone */
+ x86_cpu_to_apicid_early_ptr = NULL;
+ x86_bios_cpu_apicid_early_ptr = NULL;
+#ifdef CONFIG_NUMA
+ x86_cpu_to_node_map_early_ptr = NULL;
+#endif
+#endif
+}
+
+#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+cpumask_t *cpumask_of_cpu_map __read_mostly;
+EXPORT_SYMBOL(cpumask_of_cpu_map);
+
+/* requires nr_cpu_ids to be initialized */
+static void __init setup_cpumask_of_cpu(void)
+{
+ int i;
+
+ /* alloc_bootmem zeroes memory */
+ cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
+ for (i = 0; i < nr_cpu_ids; i++)
+ cpu_set(i, cpumask_of_cpu_map[i]);
+}
+#else
+static inline void setup_cpumask_of_cpu(void) { }
+#endif
+
+#ifdef CONFIG_X86_32
+/*
+ * Great future not-so-futuristic plan: make i386 and x86_64 do it
+ * the same way
+ */
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+#endif
+
+/*
+ * Great future plan:
+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
+ * Always point %gs to its beginning
+ */
+void __init setup_per_cpu_areas(void)
+{
+ int i, highest_cpu = 0;
+ unsigned long size;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ prefill_possible_map();
+#endif
+
+ /* Copy section for each CPU (we discard the original) */
+ size = PERCPU_ENOUGH_ROOM;
+ printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
+ size);
+
+ for_each_possible_cpu(i) {
+ char *ptr;
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ ptr = alloc_bootmem_pages(size);
+#else
+ int node = early_cpu_to_node(i);
+ if (!node_online(node) || !NODE_DATA(node)) {
+ ptr = alloc_bootmem_pages(size);
+ printk(KERN_INFO
+ "cpu %d has no node or node-local memory\n", i);
+ }
+ else
+ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+#endif
+ if (!ptr)
+ panic("Cannot allocate cpu data for CPU %d\n", i);
+#ifdef CONFIG_X86_64
+ cpu_pda(i)->data_offset = ptr - __per_cpu_start;
+#else
+ __per_cpu_offset[i] = ptr - __per_cpu_start;
+#endif
+ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+
+ highest_cpu = i;
+ }
+
+ nr_cpu_ids = highest_cpu + 1;
+ printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
+
+ /* Setup percpu data maps */
+ setup_per_cpu_maps();
+
+ /* Setup cpumask_of_cpu map */
+ setup_cpumask_of_cpu();
+}
+
+#endif
--- head-2010-05-25.orig/arch/x86/kernel/setup64-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/setup64-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -15,6 +15,7 @@
#include <linux/bootmem.h>
#include <linux/bitops.h>
#include <linux/module.h>
+#include <linux/kgdb.h>
#include <asm/pda.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -27,6 +28,7 @@
#include <asm/proto.h>
#include <asm/sections.h>
#include <asm/setup.h>
+#include <asm/genapic.h>
#ifdef CONFIG_XEN
#include <asm/hypervisor.h>
#endif
@@ -81,8 +83,8 @@ int force_personality32 = 0;
Control non executable heap for 32bit processes.
To control the stack too use noexec=off
-on PROT_READ does not imply PROT_EXEC for 32bit processes
-off PROT_READ implies PROT_EXEC (default)
+on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
+off PROT_READ implies PROT_EXEC
*/
static int __init nonx32_setup(char *str)
{
@@ -94,85 +96,6 @@ static int __init nonx32_setup(char *str
}
__setup("noexec32=", nonx32_setup);
-/*
- * Copy data used in early init routines from the initial arrays to the
- * per cpu data areas. These arrays then become expendable and the
- * *_early_ptr's are zeroed indicating that the static arrays are gone.
- */
-static void __init setup_per_cpu_maps(void)
-{
-#ifndef CONFIG_XEN
- int cpu;
-
- for_each_possible_cpu(cpu) {
-#ifdef CONFIG_SMP
- if (per_cpu_offset(cpu)) {
-#endif
- per_cpu(x86_cpu_to_apicid, cpu) =
- x86_cpu_to_apicid_init[cpu];
- per_cpu(x86_bios_cpu_apicid, cpu) =
- x86_bios_cpu_apicid_init[cpu];
-#ifdef CONFIG_NUMA
- per_cpu(x86_cpu_to_node_map, cpu) =
- x86_cpu_to_node_map_init[cpu];
-#endif
-#ifdef CONFIG_SMP
- }
- else
- printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
- cpu);
-#endif
- }
-
- /* indicate the early static arrays will soon be gone */
- x86_cpu_to_apicid_early_ptr = NULL;
- x86_bios_cpu_apicid_early_ptr = NULL;
-#ifdef CONFIG_NUMA
- x86_cpu_to_node_map_early_ptr = NULL;
-#endif
-#endif
-}
-
-/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
- */
-void __init setup_per_cpu_areas(void)
-{
- int i;
- unsigned long size;
-
-#ifdef CONFIG_HOTPLUG_CPU
- prefill_possible_map();
-#endif
-
- /* Copy section for each CPU (we discard the original) */
- size = PERCPU_ENOUGH_ROOM;
-
- printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
- for_each_cpu_mask (i, cpu_possible_map) {
- char *ptr;
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- ptr = alloc_bootmem_pages(size);
-#else
- int node = early_cpu_to_node(i);
-
- if (!node_online(node) || !NODE_DATA(node))
- ptr = alloc_bootmem_pages(size);
- else
- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
-#endif
- if (!ptr)
- panic("Cannot allocate cpu data for CPU %d\n", i);
- cpu_pda(i)->data_offset = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
- }
-
- /* setup percpu data maps early */
- setup_per_cpu_maps();
-}
-
#ifdef CONFIG_XEN
static void __init_refok switch_pt(int cpu)
{
@@ -410,6 +333,17 @@ void __cpuinit cpu_init (void)
#endif
load_LDT(&init_mm.context);
+#ifdef CONFIG_KGDB
+ /*
+ * If the kgdb is connected no debug regs should be altered. This
+ * is only applicable when KGDB and a KGDB I/O module are built
+ * into the kernel and you are using early debugging with
+ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
+ */
+ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ else {
+#endif
/*
* Clear all 6 debug registers:
*/
@@ -420,10 +354,17 @@ void __cpuinit cpu_init (void)
set_debugreg(0UL, 3);
set_debugreg(0UL, 6);
set_debugreg(0UL, 7);
+#ifdef CONFIG_KGDB
+ /* If the kgdb is connected no debug regs should be altered. */
+ }
+#endif
fpu_init();
asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
if (raw_irqs_disabled())
kernel_eflags &= ~X86_EFLAGS_IF;
+
+ if (is_uv_system())
+ uv_cpu_init();
}
--- head-2010-05-25.orig/arch/x86/kernel/setup_32-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/setup_32-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -39,6 +39,7 @@
#include <linux/efi.h>
#include <linux/init.h>
#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
#include <linux/nodemask.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
@@ -49,6 +50,7 @@
#include <linux/pfn.h>
#include <linux/pci.h>
#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
#include <video/edid.h>
@@ -70,8 +72,9 @@
#include <xen/firmware.h>
#include <xen/xencons.h>
#include <setup_arch.h>
-#include <bios_ebda.h>
+#include <asm/bios_ebda.h>
#include <asm/cacheflush.h>
+#include <asm/processor.h>
#ifdef CONFIG_XEN
#include <xen/interface/kexec.h>
@@ -136,7 +139,12 @@ static struct resource standard_io_resou
}, {
.name = "keyboard",
.start = 0x0060,
- .end = 0x006f,
+ .end = 0x0060,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+ .name = "keyboard",
+ .start = 0x0064,
+ .end = 0x0064,
.flags = IORESOURCE_BUSY | IORESOURCE_IO
}, {
.name = "dma page reg",
@@ -166,6 +174,8 @@ struct cpuinfo_x86 new_cpu_data __cpuini
struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
EXPORT_SYMBOL(boot_cpu_data);
+unsigned int def_to_bigsmp;
+
#ifndef CONFIG_X86_PAE
unsigned long mmu_cr4_features;
#else
@@ -204,7 +214,7 @@ EXPORT_SYMBOL(ist_info);
extern void early_cpu_init(void);
extern int root_mountflags;
-unsigned long saved_videomode;
+unsigned long saved_video_mode;
#define RAMDISK_IMAGE_START_MASK 0x07FF
#define RAMDISK_PROMPT_FLAG 0x8000
@@ -259,7 +269,7 @@ static inline void copy_edd(void)
}
#endif
-int __initdata user_defined_memmap = 0;
+int __initdata user_defined_memmap;
/*
* "mem=nopentium" disables the 4MB page tables.
@@ -420,20 +430,59 @@ unsigned long __init find_max_low_pfn(vo
}
#ifndef CONFIG_XEN
+#define BIOS_LOWMEM_KILOBYTES 0x413
+
/*
- * workaround for Dell systems that neglect to reserve EBDA
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
*/
static void __init reserve_ebda_region(void)
{
- unsigned int addr;
- addr = get_bios_ebda();
- if (addr)
- reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
+ unsigned int lowmem, ebda_addr;
+
+ /* To determine the position of the EBDA and the */
+ /* end of conventional memory, we need to look at */
+ /* the BIOS data area. In a paravirtual environment */
+ /* that area is absent. We'll just have to assume */
+ /* that the paravirt case can handle memory setup */
+ /* correctly, without our help. */
+ if (paravirt_enabled())
+ return;
+
+ /* end of low (conventional) memory */
+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+ lowmem <<= 10;
+
+ /* start of EBDA area */
+ ebda_addr = get_bios_ebda();
+
+ /* Fixup: bios puts an EBDA in the top 64K segment */
+ /* of conventional memory, but does not adjust lowmem. */
+ if ((lowmem - ebda_addr) <= 0x10000)
+ lowmem = ebda_addr;
+
+ /* Fixup: bios does not report an EBDA at all. */
+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+ lowmem = 0x9f000;
+
+ /* Paranoia: should never happen, but... */
+ if ((lowmem == 0) || (lowmem >= 0x100000))
+ lowmem = 0x9f000;
+
+ /* reserve all memory between lowmem and the 1MB mark */
+ reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
}
#endif
#ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init setup_bootmem_allocator(void);
+static void __init setup_bootmem_allocator(void);
static unsigned long __init setup_memory(void)
{
/*
@@ -469,7 +518,7 @@ static unsigned long __init setup_memory
return max_low_pfn;
}
-void __init zone_sizes_init(void)
+static void __init zone_sizes_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
@@ -523,10 +572,16 @@ static void __init reserve_crashkernel(v
(unsigned long)(crash_size >> 20),
(unsigned long)(crash_base >> 20),
(unsigned long)(total_mem >> 20));
+
+ if (reserve_bootmem(crash_base, crash_size,
+ BOOTMEM_EXCLUSIVE) < 0) {
+ printk(KERN_INFO "crashkernel reservation "
+ "failed - memory is in use\n");
+ return;
+ }
+
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
- reserve_bootmem(crash_base, crash_size,
- BOOTMEM_DEFAULT);
} else
printk(KERN_INFO "crashkernel reservation failed - "
"you have to specify a base address\n");
@@ -660,16 +715,9 @@ void __init setup_bootmem_allocator(void
*/
reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
- /* reserve EBDA region, it's a 4K region */
+ /* reserve EBDA region */
reserve_ebda_region();
- /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
- PCI prefetch into it (errata #56). Usually the page is reserved anyways,
- unless you have no PS/2 mouse plugged in. */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 == 6)
- reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
-
#ifdef CONFIG_SMP
/*
* But first pinch a few for the stack/trampoline stuff
@@ -691,6 +739,8 @@ void __init setup_bootmem_allocator(void
#endif
numa_kva_reserve();
reserve_crashkernel();
+
+ reserve_ibft_region();
}
/*
@@ -726,6 +776,18 @@ char * __init __attribute__((weak)) memo
return machine_specific_memory_setup();
}
+#ifdef CONFIG_NUMA
+/*
+ * One day, when i386 and x86_64 are fully integrated, this will
+ * not live here
+ */
+void *x86_cpu_to_node_map_early_ptr;
+int x86_cpu_to_node_map_init[NR_CPUS] = {
+ [0 ... NR_CPUS-1] = NUMA_NO_NODE
+};
+DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
+#endif
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -775,7 +837,7 @@ void __init setup_arch(char **cmdline_p)
copy_edid();
apm_info.bios = boot_params.apm_bios_info;
ist_info = boot_params.ist_info;
- saved_videomode = boot_params.hdr.vid_mode;
+ saved_video_mode = boot_params.hdr.vid_mode;
if( boot_params.sys_desc_table.length != 0 ) {
set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
machine_id = boot_params.sys_desc_table.table[0];
@@ -842,15 +904,19 @@ void __init setup_arch(char **cmdline_p)
efi_init();
/* update e820 for memory not covered by WB MTRRs */
- find_max_pfn();
+ propagate_e820_map();
mtrr_bp_init();
#ifndef CONFIG_XEN
if (mtrr_trim_uncached_memory(max_pfn))
- find_max_pfn();
+ propagate_e820_map();
#endif
max_low_pfn = setup_memory();
+#ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+#endif
+
#ifdef CONFIG_VMI
/*
* Must be after max_low_pfn is determined, and before kernel
@@ -858,6 +924,7 @@ void __init setup_arch(char **cmdline_p)
*/
vmi_init();
#endif
+ kvm_guest_init();
/*
* NOTE: before this point _nobody_ is allowed to allocate
@@ -979,6 +1046,18 @@ void __init setup_arch(char **cmdline_p)
io_delay_init();
+#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
+ /*
+ * setup to use the early static init tables during kernel startup
+ * X86_SMP will exclude sub-arches that don't deal well with it.
+ */
+ x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
+ x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
+#ifdef CONFIG_NUMA
+ x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
+#endif
+#endif
+
#ifdef CONFIG_X86_GENERICARCH
generic_apic_probe();
#endif
--- head-2010-05-25.orig/arch/x86/kernel/setup_64-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/setup_64-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -29,18 +29,22 @@
#include <linux/crash_dump.h>
#include <linux/root_dev.h>
#include <linux/pci.h>
+#include <asm/pci-direct.h>
#include <linux/efi.h>
#include <linux/acpi.h>
#include <linux/kallsyms.h>
#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
#include <linux/mmzone.h>
#include <linux/kexec.h>
#include <linux/cpufreq.h>
#include <linux/dmi.h>
#include <linux/dma-mapping.h>
#include <linux/ctype.h>
+#include <linux/sort.h>
#include <linux/uaccess.h>
#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
#include <asm/mtrr.h>
#include <asm/uaccess.h>
@@ -58,7 +62,6 @@
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/setup.h>
-#include <asm/mach_apic.h>
#include <asm/numa.h>
#include <asm/sections.h>
#include <asm/dmi.h>
@@ -66,6 +69,9 @@
#include <asm/mce.h>
#include <asm/ds.h>
#include <asm/topology.h>
+#include <asm/pat.h>
+
+#include <mach_apic.h>
#ifdef CONFIG_XEN
#include <linux/percpu.h>
#include <xen/interface/physdev.h>
@@ -149,7 +155,7 @@ extern int root_mountflags;
char __initdata command_line[COMMAND_LINE_SIZE];
-struct resource standard_io_resources[] = {
+static struct resource standard_io_resources[] = {
{ .name = "dma1", .start = 0x00, .end = 0x1f,
.flags = IORESOURCE_BUSY | IORESOURCE_IO },
{ .name = "pic1", .start = 0x20, .end = 0x21,
@@ -158,7 +164,9 @@ struct resource standard_io_resources[]
.flags = IORESOURCE_BUSY | IORESOURCE_IO },
{ .name = "timer1", .start = 0x50, .end = 0x53,
.flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "keyboard", .start = 0x60, .end = 0x6f,
+ { .name = "keyboard", .start = 0x60, .end = 0x60,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x64, .end = 0x64,
.flags = IORESOURCE_BUSY | IORESOURCE_IO },
{ .name = "dma page reg", .start = 0x80, .end = 0x8f,
.flags = IORESOURCE_BUSY | IORESOURCE_IO },
@@ -224,10 +232,10 @@ contig_initmem_init(unsigned long start_
e820_register_active_regions(0, start_pfn, end_pfn);
#ifdef CONFIG_XEN
if (xen_start_info->nr_pages < end_pfn)
- free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
- else
+ end_pfn = xen_start_info->nr_pages;
#endif
free_bootmem_with_active_regions(0, end_pfn);
+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}
#endif
@@ -291,6 +299,7 @@ static void __init reserve_crashkernel(v
(unsigned long)(total_mem >> 20));
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
+ insert_resource(&iomem_resource, &crashk_res);
}
}
#else
@@ -307,6 +316,40 @@ void __attribute__((weak)) __init memory
machine_specific_memory_setup();
}
+static void __init parse_setup_data(void)
+{
+ struct setup_data *data;
+ unsigned long pa_data;
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, PAGE_SIZE);
+ switch (data->type) {
+ default:
+ break;
+ }
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+ free_early(pa_data, pa_data+sizeof(*data)+data->len);
+#endif
+ pa_data = data->next;
+ early_iounmap(data, PAGE_SIZE);
+ }
+}
+
+#ifdef CONFIG_PCI_MMCONFIG
+extern void __cpuinit fam10h_check_enable_mmcfg(void);
+extern void __init check_enable_amd_mmconf_dmi(void);
+#else
+void __cpuinit fam10h_check_enable_mmcfg(void)
+{
+}
+void __init check_enable_amd_mmconf_dmi(void)
+{
+}
+#endif
+
/*
* setup_arch - architecture-specific boot-time initializations
*
@@ -390,6 +433,8 @@ void __init setup_arch(char **cmdline_p)
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
+ parse_setup_data();
+
parse_early_param();
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
@@ -399,6 +444,13 @@ void __init setup_arch(char **cmdline_p)
finish_e820_parsing();
+#ifndef CONFIG_XEN
+ /* after parse_early_param, so could debug it */
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
+#endif
+
early_gart_iommu_check();
e820_register_active_regions(0, 0, -1UL);
@@ -421,15 +473,23 @@ void __init setup_arch(char **cmdline_p)
check_efer();
- init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
+ max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
if (efi_enabled)
efi_init();
+#ifndef CONFIG_XEN
+ vsmp_init();
+#endif
+
if (is_initial_xendomain())
dmi_scan_machine();
io_delay_init();
+#ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+#endif
+
#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
/* setup to use the early static init tables during kernel startup */
x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
@@ -460,9 +520,9 @@ void __init setup_arch(char **cmdline_p)
contig_initmem_init(0, end_pfn);
#endif
- early_res_to_bootmem();
-
#ifndef CONFIG_XEN
+ dma32_reserve_bootmem();
+
#ifdef CONFIG_ACPI_SLEEP
/*
* Reserve low memory region for sleep support.
@@ -488,16 +548,17 @@ void __init setup_arch(char **cmdline_p)
unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
if (ramdisk_end <= end_of_mem) {
-#ifndef CONFIG_XEN
- reserve_bootmem_generic(ramdisk_image, ramdisk_size);
-#endif
+ /*
+ * don't need to reserve again, already reserved early
+ * in x86_64_start_kernel, and early_res_to_bootmem
+		 * converts that to reserved in bootmem
+ */
initrd_start = ramdisk_image + PAGE_OFFSET;
initrd_end = initrd_start+ramdisk_size;
#ifdef CONFIG_XEN
initrd_below_start_ok = 1;
#endif
} else {
- /* Assumes everything on node 0 */
free_bootmem(ramdisk_image, ramdisk_size);
printk(KERN_ERR "initrd extends beyond end of memory "
"(0x%08lx > 0x%08lx)\ndisabling initrd\n",
@@ -507,6 +568,9 @@ void __init setup_arch(char **cmdline_p)
}
#endif
reserve_crashkernel();
+
+ reserve_ibft_region();
+
paging_init();
map_vsyscall();
#ifdef CONFIG_X86_LOCAL_APIC
@@ -634,16 +698,16 @@ void __init setup_arch(char **cmdline_p)
prefill_possible_map();
#endif
+ kvm_guest_init();
+
/*
* We trust e820 completely. No explicit ROM probing in memory.
*/
#ifdef CONFIG_XEN
if (is_initial_xendomain())
- e820_reserve_resources(machine_e820.map, machine_e820.nr_map,
- &code_resource, &data_resource, &bss_resource);
+ e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
#else
- e820_reserve_resources(e820.map, e820.nr_map,
- &code_resource, &data_resource, &bss_resource);
+ e820_reserve_resources(e820.map, e820.nr_map);
e820_mark_nosave_regions();
#endif
@@ -691,6 +755,9 @@ void __init setup_arch(char **cmdline_p)
#endif
#endif /* !CONFIG_XEN */
+
+ /* do this before identify_cpu for boot cpu */
+ check_enable_amd_mmconf_dmi();
}
#ifdef CONFIG_XEN
@@ -787,9 +854,9 @@ static void __cpuinit amd_detect_cmp(str
bits = c->x86_coreid_bits;
/* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
- /* Convert the APIC ID into the socket ID */
- c->phys_proc_id = phys_pkg_id(bits);
+ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+ /* Convert the initial APIC ID into the socket ID */
+ c->phys_proc_id = c->initial_apicid >> bits;
#ifdef CONFIG_NUMA
node = c->phys_proc_id;
@@ -806,7 +873,7 @@ static void __cpuinit amd_detect_cmp(str
If that doesn't result in a usable node fall back to the
path for the previous case. */
- int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
+ int ht_nodeid = c->initial_apicid;
if (ht_nodeid >= 0 &&
apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
@@ -914,7 +981,7 @@ static void __cpuinit init_amd(struct cp
/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
- clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
+ clear_cpu_cap(c, 0*32+31);
/* On C+ stepping K8 rep microcode works well for copy/memset */
level = cpuid_eax(1);
@@ -956,9 +1023,25 @@ static void __cpuinit init_amd(struct cp
/* MFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+ if (c->x86 == 0x10)
+ fam10h_check_enable_mmcfg();
+
#ifndef CONFIG_XEN
if (amd_apic_timer_broken())
disable_apic_timer = 1;
+
+ if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+ unsigned long long tseg;
+
+ /*
+ * Split up direct mapping around the TSEG SMM area.
+			 * Don't do it for gbpages because there seems to be very
+			 * little benefit in doing so.
+ */
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
+ (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
+ set_memory_4k((unsigned long)__va(tseg), 1);
+ }
#endif
}
@@ -1052,7 +1135,7 @@ static void __cpuinit early_init_intel(s
{
if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
(c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
}
static void __cpuinit init_intel(struct cpuinfo_x86 *c)
@@ -1095,9 +1178,6 @@ static void __cpuinit init_intel(struct
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -1106,6 +1186,32 @@ static void __cpuinit init_intel(struct
srat_detect_node();
}
+static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+{
+ if (c->x86 == 0x6 && c->x86_model >= 0xf)
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+{
+ /* Cache sizes */
+ unsigned n;
+
+ n = c->extended_cpuid_level;
+ if (n >= 0x80000008) {
+ unsigned eax = cpuid_eax(0x80000008);
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ }
+
+ if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ }
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+}
+
static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
@@ -1114,6 +1220,8 @@ static void __cpuinit get_cpu_vendor(str
c->x86_vendor = X86_VENDOR_AMD;
else if (!strcmp(v, "GenuineIntel"))
c->x86_vendor = X86_VENDOR_INTEL;
+ else if (!strcmp(v, "CentaurHauls"))
+ c->x86_vendor = X86_VENDOR_CENTAUR;
else
c->x86_vendor = X86_VENDOR_UNKNOWN;
}
@@ -1161,15 +1269,16 @@ static void __cpuinit early_identify_cpu
c->x86 += (tfms >> 20) & 0xff;
if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xF) << 4;
- if (c->x86_capability[0] & (1<<19))
+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
} else {
/* Have CPUID level 0 only - unheard of */
c->x86 = 4;
}
+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
#ifdef CONFIG_SMP
- c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
+ c->phys_proc_id = c->initial_apicid;
#endif
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
@@ -1202,8 +1311,12 @@ static void __cpuinit early_identify_cpu
case X86_VENDOR_INTEL:
early_init_intel(c);
break;
+ case X86_VENDOR_CENTAUR:
+ early_init_centaur(c);
+ break;
}
+ validate_pat_support(c);
}
/*
@@ -1240,6 +1353,10 @@ void __cpuinit identify_cpu(struct cpuin
init_intel(c);
break;
+ case X86_VENDOR_CENTAUR:
+ init_centaur(c);
+ break;
+
case X86_VENDOR_UNKNOWN:
default:
display_cacheinfo(c);
@@ -1269,14 +1386,24 @@ void __cpuinit identify_cpu(struct cpuin
#endif
select_idle_routine(c);
- if (c != &boot_cpu_data)
- mtrr_ap_init();
#ifdef CONFIG_NUMA
numa_add_cpu(smp_processor_id());
#endif
}
+void __cpuinit identify_boot_cpu(void)
+{
+ identify_cpu(&boot_cpu_data);
+}
+
+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+ BUG_ON(c == &boot_cpu_data);
+ identify_cpu(c);
+ mtrr_ap_init();
+}
+
static __init int setup_noclflush(char *arg)
{
setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
@@ -1305,123 +1432,3 @@ static __init int setup_disablecpuid(cha
return 1;
}
__setup("clearcpuid=", setup_disablecpuid);
-
-/*
- * Get CPU information for use by the procfs.
- */
-
-static int show_cpuinfo(struct seq_file *m, void *v)
-{
- struct cpuinfo_x86 *c = v;
- int cpu = 0, i;
-
-#ifdef CONFIG_SMP
- cpu = c->cpu_index;
-#endif
-
- seq_printf(m, "processor\t: %u\n"
- "vendor_id\t: %s\n"
- "cpu family\t: %d\n"
- "model\t\t: %d\n"
- "model name\t: %s\n",
- (unsigned)cpu,
- c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
- c->x86,
- (int)c->x86_model,
- c->x86_model_id[0] ? c->x86_model_id : "unknown");
-
- if (c->x86_mask || c->cpuid_level >= 0)
- seq_printf(m, "stepping\t: %d\n", c->x86_mask);
- else
- seq_printf(m, "stepping\t: unknown\n");
-
- if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = cpufreq_quick_get((unsigned)cpu);
-
- if (!freq)
- freq = cpu_khz;
- seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
- freq / 1000, (freq % 1000));
- }
-
- /* Cache size */
- if (c->x86_cache_size >= 0)
- seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
-
-#ifdef CONFIG_SMP
- if (smp_num_siblings * c->x86_max_cores > 1) {
- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
- seq_printf(m, "siblings\t: %d\n",
- cpus_weight(per_cpu(cpu_core_map, cpu)));
- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
- seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
- }
-#endif
-
- seq_printf(m,
- "fpu\t\t: yes\n"
- "fpu_exception\t: yes\n"
- "cpuid level\t: %d\n"
- "wp\t\t: yes\n"
- "flags\t\t:",
- c->cpuid_level);
-
- for (i = 0; i < 32*NCAPINTS; i++)
- if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
- seq_printf(m, " %s", x86_cap_flags[i]);
-
- seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
- c->loops_per_jiffy/(500000/HZ),
- (c->loops_per_jiffy/(5000/HZ)) % 100);
-
- if (c->x86_tlbsize > 0)
- seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
- seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
- seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
-
- seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
- c->x86_phys_bits, c->x86_virt_bits);
-
- seq_printf(m, "power management:");
- for (i = 0; i < 32; i++) {
- if (c->x86_power & (1 << i)) {
- if (i < ARRAY_SIZE(x86_power_flags) &&
- x86_power_flags[i])
- seq_printf(m, "%s%s",
- x86_power_flags[i][0]?" ":"",
- x86_power_flags[i]);
- else
- seq_printf(m, " [%d]", i);
- }
- }
-
- seq_printf(m, "\n\n");
-
- return 0;
-}
-
-static void *c_start(struct seq_file *m, loff_t *pos)
-{
- if (*pos == 0) /* just in case, cpu 0 is not the first */
- *pos = first_cpu(cpu_online_map);
- if ((*pos) < NR_CPUS && cpu_online(*pos))
- return &cpu_data(*pos);
- return NULL;
-}
-
-static void *c_next(struct seq_file *m, void *v, loff_t *pos)
-{
- *pos = next_cpu(*pos, cpu_online_map);
- return c_start(m, pos);
-}
-
-static void c_stop(struct seq_file *m, void *v)
-{
-}
-
-const struct seq_operations cpuinfo_op = {
- .start = c_start,
- .next = c_next,
- .stop = c_stop,
- .show = show_cpuinfo,
-};
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/kernel/smp-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -0,0 +1,329 @@
+/*
+ * Intel SMP support routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
+ *
+ * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
+ *
+ * This code is released under the GNU General Public License version 2 or
+ * later.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <mach_ipi.h>
+#include <xen/evtchn.h>
+/*
+ * Some notes on x86 processor bugs affecting SMP operation:
+ *
+ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
+ * The Linux implications for SMP are handled as follows:
+ *
+ * Pentium III / [Xeon]
+ * None of the E1AP-E3AP errata are visible to the user.
+ *
+ * E1AP. see PII A1AP
+ * E2AP. see PII A2AP
+ * E3AP. see PII A3AP
+ *
+ * Pentium II / [Xeon]
+ * None of the A1AP-A3AP errata are visible to the user.
+ *
+ * A1AP. see PPro 1AP
+ * A2AP. see PPro 2AP
+ * A3AP. see PPro 7AP
+ *
+ * Pentium Pro
+ * None of 1AP-9AP errata are visible to the normal user,
+ * except occasional delivery of 'spurious interrupt' as trap #15.
+ * This is very rare and a non-problem.
+ *
+ * 1AP. Linux maps APIC as non-cacheable
+ * 2AP. worked around in hardware
+ * 3AP. fixed in C0 and above steppings microcode update.
+ * Linux does not use excessive STARTUP_IPIs.
+ * 4AP. worked around in hardware
+ * 5AP. symmetric IO mode (normal Linux operation) not affected.
+ * 'noapic' mode has vector 0xf filled out properly.
+ * 6AP. 'noapic' mode might be affected - fixed in later steppings
+ *	7AP.	We do not assume writes to the LVT deasserting IRQs
+ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
+ * 9AP. We do not use mixed mode
+ *
+ * Pentium
+ * There is a marginal case where REP MOVS on 100MHz SMP
+ * machines with B stepping processors can fail. XXX should provide
+ * an L1cache=Writethrough or L1cache=off option.
+ *
+ * B stepping CPUs may hang. There are hardware work arounds
+ * for this. We warn about it in case your board doesn't have the work
+ * arounds. Basically that's so I can tell anyone with a B stepping
+ * CPU and SMP problems "tough".
+ *
+ * Specific items [From Pentium Processor Specification Update]
+ *
+ * 1AP. Linux doesn't use remote read
+ * 2AP. Linux doesn't trust APIC errors
+ * 3AP. We work around this
+ * 4AP. Linux never generated 3 interrupts of the same priority
+ * to cause a lost local interrupt.
+ * 5AP. Remote read is never used
+ * 6AP. not affected - worked around in hardware
+ * 7AP. not affected - worked around in hardware
+ * 8AP. worked around in hardware - we get explicit CS errors if not
+ * 9AP. only 'noapic' mode affected. Might generate spurious
+ * interrupts, we log only the first one and count the
+ * rest silently.
+ * 10AP. not affected - worked around in hardware
+ * 11AP. Linux reads the APIC between writes to avoid this, as per
+ * the documentation. Make sure you preserve this as it affects
+ * the C stepping chips too.
+ * 12AP. not affected - worked around in hardware
+ * 13AP. not affected - worked around in hardware
+ * 14AP. we always deassert INIT during bootup
+ * 15AP. not affected - worked around in hardware
+ * 16AP. not affected - worked around in hardware
+ * 17AP. not affected - worked around in hardware
+ * 18AP. not affected - worked around in hardware
+ * 19AP. not affected - worked around in BIOS
+ *
+ * If this sounds worrying believe me these bugs are either ___RARE___,
+ * or are signal timing bugs worked around in hardware and there's
+ * about nothing of note with C stepping upwards.
+ */
+
+/*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ */
+void xen_smp_send_reschedule(int cpu)
+{
+ if (unlikely(cpu_is_offline(cpu))) {
+ WARN_ON(1);
+ return;
+ }
+ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+}
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ */
+static DEFINE_SPINLOCK(call_lock);
+
+struct call_data_struct {
+ void (*func) (void *info);
+ void *info;
+ atomic_t started;
+ atomic_t finished;
+ int wait;
+};
+
+void lock_ipi_call_lock(void)
+{
+ spin_lock_irq(&call_lock);
+}
+
+void unlock_ipi_call_lock(void)
+{
+ spin_unlock_irq(&call_lock);
+}
+
+static struct call_data_struct *call_data;
+
+static void __smp_call_function(void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ struct call_data_struct data;
+ int cpus = num_online_cpus() - 1;
+
+ if (!cpus)
+ return;
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ mb();
+
+ /* Send a message to all other CPUs and wait for them to respond */
+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus)
+ cpu_relax();
+
+ if (wait)
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
+}
+
+
+/**
+ * smp_call_function_mask(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on. Must not include the current cpu.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int
+xen_smp_call_function_mask(cpumask_t mask,
+ void (*func)(void *), void *info,
+ int wait)
+{
+ struct call_data_struct data;
+ cpumask_t allbutself;
+ int cpus;
+
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+
+ /* Holding any lock stops cpus from going down. */
+ spin_lock(&call_lock);
+
+ allbutself = cpu_online_map;
+ cpu_clear(smp_processor_id(), allbutself);
+
+ cpus_and(mask, mask, allbutself);
+ cpus = cpus_weight(mask);
+
+ if (!cpus) {
+ spin_unlock(&call_lock);
+ return 0;
+ }
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ wmb();
+
+ /* Send a message to other CPUs */
+ if (cpus_equal(mask, allbutself) &&
+ cpus_equal(cpu_online_map, cpu_callout_map))
+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+ else
+ send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus)
+ cpu_relax();
+
+ if (wait)
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
+ spin_unlock(&call_lock);
+
+ return 0;
+}
+
+static void stop_this_cpu(void *dummy)
+{
+ local_irq_disable();
+ /*
+ * Remove this CPU:
+ */
+ cpu_clear(smp_processor_id(), cpu_online_map);
+ disable_all_local_evtchn();
+ if (hlt_works(smp_processor_id()))
+ for (;;) halt();
+ for (;;);
+}
+
+/*
+ * this function calls the 'stop' function on all other CPUs in the system.
+ */
+
+void xen_smp_send_stop(void)
+{
+ int nolock;
+ unsigned long flags;
+
+ /* Don't deadlock on the call lock in panic */
+ nolock = !spin_trylock(&call_lock);
+ local_irq_save(flags);
+ __smp_call_function(stop_this_cpu, NULL, 0, 0);
+ if (!nolock)
+ spin_unlock(&call_lock);
+ disable_all_local_evtchn();
+ local_irq_restore(flags);
+}
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
+{
+#ifdef CONFIG_X86_32
+ __get_cpu_var(irq_stat).irq_resched_count++;
+#else
+ add_pda(irq_resched_count, 1);
+#endif
+ return IRQ_HANDLED;
+}
+
+irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
+{
+ void (*func) (void *info) = call_data->func;
+ void *info = call_data->info;
+ int wait = call_data->wait;
+
+ /*
+ * Notify initiating CPU that I've grabbed the data and am
+ * about to execute the function
+ */
+ mb();
+ atomic_inc(&call_data->started);
+ /*
+ * At this point the info structure may be out of scope unless wait==1
+ */
+ irq_enter();
+ (*func)(info);
+#ifdef CONFIG_X86_32
+ __get_cpu_var(irq_stat).irq_call_count++;
+#else
+ add_pda(irq_call_count, 1);
+#endif
+ irq_exit();
+
+ if (wait) {
+ mb();
+ atomic_inc(&call_data->finished);
+ }
+
+ return IRQ_HANDLED;
+}
--- head-2010-05-25.orig/arch/x86/kernel/smp_32-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,647 +0,0 @@
-/*
- * Intel SMP support routines.
- *
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
- */
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/cache.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/module.h>
-
-#include <asm/mtrr.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#if 0
-#include <mach_apic.h>
-#endif
-#include <xen/evtchn.h>
-
-/*
- * Some notes on x86 processor bugs affecting SMP operation:
- *
- * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
- * The Linux implications for SMP are handled as follows:
- *
- * Pentium III / [Xeon]
- * None of the E1AP-E3AP errata are visible to the user.
- *
- * E1AP. see PII A1AP
- * E2AP. see PII A2AP
- * E3AP. see PII A3AP
- *
- * Pentium II / [Xeon]
- * None of the A1AP-A3AP errata are visible to the user.
- *
- * A1AP. see PPro 1AP
- * A2AP. see PPro 2AP
- * A3AP. see PPro 7AP
- *
- * Pentium Pro
- * None of 1AP-9AP errata are visible to the normal user,
- * except occasional delivery of 'spurious interrupt' as trap #15.
- * This is very rare and a non-problem.
- *
- * 1AP. Linux maps APIC as non-cacheable
- * 2AP. worked around in hardware
- * 3AP. fixed in C0 and above steppings microcode update.
- * Linux does not use excessive STARTUP_IPIs.
- * 4AP. worked around in hardware
- * 5AP. symmetric IO mode (normal Linux operation) not affected.
- * 'noapic' mode has vector 0xf filled out properly.
- * 6AP. 'noapic' mode might be affected - fixed in later steppings
- * 7AP. We do not assume writes to the LVT deassering IRQs
- * 8AP. We do not enable low power mode (deep sleep) during MP bootup
- * 9AP. We do not use mixed mode
- *
- * Pentium
- * There is a marginal case where REP MOVS on 100MHz SMP
- * machines with B stepping processors can fail. XXX should provide
- * an L1cache=Writethrough or L1cache=off option.
- *
- * B stepping CPUs may hang. There are hardware work arounds
- * for this. We warn about it in case your board doesn't have the work
- * arounds. Basically that's so I can tell anyone with a B stepping
- * CPU and SMP problems "tough".
- *
- * Specific items [From Pentium Processor Specification Update]
- *
- * 1AP. Linux doesn't use remote read
- * 2AP. Linux doesn't trust APIC errors
- * 3AP. We work around this
- * 4AP. Linux never generated 3 interrupts of the same priority
- * to cause a lost local interrupt.
- * 5AP. Remote read is never used
- * 6AP. not affected - worked around in hardware
- * 7AP. not affected - worked around in hardware
- * 8AP. worked around in hardware - we get explicit CS errors if not
- * 9AP. only 'noapic' mode affected. Might generate spurious
- * interrupts, we log only the first one and count the
- * rest silently.
- * 10AP. not affected - worked around in hardware
- * 11AP. Linux reads the APIC between writes to avoid this, as per
- * the documentation. Make sure you preserve this as it affects
- * the C stepping chips too.
- * 12AP. not affected - worked around in hardware
- * 13AP. not affected - worked around in hardware
- * 14AP. we always deassert INIT during bootup
- * 15AP. not affected - worked around in hardware
- * 16AP. not affected - worked around in hardware
- * 17AP. not affected - worked around in hardware
- * 18AP. not affected - worked around in hardware
- * 19AP. not affected - worked around in BIOS
- *
- * If this sounds worrying believe me these bugs are either ___RARE___,
- * or are signal timing bugs worked around in hardware and there's
- * about nothing of note with C stepping upwards.
- */
-
-DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
-
-/*
- * the following functions deal with sending IPIs between CPUs.
- *
- * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
- */
-
-static inline int __prepare_ICR (unsigned int shortcut, int vector)
-{
- unsigned int icr = shortcut | APIC_DEST_LOGICAL;
-
- switch (vector) {
- default:
- icr |= APIC_DM_FIXED | vector;
- break;
- case NMI_VECTOR:
- icr |= APIC_DM_NMI;
- break;
- }
- return icr;
-}
-
-static inline int __prepare_ICR2 (unsigned int mask)
-{
- return SET_APIC_DEST_FIELD(mask);
-}
-
-DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
-
-static inline void __send_IPI_one(unsigned int cpu, int vector)
-{
- int irq = per_cpu(ipi_to_irq, cpu)[vector];
- BUG_ON(irq < 0);
- notify_remote_via_irq(irq);
-}
-
-void __send_IPI_shortcut(unsigned int shortcut, int vector)
-{
- int cpu;
-
- switch (shortcut) {
- case APIC_DEST_SELF:
- __send_IPI_one(smp_processor_id(), vector);
- break;
- case APIC_DEST_ALLBUT:
- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
- if (cpu == smp_processor_id())
- continue;
- if (cpu_isset(cpu, cpu_online_map)) {
- __send_IPI_one(cpu, vector);
- }
- }
- break;
- default:
- printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
- vector);
- break;
- }
-}
-
-void send_IPI_self(int vector)
-{
- __send_IPI_shortcut(APIC_DEST_SELF, vector);
-}
-
-/*
- * This is only used on smaller machines.
- */
-void send_IPI_mask_bitmask(cpumask_t mask, int vector)
-{
- unsigned long flags;
- unsigned int cpu;
-
- local_irq_save(flags);
- WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
-
- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
- if (cpu_isset(cpu, mask)) {
- __send_IPI_one(cpu, vector);
- }
- }
-
- local_irq_restore(flags);
-}
-
-void send_IPI_mask_sequence(cpumask_t mask, int vector)
-{
-
- send_IPI_mask_bitmask(mask, vector);
-}
-
-#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
-
-#if 0 /* XEN */
-/*
- * Smarter SMP flushing macros.
- * c/o Linus Torvalds.
- *
- * These mean you can really definitely utterly forget about
- * writing to user space from interrupts. (Its not allowed anyway).
- *
- * Optimizations Manfred Spraul <manfred@colorfullife.com>
- */
-
-static cpumask_t flush_cpumask;
-static struct mm_struct * flush_mm;
-static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- *
- * We need to reload %cr3 since the page tables may be going
- * away from under us..
- */
-void leave_mm(int cpu)
-{
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
- BUG();
- cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
- load_cr3(swapper_pg_dir);
-}
-EXPORT_SYMBOL_GPL(leave_mm);
-
-/*
- *
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- * Stop ipi delivery for the old mm. This is not synchronized with
- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
- * for the wrong mm, and in the worst case we perform a superfluous
- * tlb flush.
- * 1a2) set cpu_tlbstate to TLBSTATE_OK
- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- * was in lazy tlb mode.
- * 1a3) update cpu_tlbstate[].active_mm
- * Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- * Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1b) thread switch without mm change
- * cpu_tlbstate[].active_mm is correct, cpu0 already handles
- * flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- * Atomically set the bit [other cpus will start sending flush ipis],
- * and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- * runs in kernel space, the cpu could load tlb entries for user space
- * pages.
- *
- * The good news is that cpu_tlbstate is local to each cpu, no
- * write/read ordering problems.
- */
-
-/*
- * TLB flush IPI:
- *
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- */
-
-irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id)
-{
- unsigned long cpu;
-
- cpu = get_cpu();
-
- if (!cpu_isset(cpu, flush_cpumask))
- goto out;
- /*
- * This was a BUG() but until someone can quote me the
- * line from the intel manual that guarantees an IPI to
- * multiple CPUs is retried _only_ on the erroring CPUs
- * its staying as a return
- *
- * BUG();
- */
-
- if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
- if (flush_va == TLB_FLUSH_ALL)
- local_flush_tlb();
- else
- __flush_tlb_one(flush_va);
- } else
- leave_mm(cpu);
- }
- smp_mb__before_clear_bit();
- cpu_clear(cpu, flush_cpumask);
- smp_mb__after_clear_bit();
-out:
- put_cpu_no_resched();
- __get_cpu_var(irq_stat).irq_tlb_count++;
-
- return IRQ_HANDLED;
-}
-
-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
- unsigned long va)
-{
- cpumask_t cpumask = *cpumaskp;
-
- /*
- * A couple of (to be removed) sanity checks:
- *
- * - current CPU must not be in mask
- * - mask must exist :)
- */
- BUG_ON(cpus_empty(cpumask));
- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
- BUG_ON(!mm);
-
-#ifdef CONFIG_HOTPLUG_CPU
- /* If a CPU which we ran on has gone down, OK. */
- cpus_and(cpumask, cpumask, cpu_online_map);
- if (unlikely(cpus_empty(cpumask)))
- return;
-#endif
-
- /*
- * i'm not happy about this global shared spinlock in the
- * MM hot path, but we'll see how contended it is.
- * AK: x86-64 has a faster method that could be ported.
- */
- spin_lock(&tlbstate_lock);
-
- flush_mm = mm;
- flush_va = va;
- cpus_or(flush_cpumask, cpumask, flush_cpumask);
- /*
- * We have to send the IPI only to
- * CPUs affected.
- */
- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
-
- while (!cpus_empty(flush_cpumask))
- /* nothing. lockup detection does not belong here */
- cpu_relax();
-
- flush_mm = NULL;
- flush_va = 0;
- spin_unlock(&tlbstate_lock);
-}
-
-void flush_tlb_current_task(void)
-{
- struct mm_struct *mm = current->mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- local_flush_tlb();
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
- preempt_enable();
-}
-
-void flush_tlb_mm (struct mm_struct * mm)
-{
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if (current->mm)
- local_flush_tlb();
- else
- leave_mm(smp_processor_id());
- }
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
-
- preempt_enable();
-}
-
-void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
-{
- struct mm_struct *mm = vma->vm_mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if(current->mm)
- __flush_tlb_one(va);
- else
- leave_mm(smp_processor_id());
- }
-
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, va);
-
- preempt_enable();
-}
-EXPORT_SYMBOL(flush_tlb_page);
-
-static void do_flush_tlb_all(void* info)
-{
- unsigned long cpu = smp_processor_id();
-
- __flush_tlb_all();
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
- leave_mm(cpu);
-}
-
-void flush_tlb_all(void)
-{
- on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
-}
-
-#endif /* XEN */
-
-/*
- * this function sends a 'reschedule' IPI to another CPU.
- * it goes straight through and wastes no time serializing
- * anything. Worst case is that we lose a reschedule ...
- */
-void xen_smp_send_reschedule(int cpu)
-{
- WARN_ON(cpu_is_offline(cpu));
- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
-}
-
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);
-
-struct call_data_struct {
- void (*func) (void *info);
- void *info;
- atomic_t started;
- atomic_t finished;
- int wait;
-};
-
-void lock_ipi_call_lock(void)
-{
- spin_lock_irq(&call_lock);
-}
-
-void unlock_ipi_call_lock(void)
-{
- spin_unlock_irq(&call_lock);
-}
-
-static struct call_data_struct *call_data;
-
-static void __smp_call_function(void (*func) (void *info), void *info,
- int nonatomic, int wait)
-{
- struct call_data_struct data;
- int cpus = num_online_cpus() - 1;
-
- if (!cpus)
- return;
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- mb();
-
- /* Send a message to all other CPUs and wait for them to respond */
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- cpu_relax();
-
- if (wait)
- while (atomic_read(&data.finished) != cpus)
- cpu_relax();
-}
-
-
-/**
- * smp_call_function_mask(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on. Must not include the current cpu.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-int
-xen_smp_call_function_mask(cpumask_t mask,
- void (*func)(void *), void *info,
- int wait)
-{
- struct call_data_struct data;
- cpumask_t allbutself;
- int cpus;
-
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
- /* Holding any lock stops cpus from going down. */
- spin_lock(&call_lock);
-
- allbutself = cpu_online_map;
- cpu_clear(smp_processor_id(), allbutself);
-
- cpus_and(mask, mask, allbutself);
- cpus = cpus_weight(mask);
-
- if (!cpus) {
- spin_unlock(&call_lock);
- return 0;
- }
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- mb();
-
- /* Send a message to other CPUs */
- if (cpus_equal(mask, allbutself))
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
- else
- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- cpu_relax();
-
- if (wait)
- while (atomic_read(&data.finished) != cpus)
- cpu_relax();
- spin_unlock(&call_lock);
-
- return 0;
-}
-
-static void stop_this_cpu (void * dummy)
-{
- local_irq_disable();
- /*
- * Remove this CPU:
- */
- cpu_clear(smp_processor_id(), cpu_online_map);
- disable_all_local_evtchn();
- if (cpu_data(smp_processor_id()).hlt_works_ok)
- for(;;) halt();
- for (;;);
-}
-
-/*
- * this function calls the 'stop' function on all other CPUs in the system.
- */
-
-void xen_smp_send_stop(void)
-{
- /* Don't deadlock on the call lock in panic */
- int nolock = !spin_trylock(&call_lock);
- unsigned long flags;
-
- local_irq_save(flags);
- __smp_call_function(stop_this_cpu, NULL, 0, 0);
- if (!nolock)
- spin_unlock(&call_lock);
- disable_all_local_evtchn();
- local_irq_restore(flags);
-}
-
-/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
- */
-irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
-{
- __get_cpu_var(irq_stat).irq_resched_count++;
-
- return IRQ_HANDLED;
-}
-
-#include <linux/kallsyms.h>
-irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
-{
- void (*func) (void *info) = call_data->func;
- void *info = call_data->info;
- int wait = call_data->wait;
-
- /*
- * Notify initiating CPU that I've grabbed the data and am
- * about to execute the function
- */
- mb();
- atomic_inc(&call_data->started);
- /*
- * At this point the info structure may be out of scope unless wait==1
- */
- irq_enter();
- (*func)(info);
- __get_cpu_var(irq_stat).irq_call_count++;
- irq_exit();
-
- if (wait) {
- mb();
- atomic_inc(&call_data->finished);
- }
-
- return IRQ_HANDLED;
-}
--- head-2010-05-25.orig/arch/x86/kernel/smp_64-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,554 +0,0 @@
-/*
- * Intel SMP support routines.
- *
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
- * (c) 2002,2003 Andi Kleen, SuSE Labs.
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
- */
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/interrupt.h>
-
-#include <asm/mtrr.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-#include <asm/mach_apic.h>
-#include <asm/mmu_context.h>
-#include <asm/proto.h>
-#include <asm/apicdef.h>
-#include <asm/idle.h>
-#ifdef CONFIG_XEN
-#include <xen/evtchn.h>
-#endif
-
-#ifndef CONFIG_XEN
-/*
- * Smarter SMP flushing macros.
- * c/o Linus Torvalds.
- *
- * These mean you can really definitely utterly forget about
- * writing to user space from interrupts. (Its not allowed anyway).
- *
- * Optimizations Manfred Spraul <manfred@colorfullife.com>
- *
- * More scalable flush, from Andi Kleen
- *
- * To avoid global state use 8 different call vectors.
- * Each CPU uses a specific vector to trigger flushes on other
- * CPUs. Depending on the received vector the target CPUs look into
- * the right per cpu variable for the flush data.
- *
- * With more than 8 CPUs they are hashed to the 8 available
- * vectors. The limited global vector space forces us to this right now.
- * In future when interrupts are split into per CPU domains this could be
- * fixed, at the cost of triggering multiple IPIs in some cases.
- */
-
-union smp_flush_state {
- struct {
- cpumask_t flush_cpumask;
- struct mm_struct *flush_mm;
- unsigned long flush_va;
- spinlock_t tlbstate_lock;
- };
- char pad[SMP_CACHE_BYTES];
-} ____cacheline_aligned;
-
-/* State is put into the per CPU data section, but padded
- to a full cache line because other CPUs can access it and we don't
- want false sharing in the per cpu data segment. */
-static DEFINE_PER_CPU(union smp_flush_state, flush_state);
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- */
-void leave_mm(int cpu)
-{
- if (read_pda(mmu_state) == TLBSTATE_OK)
- BUG();
- cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
- load_cr3(swapper_pg_dir);
-}
-EXPORT_SYMBOL_GPL(leave_mm);
-
-/*
- *
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- * Stop ipi delivery for the old mm. This is not synchronized with
- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
- * for the wrong mm, and in the worst case we perform a superfluous
- * tlb flush.
- * 1a2) set cpu mmu_state to TLBSTATE_OK
- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- * was in lazy tlb mode.
- * 1a3) update cpu active_mm
- * Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- * Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1b) thread switch without mm change
- * cpu active_mm is correct, cpu0 already handles
- * flush ipis.
- * 1b1) set cpu mmu_state to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- * Atomically set the bit [other cpus will start sending flush ipis],
- * and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- * runs in kernel space, the cpu could load tlb entries for user space
- * pages.
- *
- * The good news is that cpu mmu_state is local to each cpu, no
- * write/read ordering problems.
- */
-
-/*
- * TLB flush IPI:
- *
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- *
- * Interrupts are disabled.
- */
-
-asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
-{
- int cpu;
- int sender;
- union smp_flush_state *f;
-
- cpu = smp_processor_id();
- /*
- * orig_rax contains the negated interrupt vector.
- * Use that to determine where the sender put the data.
- */
- sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
- f = &per_cpu(flush_state, sender);
-
- if (!cpu_isset(cpu, f->flush_cpumask))
- goto out;
- /*
- * This was a BUG() but until someone can quote me the
- * line from the intel manual that guarantees an IPI to
- * multiple CPUs is retried _only_ on the erroring CPUs
- * its staying as a return
- *
- * BUG();
- */
-
- if (f->flush_mm == read_pda(active_mm)) {
- if (read_pda(mmu_state) == TLBSTATE_OK) {
- if (f->flush_va == TLB_FLUSH_ALL)
- local_flush_tlb();
- else
- __flush_tlb_one(f->flush_va);
- } else
- leave_mm(cpu);
- }
-out:
- ack_APIC_irq();
- cpu_clear(cpu, f->flush_cpumask);
- add_pda(irq_tlb_count, 1);
-}
-
-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
- unsigned long va)
-{
- int sender;
- union smp_flush_state *f;
- cpumask_t cpumask = *cpumaskp;
-
- /* Caller has disabled preemption */
- sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
- f = &per_cpu(flush_state, sender);
-
- /*
- * Could avoid this lock when
- * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
- * probably not worth checking this for a cache-hot lock.
- */
- spin_lock(&f->tlbstate_lock);
-
- f->flush_mm = mm;
- f->flush_va = va;
- cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
-
- /*
- * We have to send the IPI only to
- * CPUs affected.
- */
- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
-
- while (!cpus_empty(f->flush_cpumask))
- cpu_relax();
-
- f->flush_mm = NULL;
- f->flush_va = 0;
- spin_unlock(&f->tlbstate_lock);
-}
-
-int __cpuinit init_smp_flush(void)
-{
- int i;
-
- for_each_cpu_mask(i, cpu_possible_map) {
- spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
- }
- return 0;
-}
-core_initcall(init_smp_flush);
-
-void flush_tlb_current_task(void)
-{
- struct mm_struct *mm = current->mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- local_flush_tlb();
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
- preempt_enable();
-}
-
-void flush_tlb_mm (struct mm_struct * mm)
-{
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if (current->mm)
- local_flush_tlb();
- else
- leave_mm(smp_processor_id());
- }
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
-
- preempt_enable();
-}
-
-void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
-{
- struct mm_struct *mm = vma->vm_mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if(current->mm)
- __flush_tlb_one(va);
- else
- leave_mm(smp_processor_id());
- }
-
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, va);
-
- preempt_enable();
-}
-
-static void do_flush_tlb_all(void* info)
-{
- unsigned long cpu = smp_processor_id();
-
- __flush_tlb_all();
- if (read_pda(mmu_state) == TLBSTATE_LAZY)
- leave_mm(cpu);
-}
-
-void flush_tlb_all(void)
-{
- on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
-}
-#endif /* Xen */
-
-/*
- * this function sends a 'reschedule' IPI to another CPU.
- * it goes straight through and wastes no time serializing
- * anything. Worst case is that we lose a reschedule ...
- */
-
-void smp_send_reschedule(int cpu)
-{
- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
-}
-
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);
-
-struct call_data_struct {
- void (*func) (void *info);
- void *info;
- atomic_t started;
- atomic_t finished;
- int wait;
-};
-
-static struct call_data_struct * call_data;
-
-void lock_ipi_call_lock(void)
-{
- spin_lock_irq(&call_lock);
-}
-
-void unlock_ipi_call_lock(void)
-{
- spin_unlock_irq(&call_lock);
-}
-
-/*
- * this function sends a 'generic call function' IPI to all other CPU
- * of the system defined in the mask.
- */
-static int __smp_call_function_mask(cpumask_t mask,
- void (*func)(void *), void *info,
- int wait)
-{
- struct call_data_struct data;
- cpumask_t allbutself;
- int cpus;
-
- allbutself = cpu_online_map;
- cpu_clear(smp_processor_id(), allbutself);
-
- cpus_and(mask, mask, allbutself);
- cpus = cpus_weight(mask);
-
- if (!cpus)
- return 0;
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- wmb();
-
- /* Send a message to other CPUs */
- if (cpus_equal(mask, allbutself))
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
- else
- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- cpu_relax();
-
- if (!wait)
- return 0;
-
- while (atomic_read(&data.finished) != cpus)
- cpu_relax();
-
- return 0;
-}
-/**
- * smp_call_function_mask(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on. Must not include the current cpu.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-int smp_call_function_mask(cpumask_t mask,
- void (*func)(void *), void *info,
- int wait)
-{
- int ret;
-
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
- spin_lock(&call_lock);
- ret = __smp_call_function_mask(mask, func, info, wait);
- spin_unlock(&call_lock);
- return ret;
-}
-EXPORT_SYMBOL(smp_call_function_mask);
-
-/*
- * smp_call_function_single - Run a function on a specific CPU
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: Currently unused.
- * @wait: If true, wait until function has completed on other CPUs.
- *
- * Retrurns 0 on success, else a negative status code.
- *
- * Does not return until the remote CPU is nearly ready to execute <func>
- * or is or has executed.
- */
-
-int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
- int nonatomic, int wait)
-{
- /* prevent preemption and reschedule on another processor */
- int ret, me = get_cpu();
-
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
- if (cpu == me) {
- local_irq_disable();
- func(info);
- local_irq_enable();
- put_cpu();
- return 0;
- }
-
- ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
-
- put_cpu();
- return ret;
-}
-EXPORT_SYMBOL(smp_call_function_single);
-
-/*
- * smp_call_function - run a function on all other CPUs.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: currently unused.
- * @wait: If true, wait (atomically) until function has completed on other
- * CPUs.
- *
- * Returns 0 on success, else a negative status code. Does not return until
- * remote CPUs are nearly ready to execute func or are or have executed.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- * Actually there are a few legal cases, like panic.
- */
-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
- int wait)
-{
- return smp_call_function_mask(cpu_online_map, func, info, wait);
-}
-EXPORT_SYMBOL(smp_call_function);
-
-static void stop_this_cpu(void *dummy)
-{
- local_irq_disable();
- /*
- * Remove this CPU:
- */
- cpu_clear(smp_processor_id(), cpu_online_map);
- disable_all_local_evtchn();
- for (;;)
- halt();
-}
-
-void smp_send_stop(void)
-{
- int nolock;
- unsigned long flags;
-
-#ifndef CONFIG_XEN
- if (reboot_force)
- return;
-#endif
-
- /* Don't deadlock on the call lock in panic */
- nolock = !spin_trylock(&call_lock);
- local_irq_save(flags);
- __smp_call_function_mask(cpu_online_map, stop_this_cpu, NULL, 0);
- if (!nolock)
- spin_unlock(&call_lock);
- disable_all_local_evtchn();
- local_irq_restore(flags);
-}
-
-/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
- */
-#ifndef CONFIG_XEN
-asmlinkage void smp_reschedule_interrupt(void)
-#else
-asmlinkage irqreturn_t smp_reschedule_interrupt(int irq, void *ctx)
-#endif
-{
-#ifndef CONFIG_XEN
- ack_APIC_irq();
-#endif
- add_pda(irq_resched_count, 1);
-#ifdef CONFIG_XEN
- return IRQ_HANDLED;
-#endif
-}
-
-#ifndef CONFIG_XEN
-asmlinkage void smp_call_function_interrupt(void)
-#else
-asmlinkage irqreturn_t smp_call_function_interrupt(int irq, void *ctx)
-#endif
-{
- void (*func) (void *info) = call_data->func;
- void *info = call_data->info;
- int wait = call_data->wait;
-
-#ifndef CONFIG_XEN
- ack_APIC_irq();
-#endif
- /*
- * Notify initiating CPU that I've grabbed the data and am
- * about to execute the function
- */
- mb();
- atomic_inc(&call_data->started);
- /*
- * At this point the info structure may be out of scope unless wait==1
- */
- exit_idle();
- irq_enter();
- (*func)(info);
- add_pda(irq_call_count, 1);
- irq_exit();
- if (wait) {
- mb();
- atomic_inc(&call_data->finished);
- }
-#ifdef CONFIG_XEN
- return IRQ_HANDLED;
-#endif
-}
--- head-2010-05-25.orig/arch/x86/kernel/time-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/time-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -700,8 +700,6 @@ int xen_update_persistent_clock(void)
return 0;
}
-extern void (*late_time_init)(void);
-
/* Dynamically-mapped IRQ. */
DEFINE_PER_CPU(int, timer_irq);
--- head-2010-05-25.orig/arch/x86/kernel/traps_32-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/traps_32-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -9,26 +9,28 @@
* 'Traps.c' handles hardware traps and faults after we have saved some
* state in 'asm.s'.
*/
-#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/spinlock.h>
+#include <linux/highmem.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/kdebug.h>
#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
#include <linux/string.h>
+#include <linux/unwind.h>
+#include <linux/delay.h>
#include <linux/errno.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
#include <linux/timer.h>
-#include <linux/mm.h>
#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <linux/highmem.h>
-#include <linux/kallsyms.h>
-#include <linux/ptrace.h>
-#include <linux/utsname.h>
-#include <linux/kprobes.h>
-#include <linux/kexec.h>
-#include <linux/unwind.h>
-#include <linux/uaccess.h>
-#include <linux/nmi.h>
#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
#ifdef CONFIG_EISA
#include <linux/ioport.h>
@@ -43,21 +45,18 @@
#include <linux/edac.h>
#endif
+#include <asm/arch_hooks.h>
+#include <asm/stacktrace.h>
#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/atomic.h>
#include <asm/debugreg.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/unwind.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/nmi.h>
-#include <asm/unwind.h>
#include <asm/smp.h>
-#include <asm/arch_hooks.h>
-#include <linux/kdebug.h>
-#include <asm/stacktrace.h>
-
-#include <linux/module.h>
+#include <asm/io.h>
#include "mach_traps.h"
@@ -71,7 +70,7 @@ EXPORT_SYMBOL_GPL(used_vectors);
asmlinkage int system_call(void);
/* Do we ignore FPU interrupts ? */
-char ignore_fpu_irq = 0;
+char ignore_fpu_irq;
#ifndef CONFIG_X86_NO_IDT
/*
@@ -113,12 +112,13 @@ static unsigned int code_bytes = 64;
void printk_address(unsigned long address, int reliable)
{
#ifdef CONFIG_KALLSYMS
- unsigned long offset = 0, symsize;
+ char namebuf[KSYM_NAME_LEN];
+ unsigned long offset = 0;
+ unsigned long symsize;
const char *symname;
- char *modname;
- char *delim = ":";
- char namebuf[128];
char reliab[4] = "";
+ char *delim = ":";
+ char *modname;
symname = kallsyms_lookup(address, &symsize, &offset,
&modname, namebuf);
@@ -146,13 +146,14 @@ static inline int valid_stack_ptr(struct
/* The form of the top of the frame on the stack */
struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
+ struct stack_frame *next_frame;
+ unsigned long return_address;
};
-static inline unsigned long print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+static inline unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data)
{
struct stack_frame *frame = (struct stack_frame *)bp;
@@ -174,7 +175,7 @@ static inline unsigned long print_contex
return bp;
}
-#define MSG(msg) ops->warning(data, msg)
+#define MSG(msg) ops->warning(data, msg)
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
@@ -185,6 +186,7 @@ void dump_trace(struct task_struct *task
if (!stack) {
unsigned long dummy;
+
stack = &dummy;
if (task != current)
stack = (unsigned long *)task->thread.sp;
@@ -194,7 +196,7 @@ void dump_trace(struct task_struct *task
if (!bp) {
if (task == current) {
/* Grab bp right from our regs */
- asm ("movl %%ebp, %0" : "=r" (bp) : );
+ asm("movl %%ebp, %0" : "=r" (bp) :);
} else {
/* bp is the last reg pushed by switch_to */
bp = *(unsigned long *) task->thread.sp;
@@ -204,15 +206,18 @@ void dump_trace(struct task_struct *task
while (1) {
struct thread_info *context;
+
context = (struct thread_info *)
((unsigned long)stack & (~(THREAD_SIZE - 1)));
bp = print_context_stack(context, stack, bp, ops, data);
- /* Should be after the line below, but somewhere
- in early boot context comes out corrupted and we
- can't reference it -AK */
+ /*
+ * Should be after the line below, but somewhere
+ * in early boot context comes out corrupted and we
+ * can't reference it:
+ */
if (ops->stack(data, "IRQ") < 0)
break;
- stack = (unsigned long*)context->previous_esp;
+ stack = (unsigned long *)context->previous_esp;
if (!stack)
break;
touch_nmi_watchdog();
@@ -251,15 +256,15 @@ static void print_trace_address(void *da
}
static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
};
static void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
+ unsigned long *stack, unsigned long bp, char *log_lvl)
{
dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
printk("%s =======================\n", log_lvl);
@@ -271,21 +276,22 @@ void show_trace(struct task_struct *task
show_trace_log_lvl(task, regs, stack, bp, "");
}
-static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+static void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp, char *log_lvl)
{
unsigned long *stack;
int i;
if (sp == NULL) {
if (task)
- sp = (unsigned long*)task->thread.sp;
+ sp = (unsigned long *)task->thread.sp;
else
sp = (unsigned long *)&sp;
}
stack = sp;
- for(i = 0; i < kstack_depth_to_print; i++) {
+ for (i = 0; i < kstack_depth_to_print; i++) {
if (kstack_end(stack))
break;
if (i && ((i % 8) == 0))
@@ -293,6 +299,7 @@ static void show_stack_log_lvl(struct ta
printk("%08lx ", *stack++);
}
printk("\n%sCall Trace:\n", log_lvl);
+
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
@@ -307,8 +314,8 @@ void show_stack(struct task_struct *task
*/
void dump_stack(void)
{
- unsigned long stack;
unsigned long bp = 0;
+ unsigned long stack;
#ifdef CONFIG_FRAME_POINTER
if (!bp)
@@ -320,6 +327,7 @@ void dump_stack(void)
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
+
show_trace(current, NULL, &stack, bp);
}
@@ -331,6 +339,7 @@ void show_registers(struct pt_regs *regs
print_modules();
__show_registers(regs, 0);
+
printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
TASK_COMM_LEN, current->comm, task_pid_nr(current),
current_thread_info(), current, task_thread_info(current));
@@ -339,10 +348,10 @@ void show_registers(struct pt_regs *regs
* time of the fault..
*/
if (!user_mode_vm(regs)) {
- u8 *ip;
unsigned int code_prologue = code_bytes * 43 / 64;
unsigned int code_len = code_bytes;
unsigned char c;
+ u8 *ip;
printk("\n" KERN_EMERG "Stack: ");
show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
@@ -369,7 +378,7 @@ void show_registers(struct pt_regs *regs
}
}
printk("\n");
-}
+}
int is_valid_bugaddr(unsigned long ip)
{
@@ -385,10 +394,10 @@ int is_valid_bugaddr(unsigned long ip)
static int die_counter;
-int __kprobes __die(const char * str, struct pt_regs * regs, long err)
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
{
- unsigned long sp;
unsigned short ss;
+ unsigned long sp;
printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
#ifdef CONFIG_PREEMPT
@@ -403,8 +412,8 @@ int __kprobes __die(const char * str, st
printk("\n");
if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) !=
- NOTIFY_STOP) {
+ current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
+
show_registers(regs);
/* Executive summary in case the oops scrolled away */
sp = (unsigned long) (&regs->sp);
@@ -416,17 +425,18 @@ int __kprobes __die(const char * str, st
printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
print_symbol("%s", regs->ip);
printk(" SS:ESP %04x:%08lx\n", ss, sp);
+
return 0;
- } else {
- return 1;
}
+
+ return 1;
}
/*
- * This is gone through when something in the kernel has done something bad and
- * is about to be terminated.
+ * This is gone through when something in the kernel has done something bad
+ * and is about to be terminated:
*/
-void die(const char * str, struct pt_regs * regs, long err)
+void die(const char *str, struct pt_regs *regs, long err)
{
static struct {
raw_spinlock_t lock;
@@ -448,8 +458,9 @@ void die(const char * str, struct pt_reg
die.lock_owner = smp_processor_id();
die.lock_owner_depth = 0;
bust_spinlocks(1);
- } else
+ } else {
raw_local_irq_save(flags);
+ }
if (++die.lock_owner_depth < 3) {
report_bug(regs->ip, regs);
@@ -482,19 +493,20 @@ void die(const char * str, struct pt_reg
do_exit(SIGSEGV);
}
-static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
+static inline void
+die_if_kernel(const char *str, struct pt_regs *regs, long err)
{
if (!user_mode_vm(regs))
die(str, regs, err);
}
-static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
- struct pt_regs * regs, long error_code,
- siginfo_t *info)
+static void __kprobes
+do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs,
+ long error_code, siginfo_t *info)
{
struct task_struct *tsk = current;
- if (regs->flags & VM_MASK) {
+ if (regs->flags & X86_VM_MASK) {
if (vm86)
goto vm86_trap;
goto trap_signal;
@@ -503,109 +515,112 @@ static void __kprobes do_trap(int trapnr
if (!user_mode(regs))
goto kernel_trap;
- trap_signal: {
- /*
- * We want error_code and trap_no set for userspace faults and
- * kernelspace faults which result in die(), but not
- * kernelspace faults which are fixed up. die() gives the
- * process no chance to handle the signal and notice the
- * kernel fault information, so that won't result in polluting
- * the information about previously queued, but not yet
- * delivered, faults. See also do_general_protection below.
- */
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
+trap_signal:
+ /*
+ * We want error_code and trap_no set for userspace faults and
+ * kernelspace faults which result in die(), but not
+ * kernelspace faults which are fixed up. die() gives the
+ * process no chance to handle the signal and notice the
+ * kernel fault information, so that won't result in polluting
+ * the information about previously queued, but not yet
+ * delivered, faults. See also do_general_protection below.
+ */
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
- if (info)
- force_sig_info(signr, info, tsk);
- else
- force_sig(signr, tsk);
- return;
- }
+ if (info)
+ force_sig_info(signr, info, tsk);
+ else
+ force_sig(signr, tsk);
+ return;
- kernel_trap: {
- if (!fixup_exception(regs)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
- die(str, regs, error_code);
- }
- return;
+kernel_trap:
+ if (!fixup_exception(regs)) {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+ die(str, regs, error_code);
}
+ return;
- vm86_trap: {
- int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
- if (ret) goto trap_signal;
- return;
- }
+vm86_trap:
+ if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
+ error_code, trapnr))
+ goto trap_signal;
+ return;
}
-#define DO_ERROR(trapnr, signr, str, name) \
-void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
-}
-
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
-void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- siginfo_t info; \
- if (irq) \
- local_irq_enable(); \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
-}
-
-#define DO_VM86_ERROR(trapnr, signr, str, name) \
-void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
-}
-
-#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- siginfo_t info; \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- trace_hardirqs_fixup(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
+#define DO_ERROR(trapnr, signr, str, name) \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ trace_hardirqs_fixup(); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
+}
+
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ siginfo_t info; \
+ if (irq) \
+ local_irq_enable(); \
+ info.si_signo = signr; \
+ info.si_errno = 0; \
+ info.si_code = sicode; \
+ info.si_addr = (void __user *)siaddr; \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
+}
+
+#define DO_VM86_ERROR(trapnr, signr, str, name) \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
+}
+
+#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ siginfo_t info; \
+ info.si_signo = signr; \
+ info.si_errno = 0; \
+ info.si_code = sicode; \
+ info.si_addr = (void __user *)siaddr; \
+ trace_hardirqs_fixup(); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
}
-DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
#ifndef CONFIG_KPROBES
-DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
+DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
#endif
-DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
-DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
-DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
+DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
-DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
+DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
void __kprobes do_general_protection(struct pt_regs * regs,
long error_code)
{
- if (regs->flags & VM_MASK)
+ struct thread_struct *thread;
+
+ thread = &current->thread;
+
+ if (regs->flags & X86_VM_MASK)
goto gp_in_vm86;
if (!user_mode(regs))
@@ -613,6 +628,7 @@ void __kprobes do_general_protection(str
current->thread.error_code = error_code;
current->thread.trap_no = 13;
+
if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
printk_ratelimit()) {
printk(KERN_INFO
@@ -642,22 +658,25 @@ gp_in_kernel:
}
}
-static __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs * regs)
+static notrace __kprobes void
+mem_parity_error(unsigned char reason, struct pt_regs *regs)
{
- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
- "CPU %d.\n", reason, smp_processor_id());
- printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
+ printk(KERN_EMERG
+ "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
+ printk(KERN_EMERG
+ "You have some hardware problem, likely on the PCI bus.\n");
#if defined(CONFIG_EDAC)
- if(edac_handler_set()) {
+ if (edac_handler_set()) {
edac_atomic_assert_error();
return;
}
#endif
if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ panic("NMI: Not continuing");
printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
@@ -665,8 +684,8 @@ mem_parity_error(unsigned char reason, s
clear_mem_error(reason);
}
-static __kprobes void
-io_check_error(unsigned char reason, struct pt_regs * regs)
+static notrace __kprobes void
+io_check_error(unsigned char reason, struct pt_regs *regs)
{
printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
show_registers(regs);
@@ -675,38 +694,43 @@ io_check_error(unsigned char reason, str
clear_io_check_error(reason);
}
-static __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+static notrace __kprobes void
+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ return;
#ifdef CONFIG_MCA
- /* Might actually be able to figure out what the guilty party
- * is. */
- if( MCA_bus ) {
+ /*
+ * Might actually be able to figure out what the guilty party
+ * is:
+ */
+ if (MCA_bus) {
mca_handle_nmi();
return;
}
#endif
- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
- "CPU %d.\n", reason, smp_processor_id());
+ printk(KERN_EMERG
+ "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ panic("NMI: Not continuing");
printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
}
static DEFINE_SPINLOCK(nmi_print_lock);
-void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
{
- if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
- NOTIFY_STOP)
+ if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
return;
spin_lock(&nmi_print_lock);
/*
* We are in trouble anyway, lets at least try
- * to get a message out.
+ * to get a message out:
*/
bust_spinlocks(1);
printk(KERN_EMERG "%s", msg);
@@ -717,9 +741,10 @@ void __kprobes die_nmi(struct pt_regs *r
spin_unlock(&nmi_print_lock);
bust_spinlocks(0);
- /* If we are in kernel we are probably nested up pretty bad
- * and might aswell get out now while we still can.
- */
+ /*
+ * If we are in kernel we are probably nested up pretty bad
+ * and might aswell get out now while we still can:
+ */
if (!user_mode_vm(regs)) {
current->thread.trap_no = 2;
crash_kexec(regs);
@@ -728,14 +753,14 @@ void __kprobes die_nmi(struct pt_regs *r
do_exit(SIGSEGV);
}
-static __kprobes void default_do_nmi(struct pt_regs * regs)
+static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
- /* Only the BSP gets external NMIs from the system. */
+ /* Only the BSP gets external NMIs from the system: */
if (!smp_processor_id())
reason = get_nmi_reason();
-
+
if (!(reason & 0xc0)) {
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
== NOTIFY_STOP)
@@ -748,8 +773,10 @@ static __kprobes void default_do_nmi(str
if (nmi_watchdog_tick(regs, reason))
return;
if (!do_nmi_callback(regs, smp_processor_id()))
-#endif
unknown_nmi_error(reason, regs);
+#else
+ unknown_nmi_error(reason, regs);
+#endif
return;
}
@@ -761,14 +788,14 @@ static __kprobes void default_do_nmi(str
io_check_error(reason, regs);
/*
* Reassert NMI in case it became active meanwhile
- * as it's edge-triggered.
+ * as it's edge-triggered:
*/
reassert_nmi();
}
static int ignore_nmis;
-__kprobes void do_nmi(struct pt_regs * regs, long error_code)
+notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
{
int cpu;
@@ -804,9 +831,12 @@ void __kprobes do_int3(struct pt_regs *r
if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
== NOTIFY_STOP)
return;
- /* This is an interrupt gate, because kprobes wants interrupts
- disabled. Normal trap handlers don't. */
+ /*
+ * This is an interrupt gate, because kprobes wants interrupts
+ * disabled. Normal trap handlers don't.
+ */
restore_interrupts(regs);
+
do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
}
#endif
@@ -821,7 +851,7 @@ void __kprobes do_int3(struct pt_regs *r
* from user space. Such code must not hold kernel locks (since it
* can equally take a page fault), therefore it is safe to call
* force_sig_info even though that claims and releases locks.
- *
+ *
* Code in ./signal.c ensures that the debug control register
* is restored before we deliver any signal, and therefore that
* user code runs with the correct debug control register even though
@@ -833,10 +863,10 @@ void __kprobes do_int3(struct pt_regs *r
* find every occurrence of the TF bit that could be saved away even
* by user code)
*/
-void __kprobes do_debug(struct pt_regs * regs, long error_code)
+void __kprobes do_debug(struct pt_regs *regs, long error_code)
{
- unsigned int condition;
struct task_struct *tsk = current;
+ unsigned int condition;
trace_hardirqs_fixup();
@@ -861,7 +891,7 @@ void __kprobes do_debug(struct pt_regs *
goto clear_dr7;
}
- if (regs->flags & VM_MASK)
+ if (regs->flags & X86_VM_MASK)
goto debug_vm86;
/* Save debug status register where ptrace can see it */
@@ -884,7 +914,8 @@ void __kprobes do_debug(struct pt_regs *
/* Ok, finally something we can handle */
send_sigtrap(tsk, regs, error_code);
- /* Disable additional traps. They'll be re-enabled when
+ /*
+ * Disable additional traps. They'll be re-enabled when
* the signal is delivered.
*/
clear_dr7:
@@ -897,7 +928,7 @@ debug_vm86:
clear_TF_reenable:
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
- regs->flags &= ~TF_MASK;
+ regs->flags &= ~X86_EFLAGS_TF;
return;
}
@@ -908,9 +939,10 @@ clear_TF_reenable:
*/
void math_error(void __user *ip)
{
- struct task_struct * task;
+ struct task_struct *task;
+ unsigned short cwd;
+ unsigned short swd;
siginfo_t info;
- unsigned short cwd, swd;
/*
* Save the info for the exception handler and clear the error.
@@ -936,36 +968,36 @@ void math_error(void __user *ip)
cwd = get_fpu_cwd(task);
swd = get_fpu_swd(task);
switch (swd & ~cwd & 0x3f) {
- case 0x000: /* No unmasked exception */
- return;
- default: /* Multiple exceptions */
- break;
- case 0x001: /* Invalid Op */
- /*
- * swd & 0x240 == 0x040: Stack Underflow
- * swd & 0x240 == 0x240: Stack Overflow
- * User must clear the SF bit (0x40) if set
- */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
+ case 0x000: /* No unmasked exception */
+ return;
+ default: /* Multiple exceptions */
+ break;
+ case 0x001: /* Invalid Op */
+ /*
+ * swd & 0x240 == 0x040: Stack Underflow
+ * swd & 0x240 == 0x240: Stack Overflow
+ * User must clear the SF bit (0x40) if set
+ */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
}
force_sig_info(SIGFPE, &info, task);
}
-void do_coprocessor_error(struct pt_regs * regs, long error_code)
+void do_coprocessor_error(struct pt_regs *regs, long error_code)
{
ignore_fpu_irq = 1;
math_error((void __user *)regs->ip);
@@ -973,9 +1005,9 @@ void do_coprocessor_error(struct pt_regs
static void simd_math_error(void __user *ip)
{
- struct task_struct * task;
- siginfo_t info;
+ struct task_struct *task;
unsigned short mxcsr;
+ siginfo_t info;
/*
* Save the info for the exception handler and clear the error.
@@ -996,84 +1028,82 @@ static void simd_math_error(void __user
*/
mxcsr = get_fpu_mxcsr(task);
switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
- case 0x000:
- default:
- break;
- case 0x001: /* Invalid Op */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
+ case 0x000:
+ default:
+ break;
+ case 0x001: /* Invalid Op */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
}
force_sig_info(SIGFPE, &info, task);
}
-void do_simd_coprocessor_error(struct pt_regs * regs,
- long error_code)
+void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
{
if (cpu_has_xmm) {
/* Handle SIMD FPU exceptions on PIII+ processors. */
ignore_fpu_irq = 1;
simd_math_error((void __user *)regs->ip);
- } else {
- /*
- * Handle strange cache flush from user space exception
- * in all other cases. This is undocumented behaviour.
- */
- if (regs->flags & VM_MASK) {
- handle_vm86_fault((struct kernel_vm86_regs *)regs,
- error_code);
- return;
- }
- current->thread.trap_no = 19;
- current->thread.error_code = error_code;
- die_if_kernel("cache flush denied", regs, error_code);
- force_sig(SIGSEGV, current);
+ return;
+ }
+ /*
+ * Handle strange cache flush from user space exception
+ * in all other cases. This is undocumented behaviour.
+ */
+ if (regs->flags & X86_VM_MASK) {
+ handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
+ return;
}
+ current->thread.trap_no = 19;
+ current->thread.error_code = error_code;
+ die_if_kernel("cache flush denied", regs, error_code);
+ force_sig(SIGSEGV, current);
}
#ifndef CONFIG_XEN
-void do_spurious_interrupt_bug(struct pt_regs * regs,
- long error_code)
+void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
{
#if 0
/* No need to warn about this any longer. */
- printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
+ printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
#endif
}
-unsigned long patch_espfix_desc(unsigned long uesp,
- unsigned long kesp)
+unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
{
struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
unsigned long base = (kesp - uesp) & -THREAD_SIZE;
unsigned long new_kesp = kesp - base;
unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
__u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
+
/* Set up base for espfix segment */
- desc &= 0x00f0ff0000000000ULL;
- desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
+ desc &= 0x00f0ff0000000000ULL;
+ desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
((((__u64)base) << 32) & 0xff00000000000000ULL) |
((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
(lim_pages & 0xffff);
*(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
+
return new_kesp;
}
#endif
/*
- * 'math_state_restore()' saves the current math information in the
+ * 'math_state_restore()' saves the current math information in the
* old math state array, and gets the new ones from the current task
*
* Careful.. There are problems with IBM-designed IRQ13 behaviour.
@@ -1087,9 +1117,22 @@ asmlinkage void math_state_restore(void)
struct thread_info *thread = current_thread_info();
struct task_struct *tsk = thread->task;
+ if (!tsk_used_math(tsk)) {
+ local_irq_enable();
+ /*
+ * does a slab alloc which can sleep
+ */
+ if (init_fpu(tsk)) {
+ /*
+ * ran out of memory!
+ */
+ do_group_exit(SIGKILL);
+ return;
+ }
+ local_irq_disable();
+ }
+
/* NB. 'clts' is done for us by Xen during virtual trap. */
- if (!tsk_used_math(tsk))
- init_fpu(tsk);
restore_fpu(tsk);
thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
tsk->fpu_counter++;
@@ -1100,15 +1143,15 @@ EXPORT_SYMBOL_GPL(math_state_restore);
asmlinkage void math_emulate(long arg)
{
- printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
- printk(KERN_EMERG "killing %s.\n",current->comm);
- force_sig(SIGFPE,current);
+ printk(KERN_EMERG
+ "math-emulation not enabled and no coprocessor found.\n");
+ printk(KERN_EMERG "killing %s.\n", current->comm);
+ force_sig(SIGFPE, current);
schedule();
}
#endif /* CONFIG_MATH_EMULATION */
-
/*
* NB. All these are "trap gates" (i.e. events_mask isn't set) except
* for those that specify <dpl>|4 in the second field.
@@ -1146,25 +1189,21 @@ void __init trap_init(void)
if (ret)
printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
- /*
- * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
- * Generate a build-time error if the alignment is wrong.
- */
- BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
if (cpu_has_fxsr) {
printk(KERN_INFO "Enabling fast FPU save and restore... ");
set_in_cr4(X86_CR4_OSFXSR);
printk("done.\n");
}
if (cpu_has_xmm) {
- printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
- "support... ");
+ printk(KERN_INFO
+ "Enabling unmasked SIMD FPU exception support... ");
set_in_cr4(X86_CR4_OSXMMEXCPT);
printk("done.\n");
}
+ init_thread_xstate();
/*
- * Should be a barrier for any external CPU state.
+ * Should be a barrier for any external CPU state:
*/
cpu_init();
}
@@ -1183,6 +1222,7 @@ void __cpuinit smp_trap_init(trap_info_t
static int __init kstack_setup(char *s)
{
kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+
return 1;
}
__setup("kstack=", kstack_setup);
--- head-2010-05-25.orig/arch/x86/kernel/traps_64-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/traps_64-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -33,6 +33,8 @@
#include <linux/kdebug.h>
#include <linux/utsname.h>
+#include <mach_traps.h>
+
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
@@ -601,10 +603,16 @@ void die(const char * str, struct pt_reg
}
#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL)
-void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
+notrace __kprobes void
+die_nmi(char *str, struct pt_regs *regs, int do_panic)
{
- unsigned long flags = oops_begin();
+ unsigned long flags;
+
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
+ NOTIFY_STOP)
+ return;
+ flags = oops_begin();
/*
* We are in trouble anyway, lets at least try
* to get a message out.
@@ -769,7 +777,7 @@ asmlinkage void __kprobes do_general_pro
die("general protection fault", regs, error_code);
}
-static __kprobes void
+static notrace __kprobes void
mem_parity_error(unsigned char reason, struct pt_regs * regs)
{
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
@@ -792,7 +800,7 @@ mem_parity_error(unsigned char reason, s
clear_mem_error(reason);
}
-static __kprobes void
+static notrace __kprobes void
io_check_error(unsigned char reason, struct pt_regs * regs)
{
printk("NMI: IOCK error (debug interrupt?)\n");
@@ -802,9 +810,11 @@ io_check_error(unsigned char reason, str
clear_io_check_error(reason);
}
-static __kprobes void
+static notrace __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
{
+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ return;
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
reason);
printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
@@ -817,7 +827,7 @@ unknown_nmi_error(unsigned char reason,
/* Runs on IST stack. This code must keep interrupts off all the time.
Nested NMIs are prevented by the CPU. */
-asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
+asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int cpu;
@@ -1117,11 +1127,25 @@ asmlinkage void __attribute__((weak)) mc
asmlinkage void math_state_restore(void)
{
struct task_struct *me = current;
+
+ if (!used_math()) {
+ local_irq_enable();
+ /*
+ * does a slab alloc which can sleep
+ */
+ if (init_fpu(me)) {
+ /*
+ * ran out of memory!
+ */
+ do_group_exit(SIGKILL);
+ return;
+ }
+ local_irq_disable();
+ }
+
/* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
- if (!used_math())
- init_fpu(me);
- restore_fpu_checking(&me->thread.i387.fxsave);
+ restore_fpu_checking(&me->thread.xstate->fxsave);
task_thread_info(me)->status |= TS_USEDFPU;
me->fpu_counter++;
}
@@ -1168,6 +1192,10 @@ void __init trap_init(void)
printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
/*
+ * initialize the per thread extended state:
+ */
+ init_thread_xstate();
+ /*
* Should be a barrier for any external CPU state.
*/
cpu_init();
--- head-2010-05-25.orig/arch/x86/kernel/vsyscall_64-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/kernel/vsyscall_64-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -216,7 +216,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s
return 0;
}
-long __vsyscall(3) venosys_1(void)
+static long __vsyscall(3) venosys_1(void)
{
return -ENOSYS;
}
--- head-2010-05-25.orig/arch/x86/mach-xen/setup.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/mach-xen/setup.c 2010-03-24 15:12:36.000000000 +0100
@@ -56,11 +56,7 @@ char * __init machine_specific_memory_se
{
int rc;
struct xen_memory_map memmap;
- /*
- * This is rather large for a stack variable but this early in
- * the boot process we know we have plenty slack space.
- */
- struct e820entry map[E820MAX];
+ static struct e820entry __initdata map[E820MAX];
memmap.nr_entries = E820MAX;
set_xen_guest_handle(memmap.buffer, map);
--- head-2010-05-25.orig/arch/x86/mm/fault-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/fault-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -510,6 +510,11 @@ static int vmalloc_fault(unsigned long a
unsigned long pgd_paddr;
pmd_t *pmd_k;
pte_t *pte_k;
+
+ /* Make sure we are in vmalloc area */
+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ return -1;
+
/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
@@ -670,7 +675,7 @@ void __kprobes do_page_fault(struct pt_r
#ifdef CONFIG_X86_32
/* It's safe to allow irq's after cr2 has been saved and the vmalloc
fault has been handled. */
- if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
+ if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
local_irq_enable();
/*
@@ -1017,9 +1022,5 @@ void vmalloc_sync_all(void)
if (address == start)
start = address + PGDIR_SIZE;
}
- /* Check that there is no need to do the same for the modules area. */
- BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
- (__START_KERNEL & PGDIR_MASK)));
#endif
}
--- head-2010-05-25.orig/arch/x86/mm/highmem_32-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/highmem_32-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -200,6 +200,8 @@ EXPORT_SYMBOL(kmap);
EXPORT_SYMBOL(kunmap);
EXPORT_SYMBOL(kmap_atomic);
EXPORT_SYMBOL(kunmap_atomic);
+#ifdef CONFIG_HIGHPTE
EXPORT_SYMBOL(kmap_atomic_to_page);
+#endif
EXPORT_SYMBOL(clear_highpage);
EXPORT_SYMBOL(copy_highpage);
--- head-2010-05-25.orig/arch/x86/mm/init_32-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/init_32-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -1,5 +1,4 @@
/*
- * linux/arch/i386/mm/init.c
*
* Copyright (C) 1995 Linus Torvalds
*
@@ -22,6 +21,7 @@
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/bootmem.h>
@@ -54,6 +54,8 @@
unsigned int __VMALLOC_RESERVE = 128 << 20;
+unsigned long max_pfn_mapped;
+
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long highstart_pfn, highend_pfn;
@@ -73,7 +75,7 @@ static pmd_t * __init one_md_table_init(
if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
- paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
@@ -107,7 +109,7 @@ static pte_t * __init one_page_table_ini
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
}
- paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
+ paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
make_lowmem_page_readonly(page_table,
XENFEAT_writable_page_tables);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -209,8 +211,13 @@ static void __init kernel_physical_mappi
/*
* Map with big pages if possible, otherwise
* create normal page tables:
+ *
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
*/
- if (cpu_has_pse) {
+ if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
unsigned int addr2;
pgprot_t prot = PAGE_KERNEL_LARGE;
@@ -224,6 +231,7 @@ static void __init kernel_physical_mappi
set_pmd(pmd, pfn_pmd(pfn, prot));
pfn += PTRS_PER_PTE;
+ max_pfn_mapped = pfn;
continue;
}
pte = one_page_table_init(pmd);
@@ -241,6 +249,7 @@ static void __init kernel_physical_mappi
set_pte(pte, pfn_pte(pfn, prot));
}
+ max_pfn_mapped = pfn;
pte_ofs = 0;
}
pmd_idx = 0;
@@ -262,6 +271,25 @@ static inline int page_kills_ppro(unsign
#endif
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (pagenr <= 256)
+ return 1;
+ if (mfn_to_local_pfn(pagenr) >= max_pfn)
+ return 1;
+ return 0;
+}
+
#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
pgprot_t kmap_prot;
@@ -303,47 +331,17 @@ static void __init permanent_kmaps_init(
pkmap_page_table = pte;
}
-static void __meminit free_new_highpage(struct page *page, int pfn)
-{
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
-}
-
void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
{
if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
ClearPageReserved(page);
- free_new_highpage(page, pfn);
+ init_page_count(page);
+ __free_page(page);
+ totalhigh_pages++;
} else
SetPageReserved(page);
}
-static int __meminit
-add_one_highpage_hotplug(struct page *page, unsigned long pfn)
-{
- free_new_highpage(page, pfn);
- totalram_pages++;
-#ifdef CONFIG_FLATMEM
- max_mapnr = max(pfn, max_mapnr);
-#endif
- num_physpages++;
-
- return 0;
-}
-
-/*
- * Not currently handling the NUMA case.
- * Assuming single node and all memory that
- * has been added dynamically that would be
- * onlined here is in HIGHMEM.
- */
-void __meminit online_page(struct page *page)
-{
- ClearPageReserved(page);
- add_one_highpage_hotplug(page, page_to_pfn(page));
-}
-
#ifndef CONFIG_NUMA
static void __init set_highmem_pages_init(int bad_ppro)
{
@@ -466,15 +464,13 @@ void zap_low_mappings(void)
{
int i;
- save_pg_dir();
-
/*
* Zap initial low-memory mappings.
*
* Note that "pgd_clear()" doesn't do it for
* us, because pgd_clear() is a no-op on i386.
*/
- for (i = 0; i < USER_PTRS_PER_PGD; i++) {
+ for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
#else
@@ -579,9 +575,9 @@ void __init paging_init(void)
/*
* Test if the WP bit works in supervisor mode. It isn't supported on 386's
- * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
- * used to involve black magic jumps to work around some nasty CPU bugs,
- * but fortunately the switch to using exceptions got rid of all that.
+ * and also on some strange 486's. All 586+'s are OK. This used to involve
+ * black magic jumps to work around some nasty CPU bugs, but fortunately the
+ * switch to using exceptions got rid of all that.
*/
static void __init test_wp_bit(void)
{
@@ -612,9 +608,7 @@ void __init mem_init(void)
int tmp, bad_ppro;
unsigned long pfn;
-#if defined(CONFIG_SWIOTLB)
- swiotlb_init();
-#endif
+ pci_iommu_alloc();
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
@@ -716,16 +710,8 @@ void __init mem_init(void)
test_wp_bit();
cpa_init();
-
- /*
- * Subtle. SMP is doing it's boot stuff late (because it has to
- * fork idle threads) - but it also needs low mappings for the
- * protected-mode entry to work. We zap these entries only after
- * the WP-bit has been tested.
- */
-#ifndef CONFIG_SMP
+ save_pg_dir();
zap_low_mappings();
-#endif
SetPagePinned(virt_to_page(init_mm.pgd));
}
@@ -775,25 +761,17 @@ void mark_rodata_ro(void)
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
-#ifndef CONFIG_KPROBES
-#ifdef CONFIG_HOTPLUG_CPU
- /* It must still be possible to apply SMP alternatives. */
- if (num_possible_cpus() <= 1)
-#endif
- {
- set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
- printk(KERN_INFO "Write protecting the kernel text: %luk\n",
- size >> 10);
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+ size >> 10);
#ifdef CONFIG_CPA_DEBUG
- printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
- start, start+size);
- set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+ printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
+ start, start+size);
+ set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
- printk(KERN_INFO "Testing CPA: write protecting again\n");
- set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
-#endif
- }
+ printk(KERN_INFO "Testing CPA: write protecting again\n");
+ set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
start += size;
size = (unsigned long)__end_rodata - start;
--- head-2010-05-25.orig/arch/x86/mm/init_64-xen.c 2010-04-29 09:50:58.000000000 +0200
+++ head-2010-05-25/arch/x86/mm/init_64-xen.c 2010-04-29 09:51:25.000000000 +0200
@@ -52,9 +52,6 @@
#include <xen/features.h>
-const struct dma_mapping_ops *dma_ops;
-EXPORT_SYMBOL(dma_ops);
-
#if CONFIG_XEN_COMPAT <= 0x030002
unsigned int __kernel_page_user;
EXPORT_SYMBOL(__kernel_page_user);
@@ -120,6 +117,28 @@ void __meminit early_make_page_readonly(
BUG();
}
+#ifndef CONFIG_XEN
+int direct_gbpages __meminitdata
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
+
+static int __init parse_direct_gbpages_off(char *arg)
+{
+ direct_gbpages = 0;
+ return 0;
+}
+early_param("nogbpages", parse_direct_gbpages_off);
+
+static int __init parse_direct_gbpages_on(char *arg)
+{
+ direct_gbpages = 1;
+ return 0;
+}
+early_param("gbpages", parse_direct_gbpages_on);
+#endif
+
/*
* NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
* physical space so we can cache the place of the first one and move
@@ -135,9 +154,6 @@ void show_mem(void)
printk(KERN_INFO "Mem-info:\n");
show_free_areas();
- printk(KERN_INFO "Free swap: %6ldkB\n",
- nr_swap_pages << (PAGE_SHIFT-10));
-
for_each_online_pgdat(pgdat) {
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
/*
@@ -328,7 +344,7 @@ void __init cleanup_highmap(void)
pmd_t *last_pmd = pmd + PTRS_PER_PMD;
for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
- if (!pmd_present(*pmd))
+ if (pmd_none(*pmd))
continue;
if (vaddr < (unsigned long) _text || vaddr > end)
set_pmd(pmd, __pmd(0));
@@ -337,8 +353,7 @@ void __init cleanup_highmap(void)
#endif
/* NOTE: this is meant to be run only at boot */
-void __init
-__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
unsigned long address = __fix_to_virt(idx);
@@ -463,7 +478,7 @@ __meminit void early_iounmap(void *addr,
}
#endif
-static void __meminit
+static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
int i = pmd_index(address);
@@ -503,21 +518,26 @@ phys_pmd_init(pmd_t *pmd_page, unsigned
set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
}
}
+ return address;
}
-static void __meminit
+static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
pmd_t *pmd = pmd_offset(pud, 0);
+ unsigned long last_map_addr;
+
spin_lock(&init_mm.page_table_lock);
- phys_pmd_init(pmd, address, end);
+ last_map_addr = phys_pmd_init(pmd, address, end);
spin_unlock(&init_mm.page_table_lock);
__flush_tlb_all();
+ return last_map_addr;
}
-static void __meminit
+static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
+ unsigned long last_map_addr = end;
int i = pud_index(addr);
for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
@@ -529,7 +549,15 @@ phys_pud_init(pud_t *pud_page, unsigned
break;
if (__pud_val(*pud)) {
- phys_pmd_update(pud, addr, end);
+ if (!pud_large(*pud))
+ last_map_addr = phys_pmd_update(pud, addr, end);
+ continue;
+ }
+
+ if (direct_gbpages) {
+ set_pte((pte_t *)pud,
+ pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
continue;
}
@@ -537,12 +565,14 @@ phys_pud_init(pud_t *pud_page, unsigned
spin_lock(&init_mm.page_table_lock);
*pud = __pud(pmd_phys | _KERNPG_TABLE);
- phys_pmd_init(pmd, addr, end);
+ last_map_addr = phys_pmd_init(pmd, addr, end);
spin_unlock(&init_mm.page_table_lock);
early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
}
__flush_tlb_all();
+
+ return last_map_addr >> PAGE_SHIFT;
}
void __init xen_init_pt(void)
@@ -785,16 +815,138 @@ static void __init xen_finish_init_mappi
table_end = start_pfn;
}
+static void __init init_gbpages(void)
+{
+#ifndef CONFIG_XEN
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+#endif
+}
+
+#ifdef CONFIG_MEMTEST_BOOTPARAM
+
+static void __init memtest(unsigned long start_phys, unsigned long size,
+ unsigned pattern)
+{
+ unsigned long i;
+ unsigned long *start;
+ unsigned long start_bad;
+ unsigned long last_bad;
+ unsigned long val;
+ unsigned long start_phys_aligned;
+ unsigned long count;
+ unsigned long incr;
+
+ switch (pattern) {
+ case 0:
+ val = 0UL;
+ break;
+ case 1:
+ val = -1UL;
+ break;
+ case 2:
+ val = 0x5555555555555555UL;
+ break;
+ case 3:
+ val = 0xaaaaaaaaaaaaaaaaUL;
+ break;
+ default:
+ return;
+ }
+
+ incr = sizeof(unsigned long);
+ start_phys_aligned = ALIGN(start_phys, incr);
+ count = (size - (start_phys_aligned - start_phys))/incr;
+ start = __va(start_phys_aligned);
+ start_bad = 0;
+ last_bad = 0;
+
+ for (i = 0; i < count; i++)
+ start[i] = val;
+ for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
+ if (*start != val) {
+ if (start_phys_aligned == last_bad + incr) {
+ last_bad += incr;
+ } else {
+ if (start_bad) {
+ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
+ val, start_bad, last_bad + incr);
+ reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+ }
+ start_bad = last_bad = start_phys_aligned;
+ }
+ }
+ }
+ if (start_bad) {
+ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
+ val, start_bad, last_bad + incr);
+ reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+ }
+
+}
+
+static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
+
+static int __init parse_memtest(char *arg)
+{
+ if (arg)
+ memtest_pattern = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+
+early_param("memtest", parse_memtest);
+
+static void __init early_memtest(unsigned long start, unsigned long end)
+{
+ u64 t_start, t_size;
+ unsigned pattern;
+
+ if (!memtest_pattern)
+ return;
+
+ printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
+ for (pattern = 0; pattern < memtest_pattern; pattern++) {
+ t_start = start;
+ t_size = 0;
+ while (t_start < end) {
+ t_start = find_e820_area_size(t_start, &t_size, 1);
+
+ /* done ? */
+ if (t_start >= end)
+ break;
+ if (t_start + t_size > end)
+ t_size = end - t_start;
+
+ printk(KERN_CONT "\n %016llx - %016llx pattern %d",
+ (unsigned long long)t_start,
+ (unsigned long long)t_start + t_size, pattern);
+
+ memtest(t_start, t_size, pattern);
+
+ t_start += t_size;
+ }
+ }
+ printk(KERN_CONT "\n");
+}
+#else
+static void __init early_memtest(unsigned long start, unsigned long end)
+{
+}
+#endif
+
/*
* Setup the direct mapping of the physical memory at PAGE_OFFSET.
* This runs before bootmem is initialized and gets pages directly from
* the physical memory. To access them they are temporarily mapped.
*/
-void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
+unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
- unsigned long next;
+ unsigned long next, last_map_addr = end;
+ unsigned long start_phys = start, end_phys = end;
- pr_debug("init_memory_mapping\n");
+ printk(KERN_INFO "init_memory_mapping\n");
/*
* Find space for the kernel direct mapping tables.
@@ -803,8 +955,10 @@ void __init_refok init_memory_mapping(un
* memory mapped. Unfortunately this is done currently before the
* nodes are discovered.
*/
- if (!after_bootmem)
+ if (!after_bootmem) {
+ init_gbpages();
find_early_table_space(end);
+ }
start = (unsigned long)__va(start);
end = (unsigned long)__va(end);
@@ -821,7 +975,7 @@ void __init_refok init_memory_mapping(un
next = start + PGDIR_SIZE;
if (next > end)
next = end;
- phys_pud_init(pud, __pa(start), __pa(next));
+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
if (!after_bootmem) {
early_make_page_readonly(pud, XENFEAT_writable_page_tables);
set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
@@ -838,6 +992,11 @@ void __init_refok init_memory_mapping(un
if (!after_bootmem)
reserve_early(table_start << PAGE_SHIFT,
table_end << PAGE_SHIFT, "PGTABLE");
+
+ if (!after_bootmem)
+ early_memtest(start_phys, end_phys);
+
+ return last_map_addr;
}
#ifndef CONFIG_NUMA
@@ -861,15 +1020,6 @@ void __init paging_init(void)
/*
* Memory hotplug specific functions
*/
-void online_page(struct page *page)
-{
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalram_pages++;
- num_physpages++;
-}
-
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory is added always to NORMAL zone. This means you will never get
@@ -879,11 +1029,13 @@ int arch_add_memory(int nid, u64 start,
{
struct pglist_data *pgdat = NODE_DATA(nid);
struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
- unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- init_memory_mapping(start, start + size-1);
+ last_mapped_pfn = init_memory_mapping(start, start + size-1);
+ if (last_mapped_pfn > max_pfn_mapped)
+ max_pfn_mapped = last_mapped_pfn;
ret = __add_pages(zone, start_pfn, nr_pages);
WARN_ON(1);
@@ -902,6 +1054,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to
#endif /* CONFIG_MEMORY_HOTPLUG */
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (pagenr <= 256)
+ return 1;
+ if (mfn_to_local_pfn(pagenr) >= max_pfn)
+ return 1;
+ return 0;
+}
+
+
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
kcore_modules, kcore_vsyscall;
@@ -1009,24 +1181,7 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
void mark_rodata_ro(void)
{
- unsigned long start = (unsigned long)_stext, end;
-
-#ifdef CONFIG_HOTPLUG_CPU
- /* It must still be possible to apply SMP alternatives. */
- if (num_possible_cpus() > 1)
- start = (unsigned long)_etext;
-#endif
-
-#ifdef CONFIG_KPROBES
- start = (unsigned long)__start_rodata;
-#endif
-
- end = (unsigned long)__end_rodata;
- start = (start + PAGE_SIZE - 1) & PAGE_MASK;
- end &= PAGE_MASK;
- if (end <= start)
- return;
-
+ unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -1049,6 +1204,7 @@ void mark_rodata_ro(void)
set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}
+
#endif
#ifdef CONFIG_BLK_DEV_INITRD
@@ -1061,7 +1217,7 @@ void free_initrd_mem(unsigned long start
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
- int nid = phys_to_nid(phys);
+ int nid, next_nid;
#endif
unsigned long pfn = phys >> PAGE_SHIFT;
@@ -1070,7 +1226,7 @@ void __init reserve_bootmem_generic(unsi
* This can happen with kdump kernels when accessing
* firmware tables:
*/
- if (pfn < end_pfn_map)
+ if (pfn < max_pfn_mapped)
return;
printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
@@ -1080,10 +1236,16 @@ void __init reserve_bootmem_generic(unsi
/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ nid = phys_to_nid(phys);
+ next_nid = phys_to_nid(phys + len - 1);
+ if (nid == next_nid)
+ reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ else
+ reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#else
reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif
+
#ifndef CONFIG_XEN
if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
dma_reserve += len / PAGE_SIZE;
@@ -1192,6 +1354,10 @@ const char *arch_vma_name(struct vm_area
/*
* Initialise the sparsemem vmemmap using huge-pages at the PMD level.
*/
+static long __meminitdata addr_start, addr_end;
+static void __meminitdata *p_start, *p_end;
+static int __meminitdata node_start;
+
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
@@ -1226,12 +1392,32 @@ vmemmap_populate(struct page *start_page
PAGE_KERNEL_LARGE);
set_pmd(pmd, __pmd_ma(__pte_val(entry)));
- printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
- addr, addr + PMD_SIZE - 1, p, node);
+ /* check to see if we have contiguous blocks */
+ if (p_end != p || node_start != node) {
+ if (p_start)
+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ addr_start = addr;
+ node_start = node;
+ p_start = p;
+ }
+ addr_end = addr + PMD_SIZE;
+ p_end = p + PMD_SIZE;
} else {
vmemmap_verify((pte_t *)pmd, node, addr, next);
}
}
return 0;
}
+
+void __meminit vmemmap_populate_print_last(void)
+{
+ if (p_start) {
+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ p_start = NULL;
+ p_end = NULL;
+ node_start = 0;
+ }
+}
#endif
--- head-2010-05-25.orig/arch/x86/mm/ioremap-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/ioremap-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -20,14 +20,11 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
+#include <asm/pat.h>
-enum ioremap_mode {
- IOR_MODE_UNCACHED,
- IOR_MODE_CACHED,
-};
-
-#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+#ifdef CONFIG_X86_64
+#ifndef CONFIG_XEN
unsigned long __phys_addr(unsigned long x)
{
if (x >= __START_KERNEL_map)
@@ -35,6 +32,19 @@ unsigned long __phys_addr(unsigned long
return x - PAGE_OFFSET;
}
EXPORT_SYMBOL(__phys_addr);
+#endif
+
+static inline int phys_addr_valid(unsigned long addr)
+{
+ return addr < (1UL << boot_cpu_data.x86_phys_bits);
+}
+
+#else
+
+static inline int phys_addr_valid(unsigned long addr)
+{
+ return 1;
+}
#endif
@@ -92,7 +102,8 @@ static int __direct_remap_pfn_range(stru
* Fill in the machine address: PTE ptr is done later by
* apply_to_page_range().
*/
- v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO;
+ pgprot_val(prot) |= _PAGE_IO;
+ v->val = __pte_val(pte_mkspecial(pfn_pte_ma(mfn, prot)));
mfn++;
address += PAGE_SIZE;
@@ -189,10 +200,9 @@ int touch_pte_range(struct mm_struct *mm
EXPORT_SYMBOL(touch_pte_range);
-#ifdef CONFIG_X86_32
int page_is_ram(unsigned long pagenr)
{
- unsigned long addr, end;
+ resource_size_t addr, end;
int i;
#ifndef CONFIG_XEN
@@ -228,31 +238,51 @@ int page_is_ram(unsigned long pagenr)
}
return 0;
}
-#endif
/*
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
*/
static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
- enum ioremap_mode mode)
+ unsigned long prot_val)
{
unsigned long nrpages = size >> PAGE_SHIFT;
int err;
- switch (mode) {
- case IOR_MODE_UNCACHED:
+ switch (prot_val) {
+ case _PAGE_CACHE_UC:
default:
- err = set_memory_uc(vaddr, nrpages);
+ err = _set_memory_uc(vaddr, nrpages);
+ break;
+ case _PAGE_CACHE_WC:
+ err = _set_memory_wc(vaddr, nrpages);
break;
- case IOR_MODE_CACHED:
- err = set_memory_wb(vaddr, nrpages);
+ case _PAGE_CACHE_WB:
+ err = _set_memory_wb(vaddr, nrpages);
break;
}
return err;
}
+int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
+ unsigned long prot_val)
+{
+ unsigned long sz;
+ int rc;
+
+ for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
+ unsigned long pfn = mfn_to_local_pfn(mfn);
+
+ if (pfn >= max_pfn_mapped)
+ continue;
+ rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
+ PAGE_SIZE, prot_val);
+ }
+
+ return rc;
+}
+
/*
* Remap an arbitrary physical address space into the kernel virtual
* address space. Needed when the kernel wants to access high addresses
@@ -262,12 +292,15 @@ static int ioremap_change_attr(unsigned
* have to convert them into an offset in a page-aligned mapping, but the
* caller shouldn't need to know that small detail.
*/
-static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
- enum ioremap_mode mode)
+static void __iomem *__ioremap_caller(resource_size_t phys_addr,
+ unsigned long size, unsigned long prot_val, void *caller)
{
- unsigned long mfn, offset, last_addr, vaddr;
+ unsigned long mfn, offset, vaddr;
+ resource_size_t last_addr;
struct vm_struct *area;
+ unsigned long new_prot_val;
pgprot_t prot;
+ int retval;
domid_t domid = DOMID_IO;
/* Don't allow wraparound or zero size */
@@ -275,6 +308,13 @@ static void __iomem *__ioremap(resource_
if (!size || last_addr < phys_addr)
return NULL;
+ if (!phys_addr_valid(phys_addr)) {
+ printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
+ (unsigned long long)phys_addr);
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
@@ -287,55 +327,86 @@ static void __iomem *__ioremap(resource_
for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) {
unsigned long pfn = mfn_to_local_pfn(mfn);
- if (pfn >= max_pfn)
- continue;
+ if (pfn_valid(pfn)) {
+ if (!PageReserved(pfn_to_page(pfn)))
+ return NULL;
+ domid = DOMID_SELF;
+ }
+ }
+ WARN_ON_ONCE(domid == DOMID_SELF);
- domid = DOMID_SELF;
+ /*
+ * Mappings have to be page-aligned
+ */
+ offset = phys_addr & ~PAGE_MASK;
+ phys_addr &= PAGE_MASK;
+ size = PAGE_ALIGN(last_addr+1) - phys_addr;
- if (pfn >= max_pfn_mapped) /* bogus */
- continue;
+ retval = reserve_memtype(phys_addr, phys_addr + size,
+ prot_val, &new_prot_val);
+ if (retval) {
+ pr_debug("Warning: reserve_memtype returned %d\n", retval);
+ return NULL;
+ }
- if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
+ if (prot_val != new_prot_val) {
+ /*
+ * Do not fallback to certain memory types with certain
+ * requested type:
+ * - request is uc-, return cannot be write-back
+ * - request is uc-, return cannot be write-combine
+ * - request is write-combine, return cannot be write-back
+ */
+ if ((prot_val == _PAGE_CACHE_UC_MINUS &&
+ (new_prot_val == _PAGE_CACHE_WB ||
+ new_prot_val == _PAGE_CACHE_WC)) ||
+ (prot_val == _PAGE_CACHE_WC &&
+ new_prot_val == _PAGE_CACHE_WB)) {
+ pr_debug(
+ "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
+ (unsigned long long)phys_addr,
+ (unsigned long long)(phys_addr + size),
+ prot_val, new_prot_val);
+ free_memtype(phys_addr, phys_addr + size);
return NULL;
+ }
+ prot_val = new_prot_val;
}
- switch (mode) {
- case IOR_MODE_UNCACHED:
+ switch (prot_val) {
+ case _PAGE_CACHE_UC:
default:
- /*
- * FIXME: we will use UC MINUS for now, as video fb drivers
- * depend on it. Upcoming ioremap_wc() will fix this behavior.
- */
+ prot = PAGE_KERNEL_NOCACHE;
+ break;
+ case _PAGE_CACHE_UC_MINUS:
prot = PAGE_KERNEL_UC_MINUS;
break;
- case IOR_MODE_CACHED:
+ case _PAGE_CACHE_WC:
+ prot = PAGE_KERNEL_WC;
+ break;
+ case _PAGE_CACHE_WB:
prot = PAGE_KERNEL;
break;
}
/*
- * Mappings have to be page-aligned
- */
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(last_addr+1) - phys_addr;
-
- /*
* Ok, go for it..
*/
- area = get_vm_area(size, VM_IOREMAP | (mode << 20));
+ area = get_vm_area_caller(size, VM_IOREMAP, caller);
if (!area)
return NULL;
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
size, prot, domid)) {
+ free_memtype(phys_addr, phys_addr + size);
free_vm_area(area);
return NULL;
}
- if (ioremap_change_attr(vaddr, size, mode) < 0) {
- iounmap((void __iomem *) vaddr);
+ if (ioremap_change_attr(vaddr, size, prot_val) < 0) {
+ free_memtype(phys_addr, phys_addr + size);
+ vunmap(area->addr);
return NULL;
}
@@ -365,16 +436,72 @@ static void __iomem *__ioremap(resource_
*/
void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
{
- return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
+ /*
+ * Ideally, this should be:
+ * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
+ *
+ * Till we fix all X drivers to use ioremap_wc(), we will use
+ * UC MINUS.
+ */
+ unsigned long val = _PAGE_CACHE_UC_MINUS;
+
+ return __ioremap_caller(phys_addr, size, val,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_nocache);
+/**
+ * ioremap_wc - map memory into CPU space write combined
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * This version of ioremap ensures that the memory is marked write combining.
+ * Write combining allows faster writes to some hardware devices.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
+{
+ if (pat_wc_enabled)
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
+ __builtin_return_address(0));
+ else
+ return ioremap_nocache(phys_addr, size);
+}
+EXPORT_SYMBOL(ioremap_wc);
+
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
- return __ioremap(phys_addr, size, IOR_MODE_CACHED);
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_cache);
+#ifndef CONFIG_XEN
+static void __iomem *ioremap_default(resource_size_t phys_addr,
+ unsigned long size)
+{
+ unsigned long flags;
+ void *ret;
+ int err;
+
+ /*
+ * - WB for WB-able memory and no other conflicting mappings
+ * - UC_MINUS for non-WB-able memory with no other conflicting mappings
+ * - Inherit from confliting mappings otherwise
+ */
+ err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
+ if (err < 0)
+ return NULL;
+
+ ret = (void *) __ioremap_caller(phys_addr, size, flags,
+ __builtin_return_address(0));
+
+ free_memtype(phys_addr, phys_addr + size);
+ return (void __iomem *)ret;
+}
+#endif
+
/**
* iounmap - Free a IO remapping
* @addr: virtual address from ioremap_*
@@ -417,15 +544,7 @@ void iounmap(volatile void __iomem *addr
return;
}
- if ((p->flags >> 20) != IOR_MODE_CACHED) {
- unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT;
- unsigned long mfn = p->phys_addr;
- unsigned long va = (unsigned long)addr;
-
- for (; n > 0; n--, mfn++, va += PAGE_SIZE)
- if (mfn_to_local_pfn(mfn) < max_pfn)
- set_memory_wb(va, 1);
- }
+ free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
/* Finally remove it */
o = remove_vm_area((void *)addr);
@@ -434,6 +553,37 @@ void iounmap(volatile void __iomem *addr
}
EXPORT_SYMBOL(iounmap);
+#ifndef CONFIG_XEN
+/*
+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
+ * access
+ */
+void *xlate_dev_mem_ptr(unsigned long phys)
+{
+ void *addr;
+ unsigned long start = phys & PAGE_MASK;
+
+ /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
+ if (page_is_ram(start >> PAGE_SHIFT))
+ return __va(phys);
+
+ addr = (void *)ioremap_default(start, PAGE_SIZE);
+ if (addr)
+ addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
+
+ return addr;
+}
+
+void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
+{
+ if (page_is_ram(phys >> PAGE_SHIFT))
+ return;
+
+ iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+ return;
+}
+#endif
+
int __initdata early_ioremap_debug;
static int __init early_ioremap_debug_setup(char *str)
@@ -445,8 +595,8 @@ static int __init early_ioremap_debug_se
early_param("early_ioremap_debug", early_ioremap_debug_setup);
static __initdata int after_paging_init;
-static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
- __attribute__((aligned(PAGE_SIZE)));
+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
+ __section(.bss.page_aligned);
#ifdef CONFIG_X86_32
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
@@ -461,8 +611,8 @@ static inline pmd_t * __init early_iorem
}
#else
#define early_ioremap_pmd early_get_pmd
+#undef make_lowmem_page_readonly
#define make_lowmem_page_readonly early_make_page_readonly
-#define make_lowmem_page_writable make_page_writable
#endif
static inline pte_t * __init early_ioremap_pte(unsigned long addr)
@@ -512,7 +662,7 @@ void __init early_ioremap_clear(void)
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
pmd_clear(pmd);
make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
- /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */
+ /* paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); */
__flush_tlb_all();
}
@@ -654,10 +804,11 @@ void __init early_iounmap(void *addr, un
unsigned long offset;
unsigned int nrpages;
enum fixed_addresses idx;
- unsigned int nesting;
+ int nesting;
nesting = --early_ioremap_nested;
- WARN_ON(nesting < 0);
+ if (WARN_ON(nesting < 0))
+ return;
if (early_ioremap_debug) {
printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
--- head-2010-05-25.orig/arch/x86/mm/pageattr-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/pageattr-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -9,6 +9,8 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -17,371 +19,7 @@
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
-#include <asm/mmu_context.h>
-
-#ifndef CONFIG_X86_64
-#define TASK_SIZE64 TASK_SIZE
-#endif
-
-static void _pin_lock(struct mm_struct *mm, int lock) {
- if (lock)
- spin_lock(&mm->page_table_lock);
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
- /* While mm->page_table_lock protects us against insertions and
- * removals of higher level page table pages, it doesn't protect
- * against updates of pte-s. Such updates, however, require the
- * pte pages to be in consistent state (unpinned+writable or
- * pinned+readonly). The pinning and attribute changes, however
- * cannot be done atomically, which is why such updates must be
- * prevented from happening concurrently.
- * Note that no pte lock can ever elsewhere be acquired nesting
- * with an already acquired one in the same mm, or with the mm's
- * page_table_lock already acquired, as that would break in the
- * non-split case (where all these are actually resolving to the
- * one page_table_lock). Thus acquiring all of them here is not
- * going to result in dead locks, and the order of acquires
- * doesn't matter.
- */
- {
- pgd_t *pgd = mm->pgd;
- unsigned g;
-
- for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
- pud_t *pud;
- unsigned u;
-
- if (pgd_none(*pgd))
- continue;
- pud = pud_offset(pgd, 0);
- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
- pmd_t *pmd;
- unsigned m;
-
- if (pud_none(*pud))
- continue;
- pmd = pmd_offset(pud, 0);
- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
- spinlock_t *ptl;
-
- if (pmd_none(*pmd))
- continue;
- ptl = pte_lockptr(0, pmd);
- if (lock)
- spin_lock(ptl);
- else
- spin_unlock(ptl);
- }
- }
- }
- }
-#endif
- if (!lock)
- spin_unlock(&mm->page_table_lock);
-}
-#define pin_lock(mm) _pin_lock(mm, 1)
-#define pin_unlock(mm) _pin_lock(mm, 0)
-
-#define PIN_BATCH sizeof(void *)
-static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
-
-static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
- unsigned int cpu, unsigned int seq)
-{
- unsigned long pfn = page_to_pfn(page);
-
- if (PageHighMem(page)) {
- if (pgprot_val(flags) & _PAGE_RW)
- ClearPagePinned(page);
- else
- SetPagePinned(page);
- } else {
- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
- (unsigned long)__va(pfn << PAGE_SHIFT),
- pfn_pte(pfn, flags), 0);
- if (unlikely(++seq == PIN_BATCH)) {
- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
- PIN_BATCH, NULL)))
- BUG();
- seq = 0;
- }
- }
-
- return seq;
-}
-
-static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
-{
- pgd_t *pgd = pgd_base;
- pud_t *pud;
- pmd_t *pmd;
- int g,u,m;
- unsigned int cpu, seq;
- multicall_entry_t *mcl;
-
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
- cpu = get_cpu();
-
- /*
- * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
- * may not be the 'current' task's pagetables (e.g., current may be
- * 32-bit, but the pagetables may be for a 64-bit task).
- * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
- * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
- */
- for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
- if (pgd_none(*pgd))
- continue;
- pud = pud_offset(pgd, 0);
- if (PTRS_PER_PUD > 1) /* not folded */
- seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
- for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
- if (pud_none(*pud))
- continue;
- pmd = pmd_offset(pud, 0);
- if (PTRS_PER_PMD > 1) /* not folded */
- seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
- for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
- if (pmd_none(*pmd))
- continue;
- seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
- }
- }
- }
-
- mcl = per_cpu(pb_mcl, cpu);
-#ifdef CONFIG_X86_64
- if (unlikely(seq > PIN_BATCH - 2)) {
- if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
- BUG();
- seq = 0;
- }
- MULTI_update_va_mapping(mcl + seq,
- (unsigned long)__user_pgd(pgd_base),
- pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
- 0);
- MULTI_update_va_mapping(mcl + seq + 1,
- (unsigned long)pgd_base,
- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
- UVMF_TLB_FLUSH);
- if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
- BUG();
-#else
- if (likely(seq != 0)) {
- MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
- (unsigned long)pgd_base,
- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
- UVMF_TLB_FLUSH);
- if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
- seq + 1, NULL)))
- BUG();
- } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
- pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
- UVMF_TLB_FLUSH))
- BUG();
-#endif
-
- put_cpu();
-}
-
-static void __pgd_pin(pgd_t *pgd)
-{
- pgd_walk(pgd, PAGE_KERNEL_RO);
- kmap_flush_unused();
- xen_pgd_pin(__pa(pgd)); /* kernel */
-#ifdef CONFIG_X86_64
- xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
-#endif
- SetPagePinned(virt_to_page(pgd));
-}
-
-static void __pgd_unpin(pgd_t *pgd)
-{
- xen_pgd_unpin(__pa(pgd));
-#ifdef CONFIG_X86_64
- xen_pgd_unpin(__pa(__user_pgd(pgd)));
-#endif
- pgd_walk(pgd, PAGE_KERNEL);
- ClearPagePinned(virt_to_page(pgd));
-}
-
-void pgd_test_and_unpin(pgd_t *pgd)
-{
- if (PagePinned(virt_to_page(pgd)))
- __pgd_unpin(pgd);
-}
-
-void mm_pin(struct mm_struct *mm)
-{
- if (xen_feature(XENFEAT_writable_page_tables))
- return;
-
- pin_lock(mm);
- __pgd_pin(mm->pgd);
- pin_unlock(mm);
-}
-
-void mm_unpin(struct mm_struct *mm)
-{
- if (xen_feature(XENFEAT_writable_page_tables))
- return;
-
- pin_lock(mm);
- __pgd_unpin(mm->pgd);
- pin_unlock(mm);
-}
-
-void mm_pin_all(void)
-{
- struct page *page;
- unsigned long flags;
-
- if (xen_feature(XENFEAT_writable_page_tables))
- return;
-
- /*
- * Allow uninterrupted access to the pgd_list. Also protects
- * __pgd_pin() by disabling preemption.
- * All other CPUs must be at a safe point (e.g., in stop_machine
- * or offlined entirely).
- */
- spin_lock_irqsave(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
- if (!PagePinned(page))
- __pgd_pin((pgd_t *)page_address(page));
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
-{
- if (!PagePinned(virt_to_page(mm->pgd)))
- mm_pin(mm);
-}
-
-void arch_exit_mmap(struct mm_struct *mm)
-{
- struct task_struct *tsk = current;
-
- task_lock(tsk);
-
- /*
- * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
- * *much* faster this way, as no tlb flushes means bigger wrpt batches.
- */
- if (tsk->active_mm == mm) {
- tsk->active_mm = &init_mm;
- atomic_inc(&init_mm.mm_count);
-
- switch_mm(mm, &init_mm, tsk);
-
- atomic_dec(&mm->mm_count);
- BUG_ON(atomic_read(&mm->mm_count) == 0);
- }
-
- task_unlock(tsk);
-
- if (PagePinned(virt_to_page(mm->pgd))
- && atomic_read(&mm->mm_count) == 1
- && !mm->context.has_foreign_mappings)
- mm_unpin(mm);
-}
-
-static void _pte_free(struct page *page, unsigned int order)
-{
- BUG_ON(order);
- __pte_free(page);
-}
-
-pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
-{
- struct page *pte;
-
-#ifdef CONFIG_HIGHPTE
- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
-#else
- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-#endif
- if (pte) {
- pgtable_page_ctor(pte);
- SetPageForeign(pte, _pte_free);
- init_page_count(pte);
- }
- return pte;
-}
-
-void __pte_free(pgtable_t pte)
-{
- if (!PageHighMem(pte)) {
- unsigned long va = (unsigned long)page_address(pte);
- unsigned int level;
- pte_t *ptep = lookup_address(va, &level);
-
- BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
- if (!pte_write(*ptep)
- && HYPERVISOR_update_va_mapping(va,
- mk_pte(pte, PAGE_KERNEL),
- 0))
- BUG();
- } else
-#ifdef CONFIG_HIGHPTE
- ClearPagePinned(pte);
-#else
- BUG();
-#endif
-
- ClearPageForeign(pte);
- init_page_count(pte);
- pgtable_page_dtor(pte);
- __free_page(pte);
-}
-
-#if PAGETABLE_LEVELS >= 3
-static void _pmd_free(struct page *page, unsigned int order)
-{
- BUG_ON(order);
- __pmd_free(page);
-}
-
-pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
-{
- struct page *pmd;
-
- pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
- if (!pmd)
- return NULL;
- SetPageForeign(pmd, _pmd_free);
- init_page_count(pmd);
- return page_address(pmd);
-}
-
-void __pmd_free(pgtable_t pmd)
-{
- unsigned long va = (unsigned long)page_address(pmd);
- unsigned int level;
- pte_t *ptep = lookup_address(va, &level);
-
- BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
- if (!pte_write(*ptep)
- && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
- BUG();
-
- ClearPageForeign(pmd);
- init_page_count(pmd);
- __free_page(pmd);
-}
-#endif
-
-/* blktap and gntdev need this, as otherwise they would implicitly (and
- * needlessly, as they never use it) reference init_mm. */
-pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep, int full)
-{
- return ptep_get_and_clear_full(vma ? vma->vm_mm : &init_mm,
- addr, ptep, full);
-}
-EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
+#include <asm/pat.h>
/*
* The current flushing context - we pass it instead of 5 arguments:
@@ -393,6 +31,7 @@ struct cpa_data {
int numpages;
int flushtlb;
unsigned long pfn;
+ unsigned force_split : 1;
};
#ifdef CONFIG_X86_64
@@ -638,6 +277,9 @@ try_preserve_large_page(pte_t *kpte, uns
int i, do_split = 1;
unsigned int level;
+ if (cpa->force_split)
+ return 1;
+
spin_lock_irqsave(&pgd_lock, flags);
/*
* Check for races, another CPU might have split this page
@@ -857,9 +499,7 @@ static int split_large_page(pte_t *kpte,
goto out_unlock;
pbase = (pte_t *)page_address(base);
-#ifdef CONFIG_X86_32
- paravirt_alloc_pt(&init_mm, page_to_pfn(base));
-#endif
+ paravirt_alloc_pte(&init_mm, page_to_pfn(base));
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
#ifdef CONFIG_X86_64
@@ -920,7 +560,7 @@ static int __change_page_attr(struct cpa
repeat:
kpte = lookup_address(address, &level);
if (!kpte)
- return primary ? -EINVAL : 0;
+ return 0;
old_pte = *kpte;
if (!__pte_val(old_pte)) {
@@ -1079,7 +719,8 @@ static inline int cache_attr(pgprot_t at
}
static int change_page_attr_set_clr(unsigned long addr, int numpages,
- pgprot_t mask_set, pgprot_t mask_clr)
+ pgprot_t mask_set, pgprot_t mask_clr,
+ int force_split)
{
struct cpa_data cpa;
int ret, cache, checkalias;
@@ -1090,7 +731,7 @@ static int change_page_attr_set_clr(unsi
*/
mask_set = canon_pgprot(mask_set);
mask_clr = canon_pgprot(mask_clr);
- if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
+ if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
return 0;
/* Ensure we are PAGE_SIZE aligned */
@@ -1107,6 +748,7 @@ static int change_page_attr_set_clr(unsi
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
cpa.flushtlb = 0;
+ cpa.force_split = force_split;
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -1145,26 +787,67 @@ out:
static inline int change_page_attr_set(unsigned long addr, int numpages,
pgprot_t mask)
{
- return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
}
static inline int change_page_attr_clear(unsigned long addr, int numpages,
pgprot_t mask)
{
- return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
}
-int set_memory_uc(unsigned long addr, int numpages)
+int _set_memory_uc(unsigned long addr, int numpages)
{
+ /*
+ * for now UC MINUS. see comments in ioremap_nocache()
+ */
return change_page_attr_set(addr, numpages,
- __pgprot(_PAGE_PCD));
+ __pgprot(_PAGE_CACHE_UC_MINUS));
+}
+
+int set_memory_uc(unsigned long addr, int numpages)
+{
+ /*
+ * for now UC MINUS. see comments in ioremap_nocache()
+ */
+ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+ _PAGE_CACHE_UC_MINUS, NULL))
+ return -EINVAL;
+
+ return _set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_memory_uc);
-int set_memory_wb(unsigned long addr, int numpages)
+int _set_memory_wc(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(addr, numpages,
+ __pgprot(_PAGE_CACHE_WC));
+}
+
+int set_memory_wc(unsigned long addr, int numpages)
+{
+ if (!pat_wc_enabled)
+ return set_memory_uc(addr, numpages);
+
+ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+ _PAGE_CACHE_WC, NULL))
+ return -EINVAL;
+
+ return _set_memory_wc(addr, numpages);
+}
+EXPORT_SYMBOL(set_memory_wc);
+
+int _set_memory_wb(unsigned long addr, int numpages)
{
return change_page_attr_clear(addr, numpages,
- __pgprot(_PAGE_PCD | _PAGE_PWT));
+ __pgprot(_PAGE_CACHE_MASK));
+}
+
+int set_memory_wb(unsigned long addr, int numpages)
+{
+ free_memtype(addr, addr + numpages * PAGE_SIZE);
+
+ return _set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_memory_wb);
@@ -1195,6 +878,12 @@ int set_memory_np(unsigned long addr, in
return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
}
+int set_memory_4k(unsigned long addr, int numpages)
+{
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0),
+ __pgprot(0), 1);
+}
+
int set_pages_uc(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
@@ -1304,6 +993,45 @@ void kernel_map_pages(struct page *page,
cpa_fill_pool(NULL);
}
+#ifdef CONFIG_DEBUG_FS
+static int dpa_show(struct seq_file *m, void *v)
+{
+ seq_puts(m, "DEBUG_PAGEALLOC\n");
+ seq_printf(m, "pool_size : %lu\n", pool_size);
+ seq_printf(m, "pool_pages : %lu\n", pool_pages);
+ seq_printf(m, "pool_low : %lu\n", pool_low);
+ seq_printf(m, "pool_used : %lu\n", pool_used);
+ seq_printf(m, "pool_failed : %lu\n", pool_failed);
+
+ return 0;
+}
+
+static int dpa_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, dpa_show, NULL);
+}
+
+static const struct file_operations dpa_fops = {
+ .open = dpa_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init debug_pagealloc_proc_init(void)
+{
+ struct dentry *de;
+
+ de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
+ &dpa_fops);
+ if (!de)
+ return -ENOMEM;
+
+ return 0;
+}
+__initcall(debug_pagealloc_proc_init);
+#endif
+
#ifdef CONFIG_HIBERNATION
bool kernel_page_present(struct page *page)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/mm/pat-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -0,0 +1,602 @@
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
+ */
+
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/bootmem.h>
+
+#include <asm/msr.h>
+#include <asm/tlbflush.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/pat.h>
+#include <asm/e820.h>
+#include <asm/cacheflush.h>
+#include <asm/fcntl.h>
+#include <asm/mtrr.h>
+#include <asm/io.h>
+
+#ifdef CONFIG_X86_PAT
+int __read_mostly pat_wc_enabled = 1;
+
+void __cpuinit pat_disable(char *reason)
+{
+ pat_wc_enabled = 0;
+ printk(KERN_INFO "%s\n", reason);
+}
+
+static int __init nopat(char *str)
+{
+ pat_disable("PAT support disabled.");
+ return 0;
+}
+early_param("nopat", nopat);
+#endif
+
+static u64 __read_mostly boot_pat_state;
+
+enum {
+ PAT_UC = 0, /* uncached */
+ PAT_WC = 1, /* Write combining */
+ PAT_WT = 4, /* Write Through */
+ PAT_WP = 5, /* Write Protected */
+ PAT_WB = 6, /* Write Back (default) */
+ PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
+};
+
+#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
+
+void pat_init(void)
+{
+ u64 pat;
+
+ if (!pat_wc_enabled)
+ return;
+
+ /* Paranoia check. */
+ if (!cpu_has_pat) {
+ printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
+ /*
+ * Panic if this happens on the secondary CPU, and we
+ * switched to PAT on the boot CPU. We have no way to
+ * undo PAT.
+ */
+ BUG_ON(boot_pat_state);
+ }
+
+#ifndef CONFIG_XEN
+ /* Set PWT to Write-Combining. All other bits stay the same */
+ /*
+ * PTE encoding used in Linux:
+ * PAT
+ * |PCD
+ * ||PWT
+ * |||
+ * 000 WB _PAGE_CACHE_WB
+ * 001 WC _PAGE_CACHE_WC
+ * 010 UC- _PAGE_CACHE_UC_MINUS
+ * 011 UC _PAGE_CACHE_UC
+ * PAT bit unused
+ */
+ pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
+ PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
+
+ /* Boot CPU check */
+ if (!boot_pat_state)
+ rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
+
+ wrmsrl(MSR_IA32_CR_PAT, pat);
+#else
+ /*
+ * PAT settings are part of the hypervisor interface, and their
+ * assignment cannot be changed.
+ */
+ rdmsrl(MSR_IA32_CR_PAT, pat);
+ if (!boot_pat_state)
+ boot_pat_state = pat;
+#endif
+ printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
+ smp_processor_id(), boot_pat_state, pat);
+}
+
+#undef PAT
+
+static char *cattr_name(unsigned long flags)
+{
+ switch (flags & _PAGE_CACHE_MASK) {
+ case _PAGE_CACHE_UC: return "uncached";
+ case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
+ case _PAGE_CACHE_WB: return "write-back";
+ case _PAGE_CACHE_WC: return "write-combining";
+ case _PAGE_CACHE_WP: return "write-protected";
+ case _PAGE_CACHE_WT: return "write-through";
+ default: return "broken";
+ }
+}
+
+/*
+ * The global memtype list keeps track of memory type for specific
+ * physical memory areas. Conflicting memory types in different
+ * mappings can cause CPU cache corruption. To avoid this we keep track.
+ *
+ * The list is sorted based on starting address and can contain multiple
+ * entries for each address (this allows reference counting for overlapping
+ * areas). All the aliases have the same cache attributes of course.
+ * Zero attributes are represented as holes.
+ *
+ * Currently the data structure is a list because the number of mappings
+ * is expected to be relatively small. If this should become a problem
+ * it could be changed to an rbtree or similar.
+ *
+ * memtype_lock protects the whole list.
+ */
+
+struct memtype {
+ u64 start;
+ u64 end;
+ unsigned long type;
+ struct list_head nd;
+};
+
+static LIST_HEAD(memtype_list);
+static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
+
+/*
+ * Does intersection of PAT memory type and MTRR memory type and returns
+ * the resulting memory type as PAT understands it.
+ * (The type values used by PAT and MTRR are not the same.)
+ * The intersection is based on the "Effective Memory Type" tables in the
+ * IA-32 SDM vol 3a.
+ */
+static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
+ unsigned long *ret_prot)
+{
+ unsigned long pat_type;
+ u8 mtrr_type;
+
+ pat_type = prot & _PAGE_CACHE_MASK;
+ prot &= (~_PAGE_CACHE_MASK);
+
+ /*
+ * We return the PAT request directly for types where PAT takes
+ * precedence with respect to MTRR and for UC_MINUS.
+ * Consistency checks with other PAT requests are done later
+ * while going through the memtype list.
+ */
+ if (pat_type == _PAGE_CACHE_WC) {
+ *ret_prot = prot | _PAGE_CACHE_WC;
+ return 0;
+ } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
+ *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
+ return 0;
+ } else if (pat_type == _PAGE_CACHE_UC) {
+ *ret_prot = prot | _PAGE_CACHE_UC;
+ return 0;
+ }
+
+ /*
+ * Look for MTRR hint to get the effective type in case where PAT
+ * request is for WB.
+ */
+ mtrr_type = mtrr_type_lookup(start, end);
+
+ if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
+ *ret_prot = prot | _PAGE_CACHE_UC;
+ } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
+ *ret_prot = prot | _PAGE_CACHE_WC;
+ } else {
+ *ret_prot = prot | _PAGE_CACHE_WB;
+ }
+
+ return 0;
+}
+
+/*
+ * req_type typically has one of the following values:
+ * - _PAGE_CACHE_WB
+ * - _PAGE_CACHE_WC
+ * - _PAGE_CACHE_UC_MINUS
+ * - _PAGE_CACHE_UC
+ *
+ * req_type will have the special case value '-1' when the requester wants to
+ * inherit the memory type from MTRR (if WB) or existing PAT, defaulting to UC_MINUS.
+ *
+ * If ret_type is NULL, the function returns an error if it cannot reserve the
+ * region with req_type. If ret_type is non-NULL, the function returns the
+ * available type in *ret_type on success. On any error it returns a
+ * negative value.
+ */
+int reserve_memtype(u64 start, u64 end, unsigned long req_type,
+ unsigned long *ret_type)
+{
+ struct memtype *new_entry = NULL;
+ struct memtype *parse;
+ unsigned long actual_type;
+ int err = 0;
+
+ /* Only track when pat_wc_enabled */
+ if (!pat_wc_enabled) {
+ /* This is identical to page table setting without PAT */
+ if (ret_type) {
+ if (req_type == -1) {
+ *ret_type = _PAGE_CACHE_WB;
+ } else {
+ *ret_type = req_type;
+ }
+ }
+ return 0;
+ }
+
+ /* Low ISA region is always mapped WB in page table. No need to track */
+ if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
+ if (ret_type)
+ *ret_type = _PAGE_CACHE_WB;
+
+ return 0;
+ }
+
+ if (req_type == -1) {
+ /*
+ * Call mtrr_lookup to get the type hint. This is an
+ * optimization for /dev/mem mmap'ers into WB memory (BIOS
+ * tools and ACPI tools). Use WB request for WB memory and use
+ * UC_MINUS otherwise.
+ */
+ u8 mtrr_type = mtrr_type_lookup(start, end);
+
+ if (mtrr_type == MTRR_TYPE_WRBACK) {
+ req_type = _PAGE_CACHE_WB;
+ actual_type = _PAGE_CACHE_WB;
+ } else {
+ req_type = _PAGE_CACHE_UC_MINUS;
+ actual_type = _PAGE_CACHE_UC_MINUS;
+ }
+ } else {
+ req_type &= _PAGE_CACHE_MASK;
+ err = pat_x_mtrr_type(start, end, req_type, &actual_type);
+ }
+
+ if (err) {
+ if (ret_type)
+ *ret_type = actual_type;
+
+ return -EINVAL;
+ }
+
+ new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!new_entry)
+ return -ENOMEM;
+
+ new_entry->start = start;
+ new_entry->end = end;
+ new_entry->type = actual_type;
+
+ if (ret_type)
+ *ret_type = actual_type;
+
+ spin_lock(&memtype_lock);
+
+ /* Search for existing mapping that overlaps the current range */
+ list_for_each_entry(parse, &memtype_list, nd) {
+ struct memtype *saved_ptr;
+
+ if (parse->start >= end) {
+ pr_debug("New Entry\n");
+ list_add(&new_entry->nd, parse->nd.prev);
+ new_entry = NULL;
+ break;
+ }
+
+ if (start <= parse->start && end >= parse->start) {
+ if (actual_type != parse->type && ret_type) {
+ actual_type = parse->type;
+ *ret_type = actual_type;
+ new_entry->type = actual_type;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+
+ saved_ptr = parse;
+ /*
+ * Check to see whether the request overlaps more
+ * than one entry in the list
+ */
+ list_for_each_entry_continue(parse, &memtype_list, nd) {
+ if (end <= parse->start) {
+ break;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+ }
+
+ if (err) {
+ break;
+ }
+
+ pr_debug("Overlap at 0x%Lx-0x%Lx\n",
+ saved_ptr->start, saved_ptr->end);
+ /* No conflict. Go ahead and add this new entry */
+ list_add(&new_entry->nd, saved_ptr->nd.prev);
+ new_entry = NULL;
+ break;
+ }
+
+ if (start < parse->end) {
+ if (actual_type != parse->type && ret_type) {
+ actual_type = parse->type;
+ *ret_type = actual_type;
+ new_entry->type = actual_type;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+
+ saved_ptr = parse;
+ /*
+ * Check to see whether the request overlaps more
+ * than one entry in the list
+ */
+ list_for_each_entry_continue(parse, &memtype_list, nd) {
+ if (end <= parse->start) {
+ break;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+ }
+
+ if (err) {
+ break;
+ }
+
+ pr_debug("Overlap at 0x%Lx-0x%Lx\n",
+ saved_ptr->start, saved_ptr->end);
+ /* No conflict. Go ahead and add this new entry */
+ list_add(&new_entry->nd, &saved_ptr->nd);
+ new_entry = NULL;
+ break;
+ }
+ }
+
+ if (err) {
+ printk(KERN_INFO
+ "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
+ start, end, cattr_name(new_entry->type),
+ cattr_name(req_type));
+ kfree(new_entry);
+ spin_unlock(&memtype_lock);
+ return err;
+ }
+
+ if (new_entry) {
+ /* No conflict. Not yet added to the list. Add to the tail */
+ list_add_tail(&new_entry->nd, &memtype_list);
+ pr_debug("New Entry\n");
+ }
+
+ if (ret_type) {
+ pr_debug(
+ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
+ start, end, cattr_name(actual_type),
+ cattr_name(req_type), cattr_name(*ret_type));
+ } else {
+ pr_debug(
+ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
+ start, end, cattr_name(actual_type),
+ cattr_name(req_type));
+ }
+
+ spin_unlock(&memtype_lock);
+ return err;
+}
+
+int free_memtype(u64 start, u64 end)
+{
+ struct memtype *ml;
+ int err = -EINVAL;
+
+ /* Only track when pat_wc_enabled */
+ if (!pat_wc_enabled) {
+ return 0;
+ }
+
+ /* Low ISA region is always mapped WB. No need to track */
+ if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
+ return 0;
+ }
+
+ spin_lock(&memtype_lock);
+ list_for_each_entry(ml, &memtype_list, nd) {
+ if (ml->start == start && ml->end == end) {
+ list_del(&ml->nd);
+ kfree(ml);
+ err = 0;
+ break;
+ }
+ }
+ spin_unlock(&memtype_lock);
+
+ if (err) {
+ printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
+ current->comm, current->pid, start, end);
+ }
+
+ pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+ return err;
+}
+
+
+/*
+ * /dev/mem mmap interface. The memtype used for mapping varies:
+ * - Use UC for mappings with O_SYNC flag
+ * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
+ * inherit the memtype from existing mapping.
+ * - Else use UC_MINUS memtype (for backward compatibility with existing
+ * X drivers).
+ */
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
+ unsigned long size, pgprot_t vma_prot)
+{
+ return vma_prot;
+}
+
+#ifdef CONFIG_NONPROMISC_DEVMEM
+/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM */
+static inline int range_is_allowed(unsigned long mfn, unsigned long size)
+{
+ return 1;
+}
+#else
+static inline int range_is_allowed(unsigned long mfn, unsigned long size)
+{
+ u64 from = ((u64)mfn) << PAGE_SHIFT;
+ u64 to = from + size;
+ u64 cursor = from;
+
+ while (cursor < to) {
+ if (!devmem_is_allowed(mfn)) {
+ printk(KERN_INFO
+ "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
+ current->comm, from, to);
+ return 0;
+ }
+ cursor += PAGE_SIZE;
+ mfn++;
+ }
+ return 1;
+}
+#endif /* CONFIG_NONPROMISC_DEVMEM */
+
+int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
+ unsigned long size, pgprot_t *vma_prot)
+{
+ u64 addr = (u64)mfn << PAGE_SHIFT;
+ unsigned long flags = _PAGE_CACHE_UC_MINUS;
+ int retval;
+
+ if (!range_is_allowed(mfn, size))
+ return 0;
+
+ if (file->f_flags & O_SYNC) {
+ flags = _PAGE_CACHE_UC;
+ }
+
+#ifndef CONFIG_X86_32
+#ifndef CONFIG_XEN /* Xen sets correct MTRR type on non-RAM for us. */
+ /*
+ * On the PPro and successors, the MTRRs are used to set
+ * memory types for physical addresses outside main memory,
+ * so blindly setting UC or PWT on those pages is wrong.
+ * For Pentiums and earlier, the surround logic should disable
+ * caching for the high addresses through the KEN pin, but
+ * we maintain the tradition of paranoia in this code.
+ */
+ if (!pat_wc_enabled &&
+ ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
+ test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
+ test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
+ test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
+ (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
+ flags = _PAGE_CACHE_UC;
+ }
+#endif
+#endif
+
+ /*
+ * With O_SYNC, we can only take UC mapping. Fail if we cannot.
+ * Without O_SYNC, we want to get
+ * - WB for WB-able memory and no other conflicting mappings
+ * - UC_MINUS for non-WB-able memory with no other conflicting mappings
+ * - Inherit from conflicting mappings otherwise
+ */
+ if (flags != _PAGE_CACHE_UC_MINUS) {
+ retval = reserve_memtype(addr, addr + size, flags, NULL);
+ } else {
+ retval = reserve_memtype(addr, addr + size, -1, &flags);
+ }
+
+ if (retval < 0)
+ return 0;
+
+ if (ioremap_check_change_attr(mfn, size, flags) < 0) {
+ free_memtype(addr, addr + size);
+ printk(KERN_INFO
+ "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
+ current->comm, current->pid,
+ cattr_name(flags),
+ addr, addr + size);
+ return 0;
+ }
+
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
+ flags);
+ return 1;
+}
+
+void map_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
+{
+ u64 addr = (u64)mfn << PAGE_SHIFT;
+ unsigned long flags;
+ unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
+
+ reserve_memtype(addr, addr + size, want_flags, &flags);
+ if (flags != want_flags) {
+ printk(KERN_INFO
+ "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_flags),
+ addr, (unsigned long long)(addr + size),
+ cattr_name(flags));
+ }
+}
+
+void unmap_devmem(unsigned long mfn, unsigned long size, pgprot_t vma_prot)
+{
+ u64 addr = (u64)mfn << PAGE_SHIFT;
+
+ free_memtype(addr, addr + size);
+}
+
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2010-05-25/arch/x86/mm/pgtable-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -0,0 +1,710 @@
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <xen/features.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/hypervisor.h>
+#include <asm/mmu_context.h>
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+ if (pte)
+ make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
+ return pte;
+}
+
+static void _pte_free(struct page *page, unsigned int order)
+{
+ BUG_ON(order);
+ __pte_free(page);
+}
+
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+ struct page *pte;
+
+#ifdef CONFIG_HIGHPTE
+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+#else
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+#endif
+ if (pte) {
+ pgtable_page_ctor(pte);
+ SetPageForeign(pte, _pte_free);
+ init_page_count(pte);
+ }
+ return pte;
+}
+
+void __pte_free(pgtable_t pte)
+{
+ if (!PageHighMem(pte)) {
+ unsigned long va = (unsigned long)page_address(pte);
+ unsigned int level;
+ pte_t *ptep = lookup_address(va, &level);
+
+ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
+ if (!pte_write(*ptep)
+ && HYPERVISOR_update_va_mapping(va,
+ mk_pte(pte, PAGE_KERNEL),
+ 0))
+ BUG();
+ } else
+#ifdef CONFIG_HIGHPTE
+ ClearPagePinned(pte);
+#else
+ BUG();
+#endif
+
+ ClearPageForeign(pte);
+ init_page_count(pte);
+ pgtable_page_dtor(pte);
+ __free_page(pte);
+}
+
+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+ pgtable_page_dtor(pte);
+ paravirt_release_pte(page_to_pfn(pte));
+ tlb_remove_page(tlb, pte);
+}
+
+#if PAGETABLE_LEVELS > 2
+static void _pmd_free(struct page *page, unsigned int order)
+{
+ BUG_ON(order);
+ __pmd_free(page);
+}
+
+pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+ struct page *pmd;
+
+ pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+ if (!pmd)
+ return NULL;
+ SetPageForeign(pmd, _pmd_free);
+ init_page_count(pmd);
+ return page_address(pmd);
+}
+
+void __pmd_free(pgtable_t pmd)
+{
+ unsigned long va = (unsigned long)page_address(pmd);
+ unsigned int level;
+ pte_t *ptep = lookup_address(va, &level);
+
+ BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep));
+ if (!pte_write(*ptep)
+ && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0))
+ BUG();
+
+ ClearPageForeign(pmd);
+ init_page_count(pmd);
+ __free_page(pmd);
+}
+
+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+ paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(pmd));
+}
+
+#if PAGETABLE_LEVELS > 3
+void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+{
+ paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(pud));
+}
+#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* PAGETABLE_LEVELS > 2 */
+
+#ifndef CONFIG_X86_64
+#define TASK_SIZE64 TASK_SIZE
+#endif
+
+static void _pin_lock(struct mm_struct *mm, int lock) {
+ if (lock)
+ spin_lock(&mm->page_table_lock);
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+ /* While mm->page_table_lock protects us against insertions and
+ * removals of higher level page table pages, it doesn't protect
+ * against updates of pte-s. Such updates, however, require the
+ * pte pages to be in consistent state (unpinned+writable or
+ * pinned+readonly). The pinning and attribute changes, however
+ * cannot be done atomically, which is why such updates must be
+ * prevented from happening concurrently.
+ * Note that no pte lock may ever be acquired elsewhere nested
+ * inside an already acquired one in the same mm, or with the mm's
+ * page_table_lock already acquired, as that would break in the
+ * non-split case (where all these are actually resolving to the
+ * one page_table_lock). Thus acquiring all of them here is not
+ * going to result in deadlocks, and the order of acquires
+ * doesn't matter.
+ */
+ {
+ pgd_t *pgd = mm->pgd;
+ unsigned g;
+
+ for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+ pud_t *pud;
+ unsigned u;
+
+ if (pgd_none(*pgd))
+ continue;
+ pud = pud_offset(pgd, 0);
+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+ pmd_t *pmd;
+ unsigned m;
+
+ if (pud_none(*pud))
+ continue;
+ pmd = pmd_offset(pud, 0);
+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+ spinlock_t *ptl;
+
+ if (pmd_none(*pmd))
+ continue;
+ ptl = pte_lockptr(0, pmd);
+ if (lock)
+ spin_lock(ptl);
+ else
+ spin_unlock(ptl);
+ }
+ }
+ }
+ }
+#endif
+ if (!lock)
+ spin_unlock(&mm->page_table_lock);
+}
+#define pin_lock(mm) _pin_lock(mm, 1)
+#define pin_unlock(mm) _pin_lock(mm, 0)
+
+#define PIN_BATCH sizeof(void *)
+static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
+
+static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
+ unsigned int cpu, unsigned int seq)
+{
+ unsigned long pfn = page_to_pfn(page);
+
+ if (PageHighMem(page)) {
+ if (pgprot_val(flags) & _PAGE_RW)
+ ClearPagePinned(page);
+ else
+ SetPagePinned(page);
+ } else {
+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ pfn_pte(pfn, flags), 0);
+ if (unlikely(++seq == PIN_BATCH)) {
+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
+ PIN_BATCH, NULL)))
+ BUG();
+ seq = 0;
+ }
+ }
+
+ return seq;
+}
+
+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+{
+ pgd_t *pgd = pgd_base;
+ pud_t *pud;
+ pmd_t *pmd;
+ int g,u,m;
+ unsigned int cpu, seq;
+ multicall_entry_t *mcl;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ cpu = get_cpu();
+
+ /*
+ * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
+ * may not be the 'current' task's pagetables (e.g., current may be
+ * 32-bit, but the pagetables may be for a 64-bit task).
+ * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
+ * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
+ */
+ for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+ if (pgd_none(*pgd))
+ continue;
+ pud = pud_offset(pgd, 0);
+ if (PTRS_PER_PUD > 1) /* not folded */
+ seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+ if (pud_none(*pud))
+ continue;
+ pmd = pmd_offset(pud, 0);
+ if (PTRS_PER_PMD > 1) /* not folded */
+ seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+ if (pmd_none(*pmd))
+ continue;
+ seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
+ }
+ }
+ }
+
+ mcl = per_cpu(pb_mcl, cpu);
+#ifdef CONFIG_X86_64
+ if (unlikely(seq > PIN_BATCH - 2)) {
+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
+ BUG();
+ seq = 0;
+ }
+ MULTI_update_va_mapping(mcl + seq,
+ (unsigned long)__user_pgd(pgd_base),
+ pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags),
+ 0);
+ MULTI_update_va_mapping(mcl + seq + 1,
+ (unsigned long)pgd_base,
+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+ UVMF_TLB_FLUSH);
+ if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
+ BUG();
+#else
+ if (likely(seq != 0)) {
+ MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
+ (unsigned long)pgd_base,
+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+ UVMF_TLB_FLUSH);
+ if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
+ seq + 1, NULL)))
+ BUG();
+ } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
+ pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+ UVMF_TLB_FLUSH))
+ BUG();
+#endif
+
+ put_cpu();
+}
+
+static void __pgd_pin(pgd_t *pgd)
+{
+ pgd_walk(pgd, PAGE_KERNEL_RO);
+ kmap_flush_unused();
+ xen_pgd_pin(__pa(pgd)); /* kernel */
+#ifdef CONFIG_X86_64
+ xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
+#endif
+ SetPagePinned(virt_to_page(pgd));
+}
+
+static void __pgd_unpin(pgd_t *pgd)
+{
+ xen_pgd_unpin(__pa(pgd));
+#ifdef CONFIG_X86_64
+ xen_pgd_unpin(__pa(__user_pgd(pgd)));
+#endif
+ pgd_walk(pgd, PAGE_KERNEL);
+ ClearPagePinned(virt_to_page(pgd));
+}
+
+static void pgd_test_and_unpin(pgd_t *pgd)
+{
+ if (PagePinned(virt_to_page(pgd)))
+ __pgd_unpin(pgd);
+}
+
+void mm_pin(struct mm_struct *mm)
+{
+ if (xen_feature(XENFEAT_writable_page_tables))
+ return;
+
+ pin_lock(mm);
+ __pgd_pin(mm->pgd);
+ pin_unlock(mm);
+}
+
+void mm_unpin(struct mm_struct *mm)
+{
+ if (xen_feature(XENFEAT_writable_page_tables))
+ return;
+
+ pin_lock(mm);
+ __pgd_unpin(mm->pgd);
+ pin_unlock(mm);
+}
+
+void mm_pin_all(void)
+{
+ struct page *page;
+ unsigned long flags;
+
+ if (xen_feature(XENFEAT_writable_page_tables))
+ return;
+
+ /*
+ * Allow uninterrupted access to the pgd_list. Also protects
+ * __pgd_pin() by disabling preemption.
+ * All other CPUs must be at a safe point (e.g., in stop_machine
+ * or offlined entirely).
+ */
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ if (!PagePinned(page))
+ __pgd_pin((pgd_t *)page_address(page));
+ }
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ if (!PagePinned(virt_to_page(mm->pgd)))
+ mm_pin(mm);
+}
+
+void arch_exit_mmap(struct mm_struct *mm)
+{
+ struct task_struct *tsk = current;
+
+ task_lock(tsk);
+
+ /*
+ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+ * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+ */
+ if (tsk->active_mm == mm) {
+ tsk->active_mm = &init_mm;
+ atomic_inc(&init_mm.mm_count);
+
+ switch_mm(mm, &init_mm, tsk);
+
+ atomic_dec(&mm->mm_count);
+ BUG_ON(atomic_read(&mm->mm_count) == 0);
+ }
+
+ task_unlock(tsk);
+
+ if (PagePinned(virt_to_page(mm->pgd))
+ && atomic_read(&mm->mm_count) == 1
+ && !mm->context.has_foreign_mappings)
+ mm_unpin(mm);
+}
+
+static inline void pgd_list_add(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_add(&page->lru, &pgd_list);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_del(&page->lru);
+}
+
+#define UNSHARED_PTRS_PER_PGD \
+ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
+
+static void pgd_ctor(void *p)
+{
+ pgd_t *pgd = p;
+ unsigned long flags;
+
+ pgd_test_and_unpin(pgd);
+
+ /* Clear usermode parts of PGD */
+ memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
+
+ spin_lock_irqsave(&pgd_lock, flags);
+
+ /* If the pgd points to a shared pagetable level (either the
+ ptes in non-PAE, or shared PMD in PAE), then just copy the
+ references from swapper_pg_dir. */
+ if (PAGETABLE_LEVELS == 2 ||
+ (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+ PAGETABLE_LEVELS == 4) {
+ clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+ paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
+ __pa(swapper_pg_dir) >> PAGE_SHIFT,
+ KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+ }
+
+#ifdef CONFIG_X86_64
+ /* set level3_user_pgt for vsyscall area */
+ __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
+ __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
+#endif
+
+#ifndef CONFIG_X86_PAE
+ /* list required to sync kernel mapping updates */
+ if (!SHARED_KERNEL_PMD)
+ pgd_list_add(pgd);
+#endif
+
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+static void pgd_dtor(void *pgd)
+{
+ unsigned long flags; /* can be called from interrupt context */
+
+ if (!SHARED_KERNEL_PMD) {
+ spin_lock_irqsave(&pgd_lock, flags);
+ pgd_list_del(pgd);
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ }
+
+ pgd_test_and_unpin(pgd);
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * -- wli
+ */
+
+#ifdef CONFIG_X86_PAE
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+ int i;
+
+ for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+ pgd_t pgd = pgdp[i];
+
+ if (__pgd_val(pgd) != 0) {
+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+ pgdp[i] = xen_make_pgd(0);
+
+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+ pmd_free(mm, pmd);
+ }
+ }
+
+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
+ xen_destroy_contiguous_region((unsigned long)pgdp, 0);
+}
+
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update. Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+ pud_t *pud;
+ pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
+ unsigned long addr, flags;
+ int i;
+
+ /*
+ * We can race save/restore (if we sleep during a GFP_KERNEL memory
+ * allocation). We therefore store virtual addresses of pmds as they
+ * do not change across save/restore, and poke the machine addresses
+ * into the pgdir under the pgd_lock.
+ */
+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
+ pmds[i] = pmd_alloc_one(mm, addr);
+ if (!pmds[i])
+ goto out_oom;
+ }
+
+ spin_lock_irqsave(&pgd_lock, flags);
+
+ /* Protect against save/restore: move below 4GB under pgd_lock. */
+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
+ && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
+ spin_unlock_irqrestore(&pgd_lock, flags);
+out_oom:
+ while (i--)
+ pmd_free(mm, pmds[i]);
+ return 0;
+ }
+
+ /* Copy kernel pmd contents and write-protect the new pmds. */
+ pud = pud_offset(pgd, 0);
+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
+ i++, pud++, addr += PUD_SIZE) {
+ if (i >= KERNEL_PGD_BOUNDARY) {
+ memcpy(pmds[i],
+ (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+ make_lowmem_page_readonly(
+ pmds[i], XENFEAT_writable_page_tables);
+ }
+
+ /* It is safe to poke machine addresses of pmds under the pgd_lock. */
+ pud_populate(mm, pud, pmds[i]);
+ }
+
+ /* List required to sync kernel mapping updates and
+ * to pin/unpin on save/restore. */
+ pgd_list_add(pgd);
+
+ spin_unlock_irqrestore(&pgd_lock, flags);
+
+ return 1;
+}
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+ struct page *page = virt_to_page(pmd);
+ unsigned long pfn = page_to_pfn(page);
+
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+ /* Note: almost everything apart from _PAGE_PRESENT is
+ reserved at the pmd (PDPT) level. */
+ if (PagePinned(virt_to_page(mm->pgd))) {
+ BUG_ON(PageHighMem(page));
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
+ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+ } else
+ *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ */
+ if (mm == current->active_mm)
+ xen_tlb_flush();
+}
+#else /* !CONFIG_X86_PAE */
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+ return 1;
+}
+
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
+{
+}
+#endif /* CONFIG_X86_PAE */
+
+#ifdef CONFIG_X86_64
+/* We allocate two contiguous pages for kernel and user. */
+#define PGD_ORDER 1
+#else
+#define PGD_ORDER 0
+#endif
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
+
+ /* so that alloc_pd can use it */
+ mm->pgd = pgd;
+ if (pgd)
+ pgd_ctor(pgd);
+
+ if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
+ free_pages((unsigned long)pgd, PGD_ORDER);
+ pgd = NULL;
+ }
+
+ return pgd;
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ /*
+ * After this the pgd should not be pinned for the duration of this
+ * function's execution. We should never sleep and thus never race:
+ * 1. User pmds will not become write-protected under our feet due
+ * to a concurrent mm_pin_all().
+ * 2. The machine addresses in PGD entries will not become invalid
+ * due to a concurrent save/restore.
+ */
+ pgd_dtor(pgd);
+
+ pgd_mop_up_pmds(mm, pgd);
+ free_pages((unsigned long)pgd, PGD_ORDER);
+}
+
+/* blktap and gntdev need this, as otherwise they would implicitly (and
+ * needlessly, as they never use it) reference init_mm. */
+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, int full)
+{
+ return ptep_get_and_clear_full(vma ? vma->vm_mm : &init_mm,
+ addr, ptep, full);
+}
+EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
+
+int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ int changed = !pte_same(*ptep, entry);
+
+ if (changed && dirty) {
+ if (likely(vma->vm_mm == current->mm)) {
+ if (HYPERVISOR_update_va_mapping(address,
+ entry,
+ uvm_multi(vma->vm_mm->cpu_vm_mask) |
+ UVMF_INVLPG))
+ BUG();
+ } else {
+ xen_l1_entry_update(ptep, entry);
+ flush_tlb_page(vma, address);
+ }
+ }
+
+ return changed;
+}
+
+int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ int ret = 0;
+
+ if (pte_young(*ptep))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ &ptep->pte);
+
+ if (ret)
+ pte_update(vma->vm_mm, addr, ptep);
+
+ return ret;
+}
+
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ pte_t pte = *ptep;
+ int young = pte_young(pte);
+
+ pte = pte_mkold(pte);
+ if (PagePinned(virt_to_page(vma->vm_mm->pgd)))
+ ptep_set_access_flags(vma, address, ptep, pte, young);
+ else if (young)
+ ptep->pte_low = pte.pte_low;
+
+ return young;
+}
--- head-2010-05-25.orig/arch/x86/mm/pgtable_32-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/mm/pgtable_32-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -1,7 +1,3 @@
-/*
- * linux/arch/i386/mm/pgtable.c
- */
-
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
@@ -41,7 +37,6 @@ void show_mem(void)
printk(KERN_INFO "Mem-info:\n");
show_free_areas();
- printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
for_each_online_pgdat(pgdat) {
pgdat_resize_lock(pgdat, &flags);
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
@@ -157,243 +152,6 @@ void __init reserve_top_address(unsigned
__VMALLOC_RESERVE += reserve;
}
-pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
- if (pte)
- make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
- return pte;
-}
-
-/*
- * List of all pgd's needed for non-PAE so it can invalidate entries
- * in both cached and uncached pgd's; not needed for PAE since the
- * kernel pmd is shared. If PAE were not to share the pmd a similar
- * tactic would be needed. This is essentially codepath-based locking
- * against pageattr.c; it is the unique case in which a valid change
- * of kernel pagetables can't be lazily synchronized by vmalloc faults.
- * vmalloc faults work because attached pagetables are never freed.
- * -- wli
- */
-static inline void pgd_list_add(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
-
- list_add(&page->lru, &pgd_list);
-}
-
-static inline void pgd_list_del(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
-
- list_del(&page->lru);
-}
-
-#define UNSHARED_PTRS_PER_PGD \
- (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
-
-static void pgd_ctor(void *p)
-{
- pgd_t *pgd = p;
- unsigned long flags;
-
- pgd_test_and_unpin(pgd);
-
- /* Clear usermode parts of PGD */
- memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
-
- spin_lock_irqsave(&pgd_lock, flags);
-
- /* If the pgd points to a shared pagetable level (either the
- ptes in non-PAE, or shared PMD in PAE), then just copy the
- references from swapper_pg_dir. */
- if (PAGETABLE_LEVELS == 2 ||
- (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
- clone_pgd_range(pgd + USER_PTRS_PER_PGD,
- swapper_pg_dir + USER_PTRS_PER_PGD,
- KERNEL_PGD_PTRS);
- paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
- __pa(swapper_pg_dir) >> PAGE_SHIFT,
- USER_PTRS_PER_PGD,
- KERNEL_PGD_PTRS);
- }
-
- /* list required to sync kernel mapping updates */
- if (PAGETABLE_LEVELS == 2)
- pgd_list_add(pgd);
-
- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-static void pgd_dtor(void *pgd)
-{
- unsigned long flags; /* can be called from interrupt context */
-
- if (!SHARED_KERNEL_PMD) {
- spin_lock_irqsave(&pgd_lock, flags);
- pgd_list_del(pgd);
- spin_unlock_irqrestore(&pgd_lock, flags);
- }
-
- pgd_test_and_unpin(pgd);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * Mop up any pmd pages which may still be attached to the pgd.
- * Normally they will be freed by munmap/exit_mmap, but any pmd we
- * preallocate which never got a corresponding vma will need to be
- * freed manually.
- */
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
- int i;
-
- for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
- pgd_t pgd = pgdp[i];
-
- if (__pgd_val(pgd) != 0) {
- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
-
- pgdp[i] = xen_make_pgd(0);
-
- paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
- pmd_free(mm, pmd);
- }
- }
-}
-
-/*
- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
- * updating the top-level pagetable entries to guarantee the
- * processor notices the update. Since this is expensive, and
- * all 4 top-level entries are used almost immediately in a
- * new process's life, we just pre-populate them here.
- *
- * Also, if we're in a paravirt environment where the kernel pmd is
- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
- * and initialize the kernel pmds here.
- */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
- pud_t *pud;
- pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
- unsigned long addr, flags;
- int i;
-
- /*
- * We can race save/restore (if we sleep during a GFP_KERNEL memory
- * allocation). We therefore store virtual addresses of pmds as they
- * do not change across save/restore, and poke the machine addresses
- * into the pgdir under the pgd_lock.
- */
- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
- pmds[i] = pmd_alloc_one(mm, addr);
- if (!pmds[i])
- goto out_oom;
- }
-
- spin_lock_irqsave(&pgd_lock, flags);
-
- /* Protect against save/restore: move below 4GB under pgd_lock. */
- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
- && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
- spin_unlock_irqrestore(&pgd_lock, flags);
-out_oom:
- while (i--)
- pmd_free(mm, pmds[i]);
- return 0;
- }
-
- /* Copy kernel pmd contents and write-protect the new pmds. */
- pud = pud_offset(pgd, 0);
- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
- i++, pud++, addr += PUD_SIZE) {
- if (i >= USER_PTRS_PER_PGD) {
- memcpy(pmds[i],
- (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
- sizeof(pmd_t) * PTRS_PER_PMD);
- make_lowmem_page_readonly(
- pmds[i], XENFEAT_writable_page_tables);
- }
-
- /* It is safe to poke machine addresses of pmds under the pgd_lock. */
- pud_populate(mm, pud, pmds[i]);
- }
-
- /* List required to sync kernel mapping updates and
- * to pin/unpin on save/restore. */
- pgd_list_add(pgd);
-
- spin_unlock_irqrestore(&pgd_lock, flags);
-
- return 1;
-}
-#else /* !CONFIG_X86_PAE */
-/* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
- return 1;
-}
-
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
-}
-#endif /* CONFIG_X86_PAE */
-
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-
- /* so that alloc_pd can use it */
- mm->pgd = pgd;
- if (pgd)
- pgd_ctor(pgd);
-
- if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
- free_page((unsigned long)pgd);
- pgd = NULL;
- }
-
- return pgd;
-}
-
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- /*
- * After this the pgd should not be pinned for the duration of this
- * function's execution. We should never sleep and thus never race:
- * 1. User pmds will not become write-protected under our feet due
- * to a concurrent mm_pin_all().
- * 2. The machine addresses in PGD entries will not become invalid
- * due to a concurrent save/restore.
- */
- pgd_dtor(pgd);
-
- if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb))
- xen_destroy_contiguous_region((unsigned long)pgd, 0);
-
- pgd_mop_up_pmds(mm, pgd);
- free_page((unsigned long)pgd);
-}
-
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
-{
- pgtable_page_dtor(pte);
- paravirt_release_pt(page_to_pfn(pte));
- tlb_remove_page(tlb, pte);
-}
-
-#ifdef CONFIG_X86_PAE
-
-void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
-{
- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
- tlb_remove_page(tlb, virt_to_page(pmd));
-}
-
-#endif
-
void make_lowmem_page_readonly(void *va, unsigned int feature)
{
pte_t *pte;
--- head-2010-05-25.orig/arch/x86/pci/irq-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/pci/irq-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -140,9 +140,11 @@ static void __init pirq_peer_trick(void)
busmap[e->bus] = 1;
}
for(i = 1; i < 256; i++) {
+ int node;
if (!busmap[i] || pci_find_bus(0, i))
continue;
- if (pci_scan_bus_with_sysdata(i))
+ node = get_mp_bus_to_node(i);
+ if (pci_scan_bus_on_node(i, &pci_root_ops, node))
printk(KERN_INFO "PCI: Discovered primary peer "
"bus %02x [IRQ]\n", i);
}
@@ -204,7 +206,7 @@ static int pirq_ali_get(struct pci_dev *
{
static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
- WARN_ON_ONCE(pirq >= 16);
+ WARN_ON_ONCE(pirq > 16);
return irqmap[read_config_nybble(router, 0x48, pirq-1)];
}
@@ -213,7 +215,7 @@ static int pirq_ali_set(struct pci_dev *
static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
unsigned int val = irqmap[irq];
- WARN_ON_ONCE(pirq >= 16);
+ WARN_ON_ONCE(pirq > 16);
if (val) {
write_config_nybble(router, 0x48, pirq-1, val);
return 1;
@@ -264,7 +266,7 @@ static int pirq_via586_get(struct pci_de
{
static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
- WARN_ON_ONCE(pirq >= 5);
+ WARN_ON_ONCE(pirq > 5);
return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
}
@@ -272,7 +274,7 @@ static int pirq_via586_set(struct pci_de
{
static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
- WARN_ON_ONCE(pirq >= 5);
+ WARN_ON_ONCE(pirq > 5);
write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
return 1;
}
@@ -286,7 +288,7 @@ static int pirq_ite_get(struct pci_dev *
{
static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
- WARN_ON_ONCE(pirq >= 4);
+ WARN_ON_ONCE(pirq > 4);
return read_config_nybble(router,0x43, pirqmap[pirq-1]);
}
@@ -294,7 +296,7 @@ static int pirq_ite_set(struct pci_dev *
{
static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
- WARN_ON_ONCE(pirq >= 4);
+ WARN_ON_ONCE(pirq > 4);
write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
return 1;
}
@@ -623,6 +625,13 @@ static __init int via_router_probe(struc
*/
device = PCI_DEVICE_ID_VIA_8235;
break;
+ case PCI_DEVICE_ID_VIA_8237:
+ /**
+ * Asus a7v600 bios wrongly reports 8237
+ * as 586-compatible
+ */
+ device = PCI_DEVICE_ID_VIA_8237;
+ break;
}
}
--- head-2010-05-25.orig/arch/x86/vdso/vdso32-setup-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/vdso/vdso32-setup-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -164,7 +164,7 @@ static __init void relocate_vdso(Elf32_E
Elf32_Shdr *shdr;
int i;
- BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
+ BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
!elf_check_arch_ia32(ehdr) ||
ehdr->e_type != ET_DYN);
@@ -233,8 +233,12 @@ void syscall32_cpu_init(void)
BUG();
#endif
- if (use_sysenter < 0)
- use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
+ if (use_sysenter < 0) {
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ use_sysenter = 1;
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
+ use_sysenter = 1;
+ }
}
#define compat_uses_vma 1
@@ -337,8 +341,6 @@ int __init sysenter_setup(void)
#ifdef CONFIG_X86_32
gate_vma_init();
-
- printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
#endif
#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
@@ -383,6 +385,9 @@ int arch_setup_additional_pages(struct l
int ret = 0;
bool compat;
+ if (vdso_enabled == VDSO_DISABLED)
+ return 0;
+
down_write(&mm->mmap_sem);
/* Test compat mode once here, in case someone
--- head-2010-05-25.orig/drivers/acpi/processor_driver.c 2010-05-25 09:19:10.000000000 +0200
+++ head-2010-05-25/drivers/acpi/processor_driver.c 2010-04-15 10:04:18.000000000 +0200
@@ -489,7 +489,7 @@ static int acpi_processor_get_info(struc
* of /proc/cpuinfo
*/
status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer);
- if (ACPI_SUCCESS(status))
+ if (ACPI_SUCCESS(status) && pr->id != -1)
arch_fix_phys_package_id(pr->id, object.integer.value);
return 0;
--- head-2010-05-25.orig/drivers/firmware/Kconfig 2010-05-25 09:12:09.000000000 +0200
+++ head-2010-05-25/drivers/firmware/Kconfig 2010-03-24 15:12:36.000000000 +0100
@@ -114,7 +114,7 @@ config DMIID
config ISCSI_IBFT_FIND
bool "iSCSI Boot Firmware Table Attributes"
- depends on X86
+ depends on X86 && !XEN_UNPRIVILEGED_GUEST
default n
help
This option enables the kernel to find the region of memory
--- head-2010-05-25.orig/drivers/input/xen-kbdfront.c 2010-05-25 09:12:09.000000000 +0200
+++ head-2010-05-25/drivers/input/xen-kbdfront.c 2010-04-15 10:04:29.000000000 +0200
@@ -329,7 +329,6 @@ static const struct xenbus_device_id xen
static struct xenbus_driver xenkbd_driver = {
.name = "vkbd",
- .owner = THIS_MODULE,
.ids = xenkbd_ids,
.probe = xenkbd_probe,
.remove = xenkbd_remove,
--- head-2010-05-25.orig/drivers/oprofile/cpu_buffer.c 2010-03-24 15:10:29.000000000 +0100
+++ head-2010-05-25/drivers/oprofile/cpu_buffer.c 2010-03-24 15:12:36.000000000 +0100
@@ -502,7 +502,7 @@ fail:
#ifdef CONFIG_XEN
int oprofile_add_domain_switch(int32_t domain_id)
{
- struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
+ struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
/* should have space for switching into and out of domain
(2 slots each) plus one sample and one cpu mode switch */
--- head-2010-05-25.orig/drivers/pci/msi-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/drivers/pci/msi-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -543,7 +543,7 @@ int pci_enable_msi(struct pci_dev* dev)
EXPORT_SYMBOL(pci_enable_msi);
extern void pci_frontend_disable_msi(struct pci_dev* dev);
-void pci_disable_msi(struct pci_dev* dev)
+void pci_msi_shutdown(struct pci_dev* dev)
{
int pirq;
struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
@@ -571,6 +571,10 @@ void pci_disable_msi(struct pci_dev* dev
pci_intx_for_msi(dev, 1);
dev->msi_enabled = 0;
}
+void pci_disable_msi(struct pci_dev* dev)
+{
+ pci_msi_shutdown(dev);
+}
EXPORT_SYMBOL(pci_disable_msi);
/**
@@ -675,7 +679,7 @@ int pci_enable_msix(struct pci_dev* dev,
EXPORT_SYMBOL(pci_enable_msix);
extern void pci_frontend_disable_msix(struct pci_dev* dev);
-void pci_disable_msix(struct pci_dev* dev)
+void pci_msix_shutdown(struct pci_dev* dev)
{
if (!pci_msi_enable || !dev || !dev->msix_enabled)
return;
@@ -708,6 +712,10 @@ void pci_disable_msix(struct pci_dev* de
pci_intx_for_msi(dev, 1);
dev->msix_enabled = 0;
}
+void pci_disable_msix(struct pci_dev* dev)
+{
+ pci_msix_shutdown(dev);
+}
EXPORT_SYMBOL(pci_disable_msix);
/**
--- head-2010-05-25.orig/drivers/video/Kconfig 2010-05-25 09:12:09.000000000 +0200
+++ head-2010-05-25/drivers/video/Kconfig 2010-03-24 15:12:36.000000000 +0100
@@ -2110,7 +2110,7 @@ config FB_VIRTUAL
config XEN_FBDEV_FRONTEND
tristate "Xen virtual frame buffer support"
- depends on FB && XEN
+ depends on FB && PARAVIRT_XEN
select FB_SYS_FILLRECT
select FB_SYS_COPYAREA
select FB_SYS_IMAGEBLIT
--- head-2010-05-25.orig/drivers/video/xen-fbfront.c 2010-05-25 09:12:09.000000000 +0200
+++ head-2010-05-25/drivers/video/xen-fbfront.c 2010-04-15 10:04:38.000000000 +0200
@@ -674,7 +674,6 @@ static struct xenbus_device_id xenfb_ids
static struct xenbus_driver xenfb_driver = {
.name = "vfb",
- .owner = THIS_MODULE,
.ids = xenfb_ids,
.probe = xenfb_probe,
.remove = xenfb_remove,
--- head-2010-05-25.orig/drivers/xen/Kconfig 2010-03-24 15:09:22.000000000 +0100
+++ head-2010-05-25/drivers/xen/Kconfig 2010-03-24 15:12:36.000000000 +0100
@@ -2,8 +2,6 @@
# This Kconfig describe xen options
#
-mainmenu "Xen Configuration"
-
config XEN
bool
--- head-2010-05-25.orig/drivers/xen/Makefile 2010-03-24 15:09:22.000000000 +0100
+++ head-2010-05-25/drivers/xen/Makefile 2010-04-19 14:50:44.000000000 +0200
@@ -1,13 +1,16 @@
-obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o
+obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
+xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
+xen-balloon-$(CONFIG_XEN) := balloon/
obj-$(CONFIG_XEN) += core/
obj-$(CONFIG_XEN) += console/
obj-$(CONFIG_XEN) += evtchn/
obj-y += xenbus/
obj-$(CONFIG_XEN) += char/
-obj-$(CONFIG_XEN) += util.o
-obj-$(CONFIG_XEN_BALLOON) += balloon/
+obj-$(CONFIG_XEN) += features.o util.o
+obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
+obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y)
obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
obj-$(CONFIG_XEN_BLKDEV_TAP2) += blktap2/
--- head-2010-05-25.orig/drivers/xen/blkfront/blkfront.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/drivers/xen/blkfront/blkfront.c 2010-03-24 15:12:36.000000000 +0100
@@ -285,7 +285,11 @@ static void backend_changed(struct xenbu
break;
case XenbusStateClosing:
- bd = bdget(info->dev);
+ if (!info->gd) {
+ xenbus_frontend_closed(dev);
+ break;
+ }
+ bd = bdget_disk(info->gd, 0);
if (bd == NULL) {
xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
break;
--- head-2010-05-25.orig/drivers/xen/blkfront/block.h 2010-03-24 15:09:22.000000000 +0100
+++ head-2010-05-25/drivers/xen/blkfront/block.h 2010-03-24 15:12:36.000000000 +0100
@@ -97,7 +97,6 @@ struct blk_shadow {
struct blkfront_info
{
struct xenbus_device *xbdev;
- dev_t dev;
struct gendisk *gd;
int vdevice;
blkif_vdev_t handle;
--- head-2010-05-25.orig/drivers/xen/blkfront/vbd.c 2010-03-24 15:09:22.000000000 +0100
+++ head-2010-05-25/drivers/xen/blkfront/vbd.c 2010-03-24 15:12:36.000000000 +0100
@@ -327,17 +327,32 @@ xlvbd_init_blk_queue(struct gendisk *gd,
return 0;
}
-static int
-xlvbd_alloc_gendisk(int major, int minor, blkif_sector_t capacity, int vdevice,
- u16 vdisk_info, u16 sector_size,
- struct blkfront_info *info)
+int
+xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
+ u16 sector_size, struct blkfront_info *info)
{
+ int major, minor;
struct gendisk *gd;
struct xlbd_major_info *mi;
int nr_minors = 1;
int err = -ENODEV;
unsigned int offset;
+ if ((vdevice>>EXT_SHIFT) > 1) {
+ /* this is above the extended range; something is wrong */
+ printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice);
+ return -ENODEV;
+ }
+
+ if (!VDEV_IS_EXTENDED(vdevice)) {
+ major = BLKIF_MAJOR(vdevice);
+ minor = BLKIF_MINOR(vdevice);
+ }
+ else {
+ major = 202;
+ minor = BLKIF_MINOR_EXT(vdevice);
+ }
+
BUG_ON(info->gd != NULL);
BUG_ON(info->mi != NULL);
BUG_ON(info->rq != NULL);
@@ -425,41 +440,6 @@ xlvbd_alloc_gendisk(int major, int minor
return err;
}
-int
-xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
- u16 sector_size, struct blkfront_info *info)
-{
- struct block_device *bd;
- int err = 0;
- int major, minor;
-
- if ((vdevice>>EXT_SHIFT) > 1) {
- /* this is above the extended range; something is wrong */
- printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice);
- return -ENODEV;
- }
-
- if (!VDEV_IS_EXTENDED(vdevice)) {
- major = BLKIF_MAJOR(vdevice);
- minor = BLKIF_MINOR(vdevice);
- }
- else {
- major = 202;
- minor = BLKIF_MINOR_EXT(vdevice);
- }
-
- info->dev = MKDEV(major, minor);
- bd = bdget(info->dev);
- if (bd == NULL)
- return -ENODEV;
-
- err = xlvbd_alloc_gendisk(major, minor, capacity, vdevice, vdisk_info,
- sector_size, info);
-
- bdput(bd);
- return err;
-}
-
void
xlvbd_del(struct blkfront_info *info)
{
--- head-2010-05-25.orig/drivers/xen/blktap/blktap.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/drivers/xen/blktap/blktap.c 2010-04-29 09:51:23.000000000 +0200
@@ -111,6 +111,7 @@ typedef struct tap_blkif {
unsigned long mode; /*current switching mode */
int minor; /*Minor number for tapdisk device */
pid_t pid; /*tapdisk process id */
+ struct pid_namespace *pid_ns; /*... and its corresponding namespace */
enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
shutdown */
unsigned long *idx_map; /*Record the user ring id to kern
@@ -301,16 +302,14 @@ static inline int OFFSET_TO_SEG(int offs
* BLKTAP VM OPS
*/
-static struct page *blktap_nopage(struct vm_area_struct *vma,
- unsigned long address,
- int *type)
+static int blktap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
/*
* if the page has not been mapped in by the driver then return
- * NOPAGE_SIGBUS to the domain.
+ * VM_FAULT_SIGBUS to the domain.
*/
- return NOPAGE_SIGBUS;
+ return VM_FAULT_SIGBUS;
}
static pte_t blktap_clear_pte(struct vm_area_struct *vma,
@@ -428,7 +427,7 @@ static void blktap_vma_close(struct vm_a
}
static struct vm_operations_struct blktap_vm_ops = {
- nopage: blktap_nopage,
+ fault: blktap_fault,
zap_pte: blktap_clear_pte,
open: blktap_vma_open,
close: blktap_vma_close,
@@ -523,9 +522,8 @@ found:
tapfds[minor] = info;
if ((class = get_xen_class()) != NULL)
- class_device_create(class, NULL,
- MKDEV(blktap_major, minor), NULL,
- "blktap%d", minor);
+ device_create(class, NULL, MKDEV(blktap_major, minor),
+ "blktap%d", minor);
}
out:
@@ -568,7 +566,7 @@ void signal_tapdisk(int idx)
return;
if (info->pid > 0) {
- ptask = find_task_by_pid(info->pid);
+ ptask = find_task_by_pid_ns(info->pid, info->pid_ns);
if (ptask)
info->status = CLEANSHUTDOWN;
}
@@ -809,8 +807,9 @@ static int blktap_ioctl(struct inode *in
{
if (info) {
info->pid = (pid_t)arg;
- DPRINTK("blktap: pid received %d\n",
- info->pid);
+ info->pid_ns = current->nsproxy->pid_ns;
+ DPRINTK("blktap: pid received %p:%d\n",
+ info->pid_ns, info->pid);
}
return 0;
}
@@ -1762,9 +1761,7 @@ static int __init blkif_init(void)
* We only create the device when a request of a new device is
* made.
*/
- class_device_create(class, NULL,
- MKDEV(blktap_major, 0), NULL,
- "blktap0");
+ device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
} else {
/* this is bad, but not fatal */
WPRINTK("blktap: sysfs xen_class not created\n");
--- head-2010-05-25.orig/drivers/xen/blktap2/blktap.h 2010-05-19 17:47:46.000000000 +0200
+++ head-2010-05-25/drivers/xen/blktap2/blktap.h 2010-03-24 15:12:36.000000000 +0100
@@ -127,7 +127,7 @@ struct blktap_ring {
wait_queue_head_t poll_wait;
dev_t devno;
- struct class_device *dev;
+ struct device *dev;
atomic_t sysfs_refcnt;
struct mutex sysfs_mutex;
};
--- head-2010-05-25.orig/drivers/xen/blktap2/device.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/drivers/xen/blktap2/device.c 2010-04-19 11:30:22.000000000 +0200
@@ -571,7 +571,8 @@ blktap_map(struct blktap *tap,
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
pte = mk_pte(page, ring->vma->vm_page_prot);
- blktap_map_uaddr(ring->vma, uvaddr, pte_mkwrite(pte));
+ blktap_map_uaddr(ring->vma, uvaddr,
+ pte_mkspecial(pte_mkwrite(pte)));
flush_tlb_page(ring->vma, uvaddr);
blktap_map_uaddr(NULL, kvaddr, mk_pte(page, PAGE_KERNEL));
flush_tlb_kernel_page(kvaddr);
--- head-2010-05-25.orig/drivers/xen/blktap2/ring.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/drivers/xen/blktap2/ring.c 2010-03-24 15:12:36.000000000 +0100
@@ -66,16 +66,15 @@ blktap_read_ring(struct blktap *tap)
return 0;
}
-static struct page *
-blktap_ring_nopage(struct vm_area_struct *vma,
- unsigned long address, int *type)
+static int
+blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
/*
* if the page has not been mapped in by the driver then return
- * NOPAGE_SIGBUS to the domain.
+ * VM_FAULT_SIGBUS to the domain.
*/
- return NOPAGE_SIGBUS;
+ return VM_FAULT_SIGBUS;
}
static pte_t
@@ -205,7 +204,7 @@ blktap_ring_vm_close(struct vm_area_stru
static struct vm_operations_struct blktap_ring_vm_operations = {
.close = blktap_ring_vm_close,
.unmap = blktap_ring_vm_unmap,
- .nopage = blktap_ring_nopage,
+ .fault = blktap_ring_fault,
.zap_pte = blktap_ring_clear_pte,
};
--- head-2010-05-25.orig/drivers/xen/blktap2/sysfs.c 2010-05-19 17:49:22.000000000 +0200
+++ head-2010-05-25/drivers/xen/blktap2/sysfs.c 2010-05-25 09:24:03.000000000 +0200
@@ -36,16 +36,21 @@ blktap_sysfs_exit(struct blktap *tap)
blktap_sysfs_put(tap);
}
-static ssize_t blktap_sysfs_pause_device(struct class_device *, const char *, size_t);
-CLASS_DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device);
-static ssize_t blktap_sysfs_resume_device(struct class_device *, const char *, size_t);
-CLASS_DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device);
+static ssize_t blktap_sysfs_pause_device(struct device *,
+ struct device_attribute *,
+ const char *, size_t);
+DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device);
+static ssize_t blktap_sysfs_resume_device(struct device *,
+ struct device_attribute *,
+ const char *, size_t);
+DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device);
static ssize_t
-blktap_sysfs_set_name(struct class_device *dev, const char *buf, size_t size)
+blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t size)
{
int err;
- struct blktap *tap = (struct blktap *)dev->class_data;
+ struct blktap *tap = dev_get_drvdata(dev);
blktap_sysfs_enter(tap);
@@ -79,10 +84,11 @@ out:
}
static ssize_t
-blktap_sysfs_get_name(struct class_device *dev, char *buf)
+blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
ssize_t size;
- struct blktap *tap = (struct blktap *)dev->class_data;
+ struct blktap *tap = dev_get_drvdata(dev);
blktap_sysfs_enter(tap);
@@ -97,15 +103,15 @@ blktap_sysfs_get_name(struct class_devic
return size;
}
-CLASS_DEVICE_ATTR(name, S_IRUSR | S_IWUSR,
+DEVICE_ATTR(name, S_IRUSR | S_IWUSR,
blktap_sysfs_get_name, blktap_sysfs_set_name);
static ssize_t
-blktap_sysfs_remove_device(struct class_device *dev,
+blktap_sysfs_remove_device(struct device *dev, struct device_attribute *attr,
const char *buf, size_t size)
{
int err;
- struct blktap *tap = (struct blktap *)dev->class_data;
+ struct blktap *tap = dev_get_drvdata(dev);
if (!tap->ring.dev)
return size;
@@ -117,14 +123,14 @@ blktap_sysfs_remove_device(struct class_
return (err ? : size);
}
-CLASS_DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
+DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
static ssize_t
-blktap_sysfs_pause_device(struct class_device *dev,
+blktap_sysfs_pause_device(struct device *dev, struct device_attribute *attr,
const char *buf, size_t size)
{
int err;
- struct blktap *tap = (struct blktap *)dev->class_data;
+ struct blktap *tap = dev_get_drvdata(dev);
blktap_sysfs_enter(tap);
@@ -149,8 +155,8 @@ blktap_sysfs_pause_device(struct class_d
err = blktap_device_pause(tap);
if (!err) {
- class_device_remove_file(dev, &class_device_attr_pause);
- err = class_device_create_file(dev, &class_device_attr_resume);
+ device_remove_file(dev, &dev_attr_pause);
+ err = device_create_file(dev, &dev_attr_resume);
}
out:
@@ -160,11 +166,11 @@ out:
}
static ssize_t
-blktap_sysfs_resume_device(struct class_device *dev,
+blktap_sysfs_resume_device(struct device *dev, struct device_attribute *attr,
const char *buf, size_t size)
{
int err;
- struct blktap *tap = (struct blktap *)dev->class_data;
+ struct blktap *tap = dev_get_drvdata(dev);
blktap_sysfs_enter(tap);
@@ -181,8 +187,8 @@ blktap_sysfs_resume_device(struct class_
err = blktap_device_resume(tap);
if (!err) {
- class_device_remove_file(dev, &class_device_attr_resume);
- err = class_device_create_file(dev, &class_device_attr_pause);
+ device_remove_file(dev, &dev_attr_resume);
+ err = device_create_file(dev, &dev_attr_pause);
}
out:
@@ -194,12 +200,13 @@ out:
#ifdef ENABLE_PASSTHROUGH
static ssize_t
-blktap_sysfs_enable_passthrough(struct class_device *dev,
+blktap_sysfs_enable_passthrough(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
int err;
unsigned major, minor;
- struct blktap *tap = (struct blktap *)dev->class_data;
+ struct blktap *tap = dev_get_drvdata(dev);
BTINFO("passthrough request enabled\n");
@@ -237,11 +244,12 @@ out:
#endif
static ssize_t
-blktap_sysfs_debug_device(struct class_device *dev, char *buf)
+blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
char *tmp;
int i, ret;
- struct blktap *tap = (struct blktap *)dev->class_data;
+ struct blktap *tap = dev_get_drvdata(dev);
tmp = buf;
blktap_sysfs_get(tap);
@@ -285,13 +293,13 @@ out:
return ret;
}
-CLASS_DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL);
+DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL);
int
blktap_sysfs_create(struct blktap *tap)
{
struct blktap_ring *ring;
- struct class_device *dev;
+ struct device *dev;
int err, state = 0;
if (!class)
@@ -299,53 +307,52 @@ blktap_sysfs_create(struct blktap *tap)
ring = &tap->ring;
- dev = class_device_create(class, NULL, ring->devno,
- NULL, "blktap%d", tap->minor);
+ dev = device_create_drvdata(class, NULL, ring->devno, tap,
+ "blktap%d", tap->minor);
if (IS_ERR(dev))
return PTR_ERR(dev);
- ring->dev = dev;
- dev->class_data = tap;
+ ring->dev = dev;
mutex_init(&ring->sysfs_mutex);
atomic_set(&ring->sysfs_refcnt, 0);
set_bit(BLKTAP_SYSFS, &tap->dev_inuse);
- err = class_device_create_file(dev, &class_device_attr_name);
+ err = device_create_file(dev, &dev_attr_name);
if (!err) {
++state;
- err = class_device_create_file(dev, &class_device_attr_remove);
+ err = device_create_file(dev, &dev_attr_remove);
}
if (!err) {
++state;
- err = class_device_create_file(dev, &class_device_attr_pause);
+ err = device_create_file(dev, &dev_attr_pause);
}
if (!err) {
++state;
- err = class_device_create_file(dev, &class_device_attr_debug);
+ err = device_create_file(dev, &dev_attr_debug);
}
switch (state * !!err) {
- case 3: class_device_remove_file(dev, &class_device_attr_pause);
- case 2: class_device_remove_file(dev, &class_device_attr_remove);
- case 1: class_device_remove_file(dev, &class_device_attr_name);
+ case 3: device_remove_file(dev, &dev_attr_pause);
+ case 2: device_remove_file(dev, &dev_attr_remove);
+ case 1: device_remove_file(dev, &dev_attr_name);
}
return err;
}
static void
-_blktap_sysfs_destroy(struct device *_dev)
+_blktap_sysfs_destroy(struct device *dev)
{
- struct class_device *dev = _dev->???;
- struct blktap *tap = dev->class_data;
+ struct blktap *tap = dev_get_drvdata(dev);
- class_device_remove_file(dev, &class_device_attr_name);
- class_device_remove_file(dev, &class_device_attr_remove);
- class_device_remove_file(dev, &class_device_attr_pause);
- class_device_remove_file(dev, &class_device_attr_resume);
- class_device_remove_file(dev, &class_device_attr_debug);
- class_device_unregister(dev);
+ device_remove_file(dev, &dev_attr_name);
+ device_remove_file(dev, &dev_attr_remove);
+ device_remove_file(dev, &dev_attr_pause);
+ device_remove_file(dev, &dev_attr_resume);
+ device_remove_file(dev, &dev_attr_debug);
+
+ device_unregister(dev);
clear_bit(BLKTAP_SYSFS, &tap->dev_inuse);
@@ -356,7 +363,7 @@ int
blktap_sysfs_destroy(struct blktap *tap)
{
struct blktap_ring *ring;
- struct class_device *dev;
+ struct device *dev;
ring = &tap->ring;
dev = ring->dev;
@@ -368,7 +375,7 @@ blktap_sysfs_destroy(struct blktap *tap)
!atomic_read(&tap->ring.sysfs_refcnt)))
return -EAGAIN;
- return device_schedule_callback(dev->???, _blktap_sysfs_destroy);
+ return device_schedule_callback(dev, _blktap_sysfs_destroy);
}
static ssize_t
--- head-2010-05-25.orig/drivers/xen/char/mem.c 2010-03-24 15:09:15.000000000 +0100
+++ head-2010-05-25/drivers/xen/char/mem.c 2010-03-24 15:12:36.000000000 +0100
@@ -33,6 +33,27 @@ static inline int uncached_access(struct
return 0;
}
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+#ifdef CONFIG_NONPROMISC_DEVMEM
+ u64 from = ((u64)pfn) << PAGE_SHIFT;
+ u64 to = from + size;
+ u64 cursor = from;
+
+ while (cursor < to) {
+ if (!devmem_is_allowed(pfn)) {
+ printk(KERN_INFO
+ "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
+ current->comm, from, to);
+ return 0;
+ }
+ cursor += PAGE_SIZE;
+ pfn++;
+ }
+#endif
+ return 1;
+}
+
/*
* This funcion reads the *physical* memory. The f_pos points directly to the
* memory location.
@@ -55,6 +76,9 @@ static ssize_t read_mem(struct file * fi
sz = min_t(unsigned long, sz, count);
+ if (!range_is_allowed(p >> PAGE_SHIFT, count))
+ return -EPERM;
+
v = ioremap(p, sz);
if (IS_ERR(v) || v == NULL) {
/*
@@ -103,6 +127,9 @@ static ssize_t write_mem(struct file * f
sz = min_t(unsigned long, sz, count);
+ if (!range_is_allowed(p >> PAGE_SHIFT, sz))
+ return -EPERM;
+
v = ioremap(p, sz);
if (v == NULL)
break;
@@ -131,6 +158,23 @@ static ssize_t write_mem(struct file * f
}
#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
+static void mmap_mem_open(struct vm_area_struct *vma)
+{
+ map_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
+ vma->vm_page_prot);
+}
+
+static void mmap_mem_close(struct vm_area_struct *vma)
+{
+ unmap_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start,
+ vma->vm_page_prot);
+}
+
+static struct vm_operations_struct mmap_mem_ops = {
+ .open = mmap_mem_open,
+ .close = mmap_mem_close
+};
+
static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
{
size_t size = vma->vm_end - vma->vm_start;
@@ -138,6 +182,15 @@ static int xen_mmap_mem(struct file * fi
if (uncached_access(file))
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ if (!range_is_allowed(vma->vm_pgoff, size))
+ return -EPERM;
+
+ if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size,
+ &vma->vm_page_prot))
+ return -EINVAL;
+
+ vma->vm_ops = &mmap_mem_ops;
+
/* We want to return the real error code, not EAGAIN. */
return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
size, vma->vm_page_prot, DOMID_IO);
--- head-2010-05-25.orig/drivers/xen/console/console.c 2010-03-24 15:08:58.000000000 +0100
+++ head-2010-05-25/drivers/xen/console/console.c 2010-03-24 15:12:36.000000000 +0100
@@ -551,16 +551,18 @@ static int xencons_write(
return i;
}
-static void xencons_put_char(struct tty_struct *tty, u_char ch)
+static int xencons_put_char(struct tty_struct *tty, u_char ch)
{
unsigned long flags;
+ int ret;
if (DUMMY_TTY(tty))
- return;
+ return 0;
spin_lock_irqsave(&xencons_lock, flags);
- (void)__xencons_put_char(ch);
+ ret = __xencons_put_char(ch);
spin_unlock_irqrestore(&xencons_lock, flags);
+ return ret;
}
static void xencons_flush_chars(struct tty_struct *tty)
@@ -582,7 +584,7 @@ static void xencons_wait_until_sent(stru
if (DUMMY_TTY(tty))
return;
- while (DRV(tty->driver)->chars_in_buffer(tty)) {
+ while (tty_chars_in_buffer(tty)) {
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(1);
if (signal_pending(current))
@@ -631,8 +633,7 @@ static void xencons_close(struct tty_str
tty->closing = 1;
tty_wait_until_sent(tty, 0);
- if (DRV(tty->driver)->flush_buffer != NULL)
- DRV(tty->driver)->flush_buffer(tty);
+ tty_driver_flush_buffer(tty);
if (tty->ldisc.flush_buffer != NULL)
tty->ldisc.flush_buffer(tty);
tty->closing = 0;
--- head-2010-05-25.orig/drivers/xen/core/Makefile 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/drivers/xen/core/Makefile 2010-04-19 14:50:32.000000000 +0200
@@ -2,7 +2,7 @@
# Makefile for the linux kernel.
#
-obj-y := evtchn.o gnttab.o features.o reboot.o machine_reboot.o firmware.o
+obj-y := evtchn.o gnttab.o reboot.o machine_reboot.o firmware.o
obj-$(CONFIG_PCI) += pci.o
obj-$(CONFIG_PROC_FS) += xen_proc.o
@@ -12,4 +12,3 @@ obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o
obj-$(CONFIG_X86_SMP) += spinlock.o
obj-$(CONFIG_KEXEC) += machine_kexec.o
-obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
--- head-2010-05-25.orig/drivers/xen/core/machine_kexec.c 2010-03-24 15:10:29.000000000 +0100
+++ head-2010-05-25/drivers/xen/core/machine_kexec.c 2010-03-24 15:12:36.000000000 +0100
@@ -5,6 +5,7 @@
#include <linux/kexec.h>
#include <xen/interface/kexec.h>
+#include <linux/reboot.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
@@ -90,6 +91,9 @@ void __init xen_machine_kexec_setup_reso
xen_hypervisor_res.start = range.start;
xen_hypervisor_res.end = range.start + range.size - 1;
xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+#ifdef CONFIG_X86_64
+ insert_resource(&iomem_resource, &xen_hypervisor_res);
+#endif
/* fill in crashk_res if range is reserved by hypervisor */
@@ -102,6 +106,9 @@ void __init xen_machine_kexec_setup_reso
if (range.size) {
crashk_res.start = range.start;
crashk_res.end = range.start + range.size - 1;
+#ifdef CONFIG_X86_64
+ insert_resource(&iomem_resource, &crashk_res);
+#endif
}
/* get physical address of vmcoreinfo */
@@ -134,6 +141,14 @@ void __init xen_machine_kexec_setup_reso
xen_max_nr_phys_cpus))
goto err;
+#ifdef CONFIG_X86_64
+ for (k = 0; k < xen_max_nr_phys_cpus; k++) {
+ res = xen_phys_cpus + k;
+ if (!res->parent) /* outside of xen_hypervisor_res range */
+ insert_resource(&iomem_resource, res);
+ }
+#endif
+
#ifdef CONFIG_X86
if (xen_create_contiguous_region((unsigned long)&vmcoreinfo_note,
get_order(sizeof(vmcoreinfo_note)),
@@ -153,6 +168,7 @@ void __init xen_machine_kexec_setup_reso
return;
}
+#ifndef CONFIG_X86_64
void __init xen_machine_kexec_register_resources(struct resource *res)
{
int k;
@@ -166,6 +182,7 @@ void __init xen_machine_kexec_register_r
}
machine_kexec_register_resources(res);
}
+#endif
static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
{
@@ -236,6 +253,11 @@ void machine_shutdown(void)
/* do nothing */
}
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+ /* The kernel is broken so disable interrupts */
+ local_irq_disable();
+}
/*
* Local variables:
--- head-2010-05-25.orig/drivers/xen/core/smpboot.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/drivers/xen/core/smpboot.c 2010-03-24 15:12:36.000000000 +0100
@@ -53,16 +53,15 @@ static DEFINE_PER_CPU(int, callfunc_irq)
static char resched_name[NR_CPUS][15];
static char callfunc_name[NR_CPUS][15];
-u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+#ifdef CONFIG_X86_LOCAL_APIC
+#define set_cpu_to_apicid(cpu, apicid) (per_cpu(x86_cpu_to_apicid, cpu) = (apicid))
+#else
+#define set_cpu_to_apicid(cpu, apicid)
+#endif
DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
DEFINE_PER_CPU(cpumask_t, cpu_core_map);
-#if defined(__i386__)
-DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-#endif
-
void __init prefill_possible_map(void)
{
int i, rc;
@@ -157,7 +156,7 @@ static int __cpuinit xen_smp_intr_init(u
}
#ifdef CONFIG_HOTPLUG_CPU
-static void xen_smp_intr_exit(unsigned int cpu)
+static void __cpuexit xen_smp_intr_exit(unsigned int cpu)
{
if (cpu != 0)
local_teardown_timer(cpu);
@@ -266,8 +265,7 @@ void __init smp_prepare_cpus(unsigned in
boot_cpu_data.apicid = apicid;
cpu_data(0) = boot_cpu_data;
- cpu_2_logical_apicid[0] = apicid;
- per_cpu(x86_cpu_to_apicid, 0) = apicid;
+ set_cpu_to_apicid(0, apicid);
current_thread_info()->cpu = 0;
@@ -322,8 +320,7 @@ void __init smp_prepare_cpus(unsigned in
cpu_data(cpu).cpu_index = cpu;
cpu_data(cpu).apicid = apicid;
- cpu_2_logical_apicid[cpu] = apicid;
- per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+ set_cpu_to_apicid(cpu, apicid);
#ifdef __x86_64__
cpu_pda(cpu)->pcurrent = idle;
@@ -378,7 +375,7 @@ static int __init initialize_cpu_present
}
core_initcall(initialize_cpu_present_map);
-int __cpu_disable(void)
+int __cpuexit __cpu_disable(void)
{
cpumask_t map = cpu_online_map;
unsigned int cpu = smp_processor_id();
@@ -395,7 +392,7 @@ int __cpu_disable(void)
return 0;
}
-void __cpu_die(unsigned int cpu)
+void __cpuexit __cpu_die(unsigned int cpu)
{
while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
current->state = TASK_UNINTERRUPTIBLE;
--- head-2010-05-25.orig/drivers/xen/core/xen_proc.c 2007-06-12 13:13:44.000000000 +0200
+++ head-2010-05-25/drivers/xen/core/xen_proc.c 2010-03-24 15:12:36.000000000 +0100
@@ -8,7 +8,7 @@ static struct proc_dir_entry *xen_base;
struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
{
if ( xen_base == NULL )
- if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL )
+ if ( (xen_base = proc_mkdir("xen", NULL)) == NULL )
panic("Couldn't create /proc/xen");
return create_proc_entry(name, mode, xen_base);
}
--- head-2010-05-25.orig/drivers/xen/fbfront/xenfb.c 2010-03-24 15:09:08.000000000 +0100
+++ head-2010-05-25/drivers/xen/fbfront/xenfb.c 2010-03-24 15:12:36.000000000 +0100
@@ -93,7 +93,7 @@ struct xenfb_info
* only mappings. The former creates unfaulted pages. Preserves
* invariant. The latter removes pages. Preserves invariant.
*
- * 3. Holding both locks: xenfb_vm_nopage(). Extends the dirty
+ * 3. Holding both locks: xenfb_vm_fault(). Extends the dirty
* rectangle and updates mappings consistently. Preserves
* invariant.
*
@@ -112,13 +112,13 @@ struct xenfb_info
*
* But FIXME: the invariant is too weak. It misses that the fault
* record in mappings must be consistent with the mapping of pages in
- * the associated address space! do_no_page() updates the PTE after
- * xenfb_vm_nopage() returns, i.e. outside the critical region. This
+ * the associated address space! __do_fault() updates the PTE after
+ * xenfb_vm_fault() returns, i.e. outside the critical region. This
* allows the following race:
*
* X writes to some address in the Xen frame buffer
- * Fault - call do_no_page()
- * call xenfb_vm_nopage()
+ * Fault - call __do_fault()
+ * call xenfb_vm_fault()
* grab mm_lock
* map->faults++;
* release mm_lock
@@ -387,18 +387,17 @@ static void xenfb_vm_close(struct vm_are
mutex_unlock(&info->mm_lock);
}
-static struct page *xenfb_vm_nopage(struct vm_area_struct *vma,
- unsigned long vaddr, int *type)
+static int xenfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct xenfb_mapping *map = vma->vm_private_data;
struct xenfb_info *info = map->info;
- int pgnr = (vaddr - vma->vm_start) >> PAGE_SHIFT;
+ int pgnr = ((long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT;
unsigned long flags;
struct page *page;
int y1, y2;
if (pgnr >= info->nr_pages)
- return NOPAGE_SIGBUS;
+ return VM_FAULT_SIGBUS;
mutex_lock(&info->mm_lock);
spin_lock_irqsave(&info->dirty_lock, flags);
@@ -414,16 +413,15 @@ static struct page *xenfb_vm_nopage(stru
spin_unlock_irqrestore(&info->dirty_lock, flags);
mutex_unlock(&info->mm_lock);
- if (type)
- *type = VM_FAULT_MINOR;
+ vmf->page = page;
- return page;
+ return VM_FAULT_MINOR;
}
static struct vm_operations_struct xenfb_vm_ops = {
.open = xenfb_vm_open,
.close = xenfb_vm_close,
- .nopage = xenfb_vm_nopage,
+ .fault = xenfb_vm_fault,
};
static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma)
--- head-2010-05-25.orig/drivers/xen/features.c 2010-05-25 09:12:09.000000000 +0200
+++ head-2010-05-25/drivers/xen/features.c 2010-03-24 15:12:36.000000000 +0100
@@ -9,14 +9,21 @@
#include <linux/cache.h>
#include <linux/module.h>
+#ifdef CONFIG_PARAVIRT_XEN
#include <asm/xen/hypercall.h>
+#else
+#include <asm/hypervisor.h>
+#endif
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/features.h>
u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
-EXPORT_SYMBOL_GPL(xen_features);
+EXPORT_SYMBOL(xen_features);
void xen_setup_features(void)
{
--- head-2010-05-25.orig/drivers/xen/gntdev/gntdev.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/drivers/xen/gntdev/gntdev.c 2010-03-24 15:12:36.000000000 +0100
@@ -392,7 +392,7 @@ nomem_out:
static int __init gntdev_init(void)
{
struct class *class;
- struct class_device *device;
+ struct device *device;
if (!is_running_on_xen()) {
printk(KERN_ERR "You must be running Xen to use gntdev\n");
@@ -417,8 +417,8 @@ static int __init gntdev_init(void)
return 0;
}
- device = class_device_create(class, NULL, MKDEV(gntdev_major, 0),
- NULL, GNTDEV_NAME);
+ device = device_create(class, NULL, MKDEV(gntdev_major, 0),
+ GNTDEV_NAME);
if (IS_ERR(device)) {
printk(KERN_ERR "Error creating gntdev device in xen_class\n");
printk(KERN_ERR "gntdev created with major number = %d\n",
@@ -435,7 +435,7 @@ static void __exit gntdev_exit(void)
{
struct class *class;
if ((class = get_xen_class()) != NULL)
- class_device_destroy(class, MKDEV(gntdev_major, 0));
+ device_destroy(class, MKDEV(gntdev_major, 0));
unregister_chrdev(gntdev_major, GNTDEV_NAME);
}
--- head-2010-05-25.orig/drivers/xen/netfront/netfront.c 2010-03-24 15:10:29.000000000 +0100
+++ head-2010-05-25/drivers/xen/netfront/netfront.c 2010-03-24 15:12:36.000000000 +0100
@@ -1464,8 +1464,7 @@ err:
}
}
- while ((skb = __skb_dequeue(&errq)))
- kfree_skb(skb);
+ __skb_queue_purge(&errq);
while ((skb = __skb_dequeue(&rxq)) != NULL) {
struct page *page = NETFRONT_SKB_CB(skb)->page;
@@ -1630,8 +1629,7 @@ static void netif_release_rx_bufs_flip(s
}
}
- while ((skb = __skb_dequeue(&free_list)) != NULL)
- dev_kfree_skb(skb);
+ __skb_queue_purge(&free_list);
spin_unlock_bh(&np->rx_lock);
}
--- head-2010-05-25.orig/drivers/xen/privcmd/privcmd.c 2010-03-24 15:06:12.000000000 +0100
+++ head-2010-05-25/drivers/xen/privcmd/privcmd.c 2010-03-24 15:12:36.000000000 +0100
@@ -401,15 +401,13 @@ static long privcmd_ioctl(struct file *f
}
#ifndef HAVE_ARCH_PRIVCMD_MMAP
-static struct page *privcmd_nopage(struct vm_area_struct *vma,
- unsigned long address,
- int *type)
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return NOPAGE_SIGBUS;
+ return VM_FAULT_SIGBUS;
}
static struct vm_operations_struct privcmd_vm_ops = {
- .nopage = privcmd_nopage
+ .fault = privcmd_fault
};
static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
--- head-2010-05-25.orig/drivers/xen/xenbus/xenbus_client.c 2010-03-24 15:09:22.000000000 +0100
+++ head-2010-05-25/drivers/xen/xenbus/xenbus_client.c 2010-03-24 15:12:36.000000000 +0100
@@ -440,7 +440,7 @@ int xenbus_map_ring_valloc(struct xenbus
*vaddr = NULL;
- area = alloc_vm_area(PAGE_SIZE);
+ area = xen_alloc_vm_area(PAGE_SIZE);
if (!area)
return -ENOMEM;
@@ -450,7 +450,7 @@ int xenbus_map_ring_valloc(struct xenbus
BUG();
if (op.status != GNTST_okay) {
- free_vm_area(area);
+ xen_free_vm_area(area);
xenbus_dev_fatal(dev, op.status,
"mapping in shared page %d from domain %d",
gnt_ref, dev->otherend_id);
@@ -549,7 +549,7 @@ int xenbus_unmap_ring_vfree(struct xenbu
BUG();
if (op.status == GNTST_okay)
- free_vm_area(area);
+ xen_free_vm_area(area);
else
xenbus_dev_error(dev, op.status,
"unmapping page at handle %d error %d",
--- head-2010-05-25.orig/drivers/xen/xenbus/xenbus_probe.c 2010-03-24 15:10:29.000000000 +0100
+++ head-2010-05-25/drivers/xen/xenbus/xenbus_probe.c 2010-03-24 15:12:36.000000000 +0100
@@ -174,7 +174,7 @@ static int read_backend_details(struct x
return read_otherend_details(xendev, "backend-id", "backend");
}
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) && (defined(CONFIG_XEN) || defined(MODULE))
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
static int xenbus_uevent_frontend(struct device *dev, struct kobj_uevent_env *env)
{
struct xenbus_device *xdev;
@@ -186,8 +186,10 @@ static int xenbus_uevent_frontend(struct
return -ENODEV;
/* stuff we want to pass to /sbin/hotplug */
+#if defined(CONFIG_XEN) || defined(MODULE)
add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype);
add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename);
+#endif
add_uevent_var(env, "MODALIAS=xen:%s", xdev->devicetype);
return 0;
@@ -208,10 +210,8 @@ static struct xen_bus_type xenbus_fronte
.probe = xenbus_dev_probe,
.remove = xenbus_dev_remove,
.shutdown = xenbus_dev_shutdown,
-#if defined(CONFIG_XEN) || defined(MODULE)
.uevent = xenbus_uevent_frontend,
#endif
-#endif
},
#if defined(CONFIG_XEN) || defined(MODULE)
.dev = {
@@ -529,6 +529,15 @@ static ssize_t xendev_show_devtype(struc
}
static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
+static ssize_t xendev_show_modalias(struct device *dev,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
+ struct device_attribute *attr,
+#endif
+ char *buf)
+{
+ return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
+}
+DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
int xenbus_probe_node(struct xen_bus_type *bus,
const char *type,
@@ -589,10 +598,16 @@ int xenbus_probe_node(struct xen_bus_typ
err = device_create_file(&xendev->dev, &dev_attr_devtype);
if (err)
- goto fail_remove_file;
+ goto fail_remove_nodename;
+
+ err = device_create_file(&xendev->dev, &dev_attr_modalias);
+ if (err)
+ goto fail_remove_devtype;
return 0;
-fail_remove_file:
+fail_remove_devtype:
+ device_remove_file(&xendev->dev, &dev_attr_devtype);
+fail_remove_nodename:
device_remove_file(&xendev->dev, &dev_attr_nodename);
fail_unregister:
device_unregister(&xendev->dev);
--- head-2010-05-25.orig/fs/aio.c 2010-03-24 15:09:15.000000000 +0100
+++ head-2010-05-25/fs/aio.c 2010-03-24 15:12:36.000000000 +0100
@@ -1238,6 +1238,7 @@ static void io_destroy(struct kioctx *io
#ifdef CONFIG_EPOLL
/* forget the poll file, but it's up to the user to close it */
if (ioctx->file) {
+ fput(ioctx->file);
ioctx->file->private_data = 0;
ioctx->file = 0;
}
@@ -1262,6 +1263,7 @@ static int aio_queue_fd_close(struct ino
spin_lock_irq(&ioctx->ctx_lock);
ioctx->file = 0;
spin_unlock_irq(&ioctx->ctx_lock);
+ fput(file);
}
return 0;
}
@@ -1297,16 +1299,17 @@ static const struct file_operations aioq
static int make_aio_fd(struct kioctx *ioctx)
{
- int error, fd;
- struct inode *inode;
+ int fd;
struct file *file;
- error = anon_inode_getfd(&fd, &inode, &file, "[aioq]",
- &aioq_fops, ioctx);
- if (error)
- return error;
+ fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
+ if (fd < 0)
+ return fd;
/* associate the file with the IO context */
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
file->private_data = ioctx;
ioctx->file = file;
init_waitqueue_head(&ioctx->poll_wait);
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/desc.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/desc.h 2010-03-24 15:12:36.000000000 +0100
@@ -64,8 +64,8 @@ static inline struct desc_struct *get_cp
}
static inline void pack_gate(gate_desc *gate, unsigned char type,
- unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
-
+ unsigned long base, unsigned dpl, unsigned flags,
+ unsigned short seg)
{
gate->a = (seg << 16) | (base & 0xffff);
gate->b = (base & 0xffff0000) |
@@ -84,22 +84,23 @@ static inline int desc_empty(const void
#define load_TR_desc() native_load_tr_desc()
#define load_gdt(dtr) native_load_gdt(dtr)
#define load_idt(dtr) native_load_idt(dtr)
-#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
-#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
+#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
#define store_gdt(dtr) native_store_gdt(dtr)
#define store_idt(dtr) native_store_idt(dtr)
#define store_tr(tr) (tr = native_store_tr())
-#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
+#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
#define load_TLS(t, cpu) native_load_tls(t, cpu)
#define set_ldt native_set_ldt
-#define write_ldt_entry(dt, entry, desc) \
- native_write_ldt_entry(dt, entry, desc)
-#define write_gdt_entry(dt, entry, desc, type) \
- native_write_gdt_entry(dt, entry, desc, type)
-#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
+#define write_ldt_entry(dt, entry, desc) \
+ native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type) \
+ native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g) \
+ native_write_idt_entry(dt, entry, g)
static inline void native_write_idt_entry(gate_desc *idt, int entry,
const gate_desc *gate)
@@ -138,8 +139,8 @@ static inline void pack_descriptor(struc
{
desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
- (limit & 0x000f0000) | ((type & 0xff) << 8) |
- ((flags & 0xf) << 20);
+ (limit & 0x000f0000) | ((type & 0xff) << 8) |
+ ((flags & 0xf) << 20);
desc->p = 1;
}
@@ -160,7 +161,6 @@ static inline void set_tssldt_descriptor
desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
desc->base3 = PTR_HIGH(addr);
#else
-
pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
#endif
}
@@ -178,7 +178,8 @@ static inline void __set_tss_desc(unsign
* last valid byte
*/
set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
- IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
+ IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
+ sizeof(unsigned long) - 1);
write_gdt_entry(d, entry, &tss, DESC_TSS);
}
@@ -187,16 +188,16 @@ static inline void __set_tss_desc(unsign
static inline void native_set_ldt(const void *addr, unsigned int entries)
{
if (likely(entries == 0))
- __asm__ __volatile__("lldt %w0"::"q" (0));
+ asm volatile("lldt %w0"::"q" (0));
else {
unsigned cpu = smp_processor_id();
ldt_desc ldt;
- set_tssldt_descriptor(&ldt, (unsigned long)addr,
- DESC_LDT, entries * sizeof(ldt) - 1);
+ set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
+ entries * LDT_ENTRY_SIZE - 1);
write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
&ldt, DESC_LDT);
- __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+ asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
}
}
@@ -261,15 +262,15 @@ static inline void xen_load_tls(struct t
}
#endif
-#define _LDT_empty(info) (\
- (info)->base_addr == 0 && \
- (info)->limit == 0 && \
- (info)->contents == 0 && \
- (info)->read_exec_only == 1 && \
- (info)->seg_32bit == 0 && \
- (info)->limit_in_pages == 0 && \
- (info)->seg_not_present == 1 && \
- (info)->useable == 0)
+#define _LDT_empty(info) \
+ ((info)->base_addr == 0 && \
+ (info)->limit == 0 && \
+ (info)->contents == 0 && \
+ (info)->read_exec_only == 1 && \
+ (info)->seg_32bit == 0 && \
+ (info)->limit_in_pages == 0 && \
+ (info)->seg_not_present == 1 && \
+ (info)->useable == 0)
#ifdef CONFIG_X86_64
#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
@@ -309,7 +310,7 @@ static inline unsigned long get_desc_lim
#ifndef CONFIG_X86_NO_IDT
static inline void _set_gate(int gate, unsigned type, void *addr,
- unsigned dpl, unsigned ist, unsigned seg)
+ unsigned dpl, unsigned ist, unsigned seg)
{
gate_desc s;
pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
@@ -393,10 +394,10 @@ static inline void set_system_gate_ist(i
* Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
*/
#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
- movb idx*8+4(gdt), lo_b; \
- movb idx*8+7(gdt), hi_b; \
- shll $16, base; \
- movw idx*8+2(gdt), lo_w;
+ movb idx * 8 + 4(gdt), lo_b; \
+ movb idx * 8 + 7(gdt), hi_b; \
+ shll $16, base; \
+ movw idx * 8 + 2(gdt), lo_w;
#endif /* __ASSEMBLY__ */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/dma-mapping.h 2010-03-24 15:10:29.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/dma-mapping.h 2010-03-24 15:12:36.000000000 +0100
@@ -1,5 +1,17 @@
-#ifdef CONFIG_X86_32
-# include "dma-mapping_32.h"
-#else
-# include "dma-mapping_64.h"
-#endif
+#ifndef _ASM_DMA_MAPPING_H_
+
+#include_next <asm/dma-mapping.h>
+
+static inline int
+address_needs_mapping(struct device *hwdev, dma_addr_t addr)
+{
+ dma_addr_t mask = 0xffffffff;
+ /* If the device has a mask, use it, otherwise default to 32 bits */
+ if (hwdev && hwdev->dma_mask)
+ mask = *hwdev->dma_mask;
+ return (addr & ~mask) != 0;
+}
+
+extern int range_straddles_page_boundary(paddr_t p, size_t size);
+
+#endif /* _ASM_DMA_MAPPING_H_ */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/fixmap.h 2010-03-24 15:10:29.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/fixmap.h 2010-03-24 15:12:36.000000000 +0100
@@ -1,5 +1,13 @@
+#ifndef _ASM_FIXMAP_H
+#define _ASM_FIXMAP_H
+
#ifdef CONFIG_X86_32
# include "fixmap_32.h"
#else
# include "fixmap_64.h"
#endif
+
+#define clear_fixmap(idx) \
+ __set_fixmap(idx, 0, __pgprot(0))
+
+#endif
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/fixmap_32.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/fixmap_32.h 2010-03-24 15:12:36.000000000 +0100
@@ -10,8 +10,8 @@
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
*/
-#ifndef _ASM_FIXMAP_H
-#define _ASM_FIXMAP_H
+#ifndef _ASM_FIXMAP_32_H
+#define _ASM_FIXMAP_32_H
/* used by vmalloc.c, vsyscall.lds.S.
*
@@ -102,8 +102,7 @@ enum fixed_addresses {
*/
#define NR_FIX_BTMAPS 64
#define FIX_BTMAPS_NESTING 4
- FIX_BTMAP_END =
- __end_of_permanent_fixed_addresses + 512 -
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
(__end_of_permanent_fixed_addresses & 511),
FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
FIX_WP_TEST,
@@ -114,19 +113,16 @@ enum fixed_addresses {
};
extern void __set_fixmap(enum fixed_addresses idx,
- maddr_t phys, pgprot_t flags);
+ maddr_t phys, pgprot_t flags);
extern void reserve_top_address(unsigned long reserve);
-#define set_fixmap(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL)
+#define set_fixmap(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL)
/*
* Some hardware wants to get fixmapped without caching.
*/
-#define set_fixmap_nocache(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
-
-#define clear_fixmap(idx) \
- __set_fixmap(idx, 0, __pgprot(0))
+#define set_fixmap_nocache(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
@@ -159,7 +155,7 @@ static __always_inline unsigned long fix
if (idx >= __end_of_fixed_addresses)
__this_fixmap_does_not_exist();
- return __fix_to_virt(idx);
+ return __fix_to_virt(idx);
}
static inline unsigned long virt_to_fix(const unsigned long vaddr)
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/fixmap_64.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/fixmap_64.h 2010-03-24 15:12:36.000000000 +0100
@@ -8,8 +8,8 @@
* Copyright (C) 1998 Ingo Molnar
*/
-#ifndef _ASM_FIXMAP_H
-#define _ASM_FIXMAP_H
+#ifndef _ASM_FIXMAP_64_H
+#define _ASM_FIXMAP_64_H
#include <linux/kernel.h>
#include <asm/apicdef.h>
@@ -35,7 +35,8 @@
enum fixed_addresses {
VSYSCALL_LAST_PAGE,
- VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
VSYSCALL_HPET,
FIX_DBGP_BASE,
FIX_EARLYCON_MEM_BASE,
@@ -45,11 +46,12 @@ enum fixed_addresses {
#endif
#ifndef CONFIG_XEN
FIX_IO_APIC_BASE_0,
- FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif
#ifdef CONFIG_EFI
FIX_EFI_IO_MAP_LAST_PAGE,
- FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
+ FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
+ + MAX_EFI_IO_PAGES - 1,
#endif
#ifdef CONFIG_ACPI
FIX_ACPI_BEGIN,
@@ -79,19 +81,16 @@ enum fixed_addresses {
__end_of_fixed_addresses
};
-extern void __set_fixmap (enum fixed_addresses idx,
- unsigned long phys, pgprot_t flags);
+extern void __set_fixmap(enum fixed_addresses idx,
+ unsigned long phys, pgprot_t flags);
-#define set_fixmap(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL)
+#define set_fixmap(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL)
/*
* Some hardware wants to get fixmapped without caching.
*/
-#define set_fixmap_nocache(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
-
-#define clear_fixmap(idx) \
- __set_fixmap(idx, 0, __pgprot(0))
+#define set_fixmap_nocache(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/highmem.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/highmem.h 2010-03-24 15:12:36.000000000 +0100
@@ -8,7 +8,7 @@
* Gerhard.Wichert@pdb.siemens.de
*
*
- * Redesigned the x86 32-bit VM architecture to deal with
+ * Redesigned the x86 32-bit VM architecture to deal with
* up to 16 Terabyte physical memory. With current x86 CPUs
* we now support up to 64 Gigabytes physical RAM.
*
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/io.h 2010-03-24 15:10:29.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/io.h 2010-03-24 15:12:36.000000000 +0100
@@ -1,5 +1,22 @@
+#ifndef _ASM_X86_IO_H
+#define _ASM_X86_IO_H
+
+#define ARCH_HAS_IOREMAP_WC
+
#ifdef CONFIG_X86_32
# include "io_32.h"
#else
# include "io_64.h"
#endif
+
+extern void *xlate_dev_mem_ptr(unsigned long phys);
+extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
+
+extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
+extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
+
+extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
+ unsigned long prot_val);
+extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
+
+#endif /* _ASM_X86_IO_H */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/irqflags.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/irqflags.h 2010-03-24 15:12:36.000000000 +0100
@@ -139,11 +139,11 @@ sysexit_ecrit: /**** END OF SYSEXIT CRIT
#endif /* __ASSEMBLY__ */
#ifndef __ASSEMBLY__
-#define raw_local_save_flags(flags) \
- do { (flags) = __raw_local_save_flags(); } while (0)
+#define raw_local_save_flags(flags) \
+ do { (flags) = __raw_local_save_flags(); } while (0)
-#define raw_local_irq_save(flags) \
- do { (flags) = __raw_local_irq_save(); } while (0)
+#define raw_local_irq_save(flags) \
+ do { (flags) = __raw_local_irq_save(); } while (0)
static inline int raw_irqs_disabled_flags(unsigned long flags)
{
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/mmu_context_32.h 2010-03-24 15:12:36.000000000 +0100
@@ -94,7 +94,7 @@ static inline void switch_mm(struct mm_s
BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
- /* We were in lazy tlb mode and leave_mm disabled
+ /* We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload %cr3.
*/
load_cr3(next->pgd);
@@ -107,10 +107,10 @@ static inline void switch_mm(struct mm_s
#define deactivate_mm(tsk, mm) \
asm("movl %0,%%gs": :"r" (0));
-#define activate_mm(prev, next) \
- do { \
- xen_activate_mm(prev, next); \
- switch_mm((prev),(next),NULL); \
- } while(0)
+#define activate_mm(prev, next) \
+do { \
+ xen_activate_mm(prev, next); \
+ switch_mm((prev), (next), NULL); \
+} while (0)
#endif
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/mmu_context_64.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/mmu_context_64.h 2010-03-24 15:12:36.000000000 +0100
@@ -21,7 +21,7 @@ void destroy_context(struct mm_struct *m
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- if (read_pda(mmu_state) == TLBSTATE_OK)
+ if (read_pda(mmu_state) == TLBSTATE_OK)
write_pda(mmu_state, TLBSTATE_LAZY);
#endif
}
@@ -62,7 +62,7 @@ extern void mm_pin(struct mm_struct *mm)
extern void mm_unpin(struct mm_struct *mm);
void mm_pin_all(void);
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
unsigned cpu = smp_processor_id();
@@ -106,7 +106,7 @@ static inline void switch_mm(struct mm_s
if (read_pda(active_mm) != next)
BUG();
if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
- /* We were in lazy tlb mode and leave_mm disabled
+ /* We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload CR3
* to make sure to use no freed page tables.
*/
@@ -118,10 +118,11 @@ static inline void switch_mm(struct mm_s
#endif
}
-#define deactivate_mm(tsk,mm) do { \
- load_gs_index(0); \
- asm volatile("movl %0,%%fs"::"r"(0)); \
-} while(0)
+#define deactivate_mm(tsk, mm) \
+do { \
+ load_gs_index(0); \
+ asm volatile("movl %0,%%fs"::"r"(0)); \
+} while (0)
static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pci.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pci.h 2010-03-24 15:12:36.000000000 +0100
@@ -8,14 +8,13 @@
#include <asm/scatterlist.h>
#include <asm/io.h>
-
#ifdef __KERNEL__
struct pci_sysdata {
int domain; /* PCI domain */
int node; /* NUMA node */
#ifdef CONFIG_X86_64
- void* iommu; /* IOMMU private data */
+ void *iommu; /* IOMMU private data */
#endif
#ifdef CONFIG_XEN_PCIDEV_FRONTEND
struct pcifront_device *pdev;
@@ -23,6 +22,8 @@ struct pci_sysdata {
};
/* scan a bus after allocating a pci_sysdata for it */
+extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
+ int node);
extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
static inline int pci_domain_nr(struct pci_bus *bus)
@@ -36,6 +37,7 @@ static inline int pci_proc_domain(struct
return pci_domain_nr(bus);
}
+extern void pci_iommu_alloc(void);
/* Can be used to override the logic in pci_scan_bus for skipping
already-configured bus numbers - to be used for buggy BIOSes
@@ -57,7 +59,7 @@ extern unsigned long pci_mem_start;
#define PCIBIOS_MIN_CARDBUS_IO 0x4000
void pcibios_config_init(void);
-struct pci_bus * pcibios_scan_root(int bus);
+struct pci_bus *pcibios_scan_root(int bus);
void pcibios_set_master(struct pci_dev *dev);
void pcibios_penalize_isa_irq(int irq, int active);
@@ -67,7 +69,8 @@ int pcibios_set_irq_routing(struct pci_d
#define HAVE_PCI_MMAP
extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
- enum pci_mmap_state mmap_state, int write_combine);
+ enum pci_mmap_state mmap_state,
+ int write_combine);
#ifdef CONFIG_PCI
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgalloc.h 2010-03-24 15:10:29.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgalloc.h 2010-03-24 15:12:36.000000000 +0100
@@ -1,5 +1,149 @@
-#ifdef CONFIG_X86_32
-# include "pgalloc_32.h"
-#else
-# include "pgalloc_64.h"
+#ifndef _ASM_X86_PGALLOC_H
+#define _ASM_X86_PGALLOC_H
+
+#include <linux/threads.h>
+#include <linux/mm.h> /* for struct page */
+#include <linux/pagemap.h>
+
+#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
+
+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
+ unsigned long start, unsigned long count) {}
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_release_pte(unsigned long pfn) {}
+static inline void paravirt_release_pmd(unsigned long pfn) {}
+static inline void paravirt_release_pud(unsigned long pfn) {}
+
+#ifdef CONFIG_X86_64
+void early_make_page_readonly(void *va, unsigned int feature);
+pmd_t *early_get_pmd(unsigned long va);
+#define make_lowmem_page_readonly make_page_readonly
+#define make_lowmem_page_writable make_page_writable
#endif
+
+/*
+ * Allocate and free page tables.
+ */
+extern pgd_t *pgd_alloc(struct mm_struct *);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+
+extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
+extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
+
+/* Should really implement gc for free page table pages. This could be
+ done with a reference count in struct page. */
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+ BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
+ make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
+ free_page((unsigned long)pte);
+}
+
+extern void __pte_free(pgtable_t);
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
+{
+ __pte_free(pte);
+}
+
+extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+
+static inline void pmd_populate_kernel(struct mm_struct *mm,
+ pmd_t *pmd, pte_t *pte)
+{
+ paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+ struct page *pte)
+{
+ unsigned long pfn = page_to_pfn(pte);
+
+ paravirt_alloc_pte(mm, pfn);
+ if (PagePinned(virt_to_page(mm->pgd))) {
+ if (!PageHighMem(pte))
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ pfn_pte(pfn, PAGE_KERNEL_RO), 0));
+#ifndef CONFIG_X86_64
+ else if (!TestSetPagePinned(pte))
+ kmap_flush_unused();
+#endif
+ set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
+ } else
+ *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+
+#if PAGETABLE_LEVELS > 2
+extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
+extern void __pmd_free(pgtable_t);
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+ __pmd_free(virt_to_page(pmd));
+}
+
+extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+
+#ifdef CONFIG_X86_PAE
+extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
+#else /* !CONFIG_X86_PAE */
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+ if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pmd,
+ pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
+ PAGE_KERNEL_RO), 0));
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+ } else
+ *pud = __pud(_PAGE_TABLE | __pa(pmd));
+}
+#endif /* CONFIG_X86_PAE */
+
+#if PAGETABLE_LEVELS > 3
+#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
+
+/*
+ * We need to use the batch mode here, but pgd_populate() won't
+ * be called frequently.
+ */
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+ paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
+ if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pud,
+ pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
+ PAGE_KERNEL_RO), 0));
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+ set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
+ } else {
+ *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
+ *__user_pgd(pgd) = *(pgd);
+ }
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ return (pud_t *)pmd_alloc_one(mm, addr);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
+ __pmd_free(virt_to_page(pud));
+}
+
+extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
+#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* PAGETABLE_LEVELS > 2 */
+
+#endif /* _ASM_X86_PGALLOC_H */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgalloc_32.h 2010-03-24 15:10:37.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,111 +0,0 @@
-#ifndef _I386_PGALLOC_H
-#define _I386_PGALLOC_H
-
-#include <linux/threads.h>
-#include <linux/mm.h> /* for struct page */
-#include <linux/pagemap.h>
-#include <asm/tlb.h>
-#include <asm-generic/tlb.h>
-#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
-
-#define paravirt_alloc_pt(mm, pfn) do { } while (0)
-#define paravirt_alloc_pd(mm, pfn) do { } while (0)
-#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
-#define paravirt_release_pt(pfn) do { } while (0)
-#define paravirt_release_pd(pfn) do { } while (0)
-
-static inline void pmd_populate_kernel(struct mm_struct *mm,
- pmd_t *pmd, pte_t *pte)
-{
- paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
- set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
-}
-
-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
-{
- unsigned long pfn = page_to_pfn(pte);
-
- paravirt_alloc_pt(mm, pfn);
- if (PagePinned(virt_to_page(mm->pgd))) {
- if (!PageHighMem(pte))
- BUG_ON(HYPERVISOR_update_va_mapping(
- (unsigned long)__va(pfn << PAGE_SHIFT),
- pfn_pte(pfn, PAGE_KERNEL_RO), 0));
- else if (!test_and_set_bit(PG_pinned, &pte->flags))
- kmap_flush_unused();
- set_pmd(pmd, __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
- } else
- *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
-}
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
-/*
- * Allocate and free page tables.
- */
-extern void pgd_test_and_unpin(pgd_t *);
-extern pgd_t *pgd_alloc(struct mm_struct *);
-extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
-
-extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
-extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
- free_page((unsigned long)pte);
-}
-
-extern void __pte_free(pgtable_t);
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
- __pte_free(pte);
-}
-
-
-extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
-
-#ifdef CONFIG_X86_PAE
-/*
- * In the PAE case we free the pmds as part of the pgd.
- */
-extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long);
-
-extern void __pmd_free(pgtable_t);
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
- __pmd_free(virt_to_page(pmd));
-}
-
-extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
-
-static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
-{
- struct page *page = virt_to_page(pmd);
- unsigned long pfn = page_to_pfn(page);
-
- paravirt_alloc_pd(mm, pfn);
-
- /* Note: almost everything apart from _PAGE_PRESENT is
- reserved at the pmd (PDPT) level. */
- if (PagePinned(virt_to_page(mm->pgd))) {
- BUG_ON(PageHighMem(page));
- BUG_ON(HYPERVISOR_update_va_mapping(
- (unsigned long)__va(pfn << PAGE_SHIFT),
- pfn_pte(pfn, PAGE_KERNEL_RO), 0));
- set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
- } else
- *pudp = __pud(__pa(pmd) | _PAGE_PRESENT);
-
- /*
- * According to Intel App note "TLBs, Paging-Structure Caches,
- * and Their Invalidation", April 2007, document 317080-001,
- * section 8.1: in PAE mode we explicitly have to flush the
- * TLB via cr3 if the top-level pgd is changed...
- */
- if (mm == current->active_mm)
- xen_tlb_flush();
-}
-#endif /* CONFIG_X86_PAE */
-
-#endif /* _I386_PGALLOC_H */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgalloc_64.h 2010-03-24 15:10:37.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,179 +0,0 @@
-#ifndef _X86_64_PGALLOC_H
-#define _X86_64_PGALLOC_H
-
-#include <asm/pda.h>
-#include <linux/threads.h>
-#include <linux/mm.h>
-#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
-
-pmd_t *early_get_pmd(unsigned long va);
-void early_make_page_readonly(void *va, unsigned int feature);
-
-#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
-
-#define pmd_populate_kernel(mm, pmd, pte) \
- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
-
-static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
-{
- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
- BUG_ON(HYPERVISOR_update_va_mapping(
- (unsigned long)pmd,
- pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
- PAGE_KERNEL_RO), 0));
- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
- } else {
- *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
- }
-}
-
-/*
- * We need to use the batch mode here, but pgd_pupulate() won't be
- * be called frequently.
- */
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
-{
- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
- BUG_ON(HYPERVISOR_update_va_mapping(
- (unsigned long)pud,
- pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
- PAGE_KERNEL_RO), 0));
- set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
- set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
- } else {
- *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
- *(__user_pgd(pgd)) = *(pgd);
- }
-}
-
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
-{
- if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) {
- BUG_ON(HYPERVISOR_update_va_mapping(
- (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
- pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
- set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
- } else {
- *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
- }
-}
-
-extern void __pmd_free(pgtable_t);
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
- __pmd_free(virt_to_page(pmd));
-}
-
-extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
-
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- return (pud_t *)pmd_alloc_one(mm, addr);
-}
-
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
-{
- BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
- __pmd_free(virt_to_page(pud));
-}
-
-static inline void pgd_list_add(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
- unsigned long flags;
-
- spin_lock_irqsave(&pgd_lock, flags);
- list_add(&page->lru, &pgd_list);
- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-static inline void pgd_list_del(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
- unsigned long flags;
-
- spin_lock_irqsave(&pgd_lock, flags);
- list_del(&page->lru);
- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-extern void pgd_test_and_unpin(pgd_t *);
-
-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
-{
- /*
- * We allocate two contiguous pages for kernel and user.
- */
- unsigned boundary;
- pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
- if (!pgd)
- return NULL;
- pgd_list_add(pgd);
- pgd_test_and_unpin(pgd);
- /*
- * Copy kernel pointers in from init.
- * Could keep a freelist or slab cache of those because the kernel
- * part never changes.
- */
- boundary = pgd_index(__PAGE_OFFSET);
- memset(pgd, 0, boundary * sizeof(pgd_t));
- memcpy(pgd + boundary,
- init_level4_pgt + boundary,
- (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
-
- memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
- /*
- * Set level3_user_pgt for vsyscall area
- */
- __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
- __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
- return pgd;
-}
-
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- pgd_test_and_unpin(pgd);
- pgd_list_del(pgd);
- free_pages((unsigned long)pgd, 1);
-}
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-{
- pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
- if (pte)
- make_page_readonly(pte, XENFEAT_writable_page_tables);
-
- return pte;
-}
-
-extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
-
-/* Should really implement gc for free page table pages. This could be
- done with a reference count in struct page. */
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
- make_page_writable(pte, XENFEAT_writable_page_tables);
- free_page((unsigned long)pte);
-}
-
-extern void __pte_free(pgtable_t);
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
- __pte_free(pte);
-}
-
-#define __pte_free_tlb(tlb,pte) \
-do { \
- pgtable_page_dtor((pte)); \
- tlb_remove_page((tlb), (pte)); \
-} while (0)
-
-#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
-#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
-
-#endif /* _X86_64_PGALLOC_H */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable.h 2010-03-24 15:12:36.000000000 +0100
@@ -1,17 +1,15 @@
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H
-#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
#define FIRST_USER_ADDRESS 0
-#define _PAGE_BIT_PRESENT 0
-#define _PAGE_BIT_RW 1
-#define _PAGE_BIT_USER 2
-#define _PAGE_BIT_PWT 3
-#define _PAGE_BIT_PCD 4
-#define _PAGE_BIT_ACCESSED 5
-#define _PAGE_BIT_DIRTY 6
-#define _PAGE_BIT_FILE 6
+#define _PAGE_BIT_PRESENT 0 /* is present */
+#define _PAGE_BIT_RW 1 /* writeable */
+#define _PAGE_BIT_USER 2 /* userspace addressable */
+#define _PAGE_BIT_PWT 3 /* page write through */
+#define _PAGE_BIT_PCD 4 /* page cache disabled */
+#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
+#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT 7 /* on 4KB pages */
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
@@ -22,6 +20,14 @@
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+/* If _PAGE_BIT_PRESENT is clear, we use these: */
+
+/* set: nonlinear file mapping, saved PTE; unset:swap */
+#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
+
+/* if the user mapped it with PROT_NONE; pte_present gives true */
+#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
+
/*
* Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
* sign-extended value on 32-bit with all 1's in the upper word,
@@ -48,10 +54,8 @@
#define _PAGE_NX 0
#endif
-/* If _PAGE_PRESENT is clear, we use these: */
-#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */
-#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
- pte_present gives true */
+#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
+#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
#ifndef __ASSEMBLY__
#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
@@ -61,20 +65,42 @@ extern unsigned int __kernel_page_user;
#endif
#endif
-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
+ _PAGE_DIRTY | __kernel_page_user)
+
+/* Set of bits not changed in pte_modify */
+#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO)
+/*
+ * PAT settings are part of the hypervisor interface, which sets the
+ * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
+ */
+#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
+#define _PAGE_CACHE_WB (0)
+#define _PAGE_CACHE_WT (_PAGE_PWT)
+#define _PAGE_CACHE_WC (_PAGE_PAT)
+#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT)
+#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
+#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
-#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
+ _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
#define PAGE_COPY PAGE_COPY_NOEXEC
-#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
#ifdef CONFIG_X86_32
#define _PAGE_KERNEL_EXEC \
@@ -93,6 +119,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
@@ -109,6 +136,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
+#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
@@ -142,7 +170,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
*/
-extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
extern spinlock_t pgd_lock;
@@ -152,30 +180,111 @@ extern struct list_head pgd_list;
* The following only work if pte_present() is true.
* Undefined behaviour if not..
*/
-static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
-static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
-static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
-static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; }
-static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; }
-static inline int pte_global(pte_t pte) { return 0; }
-static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
-
-static inline int pmd_large(pmd_t pte) {
- return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
- (_PAGE_PSE|_PAGE_PRESENT);
-}
-
-static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
-static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
-static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); }
-static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); }
-static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); }
-static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); }
-static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); }
-static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); }
-static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
-static inline pte_t pte_mkglobal(pte_t pte) { return pte; }
-static inline pte_t pte_clrglobal(pte_t pte) { return pte; }
+static inline int pte_dirty(pte_t pte)
+{
+ return __pte_val(pte) & _PAGE_DIRTY;
+}
+
+static inline int pte_young(pte_t pte)
+{
+ return __pte_val(pte) & _PAGE_ACCESSED;
+}
+
+static inline int pte_write(pte_t pte)
+{
+ return __pte_val(pte) & _PAGE_RW;
+}
+
+static inline int pte_file(pte_t pte)
+{
+ return __pte_val(pte) & _PAGE_FILE;
+}
+
+static inline int pte_huge(pte_t pte)
+{
+ return __pte_val(pte) & _PAGE_PSE;
+}
+
+static inline int pte_global(pte_t pte)
+{
+ return 0;
+}
+
+static inline int pte_exec(pte_t pte)
+{
+ return !(__pte_val(pte) & _PAGE_NX);
+}
+
+static inline int pte_special(pte_t pte)
+{
+ return 0;
+}
+
+static inline int pmd_large(pmd_t pte)
+{
+ return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ (_PAGE_PSE | _PAGE_PRESENT);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
+}
+
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
+}
+
+static inline pte_t pte_mkexec(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) | _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) | _PAGE_RW);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) | _PAGE_PSE);
+}
+
+static inline pte_t pte_clrhuge(pte_t pte)
+{
+ return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
+}
+
+static inline pte_t pte_mkglobal(pte_t pte)
+{
+ return pte;
+}
+
+static inline pte_t pte_clrglobal(pte_t pte)
+{
+ return pte;
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+ return pte;
+}
extern pteval_t __supported_pte_mask;
@@ -202,15 +311,33 @@ static inline pte_t pte_modify(pte_t pte
pteval_t val = pte_val(pte);
val &= _PAGE_CHG_MASK;
- val |= pgprot_val(newprot) & __supported_pte_mask;
+ val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
return __pte(val);
}
-#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
+/* mprotect needs to preserve PAT bits when updating vm_page_prot */
+#define pgprot_modify pgprot_modify
+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+{
+ pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
+ pgprotval_t addbits = pgprot_val(newprot);
+ return __pgprot(preservebits | addbits);
+}
+
+#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
+#ifndef __ASSEMBLY__
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+struct file;
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot);
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t *vma_prot);
+#endif
+
#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
@@ -246,6 +373,9 @@ static inline pte_t pte_modify(pte_t pte
# include "pgtable_64.h"
#endif
+#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
+#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
+
#ifndef __ASSEMBLY__
enum {
@@ -312,46 +442,17 @@ static inline void xen_pte_clear(struct
* bit at the same time.
*/
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
-({ \
- int __changed = !pte_same(*(ptep), entry); \
- if (__changed && (dirty)) { \
- if ( likely((vma)->vm_mm == current->mm) ) { \
- BUG_ON(HYPERVISOR_update_va_mapping(address, \
- entry, \
- uvm_multi((vma)->vm_mm->cpu_vm_mask) | \
- UVMF_INVLPG)); \
- } else { \
- xen_l1_entry_update(ptep, entry); \
- flush_tlb_page(vma, address); \
- } \
- } \
- __changed; \
-})
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
- int __ret = 0; \
- if (pte_young(*(ptep))) \
- __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
- &(ptep)->pte); \
- if (__ret) \
- pte_update((vma)->vm_mm, addr, ptep); \
- __ret; \
-})
+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define ptep_clear_flush_young(vma, address, ptep) \
-({ \
- pte_t __pte = *(ptep); \
- int __young = pte_young(__pte); \
- __pte = pte_mkold(__pte); \
- if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \
- (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
- else if (__young) \
- (ptep)->pte_low = __pte.pte_low; \
- __young; \
-})
+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
#define ptep_clear_flush(vma, addr, ptep) \
@@ -370,7 +471,8 @@ static inline void xen_pte_clear(struct
})
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
{
pte_t pte = *ptep;
if (!pte_none(pte)
@@ -398,13 +500,29 @@ static inline pte_t ptep_get_and_clear(s
pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
if (pte_write(pte))
set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
}
+/*
+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+ *
+ * dst - pointer to pgd range anwhere on a pgd page
+ * src - ""
+ * count - the number of pgds to copy.
+ *
+ * dst and src can be on the same page, but the range must not overlap,
+ * and must not cross a page boundary.
+ */
+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+{
+ memcpy(dst, src, count * sizeof(pgd_t));
+}
+
#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable-3level.h 2010-03-24 15:12:36.000000000 +0100
@@ -8,25 +8,28 @@
* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
*/
-#define pte_ERROR(e) \
- printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", __FILE__, __LINE__, \
- &(e), __pte_val(e), pte_pfn(e))
-#define pmd_ERROR(e) \
- printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
- &(e), __pmd_val(e), (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
-#define pgd_ERROR(e) \
- printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \
- &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
-
+#define pte_ERROR(e) \
+ printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", \
+ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
+#define pmd_ERROR(e) \
+ printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
+ __FILE__, __LINE__, &(e), __pmd_val(e), \
+ (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
+#define pgd_ERROR(e) \
+ printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
+ __FILE__, __LINE__, &(e), __pgd_val(e), \
+ (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
static inline int pud_none(pud_t pud)
{
return __pud_val(pud) == 0;
+
}
static inline int pud_bad(pud_t pud)
{
return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
}
+
static inline int pud_present(pud_t pud)
{
return __pud_val(pud) & _PAGE_PRESENT;
@@ -48,12 +51,14 @@ static inline void xen_set_pte(pte_t *pt
static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
- set_64bit((unsigned long long *)(ptep),__pte_val(pte));
+ set_64bit((unsigned long long *)(ptep), __pte_val(pte));
}
+
static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
xen_l2_entry_update(pmdp, pmd);
}
+
static inline void xen_set_pud(pud_t *pudp, pud_t pud)
{
xen_l3_entry_update(pudp, pud);
@@ -92,20 +97,19 @@ static inline void pud_clear(pud_t *pudp
* current pgd to avoid unnecessary TLB flushes.
*/
pgd = read_cr3();
- if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
+ if (__pa(pudp) >= pgd && __pa(pudp) <
+ (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
xen_tlb_flush();
}
-#define pud_page(pud) \
-((struct page *) __va(pud_val(pud) & PAGE_MASK))
+#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
-#define pud_page_vaddr(pud) \
-((unsigned long) __va(pud_val(pud) & PAGE_MASK))
+#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
/* Find an entry in the second-level page table.. */
-#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
- pmd_index(address))
+#define pmd_offset(pud, address) ((pmd_t *)pud_page(*(pud)) + \
+ pmd_index(address))
#ifdef CONFIG_SMP
static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
@@ -150,7 +154,8 @@ static inline int pte_none(pte_t pte)
* put the 32 bits of offset into the high part.
*/
#define pte_to_pgoff(pte) ((pte).pte_high)
-#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
+#define pgoff_to_pte(off) \
+ ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
#define PTE_FILE_MAX_BITS 32
/* Encode and de-code a swap entry */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable_32.h 2010-03-24 15:12:36.000000000 +0100
@@ -38,16 +38,13 @@ void paging_init(void);
#ifdef CONFIG_X86_PAE
# include <asm/pgtable-3level-defs.h>
# define PMD_SIZE (1UL << PMD_SHIFT)
-# define PMD_MASK (~(PMD_SIZE-1))
+# define PMD_MASK (~(PMD_SIZE - 1))
#else
# include <asm/pgtable-2level-defs.h>
#endif
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
-#define PGDIR_MASK (~(PGDIR_SIZE-1))
-
-#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
-#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
/* Just any arbitrary offset to the start of the vmalloc VM area: the
* current 8MB value just means that there will be a 8MB "hole" after the
@@ -56,21 +53,22 @@ void paging_init(void);
* The vmalloc() routines leaves a hole of 4kB between each vmalloced
* area for the same reason. ;)
*/
-#define VMALLOC_OFFSET (8*1024*1024)
-#define VMALLOC_START (((unsigned long) high_memory + \
- 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
+#define VMALLOC_OFFSET (8 * 1024 * 1024)
+#define VMALLOC_START (((unsigned long)high_memory + 2 * VMALLOC_OFFSET - 1) \
+ & ~(VMALLOC_OFFSET - 1))
#ifdef CONFIG_X86_PAE
#define LAST_PKMAP 512
#else
#define LAST_PKMAP 1024
#endif
-#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
+#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
+ & PMD_MASK)
#ifdef CONFIG_HIGHMEM
-# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
+# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
#else
-# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
+# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
#endif
/*
@@ -91,10 +89,10 @@ extern unsigned long pg0[];
/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
can temporarily clear it. */
#define pmd_present(x) (__pmd_val(x))
-#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
+#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
#else
#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
-#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
#endif
@@ -107,32 +105,18 @@ extern unsigned long pg0[];
#endif
/*
- * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
- *
- * dst - pointer to pgd range anwhere on a pgd page
- * src - ""
- * count - the number of pgds to copy.
- *
- * dst and src can be on the same page, but the range must not overlap,
- * and must not cross a page boundary.
+ * Macro to mark a page protection value as "uncacheable".
+ * On processors which do not support it, this is a no-op.
*/
-static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
-{
- memcpy(dst, src, count * sizeof(pgd_t));
-}
-
-/*
- * Macro to mark a page protection value as "uncacheable". On processors which do not support
- * it, this is a no-op.
- */
-#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \
- ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
+#define pgprot_noncached(prot) \
+ ((boot_cpu_data.x86 > 3) \
+ ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
+ : (prot))
/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
*/
-
#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
/*
@@ -141,20 +125,20 @@ static inline void clone_pgd_range(pgd_t
* this macro returns the index of the entry in the pgd page which would
* control the given virtual address
*/
-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
-#define pgd_index_k(addr) pgd_index(addr)
+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+#define pgd_index_k(addr) pgd_index((addr))
/*
* pgd_offset() returns a (pgd_t *)
* pgd_index() is used get the offset into the pgd page's array of pgd_t's;
*/
-#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
/*
* a shortcut which implies the use of the kernel's pgd, instead
* of a process's
*/
-#define pgd_offset_k(address) pgd_offset(&init_mm, address)
+#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
static inline int pud_large(pud_t pud) { return 0; }
@@ -164,8 +148,8 @@ static inline int pud_large(pud_t pud) {
* this macro returns the index of the entry in the pmd page which would
* control the given virtual address
*/
-#define pmd_index(address) \
- (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
+#define pmd_index(address) \
+ (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
/*
* the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
@@ -173,33 +157,36 @@ static inline int pud_large(pud_t pud) {
* this macro returns the index of the entry in the pte page which would
* control the given virtual address
*/
-#define pte_index(address) \
- (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset_kernel(dir, address) \
- ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
+#define pte_index(address) \
+ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+#define pte_offset_kernel(dir, address) \
+ ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
-#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
+#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
-#define pmd_page_vaddr(pmd) \
- ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
+#define pmd_page_vaddr(pmd) \
+ ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
#if defined(CONFIG_HIGHPTE)
-#define pte_offset_map(dir, address) \
- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
-#define pte_offset_map_nested(dir, address) \
- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
-#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
-#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
-#else
-#define pte_offset_map(dir, address) \
- ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
-#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
+#define pte_offset_map(dir, address) \
+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
+ pte_index((address)))
+#define pte_offset_map_nested(dir, address) \
+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
+ pte_index((address)))
+#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
+#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
+#else
+#define pte_offset_map(dir, address) \
+ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
+#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
#define pte_unmap(pte) do { } while (0)
#define pte_unmap_nested(pte) do { } while (0)
#endif
/* Clear a kernel PTE and flush it from the TLB */
-#define kpte_clear_flush(ptep, vaddr) do { \
+#define kpte_clear_flush(ptep, vaddr) \
+do { \
if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
BUG(); \
} while (0)
@@ -208,7 +195,7 @@ static inline int pud_large(pud_t pud) {
* The i386 doesn't have any external MMU info: the kernel page
* tables contain all the necessary information.
*/
-#define update_mmu_cache(vma,address,pte) do { } while (0)
+#define update_mmu_cache(vma, address, pte) do { } while (0)
void make_lowmem_page_readonly(void *va, unsigned int feature);
void make_lowmem_page_writable(void *va, unsigned int feature);
@@ -225,7 +212,7 @@ void make_lowmem_page_writable(void *va,
#define kern_addr_valid(kaddr) (0)
#endif
-#define io_remap_pfn_range(vma,from,pfn,size,prot) \
-direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
+#define io_remap_pfn_range(vma, from, pfn, size, prot) \
+ direct_remap_pfn_range(vma, from, pfn, size, prot, DOMID_IO)
#endif /* _I386_PGTABLE_H */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/pgtable_64.h 2010-03-24 15:12:36.000000000 +0100
@@ -31,7 +31,7 @@ extern void paging_init(void);
#endif /* !__ASSEMBLY__ */
-#define SHARED_KERNEL_PMD 1
+#define SHARED_KERNEL_PMD 0
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
@@ -59,18 +59,20 @@ extern void paging_init(void);
#ifndef __ASSEMBLY__
-#define pte_ERROR(e) \
- printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
- &(e), __pte_val(e), pte_pfn(e))
-#define pmd_ERROR(e) \
- printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
- &(e), __pmd_val(e), pmd_pfn(e))
-#define pud_ERROR(e) \
- printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
- &(e), __pud_val(e), (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
-#define pgd_ERROR(e) \
- printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
- &(e), __pgd_val(e), (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+#define pte_ERROR(e) \
+ printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \
+ __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
+#define pmd_ERROR(e) \
+ printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \
+ __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e))
+#define pud_ERROR(e) \
+ printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", \
+ __FILE__, __LINE__, &(e), __pud_val(e), \
+ (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+#define pgd_ERROR(e) \
+ printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", \
+ __FILE__, __LINE__, &(e), __pgd_val(e), \
+ (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
#define pgd_none(x) (!__pgd_val(x))
#define pud_none(x) (!__pud_val(x))
@@ -125,7 +127,7 @@ static inline void xen_set_pgd(pgd_t *pg
xen_l4_entry_update(pgdp, pgd);
}
-static inline void xen_pgd_clear(pgd_t * pgd)
+static inline void xen_pgd_clear(pgd_t *pgd)
{
xen_set_pgd(pgd, xen_make_pgd(0));
xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0));
@@ -135,43 +137,43 @@ static inline void xen_pgd_clear(pgd_t *
#endif /* !__ASSEMBLY__ */
-#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT)
-#define PMD_MASK (~(PMD_SIZE-1))
-#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT)
-#define PUD_MASK (~(PUD_SIZE-1))
-#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
-#define PGDIR_MASK (~(PGDIR_SIZE-1))
+#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_MASK (~(PMD_SIZE - 1))
+#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_MASK (~(PUD_SIZE - 1))
+#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-#define MAXMEM _AC(0x6fffffffff, UL)
+#define MAXMEM _AC(0x0000006fffffffff, UL)
#define VMALLOC_START _AC(0xffffc20000000000, UL)
#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
#define VMEMMAP_START _AC(0xffffe20000000000, UL)
-#define MODULES_VADDR _AC(0xffffffff88000000, UL)
+#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
#ifndef __ASSEMBLY__
-static inline unsigned long pgd_bad(pgd_t pgd)
+static inline int pgd_bad(pgd_t pgd)
{
- return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+ return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
}
-static inline unsigned long pud_bad(pud_t pud)
+static inline int pud_bad(pud_t pud)
{
- return __pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+ return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
}
-static inline unsigned long pmd_bad(pmd_t pmd)
+static inline int pmd_bad(pmd_t pmd)
{
- return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+ return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
}
#define pte_none(x) (!(x).pte)
#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */
+#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
#define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
@@ -181,13 +183,13 @@ static inline unsigned long pmd_bad(pmd_
mfn_to_local_pfn(__pte_mfn(_pte)) : \
__pte_mfn(_pte))
-#define pte_page(x) pfn_to_page(pte_pfn(x))
+#define pte_page(x) pfn_to_page(pte_pfn((x)))
/*
* Macro to mark a page protection value as "uncacheable".
*/
-#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
-
+#define pgprot_noncached(prot) \
+ (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
/*
* Conversion functions: convert a page and protection to a page entry,
@@ -197,36 +199,39 @@ static inline unsigned long pmd_bad(pmd_
/*
* Level 4 access.
*/
-#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
-#define pgd_page(pgd) (pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
-#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
-#define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
+#define pgd_page_vaddr(pgd) \
+ ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
+#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
#define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
static inline int pgd_large(pgd_t pgd) { return 0; }
#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
/* PUD - Level3 access */
/* to find an entry in a page-table-directory. */
-#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
-#define pud_page(pud) (pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
-#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
+#define pud_page_vaddr(pud) \
+ ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
+#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
+#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+#define pud_offset(pgd, address) \
+ ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
#define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
static inline int pud_large(pud_t pte)
{
- return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
- (_PAGE_PSE|_PAGE_PRESENT);
+ return (__pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ (_PAGE_PSE | _PAGE_PRESENT);
}
/* PMD - Level 2 access */
-#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
-#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
+#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
-#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
-#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \
- pmd_index(address))
+#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
+ pmd_index(address))
#define pmd_none(x) (!__pmd_val(x))
#if CONFIG_XEN_COMPAT <= 0x030002
/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
@@ -235,43 +240,56 @@ static inline int pud_large(pud_t pte)
#else
#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
#endif
-#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
-#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
+#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
-#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
+#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
+ _PAGE_FILE })
#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
/* PTE - Level 1 access. */
/* page, protection -> pte */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
-#define pte_index(address) \
- (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
+
+#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
- pte_index(address))
+ pte_index((address)))
/* x86-64 always has all page tables mapped. */
-#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
-#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
+#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
+#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
#define pte_unmap(pte) /* NOP */
-#define pte_unmap_nested(pte) /* NOP */
+#define pte_unmap_nested(pte) /* NOP */
+
+#define update_mmu_cache(vma, address, pte) do { } while (0)
-#define update_mmu_cache(vma,address,pte) do { } while (0)
+#define direct_gbpages 0
/* Encode and de-code a swap entry */
-#define __swp_type(x) (((x).val >> 1) & 0x3f)
-#define __swp_offset(x) ((x).val >> 8)
-#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
+#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
+#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+#else
+#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
+#endif
+
+#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
+ & ((1U << SWP_TYPE_BITS) - 1))
+#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ ((type) << (_PAGE_BIT_PRESENT + 1)) \
+ | ((offset) << SWP_OFFSET_SHIFT) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
-extern int kern_addr_valid(unsigned long addr);
+extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
- direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+ direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
@@ -284,8 +302,10 @@ extern void cleanup_highmap(void);
/* fs/proc/kcore.c */
#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
-#define kc_offset_to_vaddr(o) \
- (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
+#define kc_offset_to_vaddr(o) \
+ (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
+ ? ((o) | ~__VIRTUAL_MASK) \
+ : (o))
#define __HAVE_ARCH_PTE_SAME
#endif /* !__ASSEMBLY__ */
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/processor.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/processor.h 2010-03-24 15:12:36.000000000 +0100
@@ -3,10 +3,6 @@
#include <asm/processor-flags.h>
-/* migration helpers, for KVM - will be removed in 2.6.25: */
-#include <asm/vm86.h>
-#define Xgt_desc_struct desc_ptr
-
/* Forward declaration, a strange C thing */
struct task_struct;
struct mm_struct;
@@ -24,6 +20,7 @@ struct mm_struct;
#include <asm/msr.h>
#include <asm/desc_defs.h>
#include <asm/nops.h>
+
#include <linux/personality.h>
#include <linux/cpumask.h>
#include <linux/cache.h>
@@ -38,16 +35,18 @@ struct mm_struct;
static inline void *current_text_addr(void)
{
void *pc;
- asm volatile("mov $1f,%0\n1:":"=r" (pc));
+
+ asm volatile("mov $1f, %0; 1:":"=r" (pc));
+
return pc;
}
#ifdef CONFIG_X86_VSMP
-#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
-#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
+# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
+# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
#else
-#define ARCH_MIN_TASKALIGN 16
-#define ARCH_MIN_MMSTRUCT_ALIGN 0
+# define ARCH_MIN_TASKALIGN 16
+# define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
/*
@@ -57,68 +56,80 @@ static inline void *current_text_addr(vo
*/
struct cpuinfo_x86 {
- __u8 x86; /* CPU family */
- __u8 x86_vendor; /* CPU vendor */
- __u8 x86_model;
- __u8 x86_mask;
+ __u8 x86; /* CPU family */
+ __u8 x86_vendor; /* CPU vendor */
+ __u8 x86_model;
+ __u8 x86_mask;
#ifdef CONFIG_X86_32
- char wp_works_ok; /* It doesn't on 386's */
- char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
- char hard_math;
- char rfu;
- char fdiv_bug;
- char f00f_bug;
- char coma_bug;
- char pad0;
+ char wp_works_ok; /* It doesn't on 386's */
+
+ /* Problems on some 486Dx4's and old 386's: */
+ char hlt_works_ok;
+ char hard_math;
+ char rfu;
+ char fdiv_bug;
+ char f00f_bug;
+ char coma_bug;
+ char pad0;
#else
- /* number of 4K pages in DTLB/ITLB combined(in pages)*/
- int x86_tlbsize;
- __u8 x86_virt_bits, x86_phys_bits;
- /* cpuid returned core id bits */
- __u8 x86_coreid_bits;
- /* Max extended CPUID function supported */
- __u32 extended_cpuid_level;
-#endif
- int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
- __u32 x86_capability[NCAPINTS];
- char x86_vendor_id[16];
- char x86_model_id[64];
- int x86_cache_size; /* in KB - valid for CPUS which support this
- call */
- int x86_cache_alignment; /* In bytes */
- int x86_power;
- unsigned long loops_per_jiffy;
+ /* Number of 4K pages in DTLB/ITLB combined(in pages): */
+ int x86_tlbsize;
+ __u8 x86_virt_bits;
+ __u8 x86_phys_bits;
+ /* CPUID returned core id bits: */
+ __u8 x86_coreid_bits;
+ /* Max extended CPUID function supported: */
+ __u32 extended_cpuid_level;
+#endif
+ /* Maximum supported CPUID level, -1=no CPUID: */
+ int cpuid_level;
+ __u32 x86_capability[NCAPINTS];
+ char x86_vendor_id[16];
+ char x86_model_id[64];
+ /* in KB - valid for CPUS which support this call: */
+ int x86_cache_size;
+ int x86_cache_alignment; /* In bytes */
+ int x86_power;
+ unsigned long loops_per_jiffy;
#ifdef CONFIG_SMP
- cpumask_t llc_shared_map; /* cpus sharing the last level cache */
+ /* cpus sharing the last level cache: */
+ cpumask_t llc_shared_map;
#endif
- u16 x86_max_cores; /* cpuid returned max cores value */
- u16 apicid;
- u16 x86_clflush_size;
+ /* cpuid returned max cores value: */
+ u16 x86_max_cores;
+ u16 apicid;
+ u16 initial_apicid;
+ u16 x86_clflush_size;
#ifdef CONFIG_SMP
- u16 booted_cores; /* number of cores as seen by OS */
- u16 phys_proc_id; /* Physical processor id. */
- u16 cpu_core_id; /* Core id */
- u16 cpu_index; /* index into per_cpu list */
+ /* number of cores as seen by the OS: */
+ u16 booted_cores;
+ /* Physical processor id: */
+ u16 phys_proc_id;
+ /* Core id: */
+ u16 cpu_core_id;
+ /* Index into per_cpu list: */
+ u16 cpu_index;
#endif
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
-#define X86_VENDOR_INTEL 0
-#define X86_VENDOR_CYRIX 1
-#define X86_VENDOR_AMD 2
-#define X86_VENDOR_UMC 3
-#define X86_VENDOR_NEXGEN 4
-#define X86_VENDOR_CENTAUR 5
-#define X86_VENDOR_TRANSMETA 7
-#define X86_VENDOR_NSC 8
-#define X86_VENDOR_NUM 9
-#define X86_VENDOR_UNKNOWN 0xff
+#define X86_VENDOR_INTEL 0
+#define X86_VENDOR_CYRIX 1
+#define X86_VENDOR_AMD 2
+#define X86_VENDOR_UMC 3
+#define X86_VENDOR_CENTAUR 5
+#define X86_VENDOR_TRANSMETA 7
+#define X86_VENDOR_NSC 8
+#define X86_VENDOR_NUM 9
+
+#define X86_VENDOR_UNKNOWN 0xff
/*
* capabilities of CPUs
*/
-extern struct cpuinfo_x86 boot_cpu_data;
-extern struct cpuinfo_x86 new_cpu_data;
-extern __u32 cleared_cpu_caps[NCAPINTS];
+extern struct cpuinfo_x86 boot_cpu_data;
+extern struct cpuinfo_x86 new_cpu_data;
+
+extern __u32 cleared_cpu_caps[NCAPINTS];
#ifdef CONFIG_SMP
DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
@@ -129,7 +140,18 @@ DECLARE_PER_CPU(struct cpuinfo_x86, cpu_
#define current_cpu_data boot_cpu_data
#endif
-void cpu_detect(struct cpuinfo_x86 *c);
+static inline int hlt_works(int cpu)
+{
+#ifdef CONFIG_X86_32
+ return cpu_data(cpu).hlt_works_ok;
+#else
+ return 1;
+#endif
+}
+
+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
+
+extern void cpu_detect(struct cpuinfo_x86 *c);
extern void identify_cpu(struct cpuinfo_x86 *);
extern void identify_boot_cpu(void);
@@ -149,12 +171,12 @@ static inline void xen_cpuid(unsigned in
unsigned int *ecx, unsigned int *edx)
{
/* ecx is often an input as well as an output. */
- __asm__(XEN_CPUID
- : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
- : "0" (*eax), "2" (*ecx));
+ asm(XEN_CPUID
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
}
static inline void load_cr3(pgd_t *pgdir)
@@ -166,57 +188,70 @@ static inline void load_cr3(pgd_t *pgdir
#ifdef CONFIG_X86_32
/* This is the TSS defined by the hardware. */
struct x86_hw_tss {
- unsigned short back_link, __blh;
- unsigned long sp0;
- unsigned short ss0, __ss0h;
- unsigned long sp1;
- unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */
- unsigned long sp2;
- unsigned short ss2, __ss2h;
- unsigned long __cr3;
- unsigned long ip;
- unsigned long flags;
- unsigned long ax, cx, dx, bx;
- unsigned long sp, bp, si, di;
- unsigned short es, __esh;
- unsigned short cs, __csh;
- unsigned short ss, __ssh;
- unsigned short ds, __dsh;
- unsigned short fs, __fsh;
- unsigned short gs, __gsh;
- unsigned short ldt, __ldth;
- unsigned short trace, io_bitmap_base;
+ unsigned short back_link, __blh;
+ unsigned long sp0;
+ unsigned short ss0, __ss0h;
+ unsigned long sp1;
+ /* ss1 caches MSR_IA32_SYSENTER_CS: */
+ unsigned short ss1, __ss1h;
+ unsigned long sp2;
+ unsigned short ss2, __ss2h;
+ unsigned long __cr3;
+ unsigned long ip;
+ unsigned long flags;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long bx;
+ unsigned long sp;
+ unsigned long bp;
+ unsigned long si;
+ unsigned long di;
+ unsigned short es, __esh;
+ unsigned short cs, __csh;
+ unsigned short ss, __ssh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+ unsigned short ldt, __ldth;
+ unsigned short trace;
+ unsigned short io_bitmap_base;
+
} __attribute__((packed));
extern struct tss_struct doublefault_tss;
#else
struct x86_hw_tss {
- u32 reserved1;
- u64 sp0;
- u64 sp1;
- u64 sp2;
- u64 reserved2;
- u64 ist[7];
- u32 reserved3;
- u32 reserved4;
- u16 reserved5;
- u16 io_bitmap_base;
+ u32 reserved1;
+ u64 sp0;
+ u64 sp1;
+ u64 sp2;
+ u64 reserved2;
+ u64 ist[7];
+ u32 reserved3;
+ u32 reserved4;
+ u16 reserved5;
+ u16 io_bitmap_base;
+
} __attribute__((packed)) ____cacheline_aligned;
#endif
#endif /* CONFIG_X86_NO_TSS */
/*
- * Size of io_bitmap.
+ * IO-bitmap sizes:
*/
-#define IO_BITMAP_BITS 65536
-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
-#define INVALID_IO_BITMAP_OFFSET 0x8000
-#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
+#define IO_BITMAP_BITS 65536
+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
+#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
+#define INVALID_IO_BITMAP_OFFSET 0x8000
+#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
#ifndef CONFIG_X86_NO_TSS
struct tss_struct {
- struct x86_hw_tss x86_tss;
+ /*
+ * The hardware state:
+ */
+ struct x86_hw_tss x86_tss;
/*
* The extra 1 is there because the CPU will access an
@@ -224,135 +259,161 @@ struct tss_struct {
* bitmap. The extra byte must be all 1 bits, and must
* be within the limit.
*/
- unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
/*
* Cache the current maximum and the last task that used the bitmap:
*/
- unsigned long io_bitmap_max;
- struct thread_struct *io_bitmap_owner;
+ unsigned long io_bitmap_max;
+ struct thread_struct *io_bitmap_owner;
+
/*
- * pads the TSS to be cacheline-aligned (size is 0x100)
+ * Pad the TSS to be cacheline-aligned (size is 0x100):
*/
- unsigned long __cacheline_filler[35];
+ unsigned long __cacheline_filler[35];
/*
- * .. and then another 0x100 bytes for emergency kernel stack
+ * .. and then another 0x100 bytes for the emergency kernel stack:
*/
- unsigned long stack[64];
+ unsigned long stack[64];
+
} __attribute__((packed));
DECLARE_PER_CPU(struct tss_struct, init_tss);
-/* Save the original ist values for checking stack pointers during debugging */
+/*
+ * Save the original ist values for checking stack pointers during debugging
+ */
struct orig_ist {
- unsigned long ist[7];
+ unsigned long ist[7];
};
#endif /* CONFIG_X86_NO_TSS */
#define MXCSR_DEFAULT 0x1f80
struct i387_fsave_struct {
- u32 cwd;
- u32 swd;
- u32 twd;
- u32 fip;
- u32 fcs;
- u32 foo;
- u32 fos;
- u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
- u32 status; /* software status information */
+ u32 cwd; /* FPU Control Word */
+ u32 swd; /* FPU Status Word */
+ u32 twd; /* FPU Tag Word */
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Pointer Offset */
+ u32 fos; /* FPU Operand Pointer Selector */
+
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+
+ /* Software status information [not touched by FSAVE ]: */
+ u32 status;
};
struct i387_fxsave_struct {
- u16 cwd;
- u16 swd;
- u16 twd;
- u16 fop;
+ u16 cwd; /* Control Word */
+ u16 swd; /* Status Word */
+ u16 twd; /* Tag Word */
+ u16 fop; /* Last Instruction Opcode */
union {
struct {
- u64 rip;
- u64 rdp;
+ u64 rip; /* Instruction Pointer */
+ u64 rdp; /* Data Pointer */
};
struct {
- u32 fip;
- u32 fcs;
- u32 foo;
- u32 fos;
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Offset */
+ u32 fos; /* FPU Operand Selector */
};
};
- u32 mxcsr;
- u32 mxcsr_mask;
- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
- u32 padding[24];
+ u32 mxcsr; /* MXCSR Register State */
+ u32 mxcsr_mask; /* MXCSR Mask */
+
+ /* 8*16 bytes for each FP-reg = 128 bytes: */
+ u32 st_space[32];
+
+ /* 16*16 bytes for each XMM-reg = 256 bytes: */
+ u32 xmm_space[64];
+
+ u32 padding[24];
+
} __attribute__((aligned(16)));
struct i387_soft_struct {
- u32 cwd;
- u32 swd;
- u32 twd;
- u32 fip;
- u32 fcs;
- u32 foo;
- u32 fos;
- u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
- u8 ftop, changed, lookahead, no_update, rm, alimit;
- struct info *info;
- u32 entry_eip;
+ u32 cwd;
+ u32 swd;
+ u32 twd;
+ u32 fip;
+ u32 fcs;
+ u32 foo;
+ u32 fos;
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+ u8 ftop;
+ u8 changed;
+ u8 lookahead;
+ u8 no_update;
+ u8 rm;
+ u8 alimit;
+ struct info *info;
+ u32 entry_eip;
};
-union i387_union {
+union thread_xstate {
struct i387_fsave_struct fsave;
struct i387_fxsave_struct fxsave;
- struct i387_soft_struct soft;
+ struct i387_soft_struct soft;
};
-#ifdef CONFIG_X86_32
-DECLARE_PER_CPU(u8, cpu_llc_id);
-#elif !defined(CONFIG_X86_NO_TSS)
+#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_TSS)
DECLARE_PER_CPU(struct orig_ist, orig_ist);
#endif
extern void print_cpu_info(struct cpuinfo_x86 *);
+extern unsigned int xstate_size;
+extern void free_thread_xstate(struct task_struct *);
+extern struct kmem_cache *task_xstate_cachep;
extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
extern unsigned short num_cache_leaves;
struct thread_struct {
-/* cached TLS descriptors. */
- struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
- unsigned long sp0;
- unsigned long sp;
+ /* Cached TLS descriptors: */
+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+ unsigned long sp0;
+ unsigned long sp;
#ifdef CONFIG_X86_32
- unsigned long sysenter_cs;
+ unsigned long sysenter_cs;
#else
- unsigned short es, ds, fsindex, gsindex;
-#endif
- unsigned long ip;
- unsigned long fs;
- unsigned long gs;
-/* Hardware debugging registers */
- unsigned long debugreg0;
- unsigned long debugreg1;
- unsigned long debugreg2;
- unsigned long debugreg3;
- unsigned long debugreg6;
- unsigned long debugreg7;
-/* fault info */
- unsigned long cr2, trap_no, error_code;
-/* floating point info */
- union i387_union i387 __attribute__((aligned(16)));;
+ unsigned short es;
+ unsigned short ds;
+ unsigned short fsindex;
+ unsigned short gsindex;
+#endif
+ unsigned long ip;
+ unsigned long fs;
+ unsigned long gs;
+ /* Hardware debugging registers: */
+ unsigned long debugreg0;
+ unsigned long debugreg1;
+ unsigned long debugreg2;
+ unsigned long debugreg3;
+ unsigned long debugreg6;
+ unsigned long debugreg7;
+ /* Fault info: */
+ unsigned long cr2;
+ unsigned long trap_no;
+ unsigned long error_code;
+ /* floating point and extended processor state */
+ union thread_xstate *xstate;
#ifdef CONFIG_X86_32
-/* virtual 86 mode info */
+ /* Virtual 86 mode info */
struct vm86_struct __user *vm86_info;
unsigned long screen_bitmap;
unsigned long v86flags, v86mask, saved_sp0;
unsigned int saved_fs, saved_gs;
#endif
-/* IO permissions */
- unsigned long *io_bitmap_ptr;
- unsigned long iopl;
-/* max allowed port in the bitmap, in bytes: */
- unsigned io_bitmap_max;
+ /* IO permissions: */
+ unsigned long *io_bitmap_ptr;
+ unsigned long iopl;
+ /* Max allowed port in the bitmap, in bytes: */
+ unsigned io_bitmap_max;
/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
unsigned long debugctlmsr;
/* Debug Store - if not 0 points to a DS Save Area configuration;
@@ -383,12 +444,12 @@ static inline void xen_set_iopl_mask(uns
}
#ifndef CONFIG_X86_NO_TSS
-static inline void native_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+static inline void
+native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
{
tss->x86_tss.sp0 = thread->sp0;
#ifdef CONFIG_X86_32
- /* Only happens when SEP is enabled, no need to test "SEP"arately */
+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
tss->x86_tss.ss1 = thread->sysenter_cs;
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
@@ -402,8 +463,8 @@ static inline void native_load_sp0(struc
} while (0)
#endif
-#define __cpuid xen_cpuid
-#define paravirt_enabled() 0
+#define __cpuid xen_cpuid
+#define paravirt_enabled() 0
/*
* These special macros can be used to get or set a debugging register
@@ -423,11 +484,12 @@ static inline void native_load_sp0(struc
* enable), so that any CPU's that boot up
* after us can get the correct flags.
*/
-extern unsigned long mmu_cr4_features;
+extern unsigned long mmu_cr4_features;
static inline void set_in_cr4(unsigned long mask)
{
unsigned cr4;
+
mmu_cr4_features |= mask;
cr4 = read_cr4();
cr4 |= mask;
@@ -437,6 +499,7 @@ static inline void set_in_cr4(unsigned l
static inline void clear_in_cr4(unsigned long mask)
{
unsigned cr4;
+
mmu_cr4_features &= ~mask;
cr4 = read_cr4();
cr4 &= ~mask;
@@ -444,42 +507,42 @@ static inline void clear_in_cr4(unsigned
}
struct microcode_header {
- unsigned int hdrver;
- unsigned int rev;
- unsigned int date;
- unsigned int sig;
- unsigned int cksum;
- unsigned int ldrver;
- unsigned int pf;
- unsigned int datasize;
- unsigned int totalsize;
- unsigned int reserved[3];
+ unsigned int hdrver;
+ unsigned int rev;
+ unsigned int date;
+ unsigned int sig;
+ unsigned int cksum;
+ unsigned int ldrver;
+ unsigned int pf;
+ unsigned int datasize;
+ unsigned int totalsize;
+ unsigned int reserved[3];
};
struct microcode {
- struct microcode_header hdr;
- unsigned int bits[0];
+ struct microcode_header hdr;
+ unsigned int bits[0];
};
-typedef struct microcode microcode_t;
-typedef struct microcode_header microcode_header_t;
+typedef struct microcode microcode_t;
+typedef struct microcode_header microcode_header_t;
/* microcode format is extended from prescott processors */
struct extended_signature {
- unsigned int sig;
- unsigned int pf;
- unsigned int cksum;
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int cksum;
};
struct extended_sigtable {
- unsigned int count;
- unsigned int cksum;
- unsigned int reserved[3];
+ unsigned int count;
+ unsigned int cksum;
+ unsigned int reserved[3];
struct extended_signature sigs[0];
};
typedef struct {
- unsigned long seg;
+ unsigned long seg;
} mm_segment_t;
@@ -491,7 +554,7 @@ extern int kernel_thread(int (*fn)(void
/* Free all resources held by a thread. */
extern void release_thread(struct task_struct *);
-/* Prepare to copy thread state - unlazy all lazy status */
+/* Prepare to copy thread state - unlazy all lazy state */
extern void prepare_to_copy(struct task_struct *tsk);
unsigned long get_wchan(struct task_struct *p);
@@ -528,118 +591,138 @@ static inline unsigned int cpuid_eax(uns
unsigned int eax, ebx, ecx, edx;
cpuid(op, &eax, &ebx, &ecx, &edx);
+
return eax;
}
+
static inline unsigned int cpuid_ebx(unsigned int op)
{
unsigned int eax, ebx, ecx, edx;
cpuid(op, &eax, &ebx, &ecx, &edx);
+
return ebx;
}
+
static inline unsigned int cpuid_ecx(unsigned int op)
{
unsigned int eax, ebx, ecx, edx;
cpuid(op, &eax, &ebx, &ecx, &edx);
+
return ecx;
}
+
static inline unsigned int cpuid_edx(unsigned int op)
{
unsigned int eax, ebx, ecx, edx;
cpuid(op, &eax, &ebx, &ecx, &edx);
+
return edx;
}
/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
static inline void rep_nop(void)
{
- __asm__ __volatile__("rep;nop": : :"memory");
+ asm volatile("rep; nop" ::: "memory");
}
-/* Stop speculative execution */
+static inline void cpu_relax(void)
+{
+ rep_nop();
+}
+
+/* Stop speculative execution: */
static inline void sync_core(void)
{
int tmp;
+
asm volatile("cpuid" : "=a" (tmp) : "0" (1)
- : "ebx", "ecx", "edx", "memory");
+ : "ebx", "ecx", "edx", "memory");
}
-#define cpu_relax() rep_nop()
-
static inline void __monitor(const void *eax, unsigned long ecx,
- unsigned long edx)
+ unsigned long edx)
{
- /* "monitor %eax,%ecx,%edx;" */
- asm volatile(
- ".byte 0x0f,0x01,0xc8;"
- : :"a" (eax), "c" (ecx), "d"(edx));
+ /* "monitor %eax, %ecx, %edx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xc8;"
+ :: "a" (eax), "c" (ecx), "d"(edx));
}
static inline void __mwait(unsigned long eax, unsigned long ecx)
{
- /* "mwait %eax,%ecx;" */
- asm volatile(
- ".byte 0x0f,0x01,0xc9;"
- : :"a" (eax), "c" (ecx));
+ /* "mwait %eax, %ecx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
}
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
- /* "mwait %eax,%ecx;" */
- asm volatile(
- "sti; .byte 0x0f,0x01,0xc9;"
- : :"a" (eax), "c" (ecx));
+ trace_hardirqs_on();
+ /* "mwait %eax, %ecx;" */
+ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
}
extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
-extern int force_mwait;
+extern int force_mwait;
extern void select_idle_routine(const struct cpuinfo_x86 *c);
-extern unsigned long boot_option_idle_override;
+extern unsigned long boot_option_idle_override;
extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
/* Defined in head.S */
-extern struct desc_ptr early_gdt_descr;
+extern struct desc_ptr early_gdt_descr;
extern void cpu_set_gdt(int);
extern void switch_to_new_gdt(void);
extern void cpu_init(void);
extern void init_gdt(int cpu);
-/* from system description table in BIOS. Mostly for MCA use, but
- * others may find it useful. */
-extern unsigned int machine_id;
-extern unsigned int machine_submodel_id;
-extern unsigned int BIOS_revision;
+static inline void update_debugctlmsr(unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return;
+#endif
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+}
-/* Boot loader type from the setup header */
-extern int bootloader_type;
+/*
+ * from system description table in BIOS. Mostly for MCA use, but
+ * others may find it useful:
+ */
+extern unsigned int machine_id;
+extern unsigned int machine_submodel_id;
+extern unsigned int BIOS_revision;
+
+/* Boot loader type from the setup header: */
+extern int bootloader_type;
-extern char ignore_fpu_irq;
-#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
+extern char ignore_fpu_irq;
#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
#define ARCH_HAS_PREFETCHW
#define ARCH_HAS_SPINLOCK_PREFETCH
#ifdef CONFIG_X86_32
-#define BASE_PREFETCH ASM_NOP4
-#define ARCH_HAS_PREFETCH
+# define BASE_PREFETCH ASM_NOP4
+# define ARCH_HAS_PREFETCH
#else
-#define BASE_PREFETCH "prefetcht0 (%1)"
+# define BASE_PREFETCH "prefetcht0 (%1)"
#endif
-/* Prefetch instructions for Pentium III and AMD Athlon */
-/* It's not worth to care about 3dnow! prefetches for the K6
- because they are microcoded there and very slow.
- However we don't do prefetches for pre XP Athlons currently
- That should be fixed. */
+/*
+ * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
+ *
+ * It's not worth to care about 3dnow prefetches for the K6
+ * because they are microcoded there and very slow.
+ */
static inline void prefetch(const void *x)
{
alternative_input(BASE_PREFETCH,
@@ -648,8 +731,11 @@ static inline void prefetch(const void *
"r" (x));
}
-/* 3dnow! prefetch to get an exclusive cache line. Useful for
- spinlocks to avoid one state transition in the cache coherency protocol. */
+/*
+ * 3dnow prefetch to get an exclusive cache line.
+ * Useful for spinlocks to avoid one state transition in the
+ * cache coherency protocol:
+ */
static inline void prefetchw(const void *x)
{
alternative_input(BASE_PREFETCH,
@@ -658,21 +744,25 @@ static inline void prefetchw(const void
"r" (x));
}
-#define spin_lock_prefetch(x) prefetchw(x)
+static inline void spin_lock_prefetch(const void *x)
+{
+ prefetchw(x);
+}
+
#ifdef CONFIG_X86_32
/*
* User space process size: 3GB (default).
*/
-#define TASK_SIZE (PAGE_OFFSET)
-#define STACK_TOP TASK_SIZE
-#define STACK_TOP_MAX STACK_TOP
-
-#define INIT_THREAD { \
- .sp0 = sizeof(init_stack) + (long)&init_stack, \
- .vm86_info = NULL, \
- .sysenter_cs = __KERNEL_CS, \
- .io_bitmap_ptr = NULL, \
- .fs = __KERNEL_PERCPU, \
+#define TASK_SIZE PAGE_OFFSET
+#define STACK_TOP TASK_SIZE
+#define STACK_TOP_MAX STACK_TOP
+
+#define INIT_THREAD { \
+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
+ .vm86_info = NULL, \
+ .sysenter_cs = __KERNEL_CS, \
+ .io_bitmap_ptr = NULL, \
+ .fs = __KERNEL_PERCPU, \
}
/*
@@ -681,28 +771,15 @@ static inline void prefetchw(const void
* permission bitmap. The extra byte must be all 1 bits, and must
* be within the limit.
*/
-#define INIT_TSS { \
- .x86_tss = { \
+#define INIT_TSS { \
+ .x86_tss = { \
.sp0 = sizeof(init_stack) + (long)&init_stack, \
- .ss0 = __KERNEL_DS, \
- .ss1 = __KERNEL_CS, \
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
- }, \
- .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
-}
-
-#define start_thread(regs, new_eip, new_esp) do { \
- __asm__("movl %0,%%gs": :"r" (0)); \
- regs->fs = 0; \
- set_fs(USER_DS); \
- regs->ds = __USER_DS; \
- regs->es = __USER_DS; \
- regs->ss = __USER_DS; \
- regs->cs = __USER_CS; \
- regs->ip = new_eip; \
- regs->sp = new_esp; \
-} while (0)
-
+ .ss0 = __KERNEL_DS, \
+ .ss1 = __KERNEL_CS, \
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
+ }, \
+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
+}
extern unsigned long thread_saved_pc(struct task_struct *tsk);
@@ -734,18 +811,18 @@ extern unsigned long thread_saved_pc(str
/*
* User space process size. 47bits minus one guard page.
*/
-#define TASK_SIZE64 (0x800000000000UL - 4096)
+#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE)
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
-#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
- 0xc0000000 : 0xFFFFe000)
+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
+ 0xc0000000 : 0xFFFFe000)
-#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
- IA32_PAGE_OFFSET : TASK_SIZE64)
-#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
- IA32_PAGE_OFFSET : TASK_SIZE64)
+#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE64)
+#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE64)
#define STACK_TOP TASK_SIZE
#define STACK_TOP_MAX TASK_SIZE64
@@ -758,32 +835,32 @@ extern unsigned long thread_saved_pc(str
.x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
}
-#define start_thread(regs, new_rip, new_rsp) do { \
- asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
- load_gs_index(0); \
- (regs)->ip = (new_rip); \
- (regs)->sp = (new_rsp); \
- (regs)->cs = __USER_CS; \
- (regs)->ss = __USER_DS; \
- (regs)->flags = 0x200; \
- set_fs(USER_DS); \
-} while (0)
-
/*
* Return saved PC of a blocked thread.
* What is this good for? it will be always the scheduler or ret_from_fork.
*/
-#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
+#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
+#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
#endif /* CONFIG_X86_64 */
-/* This decides where the kernel will search for a free chunk of vm
+extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
+ unsigned long new_sp);
+
+/*
+ * This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
-#define KSTK_EIP(task) (task_pt_regs(task)->ip)
-#define KSTK_ESP(task) (task_pt_regs(task)->sp)
+#define KSTK_EIP(task) (task_pt_regs(task)->ip)
+#define KSTK_ESP(task) (task_pt_regs(task)->sp)
+
+/* Get/set a process' ability to use the timestamp counter instruction */
+#define GET_TSC_CTL(adr) get_tsc_mode((adr))
+#define SET_TSC_CTL(val) set_tsc_mode((val))
+
+extern int get_tsc_mode(unsigned long adr);
+extern int set_tsc_mode(unsigned int val);
#endif
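
The cpuid_eax()/cpuid_ebx()/cpuid_ecx()/cpuid_edx() accessors reindented in the hunks above all follow one pattern: execute CPUID for the requested leaf and hand back a single output register. A minimal user-space sketch of that pattern follows; it uses plain CPUID only and deliberately omits the __cpuid/xen_cpuid redirection and paravirt handling these headers add, and the helper names are illustrative.

/* Stand-alone illustration of the cpuid_*() helper pattern above.
 * Build on an x86-64 box, e.g.:  gcc -O2 cpuid-demo.c -o cpuid-demo
 * (user-space sketch only; the kernel version goes through the
 * Xen-aware __cpuid hook instead of raw CPUID). */
#include <stdio.h>

static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx,
                         unsigned int *ecx, unsigned int *edx)
{
        *eax = op;
        *ecx = 0;
        asm volatile("cpuid"
                     : "+a" (*eax), "=b" (*ebx), "+c" (*ecx), "=d" (*edx));
}

static inline unsigned int cpuid_eax(unsigned int op)
{
        unsigned int eax, ebx, ecx, edx;

        cpuid(op, &eax, &ebx, &ecx, &edx);
        return eax;
}

int main(void)
{
        printf("max basic cpuid leaf: %u\n", cpuid_eax(0));
        return 0;
}

The kernel variants are identical in spirit; in this header they only differ in routing the instruction through the xen_cpuid hook selected by the #define above.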
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/smp.h 2010-03-24 15:10:29.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/smp.h 2010-03-24 15:12:36.000000000 +0100
@@ -1,5 +1,224 @@
-#ifdef CONFIG_X86_32
-# include "smp_32.h"
+#ifndef _ASM_X86_SMP_H_
+#define _ASM_X86_SMP_H_
+#ifndef __ASSEMBLY__
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <asm/percpu.h>
+
+/*
+ * We need the APIC definitions automatically as part of 'smp.h'
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+# include <asm/mpspec.h>
+# include <asm/apic.h>
+# ifdef CONFIG_X86_IO_APIC
+# include <asm/io_apic.h>
+# endif
+#endif
+#include <asm/pda.h>
+#include <asm/thread_info.h>
+
+#define cpu_callout_map cpu_possible_map
+extern cpumask_t cpu_initialized;
+#define cpu_callin_map cpu_possible_map
+
+extern void (*mtrr_hook)(void);
+extern void zap_low_mappings(void);
+
+extern int smp_num_siblings;
+extern unsigned int num_processors;
+extern cpumask_t cpu_initialized;
+
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+extern u16 x86_cpu_to_apicid_init[];
+extern u16 x86_bios_cpu_apicid_init[];
+extern void *x86_cpu_to_apicid_early_ptr;
+extern void *x86_bios_cpu_apicid_early_ptr;
+#else
+#define x86_cpu_to_apicid_early_ptr NULL
+#define x86_bios_cpu_apicid_early_ptr NULL
+#endif
+
+DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
+DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+DECLARE_PER_CPU(u16, cpu_llc_id);
+DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
+DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
+
+#ifdef CONFIG_SMP
+
+#ifndef CONFIG_XEN
+
+/* Static state in head.S used to set up a CPU */
+extern struct {
+ void *sp;
+ unsigned short ss;
+} stack_start;
+
+struct smp_ops {
+ void (*smp_prepare_boot_cpu)(void);
+ void (*smp_prepare_cpus)(unsigned max_cpus);
+ int (*cpu_up)(unsigned cpu);
+ void (*smp_cpus_done)(unsigned max_cpus);
+
+ void (*smp_send_stop)(void);
+ void (*smp_send_reschedule)(int cpu);
+ int (*smp_call_function_mask)(cpumask_t mask,
+ void (*func)(void *info), void *info,
+ int wait);
+};
+
+/* Globals due to paravirt */
+extern void set_cpu_sibling_map(int cpu);
+
+extern struct smp_ops smp_ops;
+
+static inline void smp_send_stop(void)
+{
+ smp_ops.smp_send_stop();
+}
+
+static inline void smp_prepare_boot_cpu(void)
+{
+ smp_ops.smp_prepare_boot_cpu();
+}
+
+static inline void smp_prepare_cpus(unsigned int max_cpus)
+{
+ smp_ops.smp_prepare_cpus(max_cpus);
+}
+
+static inline void smp_cpus_done(unsigned int max_cpus)
+{
+ smp_ops.smp_cpus_done(max_cpus);
+}
+
+static inline int __cpu_up(unsigned int cpu)
+{
+ return smp_ops.cpu_up(cpu);
+}
+
+static inline void smp_send_reschedule(int cpu)
+{
+ smp_ops.smp_send_reschedule(cpu);
+}
+
+static inline int smp_call_function_mask(cpumask_t mask,
+ void (*func) (void *info), void *info,
+ int wait)
+{
+ return smp_ops.smp_call_function_mask(mask, func, info, wait);
+}
+
+void native_smp_prepare_boot_cpu(void);
+void native_smp_prepare_cpus(unsigned int max_cpus);
+void native_smp_cpus_done(unsigned int max_cpus);
+int native_cpu_up(unsigned int cpunum);
+
+#else /* CONFIG_XEN */
+
+void xen_smp_send_stop(void);
+void xen_smp_send_reschedule(int cpu);
+int xen_smp_call_function_mask(cpumask_t mask,
+ void (*func) (void *info), void *info,
+ int wait);
+
+#define smp_send_stop xen_smp_send_stop
+#define smp_send_reschedule xen_smp_send_reschedule
+#define smp_call_function_mask xen_smp_call_function_mask
+
+extern void prefill_possible_map(void);
+
+#endif /* CONFIG_XEN */
+
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);
+
+extern void prefill_possible_map(void);
+
+void smp_store_cpu_info(int id);
+#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
+
+/* We don't mark CPUs online until __cpu_up(), so we need another measure */
+static inline int num_booting_cpus(void)
+{
+ return cpus_weight(cpu_callout_map);
+}
+#endif /* CONFIG_SMP */
+
+extern unsigned disabled_cpus __cpuinitdata;
+
+#ifdef CONFIG_X86_32_SMP
+/*
+ * This function is needed by all SMP systems. It must _always_ be valid
+ * from the initial startup. We map APIC_BASE very early in page_setup(),
+ * so this is correct in the x86 case.
+ */
+DECLARE_PER_CPU(int, cpu_number);
+#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+#define safe_smp_processor_id() smp_processor_id()
+
+#elif defined(CONFIG_X86_64_SMP)
+#define raw_smp_processor_id() read_pda(cpunumber)
+
+#define stack_smp_processor_id() \
+({ \
+ struct thread_info *ti; \
+ __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
+ ti->cpu; \
+})
+#define safe_smp_processor_id() smp_processor_id()
+
+#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
+#define cpu_physical_id(cpu) boot_cpu_physical_apicid
+#define safe_smp_processor_id() 0
+#define stack_smp_processor_id() 0
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+static inline int logical_smp_processor_id(void)
+{
+ /* we don't want to mark this access volatile - bad code generation */
+ return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+}
+
+#ifndef CONFIG_X86_64
+static inline unsigned int read_apic_id(void)
+{
+ return *(u32 *)(APIC_BASE + APIC_ID);
+}
#else
-# include "smp_64.h"
+extern unsigned int read_apic_id(void);
+#endif
+
+
+# ifdef APIC_DEFINITION
+extern int hard_smp_processor_id(void);
+# else
+# include <mach_apicdef.h>
+static inline int hard_smp_processor_id(void)
+{
+ /* we don't want to mark this access volatile - bad code generation */
+ return GET_APIC_ID(read_apic_id());
+}
+# endif /* APIC_DEFINITION */
+
+#else /* CONFIG_X86_LOCAL_APIC */
+
+# ifndef CONFIG_SMP
+# define hard_smp_processor_id() 0
+# endif
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern void cpu_exit_clear(void);
+extern void cpu_uninit(void);
+#endif
+
+extern void smp_alloc_memory(void);
+extern void lock_ipi_call_lock(void);
+extern void unlock_ipi_call_lock(void);
+#endif /* __ASSEMBLY__ */
#endif
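
The rewritten smp.h above keeps the native smp_ops indirection, a table of function pointers fronted by thin static inline wrappers, while the CONFIG_XEN side simply #defines the entry points to the xen_smp_* implementations. The shape of that indirection, reduced to a user-space sketch (struct demo_smp_ops and the demo_*/native_send_reschedule names are illustrative, not kernel symbols):

/* Sketch of the smp_ops indirection used above: call sites go through
 * inline wrappers, and only the table decides which backend runs. */
#include <stdio.h>

struct demo_smp_ops {
        void (*send_reschedule)(int cpu);
};

static void native_send_reschedule(int cpu)
{
        printf("native backend: reschedule IPI to CPU %d\n", cpu);
}

static struct demo_smp_ops demo_ops = {
        .send_reschedule = native_send_reschedule,
};

static inline void demo_smp_send_reschedule(int cpu)
{
        demo_ops.send_reschedule(cpu);
}

int main(void)
{
        demo_smp_send_reschedule(1);
        return 0;
}

Call sites stay identical either way; swapping the table entry (or, as in the Xen branch, the macro) swaps the backend.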
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/smp_32.h 2010-03-24 15:10:37.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,174 +0,0 @@
-#ifndef __ASM_SMP_H
-#define __ASM_SMP_H
-
-#ifndef __ASSEMBLY__
-#include <linux/cpumask.h>
-#include <linux/init.h>
-
-/*
- * We need the APIC definitions automatically as part of 'smp.h'
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-# include <asm/mpspec.h>
-# include <asm/apic.h>
-# ifdef CONFIG_X86_IO_APIC
-# include <asm/io_apic.h>
-# endif
-#endif
-
-#define cpu_callout_map cpu_possible_map
-#define cpu_callin_map cpu_possible_map
-
-extern int smp_num_siblings;
-extern unsigned int num_processors;
-
-extern void smp_alloc_memory(void);
-extern void lock_ipi_call_lock(void);
-extern void unlock_ipi_call_lock(void);
-
-extern void (*mtrr_hook) (void);
-extern void zap_low_mappings (void);
-
-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
-DECLARE_PER_CPU(cpumask_t, cpu_core_map);
-DECLARE_PER_CPU(u8, cpu_llc_id);
-DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
-
-#ifdef CONFIG_HOTPLUG_CPU
-extern void cpu_exit_clear(void);
-extern void cpu_uninit(void);
-#endif
-
-#ifdef CONFIG_SMP
-
-#ifndef CONFIG_XEN
-
-/* Globals due to paravirt */
-extern void set_cpu_sibling_map(int cpu);
-
-struct smp_ops
-{
- void (*smp_prepare_boot_cpu)(void);
- void (*smp_prepare_cpus)(unsigned max_cpus);
- int (*cpu_up)(unsigned cpu);
- void (*smp_cpus_done)(unsigned max_cpus);
-
- void (*smp_send_stop)(void);
- void (*smp_send_reschedule)(int cpu);
- int (*smp_call_function_mask)(cpumask_t mask,
- void (*func)(void *info), void *info,
- int wait);
-};
-
-extern struct smp_ops smp_ops;
-
-static inline void smp_prepare_boot_cpu(void)
-{
- smp_ops.smp_prepare_boot_cpu();
-}
-static inline void smp_prepare_cpus(unsigned int max_cpus)
-{
- smp_ops.smp_prepare_cpus(max_cpus);
-}
-static inline int __cpu_up(unsigned int cpu)
-{
- return smp_ops.cpu_up(cpu);
-}
-static inline void smp_cpus_done(unsigned int max_cpus)
-{
- smp_ops.smp_cpus_done(max_cpus);
-}
-
-static inline void smp_send_stop(void)
-{
- smp_ops.smp_send_stop();
-}
-static inline void smp_send_reschedule(int cpu)
-{
- smp_ops.smp_send_reschedule(cpu);
-}
-static inline int smp_call_function_mask(cpumask_t mask,
- void (*func) (void *info), void *info,
- int wait)
-{
- return smp_ops.smp_call_function_mask(mask, func, info, wait);
-}
-
-void native_smp_prepare_boot_cpu(void);
-void native_smp_prepare_cpus(unsigned int max_cpus);
-int native_cpu_up(unsigned int cpunum);
-void native_smp_cpus_done(unsigned int max_cpus);
-
-#else /* CONFIG_XEN */
-
-void xen_smp_send_stop(void);
-void xen_smp_send_reschedule(int cpu);
-int xen_smp_call_function_mask(cpumask_t mask,
- void (*func) (void *info), void *info,
- int wait);
-
-#define smp_send_stop xen_smp_send_stop
-#define smp_send_reschedule xen_smp_send_reschedule
-#define smp_call_function_mask xen_smp_call_function_mask
-
-extern void prefill_possible_map(void);
-
-#endif /* CONFIG_XEN */
-
-extern int __cpu_disable(void);
-extern void __cpu_die(unsigned int cpu);
-
-/*
- * This function is needed by all SMP systems. It must _always_ be valid
- * from the initial startup. We map APIC_BASE very early in page_setup(),
- * so this is correct in the x86 case.
- */
-DECLARE_PER_CPU(int, cpu_number);
-#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
-
-#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
-
-#define safe_smp_processor_id() smp_processor_id()
-
-/* We don't mark CPUs online until __cpu_up(), so we need another measure */
-static inline int num_booting_cpus(void)
-{
- return cpus_weight(cpu_callout_map);
-}
-
-#else /* CONFIG_SMP */
-
-#define safe_smp_processor_id() 0
-#define cpu_physical_id(cpu) boot_cpu_physical_apicid
-
-#endif /* !CONFIG_SMP */
-
-#ifdef CONFIG_X86_LOCAL_APIC
-
-static __inline int logical_smp_processor_id(void)
-{
- /* we don't want to mark this access volatile - bad code generation */
- return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
-}
-
-# ifdef APIC_DEFINITION
-extern int hard_smp_processor_id(void);
-# else
-# include <mach_apicdef.h>
-static inline int hard_smp_processor_id(void)
-{
- /* we don't want to mark this access volatile - bad code generation */
- return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
-}
-# endif /* APIC_DEFINITION */
-
-#else /* CONFIG_X86_LOCAL_APIC */
-
-# ifndef CONFIG_SMP
-# define hard_smp_processor_id() 0
-# endif
-
-#endif /* CONFIG_X86_LOCAL_APIC */
-
-#endif /* !ASSEMBLY */
-#endif
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/smp_64.h 2010-03-24 15:10:37.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,103 +0,0 @@
-#ifndef __ASM_SMP_H
-#define __ASM_SMP_H
-
-#include <linux/cpumask.h>
-#include <linux/init.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * We need the APIC definitions automatically as part of 'smp.h'
- */
-#include <asm/apic.h>
-#ifdef CONFIG_X86_IO_APIC
-#include <asm/io_apic.h>
-#endif
-#include <asm/mpspec.h>
-#endif
-#include <asm/pda.h>
-#include <asm/thread_info.h>
-
-extern cpumask_t cpu_initialized;
-
-extern int smp_num_siblings;
-extern unsigned int num_processors;
-
-extern void smp_alloc_memory(void);
-extern void lock_ipi_call_lock(void);
-extern void unlock_ipi_call_lock(void);
-
-extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
- void *info, int wait);
-
-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
-DECLARE_PER_CPU(cpumask_t, cpu_core_map);
-DECLARE_PER_CPU(u16, cpu_llc_id);
-DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
-DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-static inline int cpu_present_to_apicid(int mps_cpu)
-{
- if (cpu_present(mps_cpu))
- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
- else
- return BAD_APICID;
-}
-#endif
-
-#ifdef CONFIG_SMP
-
-#define SMP_TRAMPOLINE_BASE 0x6000
-
-extern int __cpu_disable(void);
-extern void __cpu_die(unsigned int cpu);
-extern void prefill_possible_map(void);
-extern unsigned __cpuinitdata disabled_cpus;
-
-#define raw_smp_processor_id() read_pda(cpunumber)
-#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
-
-#define stack_smp_processor_id() \
- ({ \
- struct thread_info *ti; \
- __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
- ti->cpu; \
-})
-
-/*
- * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
- * scheduling and IPI sending and compresses data structures.
- */
-static inline int num_booting_cpus(void)
-{
- return cpus_weight(cpu_possible_map);
-}
-
-extern void smp_send_reschedule(int cpu);
-
-#else /* CONFIG_SMP */
-
-extern unsigned int boot_cpu_id;
-#define cpu_physical_id(cpu) boot_cpu_id
-#define stack_smp_processor_id() 0
-
-#endif /* !CONFIG_SMP */
-
-#define safe_smp_processor_id() smp_processor_id()
-
-#ifdef CONFIG_X86_LOCAL_APIC
-static __inline int logical_smp_processor_id(void)
-{
- /* we don't want to mark this access volatile - bad code generation */
- return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
-}
-
-static inline int hard_smp_processor_id(void)
-{
- /* we don't want to mark this access volatile - bad code generation */
- return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
-}
-#endif
-
-#endif
-
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/spinlock.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/spinlock.h 2010-03-24 15:12:36.000000000 +0100
@@ -95,7 +95,7 @@ void xen_spin_kick(raw_spinlock_t *, uns
: \
: "memory", "cc")
-static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
{
int tmp, new;
@@ -158,7 +158,7 @@ static inline int __raw_spin_trylock(raw
: "memory", "cc"); \
} while (0)
-static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
{
int tmp;
int new;
@@ -183,19 +183,19 @@ static inline int __raw_spin_trylock(raw
static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
{
- int tmp = *(volatile signed int *)(&(lock)->slock);
+ int tmp = ACCESS_ONCE(lock->slock);
return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
}
static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
{
- int tmp = *(volatile signed int *)(&(lock)->slock);
+ int tmp = ACCESS_ONCE(lock->slock);
return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
}
-static inline void __raw_spin_lock(raw_spinlock_t *lock)
+static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
{
unsigned int token, count;
unsigned int flags = __raw_local_irq_save();
@@ -214,8 +214,8 @@ static inline void __raw_spin_lock(raw_s
} while (unlikely(!count) && !xen_spin_wait(lock, &token, flags));
}
-static inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
- unsigned long flags)
+static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
+ unsigned long flags)
{
unsigned int token, count;
bool free;
@@ -230,7 +230,7 @@ static inline void __raw_spin_lock_flags
} while (unlikely(!count) && !xen_spin_wait(lock, &token, flags));
}
-static inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
{
unsigned int token;
bool kick;
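
The spinlock hunks above mainly replace the open-coded volatile casts with ACCESS_ONCE() (essentially *(volatile typeof(x) *)&(x)) and force the fast paths __always_inline; the ticket arithmetic itself is unchanged. A stand-alone sketch of what __raw_spin_is_locked()/__raw_spin_is_contended() compute, assuming the 8-bit ticket layout (TICKET_SHIFT == 8) used when NR_CPUS < 256; ticket_is_locked() and friends are illustrative names:

/* Low byte of the lock word is the ticket currently being served,
 * the next TICKET_SHIFT bits hold the next ticket to hand out. */
#include <assert.h>

#define TICKET_SHIFT 8

static int ticket_is_locked(int slock)
{
        /* held whenever "next" differs from "owner" */
        return !!(((slock >> TICKET_SHIFT) ^ slock) & ((1 << TICKET_SHIFT) - 1));
}

static int ticket_is_contended(int slock)
{
        /* more than one ticket outstanding => someone is spinning */
        return (((slock >> TICKET_SHIFT) - slock) & ((1 << TICKET_SHIFT) - 1)) > 1;
}

int main(void)
{
        assert(!ticket_is_locked(0x0000));      /* owner == next: free    */
        assert(ticket_is_locked(0x0100));       /* one holder             */
        assert(ticket_is_contended(0x0200));    /* holder plus one waiter */
        return 0;
}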
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/swiotlb.h 2010-03-24 15:10:29.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/swiotlb.h 2010-03-24 15:12:36.000000000 +0100
@@ -1,5 +1,4 @@
-#ifdef CONFIG_X86_32
-# include "swiotlb_32.h"
-#else
-# include_next <asm/swiotlb.h>
-#endif
+#include_next <asm/swiotlb.h>
+
+dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size,
+ int dir);
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/system.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/system.h 2010-03-24 15:12:36.000000000 +0100
@@ -28,22 +28,44 @@ struct task_struct *__switch_to(struct t
* Saving eflags is important. It switches not only IOPL between tasks,
* it also protects other tasks from NT leaking through sysenter etc.
*/
-#define switch_to(prev, next, last) do { \
- unsigned long esi, edi; \
- asm volatile("pushfl\n\t" /* Save flags */ \
- "pushl %%ebp\n\t" \
- "movl %%esp,%0\n\t" /* save ESP */ \
- "movl %5,%%esp\n\t" /* restore ESP */ \
- "movl $1f,%1\n\t" /* save EIP */ \
- "pushl %6\n\t" /* restore EIP */ \
- "jmp __switch_to\n" \
+#define switch_to(prev, next, last) \
+do { \
+ /* \
+ * Context-switching clobbers all registers, so we clobber \
+ * them explicitly, via unused output variables. \
+ * (EAX and EBP is not listed because EBP is saved/restored \
+ * explicitly for wchan access and EAX is the return value of \
+ * __switch_to()) \
+ */ \
+ unsigned long ebx, ecx, edx, esi, edi; \
+ \
+ asm volatile("pushfl\n\t" /* save flags */ \
+ "pushl %%ebp\n\t" /* save EBP */ \
+ "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
+ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
+ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
+ "pushl %[next_ip]\n\t" /* restore EIP */ \
+ "jmp __switch_to\n" /* regparm call */ \
"1:\t" \
- "popl %%ebp\n\t" \
- "popfl" \
- :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \
- "=a" (last), "=S" (esi), "=D" (edi) \
- :"m" (next->thread.sp), "m" (next->thread.ip), \
- "2" (prev), "d" (next)); \
+ "popl %%ebp\n\t" /* restore EBP */ \
+ "popfl\n" /* restore flags */ \
+ \
+ /* output parameters */ \
+ : [prev_sp] "=m" (prev->thread.sp), \
+ [prev_ip] "=m" (prev->thread.ip), \
+ "=a" (last), \
+ \
+ /* clobbered output registers: */ \
+ "=b" (ebx), "=c" (ecx), "=d" (edx), \
+ "=S" (esi), "=D" (edi) \
+ \
+ /* input parameters: */ \
+ : [next_sp] "m" (next->thread.sp), \
+ [next_ip] "m" (next->thread.ip), \
+ \
+ /* regparm parameters for __switch_to(): */ \
+ [prev] "a" (prev), \
+ [next] "d" (next)); \
} while (0)
/*
@@ -123,30 +145,29 @@ extern void load_gs_index(unsigned);
*/
#define loadsegment(seg, value) \
asm volatile("\n" \
- "1:\t" \
- "movl %k0,%%" #seg "\n" \
- "2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3:\t" \
- "movl %k1, %%" #seg "\n\t" \
- "jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(1b,3b) \
- : :"r" (value), "r" (0))
+ "1:\t" \
+ "movl %k0,%%" #seg "\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3:\t" \
+ "movl %k1, %%" #seg "\n\t" \
+ "jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b,3b) \
+ : :"r" (value), "r" (0))
/*
* Save a segment register away
*/
-#define savesegment(seg, value) \
+#define savesegment(seg, value) \
asm volatile("mov %%" #seg ",%0":"=rm" (value))
static inline unsigned long get_limit(unsigned long segment)
{
unsigned long __limit;
- __asm__("lsll %1,%0"
- :"=r" (__limit):"r" (segment));
- return __limit+1;
+ asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
+ return __limit + 1;
}
static inline void xen_clts(void)
@@ -171,13 +192,13 @@ static unsigned long __force_order;
static inline unsigned long xen_read_cr0(void)
{
unsigned long val;
- asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
+ asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
return val;
}
static inline void xen_write_cr0(unsigned long val)
{
- asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
+ asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
}
#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
@@ -186,7 +207,7 @@ static inline void xen_write_cr0(unsigne
static inline unsigned long xen_read_cr3(void)
{
unsigned long val;
- asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
+ asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
#ifdef CONFIG_X86_32
return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
#else
@@ -201,13 +222,13 @@ static inline void xen_write_cr3(unsigne
#else
val = phys_to_machine(val);
#endif
- asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
+ asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
}
static inline unsigned long xen_read_cr4(void)
{
unsigned long val;
- asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
+ asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
return val;
}
@@ -215,7 +236,7 @@ static inline unsigned long xen_read_cr4
static inline void xen_write_cr4(unsigned long val)
{
- asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
+ asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
}
#ifdef CONFIG_X86_64
@@ -234,6 +255,7 @@ static inline void xen_wbinvd(void)
{
asm volatile("wbinvd": : :"memory");
}
+
#define read_cr0() (xen_read_cr0())
#define write_cr0(x) (xen_write_cr0(x))
#define read_cr2() (xen_read_cr2())
@@ -260,7 +282,7 @@ static inline void clflush(volatile void
asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
}
-#define nop() __asm__ __volatile__ ("nop")
+#define nop() asm volatile ("nop")
void disable_hlt(void);
void enable_hlt(void);
@@ -280,16 +302,7 @@ void default_idle(void);
*/
#ifdef CONFIG_X86_32
/*
- * For now, "wmb()" doesn't actually do anything, as all
- * Intel CPU's follow what Intel calls a *Processor Order*,
- * in which all writes are seen in the program order even
- * outside the CPU.
- *
- * I expect future Intel CPU's to have a weaker ordering,
- * but I'd also expect them to finally get their act together
- * and add some real memory barriers if so.
- *
- * Some non intel clones support out of order store. wmb() ceases to be a
+ * Some non-Intel clones support out of order store. wmb() ceases to be a
* nop for these.
*/
#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
@@ -368,7 +381,7 @@ void default_idle(void);
# define smp_wmb() barrier()
#endif
#define smp_read_barrier_depends() read_barrier_depends()
-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
+#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
#else
#define smp_mb() barrier()
#define smp_rmb() barrier()
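
Beyond whitespace, the switch_to() rewrite above moves from positional asm operands (%0, %5) to named ones ([prev_sp], [next_ip]) and lists the clobbered registers as explicit dummy outputs. Named operands are plain GCC inline-asm syntax, nothing Xen-specific; a minimal stand-alone example:

/* Named inline-asm operands: copy src to dst through a register. */
#include <stdio.h>

int main(void)
{
        unsigned long src = 42, dst = 0;

        asm("mov %[in], %[out]"
            : [out] "=r" (dst)
            : [in]  "r"  (src));

        printf("%lu\n", dst);
        return 0;
}

Naming the operands is purely for readability; the generated code is the same as with positional numbers.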
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/tlbflush.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/tlbflush.h 2010-03-24 15:12:36.000000000 +0100
@@ -86,8 +86,7 @@ static inline void flush_tlb_range(struc
#define TLBSTATE_LAZY 2
#ifdef CONFIG_X86_32
-struct tlb_state
-{
+struct tlb_state {
struct mm_struct *active_mm;
int state;
char __cacheline_padding[L1_CACHE_BYTES-8];
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/vga.h 2007-06-12 13:14:02.000000000 +0200
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/vga.h 2010-03-24 15:12:36.000000000 +0100
@@ -12,9 +12,9 @@
* access the videoram directly without any black magic.
*/
-#define VGA_MAP_MEM(x,s) (unsigned long)isa_bus_to_virt(x)
+#define VGA_MAP_MEM(x, s) (unsigned long)isa_bus_to_virt(x)
#define vga_readb(x) (*(x))
-#define vga_writeb(x,y) (*(y) = (x))
+#define vga_writeb(x, y) (*(y) = (x))
#endif
--- head-2010-05-25.orig/arch/x86/include/mach-xen/asm/xor_64.h 2007-06-12 13:14:13.000000000 +0200
+++ head-2010-05-25/arch/x86/include/mach-xen/asm/xor_64.h 2010-03-24 15:12:36.000000000 +0100
@@ -1,20 +1,23 @@
/*
- * x86-64 changes / gcc fixes from Andi Kleen.
+ * x86-64 changes / gcc fixes from Andi Kleen.
* Copyright 2002 Andi Kleen, SuSE Labs.
*
* This hasn't been optimized for the hammer yet, but there are likely
* no advantages to be gotten from x86-64 here anyways.
*/
-typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
+typedef struct {
+ unsigned long a, b;
+} __attribute__((aligned(16))) xmm_store_t;
-/* Doesn't use gcc to save the XMM registers, because there is no easy way to
+/* Doesn't use gcc to save the XMM registers, because there is no easy way to
tell it to do a clts before the register saving. */
-#define XMMS_SAVE do { \
+#define XMMS_SAVE \
+do { \
preempt_disable(); \
if (!(current_thread_info()->status & TS_USEDFPU)) \
clts(); \
- __asm__ __volatile__ ( \
+ asm volatile( \
"movups %%xmm0,(%1) ;\n\t" \
"movups %%xmm1,0x10(%1) ;\n\t" \
"movups %%xmm2,0x20(%1) ;\n\t" \
@@ -22,10 +25,11 @@ typedef struct { unsigned long a,b; } __
: "=&r" (cr0) \
: "r" (xmm_save) \
: "memory"); \
-} while(0)
+} while (0)
-#define XMMS_RESTORE do { \
- asm volatile ( \
+#define XMMS_RESTORE \
+do { \
+ asm volatile( \
"sfence ;\n\t" \
"movups (%1),%%xmm0 ;\n\t" \
"movups 0x10(%1),%%xmm1 ;\n\t" \
@@ -37,72 +41,72 @@ typedef struct { unsigned long a,b; } __
if (!(current_thread_info()->status & TS_USEDFPU)) \
stts(); \
preempt_enable(); \
-} while(0)
+} while (0)
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
-#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
-#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
+#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
+#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
-#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
-#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
-#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
-#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
-#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
+#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
+#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
+#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
+#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
+#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
- unsigned int lines = bytes >> 8;
+ unsigned int lines = bytes >> 8;
unsigned long cr0;
xmm_store_t xmm_save[4];
XMMS_SAVE;
- asm volatile (
+ asm volatile(
#undef BLOCK
#define BLOCK(i) \
- LD(i,0) \
- LD(i+1,1) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
PF1(i) \
- PF1(i+2) \
- LD(i+2,2) \
- LD(i+3,3) \
- PF0(i+4) \
- PF0(i+6) \
- XO1(i,0) \
- XO1(i+1,1) \
- XO1(i+2,2) \
- XO1(i+3,3) \
- ST(i,0) \
- ST(i+1,1) \
- ST(i+2,2) \
- ST(i+3,3) \
+ PF1(i + 2) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
- " 1: ;\n"
+ " 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
- " addq %[inc], %[p1] ;\n"
- " addq %[inc], %[p2] ;\n"
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
" decl %[cnt] ; jnz 1b"
: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
- : [inc] "r" (256UL)
- : "memory");
+ : [inc] "r" (256UL)
+ : "memory");
XMMS_RESTORE;
}
@@ -117,52 +121,52 @@ xor_sse_3(unsigned long bytes, unsigned
XMMS_SAVE;
- __asm__ __volatile__ (
+ asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
- PF1(i+2) \
- LD(i,0) \
- LD(i+1,1) \
- LD(i+2,2) \
- LD(i+3,3) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
PF2(i) \
- PF2(i+2) \
- PF0(i+4) \
- PF0(i+6) \
- XO1(i,0) \
- XO1(i+1,1) \
- XO1(i+2,2) \
- XO1(i+3,3) \
- XO2(i,0) \
- XO2(i+1,1) \
- XO2(i+2,2) \
- XO2(i+3,3) \
- ST(i,0) \
- ST(i+1,1) \
- ST(i+2,2) \
- ST(i+3,3) \
+ PF2(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
- " 1: ;\n"
+ " 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
- " addq %[inc], %[p1] ;\n"
- " addq %[inc], %[p2] ;\n"
- " addq %[inc], %[p3] ;\n"
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " addq %[inc], %[p3] ;\n"
" decl %[cnt] ; jnz 1b"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
: [inc] "r" (256UL)
- : "memory");
+ : "memory");
XMMS_RESTORE;
}
@@ -171,64 +175,64 @@ xor_sse_4(unsigned long bytes, unsigned
unsigned long *p3, unsigned long *p4)
{
unsigned int lines = bytes >> 8;
- xmm_store_t xmm_save[4];
+ xmm_store_t xmm_save[4];
unsigned long cr0;
XMMS_SAVE;
- __asm__ __volatile__ (
+ asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
- PF1(i+2) \
- LD(i,0) \
- LD(i+1,1) \
- LD(i+2,2) \
- LD(i+3,3) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
PF2(i) \
- PF2(i+2) \
- XO1(i,0) \
- XO1(i+1,1) \
- XO1(i+2,2) \
- XO1(i+3,3) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
PF3(i) \
- PF3(i+2) \
- PF0(i+4) \
- PF0(i+6) \
- XO2(i,0) \
- XO2(i+1,1) \
- XO2(i+2,2) \
- XO2(i+3,3) \
- XO3(i,0) \
- XO3(i+1,1) \
- XO3(i+2,2) \
- XO3(i+3,3) \
- ST(i,0) \
- ST(i+1,1) \
- ST(i+2,2) \
- ST(i+3,3) \
+ PF3(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
- " 1: ;\n"
+ " 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
- " addq %[inc], %[p1] ;\n"
- " addq %[inc], %[p2] ;\n"
- " addq %[inc], %[p3] ;\n"
- " addq %[inc], %[p4] ;\n"
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " addq %[inc], %[p3] ;\n"
+ " addq %[inc], %[p4] ;\n"
" decl %[cnt] ; jnz 1b"
: [cnt] "+c" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
: [inc] "r" (256UL)
- : "memory" );
+ : "memory" );
XMMS_RESTORE;
}
@@ -237,70 +241,70 @@ static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
- unsigned int lines = bytes >> 8;
+ unsigned int lines = bytes >> 8;
xmm_store_t xmm_save[4];
unsigned long cr0;
XMMS_SAVE;
- __asm__ __volatile__ (
+ asm volatile(
#undef BLOCK
#define BLOCK(i) \
PF1(i) \
- PF1(i+2) \
- LD(i,0) \
- LD(i+1,1) \
- LD(i+2,2) \
- LD(i+3,3) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
PF2(i) \
- PF2(i+2) \
- XO1(i,0) \
- XO1(i+1,1) \
- XO1(i+2,2) \
- XO1(i+3,3) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
PF3(i) \
- PF3(i+2) \
- XO2(i,0) \
- XO2(i+1,1) \
- XO2(i+2,2) \
- XO2(i+3,3) \
+ PF3(i + 2) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
PF4(i) \
- PF4(i+2) \
- PF0(i+4) \
- PF0(i+6) \
- XO3(i,0) \
- XO3(i+1,1) \
- XO3(i+2,2) \
- XO3(i+3,3) \
- XO4(i,0) \
- XO4(i+1,1) \
- XO4(i+2,2) \
- XO4(i+3,3) \
- ST(i,0) \
- ST(i+1,1) \
- ST(i+2,2) \
- ST(i+3,3) \
+ PF4(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ XO4(i, 0) \
+ XO4(i + 1, 1) \
+ XO4(i + 2, 2) \
+ XO4(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
PF0(0)
PF0(2)
" .align 32 ;\n"
- " 1: ;\n"
+ " 1: ;\n"
BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)
- " addq %[inc], %[p1] ;\n"
- " addq %[inc], %[p2] ;\n"
- " addq %[inc], %[p3] ;\n"
- " addq %[inc], %[p4] ;\n"
- " addq %[inc], %[p5] ;\n"
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " addq %[inc], %[p3] ;\n"
+ " addq %[inc], %[p4] ;\n"
+ " addq %[inc], %[p5] ;\n"
" decl %[cnt] ; jnz 1b"
: [cnt] "+c" (lines),
- [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
[p5] "+r" (p5)
: [inc] "r" (256UL)
: "memory");
@@ -309,18 +313,18 @@ xor_sse_5(unsigned long bytes, unsigned
}
static struct xor_block_template xor_block_sse = {
- .name = "generic_sse",
- .do_2 = xor_sse_2,
- .do_3 = xor_sse_3,
- .do_4 = xor_sse_4,
- .do_5 = xor_sse_5,
+ .name = "generic_sse",
+ .do_2 = xor_sse_2,
+ .do_3 = xor_sse_3,
+ .do_4 = xor_sse_4,
+ .do_5 = xor_sse_5,
};
#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
- do { \
- xor_speed(&xor_block_sse); \
- } while (0)
+#define XOR_TRY_TEMPLATES \
+do { \
+ xor_speed(&xor_block_sse); \
+} while (0)
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
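
The xor_64.h churn above is almost entirely reformatting of the SSE xor routines; the computation is untouched. For orientation, xor_sse_2() streams 256-byte blocks through XMM registers, but the end result is simply p1 ^= p2 over the whole buffer, as in this plain C reference (xor_ref_2 is an illustrative name; the real routines additionally need the XMMS_SAVE/XMMS_RESTORE FPU bracketing shown above, which a user-space sketch can skip):

/* Plain C reference for what the unrolled SSE loop computes. */
#include <stdio.h>

static void xor_ref_2(unsigned long bytes, unsigned long *p1,
                      const unsigned long *p2)
{
        unsigned long i;

        for (i = 0; i < bytes / sizeof(unsigned long); i++)
                p1[i] ^= p2[i];
}

int main(void)
{
        unsigned long a[4] = { 1, 2, 3, 4 };
        unsigned long b[4] = { 1, 2, 3, 4 };

        xor_ref_2(sizeof(a), a, b);
        printf("%lu %lu %lu %lu\n", a[0], a[1], a[2], a[3]);   /* all zero */
        return 0;
}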
--- head-2010-05-25.orig/arch/x86/include/asm/scatterlist.h 2010-05-25 09:12:09.000000000 +0200
+++ head-2010-05-25/arch/x86/include/asm/scatterlist.h 2010-03-24 15:12:36.000000000 +0100
@@ -3,6 +3,10 @@
#define ISA_DMA_THRESHOLD (0x00ffffff)
+#ifdef CONFIG_X86_XEN
+# define sg_dma_len(sg) ((sg)->dma_length)
+#endif
+
#include <asm-generic/scatterlist.h>
#endif /* _ASM_X86_SCATTERLIST_H */
--- head-2010-05-25.orig/include/linux/page-flags.h 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/include/linux/page-flags.h 2010-03-24 15:12:36.000000000 +0100
@@ -349,29 +349,28 @@ static inline void SetPageUptodate(struc
CLEARPAGEFLAG(Uptodate, uptodate)
-#define PageForeign(page) test_bit(PG_foreign, &(page)->flags)
-#define SetPageForeign(_page, dtor) do { \
- set_bit(PG_foreign, &(_page)->flags); \
- BUG_ON((dtor) == (void (*)(struct page *, unsigned int))0); \
- (_page)->index = (long)(dtor); \
-} while (0)
-#define ClearPageForeign(page) do { \
- clear_bit(PG_foreign, &(page)->flags); \
- (page)->index = 0; \
-} while (0)
-#define PageForeignDestructor(_page, order) \
- ((void (*)(struct page *, unsigned int))(_page)->index)(_page, order)
-
-#if 0
-#define PageNetback(page) test_bit(PG_netback, &(page)->flags)
-#define SetPageNetback(page) set_bit(PG_netback, &(page)->flags)
-#define ClearPageNetback(page) clear_bit(PG_netback, &(page)->flags)
+#ifdef CONFIG_XEN
+TESTPAGEFLAG(Foreign, foreign)
+static inline void SetPageForeign(struct page *page,
+ void (*dtor)(struct page *, unsigned int))
+{
+ BUG_ON(!dtor);
+ set_bit(PG_foreign, &page->flags);
+ page->index = (long)dtor;
+}
+static inline void ClearPageForeign(struct page *page)
+{
+ clear_bit(PG_foreign, &page->flags);
+ page->index = 0;
+}
+static inline void PageForeignDestructor(struct page *page, unsigned int order)
+{
+ ((void (*)(struct page *, unsigned int))page->index)(page, order);
+}
+/*PAGEFLAG(Netback, netback)*/
+PAGEFLAG(Blkback, blkback)
#endif
-#define PageBlkback(page) test_bit(PG_blkback, &(page)->flags)
-#define SetPageBlkback(page) set_bit(PG_blkback, &(page)->flags)
-#define ClearPageBlkback(page) clear_bit(PG_blkback, &(page)->flags)
-
extern void cancel_dirty_page(struct page *page, unsigned int account_size);
int test_clear_page_writeback(struct page *page);
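
The page-flags hunk above turns the old SetPageForeign()/PageForeignDestructor() macros into inline functions but keeps the same trick: the otherwise unused page->index of a foreign page stores a destructor that is invoked when the page is released. A stand-alone sketch of that pattern (struct demo_page and the demo_* helpers are illustrative stand-ins, not the real struct page):

/* Stashing a per-page destructor in an integer field and calling it. */
#include <stdio.h>

struct demo_page {
        unsigned long flags;
        long index;             /* doubles as the destructor slot here */
};

static void demo_dtor(struct demo_page *page, unsigned int order)
{
        (void)page;             /* a real destructor would free the page */
        printf("foreign page destructor called, order %u\n", order);
}

static void demo_set_foreign(struct demo_page *page,
                             void (*dtor)(struct demo_page *, unsigned int))
{
        page->index = (long)dtor;
}

static void demo_foreign_destructor(struct demo_page *page, unsigned int order)
{
        ((void (*)(struct demo_page *, unsigned int))page->index)(page, order);
}

int main(void)
{
        struct demo_page page = { 0, 0 };

        demo_set_foreign(&page, demo_dtor);
        demo_foreign_destructor(&page, 0);
        return 0;
}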
--- head-2010-05-25.orig/include/xen/balloon.h 2007-06-12 13:14:19.000000000 +0200
+++ head-2010-05-25/include/xen/balloon.h 2010-03-24 15:12:36.000000000 +0100
@@ -31,9 +31,12 @@
* IN THE SOFTWARE.
*/
-#ifndef __ASM_BALLOON_H__
-#define __ASM_BALLOON_H__
+#ifndef __XEN_BALLOON_H__
+#define __XEN_BALLOON_H__
+#include <linux/spinlock.h>
+
+#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
/*
* Inform the balloon driver that it should allow some slop for device-driver
* memory activities.
@@ -53,5 +56,6 @@ void balloon_release_driver_page(struct
extern spinlock_t balloon_lock;
#define balloon_lock(__flags) spin_lock_irqsave(&balloon_lock, __flags)
#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags)
+#endif
-#endif /* __ASM_BALLOON_H__ */
+#endif /* __XEN_BALLOON_H__ */
--- head-2010-05-25.orig/include/xen/interface/grant_table.h 2010-01-19 16:01:04.000000000 +0100
+++ head-2010-05-25/include/xen/interface/grant_table.h 2010-03-24 15:12:36.000000000 +0100
@@ -288,6 +288,7 @@ struct gnttab_map_grant_ref {
grant_handle_t handle;
uint64_t dev_bus_addr;
};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_map_grant_ref);
typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t;
DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t);
@@ -311,6 +312,7 @@ struct gnttab_unmap_grant_ref {
/* OUT parameters. */
int16_t status; /* GNTST_* */
};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_grant_ref);
typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t;
DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t);
@@ -332,6 +334,7 @@ struct gnttab_setup_table {
int16_t status; /* GNTST_* */
XEN_GUEST_HANDLE(ulong) frame_list;
};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_setup_table);
typedef struct gnttab_setup_table gnttab_setup_table_t;
DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t);
@@ -346,6 +349,7 @@ struct gnttab_dump_table {
/* OUT parameters. */
int16_t status; /* GNTST_* */
};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_dump_table);
typedef struct gnttab_dump_table gnttab_dump_table_t;
DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t);
@@ -366,6 +370,7 @@ struct gnttab_transfer {
/* OUT parameters. */
int16_t status;
};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_transfer);
typedef struct gnttab_transfer gnttab_transfer_t;
DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t);
@@ -411,6 +416,7 @@ typedef struct gnttab_copy {
/* OUT parameters. */
int16_t status;
} gnttab_copy_t;
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_copy);
DEFINE_XEN_GUEST_HANDLE(gnttab_copy_t);
/*
@@ -429,6 +435,7 @@ struct gnttab_query_size {
uint32_t max_nr_frames;
int16_t status; /* GNTST_* */
};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size);
typedef struct gnttab_query_size gnttab_query_size_t;
DEFINE_XEN_GUEST_HANDLE(gnttab_query_size_t);
--- head-2010-05-25.orig/include/xen/interface/io/fbif.h 2010-01-19 16:01:04.000000000 +0100
+++ head-2010-05-25/include/xen/interface/io/fbif.h 2010-03-24 15:12:36.000000000 +0100
@@ -150,7 +150,12 @@ struct xenfb_page
* framebuffer with a max resolution of 12,800x10,240. Should
* be enough for a while with room leftover for expansion.
*/
+#ifndef CONFIG_PARAVIRT_XEN
unsigned long pd[256];
+#else
+ /* Two directory pages should be enough for a while. */
+ unsigned long pd[2];
+#endif
};
/*
--- head-2010-05-25.orig/include/xen/interface/memory.h 2010-03-24 15:09:23.000000000 +0100
+++ head-2010-05-25/include/xen/interface/memory.h 2010-03-24 15:12:36.000000000 +0100
@@ -85,7 +85,6 @@ struct xen_memory_reservation {
*/
domid_t domid;
};
-DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);
typedef struct xen_memory_reservation xen_memory_reservation_t;
DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
@@ -171,7 +170,11 @@ struct xen_machphys_mfn_list {
* any large discontiguities in the machine address space, 2MB gaps in
* the machphys table will be represented by an MFN base of zero.
*/
+#ifndef CONFIG_PARAVIRT_XEN
XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
+#else
+ ulong extent_start;
+#endif
/*
* Number of extents written to the above array. This will be smaller
@@ -179,7 +182,6 @@ struct xen_machphys_mfn_list {
*/
unsigned int nr_extents;
};
-DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
@@ -221,7 +223,6 @@ struct xen_add_to_physmap {
/* GPFN where the source mapping page should appear. */
xen_pfn_t gpfn;
};
-DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
typedef struct xen_add_to_physmap xen_add_to_physmap_t;
DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
--- head-2010-05-25.orig/include/xen/interface/vcpu.h 2010-01-19 16:01:04.000000000 +0100
+++ head-2010-05-25/include/xen/interface/vcpu.h 2010-03-24 15:12:36.000000000 +0100
@@ -87,6 +87,7 @@ struct vcpu_runstate_info {
*/
uint64_t time[4];
};
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
typedef struct vcpu_runstate_info vcpu_runstate_info_t;
DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_t);
@@ -142,6 +143,7 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_register_ru
struct vcpu_set_periodic_timer {
uint64_t period_ns;
};
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_periodic_timer);
typedef struct vcpu_set_periodic_timer vcpu_set_periodic_timer_t;
DEFINE_XEN_GUEST_HANDLE(vcpu_set_periodic_timer_t);
@@ -155,6 +157,7 @@ struct vcpu_set_singleshot_timer {
uint64_t timeout_abs_ns; /* Absolute system time value in nanoseconds. */
uint32_t flags; /* VCPU_SSHOTTMR_??? */
};
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_singleshot_timer);
typedef struct vcpu_set_singleshot_timer vcpu_set_singleshot_timer_t;
DEFINE_XEN_GUEST_HANDLE(vcpu_set_singleshot_timer_t);
@@ -178,6 +181,7 @@ struct vcpu_register_vcpu_info {
uint32_t offset; /* offset within page */
uint32_t rsvd; /* unused */
};
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info);
typedef struct vcpu_register_vcpu_info vcpu_register_vcpu_info_t;
DEFINE_XEN_GUEST_HANDLE(vcpu_register_vcpu_info_t);
--- head-2010-05-25.orig/lib/swiotlb-xen.c 2010-03-24 15:10:37.000000000 +0100
+++ head-2010-05-25/lib/swiotlb-xen.c 2010-03-24 15:12:36.000000000 +0100
@@ -20,6 +20,7 @@
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/bootmem.h>
+#include <linux/iommu-helper.h>
#include <linux/highmem.h>
#include <asm/io.h>
#include <asm/pci.h>
@@ -288,15 +289,6 @@ __sync_single(struct phys_addr buffer, c
}
}
-static inline unsigned int is_span_boundary(unsigned int index,
- unsigned int nslots,
- unsigned long offset_slots,
- unsigned long max_slots)
-{
- unsigned long offset = (offset_slots + index) & (max_slots - 1);
- return offset + nslots > max_slots;
-}
-
/*
* Allocates bounce buffer and returns its kernel virtual address.
*/
@@ -335,61 +327,53 @@ map_single(struct device *hwdev, struct
* request and allocate a buffer from that IO TLB pool.
*/
spin_lock_irqsave(&io_tlb_lock, flags);
- {
- index = ALIGN(io_tlb_index, stride);
- if (index >= iotlb_nslabs)
- index = 0;
- wrap = index;
+ index = ALIGN(io_tlb_index, stride);
+ if (index >= iotlb_nslabs)
+ index = 0;
+ wrap = index;
- do {
- while (is_span_boundary(index, nslots, offset_slots,
- max_slots)) {
- index += stride;
- if (index >= iotlb_nslabs)
- index = 0;
- if (index == wrap)
- goto not_found;
- }
+ do {
+ while (iommu_is_span_boundary(index, nslots, offset_slots,
+ max_slots)) {
+ index += stride;
+ if (index >= iotlb_nslabs)
+ index = 0;
+ if (index == wrap)
+ goto not_found;
+ }
+
+ /*
+ * If we find a slot that indicates we have 'nslots' number of
+ * contiguous buffers, we allocate the buffers from that slot
+ * and mark the entries as '0' indicating unavailable.
+ */
+ if (io_tlb_list[index] >= nslots) {
+ int count = 0;
+
+ for (i = index; i < (int) (index + nslots); i++)
+ io_tlb_list[i] = 0;
+ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
+ io_tlb_list[i] = ++count;
+ dma_addr = iotlb_virt_start + (index << IO_TLB_SHIFT);
/*
- * If we find a slot that indicates we have 'nslots'
- * number of contiguous buffers, we allocate the
- * buffers from that slot and mark the entries as '0'
- * indicating unavailable.
+ * Update the indices to avoid searching in the next
+ * round.
*/
- if (io_tlb_list[index] >= nslots) {
- int count = 0;
-
- for (i = index; i < (int)(index + nslots); i++)
- io_tlb_list[i] = 0;
- for (i = index - 1;
- (OFFSET(i, IO_TLB_SEGSIZE) !=
- IO_TLB_SEGSIZE -1) && io_tlb_list[i];
- i--)
- io_tlb_list[i] = ++count;
- dma_addr = iotlb_virt_start +
- (index << IO_TLB_SHIFT);
-
- /*
- * Update the indices to avoid searching in
- * the next round.
- */
- io_tlb_index =
- ((index + nslots) < iotlb_nslabs
- ? (index + nslots) : 0);
+ io_tlb_index = ((index + nslots) < iotlb_nslabs
+ ? (index + nslots) : 0);
- goto found;
- }
- index += stride;
- if (index >= iotlb_nslabs)
- index = 0;
- } while (index != wrap);
+ goto found;
+ }
+ index += stride;
+ if (index >= iotlb_nslabs)
+ index = 0;
+ } while (index != wrap);
- not_found:
- spin_unlock_irqrestore(&io_tlb_lock, flags);
- return NULL;
- }
- found:
+not_found:
+ spin_unlock_irqrestore(&io_tlb_lock, flags);
+ return NULL;
+found:
spin_unlock_irqrestore(&io_tlb_lock, flags);
/*
@@ -502,11 +486,13 @@ swiotlb_full(struct device *dev, size_t
* Once the device is given the dma address, the device owns this memory until
* either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
*/
-dma_addr_t
-swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
-{
- dma_addr_t dev_addr = gnttab_dma_map_page(virt_to_page(ptr)) +
- offset_in_page(ptr);
+static dma_addr_t
+_swiotlb_map_single(struct device *hwdev, phys_addr_t paddr, size_t size,
+ int dir, struct dma_attrs *attrs)
+{
+ struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
+ dma_addr_t dev_addr = gnttab_dma_map_page(page) +
+ offset_in_page(paddr);
void *map;
struct phys_addr buffer;
@@ -517,7 +503,7 @@ swiotlb_map_single(struct device *hwdev,
* we can safely return the device addr and not worry about bounce
* buffering it.
*/
- if (!range_straddles_page_boundary(__pa(ptr), size) &&
+ if (!range_straddles_page_boundary(paddr, size) &&
!address_needs_mapping(hwdev, dev_addr))
return dev_addr;
@@ -525,8 +511,8 @@ swiotlb_map_single(struct device *hwdev,
* Oh well, have to allocate and map a bounce buffer.
*/
gnttab_dma_unmap_page(dev_addr);
- buffer.page = virt_to_page(ptr);
- buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
+ buffer.page = page;
+ buffer.offset = offset_in_page(paddr);
map = map_single(hwdev, buffer, size, dir);
if (!map) {
swiotlb_full(hwdev, size, dir, 1);
@@ -537,6 +523,26 @@ swiotlb_map_single(struct device *hwdev,
return dev_addr;
}
+dma_addr_t
+swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
+ int dir, struct dma_attrs *attrs)
+{
+ return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, attrs);
+}
+EXPORT_SYMBOL(swiotlb_map_single_attrs);
+
+dma_addr_t
+swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
+{
+ return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, NULL);
+}
+
+dma_addr_t
+swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
+{
+ return _swiotlb_map_single(hwdev, paddr, size, dir, NULL);
+}
+
/*
* Unmap a single streaming mode DMA translation. The dma_addr and size must
* match what was provided for in a previous swiotlb_map_single call. All
@@ -546,8 +552,8 @@ swiotlb_map_single(struct device *hwdev,
* whatever the device wrote there.
*/
void
-swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
- int dir)
+swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, int dir, struct dma_attrs *attrs)
{
BUG_ON(dir == DMA_NONE);
if (in_swiotlb_aperture(dev_addr))
@@ -555,7 +561,14 @@ swiotlb_unmap_single(struct device *hwde
else
gnttab_dma_unmap_page(dev_addr);
}
+EXPORT_SYMBOL(swiotlb_unmap_single_attrs);
+void
+swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
+ int dir)
+{
+ return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL);
+}
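
The new exports funnel the old void *-based swiotlb_map_single() and the _attrs/_phys variants through one _swiotlb_map_single() helper, so a caller that only has a physical address (for example a highmem page) no longer needs a kernel virtual mapping. A hedged usage sketch follows; example_map_for_read() and example_unmap_for_read() are hypothetical, and real drivers would normally go through dma_map_single() rather than calling swiotlb directly.

/*
 * Hypothetical caller: map 'len' bytes of a page for device->CPU DMA via
 * the new phys_addr_t entry point, returning 0 on mapping failure.
 * Assumes the usual <linux/dma-mapping.h> definitions for DMA_FROM_DEVICE
 * and page_to_phys().
 */
static dma_addr_t example_map_for_read(struct device *dev, struct page *pg,
				       unsigned long offset, size_t len)
{
	phys_addr_t paddr = page_to_phys(pg) + offset;
	dma_addr_t bus = swiotlb_map_single_phys(dev, paddr, len,
						 DMA_FROM_DEVICE);

	if (swiotlb_dma_mapping_error(bus))
		return 0;		/* overflow buffer was handed back */
	return bus;
}

/* Hypothetical teardown once the device has finished writing. */
static void example_unmap_for_read(struct device *dev, dma_addr_t bus,
				   size_t len)
{
	swiotlb_unmap_single(dev, bus, len, DMA_FROM_DEVICE);
}
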
/*
* Make physical memory consistent for a single streaming mode DMA translation
* after a transfer.
@@ -584,6 +597,26 @@ swiotlb_sync_single_for_device(struct de
sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
}
+void
+swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+ unsigned long offset, size_t size, int dir)
+{
+ BUG_ON(dir == DMA_NONE);
+ if (in_swiotlb_aperture(dev_addr))
+ sync_single(hwdev, bus_to_virt(dev_addr + offset), size, dir);
+}
+
+void
+swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
+ unsigned long offset, size_t size, int dir)
+{
+ BUG_ON(dir == DMA_NONE);
+ if (in_swiotlb_aperture(dev_addr))
+ sync_single(hwdev, bus_to_virt(dev_addr + offset), size, dir);
+}
+
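
The two helpers just added let a driver sync only a sub-range of a streaming mapping instead of the whole buffer. A hedged sketch follows; example_peek() is hypothetical and 'bus' is assumed to come from an earlier swiotlb_map_single() call with DMA_FROM_DEVICE.

/*
 * Hypothetical: make 'len' bytes at 'offset' inside an existing streaming
 * mapping visible to the CPU, inspect them, then give the range back to
 * the device.  Only the named sub-range is bounced, not the whole buffer.
 */
static void example_peek(struct device *dev, dma_addr_t bus,
			 unsigned long offset, size_t len)
{
	swiotlb_sync_single_range_for_cpu(dev, bus, offset, len,
					  DMA_FROM_DEVICE);
	/* ... read or parse the CPU copy of bytes [offset, offset+len) ... */
	swiotlb_sync_single_range_for_device(dev, bus, offset, len,
					     DMA_FROM_DEVICE);
}
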
+void swiotlb_unmap_sg_attrs(struct device *, struct scatterlist *, int, int,
+ struct dma_attrs *);
/*
* Map a set of buffers described by scatterlist in streaming mode for DMA.
* This is the scatter-gather version of the above swiotlb_map_single
@@ -601,8 +634,8 @@ swiotlb_sync_single_for_device(struct de
* same here.
*/
int
-swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
- int dir)
+swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ int dir, struct dma_attrs *attrs)
{
struct scatterlist *sg;
struct phys_addr buffer;
@@ -626,7 +659,8 @@ swiotlb_map_sg(struct device *hwdev, str
/* Don't panic here, we expect map_sg users
to do proper error handling. */
swiotlb_full(hwdev, sg->length, dir, 0);
- swiotlb_unmap_sg(hwdev, sgl, i, dir);
+ swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
+ attrs);
sgl[0].dma_length = 0;
return 0;
}
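
The failure path above keeps the established map_sg contract: when bounce space runs out, the already-mapped prefix is unwound (now via swiotlb_unmap_sg_attrs()), sgl[0].dma_length is zeroed, and 0 is returned. A caller-side sketch of that contract; example_map_request() is hypothetical:

/*
 * Hypothetical caller: a 0 return means nothing is left mapped, so only
 * the caller's own state has to be backed out, not the scatterlist.
 */
static int example_map_request(struct device *dev, struct scatterlist *sgl,
			       int nents)
{
	int mapped = swiotlb_map_sg(dev, sgl, nents, DMA_TO_DEVICE);

	if (!mapped)
		return -ENOMEM;	/* swiotlb already undid the partial mapping */

	/* ... hand the 'mapped' entries to the hardware ... */
	return 0;
}
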
@@ -637,14 +671,22 @@ swiotlb_map_sg(struct device *hwdev, str
}
return nelems;
}
+EXPORT_SYMBOL(swiotlb_map_sg_attrs);
+
+int
+swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ int dir)
+{
+ return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
/*
* Unmap a set of streaming mode DMA translations. Again, cpu read rules
* concerning calls here are the same as for swiotlb_unmap_single() above.
*/
void
-swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
- int dir)
+swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, int dir, struct dma_attrs *attrs)
{
struct scatterlist *sg;
int i;
@@ -659,6 +701,14 @@ swiotlb_unmap_sg(struct device *hwdev, s
gnttab_dma_unmap_page(sg->dma_address);
}
}
+EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
+
+void
+swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ int dir)
+{
+ return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
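
As with the single-buffer paths, the old swiotlb_unmap_sg() name survives as a thin wrapper that forwards to the _attrs variant with NULL attributes. A matching hypothetical teardown for the example_map_request() sketch above; note that the original nents is passed back, not the mapped count map_sg returned.

/* Hypothetical teardown pairing with example_map_request() above. */
static void example_unmap_request(struct device *dev, struct scatterlist *sgl,
				  int nents)
{
	swiotlb_unmap_sg(dev, sgl, nents, DMA_TO_DEVICE);
}
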
/*
* Make physical memory consistent for a set of streaming mode DMA translations
@@ -699,46 +749,6 @@ swiotlb_sync_sg_for_device(struct device
}
}
-#ifdef CONFIG_HIGHMEM
-
-dma_addr_t
-swiotlb_map_page(struct device *hwdev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction direction)
-{
- struct phys_addr buffer;
- dma_addr_t dev_addr;
- char *map;
-
- dev_addr = gnttab_dma_map_page(page) + offset;
- if (address_needs_mapping(hwdev, dev_addr)) {
- gnttab_dma_unmap_page(dev_addr);
- buffer.page = page;
- buffer.offset = offset;
- map = map_single(hwdev, buffer, size, direction);
- if (!map) {
- swiotlb_full(hwdev, size, direction, 1);
- map = io_tlb_overflow_buffer;
- }
- dev_addr = (dma_addr_t)virt_to_bus(map);
- }
-
- return dev_addr;
-}
-
-void
-swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
- size_t size, enum dma_data_direction direction)
-{
- BUG_ON(direction == DMA_NONE);
- if (in_swiotlb_aperture(dma_address))
- unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
- else
- gnttab_dma_unmap_page(dma_address);
-}
-
-#endif
-
int
swiotlb_dma_mapping_error(dma_addr_t dma_addr)
{