From: Linux Kernel Mailing List
Subject: Linux: 2.6.29
Patch-mainline: 2.6.29

This patch contains the differences between 2.6.28 and 2.6.29.

Acked-by: Jeff Mahoney
Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches.py

--- head-2010-04-29.orig/arch/x86/Kconfig	2010-03-24 15:14:47.000000000 +0100
+++ head-2010-04-29/arch/x86/Kconfig	2010-03-24 15:17:58.000000000 +0100
@@ -331,7 +331,6 @@ config X86_XEN
 	select X86_PAE
 	select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST
 	select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST
-	select SWIOTLB
 	help
 	  Choose this option if you plan to run this kernel on top of the
 	  Xen Hypervisor.
@@ -369,7 +368,6 @@ config X86_64_XEN
 	bool "Enable Xen compatible kernel"
 	depends on X86_64
 	select XEN
-	select SWIOTLB
 	help
 	  This option will compile a kernel compatible with
 	  Xen hypervisor
@@ -819,7 +817,7 @@ config AMD_IOMMU_STATS

 # need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
-	def_bool y if X86_64
+	def_bool y if X86_64 || XEN
 	---help---
 	  Support for software bounce buffers used on x86-64 systems which
 	  don't have a hardware IOMMU (e.g. the current generation
@@ -925,7 +923,7 @@ config X86_XEN_GENAPIC
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
 	bool "Reroute for broken boot IRQs"
 	default n
-	depends on X86_IO_APIC
+	depends on X86_IO_APIC && !XEN
 	---help---
 	  This option enables a workaround that fixes a source of
 	  spurious interrupts. This is recommended when threaded
--- head-2010-04-29.orig/arch/x86/Makefile	2010-03-24 15:14:47.000000000 +0100
+++ head-2010-04-29/arch/x86/Makefile	2010-03-24 15:17:58.000000000 +0100
@@ -156,8 +156,8 @@ BOOT_TARGETS = bzlilo bzdisk fdimage fdi
 PHONY += bzImage vmlinuz $(BOOT_TARGETS)

 ifdef CONFIG_XEN
-KBUILD_CPPFLAGS := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
-	-I$(srctree)/arch/x86/include/mach-xen $(KBUILD_CPPFLAGS)
+LINUXINCLUDE := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
+	-I$(srctree)/arch/x86/include/mach-xen $(LINUXINCLUDE)

 ifdef CONFIG_X86_64
 LDFLAGS_vmlinux := -e startup_64
--- head-2010-04-29.orig/arch/x86/ia32/ia32entry-xen.S	2010-03-24 15:14:47.000000000 +0100
+++ head-2010-04-29/arch/x86/ia32/ia32entry-xen.S	2010-03-24 15:17:58.000000000 +0100
@@ -363,9 +363,9 @@ ENTRY(ia32_syscall)
 	orl   $TS_COMPAT,TI_status(%r10)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	jnz ia32_tracesys
-ia32_do_syscall:
 	cmpl $(IA32_NR_syscalls-1),%eax
-	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
+	ja ia32_badsys
+ia32_do_call:
 	IA32_ARG_FIXUP
 	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
 ia32_sysret:
@@ -380,7 +380,9 @@ ia32_tracesys:
 	call syscall_trace_enter
 	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
-	jmp ia32_do_syscall
+	cmpl $(IA32_NR_syscalls-1),%eax
+	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
+	jmp ia32_do_call
 END(ia32_syscall)

 ia32_badsys:
--- head-2010-04-29.orig/arch/x86/include/asm/hw_irq.h	2010-03-24 15:14:47.000000000 +0100
+++ head-2010-04-29/arch/x86/include/asm/hw_irq.h	2010-03-24 15:17:58.000000000 +0100
@@ -136,7 +136,9 @@ extern irqreturn_t smp_call_function_sin
 #endif
 #endif

+#ifndef CONFIG_XEN
 extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
+#endif

 typedef int vector_irq_t[NR_VECTORS];
 DECLARE_PER_CPU(vector_irq_t, vector_irq);
--- head-2010-04-29.orig/arch/x86/include/asm/hypervisor.h	2010-04-29 09:29:49.000000000 +0200
+++ head-2010-04-29/arch/x86/include/asm/hypervisor.h	2010-03-24 15:17:58.000000000 +0100
@@
-24,3 +24,7 @@ extern void init_hypervisor(struct cpuin extern void init_hypervisor_platform(void); #endif + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include_next +#endif --- head-2010-04-29.orig/arch/x86/include/asm/kexec.h 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/include/asm/kexec.h 2010-03-24 15:17:58.000000000 +0100 @@ -12,13 +12,10 @@ /* * The hypervisor interface implicitly requires that all entries (except * for possibly the final one) are arranged in matching PA_/VA_ pairs. +# define VA_PGD 3 */ -# define PA_PMD_0 8 -# define VA_PMD_0 9 -# define PA_PMD_1 10 -# define VA_PMD_1 11 -# define PA_SWAP_PAGE 12 -# define PAGES_NR 13 +# define PA_SWAP_PAGE 4 +# define PAGES_NR 5 # endif /* CONFIG_XEN */ #else # define PA_CONTROL_PAGE 0 --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/desc.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/desc.h 2010-03-24 15:17:58.000000000 +0100 @@ -342,16 +342,14 @@ static inline void set_intr_gate(unsigne _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); } -#define SYS_VECTOR_FREE 0 -#define SYS_VECTOR_ALLOCED 1 - extern int first_system_vector; -extern char system_vectors[]; +/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ +extern unsigned long used_vectors[]; static inline void alloc_system_vector(int vector) { - if (system_vectors[vector] == SYS_VECTOR_FREE) { - system_vectors[vector] = SYS_VECTOR_ALLOCED; + if (!test_bit(vector, used_vectors)) { + set_bit(vector, used_vectors); if (first_system_vector > vector) first_system_vector = vector; } else --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/fixmap_64.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/fixmap_64.h 2010-03-24 15:17:58.000000000 +0100 @@ -16,7 +16,6 @@ #include #include #include -#include #include /* @@ -52,11 +51,6 @@ enum fixed_addresses { FIX_ISAMAP_END, FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, #endif -#ifdef CONFIG_EFI - FIX_EFI_IO_MAP_LAST_PAGE, - FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE - + MAX_EFI_IO_PAGES - 1, -#endif #ifdef CONFIG_PARAVIRT FIX_PARAVIRT_BOOTMAP, #else --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/highmem.h 2010-03-24 17:05:09.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/highmem.h 2010-03-24 17:05:16.000000000 +0100 @@ -79,6 +79,7 @@ static inline void clear_user_highpage(s clear_highpage(page); } #define __HAVE_ARCH_CLEAR_HIGHPAGE +#define clear_user_highpage clear_user_highpage #define __HAVE_ARCH_CLEAR_USER_HIGHPAGE void copy_highpage(struct page *to, struct page *from); --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/hypervisor.h 2010-03-24 15:17:58.000000000 +0100 @@ -69,6 +69,8 @@ extern start_info_t *xen_start_info; #define is_initial_xendomain() 0 #endif +#define init_hypervisor(c) ((void)((c)->x86_hyper_vendor = X86_HYPER_VENDOR_XEN)) + struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu); /* arch/xen/kernel/evtchn.c */ @@ -139,7 +141,7 @@ void scrub_pages(void *, unsigned int); DECLARE_PER_CPU(bool, xen_lazy_mmu); -int xen_multicall_flush(bool); +void xen_multicall_flush(bool); int __must_check xen_multi_update_va_mapping(unsigned long va, pte_t, unsigned long flags); --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/io.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/io.h 2010-03-24 
15:17:58.000000000 +0100 @@ -4,6 +4,7 @@ #define ARCH_HAS_IOREMAP_WC #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -45,21 +46,39 @@ build_mmio_write(__writel, "l", unsigned #define mmiowb() barrier() #ifdef CONFIG_X86_64 + build_mmio_read(readq, "q", unsigned long, "=r", :"memory") -build_mmio_read(__readq, "q", unsigned long, "=r", ) build_mmio_write(writeq, "q", unsigned long, "r", :"memory") -build_mmio_write(__writeq, "q", unsigned long, "r", ) -#define readq_relaxed(a) __readq(a) -#define __raw_readq __readq -#define __raw_writeq writeq - -/* Let people know we have them */ -#define readq readq -#define writeq writeq +#else + +static inline __u64 readq(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 low, high; + + low = readl(p); + high = readl(p + 1); + + return low + ((u64)high << 32); +} + +static inline void writeq(__u64 val, volatile void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, addr+4); +} + #endif -extern int iommu_bio_merge; +#define readq_relaxed(a) readq(a) + +#define __raw_readq(a) readq(a) +#define __raw_writeq(val, addr) writeq(val, addr) + +/* Let people know that we have them */ +#define readq readq +#define writeq writeq #define native_io_delay xen_io_delay @@ -120,7 +139,6 @@ extern void __iomem *ioremap_wc(unsigned * A boot-time mapping is currently limited to at most 16 pages. */ extern void early_ioremap_init(void); -extern void early_ioremap_clear(void); extern void early_ioremap_reset(void); extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); extern void __iomem *early_memremap(unsigned long offset, unsigned long size); --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/irq_vectors.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/irq_vectors.h 2010-03-24 15:17:58.000000000 +0100 @@ -24,6 +24,8 @@ #define LAST_VM86_IRQ 15 #define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) +#define NR_IRQS_LEGACY 16 + /* * The flat IRQ space is divided into two regions: * 1. A one-to-one mapping of real physical IRQs. 
This space is only used @@ -36,8 +38,10 @@ #define PIRQ_BASE 0 #if defined(NR_CPUS) && defined(MAX_IO_APICS) -# if NR_CPUS < MAX_IO_APICS +# if !defined(CONFIG_SPARSE_IRQ) && NR_CPUS < MAX_IO_APICS # define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS) +# elif defined(CONFIG_SPARSE_IRQ) && 8 * NR_CPUS > 32 * MAX_IO_APICS +# define NR_PIRQS (NR_VECTORS + 8 * NR_CPUS) # else # define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS) # endif --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/mmu_context_32.h 2010-03-24 15:17:58.000000000 +0100 @@ -3,10 +3,9 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { -#if 0 /* XEN: no lazy tlb */ - unsigned cpu = smp_processor_id(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY; +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY); #endif } @@ -38,9 +37,9 @@ static inline void switch_mm(struct mm_s /* stop flush ipis for the previous mm */ cpu_clear(cpu, prev->cpu_vm_mask); -#if 0 /* XEN: no lazy tlb */ - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - per_cpu(cpu_tlbstate, cpu).active_mm = next; +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); + x86_write_percpu(cpu_tlbstate.active_mm, next); #endif cpu_set(cpu, next->cpu_vm_mask); @@ -62,10 +61,10 @@ static inline void switch_mm(struct mm_s BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); } -#if 0 /* XEN: no lazy tlb */ +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ else { - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next); if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { /* We were in lazy tlb mode and leave_mm disabled --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pci.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/pci.h 2010-03-24 15:17:58.000000000 +0100 @@ -22,6 +22,8 @@ struct pci_sysdata { }; extern int pci_routeirq; +extern int noioapicquirk; +extern int noioapicreroute; /* scan a bus after allocating a pci_sysdata for it */ extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, @@ -88,6 +90,8 @@ static inline void pci_dma_burst_advice( static inline void early_quirks(void) { } #endif +extern void pci_iommu_alloc(void); + #endif /* __KERNEL__ */ #ifdef CONFIG_X86_32 @@ -104,9 +108,9 @@ static inline void early_quirks(void) { #ifdef CONFIG_NUMA /* Returns the node based on pci bus */ -static inline int __pcibus_to_node(struct pci_bus *bus) +static inline int __pcibus_to_node(const struct pci_bus *bus) { - struct pci_sysdata *sd = bus->sysdata; + const struct pci_sysdata *sd = bus->sysdata; return sd->node; } @@ -115,6 +119,12 @@ static inline cpumask_t __pcibus_to_cpum { return node_to_cpumask(__pcibus_to_node(bus)); } + +static inline const struct cpumask * +cpumask_of_pcibus(const struct pci_bus *bus) +{ + return cpumask_of_node(__pcibus_to_node(bus)); +} #endif #endif /* _ASM_X86_PCI_H */ --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgtable.h 2010-03-24 15:14:47.000000000 +0100 +++ 
head-2010-04-29/arch/x86/include/mach-xen/asm/pgtable.h 2010-03-24 15:17:58.000000000 +0100 @@ -22,12 +22,10 @@ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ /* If _PAGE_BIT_PRESENT is clear, we use these: */ - -/* set: nonlinear file mapping, saved PTE; unset:swap */ -#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY - -/* if the user mapped it with PROT_NONE; pte_present gives true */ +/* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL +/* - set: nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) @@ -176,8 +174,19 @@ extern unsigned int __kernel_page_user; #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ #endif +/* + * Macro to mark a page protection value as UC- + */ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \ + : (prot)) + #ifndef __ASSEMBLY__ +#define pgprot_writecombine pgprot_writecombine +extern pgprot_t pgprot_writecombine(pgprot_t prot); + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. @@ -309,41 +318,43 @@ static inline pte_t pte_mkspecial(pte_t extern pteval_t __supported_pte_mask; -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) +/* + * Mask out unsupported bits in a present pgprot. Non-present pgprots + * can use those bits for other purposes, so leave them be. + */ +static inline pgprotval_t massage_pgprot(pgprot_t pgprot) { - pgprotval_t prot = pgprot_val(pgprot); + pgprotval_t protval = pgprot_val(pgprot); + + if (protval & _PAGE_PRESENT) + protval &= __supported_pte_mask; - if (prot & _PAGE_PRESENT) - prot &= __supported_pte_mask; - return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | prot); + return protval; } -static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot) +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { - pgprotval_t prot = pgprot_val(pgprot); + return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} - if (prot & _PAGE_PRESENT) - prot &= __supported_pte_mask; - return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | prot); +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot) +{ + return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { - pgprotval_t prot = pgprot_val(pgprot); - - if (prot & _PAGE_PRESENT) - prot &= __supported_pte_mask; - return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | prot); + return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); } static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { - pgprotval_t prot = pgprot_val(newprot); pteval_t val = pte_val(pte) & _PAGE_CHG_MASK; - if (prot & _PAGE_PRESENT) - prot &= __supported_pte_mask; - val |= prot & ~_PAGE_CHG_MASK; + val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK; return __pte(val); } @@ -359,11 +370,33 @@ static inline pgprot_t pgprot_modify(pgp #define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) -#define canon_pgprot(p) __pgprot(pgprot_val(p) & _PAGE_PRESENT \ - ? 
pgprot_val(p) & __supported_pte_mask \ - : pgprot_val(p)) +#define canon_pgprot(p) __pgprot(massage_pgprot(p)) + +static inline int is_new_memtype_allowed(unsigned long flags, + unsigned long new_flags) +{ + /* + * Certain new memtypes are not allowed with certain + * requested memtype: + * - request is uncached, return cannot be write-back + * - request is write-combine, return cannot be write-back + */ + if ((flags == _PAGE_CACHE_UC_MINUS && + new_flags == _PAGE_CACHE_WB) || + (flags == _PAGE_CACHE_WC && + new_flags == _PAGE_CACHE_WB)) { + return 0; + } + + return 1; +} #ifndef __ASSEMBLY__ +#ifndef CONFIG_XEN +/* Indicate that x86 has its own track and untrack pfn vma functions */ +#define __HAVE_PFNMAP_TRACKING +#endif + #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/pgtable-3level.h 2010-03-24 15:17:58.000000000 +0100 @@ -151,6 +151,7 @@ static inline int pte_none(pte_t pte) #define PTE_FILE_MAX_BITS 32 /* Encode and de-code a swap entry */ +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) #define __swp_offset(x) ((x).val >> 5) #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/pgtable_32.h 2010-03-24 15:17:58.000000000 +0100 @@ -107,15 +107,6 @@ extern unsigned long pg0[]; #endif /* - * Macro to mark a page protection value as "uncacheable". - * On processors which do not support it, this is a no-op. - */ -#define pgprot_noncached(prot) \ - ((boot_cpu_data.x86 > 3) \ - ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \ - : (prot)) - -/* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. */ --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/pgtable_64.h 2010-03-24 15:17:58.000000000 +0100 @@ -149,8 +149,8 @@ static inline void xen_pgd_clear(pgd_t * #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) - -#define MAXMEM _AC(0x000004ffffffffff, UL) +#define MAX_PHYSMEM_BITS 43 +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #define VMALLOC_START _AC(0xffffc20000000000, UL) #define VMALLOC_END _AC(0xffffe1ffffffffff, UL) #define VMEMMAP_START _AC(0xffffe20000000000, UL) @@ -183,12 +183,6 @@ static inline int pmd_bad(pmd_t pmd) #define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT) /* - * Macro to mark a page protection value as "uncacheable". - */ -#define pgprot_noncached(prot) \ - (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT)) - -/* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. 
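
[Editor's note: the massage_pgprot() refactor above folds one repeated three-line pattern out of pfn_pte(), pfn_pte_ma(), pfn_pmd() and pte_modify(). The point of masking only *present* entries is that non-present PTEs reuse the hardware bits for swap/file encodings, which must pass through untouched. A minimal user-space model of the idea; the mask value and names below are illustrative, not the kernel's:]

    /* editor's sketch, not kernel code */
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t pgprotval_t;

    #define _PAGE_PRESENT   (1ULL << 0)
    #define _PAGE_NX        (1ULL << 63)    /* pretend NX is unsupported */

    static const pgprotval_t supported_pte_mask = ~_PAGE_NX;

    static pgprotval_t massage_pgprot(pgprotval_t protval)
    {
            if (protval & _PAGE_PRESENT)
                    protval &= supported_pte_mask; /* drop unsupported bits */
            return protval;         /* non-present: swap bits pass through */
    }

    int main(void)
    {
            /* present mapping: NX stripped */
            printf("%llx\n", (unsigned long long)
                   massage_pgprot(_PAGE_PRESENT | _PAGE_NX));
            /* non-present (swap entry): bit 63 kept intact */
            printf("%llx\n", (unsigned long long)massage_pgprot(_PAGE_NX));
            return 0;
    }
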
*/ @@ -270,6 +264,8 @@ static inline int pud_large(pud_t pte) #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) #endif +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + #define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ & ((1U << SWP_TYPE_BITS) - 1)) #define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/processor.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/processor.h 2010-03-24 15:17:58.000000000 +0100 @@ -111,6 +111,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; #endif + unsigned int x86_hyper_vendor; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 @@ -124,6 +125,10 @@ struct cpuinfo_x86 { #define X86_VENDOR_UNKNOWN 0xff +#define X86_HYPER_VENDOR_NONE 0 +#define X86_HYPER_VENDOR_VMWARE 1 +#define X86_HYPER_VENDOR_XEN 'X' + /* * capabilities of CPUs */ @@ -354,7 +359,7 @@ struct i387_soft_struct { u8 no_update; u8 rm; u8 alimit; - struct info *info; + struct math_emu_info *info; u32 entry_eip; }; @@ -695,6 +700,19 @@ extern void switch_to_new_gdt(void); extern void cpu_init(void); extern void init_gdt(int cpu); +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/smp.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/smp.h 2010-03-24 15:17:58.000000000 +0100 @@ -18,9 +18,26 @@ #include #include +#ifdef CONFIG_X86_64 + +#define cpu_callin_mask cpu_possible_mask +#define cpu_callout_mask cpu_possible_mask +extern cpumask_var_t cpu_initialized_mask; +extern cpumask_var_t cpu_sibling_setup_mask; + +#else /* CONFIG_X86_32 */ + +#define cpu_callin_map cpu_possible_map #define cpu_callout_map cpu_possible_map extern cpumask_t cpu_initialized; -#define cpu_callin_map cpu_possible_map +extern cpumask_t cpu_sibling_setup_map; + +#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map) +#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map) +#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized) +#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map) + +#endif /* CONFIG_X86_32 */ extern void (*mtrr_hook)(void); extern void zap_low_mappings(void); @@ -29,7 +46,6 @@ extern int __cpuinit get_local_pda(int c extern int smp_num_siblings; extern unsigned int num_processors; -extern cpumask_t cpu_initialized; DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_t, cpu_core_map); @@ -38,6 +54,16 @@ DECLARE_PER_CPU(u16, cpu_llc_id); DECLARE_PER_CPU(int, cpu_number); #endif +static inline struct cpumask *cpu_sibling_mask(int cpu) +{ + return &per_cpu(cpu_sibling_map, cpu); +} + +static inline struct cpumask *cpu_core_mask(int cpu) +{ + return &per_cpu(cpu_core_map, cpu); +} + DECLARE_PER_CPU(u16, x86_cpu_to_apicid); DECLARE_PER_CPU(u16, x86_bios_cpu_apicid); @@ -64,7 +90,7 @@ struct smp_ops { void (*cpu_die)(unsigned int cpu); void (*play_dead)(void); - void (*send_call_func_ipi)(cpumask_t mask); + void (*send_call_func_ipi)(const struct cpumask *mask); void (*send_call_func_single_ipi)(int cpu); }; @@ -125,7 +151,7 @@ static inline void arch_send_call_functi static inline void 
arch_send_call_function_ipi(cpumask_t mask) { - smp_ops.send_call_func_ipi(mask); + smp_ops.send_call_func_ipi(&mask); } void cpu_disable_common(void); @@ -144,13 +170,13 @@ extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); void xen_smp_send_stop(void); void xen_smp_send_reschedule(int cpu); -void xen_send_call_func_ipi(cpumask_t mask); +void xen_send_call_func_ipi(const struct cpumask *mask); void xen_send_call_func_single_ipi(int cpu); #define smp_send_stop xen_smp_send_stop #define smp_send_reschedule xen_smp_send_reschedule #define arch_send_call_function_single_ipi xen_send_call_func_single_ipi -#define arch_send_call_function_ipi xen_send_call_func_ipi +#define arch_send_call_function_ipi(m) xen_send_call_func_ipi(&(m)) void play_dead(void); @@ -164,7 +190,7 @@ void smp_store_cpu_info(int id); /* We don't mark CPUs online until __cpu_up(), so we need another measure */ static inline int num_booting_cpus(void) { - return cpus_weight(cpu_callout_map); + return cpumask_weight(cpu_callout_mask); } #else static inline void prefill_possible_map(void) --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/spinlock.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/spinlock.h 2010-03-24 15:17:58.000000000 +0100 @@ -337,6 +337,7 @@ static inline int __raw_spin_is_contende { return __raw_spin(is_contended)(lock); } +#define __raw_spin_is_contended __raw_spin_is_contended static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) { --- head-2010-04-29.orig/arch/x86/include/mach-xen/asm/system.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/include/mach-xen/asm/system.h 2010-03-24 15:17:58.000000000 +0100 @@ -18,12 +18,12 @@ # define AT_VECTOR_SIZE_ARCH 1 #endif -#ifdef CONFIG_X86_32 - struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); +#ifdef CONFIG_X86_32 + /* * Saving eflags is important. It switches not only IOPL between tasks, * it also protects other tasks from NT leaking through sysenter etc. @@ -298,6 +298,8 @@ extern void free_init_pages(char *what, void xen_idle(void); +void stop_this_cpu(void *dummy); + /* * Force strict CPU ordering. 
* And yes, this is required on UP too when we're talking --- head-2010-04-29.orig/arch/x86/kernel/acpi/sleep-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/acpi/sleep-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -163,6 +163,8 @@ static int __init acpi_sleep_setup(char #ifdef CONFIG_HIBERNATION if (strncmp(str, "s4_nohwsig", 10) == 0) acpi_no_s4_hw_signature(); + if (strncmp(str, "s4_nonvs", 8) == 0) + acpi_s4_no_nvs(); #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); --- head-2010-04-29.orig/arch/x86/kernel/apic/apic-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/apic/apic-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -32,7 +32,7 @@ static int __init apic_set_verbosity(cha else if (strcmp("verbose", arg) == 0) apic_verbosity = APIC_VERBOSE; else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" + pr_warning("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } --- head-2010-04-29.orig/arch/x86/kernel/cpu/Makefile 2010-04-29 09:29:49.000000000 +0200 +++ head-2010-04-29/arch/x86/kernel/cpu/Makefile 2010-03-24 15:17:58.000000000 +0100 @@ -34,6 +34,8 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +disabled-obj-$(CONFIG_XEN) := hypervisor.o vmware.o + quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ --- head-2010-04-29.orig/arch/x86/kernel/cpu/common-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/cpu/common-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -38,17 +38,41 @@ #include #include #include +#include #ifdef CONFIG_XEN #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_LOCAL_APIC) #define phys_pkg_id(a,b) a #endif -#include #include #endif #include "cpu.h" +#ifdef CONFIG_X86_64 + +/* all of these masks are initialized in setup_cpu_local_masks() */ +#ifndef CONFIG_XEN +cpumask_var_t cpu_callin_mask; +cpumask_var_t cpu_callout_mask; +#endif +cpumask_var_t cpu_initialized_mask; + +/* representing cpus for which sibling maps can be computed */ +cpumask_var_t cpu_sibling_setup_mask; + +#else /* CONFIG_X86_32 */ + +#ifndef CONFIG_XEN +cpumask_t cpu_callin_map; +cpumask_t cpu_callout_map; +#endif +cpumask_t cpu_initialized; +cpumask_t cpu_sibling_setup_map; + +#endif /* CONFIG_X86_32 */ + + static struct cpu_dev *this_cpu __cpuinitdata; #ifdef CONFIG_X86_64 @@ -377,7 +401,7 @@ void __cpuinit detect_ht(struct cpuinfo_ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); } else if (smp_num_siblings > 1) { - if (smp_num_siblings > NR_CPUS) { + if (smp_num_siblings > nr_cpu_ids) { printk(KERN_WARNING "CPU: Unsupported number of siblings %d", smp_num_siblings); smp_num_siblings = 1; @@ -728,6 +752,7 @@ static void __cpuinit identify_cpu(struc detect_ht(c); #endif + init_hypervisor(c); /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -879,8 +904,6 @@ static __init int setup_disablecpuid(cha } __setup("clearcpuid=", setup_disablecpuid); -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; - #ifdef CONFIG_X86_64 struct x8664_pda **_cpu_pda __read_mostly; EXPORT_SYMBOL(_cpu_pda); @@ -889,7 +912,7 @@ EXPORT_SYMBOL(_cpu_pda); struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; #endif -char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; +static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; static void __ref switch_pt(int cpu) { 
@@ -949,8 +972,8 @@ void __cpuinit pda_init(int cpu) } #ifndef CONFIG_X86_NO_TSS -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] __page_aligned_bss; +static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + + DEBUG_STKSZ] __page_aligned_bss; #endif extern asmlinkage void ignore_sysret(void); @@ -1038,7 +1061,7 @@ void __cpuinit cpu_init(void) me = current; - if (cpu_test_and_set(cpu, cpu_initialized)) + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) panic("CPU#%d already initialized!\n", cpu); printk(KERN_INFO "Initializing CPU#%d\n", cpu); @@ -1163,7 +1186,7 @@ void __cpuinit cpu_init(void) #endif struct thread_struct *thread = &curr->thread; - if (cpu_test_and_set(cpu, cpu_initialized)) { + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); for (;;) local_irq_enable(); } --- head-2010-04-29.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2010-03-24 15:12:36.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/cpu/mtrr/main-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = { struct mtrr_ops *mtrr_if = &generic_mtrr_ops; unsigned int num_var_ranges; -unsigned int mtrr_usage_table[MAX_VAR_RANGES]; +unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static u64 tom2; --- head-2010-04-29.orig/arch/x86/kernel/e820-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/e820-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -719,6 +719,27 @@ void __init e820_mark_nosave_regions(uns } } #endif + +#ifdef CONFIG_HIBERNATION +/** + * Mark ACPI NVS memory region, so that we can save/restore it during + * hibernation and the subsequent resume. + */ +static int __init e820_mark_nvs_memory(void) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (ei->type == E820_NVS) + hibernate_nvs_register(ei->addr, ei->size); + } + + return 0; +} +core_initcall(e820_mark_nvs_memory); +#endif #endif /* @@ -734,22 +755,6 @@ struct early_res { static struct early_res early_res[MAX_EARLY_RES] __initdata = { #ifndef CONFIG_XEN { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE) - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, -#endif -#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" }, - /* - * Has to be in very low memory so we can execute - * real-mode AP code. - */ - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" }, -#endif #endif {} }; --- head-2010-04-29.orig/arch/x86/kernel/early_printk-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/early_printk-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -904,49 +904,6 @@ static struct console early_dbgp_console }; #endif -/* Console interface to a host file on AMD's SimNow! 
*/ - -static int simnow_fd; - -enum { - MAGIC1 = 0xBACCD00A, - MAGIC2 = 0xCA110000, - XOPEN = 5, - XWRITE = 4, -}; - -static noinline long simnow(long cmd, long a, long b, long c) -{ - long ret; - - asm volatile("cpuid" : - "=a" (ret) : - "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); - return ret; -} - -static void __init simnow_init(char *str) -{ - char *fn = "klog"; - - if (*str == '=') - fn = ++str; - /* error ignored */ - simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); -} - -static void simnow_write(struct console *con, const char *s, unsigned n) -{ - simnow(XWRITE, simnow_fd, (unsigned long)s, n); -} - -static struct console simnow_console = { - .name = "simnow", - .write = simnow_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; static int __initdata early_console_initialized; @@ -958,7 +915,7 @@ asmlinkage void early_printk(const char va_list ap; va_start(ap, fmt); - n = vscnprintf(buf, 512, fmt, ap); + n = vscnprintf(buf, sizeof(buf), fmt, ap); early_console->write(early_console, buf, n); va_end(ap); } @@ -991,10 +948,6 @@ static int __init setup_early_printk(cha current_ypos = boot_params.screen_info.orig_y; #endif early_console = &early_vga_console; - } else if (!strncmp(buf, "simnow", 6)) { - simnow_init(buf + 6); - early_console = &simnow_console; - keep_early = 1; #ifdef CONFIG_EARLY_PRINTK_DBGP } else if (!strncmp(buf, "dbgp", 4)) { if (early_dbgp_init(buf+4) < 0) --- head-2010-04-29.orig/arch/x86/kernel/entry_32-xen.S 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/entry_32-xen.S 2010-03-24 15:17:58.000000000 +0100 @@ -690,28 +690,37 @@ END(syscall_badsys) 27:; /* - * Build the entry stubs and pointer table with - * some assembler magic. + * Build the entry stubs and pointer table with some assembler magic. + * We pack 7 stubs into a single 32-byte chunk, which will fit in a + * single cache line on all modern x86 implementations. 
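
[Editor's note: the stub loop below pushes $(~vector+0x80) so that every immediate fits in a signed byte, i.e. a two-byte pushl; that size budget is what lets seven stubs share a 32-byte chunk. common_interrupt then undoes the bias with addl $-0x80,(%esp), recovering ~vector in the [-256,-1] range. A stand-alone check of that round-trip, an editorial sketch rather than part of the patch:]

    /* editor's sketch: verify the push/adjust encoding round-trip */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            for (int vector = 0; vector < 256; vector++) {
                    /* 127 - vector is in [-128,127], so it fits imm8 */
                    int32_t pushed = (int8_t)(~vector + 0x80);
                    /* what addl $-0x80,(%esp) leaves on the stack */
                    int32_t on_stack = pushed - 0x80;
                    assert(on_stack == ~vector);
                    assert(on_stack >= -256 && on_stack <= -1);
            }
            return 0;
    }
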
*/ -.section .rodata,"a" +.section .init.rodata,"a" ENTRY(interrupt) .text - + .p2align 5 + .p2align CONFIG_X86_L1_CACHE_SHIFT ENTRY(irq_entries_start) RING0_INT_FRAME -vector=0 -.rept NR_VECTORS - ALIGN - .if vector +vector=FIRST_EXTERNAL_VECTOR +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 + .balign 32 + .rept 7 + .if vector < NR_VECTORS + .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -4 - .endif -1: pushl $~(vector) + .endif +1: pushl $(~vector+0x80) /* Note: always in signed byte range */ CFI_ADJUST_CFA_OFFSET 4 - jmp common_interrupt - .previous + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 + jmp 2f + .endif + .previous .long 1b - .text + .text vector=vector+1 + .endif + .endr +2: jmp common_interrupt .endr END(irq_entries_start) @@ -723,8 +732,9 @@ END(interrupt) * the CPU automatically disables interrupts when executing an IRQ vector, * so IRQ-flags tracing has to follow that: */ - ALIGN + .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ SAVE_ALL TRACE_IRQS_OFF movl %esp,%eax @@ -751,68 +761,7 @@ ENDPROC(name) #else #define UNWIND_ESPFIX_STACK -#endif - -KPROBE_ENTRY(page_fault) - RING0_EC_FRAME - pushl $do_page_fault - CFI_ADJUST_CFA_OFFSET 4 - ALIGN -error_code: - /* the function address is in %fs's slot on the stack */ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET es, 0*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET ds, 0*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ebx, 0 - cld - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET fs, 0*/ - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs - UNWIND_ESPFIX_STACK - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - /*CFI_REGISTER es, ecx*/ - movl PT_FS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - mov %ecx, PT_FS(%esp) - /*CFI_REL_OFFSET fs, ES*/ - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es - TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(page_fault) -#ifdef CONFIG_XEN # A note on the "critical region" in our callback handler. # We want to avoid stacking callback handlers due to events occurring # during handling of the last event. To do this, we keep events disabled @@ -981,158 +930,6 @@ ENTRY(device_not_available) CFI_ENDPROC END(device_not_available) -#ifndef CONFIG_XEN -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. 
- */ -#define FIX_STACK(offset, ok, label) \ - cmpw $__KERNEL_CS,4(%esp); \ - jne ok; \ -label: \ - movl SYSENTER_stack_sp0+offset(%esp),%esp; \ - CFI_DEF_CFA esp, 0; \ - CFI_UNDEFINED eip; \ - pushfl; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $__KERNEL_CS; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $sysenter_past_esp; \ - CFI_ADJUST_CFA_OFFSET 4; \ - CFI_REL_OFFSET eip, 0 -#endif /* CONFIG_XEN */ - -KPROBE_ENTRY(debug) - RING0_INT_FRAME -#ifndef CONFIG_XEN - cmpl $ia32_sysenter_target,(%esp) - jne debug_stack_correct - FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) -debug_stack_correct: -#endif /* !CONFIG_XEN */ - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(debug) - -#ifndef CONFIG_XEN -/* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. - */ -KPROBE_ENTRY(nmi) - RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - je nmi_espfix_stack - cmpl $ia32_sysenter_target,(%esp) - je nmi_stack_fixup - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - movl %esp,%eax - /* Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - /* We have a RING0_INT_FRAME here */ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_nocheck_notrace - CFI_ENDPROC - -nmi_stack_fixup: - RING0_INT_FRAME - FIX_STACK(12,nmi_stack_correct, 1) - jmp nmi_stack_correct - -nmi_debug_stack_check: - /* We have a RING0_INT_FRAME here */ - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct - FIX_STACK(24,nmi_stack_correct, 1) - jmp nmi_stack_correct - -nmi_espfix_stack: - /* We have a RING0_INT_FRAME here. 
- * - * create the pointer to lss back - */ - pushl %ss - CFI_ADJUST_CFA_OFFSET 4 - pushl %esp - CFI_ADJUST_CFA_OFFSET 4 - addw $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - CFI_ADJUST_CFA_OFFSET 4 - .endr - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi - RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - CFI_ADJUST_CFA_OFFSET -24 - jmp irq_return - CFI_ENDPROC -#else -KPROBE_ENTRY(nmi) - RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - orl $NMI_MASK, PT_EFLAGS(%esp) - jmp restore_all - CFI_ENDPROC -#endif -KPROBE_END(nmi) - #ifdef CONFIG_PARAVIRT ENTRY(native_iret) iret @@ -1148,19 +945,6 @@ ENTRY(native_irq_enable_sysexit) END(native_irq_enable_sysexit) #endif -KPROBE_ENTRY(int3) - RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(int3) - ENTRY(overflow) RING0_INT_FRAME pushl $0 @@ -1225,14 +1009,6 @@ ENTRY(stack_segment) CFI_ENDPROC END(stack_segment) -KPROBE_ENTRY(general_protection) - RING0_EC_FRAME - pushl $do_general_protection - CFI_ADJUST_CFA_OFFSET 4 - jmp error_code - CFI_ENDPROC -KPROBE_END(general_protection) - ENTRY(alignment_check) RING0_EC_FRAME pushl $do_alignment_check @@ -1292,6 +1068,7 @@ ENTRY(kernel_thread_helper) push %eax CFI_ADJUST_CFA_OFFSET 4 call do_exit + ud2 # padding for call trace CFI_ENDPROC ENDPROC(kernel_thread_helper) @@ -1303,6 +1080,9 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + pushl %eax pushl %ecx pushl %edx @@ -1317,6 +1097,11 @@ ftrace_call: popl %edx popl %ecx popl %eax +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif .globl ftrace_stub ftrace_stub: @@ -1326,8 +1111,18 @@ END(ftrace_caller) #else /* ! 
CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpl $ftrace_stub, ftrace_trace_function jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif .globl ftrace_stub ftrace_stub: ret @@ -1346,12 +1141,43 @@ trace: popl %edx popl %ecx popl %eax - jmp ftrace_stub END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %edx + lea 0x4(%ebp), %eax + subl $MCOUNT_INSN_SIZE, %edx + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl $0 + pushl %eax + pushl %ecx + pushl %edx + call ftrace_return_to_handler + movl %eax, 0xc(%esp) + popl %edx + popl %ecx + popl %eax + ret +#endif + #include # pv syscall call handler stub @@ -1485,3 +1311,238 @@ mask=0 #undef sys_fork #undef sys_clone #undef sys_vfork + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 + ALIGN +error_code: + /* the function address is in %fs's slot on the stack */ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + cld + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0*/ + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_REGISTER es, ecx*/ + movl PT_FS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + mov %ecx, PT_FS(%esp) + /*CFI_REL_OFFSET fs, ES*/ + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC +END(page_fault) + +#ifndef CONFIG_XEN +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. 
+ */ +#define FIX_STACK(offset, ok, label) \ + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ +label: \ + movl TSS_sysenter_sp0+offset(%esp),%esp; \ + CFI_DEF_CFA esp, 0; \ + CFI_UNDEFINED eip; \ + pushfl; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $__KERNEL_CS; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $sysenter_past_esp; \ + CFI_ADJUST_CFA_OFFSET 4; \ + CFI_REL_OFFSET eip, 0 +#endif /* CONFIG_XEN */ + +ENTRY(debug) + RING0_INT_FRAME +#ifndef CONFIG_XEN + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +debug_stack_correct: +#endif /* !CONFIG_XEN */ + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 +#ifndef CONFIG_XEN + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + je nmi_espfix_stack + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + jae nmi_stack_correct + cmpl $ia32_sysenter_target,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_nocheck_notrace + CFI_ENDPROC + +nmi_stack_fixup: + RING0_INT_FRAME + FIX_STACK(12,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK(24,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_espfix_stack: + /* We have a RING0_INT_FRAME here. 
+ * + * create the pointer to lss back + */ + pushl %ss + CFI_ADJUST_CFA_OFFSET 4 + pushl %esp + CFI_ADJUST_CFA_OFFSET 4 + addw $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + CFI_ADJUST_CFA_OFFSET 4 + .endr + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 + jmp irq_return +#else + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + orl $NMI_MASK, PT_EFLAGS(%esp) + jmp restore_all +#endif + CFI_ENDPROC +END(nmi) + +ENTRY(int3) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + CFI_ENDPROC +END(int3) + +ENTRY(general_protection) + RING0_EC_FRAME + pushl $do_general_protection + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(general_protection) + +/* + * End of kprobes section + */ + .popsection --- head-2010-04-29.orig/arch/x86/kernel/entry_64-xen.S 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/entry_64-xen.S 2010-03-24 15:17:58.000000000 +0100 @@ -14,15 +14,15 @@ * * NOTE: This code handles signal-recognition, which happens every time * after an interrupt and after each system call. - * - * Normal syscalls and interrupts don't save a full stack frame, this is + * + * Normal syscalls and interrupts don't save a full stack frame, this is * only done for syscall tracing, signals or fork/exec et.al. - * - * A note on terminology: - * - top of stack: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. + * + * A note on terminology: + * - top of stack: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. * - partial stack frame: partially saved registers upto R11. - * - full stack frame: Like partial stack frame, but all register saved. + * - full stack frame: Like partial stack frame, but all register saved. * * Some macro usage: * - CFI macros are used to generate dwarf2 unwind information for better @@ -65,7 +65,6 @@ #define __AUDIT_ARCH_LE 0x40000000 .code64 - #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) @@ -73,16 +72,10 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi @@ -92,14 +85,13 @@ ENTRY(ftrace_caller) ftrace_call: call ftrace_stub - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif .globl ftrace_stub ftrace_stub: @@ -108,15 +100,63 @@ END(ftrace_caller) #else /* ! 
CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpq $ftrace_stub, ftrace_trace_function jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif + .globl ftrace_stub ftrace_stub: retq trace: - /* taken from glibc */ - subq $0x38, %rsp + MCOUNT_SAVE_FRAME + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi + + call *ftrace_trace_function + + MCOUNT_RESTORE_FRAME + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + MCOUNT_SAVE_FRAME + + leaq 8(%rbp), %rdi + movq 0x38(%rsp), %rsi + subq $MCOUNT_INSN_SIZE, %rsi + + call prepare_ftrace_return + + MCOUNT_RESTORE_FRAME + + retq +END(ftrace_graph_caller) + + +.globl return_to_handler +return_to_handler: + subq $80, %rsp + movq %rax, (%rsp) movq %rcx, 8(%rsp) movq %rdx, 16(%rsp) @@ -124,13 +164,14 @@ trace: movq %rdi, 32(%rsp) movq %r8, 40(%rsp) movq %r9, 48(%rsp) + movq %r10, 56(%rsp) + movq %r11, 64(%rsp) - movq 0x38(%rsp), %rdi - movq 8(%rbp), %rsi - subq $MCOUNT_INSN_SIZE, %rdi - - call *ftrace_trace_function + call ftrace_return_to_handler + movq %rax, 72(%rsp) + movq 64(%rsp), %r11 + movq 56(%rsp), %r10 movq 48(%rsp), %r9 movq 40(%rsp), %r8 movq 32(%rsp), %rdi @@ -138,16 +179,14 @@ trace: movq 16(%rsp), %rdx movq 8(%rsp), %rcx movq (%rsp), %rax - addq $0x38, %rsp + addq $72, %rsp + retq +#endif - jmp ftrace_stub -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args -#endif +#endif .macro TRACE_IRQS_IRETQ offset=ARGOFFSET @@ -162,20 +201,20 @@ END(mcount) NMI_MASK = 0x80000000 /* - * C code is not supposed to know about undefined top of stack. Every time - * a C function with an pt_regs argument is called from the SYSCALL based + * C code is not supposed to know about undefined top of stack. Every time + * a C function with an pt_regs argument is called from the SYSCALL based * fast path FIXUP_TOP_OF_STACK is needed. * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs * manipulation. 
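
[Editor's note: on the SYSCALL fast path the hardware never writes the CS and RCX slots of the partial frame, so C code handed a pt_regs would see stale values there. What the FIXUP_TOP_OF_STACK macro below stores, modeled in C; the field layout and selector value are illustrative, not authoritative:]

    /* editor's sketch, not kernel code */
    #define SKETCH_USER_CS 0x33UL   /* illustrative user code selector */

    struct pt_regs_sketch {
            unsigned long rcx;      /* SYSCALL left the return RIP here */
            unsigned long cs;       /* undefined on the fast path */
    };

    static void fixup_top_of_stack(struct pt_regs_sketch *regs)
    {
            regs->cs  = SKETCH_USER_CS; /* movq $__USER_CS,CS+\offset(%rsp) */
            regs->rcx = -1UL;           /* movq $-1,RCX+\offset(%rsp) */
    }
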
- */ - - /* %rsp:at FRAMEEND */ - .macro FIXUP_TOP_OF_STACK tmp - movq $__USER_CS,CS(%rsp) - movq $-1,RCX(%rsp) + */ + + /* %rsp:at FRAMEEND */ + .macro FIXUP_TOP_OF_STACK tmp offset=0 + movq $__USER_CS,CS+\offset(%rsp) + movq $-1,RCX+\offset(%rsp) .endm - .macro RESTORE_TOP_OF_STACK tmp,offset=0 + .macro RESTORE_TOP_OF_STACK tmp offset=0 .endm .macro FAKE_STACK_FRAME child_rip @@ -187,7 +226,7 @@ NMI_MASK = 0x80000000 pushq %rax /* rsp */ CFI_ADJUST_CFA_OFFSET 8 CFI_REL_OFFSET rsp,0 - pushq $(1<<9) /* eflags - interrupts on */ + pushq $X86_EFLAGS_IF /* eflags - interrupts on */ CFI_ADJUST_CFA_OFFSET 8 /*CFI_REL_OFFSET rflags,0*/ pushq $__KERNEL_CS /* cs */ @@ -205,36 +244,80 @@ NMI_MASK = 0x80000000 CFI_ADJUST_CFA_OFFSET -(6*8) .endm - .macro CFI_DEFAULT_STACK start=1,adj=0 +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro EMPTY_FRAME start=1 offset=0 .if \start - CFI_STARTPROC simple + CFI_STARTPROC simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8 - \adj*ARGOFFSET + CFI_DEF_CFA rsp,8+\offset .else - CFI_DEF_CFA_OFFSET SS+8 - \adj*ARGOFFSET + CFI_DEF_CFA_OFFSET 8+\offset .endif - .if \adj == 0 - CFI_REL_OFFSET r15,R15 - CFI_REL_OFFSET r14,R14 - CFI_REL_OFFSET r13,R13 - CFI_REL_OFFSET r12,R12 - CFI_REL_OFFSET rbp,RBP - CFI_REL_OFFSET rbx,RBX + .endm + +/* + * initial frame state for syscall + */ + .macro BASIC_FRAME start=1 offset=0 + EMPTY_FRAME \start, SS+8+\offset-RIP + /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ + CFI_REL_OFFSET rsp, RSP+\offset-RIP + /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ + /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ + CFI_REL_OFFSET rip, RIP+\offset-RIP + .endm + +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro INTR_FRAME start=1 offset=0 + .if \start == 1 + BASIC_FRAME 1, \offset+2*8 + CFI_REL_OFFSET rcx, 0+\offset + CFI_REL_OFFSET r11, 8+\offset + .else + BASIC_FRAME \start, \offset .endif - CFI_REL_OFFSET r11,R11 - \adj*ARGOFFSET - CFI_REL_OFFSET r10,R10 - \adj*ARGOFFSET - CFI_REL_OFFSET r9,R9 - \adj*ARGOFFSET - CFI_REL_OFFSET r8,R8 - \adj*ARGOFFSET - CFI_REL_OFFSET rax,RAX - \adj*ARGOFFSET - CFI_REL_OFFSET rcx,RCX - \adj*ARGOFFSET - CFI_REL_OFFSET rdx,RDX - \adj*ARGOFFSET - CFI_REL_OFFSET rsi,RSI - \adj*ARGOFFSET - CFI_REL_OFFSET rdi,RDI - \adj*ARGOFFSET - CFI_REL_OFFSET rip,RIP - \adj*ARGOFFSET - /*CFI_REL_OFFSET cs,CS - \adj*ARGOFFSET*/ - /*CFI_REL_OFFSET rflags,EFLAGS - \adj*ARGOFFSET*/ - CFI_REL_OFFSET rsp,RSP - \adj*ARGOFFSET - /*CFI_REL_OFFSET ss,SS - \adj*ARGOFFSET*/ + .endm + +/* + * initial frame state for exceptions with error code (and interrupts + * with vector already pushed) + */ + .macro XCPT_FRAME start=1 offset=0 + INTR_FRAME \start, RIP+\offset-ORIG_RAX + /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/ + .endm + +/* + * frame that enables calling into C. + */ + .macro PARTIAL_FRAME start=1 offset=0 + XCPT_FRAME 2*\start, ORIG_RAX+\offset-ARGOFFSET + CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET + CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET + CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET + CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET + CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET + CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET + CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET + CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET + CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET + .endm + +/* + * frame that enables passing a complete pt_regs to a C function. 
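
[Editor's note: the new frame macros nest the same way the register save area nests, each layer annotating the registers the previous layer left unsaved. One way to picture the layering; this is an editorial illustration with abridged, assumed field order, not the kernel's pt_regs definition:]

    /* EMPTY_FRAME: CFA only, nothing saved yet */
    struct frame_basic {    /* BASIC_FRAME: hardware iret frame */
            unsigned long rip, cs, rflags, rsp, ss;
    };
    struct frame_partial {  /* PARTIAL_FRAME: caller-clobbered regs */
            unsigned long r11, r10, r9, r8, rax, rcx, rdx, rsi, rdi;
            unsigned long orig_rax;
            struct frame_basic hw;
    };
    struct frame_default {  /* DEFAULT_FRAME: adds callee-saved regs */
            unsigned long r15, r14, r13, r12, rbp, rbx;
            struct frame_partial rest;
    };
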
+ */ + .macro DEFAULT_FRAME start=1 offset=0 + PARTIAL_FRAME \start, R11+\offset-R15 + CFI_REL_OFFSET rbx, RBX+\offset + CFI_REL_OFFSET rbp, RBP+\offset + CFI_REL_OFFSET r12, R12+\offset + CFI_REL_OFFSET r13, R13+\offset + CFI_REL_OFFSET r14, R14+\offset + CFI_REL_OFFSET r15, R15+\offset .endm /* @@ -264,70 +347,149 @@ NMI_MASK = 0x80000000 jmp hypercall_page + (__HYPERVISOR_iret * 32) .endm +#ifndef CONFIG_XEN +/* save partial stack frame */ +ENTRY(save_args) + XCPT_FRAME + cld + movq_cfi rdi, RDI+16-ARGOFFSET + movq_cfi rsi, RSI+16-ARGOFFSET + movq_cfi rdx, RDX+16-ARGOFFSET + movq_cfi rcx, RCX+16-ARGOFFSET + movq_cfi rax, RAX+16-ARGOFFSET + movq_cfi r8, R8+16-ARGOFFSET + movq_cfi r9, R9+16-ARGOFFSET + movq_cfi r10, R10+16-ARGOFFSET + movq_cfi r11, R11+16-ARGOFFSET + + leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ + movq_cfi rbp, 8 /* push %rbp */ + leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ + testl $3, CS(%rdi) + je 1f + SWAPGS + /* + * irqcount is used to check if a CPU is already on an interrupt stack + * or not. While this is essentially redundant with preempt_count it is + * a little cheaper to use a separate counter in the PDA (short of + * moving irq_enter into assembly, which would be too much work) + */ +1: incl %gs:pda_irqcount + jne 2f + popq_cfi %rax /* move return address... */ + mov %gs:pda_irqstackptr,%rsp + EMPTY_FRAME 0 + pushq_cfi %rbp /* backlink for unwinder */ + pushq_cfi %rax /* ... to the new stack */ + /* + * We entered an interrupt context - irqs are off: + */ +2: TRACE_IRQS_OFF + ret + CFI_ENDPROC +END(save_args) +#endif + +ENTRY(save_rest) + PARTIAL_FRAME 1 REST_SKIP+8 + movq 5*8+16(%rsp), %r11 /* save return address */ + movq_cfi rbx, RBX+16 + movq_cfi rbp, RBP+16 + movq_cfi r12, R12+16 + movq_cfi r13, R13+16 + movq_cfi r14, R14+16 + movq_cfi r15, R15+16 + movq %r11, 8(%rsp) /* return address */ + FIXUP_TOP_OF_STACK %r11, 16 + ret + CFI_ENDPROC +END(save_rest) + +#ifndef CONFIG_XEN +/* save complete stack frame */ +ENTRY(save_paranoid) + XCPT_FRAME 1 RDI+8 + cld + movq_cfi rdi, RDI+8 + movq_cfi rsi, RSI+8 + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq_cfi r8, R8+8 + movq_cfi r9, R9+8 + movq_cfi r10, R10+8 + movq_cfi r11, R11+8 + movq_cfi rbx, RBX+8 + movq_cfi rbp, RBP+8 + movq_cfi r12, R12+8 + movq_cfi r13, R13+8 + movq_cfi r14, R14+8 + movq_cfi r15, R15+8 + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret + CFI_ENDPROC +END(save_paranoid) +#endif + /* - * A newly forked process directly context switches into this. - */ -/* rdi: prev */ + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ ENTRY(ret_from_fork) - CFI_DEFAULT_STACK + DEFAULT_FRAME + push kernel_eflags(%rip) CFI_ADJUST_CFA_OFFSET 8 - popf # reset kernel eflags + popf # reset kernel eflags CFI_ADJUST_CFA_OFFSET -8 - call schedule_tail + + call schedule_tail # rdi: 'prev' task parameter + GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) - jnz rff_trace -rff_action: + + CFI_REMEMBER_STATE RESTORE_REST - testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? + + testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 
je int_ret_from_sys_call - testl $_TIF_IA32,TI_flags(%rcx) + + testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET jnz int_ret_from_sys_call - RESTORE_TOP_OF_STACK %rdi,ARGOFFSET - jmp ret_from_sys_call -rff_trace: - movq %rsp,%rdi - call syscall_trace_leave - GET_THREAD_INFO(%rcx) - jmp rff_action + + RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET + jmp ret_from_sys_call # go to the SYSRET fastpath + + CFI_RESTORE_STATE CFI_ENDPROC END(ret_from_fork) /* - * initial frame state for interrupts and exceptions - */ - .macro _frame ref - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-\ref - /*CFI_REL_OFFSET ss,SS-\ref*/ - CFI_REL_OFFSET rsp,RSP-\ref - /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ - /*CFI_REL_OFFSET cs,CS-\ref*/ - CFI_REL_OFFSET rip,RIP-\ref - .endm - -/* * System call entry. Up to 6 arguments in registers are supported. * * SYSCALL does not save anything on the stack and does not change the * stack pointer. */ - + /* - * Register setup: + * Register setup: * rax system call number * rdi arg0 - * rcx return address for syscall/sysret, C arg3 + * rcx return address for syscall/sysret, C arg3 * rsi arg1 - * rdx arg2 + * rdx arg2 * r10 arg3 (--> moved to rcx for C) * r8 arg4 * r9 arg5 * r11 eflags for syscall/sysret, temporary for C - * r12-r15,rbp,rbx saved by C code, not touched. - * + * r12-r15,rbp,rbx saved by C code, not touched. + * * Interrupts are enabled on entry. * Only called from user space. * @@ -337,10 +499,10 @@ END(ret_from_fork) * When user can change the frames always force IRET. That is because * it deals with uncanonical addresses better. SYSRET has trouble * with them due to bugs in both AMD and Intel CPUs. - */ + */ ENTRY(system_call) - _frame (RIP-0x10) + INTR_FRAME start=2 offset=2*8 SAVE_ARGS -8,0 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%rcx) @@ -354,19 +516,19 @@ system_call_fastpath: movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) - * Has incomplete stack frame and undefined top of stack. - */ + * Has incomplete stack frame and undefined top of stack. + */ ret_from_sys_call: movl $_TIF_ALLWORK_MASK,%edi /* edi: flagmask */ -sysret_check: +sysret_check: LOCKDEP_SYS_EXIT GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl TI_flags(%rcx),%edx andl %edi,%edx - jnz sysret_careful + jnz sysret_careful CFI_REMEMBER_STATE /* * sysretq will re-enable interrupts: @@ -378,7 +540,7 @@ sysret_check: CFI_RESTORE_STATE /* Handle reschedules */ - /* edx: work, edi: workmask */ + /* edx: work, edi: workmask */ sysret_careful: bt $TIF_NEED_RESCHED,%edx jnc sysret_signal @@ -391,7 +553,7 @@ sysret_careful: CFI_ADJUST_CFA_OFFSET -8 jmp sysret_check - /* Handle a signal */ + /* Handle a signal */ sysret_signal: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) @@ -400,17 +562,20 @@ sysret_signal: jc sysret_audit #endif /* edx: work flags (arg3) */ - leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 - call ptregscall_common + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call do_notify_resume + RESTORE_TOP_OF_STACK %r11 + RESTORE_REST movl $_TIF_WORK_MASK,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK.
*/ DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check - + badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) jmp ret_from_sys_call @@ -449,7 +614,7 @@ sysret_audit: #endif /* CONFIG_AUDITSYSCALL */ /* Do syscall tracing */ -tracesys: +tracesys: #ifdef CONFIG_AUDITSYSCALL testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) jz auditsys @@ -472,8 +637,8 @@ tracesys: call *sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ - -/* + +/* * Syscall return path ending with IRET. * Has correct top of stack, but partial stack frame. */ @@ -521,18 +686,18 @@ int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST - /* Check for syscall exit trace */ + /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal pushq %rdi CFI_ADJUST_CFA_OFFSET 8 - leaq 8(%rsp),%rdi # &ptregs -> arg1 + leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave popq %rdi CFI_ADJUST_CFA_OFFSET -8 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest - + int_signal: testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f @@ -547,22 +712,24 @@ int_restore_rest: jmp int_with_check CFI_ENDPROC END(system_call) - -/* + +/* * Certain special system calls that need to save a complete full stack frame. - */ - + */ .macro PTREGSCALL label,func,arg - .globl \label -\label: - leaq \func(%rip),%rax - leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ - jmp ptregscall_common +ENTRY(\label) + PARTIAL_FRAME 1 8 /* offset 8: return address */ + subq $REST_SKIP, %rsp + CFI_ADJUST_CFA_OFFSET REST_SKIP + call save_rest + DEFAULT_FRAME 0 8 /* offset 8: return address */ + leaq 8(%rsp), \arg /* pt_regs pointer */ + call \func + jmp ptregscall_common + CFI_ENDPROC END(\label) .endm - CFI_STARTPROC - PTREGSCALL stub_clone, sys_clone, %r8 PTREGSCALL stub_fork, sys_fork, %rdi PTREGSCALL stub_vfork, sys_vfork, %rdi @@ -570,25 +737,18 @@ END(\label) PTREGSCALL stub_iopl, sys_iopl, %rsi ENTRY(ptregscall_common) - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rip, r11 - SAVE_REST - movq %r11, %r15 - CFI_REGISTER rip, r15 - FIXUP_TOP_OF_STACK %r11 - call *%rax - RESTORE_TOP_OF_STACK %r11 - movq %r15, %r11 - CFI_REGISTER rip, r11 - RESTORE_REST - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rip, 0 - ret + DEFAULT_FRAME 1 8 /* offset 8: return address */ + RESTORE_TOP_OF_STACK %r11, 8 + movq_cfi_restore R15+8, r15 + movq_cfi_restore R14+8, r14 + movq_cfi_restore R13+8, r13 + movq_cfi_restore R12+8, r12 + movq_cfi_restore RBP+8, rbp + movq_cfi_restore RBX+8, rbx + ret $REST_SKIP /* pop extended registers */ CFI_ENDPROC END(ptregscall_common) - + ENTRY(stub_execve) CFI_STARTPROC popq %r11 @@ -604,11 +764,11 @@ ENTRY(stub_execve) jmp int_ret_from_sys_call CFI_ENDPROC END(stub_execve) - + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. - */ + */ ENTRY(stub_rt_sigreturn) CFI_STARTPROC addq $8, %rsp @@ -623,24 +783,12 @@ ENTRY(stub_rt_sigreturn) CFI_ENDPROC END(stub_rt_sigreturn) -/* initial frame state for interrupts (and exceptions without error code) */ -#define INTR_FRAME _frame (RIP-0x10); \ - CFI_REL_OFFSET rcx,0; \ - CFI_REL_OFFSET r11,8 - -/* initial frame state for exceptions with error code (and interrupts with - vector already pushed) */ -#define XCPT_FRAME _frame (RIP-0x18); \ - CFI_REL_OFFSET rcx,0; \ - CFI_REL_OFFSET r11,8 - -/* +/* * Interrupt exit. 
- * */ retint_with_reschedule: - CFI_DEFAULT_STACK adj=1 + PARTIAL_FRAME movl $_TIF_WORK_MASK,%edi retint_check: LOCKDEP_SYS_EXIT_IRQ @@ -669,20 +817,20 @@ retint_careful: pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule - popq %rdi + popq %rdi CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp retint_check - + retint_signal: testl $_TIF_DO_NOTIFY_MASK,%edx jz retint_restore_args TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST - movq $-1,ORIG_RAX(%rsp) + movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume @@ -704,324 +852,132 @@ ENTRY(retint_kernel) jnc retint_restore_args call preempt_schedule_irq jmp retint_kernel /* check again */ -#endif +#endif CFI_ENDPROC END(retint_check) - + #ifndef CONFIG_XEN /* * APIC interrupts. - */ - .macro apicinterrupt num,func + */ +.macro apicinterrupt num sym do_sym +ENTRY(\sym) INTR_FRAME pushq $~(\num) CFI_ADJUST_CFA_OFFSET 8 - interrupt \func + interrupt \do_sym jmp error_entry CFI_ENDPROC - .endm +END(\sym) +.endm -ENTRY(thermal_interrupt) - apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt -END(thermal_interrupt) - -ENTRY(threshold_interrupt) - apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt -END(threshold_interrupt) - -#ifdef CONFIG_SMP -ENTRY(reschedule_interrupt) - apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt -END(reschedule_interrupt) - - .macro INVALIDATE_ENTRY num -ENTRY(invalidate_interrupt\num) - apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt -END(invalidate_interrupt\num) - .endm +#ifdef CONFIG_SMP +apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ + irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +#endif + +apicinterrupt UV_BAU_MESSAGE \ + uv_bau_message_intr1 uv_bau_message_interrupt +apicinterrupt LOCAL_TIMER_VECTOR \ + apic_timer_interrupt smp_apic_timer_interrupt + +#ifdef CONFIG_SMP +apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ + invalidate_interrupt0 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ + invalidate_interrupt1 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ + invalidate_interrupt2 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ + invalidate_interrupt3 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \ + invalidate_interrupt4 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \ + invalidate_interrupt5 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \ + invalidate_interrupt6 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ + invalidate_interrupt7 smp_invalidate_interrupt +#endif - INVALIDATE_ENTRY 0 - INVALIDATE_ENTRY 1 - INVALIDATE_ENTRY 2 - INVALIDATE_ENTRY 3 - INVALIDATE_ENTRY 4 - INVALIDATE_ENTRY 5 - INVALIDATE_ENTRY 6 - INVALIDATE_ENTRY 7 - -ENTRY(call_function_interrupt) - apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt -END(call_function_interrupt) -ENTRY(call_function_single_interrupt) - apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt -END(call_function_single_interrupt) -ENTRY(irq_move_cleanup_interrupt) - apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt -END(irq_move_cleanup_interrupt) +apicinterrupt THRESHOLD_APIC_VECTOR \ + threshold_interrupt mce_threshold_interrupt +apicinterrupt THERMAL_APIC_VECTOR \ + thermal_interrupt smp_thermal_interrupt + +#ifdef CONFIG_SMP +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ + call_function_single_interrupt 
smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR \ + call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR \ + reschedule_interrupt smp_reschedule_interrupt #endif -ENTRY(apic_timer_interrupt) - apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt -END(apic_timer_interrupt) - -ENTRY(uv_bau_message_intr1) - apicinterrupt 220,uv_bau_message_interrupt -END(uv_bau_message_intr1) - -ENTRY(error_interrupt) - apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt -END(error_interrupt) - -ENTRY(spurious_interrupt) - apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt -END(spurious_interrupt) +apicinterrupt ERROR_APIC_VECTOR \ + error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR \ + spurious_interrupt smp_spurious_interrupt #endif /* !CONFIG_XEN */ - + /* * Exception entry points. - */ - .macro zeroentry sym + */ +.macro zeroentry sym do_sym +ENTRY(\sym) INTR_FRAME movq (%rsp),%rcx CFI_RESTORE rcx movq 8(%rsp),%r11 CFI_RESTORE r11 - addq $0x10,%rsp /* skip rcx and r11 */ - CFI_ADJUST_CFA_OFFSET -0x10 - pushq $0 /* push error code/oldrax */ - CFI_ADJUST_CFA_OFFSET 8 - pushq %rax /* push real oldrax to the rdi slot */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rax,0 - leaq \sym(%rip),%rax - jmp error_entry + movq $-1,8(%rsp) /* ORIG_RAX: no syscall to restart */ + subq $(15-1)*8,%rsp + CFI_ADJUST_CFA_OFFSET (15-1)*8 + call error_entry + DEFAULT_FRAME 0 + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + call \do_sym + jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +END(\sym) +.endm + +.macro paranoidzeroentry sym do_sym + zeroentry \sym \do_sym +.endm + +.macro paranoidzeroentry_ist sym do_sym ist + zeroentry \sym \do_sym +.endm - .macro errorentry sym +.macro errorentry sym do_sym +ENTRY(\sym) XCPT_FRAME movq (%rsp),%rcx CFI_RESTORE rcx movq 8(%rsp),%r11 CFI_RESTORE r11 - addq $0x10,%rsp /* rsp points to the error code */ - CFI_ADJUST_CFA_OFFSET -0x10 - pushq %rax - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rax,0 - leaq \sym(%rip),%rax - jmp error_entry + subq $(15-2)*8,%rsp + CFI_ADJUST_CFA_OFFSET (15-2)*8 + call error_entry + DEFAULT_FRAME 0 + movq %rsp,%rdi /* pt_regs pointer */ + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + call \do_sym + jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +END(\sym) +.endm -#if 0 /* not XEN */ /* error code is on the stack already */ - /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym, ist=0, irqtrace=1 - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x10,%rsp /* skip rcx and r11 */ - SAVE_ALL - cld -#if 0 /* not XEN */ - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f - SWAPGS - xorl %ebx,%ebx -1: -#endif - .if \ist - movq %gs:pda_data_offset, %rbp - .endif - .if \irqtrace - TRACE_IRQS_OFF - .endif - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi - movq $-1,ORIG_RAX(%rsp) - .if \ist - subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) - .endif - call \sym - .if \ist - addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) - .endif - DISABLE_INTERRUPTS(CLBR_NONE) - .if \irqtrace - TRACE_IRQS_OFF - .endif - .endm - - /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take - * any kernel state for granted. 
- * We don't do kernel preemption checks here, because only - * NMI should be common and it does not enable IRQs and - * cannot get reschedule ticks. - * - * "trace" is 0 for the NMI handler only, because irq-tracing - * is fundamentally NMI-unsafe. (we cannot change the soft and - * hard flags at once, atomically) - */ - .macro paranoidexit trace=1 - /* ebx: no swapgs flag */ -paranoid_exit\trace: - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore\trace - testl $3,CS(%rsp) - jnz paranoid_userspace\trace -paranoid_swapgs\trace: - .if \trace - TRACE_IRQS_IRETQ 0 - .endif - SWAPGS_UNSAFE_STACK -paranoid_restore\trace: - RESTORE_ALL 8 - jmp irq_return -paranoid_userspace\trace: - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs\trace - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule\trace - movl %ebx,%edx /* arg3: thread flags */ - .if \trace - TRACE_IRQS_ON - .endif - ENABLE_INTERRUPTS(CLBR_NONE) - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - .if \trace - TRACE_IRQS_OFF - .endif - jmp paranoid_userspace\trace -paranoid_schedule\trace: - .if \trace - TRACE_IRQS_ON - .endif - ENABLE_INTERRUPTS(CLBR_ANY) - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) - .if \trace - TRACE_IRQS_OFF - .endif - jmp paranoid_userspace\trace - CFI_ENDPROC - .endm -#endif +.macro paranoiderrorentry sym do_sym + errorentry \sym \do_sym +.endm /* - * Exception entry point. This expects an error code/orig_rax on the stack - * and the exception handler in %rax. - */ -KPROBE_ENTRY(error_entry) - _frame RDI - CFI_REL_OFFSET rax,0 - /* rdi slot contains rax, oldrax contains error code */ - cld - subq $14*8,%rsp - CFI_ADJUST_CFA_OFFSET (14*8) - movq %rsi,13*8(%rsp) - CFI_REL_OFFSET rsi,RSI - movq 14*8(%rsp),%rsi /* load rax from rdi slot */ - CFI_REGISTER rax,rsi - movq %rdx,12*8(%rsp) - CFI_REL_OFFSET rdx,RDX - movq %rcx,11*8(%rsp) - CFI_REL_OFFSET rcx,RCX - movq %rsi,10*8(%rsp) /* store rax */ - CFI_REL_OFFSET rax,RAX - movq %r8, 9*8(%rsp) - CFI_REL_OFFSET r8,R8 - movq %r9, 8*8(%rsp) - CFI_REL_OFFSET r9,R9 - movq %r10,7*8(%rsp) - CFI_REL_OFFSET r10,R10 - movq %r11,6*8(%rsp) - CFI_REL_OFFSET r11,R11 - movq %rbx,5*8(%rsp) - CFI_REL_OFFSET rbx,RBX - movq %rbp,4*8(%rsp) - CFI_REL_OFFSET rbp,RBP - movq %r12,3*8(%rsp) - CFI_REL_OFFSET r12,R12 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13,R13 - movq %r14,1*8(%rsp) - CFI_REL_OFFSET r14,R14 - movq %r15,(%rsp) - CFI_REL_OFFSET r15,R15 -#if 0 - cmpl $__KERNEL_CS,CS(%rsp) - CFI_REMEMBER_STATE - je error_kernelspace -#endif -error_call_handler: - movq %rdi, RDI(%rsp) - CFI_REL_OFFSET rdi,RDI - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi # get error code - movq $-1,ORIG_RAX(%rsp) - call *%rax -error_exit: - RESTORE_REST - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - testb $3,CS-ARGOFFSET(%rsp) - jz retint_kernel - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful - jmp retint_restore_args - -#if 0 - /* - * We need to re-write the logic here because we don't do iretq to - * to return to user mode. It's still possible that we get trap/fault - * in the kernel (when accessing buffers pointed to by system calls, - * for example). 
- * - */ - CFI_RESTORE_STATE -error_kernelspace: - incl %ebx - /* There are two places in the kernel that can potentially fault with - usergs. Handle them here. The exception handlers after - iret run with kernel gs again, so don't set the user space flag. - B stepping K8s sometimes report an truncated RIP for IRET - exceptions returning to compat mode. Check for these here too. */ - leaq irq_return(%rip),%rcx - cmpq %rcx,RIP(%rsp) - je error_swapgs - movl %ecx,%ecx /* zero extend */ - cmpq %rcx,RIP(%rsp) - je error_swapgs - cmpq $gs_change,RIP(%rsp) - je error_swapgs - jmp error_sti -#endif - CFI_ENDPROC -KPROBE_END(error_entry) - -ENTRY(hypervisor_callback) - zeroentry do_hypervisor_callback -END(hypervisor_callback) - -/* * Copied from arch/xen/i386/kernel/entry.S */ # A note on the "critical region" in our callback handler. @@ -1041,7 +997,7 @@ ENTRY(do_hypervisor_callback) # do_hyp # see the correct pointer to the pt_regs movq %rdi, %rsp # we don't return, adjust the stack frame CFI_ENDPROC - CFI_DEFAULT_STACK + DEFAULT_FRAME 11: incl %gs:pda_irqcount movq %rsp,%rbp CFI_DEF_CFA_REGISTER rbp @@ -1057,7 +1013,7 @@ END(do_hypervisor_callback) ALIGN restore_all_enable_events: - CFI_DEFAULT_STACK adj=1 + PARTIAL_FRAME TRACE_IRQS_ON __ENABLE_INTERRUPTS @@ -1093,9 +1049,7 @@ ecrit: /**** END OF CRITICAL REGION *** # We distinguish between categories by comparing each saved segment register # with its current contents: any discrepancy means we in category 1. ENTRY(failsafe_callback) - _frame (RIP-0x30) - CFI_REL_OFFSET rcx, 0 - CFI_REL_OFFSET r11, 8 + INTR_FRAME offset=4*8 movw %ds,%cx cmpw %cx,0x10(%rsp) CFI_REMEMBER_STATE @@ -1131,20 +1085,19 @@ ENTRY(failsafe_callback) SAVE_ALL jmp error_exit CFI_ENDPROC -#if 0 - .section __ex_table,"a" - .align 8 - .quad gs_change,bad_gs - .previous - .section .fixup,"ax" - /* running with kernelgs */ -bad_gs: -/* swapgs */ /* switch back to user gs */ - xorl %eax,%eax - movl %eax,%gs - jmp 2b - .previous -#endif + +zeroentry divide_error do_divide_error +zeroentry overflow do_overflow +zeroentry bounds do_bounds +zeroentry invalid_op do_invalid_op +zeroentry device_not_available do_device_not_available +zeroentry hypervisor_callback do_hypervisor_callback +zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun +errorentry invalid_TSS do_invalid_TSS +errorentry segment_not_present do_segment_not_present +zeroentry coprocessor_error do_coprocessor_error +errorentry alignment_check do_alignment_check +zeroentry simd_coprocessor_error do_simd_coprocessor_error /* * Create a kernel thread. @@ -1168,7 +1121,7 @@ ENTRY(kernel_thread) xorl %r8d,%r8d xorl %r9d,%r9d - + # clone now call do_fork movq %rax,RAX(%rsp) @@ -1179,15 +1132,15 @@ ENTRY(kernel_thread) * so internally to the x86_64 port you can rely on kernel_thread() * not to reschedule the child before returning, this avoids the need * of hacks for example to fork off the per-CPU idle tasks. - * [Hopefully no generic code relies on the reschedule -AK] + * [Hopefully no generic code relies on the reschedule -AK] */ RESTORE_ALL UNFAKE_STACK_FRAME ret CFI_ENDPROC -ENDPROC(kernel_thread) - -child_rip: +END(kernel_thread) + +ENTRY(child_rip) pushq $0 # fake return address CFI_STARTPROC /* @@ -1200,8 +1153,9 @@ child_rip: # exit mov %eax, %edi call do_exit + ud2 # padding for call trace CFI_ENDPROC -ENDPROC(child_rip) +END(child_rip) /* * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 
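As background for the kernel_execve hunk that follows: a minimal illustrative sketch — not part of the patch, and iretq_frame is a made-up name — of the five-slot frame that IRETQ consumes. SYSRET restores only %rip (from %rcx) and %rflags (from %r11), taking %cs/%ss from the STAR MSR, which is why a freshly exec'd image, whose entry point, stack and flags all belong to the new program, has to leave the kernel through the IRET path:

/* Layout popped by IRETQ in long mode, lowest address (top of stack) first. */
struct iretq_frame {
	unsigned long rip;	/* entry point of the new program */
	unsigned long cs;	/* user code segment selector */
	unsigned long rflags;	/* flags for the new image */
	unsigned long rsp;	/* new user stack pointer */
	unsigned long ss;	/* user stack segment selector */
};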
@@ -1221,10 +1175,10 @@ ENDPROC(child_rip) ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 - SAVE_ALL + SAVE_ALL movq %rsp,%rcx call sys_execve - movq %rax, RAX(%rsp) + movq %rax, RAX(%rsp) RESTORE_REST testq %rax,%rax jne 1f @@ -1233,132 +1187,7 @@ ENTRY(kernel_execve) UNFAKE_STACK_FRAME ret CFI_ENDPROC -ENDPROC(kernel_execve) - -KPROBE_ENTRY(page_fault) - errorentry do_page_fault -KPROBE_END(page_fault) - -ENTRY(coprocessor_error) - zeroentry do_coprocessor_error -END(coprocessor_error) - -ENTRY(simd_coprocessor_error) - zeroentry do_simd_coprocessor_error -END(simd_coprocessor_error) - -ENTRY(device_not_available) - zeroentry do_device_not_available -END(device_not_available) - - /* runs on exception stack */ -KPROBE_ENTRY(debug) -/* INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 */ - zeroentry do_debug -/* paranoidexit - CFI_ENDPROC */ -KPROBE_END(debug) - -KPROBE_ENTRY(nmi) - zeroentry do_nmi_callback -KPROBE_END(nmi) -do_nmi_callback: - CFI_STARTPROC - addq $8, %rsp - CFI_ENDPROC - CFI_DEFAULT_STACK - call do_nmi - orl $NMI_MASK,EFLAGS(%rsp) - RESTORE_REST - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - jmp retint_restore_args - CFI_ENDPROC -END(do_nmi_callback) - -KPROBE_ENTRY(int3) -/* INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 */ - zeroentry do_int3 -/* jmp paranoid_exit1 - CFI_ENDPROC */ -KPROBE_END(int3) - -ENTRY(overflow) - zeroentry do_overflow -END(overflow) - -ENTRY(bounds) - zeroentry do_bounds -END(bounds) - -ENTRY(invalid_op) - zeroentry do_invalid_op -END(invalid_op) - -ENTRY(coprocessor_segment_overrun) - zeroentry do_coprocessor_segment_overrun -END(coprocessor_segment_overrun) - -#if 0 - /* runs on exception stack */ -ENTRY(double_fault) - XCPT_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - paranoidentry do_double_fault - jmp paranoid_exit1 - CFI_ENDPROC -END(double_fault) -#endif - -ENTRY(invalid_TSS) - errorentry do_invalid_TSS -END(invalid_TSS) - -ENTRY(segment_not_present) - errorentry do_segment_not_present -END(segment_not_present) - - /* runs on exception stack */ -ENTRY(stack_segment) -/* XCPT_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - paranoidentry do_stack_segment */ - errorentry do_stack_segment -/* jmp paranoid_exit1 - CFI_ENDPROC */ -END(stack_segment) - -KPROBE_ENTRY(general_protection) - errorentry do_general_protection -KPROBE_END(general_protection) - -ENTRY(alignment_check) - errorentry do_alignment_check -END(alignment_check) - -ENTRY(divide_error) - zeroentry do_divide_error -END(divide_error) - -#ifndef CONFIG_XEN -ENTRY(spurious_interrupt_bug) - zeroentry do_spurious_interrupt_bug -END(spurious_interrupt_bug) -#endif - -#ifdef CONFIG_X86_MCE - /* runs on exception stack */ -KPROBE_ENTRY(machine_check) - zeroentry do_machine_check -END(machine_check) -#endif +END(kernel_execve) /* Call softirq on interrupt stack. Interrupts are off. 
*/ ENTRY(call_softirq) @@ -1378,24 +1207,191 @@ ENTRY(call_softirq) decl %gs:pda_irqcount ret CFI_ENDPROC -ENDPROC(call_softirq) +END(call_softirq) + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +paranoidzeroentry_ist debug do_debug DEBUG_STACK +zeroentry nmi do_nmi_callback +paranoidzeroentry_ist int3 do_int3 DEBUG_STACK +paranoiderrorentry stack_segment do_stack_segment +errorentry general_protection do_general_protection +errorentry page_fault do_page_fault +#ifdef CONFIG_X86_MCE +paranoidzeroentry machine_check do_machine_check +#endif + +#ifndef CONFIG_XEN + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + + /* ebx: no swapgs flag */ +ENTRY(paranoid_exit) + INTR_FRAME + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore + testl $3,CS(%rsp) + jnz paranoid_userspace +paranoid_swapgs: + TRACE_IRQS_IRETQ 0 + SWAPGS_UNSAFE_STACK +paranoid_restore: + RESTORE_ALL 8 + jmp irq_return +paranoid_userspace: + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule + movl %ebx,%edx /* arg3: thread flags */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp paranoid_userspace +paranoid_schedule: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + call schedule + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + jmp paranoid_userspace + CFI_ENDPROC +END(paranoid_exit) +#endif + +/* + * Exception entry point. This expects an error code/orig_rax on the stack. + * returns in "no swapgs flag" in %ebx. + */ +ENTRY(error_entry) + XCPT_FRAME 2 + CFI_ADJUST_CFA_OFFSET 15*8 + /* oldrax contains error code */ + cld + movq_cfi rdi, RDI+8 + movq_cfi rsi, RSI+8 + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq_cfi r8, R8+8 + movq_cfi r9, R9+8 + movq_cfi r10, R10+8 + movq_cfi r11, R11+8 + movq_cfi rbx, RBX+8 + movq_cfi rbp, RBP+8 + movq_cfi r12, R12+8 + movq_cfi r13, R13+8 + movq_cfi r14, R14+8 + movq_cfi r15, R15+8 +#ifndef CONFIG_XEN + xorl %ebx,%ebx + testl $3,CS+8(%rsp) + je error_kernelspace +error_swapgs: + SWAPGS +error_sti: +#endif + TRACE_IRQS_OFF + ret + CFI_ENDPROC + +#ifndef CONFIG_XEN +/* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. The exception handlers after iret run with + * kernel gs again, so don't set the user space flag. B stepping K8s + * sometimes report a truncated RIP for IRET exceptions returning to + * compat mode. Check for these here too.
+ */ +error_kernelspace: + incl %ebx + leaq irq_return(%rip),%rcx + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + movl %ecx,%ecx /* zero extend */ + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + cmpq $gs_change,RIP+8(%rsp) + je error_swapgs + jmp error_sti +#endif +END(error_entry) + + +ENTRY(error_exit) + DEFAULT_FRAME + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + testb $3,CS-ARGOFFSET(%rsp) + jz retint_kernel + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful + jmp retint_restore_args + CFI_ENDPROC +END(error_exit) + + +do_nmi_callback: + CFI_STARTPROC + addq $8, %rsp + CFI_ENDPROC + DEFAULT_FRAME + call do_nmi + orl $NMI_MASK,EFLAGS(%rsp) + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_restore_args + CFI_ENDPROC +END(do_nmi_callback) + #ifndef CONFIG_IA32_EMULATION -KPROBE_ENTRY(ignore_sysret) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-RIP+16 -/* CFI_REL_OFFSET ss,SS-RIP+16 */ - CFI_REL_OFFSET rsp,RSP-RIP+16 -/* CFI_REL_OFFSET rflags,EFLAGS-RIP+16 */ -/* CFI_REL_OFFSET cs,CS-RIP+16 */ - CFI_REL_OFFSET rip,RIP-RIP+16 +ENTRY(ignore_sysret) + INTR_FRAME popq %rcx CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx popq %r11 CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE r11 mov $-ENOSYS,%eax HYPERVISOR_IRET 0 CFI_ENDPROC -ENDPROC(ignore_sysret) +END(ignore_sysret) #endif + +/* + * End of kprobes section + */ + .popsection --- head-2010-04-29.orig/arch/x86/kernel/head-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/head-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -36,7 +36,6 @@ void __init reserve_ebda_region(void) /* start of EBDA area */ ebda_addr = get_bios_ebda(); - printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem); /* Fixup: bios puts an EBDA in the top 64K segment */ /* of conventional memory, but does not adjust lowmem. */ --- head-2010-04-29.orig/arch/x86/kernel/head32-xen.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/head32-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -12,9 +12,12 @@ #include #include #include +#include void __init i386_start_kernel(void) { + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); #ifndef CONFIG_XEN --- head-2010-04-29.orig/arch/x86/kernel/head64-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/head64-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -31,9 +31,10 @@ #include #include #include +#include /* boot cpu pda */ -static struct x8664_pda _boot_cpu_pda __read_mostly; +static struct x8664_pda _boot_cpu_pda; #ifdef CONFIG_SMP /* @@ -163,6 +164,8 @@ void __init x86_64_start_reservations(ch { copy_bootdata(__va(real_mode_data)); + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE), --- head-2010-04-29.orig/arch/x86/kernel/apic/io_apic-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/apic/io_apic-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -122,102 +122,276 @@ static int __init parse_noapic(char *str } early_param("noapic", parse_noapic); +#ifndef CONFIG_XEN struct irq_pin_list; + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. 
+ */ + +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +{ + struct irq_pin_list *pin; + int node; + + node = cpu_to_node(cpu); + + pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); + + return pin; +} + struct irq_cfg { -#ifndef CONFIG_XEN - unsigned int irq; struct irq_pin_list *irq_2_pin; - cpumask_t domain; - cpumask_t old_domain; + cpumask_var_t domain; + cpumask_var_t old_domain; unsigned move_cleanup_count; -#endif u8 vector; -#ifndef CONFIG_XEN u8 move_in_progress : 1; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + u8 move_desc_pending : 1; #endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +#ifdef CONFIG_SPARSE_IRQ +static struct irq_cfg irq_cfgx[] = { +#else static struct irq_cfg irq_cfgx[NR_IRQS] = { - [0] = { .irq = 0 }, - [1] = { .irq = 1 }, - [2] = { .irq = 2 }, - [3] = { .irq = 3 }, - [4] = { .irq = 4 }, - [5] = { .irq = 5 }, - [6] = { .irq = 6 }, - [7] = { .irq = 7 }, - [8] = { .irq = 8 }, - [9] = { .irq = 9 }, - [10] = { .irq = 10 }, - [11] = { .irq = 11 }, - [12] = { .irq = 12 }, - [13] = { .irq = 13 }, - [14] = { .irq = 14 }, - [15] = { .irq = 15 }, +#endif + [0] = { .vector = IRQ0_VECTOR, }, + [1] = { .vector = IRQ1_VECTOR, }, + [2] = { .vector = IRQ2_VECTOR, }, + [3] = { .vector = IRQ3_VECTOR, }, + [4] = { .vector = IRQ4_VECTOR, }, + [5] = { .vector = IRQ5_VECTOR, }, + [6] = { .vector = IRQ6_VECTOR, }, + [7] = { .vector = IRQ7_VECTOR, }, + [8] = { .vector = IRQ8_VECTOR, }, + [9] = { .vector = IRQ9_VECTOR, }, + [10] = { .vector = IRQ10_VECTOR, }, + [11] = { .vector = IRQ11_VECTOR, }, + [12] = { .vector = IRQ12_VECTOR, }, + [13] = { .vector = IRQ13_VECTOR, }, + [14] = { .vector = IRQ14_VECTOR, }, + [15] = { .vector = IRQ15_VECTOR, }, }; -#define for_each_irq_cfg(irq, cfg) \ - for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) +int __init arch_early_irq_init(void) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + int count; + int i; + + cfg = irq_cfgx; + count = ARRAY_SIZE(irq_cfgx); + for (i = 0; i < count; i++) { + desc = irq_to_desc(i); + desc->chip_data = &cfg[i]; + alloc_bootmem_cpumask_var(&cfg[i].domain); + alloc_bootmem_cpumask_var(&cfg[i].old_domain); + if (i < NR_IRQS_LEGACY) + cpumask_setall(cfg[i].domain); + } + + return 0; +} + +#ifdef CONFIG_SPARSE_IRQ static struct irq_cfg *irq_cfg(unsigned int irq) { - return irq < nr_irqs ? irq_cfgx + irq : NULL; + struct irq_cfg *cfg = NULL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + if (desc) + cfg = desc->chip_data; + + return cfg; } -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +static struct irq_cfg *get_one_free_irq_cfg(int cpu) { - return irq_cfg(irq); + struct irq_cfg *cfg; + int node; + + node = cpu_to_node(cpu); + + cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); + if (cfg) { + if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { + kfree(cfg); + cfg = NULL; + } else if (!alloc_cpumask_var_node(&cfg->old_domain, + GFP_ATOMIC, node)) { + free_cpumask_var(cfg->domain); + kfree(cfg); + cfg = NULL; + } else { + cpumask_clear(cfg->domain); + cpumask_clear(cfg->old_domain); + } + } + + return cfg; } -#ifdef CONFIG_XEN -#define irq_2_pin_init() -#define add_pin_to_irq(irq, apic, pin) -#else -/* - * Rough estimation of how many shared IRQs there are, can be changed - * anytime. 
- */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) +int arch_init_chip_data(struct irq_desc *desc, int cpu) +{ + struct irq_cfg *cfg; -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ + cfg = desc->chip_data; + if (!cfg) { + desc->chip_data = get_one_free_irq_cfg(cpu); + if (!desc->chip_data) { + printk(KERN_ERR "can not alloc irq_cfg\n"); + BUG_ON(1); + } + } -struct irq_pin_list { - int apic, pin; - struct irq_pin_list *next; -}; + return 0; +} -static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; -static struct irq_pin_list *irq_2_pin_ptr; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC -static void __init irq_2_pin_init(void) +static void +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) { - struct irq_pin_list *pin = irq_2_pin_head; - int i; + struct irq_pin_list *old_entry, *head, *tail, *entry; + + cfg->irq_2_pin = NULL; + old_entry = old_cfg->irq_2_pin; + if (!old_entry) + return; + + entry = get_one_free_irq_2_pin(cpu); + if (!entry) + return; + + entry->apic = old_entry->apic; + entry->pin = old_entry->pin; + head = entry; + tail = entry; + old_entry = old_entry->next; + while (old_entry) { + entry = get_one_free_irq_2_pin(cpu); + if (!entry) { + entry = head; + while (entry) { + head = entry->next; + kfree(entry); + entry = head; + } + /* still use the old one */ + return; + } + entry->apic = old_entry->apic; + entry->pin = old_entry->pin; + tail->next = entry; + tail = entry; + old_entry = old_entry->next; + } - for (i = 1; i < PIN_MAP_SIZE; i++) - pin[i-1].next = &pin[i]; + tail->next = NULL; + cfg->irq_2_pin = head; +} + +static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry, *next; + + if (old_cfg->irq_2_pin == cfg->irq_2_pin) + return; + + entry = old_cfg->irq_2_pin; - irq_2_pin_ptr = &pin[0]; + while (entry) { + next = entry->next; + kfree(entry); + entry = next; + } + old_cfg->irq_2_pin = NULL; } -static struct irq_pin_list *get_one_free_irq_2_pin(void) +void arch_init_copy_chip_data(struct irq_desc *old_desc, + struct irq_desc *desc, int cpu) { - struct irq_pin_list *pin = irq_2_pin_ptr; + struct irq_cfg *cfg; + struct irq_cfg *old_cfg; - if (!pin) - panic("can not get more irq_2_pin\n"); + cfg = get_one_free_irq_cfg(cpu); - irq_2_pin_ptr = pin->next; - pin->next = NULL; - return pin; + if (!cfg) + return; + + desc->chip_data = cfg; + + old_cfg = old_desc->chip_data; + + memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); + + init_copy_irq_2_pin(old_cfg, cfg, cpu); +} + +static void free_irq_cfg(struct irq_cfg *old_cfg) +{ + kfree(old_cfg); +} + +void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) +{ + struct irq_cfg *old_cfg, *cfg; + + old_cfg = old_desc->chip_data; + cfg = desc->chip_data; + + if (old_cfg == cfg) + return; + + if (old_cfg) { + free_irq_2_pin(old_cfg, cfg); + free_irq_cfg(old_cfg); + old_desc->chip_data = NULL; + } +} + +static void +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg = desc->chip_data; + + if (!cfg->move_in_progress) { + /* it means that domain is not changed */ + if (!cpumask_intersects(&desc->affinity, mask)) + cfg->move_desc_pending = 1; + } +} +#endif + +#else +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq < nr_irqs ? 
irq_cfgx + irq : NULL; +} + +#endif + +#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC +static inline void +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) +{ } +#endif struct io_apic { unsigned int index; @@ -230,7 +404,7 @@ static __attribute_const__ struct io_api return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); } -#endif +#endif /* CONFIG_XEN */ static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { @@ -285,11 +459,10 @@ static inline void io_apic_modify(unsign writel(value, &io_apic->data); } -static bool io_apic_level_ack_pending(unsigned int irq) +static bool io_apic_level_ack_pending(struct irq_cfg *cfg) { struct irq_pin_list *entry; unsigned long flags; - struct irq_cfg *cfg = irq_cfg(irq); spin_lock_irqsave(&ioapic_lock, flags); entry = cfg->irq_2_pin; @@ -375,13 +548,32 @@ static void ioapic_mask_entry(int apic, } #ifdef CONFIG_SMP -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +static void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) { int apic, pin; - struct irq_cfg *cfg; struct irq_pin_list *entry; + u8 vector = cfg->vector; - cfg = irq_cfg(irq); entry = cfg->irq_2_pin; for (;;) { unsigned int reg; @@ -411,36 +603,61 @@ static void __target_IO_APIC_irq(unsigne } } -static int assign_irq_vector(int irq, cpumask_t mask); +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); + +/* + * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid + * of that, or returns BAD_APICID and leaves desc->affinity untouched. + */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned int irq; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; + + irq = desc->irq; + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) + return BAD_APICID; + + cpumask_and(&desc->affinity, cfg->domain, mask); + set_extra_move_desc(desc, mask); + return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); +} -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +static void +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; unsigned long flags; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; + unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; + irq = desc->irq; + cfg = desc->chip_data; - cfg = irq_cfg(irq); - if (assign_irq_vector(irq, mask)) - return; + spin_lock_irqsave(&ioapic_lock, flags); + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. 
*/ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + } + spin_unlock_irqrestore(&ioapic_lock, flags); +} - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); +static void +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc; desc = irq_to_desc(irq); - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - desc->affinity = mask; - spin_unlock_irqrestore(&ioapic_lock, flags); + + set_ioapic_affinity_irq_desc(desc, mask); } #endif /* CONFIG_SMP */ @@ -449,16 +666,18 @@ static void set_ioapic_affinity_irq(unsi * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) +static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) { - struct irq_cfg *cfg; struct irq_pin_list *entry; - /* first time to refer irq_cfg, so with new */ - cfg = irq_cfg_alloc(irq); entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(); + entry = get_one_free_irq_2_pin(cpu); + if (!entry) { + printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", + apic, pin); + return; + } cfg->irq_2_pin = entry; entry->apic = apic; entry->pin = pin; @@ -473,7 +692,7 @@ static void add_pin_to_irq(unsigned int entry = entry->next; } - entry->next = get_one_free_irq_2_pin(); + entry->next = get_one_free_irq_2_pin(cpu); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -482,11 +701,10 @@ static void add_pin_to_irq(unsigned int /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq(unsigned int irq, +static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, int oldapic, int oldpin, int newapic, int newpin) { - struct irq_cfg *cfg = irq_cfg(irq); struct irq_pin_list *entry = cfg->irq_2_pin; int replaced = 0; @@ -503,18 +721,16 @@ static void __init replace_pin_at_irq(un /* why? call replace before add? 
*/ if (!replaced) - add_pin_to_irq(irq, newapic, newpin); + add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); } -static inline void io_apic_modify_irq(unsigned int irq, +static inline void io_apic_modify_irq(struct irq_cfg *cfg, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { int pin; - struct irq_cfg *cfg; struct irq_pin_list *entry; - cfg = irq_cfg(irq); for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { unsigned int reg; pin = entry->pin; @@ -527,13 +743,13 @@ static inline void io_apic_modify_irq(un } } -static void __unmask_IO_APIC_irq(unsigned int irq) +static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } #ifdef CONFIG_X86_64 -void io_apic_sync(struct irq_pin_list *entry) +static void io_apic_sync(struct irq_pin_list *entry) { /* * Synchronize the IO-APIC and the CPU by doing @@ -544,47 +760,64 @@ void io_apic_sync(struct irq_pin_list *e readl(&io_apic->data); } -static void __mask_IO_APIC_irq(unsigned int irq) +static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } #else /* CONFIG_X86_32 */ -static void __mask_IO_APIC_irq(unsigned int irq) +static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); } -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, IO_APIC_REDIR_MASKED, NULL); } -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } #endif /* CONFIG_X86_32 */ -static void mask_IO_APIC_irq (unsigned int irq) +static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { + struct irq_cfg *cfg = desc->chip_data; unsigned long flags; + BUG_ON(!cfg); + spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); + __mask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_IO_APIC_irq (unsigned int irq) +static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) { + struct irq_cfg *cfg = desc->chip_data; unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); + __unmask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } +static void mask_IO_APIC_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + mask_IO_APIC_irq_desc(desc); +} +static void unmask_IO_APIC_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + unmask_IO_APIC_irq_desc(desc); +} + static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -624,6 +857,8 @@ void send_IPI_self(int vector) apic_write(APIC_ICR, cfg); } #endif /* !CONFIG_SMP && CONFIG_X86_32*/ +#else +#define add_pin_to_irq_cpu(cfg, cpu, apic, pin) #endif /* CONFIG_XEN */ #ifdef CONFIG_X86_32 @@ -864,7 +1099,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector */ static int EISA_ELCR(unsigned int irq) { - if (irq < 16) { + if (irq < NR_IRQS_LEGACY) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> 
(irq & 7)) & 1; } @@ -1089,52 +1324,118 @@ void unlock_vector_lock(void) { spin_unlock(&vector_lock); } -#endif -static int assign_irq_vector(int irq, cpumask_t mask) +static int +__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { - struct physdev_irq irq_op; - struct irq_cfg *cfg; - - if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS) - return -EINVAL; + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + unsigned int old_vector; + int cpu, err; + cpumask_var_t tmp_mask; + + if ((cfg->move_in_progress) || cfg->move_cleanup_count) + return -EBUSY; + + if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) + return -ENOMEM; + + old_vector = cfg->vector; + if (old_vector) { + cpumask_and(tmp_mask, mask, cpu_online_mask); + cpumask_and(tmp_mask, cfg->domain, tmp_mask); + if (!cpumask_empty(tmp_mask)) { + free_cpumask_var(tmp_mask); + return 0; + } + } - cfg = irq_cfg(irq); + /* Only try and allocate irqs on cpus that are present */ + err = -ENOSPC; + for_each_cpu_and(cpu, mask, cpu_online_mask) { + int new_cpu; + int vector, offset; + + vector_allocation_domain(cpu, tmp_mask); + + vector = current_vector; + offset = current_offset; +next: + vector += 8; + if (vector >= first_system_vector) { + /* If out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; + vector = FIRST_DEVICE_VECTOR + offset; + } + if (unlikely(current_vector == vector)) + continue; - if (cfg->vector) - return 0; + if (test_bit(vector, used_vectors)) + goto next; - irq_op.irq = irq; - if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) - return -ENOSPC; +#ifdef CONFIG_KDB + if (vector == KDBENTER_VECTOR) + goto next; +#endif /* CONFIG_KDB */ + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + if (per_cpu(vector_irq, new_cpu)[vector] != -1) + goto next; + /* Found one! 
*/ + current_vector = vector; + current_offset = offset; + if (old_vector) { + cfg->move_in_progress = 1; + cpumask_copy(cfg->old_domain, cfg->domain); + } + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + per_cpu(vector_irq, new_cpu)[vector] = irq; + cfg->vector = vector; + cpumask_copy(cfg->domain, tmp_mask); + err = 0; + break; + } + free_cpumask_var(tmp_mask); + return err; +} - cfg->vector = irq_op.vector; +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +{ + int err; + unsigned long flags; - return 0; + spin_lock_irqsave(&vector_lock, flags); + err = __assign_irq_vector(irq, cfg, mask); + spin_unlock_irqrestore(&vector_lock, flags); + return err; } -#ifndef CONFIG_XEN -static void __clear_irq_vector(int irq) +static void __clear_irq_vector(int irq, struct irq_cfg *cfg) { - struct irq_cfg *cfg; - cpumask_t mask; int cpu, vector; - cfg = irq_cfg(irq); BUG_ON(!cfg->vector); vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; - cpus_clear(cfg->domain); + cpumask_clear(cfg->domain); if (likely(!cfg->move_in_progress)) return; - cpus_and(mask, cfg->old_domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@ -1152,10 +1453,12 @@ void __setup_vector_irq(int cpu) /* This function must be called with vector_lock held */ int irq, vector; struct irq_cfg *cfg; + struct irq_desc *desc; /* Mark the inuse vectors */ - for_each_irq_cfg(irq, cfg) { - if (!cpu_isset(cpu, cfg->domain)) + for_each_irq_desc(irq, desc) { + cfg = desc->chip_data; + if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; per_cpu(vector_irq, cpu)[vector] = irq; @@ -1167,7 +1470,7 @@ void __setup_vector_irq(int cpu) continue; cfg = irq_cfg(irq); - if (!cpu_isset(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } } @@ -1205,11 +1508,8 @@ static inline int IO_APIC_irq_trigger(in } #endif -static void ioapic_register_intr(int irq, unsigned long trigger) +static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) @@ -1240,8 +1540,8 @@ static void ioapic_register_intr(int irq handle_edge_irq, "edge"); } #else /* !CONFIG_XEN */ -#define __clear_irq_vector(irq) ((void)(irq)) -#define ioapic_register_intr(irq, trigger) evtchn_register_pirq(irq) +#define __clear_irq_vector(irq, cfg) ((void)0) +#define ioapic_register_intr(irq, desc, trigger) evtchn_register_pirq(irq) #endif static int setup_ioapic_entry(int apic, int irq, @@ -1305,24 +1605,25 @@ static int setup_ioapic_entry(int apic, return 0; } -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, +static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc, int trigger, int polarity) { struct irq_cfg *cfg; struct IO_APIC_route_entry entry; - cpumask_t mask; + unsigned int dest; if (!IO_APIC_IRQ(irq)) return; - cfg = irq_cfg(irq); + cfg = desc->chip_data; - mask = TARGET_CPUS; - if (assign_irq_vector(irq, mask)) + if (assign_irq_vector(irq, cfg, TARGET_CPUS)) return; #ifndef CONFIG_XEN - cpus_and(mask, cfg->domain, mask); + 
dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); +#else + dest = cpu_mask_to_apicid(TARGET_CPUS); #endif apic_printk(APIC_VERBOSE,KERN_DEBUG @@ -1333,16 +1634,15 @@ static void setup_IO_APIC_irq(int apic, if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, - cpu_mask_to_apicid(mask), trigger, polarity, - cfg->vector)) { + dest, trigger, polarity, cfg->vector)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", mp_ioapics[apic].mp_apicid, pin); - __clear_irq_vector(irq); + __clear_irq_vector(irq, cfg); return; } - ioapic_register_intr(irq, trigger); - if (irq < 16) + ioapic_register_intr(irq, desc, trigger); + if (irq < NR_IRQS_LEGACY) disable_8259A_irq(irq); ioapic_write_entry(apic, pin, entry); @@ -1352,6 +1652,9 @@ static void __init setup_IO_APIC_irqs(vo { int apic, pin, idx, irq; int notcon = 0; + struct irq_desc *desc; + struct irq_cfg *cfg; + int cpu = boot_cpu_id; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1386,9 +1689,15 @@ static void __init setup_IO_APIC_irqs(vo if (multi_timer_check(apic, irq)) continue; #endif - add_pin_to_irq(irq, apic, pin); + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; + } + cfg = desc->chip_data; + add_pin_to_irq_cpu(cfg, cpu, apic, pin); - setup_IO_APIC_irq(apic, pin, irq, + setup_IO_APIC_irq(apic, pin, irq, desc, irq_trigger(idx), irq_polarity(idx)); } } @@ -1448,6 +1757,7 @@ __apicdebuginit(void) print_IO_APIC(void union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; + struct irq_desc *desc; unsigned int irq; if (apic_verbosity == APIC_QUIET) @@ -1537,8 +1847,11 @@ __apicdebuginit(void) print_IO_APIC(void } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_cfg(irq, cfg) { - struct irq_pin_list *entry = cfg->irq_2_pin; + for_each_irq_desc(irq, desc) { + struct irq_pin_list *entry; + + cfg = desc->chip_data; + entry = cfg->irq_2_pin; if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", irq); @@ -2030,14 +2343,16 @@ static unsigned int startup_ioapic_irq(u { int was_pending = 0; unsigned long flags; + struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { + if (irq < NR_IRQS_LEGACY) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) was_pending = 1; } - __unmask_IO_APIC_irq(irq); + cfg = irq_cfg(irq); + __unmask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -2051,7 +2366,7 @@ static int ioapic_retrigger_irq(unsigned unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); + send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -2100,35 +2415,35 @@ static DECLARE_DELAYED_WORK(ir_migration * as simple as edge triggered migration and we can do the irq migration * with a simple atomic update to IO-APIC RTE. 
*/ -static void migrate_ioapic_irq(int irq, cpumask_t mask) +static void +migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; - struct irq_desc *desc; - cpumask_t tmp, cleanup_mask; struct irte irte; int modify_ioapic_rte; unsigned int dest; unsigned long flags; + unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + if (!cpumask_intersects(mask, cpu_online_mask)) return; + irq = desc->irq; if (get_irte(irq, &irte)) return; - if (assign_irq_vector(irq, mask)) + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) return; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + set_extra_move_desc(desc, mask); + + dest = cpu_mask_to_apicid_and(cfg->domain, mask); - desc = irq_to_desc(irq); modify_ioapic_rte = desc->status & IRQ_LEVEL; if (modify_ioapic_rte) { spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); + __target_IO_APIC_irq(irq, dest, cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -2140,24 +2455,20 @@ static void migrate_ioapic_irq(int irq, */ modify_irte(irq, &irte); - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (cfg->move_in_progress) + send_cleanup_vector(cfg); - desc->affinity = mask; + cpumask_copy(&desc->affinity, mask); } -static int migrate_irq_remapped_level(int irq) +static int migrate_irq_remapped_level_desc(struct irq_desc *desc) { int ret = -1; - struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; - mask_IO_APIC_irq(irq); + mask_IO_APIC_irq_desc(desc); - if (io_apic_level_ack_pending(irq)) { + if (io_apic_level_ack_pending(cfg)) { /* * Interrupt in progress. Migrating irq now will change the * vector information in the IO-APIC RTE and that will confuse @@ -2169,14 +2480,15 @@ static int migrate_irq_remapped_level(in } /* everything is clear. we have right of way */ - migrate_ioapic_irq(irq, desc->pending_mask); + migrate_ioapic_irq_desc(desc, &desc->pending_mask); ret = 0; desc->status &= ~IRQ_MOVE_PENDING; - cpus_clear(desc->pending_mask); + cpumask_clear(&desc->pending_mask); unmask: - unmask_IO_APIC_irq(irq); + unmask_IO_APIC_irq_desc(desc); + return ret; } @@ -2197,7 +2509,7 @@ static void ir_irq_migration(struct work continue; } - desc->chip->set_affinity(irq, desc->pending_mask); + desc->chip->set_affinity(irq, &desc->pending_mask); spin_unlock_irqrestore(&desc->lock, flags); } } @@ -2206,28 +2518,33 @@ static void ir_irq_migration(struct work /* * Migrates the IRQ destination in the process context.
*/ -static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, + const struct cpumask *mask) { - struct irq_desc *desc = irq_to_desc(irq); - if (desc->status & IRQ_LEVEL) { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; - migrate_irq_remapped_level(irq); + cpumask_copy(&desc->pending_mask, mask); + migrate_irq_remapped_level_desc(desc); return; } - migrate_ioapic_irq(irq, mask); + migrate_ioapic_irq_desc(desc, mask); +} +static void set_ir_ioapic_affinity_irq(unsigned int irq, + const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + + set_ir_ioapic_affinity_irq_desc(desc, mask); } #endif asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; + ack_APIC_irq(); -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); me = smp_processor_id(); @@ -2237,6 +2554,9 @@ asmlinkage void smp_irq_move_cleanup_int struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; + if (irq == -1) + continue; + desc = irq_to_desc(irq); if (!desc) continue; @@ -2246,7 +2566,7 @@ asmlinkage void smp_irq_move_cleanup_int if (!cfg->move_cleanup_count) goto unlock; - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; __get_cpu_var(vector_irq)[vector] = -1; @@ -2258,28 +2578,45 @@ unlock: irq_exit(); } -static void irq_complete_move(unsigned int irq) +static void irq_complete_move(struct irq_desc **descp) { - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_desc *desc = *descp; + struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) + if (likely(!cfg->move_in_progress)) { +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + if (likely(!cfg->move_desc_pending)) + return; + + /* domain has not changed, but affinity did */ + me = smp_processor_id(); + if (cpu_isset(me, desc->affinity)) { + *descp = desc = move_irq_desc(desc, me); + /* get the new one */ + cfg = desc->chip_data; + cfg->move_desc_pending = 0; + } +#endif return; + } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + *descp = desc = move_irq_desc(desc, me); + /* get the new one */ + cfg = desc->chip_data; +#endif + send_cleanup_vector(cfg); } } #else -static inline void irq_complete_move(unsigned int irq) {} +static inline void irq_complete_move(struct irq_desc **descp) {} #endif + #ifdef CONFIG_INTR_REMAP static void ack_x2apic_level(unsigned int irq) { @@ -2290,11 +2627,14 @@ static void ack_x2apic_edge(unsigned int { ack_x2APIC_irq(); } + #endif static void ack_apic_edge(unsigned int irq) { - irq_complete_move(irq); + struct irq_desc *desc = irq_to_desc(irq); + + irq_complete_move(&desc); move_native_irq(irq); ack_APIC_irq(); } @@ -2303,18 +2643,21 @@ atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { + struct irq_desc *desc = irq_to_desc(irq); + #ifdef CONFIG_X86_32 unsigned long v; int i; #endif + struct irq_cfg *cfg; int do_unmask_irq = 0; - irq_complete_move(irq); + irq_complete_move(&desc); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if 
(unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { + if (unlikely(desc->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; - mask_IO_APIC_irq(irq); + mask_IO_APIC_irq_desc(desc); } #endif @@ -2338,7 +2681,8 @@ static void ack_apic_level(unsigned int * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro */ - i = irq_cfg(irq)->vector; + cfg = desc->chip_data; + i = cfg->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); #endif @@ -2377,17 +2721,18 @@ static void ack_apic_level(unsigned int * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. */ - if (!io_apic_level_ack_pending(irq)) + cfg = desc->chip_data; + if (!io_apic_level_ack_pending(cfg)) move_masked_irq(irq); - unmask_IO_APIC_irq(irq); + unmask_IO_APIC_irq_desc(desc); } #ifdef CONFIG_X86_32 if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + __mask_and_edge_IO_APIC_irq(cfg); + __unmask_and_level_IO_APIC_irq(cfg); spin_unlock(&ioapic_lock); } #endif @@ -2439,24 +2784,23 @@ static inline void init_IO_APIC_traps(vo * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for_each_irq_cfg(irq, cfg) { + for_each_irq_desc(irq, desc) { #ifdef CONFIG_XEN if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) continue; #endif - if (IO_APIC_IRQ(irq) && !cfg->vector) { + cfg = desc->chip_data; + if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < 16) + if (irq < NR_IRQS_LEGACY) make_8259A_irq(irq); - else { - desc = irq_to_desc(irq); + else /* Strange. Oh, well.. */ desc->chip = &no_irq_chip; - } } } } @@ -2482,7 +2826,7 @@ static void unmask_lapic_irq(unsigned in apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } -static void ack_lapic_irq (unsigned int irq) +static void ack_lapic_irq(unsigned int irq) { ack_APIC_irq(); } @@ -2494,11 +2838,8 @@ static struct irq_chip lapic_chip __read .ack = ack_lapic_irq, }; -static void lapic_register_intr(int irq) +static void lapic_register_intr(int irq, struct irq_desc *desc) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); @@ -2602,7 +2943,9 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg(0); + struct irq_desc *desc = irq_to_desc(0); + struct irq_cfg *cfg = desc->chip_data; + int cpu = boot_cpu_id; int apic1, pin1, apic2, pin2; unsigned long flags; unsigned int ver; @@ -2617,7 +2960,7 @@ static inline void __init check_timer(vo * get/set the timer IRQ vector: */ disable_8259A_irq(0); - assign_irq_vector(0, TARGET_CPUS); + assign_irq_vector(0, cfg, TARGET_CPUS); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2668,10 +3011,10 @@ static inline void __init check_timer(vo * Ok, does IRQ0 through the IOAPIC work? 
*/ if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); + add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } - unmask_IO_APIC_irq(0); + unmask_IO_APIC_irq_desc(desc); if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); @@ -2697,9 +3040,9 @@ static inline void __init check_timer(vo /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); + replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - unmask_IO_APIC_irq(0); + unmask_IO_APIC_irq_desc(desc); enable_8259A_irq(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -2731,7 +3074,7 @@ static inline void __init check_timer(vo apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); - lapic_register_intr(0); + lapic_register_intr(0, desc); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); @@ -2930,22 +3273,26 @@ unsigned int create_irq_nr(unsigned int unsigned int irq; unsigned int new; unsigned long flags; - struct irq_cfg *cfg_new; - - irq_want = nr_irqs - 1; + struct irq_cfg *cfg_new = NULL; + int cpu = boot_cpu_id; + struct irq_desc *desc_new = NULL; irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new > 0; new--) { + for (new = irq_want; new < NR_IRQS; new++) { if (platform_legacy_irq(new)) continue; - cfg_new = irq_cfg(new); - if (cfg_new && cfg_new->vector != 0) + + desc_new = irq_to_desc_alloc_cpu(new, cpu); + if (!desc_new) { + printk(KERN_INFO "can not get irq_desc for %d\n", new); + continue; + } + cfg_new = desc_new->chip_data; + + if (cfg_new->vector != 0) continue; - /* check if need to create one */ - if (!cfg_new) - cfg_new = irq_cfg_alloc(new); - if (__assign_irq_vector(new, TARGET_CPUS) == 0) + if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0) irq = new; break; } @@ -2953,15 +3300,21 @@ unsigned int create_irq_nr(unsigned int if (irq > 0) { dynamic_irq_init(irq); + /* restore it, in case dynamic_irq_init clear it */ + if (desc_new) + desc_new->chip_data = cfg_new; } return irq; } +static int nr_irqs_gsi = NR_IRQS_LEGACY; int create_irq(void) { + unsigned int irq_want; int irq; - irq = create_irq_nr(nr_irqs - 1); + irq_want = nr_irqs_gsi; + irq = create_irq_nr(irq_want); if (irq == 0) irq = -1; @@ -2972,14 +3325,22 @@ int create_irq(void) void destroy_irq(unsigned int irq) { unsigned long flags; + struct irq_cfg *cfg; + struct irq_desc *desc; + /* store it, in case dynamic_irq_cleanup clear it */ + desc = irq_to_desc(irq); + cfg = desc->chip_data; dynamic_irq_cleanup(irq); + /* connect back irq_cfg */ + if (desc) + desc->chip_data = cfg; #ifdef CONFIG_INTR_REMAP free_irte(irq); #endif spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq); + __clear_irq_vector(irq, cfg); spin_unlock_irqrestore(&vector_lock, flags); } #endif /* CONFIG_XEN */ @@ -2993,16 +3354,13 @@ static int msi_compose_msg(struct pci_de struct irq_cfg *cfg; int err; unsigned dest; - cpumask_t tmp; - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (err) return err; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { @@ -3056,64 +3414,48 @@ static int msi_compose_msg(struct pci_de } #ifdef CONFIG_SMP -static void 
set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; - read_msi_msg(irq, &msg); + read_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - write_msi_msg(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; + write_msi_msg_desc(desc, &msg); } - #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. */ -static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +static void +ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { - struct irq_cfg *cfg; + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; unsigned int dest; - cpumask_t tmp, cleanup_mask; struct irte irte; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; if (get_irte(irq, &irte)) return; - if (assign_irq_vector(irq, mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3127,16 +3469,10 @@ static void ir_set_msi_irq_affinity(unsi * at the new destination. So, time to cleanup the previous * vector allocation. 
*/ - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc = irq_to_desc(irq); - desc->affinity = mask; + if (cfg->move_in_progress) + send_cleanup_vector(cfg); } + #endif #endif /* CONFIG_SMP */ @@ -3195,7 +3531,7 @@ static int msi_alloc_irte(struct pci_dev } #endif -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { int ret; struct msi_msg msg; @@ -3204,7 +3540,7 @@ static int setup_msi_irq(struct pci_dev if (ret < 0) return ret; - set_irq_msi(irq, desc); + set_irq_msi(irq, msidesc); write_msi_msg(irq, &msg); #ifdef CONFIG_INTR_REMAP @@ -3224,26 +3560,13 @@ static int setup_msi_irq(struct pci_dev return 0; } -static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) -{ - unsigned int irq; - - irq = dev->bus->number; - irq <<= 8; - irq |= dev->devfn; - irq <<= 12; - - return irq; -} - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc) { unsigned int irq; int ret; unsigned int irq_want; - irq_want = build_irq_for_pci_dev(dev) + 0x100; - + irq_want = nr_irqs_gsi; irq = create_irq_nr(irq_want); if (irq == 0) return -1; @@ -3257,7 +3580,7 @@ int arch_setup_msi_irq(struct pci_dev *d goto error; no_ir: #endif - ret = setup_msi_irq(dev, desc, irq); + ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) { destroy_irq(irq); return ret; @@ -3275,7 +3598,7 @@ int arch_setup_msi_irqs(struct pci_dev * { unsigned int irq; int ret, sub_handle; - struct msi_desc *desc; + struct msi_desc *msidesc; unsigned int irq_want; #ifdef CONFIG_INTR_REMAP @@ -3283,10 +3606,11 @@ int arch_setup_msi_irqs(struct pci_dev * int index = 0; #endif - irq_want = build_irq_for_pci_dev(dev) + 0x100; + irq_want = nr_irqs_gsi; sub_handle = 0; - list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want--); + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want); + irq_want++; if (irq == 0) return -1; #ifdef CONFIG_INTR_REMAP @@ -3318,7 +3642,7 @@ int arch_setup_msi_irqs(struct pci_dev * } no_ir: #endif - ret = setup_msi_irq(dev, desc, irq); + ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) goto error; sub_handle++; @@ -3337,24 +3661,18 @@ void arch_teardown_msi_irq(unsigned int #ifdef CONFIG_DMAR #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) +static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; dmar_msi_read(irq, &msg); @@ -3364,9 +3682,8 @@ static void dmar_msi_set_affinity(unsign msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif /* CONFIG_SMP */ struct irq_chip dmar_msi_type = { @@ -3398,24 +3715,18 @@ int arch_setup_dmar_msi(unsigned int irq #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static void 
hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) +static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; - struct irq_desc *desc; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; hpet_msi_read(irq, &msg); @@ -3425,9 +3736,8 @@ static void hpet_msi_set_affinity(unsign msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif /* CONFIG_SMP */ struct irq_chip hpet_msi_type = { @@ -3480,28 +3790,21 @@ static void target_ht_irq(unsigned int i write_ht_irq_msg(irq, &msg); } -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) +static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; target_ht_irq(irq, dest, cfg->vector); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif static struct irq_chip ht_irq_chip = { @@ -3519,17 +3822,14 @@ int arch_setup_ht_irq(unsigned int irq, { struct irq_cfg *cfg; int err; - cpumask_t tmp; - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (!err) { struct ht_irq_msg msg; unsigned dest; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); @@ -3565,7 +3865,7 @@ int arch_setup_ht_irq(unsigned int irq, int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset) { - const cpumask_t *eligible_cpu = get_cpu_mask(cpu); + const struct cpumask *eligible_cpu = cpumask_of(cpu); struct irq_cfg *cfg; int mmr_pnode; unsigned long mmr_value; @@ -3573,7 +3873,9 @@ int arch_enable_uv_irq(char *irq_name, u unsigned long flags; int err; - err = assign_irq_vector(irq, *eligible_cpu); + cfg = irq_cfg(irq); + + err = assign_irq_vector(irq, cfg, eligible_cpu); if (err != 0) return err; @@ -3582,8 +3884,6 @@ int arch_enable_uv_irq(char *irq_name, u irq_name); spin_unlock_irqrestore(&vector_lock, flags); - cfg = irq_cfg(irq); - mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); @@ -3594,7 +3894,7 @@ int arch_enable_uv_irq(char *irq_name, u entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = cpu_mask_to_apicid(*eligible_cpu); + entry->dest = cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -3635,10 +3935,29 @@ int __init io_apic_get_redir_entries (in return reg_01.bits.entries; } -int __init probe_nr_irqs(void) +#ifndef CONFIG_XEN +void __init probe_nr_irqs_gsi(void) { - return NR_IRQS; + int 
nr = 0; + + nr = acpi_probe_gsi(); + if (nr > nr_irqs_gsi) { + nr_irqs_gsi = nr; + } else { + /* for acpi=off or acpi is not compiled in */ + int idx; + + nr = 0; + for (idx = 0; idx < nr_ioapics; idx++) + nr += io_apic_get_redir_entries(idx) + 1; + + if (nr > nr_irqs_gsi) + nr_irqs_gsi = nr; + } + + printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } +#endif /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration @@ -3738,6 +4057,10 @@ int __init io_apic_get_version(int ioapi int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) { + struct irq_desc *desc; + struct irq_cfg *cfg; + int cpu = boot_cpu_id; + #ifdef CONFIG_XEN if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n", @@ -3752,13 +4075,21 @@ int io_apic_set_pci_routing (int ioapic, return -EINVAL; } + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc %d\n", irq); + return 0; + } + /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); + if (irq >= NR_IRQS_LEGACY) { + cfg = desc->chip_data; + add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); + } - setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); + setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); return 0; } @@ -3797,7 +4128,7 @@ void __init setup_ioapic_dest(void) int pin, ioapic, irq, irq_entry; struct irq_desc *desc; struct irq_cfg *cfg; - cpumask_t mask; + const struct cpumask *mask; if (skip_ioapic_setup == 1) return; @@ -3813,9 +4144,10 @@ void __init setup_ioapic_dest(void) * when you have too many devices, because at that time only boot * cpu is online. */ - cfg = irq_cfg(irq); + desc = irq_to_desc(irq); + cfg = desc->chip_data; if (!cfg->vector) { - setup_IO_APIC_irq(ioapic, pin, irq, + setup_IO_APIC_irq(ioapic, pin, irq, desc, irq_trigger(irq_entry), irq_polarity(irq_entry)); continue; @@ -3825,19 +4157,18 @@ void __init setup_ioapic_dest(void) /* * Honour affinities which have been set in early boot */ - desc = irq_to_desc(irq); if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; + mask = &desc->affinity; else mask = TARGET_CPUS; #ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq(irq, mask); + set_ir_ioapic_affinity_irq_desc(desc, mask); else #endif - set_ioapic_affinity_irq(irq, mask); + set_ioapic_affinity_irq_desc(desc, mask); } } @@ -3886,7 +4217,6 @@ void __init ioapic_init_mappings(void) struct resource *ioapic_res; int i; - irq_2_pin_init(); ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { --- head-2010-04-29.orig/arch/x86/kernel/ioport-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/ioport-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -36,7 +36,7 @@ static void set_bitmap(unsigned long *bi */ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) { - struct thread_struct * t = &current->thread; + struct thread_struct *t = &current->thread; struct physdev_set_iobitmap set_iobitmap; if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) --- head-2010-04-29.orig/arch/x86/kernel/apic/ipi-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/apic/ipi-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -150,31 +150,28 @@ static inline void __send_IPI_dest_field /* * This is only used on smaller machines.
*/ -void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector) { #ifndef CONFIG_XEN - unsigned long mask = cpus_addr(cpumask)[0]; + unsigned long mask = cpumask_bits(cpumask)[0]; #else - cpumask_t mask; unsigned int cpu; #endif unsigned long flags; local_irq_save(flags); #ifndef CONFIG_XEN - WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); + WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); __send_IPI_dest_field(mask, vector); #else - cpus_andnot(mask, cpumask, cpu_online_map); - WARN_ON(!cpus_empty(mask)); - for_each_online_cpu(cpu) - if (cpu_isset(cpu, cpumask)) - __send_IPI_one(cpu, vector); + WARN_ON(!cpumask_subset(cpumask, cpu_online_mask)); + for_each_cpu_and(cpu, cpumask, cpu_online_mask) + __send_IPI_one(cpu, vector); #endif local_irq_restore(flags); } -void send_IPI_mask_sequence(cpumask_t mask, int vector) +void send_IPI_mask_sequence(const struct cpumask *mask, int vector) { #ifndef CONFIG_XEN unsigned long flags; @@ -187,18 +184,37 @@ void send_IPI_mask_sequence(cpumask_t ma */ local_irq_save(flags); - for_each_possible_cpu(query_cpu) { - if (cpu_isset(query_cpu, mask)) { - __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), - vector); - } - } + for_each_cpu(query_cpu, mask) + __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); local_irq_restore(flags); #else send_IPI_mask_bitmask(mask, vector); #endif } +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned long flags; + unsigned int query_cpu; + unsigned int this_cpu = smp_processor_id(); + + /* See Hack comment above */ + + local_irq_save(flags); +#ifndef CONFIG_XEN + for_each_cpu(query_cpu, mask) + if (query_cpu != this_cpu) + __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), + vector); +#else + WARN_ON(!cpumask_subset(mask, cpu_online_mask)); + for_each_cpu_and(query_cpu, mask, cpu_online_mask) + if (query_cpu != this_cpu) + __send_IPI_one(query_cpu, vector); +#endif + local_irq_restore(flags); +} + #ifndef CONFIG_XEN /* must come after the send_IPI functions above for inlining */ static int convert_apicid_to_cpu(int apic_id) --- head-2010-04-29.orig/arch/x86/kernel/irq-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/irq-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -5,10 +5,11 @@ #include #include #include +#include #include #include -#include +#include atomic_t irq_err_count; @@ -43,57 +44,57 @@ void ack_bad_irq(unsigned int irq) /* * /proc/interrupts printing: */ -static int show_other_interrupts(struct seq_file *p) +static int show_other_interrupts(struct seq_file *p, int prec) { int j; - seq_printf(p, "NMI: "); + seq_printf(p, "%*s: ", prec, "NMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); seq_printf(p, " Non-maskable interrupts\n"); #ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); + seq_printf(p, "%*s: ", prec, "LOC"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); #endif #ifdef CONFIG_SMP - seq_printf(p, "RES: "); + seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); + seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); seq_printf(p, " Function call interrupts\n"); #ifndef CONFIG_XEN - seq_printf(p, "TLB: "); + seq_printf(p, "%*s: ", prec, "TLB"); 
for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif #endif #ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); + seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); # ifdef CONFIG_X86_64 - seq_printf(p, "THR: "); + seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif #ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "SPU: "); + seq_printf(p, "%*s: ", prec, "SPU"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); #endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); + seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); #endif return 0; } @@ -101,25 +102,31 @@ static int show_other_interrupts(struct int show_interrupts(struct seq_file *p, void *v) { unsigned long flags, any_count = 0; - int i = *(loff_t *) v, j; + int i = *(loff_t *) v, j, prec; struct irqaction *action; struct irq_desc *desc; if (i > nr_irqs) return 0; + for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) + j *= 10; + if (i == nr_irqs) - return show_other_interrupts(p); + return show_other_interrupts(p, prec); /* print header */ if (i == 0) { - seq_printf(p, " "); + seq_printf(p, "%*s", prec + 8, ""); for_each_online_cpu(j) seq_printf(p, "CPU%-8d", j); seq_putc(p, '\n'); } desc = irq_to_desc(i); + if (!desc) + return 0; + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP any_count = kstat_irqs(i); @@ -131,7 +138,7 @@ int show_interrupts(struct seq_file *p, if (!action && !any_count) goto out; - seq_printf(p, "%3d: ", i); + seq_printf(p, "%*d: ", prec, i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else --- head-2010-04-29.orig/arch/x86/kernel/ldt-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/ldt-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -12,8 +12,8 @@ #include #include #include +#include -#include #include #include #include --- head-2010-04-29.orig/arch/x86/kernel/machine_kexec_32.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/machine_kexec_32.c 2010-03-24 15:17:58.000000000 +0100 @@ -123,13 +123,7 @@ void machine_kexec_setup_load_arg(xen_ke memcpy(control_page, relocate_kernel, PAGE_SIZE); xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); - xki->page_list[PA_PGD] = __ma(kexec_pgd); -#ifdef CONFIG_X86_PAE - xki->page_list[PA_PMD_0] = __ma(kexec_pmd0); - xki->page_list[PA_PMD_1] = __ma(kexec_pmd1); -#endif - xki->page_list[PA_PTE_0] = __ma(kexec_pte0); - xki->page_list[PA_PTE_1] = __ma(kexec_pte1); + xki->page_list[PA_PGD] = __ma(image->arch.pgd); if (image->type == KEXEC_TYPE_DEFAULT) xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page); --- head-2010-04-29.orig/arch/x86/kernel/mpparse-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/mpparse-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -2,7 +2,7 @@ * Intel Multiprocessor Specification 1.1 and 1.4 * compliant MP-table parsing routines. 
* - * (c) 1995 Alan Cox, Building #3 + * (c) 1995 Alan Cox, Building #3 * (c) 1998, 1999, 2000 Ingo Molnar * (c) 2008 Alexey Starikovskiy */ @@ -16,18 +16,18 @@ #include #include #include +#include -#include #include #include #include #include #include -#include #include #include #include #include +#include #include #ifdef CONFIG_X86_32 @@ -54,13 +54,13 @@ static int __init mpf_checksum(unsigned return sum & 0xFF; } -static void __init MP_processor_info(struct mpc_config_processor *m) +static void __init MP_processor_info(struct mpc_cpu *m) { #ifndef CONFIG_XEN int apicid; char *bootup_cpu = ""; - if (!(m->mpc_cpuflag & CPU_ENABLED)) { + if (!(m->cpuflag & CPU_ENABLED)) { disabled_cpus++; return; } @@ -68,57 +68,57 @@ static void __init MP_processor_info(str if (x86_quirks->mpc_apic_id) apicid = x86_quirks->mpc_apic_id(m); else - apicid = m->mpc_apicid; + apicid = m->apicid; - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + if (m->cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; - boot_cpu_physical_apicid = m->mpc_apicid; + boot_cpu_physical_apicid = m->apicid; } - printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); - generic_processor_info(apicid, m->mpc_apicver); + printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu); + generic_processor_info(apicid, m->apicver); #else /* CONFIG_XEN */ num_processors++; #endif } #ifdef CONFIG_X86_IO_APIC -static void __init MP_bus_info(struct mpc_config_bus *m) +static void __init MP_bus_info(struct mpc_bus *m) { char str[7]; - memcpy(str, m->mpc_bustype, 6); + memcpy(str, m->bustype, 6); str[6] = 0; if (x86_quirks->mpc_oem_bus_info) x86_quirks->mpc_oem_bus_info(m, str); else - apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str); + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); #if MAX_MP_BUSSES < 256 - if (m->mpc_busid >= MAX_MP_BUSSES) { + if (m->busid >= MAX_MP_BUSSES) { printk(KERN_WARNING "MP table busid value (%d) for bustype %s " " is too large, max. 
supported is %d\n", - m->mpc_busid, str, MAX_MP_BUSSES - 1); + m->busid, str, MAX_MP_BUSSES - 1); return; } #endif if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { - set_bit(m->mpc_busid, mp_bus_not_pci); -#if defined(CONFIG_EISA) || defined (CONFIG_MCA) - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + set_bit(m->busid, mp_bus_not_pci); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { if (x86_quirks->mpc_oem_pci_bus) x86_quirks->mpc_oem_pci_bus(m); - clear_bit(m->mpc_busid, mp_bus_not_pci); -#if defined(CONFIG_EISA) || defined (CONFIG_MCA) - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + clear_bit(m->busid, mp_bus_not_pci); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + mp_bus_id_to_type[m->busid] = MP_BUS_PCI; } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + mp_bus_id_to_type[m->busid] = MP_BUS_EISA; } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; + mp_bus_id_to_type[m->busid] = MP_BUS_MCA; #endif } else printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); @@ -142,32 +142,31 @@ static int bad_ioapic(unsigned long addr return 0; } -static void __init MP_ioapic_info(struct mpc_config_ioapic *m) +static void __init MP_ioapic_info(struct mpc_ioapic *m) { - if (!(m->mpc_flags & MPC_APIC_USABLE)) + if (!(m->flags & MPC_APIC_USABLE)) return; printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + m->apicid, m->apicver, m->apicaddr); - if (bad_ioapic(m->mpc_apicaddr)) + if (bad_ioapic(m->apicaddr)) return; - mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr; - mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid; - mp_ioapics[nr_ioapics].mp_type = m->mpc_type; - mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver; - mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags; + mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr; + mp_ioapics[nr_ioapics].mp_apicid = m->apicid; + mp_ioapics[nr_ioapics].mp_type = m->type; + mp_ioapics[nr_ioapics].mp_apicver = m->apicver; + mp_ioapics[nr_ioapics].mp_flags = m->flags; nr_ioapics++; } -static void print_MP_intsrc_info(struct mpc_config_intsrc *m) +static void print_MP_intsrc_info(struct mpc_intsrc *m) { apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, - m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, + m->srcbusirq, m->dstapic, m->dstirq); } static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) @@ -179,52 +178,52 @@ static void __init print_mp_irq_info(str mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); } -static void __init assign_to_mp_irq(struct mpc_config_intsrc *m, +static void __init assign_to_mp_irq(struct mpc_intsrc *m, struct mp_config_intsrc *mp_irq) { - mp_irq->mp_dstapic = m->mpc_dstapic; - mp_irq->mp_type = m->mpc_type; - mp_irq->mp_irqtype = m->mpc_irqtype; - mp_irq->mp_irqflag = m->mpc_irqflag; - mp_irq->mp_srcbus = m->mpc_srcbus; - mp_irq->mp_srcbusirq = m->mpc_srcbusirq; - mp_irq->mp_dstirq = m->mpc_dstirq; + mp_irq->mp_dstapic = m->dstapic; + mp_irq->mp_type = m->type; + mp_irq->mp_irqtype = m->irqtype; + mp_irq->mp_irqflag = m->irqflag; + mp_irq->mp_srcbus = m->srcbus; + 
mp_irq->mp_srcbusirq = m->srcbusirq; + mp_irq->mp_dstirq = m->dstirq; } static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, - struct mpc_config_intsrc *m) + struct mpc_intsrc *m) { - m->mpc_dstapic = mp_irq->mp_dstapic; - m->mpc_type = mp_irq->mp_type; - m->mpc_irqtype = mp_irq->mp_irqtype; - m->mpc_irqflag = mp_irq->mp_irqflag; - m->mpc_srcbus = mp_irq->mp_srcbus; - m->mpc_srcbusirq = mp_irq->mp_srcbusirq; - m->mpc_dstirq = mp_irq->mp_dstirq; + m->dstapic = mp_irq->mp_dstapic; + m->type = mp_irq->mp_type; + m->irqtype = mp_irq->mp_irqtype; + m->irqflag = mp_irq->mp_irqflag; + m->srcbus = mp_irq->mp_srcbus; + m->srcbusirq = mp_irq->mp_srcbusirq; + m->dstirq = mp_irq->mp_dstirq; } static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, - struct mpc_config_intsrc *m) + struct mpc_intsrc *m) { - if (mp_irq->mp_dstapic != m->mpc_dstapic) + if (mp_irq->mp_dstapic != m->dstapic) return 1; - if (mp_irq->mp_type != m->mpc_type) + if (mp_irq->mp_type != m->type) return 2; - if (mp_irq->mp_irqtype != m->mpc_irqtype) + if (mp_irq->mp_irqtype != m->irqtype) return 3; - if (mp_irq->mp_irqflag != m->mpc_irqflag) + if (mp_irq->mp_irqflag != m->irqflag) return 4; - if (mp_irq->mp_srcbus != m->mpc_srcbus) + if (mp_irq->mp_srcbus != m->srcbus) return 5; - if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq) + if (mp_irq->mp_srcbusirq != m->srcbusirq) return 6; - if (mp_irq->mp_dstirq != m->mpc_dstirq) + if (mp_irq->mp_dstirq != m->dstirq) return 7; return 0; } -static void __init MP_intsrc_info(struct mpc_config_intsrc *m) +static void __init MP_intsrc_info(struct mpc_intsrc *m) { int i; @@ -242,57 +241,55 @@ static void __init MP_intsrc_info(struct #endif -static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) +static void __init MP_lintsrc_info(struct mpc_lintsrc *m) { apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC LINT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid, + m->srcbusirq, m->destapic, m->destapiclint); } /* * Read/parse the MPC */ -static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem, - char *str) +static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) { - if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) { + if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) { printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", - mpc->mpc_signature[0], mpc->mpc_signature[1], - mpc->mpc_signature[2], mpc->mpc_signature[3]); + mpc->signature[0], mpc->signature[1], + mpc->signature[2], mpc->signature[3]); return 0; } - if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) { + if (mpf_checksum((unsigned char *)mpc, mpc->length)) { printk(KERN_ERR "MPTABLE: checksum error!\n"); return 0; } - if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) { + if (mpc->spec != 0x01 && mpc->spec != 0x04) { printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", - mpc->mpc_spec); + mpc->spec); return 0; } - if (!mpc->mpc_lapic) { + if (!mpc->lapic) { printk(KERN_ERR "MPTABLE: null local APIC address!\n"); return 0; } - memcpy(oem, mpc->mpc_oem, 8); + memcpy(oem, mpc->oem, 8); oem[8] = 0; printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem); - memcpy(str, mpc->mpc_productid, 12); + memcpy(str, mpc->productid, 12); str[12] = 0; printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); - printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", 
mpc->mpc_lapic); + printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic); return 1; } -static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) +static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; char oem[10]; @@ -317,14 +314,14 @@ static int __init smp_read_mpc(struct mp #endif /* save the local APIC address, it might be non-default */ if (!acpi_lapic) - mp_lapic_addr = mpc->mpc_lapic; + mp_lapic_addr = mpc->lapic; if (early) return 1; - if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) { - struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr; - x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize); + if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { + struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; + x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize); } /* @@ -333,12 +330,11 @@ static int __init smp_read_mpc(struct mp if (x86_quirks->mpc_record) *x86_quirks->mpc_record = 0; - while (count < mpc->mpc_length) { + while (count < mpc->length) { switch (*mpt) { case MP_PROCESSOR: { - struct mpc_config_processor *m = - (struct mpc_config_processor *)mpt; + struct mpc_cpu *m = (struct mpc_cpu *)mpt; /* ACPI may have already provided this data */ if (!acpi_lapic) MP_processor_info(m); @@ -348,8 +344,7 @@ static int __init smp_read_mpc(struct mp } case MP_BUS: { - struct mpc_config_bus *m = - (struct mpc_config_bus *)mpt; + struct mpc_bus *m = (struct mpc_bus *)mpt; #ifdef CONFIG_X86_IO_APIC MP_bus_info(m); #endif @@ -360,30 +355,28 @@ static int __init smp_read_mpc(struct mp case MP_IOAPIC: { #ifdef CONFIG_X86_IO_APIC - struct mpc_config_ioapic *m = - (struct mpc_config_ioapic *)mpt; + struct mpc_ioapic *m = (struct mpc_ioapic *)mpt; MP_ioapic_info(m); #endif - mpt += sizeof(struct mpc_config_ioapic); - count += sizeof(struct mpc_config_ioapic); + mpt += sizeof(struct mpc_ioapic); + count += sizeof(struct mpc_ioapic); break; } case MP_INTSRC: { #ifdef CONFIG_X86_IO_APIC - struct mpc_config_intsrc *m = - (struct mpc_config_intsrc *)mpt; + struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; MP_intsrc_info(m); #endif - mpt += sizeof(struct mpc_config_intsrc); - count += sizeof(struct mpc_config_intsrc); + mpt += sizeof(struct mpc_intsrc); + count += sizeof(struct mpc_intsrc); break; } case MP_LINTSRC: { - struct mpc_config_lintsrc *m = - (struct mpc_config_lintsrc *)mpt; + struct mpc_lintsrc *m = + (struct mpc_lintsrc *)mpt; MP_lintsrc_info(m); mpt += sizeof(*m); count += sizeof(*m); @@ -394,8 +387,8 @@ static int __init smp_read_mpc(struct mp printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); printk(KERN_ERR "type %x\n", *mpt); print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, - 1, mpc, mpc->mpc_length, 1); - count = mpc->mpc_length; + 1, mpc, mpc->length, 1); + count = mpc->length; break; } if (x86_quirks->mpc_record) @@ -426,16 +419,16 @@ static int __init ELCR_trigger(unsigned static void __init construct_default_ioirq_mptable(int mpc_default_type) { - struct mpc_config_intsrc intsrc; + struct mpc_intsrc intsrc; int i; int ELCR_fallback = 0; - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* conforming */ - intsrc.mpc_srcbus = 0; - intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid; + intsrc.type = MP_INTSRC; + intsrc.irqflag = 0; /* conforming */ + intsrc.srcbus = 0; + intsrc.dstapic = mp_ioapics[0].mp_apicid; - intsrc.mpc_irqtype = mp_INT; + intsrc.irqtype = mp_INT; /* * If true, we have an ISA/PCI system with no IRQ entries @@ -478,30 +471,30 @@ 
static void __init construct_default_ioi * irqflag field (level sensitive, active high polarity). */ if (ELCR_trigger(i)) - intsrc.mpc_irqflag = 13; + intsrc.irqflag = 13; else - intsrc.mpc_irqflag = 0; + intsrc.irqflag = 0; } - intsrc.mpc_srcbusirq = i; - intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + intsrc.srcbusirq = i; + intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ MP_intsrc_info(&intsrc); } - intsrc.mpc_irqtype = mp_ExtINT; - intsrc.mpc_srcbusirq = 0; - intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ + intsrc.irqtype = mp_ExtINT; + intsrc.srcbusirq = 0; + intsrc.dstirq = 0; /* 8259A to INTIN0 */ MP_intsrc_info(&intsrc); } static void __init construct_ioapic_table(int mpc_default_type) { - struct mpc_config_ioapic ioapic; - struct mpc_config_bus bus; + struct mpc_ioapic ioapic; + struct mpc_bus bus; - bus.mpc_type = MP_BUS; - bus.mpc_busid = 0; + bus.type = MP_BUS; + bus.busid = 0; switch (mpc_default_type) { default: printk(KERN_ERR "???\nUnknown standard configuration %d\n", @@ -509,29 +502,29 @@ static void __init construct_ioapic_tabl /* fall through */ case 1: case 5: - memcpy(bus.mpc_bustype, "ISA ", 6); + memcpy(bus.bustype, "ISA ", 6); break; case 2: case 6: case 3: - memcpy(bus.mpc_bustype, "EISA ", 6); + memcpy(bus.bustype, "EISA ", 6); break; case 4: case 7: - memcpy(bus.mpc_bustype, "MCA ", 6); + memcpy(bus.bustype, "MCA ", 6); } MP_bus_info(&bus); if (mpc_default_type > 4) { - bus.mpc_busid = 1; - memcpy(bus.mpc_bustype, "PCI ", 6); + bus.busid = 1; + memcpy(bus.bustype, "PCI ", 6); MP_bus_info(&bus); } - ioapic.mpc_type = MP_IOAPIC; - ioapic.mpc_apicid = 2; - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; - ioapic.mpc_flags = MPC_APIC_USABLE; - ioapic.mpc_apicaddr = 0xFEC00000; + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = 0xFEC00000; MP_ioapic_info(&ioapic); /* @@ -545,8 +538,8 @@ static inline void __init construct_ioap static inline void __init construct_default_ISA_mptable(int mpc_default_type) { - struct mpc_config_processor processor; - struct mpc_config_lintsrc lintsrc; + struct mpc_cpu processor; + struct mpc_lintsrc lintsrc; int linttypes[2] = { mp_ExtINT, mp_NMI }; int i; @@ -558,30 +551,30 @@ static inline void __init construct_defa /* * 2 CPUs, numbered 0 & 1. */ - processor.mpc_type = MP_PROCESSOR; + processor.type = MP_PROCESSOR; /* Either an integrated APIC or a discrete 82489DX. */ - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; - processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + processor.apicver = mpc_default_type > 4 ? 
0x10 : 0x01; + processor.cpuflag = CPU_ENABLED; + processor.cpufeature = (boot_cpu_data.x86 << 8) | (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; + processor.featureflag = boot_cpu_data.x86_capability[0]; + processor.reserved[0] = 0; + processor.reserved[1] = 0; for (i = 0; i < 2; i++) { - processor.mpc_apicid = i; + processor.apicid = i; MP_processor_info(&processor); } construct_ioapic_table(mpc_default_type); - lintsrc.mpc_type = MP_LINTSRC; - lintsrc.mpc_irqflag = 0; /* conforming */ - lintsrc.mpc_srcbusid = 0; - lintsrc.mpc_srcbusirq = 0; - lintsrc.mpc_destapic = MP_APIC_ALL; + lintsrc.type = MP_LINTSRC; + lintsrc.irqflag = 0; /* conforming */ + lintsrc.srcbusid = 0; + lintsrc.srcbusirq = 0; + lintsrc.destapic = MP_APIC_ALL; for (i = 0; i < 2; i++) { - lintsrc.mpc_irqtype = linttypes[i]; - lintsrc.mpc_destapiclint = i; + lintsrc.irqtype = linttypes[i]; + lintsrc.destapiclint = i; MP_lintsrc_info(&lintsrc); } } @@ -595,26 +588,23 @@ static void __init __get_smp_config(unsi { struct intel_mp_floating *mpf = mpf_found; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } + if (!mpf) + return; + if (acpi_lapic && early) return; + /* - * ACPI supports both logical (e.g. Hyper-Threading) and physical - * processors, where MPS only supports physical. + * MPS doesn't support hyperthreading, aka only have + * thread 0 apic id in MPS table */ - if (acpi_lapic && acpi_ioapic) { - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " - "information\n"); + if (acpi_lapic && acpi_ioapic) return; - } else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) " - "configuration information\n"); - if (!mpf) - return; + if (x86_quirks->mach_get_smp_config) { + if (x86_quirks->mach_get_smp_config(early)) + return; + } printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); @@ -669,15 +659,15 @@ static void __init __get_smp_config(unsi * ISA defaults and hope it will work. */ if (!mp_irq_entries) { - struct mpc_config_bus bus; + struct mpc_bus bus; printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " "using default mptable. 
" "(tell your hw vendor)\n"); - bus.mpc_type = MP_BUS; - bus.mpc_busid = 0; - memcpy(bus.mpc_bustype, "ISA ", 6); + bus.type = MP_BUS; + bus.busid = 0; + memcpy(bus.bustype, "ISA ", 6); MP_bus_info(&bus); construct_default_ioirq_mptable(0); @@ -823,14 +813,14 @@ void __init find_smp_config(void) #ifdef CONFIG_X86_IO_APIC static u8 __initdata irq_used[MAX_IRQ_SOURCES]; -static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m) +static int __init get_MP_intsrc_index(struct mpc_intsrc *m) { int i; - if (m->mpc_irqtype != mp_INT) + if (m->irqtype != mp_INT) return 0; - if (m->mpc_irqflag != 0x0f) + if (m->irqflag != 0x0f) return 0; /* not legacy */ @@ -842,9 +832,9 @@ static int __init get_MP_intsrc_index(s if (mp_irqs[i].mp_irqflag != 0x0f) continue; - if (mp_irqs[i].mp_srcbus != m->mpc_srcbus) + if (mp_irqs[i].mp_srcbus != m->srcbus) continue; - if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq) + if (mp_irqs[i].mp_srcbusirq != m->srcbusirq) continue; if (irq_used[i]) { /* already claimed */ @@ -860,10 +850,10 @@ static int __init get_MP_intsrc_index(s #define SPARE_SLOT_NUM 20 -static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; +static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; #endif -static int __init replace_intsrc_all(struct mp_config_table *mpc, +static int __init replace_intsrc_all(struct mpc_table *mpc, unsigned long mpc_new_phys, unsigned long mpc_new_length) { @@ -875,36 +865,33 @@ static int __init replace_intsrc_all(st int count = sizeof(*mpc); unsigned char *mpt = ((unsigned char *)mpc) + count; - printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length); - while (count < mpc->mpc_length) { + printk(KERN_INFO "mpc_length %x\n", mpc->length); + while (count < mpc->length) { switch (*mpt) { case MP_PROCESSOR: { - struct mpc_config_processor *m = - (struct mpc_config_processor *)mpt; + struct mpc_cpu *m = (struct mpc_cpu *)mpt; mpt += sizeof(*m); count += sizeof(*m); break; } case MP_BUS: { - struct mpc_config_bus *m = - (struct mpc_config_bus *)mpt; + struct mpc_bus *m = (struct mpc_bus *)mpt; mpt += sizeof(*m); count += sizeof(*m); break; } case MP_IOAPIC: { - mpt += sizeof(struct mpc_config_ioapic); - count += sizeof(struct mpc_config_ioapic); + mpt += sizeof(struct mpc_ioapic); + count += sizeof(struct mpc_ioapic); break; } case MP_INTSRC: { #ifdef CONFIG_X86_IO_APIC - struct mpc_config_intsrc *m = - (struct mpc_config_intsrc *)mpt; + struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; apic_printk(APIC_VERBOSE, "OLD "); print_MP_intsrc_info(m); @@ -925,14 +912,14 @@ static int __init replace_intsrc_all(st nr_m_spare++; } #endif - mpt += sizeof(struct mpc_config_intsrc); - count += sizeof(struct mpc_config_intsrc); + mpt += sizeof(struct mpc_intsrc); + count += sizeof(struct mpc_intsrc); break; } case MP_LINTSRC: { - struct mpc_config_lintsrc *m = - (struct mpc_config_lintsrc *)mpt; + struct mpc_lintsrc *m = + (struct mpc_lintsrc *)mpt; mpt += sizeof(*m); count += sizeof(*m); break; @@ -942,7 +929,7 @@ static int __init replace_intsrc_all(st printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); printk(KERN_ERR "type %x\n", *mpt); print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, - 1, mpc, mpc->mpc_length, 1); + 1, mpc, mpc->length, 1); goto out; } } @@ -964,9 +951,8 @@ static int __init replace_intsrc_all(st assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); m_spare[nr_m_spare] = NULL; } else { - struct mpc_config_intsrc *m = - (struct mpc_config_intsrc *)mpt; - count += sizeof(struct mpc_config_intsrc); + struct mpc_intsrc 
*m = (struct mpc_intsrc *)mpt; + count += sizeof(struct mpc_intsrc); if (!mpc_new_phys) { printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count); } else { @@ -978,17 +964,16 @@ static int __init replace_intsrc_all(st } } assign_to_mpc_intsrc(&mp_irqs[i], m); - mpc->mpc_length = count; - mpt += sizeof(struct mpc_config_intsrc); + mpc->length = count; + mpt += sizeof(struct mpc_intsrc); } print_mp_irq_info(&mp_irqs[i]); } #endif out: /* update checksum */ - mpc->mpc_checksum = 0; - mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc, - mpc->mpc_length); + mpc->checksum = 0; + mpc->checksum -= mpf_checksum((unsigned char *)mpc, mpc->length); return 0; } @@ -1034,8 +1019,7 @@ static int __init update_mp_table(void) char str[16]; char oem[10]; struct intel_mp_floating *mpf; - struct mp_config_table *mpc; - struct mp_config_table *mpc_new; + struct mpc_table *mpc, *mpc_new; if (!enable_update_mptable) return 0; @@ -1061,7 +1045,7 @@ static int __init update_mp_table(void) printk(KERN_INFO "mpf: %lx\n", (long)arbitrary_virt_to_machine(mpf)); printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr); - if (mpc_new_phys && mpc->mpc_length > mpc_new_length) { + if (mpc_new_phys && mpc->length > mpc_new_length) { mpc_new_phys = 0; printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n", mpc_new_length); @@ -1070,10 +1054,10 @@ static int __init update_mp_table(void) if (!mpc_new_phys) { unsigned char old, new; /* check if we can change the postion */ - mpc->mpc_checksum = 0; - old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length); - mpc->mpc_checksum = 0xff; - new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length); + mpc->checksum = 0; + old = mpf_checksum((unsigned char *)mpc, mpc->length); + mpc->checksum = 0xff; + new = mpf_checksum((unsigned char *)mpc, mpc->length); if (old == new) { printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); return 0; @@ -1085,7 +1069,7 @@ static int __init update_mp_table(void) mpc_new_bus = phys_to_machine(mpc_new_phys); mpf->mpf_physptr = mpc_new_bus; mpc_new = phys_to_virt(mpc_new_phys); - memcpy(mpc_new, mpc, mpc->mpc_length); + memcpy(mpc_new, mpc, mpc->length); mpc = mpc_new; /* check if we can modify that */ if (mpc_new_bus - mpf->mpf_physptr) { --- head-2010-04-29.orig/arch/x86/kernel/pci-dma-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/pci-dma-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -30,11 +31,6 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -/* This tells the BIO block layer to assume merging. Default to off - because we cannot guarantee merging later. */ -int iommu_bio_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_bio_merge); - dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -42,7 +38,7 @@ EXPORT_SYMBOL(bad_dma_address); be probably a smaller DMA mask, but this is bug-to-bug compatible to older i386. 
*/ struct device x86_dma_fallback_dev = { - .bus_id = "fallback device", + .init_name = "fallback device", .coherent_dma_mask = DMA_32BIT_MASK, .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; @@ -105,8 +101,6 @@ static void __init dma32_free_bootmem(vo dma32_bootmem_ptr = NULL; dma32_bootmem_size = 0; } -#else -#define dma32_free_bootmem() ((void)0) #endif static struct dma_mapping_ops swiotlb_dma_ops = { @@ -128,8 +122,11 @@ static struct dma_mapping_ops swiotlb_dm void __init pci_iommu_alloc(void) { +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); +#endif + /* * The order of these functions is important for * fall-back/fail-over reasons @@ -149,16 +146,6 @@ void __init pci_iommu_alloc(void) } } -#ifndef CONFIG_XEN -unsigned long iommu_nr_pages(unsigned long addr, unsigned long len) -{ - unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); - - return size >> PAGE_SHIFT; -} -EXPORT_SYMBOL(iommu_nr_pages); -#endif - void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { @@ -246,7 +233,6 @@ static __init int iommu_setup(char *p) } if (!strncmp(p, "biomerge", 8)) { - iommu_bio_merge = 4096; iommu_merge = 1; force_iommu = 1; } @@ -385,8 +371,8 @@ fs_initcall(pci_iommu_init); static __devinit void via_no_dac(struct pci_dev *dev) { if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO "PCI: VIA PCI bridge detected." - "Disabling DAC.\n"); + printk(KERN_INFO + "PCI: VIA PCI bridge detected. Disabling DAC.\n"); forbid_dac = 1; } } --- head-2010-04-29.orig/arch/x86/kernel/process-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/process-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -1,13 +1,17 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include +#include +#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -99,6 +103,9 @@ static inline int hlt_use_halt(void) */ void xen_idle(void) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -111,11 +118,27 @@ void xen_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; + trace_power_end(&it); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(default_idle); #endif +void stop_this_cpu(void *dummy) +{ + local_irq_disable(); + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); + disable_all_local_evtchn(); + + for (;;) { + if (hlt_works(smp_processor_id())) + halt(); + } +} + static void do_nothing(void *unused) { } @@ -149,24 +172,37 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { + if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)&current_thread_info()->flags); + + __monitor((void *)&current_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __mwait(ax, cx); } + trace_power_end(&it); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { + struct power_trace it; if (!need_resched()) { + trace_power_start(&it, POWER_CSTATE, 1); + if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)&current_thread_info()->flags); + + __monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb(); if (!need_resched()) __sti_mwait(0, 0); else local_irq_enable(); + trace_power_end(&it); } else local_irq_enable(); } @@ -179,9 +215,13 @@ static void mwait_idle(void) */ static void poll_idle(void) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); + trace_power_end(&it); } #ifndef CONFIG_XEN @@ -267,7 +307,7 @@ static void c1e_idle(void) rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (lo & K8_INTP_C1E_ACTIVE_MASK) { c1e_detected = 1; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); --- head-2010-04-29.orig/arch/x86/kernel/process_32-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/process_32-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -38,11 +38,13 @@ #include #include #include +#include +#include +#include +#include -#include #include #include -#include #include #include #include @@ -59,10 +61,9 @@ #include #include -#include #include #include -#include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork"); @@ -108,9 +109,6 @@ void cpu_idle(void) check_pgt_cache(); rmb(); - if (rcu_pending(cpu)) - rcu_check_callbacks(cpu, 0); - if (cpu_is_offline(cpu)) play_dead(); @@ -208,7 +206,7 @@ extern void kernel_thread_helper(void); /* * Create a kernel thread */ -int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) +int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct pt_regs regs; @@ -247,14 +245,8 @@ void exit_thread(void) t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); } -#ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(current->thread.ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(current->thread.ds_ctx); - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -267,7 +259,7 @@ void flush_thread(void) tsk->thread.debugreg3 = 0; tsk->thread.debugreg6 = 0; tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); clear_tsk_thread_flag(tsk, TIF_DEBUG); /* * Forget coprocessor state.. 
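Two details in the idle-path hunks above are worth unpacking. First, both MWAIT sites gain the same erratum workaround: on CPUs flagged X86_FEATURE_CLFLUSH_MONITOR, the monitored cache line must be flushed before MONITOR is armed, or MWAIT can miss the store that should wake the CPU. Condensed into one sketch (kernel context assumed, names as in the hunk):

static void mwait_idle_sketch(unsigned long hints)
{
        if (need_resched())
                return;

        /* Erratum workaround: evict the line before arming MONITOR. */
        if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
                clflush((void *)&current_thread_info()->flags);

        __monitor((void *)&current_thread_info()->flags, 0, 0);
        smp_mb();       /* re-check only after MONITOR is armed */
        if (!need_resched())
                __mwait(hints, 0); /* wakes when the flags word is written */
}

Second, the one-line c1e_idle() change swaps X86_FEATURE_CONSTANT_TSC for X86_FEATURE_NONSTOP_TSC: a constant-rate TSC can still halt in C1E, so only a nonstop TSC may remain a clocksource. The detection around it is a single MSR read, roughly:

/* Sketch of the C1E probe in c1e_idle(); kernel context assumed. */
static int amd_c1e_active_sketch(void)
{
        u32 lo, hi;

        rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
        if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
                return 0;

        /* Only a NONSTOP (not merely constant-rate) TSC keeps counting
         * in C1E, hence the feature-flag change above. */
        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                mark_tsc_unstable("TSC halt in AMD C1E");
        return 1;
}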
@@ -294,9 +286,9 @@ void prepare_to_copy(struct task_struct int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, - struct task_struct * p, struct pt_regs * regs) + struct task_struct *p, struct pt_regs *regs) { - struct pt_regs * childregs; + struct pt_regs *childregs; struct task_struct *tsk; int err; @@ -340,13 +332,19 @@ int copy_thread(int nr, unsigned long cl kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + + ds_copy_thread(p, current); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + return err; } void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { - __asm__("movl %0, %%gs" :: "r"(0)); + __asm__("movl %0, %%gs" : : "r"(0)); regs->fs = 0; set_fs(USER_DS); regs->ds = __USER_DS; @@ -420,47 +418,18 @@ int set_tsc_mode(unsigned int val) return 0; } -#ifdef CONFIG_X86_DS -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - unsigned long ds_prev = 0; - unsigned long ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsr(MSR_IA32_DS_AREA, ds_next, 0); - } - return debugctl; -} -#else -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - return debugctl; -} -#endif /* CONFIG_X86_DS */ - static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread; next = &next_p->thread; - debugctl = update_debugctl(prev, next, prev->debugctlmsr); - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -481,14 +450,6 @@ __switch_to_xtra(struct task_struct *pre else hard_enable_TSC(); } - -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ } /* @@ -518,7 +479,8 @@ __switch_to_xtra(struct task_struct *pre * the task-switch, and shows up in ret_from_fork in entry.S, * for example. 
*/ -struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; @@ -698,7 +660,7 @@ asmlinkage int sys_vfork(struct pt_regs asmlinkage int sys_execve(struct pt_regs regs) { int error; - char * filename; + char *filename; filename = getname((char __user *) regs.bx); error = PTR_ERR(filename); --- head-2010-04-29.orig/arch/x86/kernel/process_64-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/process_64-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -42,6 +42,8 @@ #include #include #include +#include +#include #include #include @@ -59,6 +61,7 @@ #include #include #include +#include #include @@ -158,14 +161,18 @@ void __show_regs(struct pt_regs *regs, i unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, cs, es; + const char *board; printk("\n"); print_modules(); - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n", + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!board) + board = ""; + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, board); printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, @@ -256,14 +263,8 @@ void exit_thread(void) #endif t->io_bitmap_max = 0; } -#ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(t->ds_ctx)) { - /* we clear debugctl to make sure DS is not used. 
*/ - update_debugctlmsr(0); - ds_free(t->ds_ctx); - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void xen_load_gs_index(unsigned gs) @@ -399,6 +400,11 @@ int copy_thread(int nr, unsigned long cl } p->thread.iopl = current->thread.iopl; + ds_copy_thread(p, me); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + err = 0; out: if (err && p->thread.io_bitmap_ptr) { @@ -495,35 +501,14 @@ static inline void __switch_to_xtra(stru struct task_struct *next_p) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread, next = &next_p->thread; - debugctl = prev->debugctlmsr; - -#ifdef CONFIG_X86_DS - { - unsigned long ds_prev = 0, ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* - * We clear debugctl to make sure DS - * is not in use when we change it: - */ - debugctl = 0; - update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, ds_next); - } - } -#endif /* CONFIG_X86_DS */ - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -544,14 +529,6 @@ static inline void __switch_to_xtra(stru else hard_enable_TSC(); } - -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ } /* @@ -562,8 +539,9 @@ static inline void __switch_to_xtra(stru * - could test fs/gs bitsliced * * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. */ -struct task_struct * +__notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; --- head-2010-04-29.orig/arch/x86/kernel/quirks-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/quirks-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -169,6 +169,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4, + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, ich_force_enable_hpet); --- head-2010-04-29.orig/arch/x86/kernel/setup-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/setup-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -93,11 +93,13 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -508,6 +510,7 @@ static void __init reserve_early_setup_d * @size: Size of the crashkernel memory to reserve. * Returns the base address on success, and -1ULL on failure. 
*/ +static unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) { const unsigned long long alignment = 16<<20; /* 16M */ @@ -650,165 +653,32 @@ static int __init setup_elfcorehdr(char early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; - -struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; - -/* - * Some BIOSes seem to corrupt the low 64k of memory during events - * like suspend/resume and unplugging an HDMI cable. Reserve all - * remaining free memory in that area and fill it with a distinct - * pattern. - */ -#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION -#define MAX_SCAN_AREAS 8 - -static int __read_mostly memory_corruption_check = -1; - -static unsigned __read_mostly corruption_check_size = 64*1024; -static unsigned __read_mostly corruption_check_period = 60; /* seconds */ - -static struct e820entry scan_areas[MAX_SCAN_AREAS]; -static int num_scan_areas; - - -static int set_corruption_check(char *arg) -{ - char *end; - - memory_corruption_check = simple_strtol(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check", set_corruption_check); - -static int set_corruption_check_period(char *arg) -{ - char *end; - - corruption_check_period = simple_strtoul(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check_period", set_corruption_check_period); - -static int set_corruption_check_size(char *arg) +#ifndef CONFIG_XEN +static int __init default_update_genapic(void) { - char *end; - unsigned size; - - size = memparse(arg, &end); - - if (*end == '\0') - corruption_check_size = size; +#ifdef CONFIG_X86_SMP +# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64) + genapic->wakeup_cpu = wakeup_secondary_cpu_via_init; +# endif +#endif - return (size == corruption_check_size) ? 
0 : -EINVAL; + return 0; } -early_param("memory_corruption_check_size", set_corruption_check_size); - - -static void __init setup_bios_corruption_check(void) -{ - u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ - - if (memory_corruption_check == -1) { - memory_corruption_check = -#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK - 1 #else - 0 +#define default_update_genapic NULL #endif - ; - } - - if (corruption_check_size == 0) - memory_corruption_check = 0; - - if (!memory_corruption_check) - return; - - corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - - while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { - u64 size; - addr = find_e820_area_size(addr, &size, PAGE_SIZE); - - if (addr == 0) - break; - - if ((addr + size) > corruption_check_size) - size = corruption_check_size - addr; - if (size == 0) - break; - - e820_update_range(addr, size, E820_RAM, E820_RESERVED); - scan_areas[num_scan_areas].addr = addr; - scan_areas[num_scan_areas].size = size; - num_scan_areas++; - - /* Assume we've already mapped this early memory */ - memset(__va(addr), 0, size); - - addr += size; - } - - printk(KERN_INFO "Scanning %d areas for low memory corruption\n", - num_scan_areas); - update_e820(); -} - -static struct timer_list periodic_check_timer; - -void check_for_bios_corruption(void) -{ - int i; - int corruption = 0; - - if (!memory_corruption_check) - return; - - for(i = 0; i < num_scan_areas; i++) { - unsigned long *addr = __va(scan_areas[i].addr); - unsigned long size = scan_areas[i].size; - - for(; size; addr++, size -= sizeof(unsigned long)) { - if (!*addr) - continue; - printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", - addr, __pa(addr), *addr); - corruption = 1; - *addr = 0; - } - } - - WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); -} - -static void periodic_check_for_corruption(unsigned long data) -{ - check_for_bios_corruption(); - mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ)); -} - -void start_periodic_check_for_corruption(void) -{ - if (!memory_corruption_check || corruption_check_period == 0) - return; - - printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", - corruption_check_period); +static struct x86_quirks default_x86_quirks __initdata = { + .update_genapic = default_update_genapic, +}; - init_timer(&periodic_check_timer); - periodic_check_timer.function = &periodic_check_for_corruption; - periodic_check_for_corruption(0); -} -#endif +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; +#ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { printk(KERN_NOTICE - "%s detected: BIOS may corrupt low RAM, working it around.\n", + "%s detected: BIOS may corrupt low RAM, working around it.\n", d->ident); e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); @@ -816,6 +686,7 @@ static int __init dmi_low_memory_corrupt return 0; } +#endif /* List of systems that have known low memory corruption BIOS problems */ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { @@ -1023,15 +894,25 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); + if (efi_enabled) + efi_init(); + if (is_initial_xendomain()) { dmi_scan_machine(); dmi_check_system(bad_bios_dmi_table); + } + + /* + * VMware detection requires dmi to be available, so this + * needs to be done after dmi_scan_machine, for the BP. 
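Most of what this hunk deletes is the low-memory corruption scanner, which moved out of setup-xen.c for 2.6.29; what remains is a default ->update_genapic hook in x86_quirks. The construct is the usual ops structure with overridable defaults; a self-contained sketch of the pattern (names here are illustrative, the real struct x86_quirks lives in asm/setup.h):

struct quirks_sketch {
        int (*update_genapic)(void);    /* NULL means "nothing to do" */
};

static int default_update_genapic_sketch(void)
{
        /* e.g. select the INIT/SIPI secondary-CPU wakeup path */
        return 0;
}

static struct quirks_sketch quirks = {
        .update_genapic = default_update_genapic_sketch,
};

static void apply_quirks(void)
{
        if (quirks.update_genapic)  /* platforms may override or clear it */
                quirks.update_genapic();
}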
+ */ + init_hypervisor(&boot_cpu_data); #ifdef CONFIG_X86_32 + if (is_initial_xendomain()) probe_roms(); #endif - } #ifndef CONFIG_XEN /* after parse_early_param, so could debug it */ @@ -1039,8 +920,6 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - if (efi_enabled) - efi_init(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { @@ -1295,7 +1174,7 @@ void __init setup_arch(char **cmdline_p) ioapic_init_mappings(); /* need to wait for io_apic is mapped */ - nr_irqs = probe_nr_irqs(); + probe_nr_irqs_gsi(); kvm_guest_init(); --- head-2010-04-29.orig/arch/x86/kernel/smp-xen.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/smp-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -1,7 +1,7 @@ /* * Intel SMP support routines. * - * (c) 1995 Alan Cox, Building #3 + * (c) 1995 Alan Cox, Building #3 * (c) 1998-99, 2000 Ingo Molnar * (c) 2002,2003 Andi Kleen, SuSE Labs. * @@ -118,30 +118,17 @@ void xen_smp_send_reschedule(int cpu) WARN_ON(1); return; } - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); + send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } void xen_send_call_func_single_ipi(int cpu) { - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_VECTOR); + send_IPI_mask(cpumask_of(cpu), CALL_FUNC_SINGLE_VECTOR); } -void xen_send_call_func_ipi(cpumask_t mask) +void xen_send_call_func_ipi(const struct cpumask *mask) { - send_IPI_mask(mask, CALL_FUNCTION_VECTOR); -} - -static void stop_this_cpu(void *dummy) -{ - local_irq_disable(); - /* - * Remove this CPU: - */ - cpu_clear(smp_processor_id(), cpu_online_map); - disable_all_local_evtchn(); - if (hlt_works(smp_processor_id())) - for (;;) halt(); - for (;;); + send_IPI_mask_allbutself(mask, CALL_FUNCTION_VECTOR); } /* @@ -165,11 +152,7 @@ void xen_smp_send_stop(void) */ irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) { -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_resched_count++; -#else - add_pda(irq_resched_count, 1); -#endif + inc_irq_stat(irq_resched_count); return IRQ_HANDLED; } @@ -177,11 +160,7 @@ irqreturn_t smp_call_function_interrupt( { irq_enter(); generic_smp_call_function_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); return IRQ_HANDLED; @@ -191,11 +170,7 @@ irqreturn_t smp_call_function_single_int { irq_enter(); generic_smp_call_function_single_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); return IRQ_HANDLED; --- head-2010-04-29.orig/arch/x86/kernel/time-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/time-xen.c 2010-05-11 17:14:09.000000000 +0200 @@ -454,11 +454,7 @@ irqreturn_t timer_interrupt(int irq, voi struct vcpu_runstate_info runstate; /* Keep nmi watchdog up to date */ -#ifdef __i386__ - x86_add_percpu(irq_stat.irq0_irqs, 1); -#else - add_pda(irq0_irqs, 1); -#endif + inc_irq_stat(irq0_irqs); /* * Here we are in the timer irq handler. We just have irqs locally @@ -518,7 +514,6 @@ irqreturn_t timer_interrupt(int irq, voi /* * Account stolen ticks. - * HACK: Passing NULL to account_steal_time() * ensures that the ticks are accounted as stolen. 
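The stolen/blocked-time arithmetic just below deserves a gloss: the hypervisor reports runstate times in nanoseconds, the handler converts each delta to whole ticks with do_div(), hands those to the new 2.6.29 interfaces account_steal_ticks()/account_idle_ticks(), and leaves the sub-tick remainder in the per-CPU processed_* counters for the next interrupt. In sketch form (kernel context assumed):

/* ns -> ticks bookkeeping, as done for RUNSTATE_runnable below. */
static void account_stolen_sketch(u64 runstate_ns, u64 *processed_ns)
{
        u64 stolen = runstate_ns - *processed_ns;

        do_div(stolen, NS_PER_TICK);            /* stolen is now whole ticks */
        *processed_ns += stolen * NS_PER_TICK;  /* remainder carries over */
        account_steal_ticks(stolen);
}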
*/ stolen = runstate.time[RUNSTATE_runnable] @@ -531,12 +526,11 @@ irqreturn_t timer_interrupt(int irq, voi do_div(stolen, NS_PER_TICK); per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK; per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK; - account_steal_time(NULL, (cputime_t)stolen); + account_steal_ticks(stolen); } /* * Account blocked ticks. - * HACK: Passing idle_task to account_steal_time() * ensures that the ticks are accounted as idle/wait. */ blocked = runstate.time[RUNSTATE_blocked] @@ -548,18 +542,23 @@ irqreturn_t timer_interrupt(int irq, voi do_div(blocked, NS_PER_TICK); per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK; per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK; - account_steal_time(idle_task(cpu), (cputime_t)blocked); + account_idle_ticks(blocked); } /* Account user/system ticks. */ if (delta_cpu > 0) { + cputime_t ct; + do_div(delta_cpu, NS_PER_TICK); per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK; + ct = jiffies_to_cputime(delta_cpu); if (user_mode_vm(get_irq_regs())) - account_user_time(current, (cputime_t)delta_cpu); - else + account_user_time(current, ct, cputime_to_scaled(ct)); + else if (current != idle_task(cpu)) account_system_time(current, HARDIRQ_OFFSET, - (cputime_t)delta_cpu); + ct, cputime_to_scaled(ct)); + else + account_idle_ticks(delta_cpu); } /* Offlined for more than a few seconds? Avoid lockup warnings. */ @@ -788,7 +786,7 @@ static void stop_hz_timer(void) unsigned long j; int rc; - cpu_set(cpu, nohz_cpu_mask); + cpumask_set_cpu(cpu, nohz_cpu_mask); /* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */ /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */ @@ -804,7 +802,7 @@ static void stop_hz_timer(void) local_softirq_pending() || (j = get_next_timer_interrupt(jiffies), time_before_eq(j, jiffies))) { - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); j = jiffies + 1; } @@ -835,7 +833,7 @@ static void start_hz_timer(void) } #endif BUG_ON(rc); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); } void xen_safe_halt(void) --- head-2010-04-29.orig/arch/x86/kernel/traps-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/traps-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -51,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -65,18 +63,10 @@ #else #include #include -#include -#include -#include #include #include "cpu/mcheck/mce.h" -#ifndef CONFIG_XEN -DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); -#endif - asmlinkage int system_call(void); /* Do we ignore FPU interrupts ? */ @@ -93,6 +83,11 @@ gate_desc idt_table[256] #endif #endif +#ifndef CONFIG_XEN +DECLARE_BITMAP(used_vectors, NR_VECTORS); +EXPORT_SYMBOL_GPL(used_vectors); +#endif + static int ignore_nmis; static inline void conditional_sti(struct pt_regs *regs) @@ -108,6 +103,12 @@ static inline void preempt_conditional_s local_irq_enable(); } +static inline void conditional_cli(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); +} + static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) @@ -298,8 +299,10 @@ dotraplinkage void do_double_fault(struc tsk->thread.error_code = error_code; tsk->thread.trap_no = 8; - /* This is always a kernel trap and never fixable (and thus must - never return). 
*/ + /* + * This is always a kernel trap and never fixable (and thus must + * never return). + */ for (;;) die(str, regs, error_code); } @@ -476,11 +479,7 @@ do_nmi(struct pt_regs *regs, long error_ { nmi_enter(); -#ifdef CONFIG_X86_32 - { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } -#else - add_pda(__nmi_count, 1); -#endif + inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); @@ -519,9 +518,11 @@ dotraplinkage void __kprobes do_int3(str } #if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. The actual stack switch is done in - entry.S */ +/* + * Help handler running on IST stack to switch back to user stack + * for scheduling or signal handling. The actual stack switch is done in + * entry.S + */ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; @@ -531,8 +532,10 @@ asmlinkage __kprobes struct pt_regs *syn /* Exception from user space */ else if (user_mode(eregs)) regs = task_pt_regs(current); - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ + /* + * Exception from kernel and interrupts are enabled. Move to + * kernel process stack. + */ else if (eregs->flags & X86_EFLAGS_IF) regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); if (eregs != regs) @@ -624,8 +627,10 @@ clear_dr7: #ifdef CONFIG_X86_32 debug_vm86: + /* reenable preemption: handle_vm86_trap() might sleep */ + dec_preempt_count(); handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - preempt_conditional_cli(regs); + conditional_cli(regs); return; #endif @@ -659,7 +664,7 @@ void math_error(void __user *ip) { struct task_struct *task; siginfo_t info; - unsigned short cwd, swd; + unsigned short cwd, swd, err; /* * Save the info for the exception handler and clear the error. @@ -670,7 +675,6 @@ void math_error(void __user *ip) task->thread.error_code = 0; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_code = __SI_FAULT; info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked @@ -684,34 +688,30 @@ void math_error(void __user *ip) */ cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ -#ifdef CONFIG_X86_32 - return; -#endif - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ + + err = swd & ~cwd; + + if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ + } else if (err & 0x004) { /* Divide by Zero */ info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ + } else if (err & 0x008) { /* Overflow */ info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ + } else if (err & 0x012) { /* Denormal, Underflow */ + info.si_code = FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ info.si_code = FPE_FLTRES; - break; + } else { + /* + * If we're using IRQ 13, or supposedly even some trap 16 + * implementations, it's possible we get a spurious trap... 
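The rewritten classifier in math_error() above is easier to audit once the masking rule is explicit: only x87 exceptions that are raised in the status word *and* unmasked in the control word get a si_code, tested in priority order, and anything else is treated as spurious. A runnable decoder of the same logic:

#include <stdio.h>

/* Mirrors the swd & ~cwd priority decode in math_error() above. */
static const char *fpu_si_code(unsigned short cwd, unsigned short swd)
{
        unsigned short err = swd & ~cwd;        /* raised *and* unmasked */

        if (err & 0x001) return "FPE_FLTINV";   /* invalid operation */
        if (err & 0x004) return "FPE_FLTDIV";   /* divide by zero */
        if (err & 0x008) return "FPE_FLTOVF";   /* overflow */
        if (err & 0x012) return "FPE_FLTUND";   /* denormal/underflow */
        if (err & 0x020) return "FPE_FLTRES";   /* precision */
        return "spurious";                      /* nothing unmasked pending */
}

int main(void)
{
        /* CWD 0x0338 leaves the zero-divide bit (0x004) unmasked: */
        printf("%s\n", fpu_si_code(0x0338, 0x0004));    /* FPE_FLTDIV */
        return 0;
}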
+ */ + return; /* Spurious trap, no error */ } force_sig_info(SIGFPE, &info, task); } @@ -901,7 +901,7 @@ asmlinkage void math_state_restore(void) EXPORT_SYMBOL_GPL(math_state_restore); #ifndef CONFIG_MATH_EMULATION -asmlinkage void math_emulate(long arg) +void math_emulate(struct math_emu_info *info) { printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n"); @@ -911,16 +911,19 @@ asmlinkage void math_emulate(long arg) } #endif /* CONFIG_MATH_EMULATION */ -dotraplinkage void __kprobes -do_device_not_available(struct pt_regs *regs, long error) +dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs) { #if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) if (read_cr0() & X86_CR0_EM) { - conditional_sti(regs); - math_emulate(0); + struct math_emu_info info = { }; + + conditional_sti(&regs); + + info.regs = &regs; + math_emulate(&info); } else { math_state_restore(); /* interrupts still off */ - conditional_sti(regs); + conditional_sti(&regs); } #else math_state_restore(); --- head-2010-04-29.orig/arch/x86/kernel/vsyscall_64-xen.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/kernel/vsyscall_64-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -17,6 +17,9 @@ * want per guest time just set the kernel.vsyscall64 sysctl to 0. */ +/* Disable profiling for userspace code: */ +#define DISABLE_BRANCH_PROFILING + #include #include #include @@ -128,7 +131,16 @@ static __always_inline void do_vgettimeo gettimeofday(tv,NULL); return; } + + /* + * Surround the RDTSC by barriers, to make sure it's not + * speculated to outside the seqlock critical section and + * does not cause time warps: + */ + rdtsc_barrier(); now = vread(); + rdtsc_barrier(); + base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; --- head-2010-04-29.orig/arch/x86/mm/fault-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/fault-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -53,7 +53,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -#ifdef CONFIG_MMIOTRACE_HOOKS +#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; @@ -406,7 +406,7 @@ static void show_fault_oops(struct pt_re if (pte && pte_present(*pte) && !pte_exec(*pte)) printk(KERN_CRIT "kernel tried to execute " "NX-protected page - exploit attempt? " - "(uid: %d)\n", current->uid); + "(uid: %d)\n", current_uid()); } #endif @@ -426,6 +426,7 @@ static noinline void pgtable_bad(unsigne unsigned long error_code) { unsigned long flags = oops_begin(); + int sig = SIGKILL; struct task_struct *tsk; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", @@ -436,8 +437,8 @@ static noinline void pgtable_bad(unsigne tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; if (__die("Bad pagetable", regs, error_code)) - regs = NULL; - oops_end(flags, regs, SIGKILL); + sig = 0; + oops_end(flags, regs, sig); } #endif @@ -546,10 +547,7 @@ static int vmalloc_fault(unsigned long a happen within a race in page table update. In the later case just flush.
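The vmalloc_fault() hunk here replaces the cr3-based pgd lookup, which is unreliable under Xen because cr3 holds a machine frame, with a plain pgd_offset() on current->active_mm. The surrounding logic is the standard lazy vmalloc PGD sync; roughly (kernel context assumed):

static int vmalloc_fault_sketch(unsigned long address)
{
        pgd_t *pgd = pgd_offset(current->active_mm, address);
        pgd_t *pgd_ref = pgd_offset_k(address); /* init_mm's reference copy */

        if (pgd_none(*pgd_ref))
                return -1;              /* not a vmalloc address after all */
        if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref); /* propagate the entry lazily */
        else
                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
        return 0;
}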
*/ - /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/ - pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); - pgd += pgd_index(address); + pgd = pgd_offset(current->active_mm, address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; @@ -606,6 +604,7 @@ void __kprobes do_page_fault(struct pt_r int fault; #ifdef CONFIG_X86_64 unsigned long flags; + int sig; #endif /* Set the "privileged fault" bit to something sane. */ @@ -623,8 +622,6 @@ void __kprobes do_page_fault(struct pt_r si_code = SEGV_MAPERR; - if (notify_page_fault(regs)) - return; if (unlikely(kmmio_fault(regs, address))) return; @@ -663,6 +660,9 @@ void __kprobes do_page_fault(struct pt_r if (spurious_fault(address, error_code)) return; + /* kprobes don't want to hook the spurious faults. */ + if (notify_page_fault(regs)) + return; /* * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock. @@ -670,6 +670,9 @@ void __kprobes do_page_fault(struct pt_r goto bad_area_nosemaphore; } + /* kprobes don't want to hook the spurious faults. */ + if (notify_page_fault(regs)) + return; /* * It's safe to allow irq's after cr2 has been saved and the @@ -696,7 +699,6 @@ void __kprobes do_page_fault(struct pt_r if (unlikely(in_atomic() || !mm)) goto bad_area_nosemaphore; -again: /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -880,32 +882,22 @@ no_context: bust_spinlocks(0); do_exit(SIGKILL); #else + sig = SIGKILL; if (__die("Oops", regs, error_code)) - regs = NULL; + sig = 0; /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, SIGKILL); + oops_end(flags, regs, sig); #endif -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ out_of_memory: + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). 
+ */ up_read(&mm->mmap_sem); - if (is_global_init(tsk)) { - yield(); - /* - * Re-lookup the vma - in theory the vma tree might - * have changed: - */ - goto again; - } - - printk("VM: killing process %s\n", tsk->comm); - if (error_code & PF_USER) - do_group_exit(SIGKILL); - goto no_context; + pagefault_out_of_memory(); + return; do_sigbus: up_read(&mm->mmap_sem); --- head-2010-04-29.orig/arch/x86/mm/hypervisor.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/hypervisor.c 2010-03-24 15:17:58.000000000 +0100 @@ -79,12 +79,12 @@ static void multicall_failed(const multi BUG(); } -int xen_multicall_flush(bool ret_last) { +static int _xen_multicall_flush(bool ret_last) { struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu); multicall_entry_t *mc = lazy->mc; unsigned int count = lazy->nr_mc; - if (!count || !use_lazy_mmu_mode()) + if (!count) return 0; lazy->nr_mc = 0; @@ -112,6 +112,11 @@ int xen_multicall_flush(bool ret_last) { return 0; } + +void xen_multicall_flush(bool force) { + if (force || use_lazy_mmu_mode()) + _xen_multicall_flush(false); +} EXPORT_SYMBOL(xen_multicall_flush); int xen_multi_update_va_mapping(unsigned long va, pte_t pte, @@ -130,7 +135,7 @@ int xen_multi_update_va_mapping(unsigned #endif if (unlikely(lazy->nr_mc == NR_MC)) - xen_multicall_flush(false); + _xen_multicall_flush(false); mc = lazy->mc + lazy->nr_mc++; mc->op = __HYPERVISOR_update_va_mapping; @@ -169,7 +174,7 @@ int xen_multi_mmu_update(mmu_update_t *s merge = lazy->nr_mc && !commit && mmu_may_merge(mc - 1, __HYPERVISOR_mmu_update, domid); if (unlikely(lazy->nr_mc == NR_MC) && !merge) { - xen_multicall_flush(false); + _xen_multicall_flush(false); mc = lazy->mc; commit = count > NR_MMU || success_count; } @@ -207,7 +212,7 @@ int xen_multi_mmu_update(mmu_update_t *s break; } - return commit ? xen_multicall_flush(true) : 0; + return commit ? _xen_multicall_flush(true) : 0; } int xen_multi_mmuext_op(struct mmuext_op *src, unsigned int count, @@ -291,7 +296,7 @@ int xen_multi_mmuext_op(struct mmuext_op merge = lazy->nr_mc && !commit && mmu_may_merge(mc - 1, __HYPERVISOR_mmuext_op, domid); if (unlikely(lazy->nr_mc == NR_MC) && !merge) { - xen_multicall_flush(false); + _xen_multicall_flush(false); mc = lazy->mc; commit = count > NR_MMUEXT || success_count; } @@ -338,7 +343,7 @@ int xen_multi_mmuext_op(struct mmuext_op break; } - return commit ? xen_multicall_flush(true) : 0; + return commit ? 
_xen_multicall_flush(true) : 0; } void xen_l1_entry_update(pte_t *ptr, pte_t val) --- head-2010-04-29.orig/arch/x86/mm/init_32-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/init_32-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -71,7 +71,7 @@ static unsigned long __initdata table_to static int __initdata after_init_bootmem; -static __init void *alloc_low_page(unsigned long *phys) +static __init void *alloc_low_page(void) { unsigned long pfn = table_end++; void *adr; @@ -81,7 +81,6 @@ static __init void *alloc_low_page(unsig adr = __va(pfn * PAGE_SIZE); memset(adr, 0, PAGE_SIZE); - *phys = pfn * PAGE_SIZE; return adr; } @@ -96,17 +95,18 @@ static pmd_t * __init one_md_table_init( pmd_t *pmd_table; #ifdef CONFIG_X86_PAE - unsigned long phys; if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) { if (after_init_bootmem) pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); else - pmd_table = (pmd_t *)alloc_low_page(&phys); + pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); + + return pmd_table; } #endif pud = pud_offset(pgd, 0); @@ -135,10 +135,8 @@ static pte_t * __init one_page_table_ini if (!page_table) page_table = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); - } else { - unsigned long phys; - page_table = (pte_t *)alloc_low_page(&phys); - } + } else + page_table = (pte_t *)alloc_low_page(); paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); make_lowmem_page_readonly(page_table, @@ -150,6 +148,51 @@ static pte_t * __init one_page_table_ini return pte_offset_kernel(pmd, 0); } +static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, + unsigned long vaddr, pte_t *lastpte) +{ +#ifdef CONFIG_HIGHMEM + /* + * Something (early fixmap) may already have put a pte + * page here, which causes the page table allocation + * to become nonlinear. Attempt to fix it, and if it + * is still nonlinear then we have to bug. 
+ */ + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + + if (pmd_idx_kmap_begin != pmd_idx_kmap_end + && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end + && ((__pa(pte) >> PAGE_SHIFT) < table_start + || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { + pte_t *newpte; + int i; + + BUG_ON(after_init_bootmem); + newpte = alloc_low_page(); + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(newpte + i, pte[i]); + + paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); + make_lowmem_page_readonly(newpte, + XENFEAT_writable_page_tables); + set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); + BUG_ON(newpte != pte_offset_kernel(pmd, 0)); + __flush_tlb_all(); + + paravirt_release_pte(__pa(pte) >> PAGE_SHIFT); + make_lowmem_page_writable(pte, + XENFEAT_writable_page_tables); + pte = newpte; + } + BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1) + && vaddr > fix_to_virt(FIX_KMAP_END) + && lastpte && lastpte + PTRS_PER_PTE != pte); +#endif + return pte; +} + /* * This function initializes a certain range of kernel virtual memory * with new bootmem page tables, everywhere page tables are missing in @@ -166,6 +209,7 @@ page_table_range_init(unsigned long star unsigned long vaddr; pgd_t *pgd; pmd_t *pmd; + pte_t *pte = NULL; vaddr = start; pgd_idx = pgd_index(vaddr); @@ -177,8 +221,10 @@ page_table_range_init(unsigned long star pmd = pmd + pmd_index(vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (vaddr < hypervisor_virt_start) - one_page_table_init(pmd); + if (vaddr >= hypervisor_virt_start) + break; + pte = page_table_kmap_check(one_page_table_init(pmd), + pmd, vaddr, pte); vaddr += PMD_SIZE; } @@ -361,6 +407,8 @@ int devmem_is_allowed(unsigned long page { if (pagenr <= 256) return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; if (mfn_to_local_pfn(pagenr) >= max_pfn) return 1; return 0; @@ -476,8 +524,12 @@ static void __init set_highmem_pages_ini #endif /* !CONFIG_NUMA */ #else -# define permanent_kmaps_init(pgd_base) do { } while (0) -# define set_highmem_pages_init() do { } while (0) +static inline void permanent_kmaps_init(pgd_t *pgd_base) +{ +} +static inline void set_highmem_pages_init(void) +{ +} #endif /* CONFIG_HIGHMEM */ pgd_t *swapper_pg_dir; @@ -509,7 +561,6 @@ static void __init early_ioremap_page_ta * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): */ - early_ioremap_clear(); vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; page_table_range_init(vaddr, end, pgd_base); @@ -856,10 +907,7 @@ static void __init find_early_table_spac tables += PAGE_ALIGN(ptes * sizeof(pte_t)); /* for fixmap */ - tables += PAGE_SIZE - * ((((FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK) - - (__fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK)) - >> PMD_SHIFT); + tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t)); table_start = extend_init_mapping(tables); @@ -1023,8 +1071,6 @@ void __init mem_init(void) pci_iommu_alloc(); - start_periodic_check_for_corruption(); - #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif @@ -1099,11 +1145,25 @@ void __init mem_init(void) (unsigned long)&_text, (unsigned long)&_etext, ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + /* + * Check boundaries twice: Some fundamental inconsistencies can + * be detected at build time already. 
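The doubled-up checks added to mem_init() above rely on BUILD_BUG_ON() failing at compile time whenever its condition constant-folds, which is why __FIXADDR_TOP and high_memory are temporarily #defined to constants around them while the runtime BUG_ON()s keep covering the variable values. BUILD_BUG_ON itself is the negative-array-size trick:

/* Compile-time assert, as in linux/kernel.h. */
#define BUILD_BUG_ON_SKETCH(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

static inline void layout_checks_sketch(void)
{
        /* Compiles only when the constant-folded condition is false. */
        BUILD_BUG_ON_SKETCH(sizeof(long) < 4);
}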
+ */ +#define __FIXADDR_TOP (-PAGE_SIZE) +#ifdef CONFIG_HIGHMEM + BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif +#define high_memory (-128UL << 20) + BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); +#undef high_memory +#undef __FIXADDR_TOP + #ifdef CONFIG_HIGHMEM BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); BUG_ON(VMALLOC_END > PKMAP_BASE); #endif - BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON(VMALLOC_START >= VMALLOC_END); BUG_ON((unsigned long)high_memory > VMALLOC_START); if (boot_cpu_data.wp_works_ok < 0) @@ -1123,7 +1183,7 @@ int arch_add_memory(int nid, u64 start, unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages); } #endif --- head-2010-04-29.orig/arch/x86/mm/init_64-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/init_64-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -841,7 +841,7 @@ static void __init init_gbpages(void) #endif } -static unsigned long __init kernel_physical_mapping_init(unsigned long start, +static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { @@ -966,6 +966,8 @@ unsigned long __init_refok init_memory_m pos = start_pfn << PAGE_SHIFT; end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (end_pfn > (end >> PAGE_SHIFT)) + end_pfn = end >> PAGE_SHIFT; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); pos = end_pfn << PAGE_SHIFT; @@ -1146,7 +1148,7 @@ int arch_add_memory(int nid, u64 start, if (last_mapped_pfn > max_pfn_mapped) max_pfn_mapped = last_mapped_pfn; - ret = __add_pages(zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); return ret; @@ -1177,6 +1179,8 @@ int devmem_is_allowed(unsigned long page { if (pagenr <= 256) return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; if (mfn_to_local_pfn(pagenr) >= max_pfn) return 1; return 0; @@ -1192,8 +1196,6 @@ void __init mem_init(void) unsigned long absent_pages; unsigned long pfn; - start_periodic_check_for_corruption(); - pci_iommu_alloc(); /* clear_bss() already clear the empty_zero_page */ --- head-2010-04-29.orig/arch/x86/mm/iomap_32-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/iomap_32-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -17,9 +17,21 @@ */ #include +#include #include #include +int is_io_mapping_possible(resource_size_t base, unsigned long size) +{ +#ifndef CONFIG_X86_PAE + /* There is no way to map greater than 1 << 32 address without PAE */ + if (base + size > 0x100000000ULL) + return 0; +#endif + return 1; +} +EXPORT_SYMBOL_GPL(is_io_mapping_possible); + /* Map 'mfn' using fixed map 'type' and protections 'prot' */ void * @@ -30,6 +42,15 @@ iomap_atomic_prot_pfn(unsigned long mfn, pagefault_disable(); + /* + * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. + * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the + * MTRR is UC or WC. UC_MINUS gets the real intention, of the + * user, which is "WC if the MTRR is WC, UC if you can't do that." 
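The comment block above encodes the PAT fallback rule used by iomap_atomic_prot_pfn(): with PAT disabled, PAGE_KERNEL_WC degenerates to a PWT-only mapping, which ends up uncached whether the MTRR says UC or WC, so demoting to UC_MINUS preserves the caller's "WC if possible" intent. The whole fixup is one comparison (kernel context assumed):

/* Sketch of the protection fixup in iomap_atomic_prot_pfn(). */
static pgprot_t fixup_wc_prot_sketch(pgprot_t prot)
{
        if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
                return PAGE_KERNEL_UC_MINUS; /* WC if the MTRR is WC, else UC */
        return prot;
}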
+ */ + if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) + prot = PAGE_KERNEL_UC_MINUS; + idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); pgprot_val(prot) |= _PAGE_IOMAP; --- head-2010-04-29.orig/arch/x86/mm/ioremap-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/ioremap-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -293,25 +293,6 @@ int page_is_ram(unsigned long pagenr) return 0; } -int pagerange_is_ram(unsigned long start, unsigned long end) -{ - int ram_page = 0, not_rampage = 0; - unsigned long page_nr; - - for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); - ++page_nr) { - if (page_is_ram(mfn_to_local_pfn(page_nr))) - ram_page = 1; - else - not_rampage = 1; - - if (ram_page == not_rampage) - return -1; - } - - return ram_page; -} - /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. @@ -402,7 +383,8 @@ static void __iomem *__ioremap_caller(re * Check if the request spans more than any BAR in the iomem resource * tree. */ - WARN_ON(iomem_map_sanity_check(phys_addr, size)); + WARN_ONCE(iomem_map_sanity_check(phys_addr, size), + KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); /* * Don't allow anybody to remap normal RAM that we're using.. @@ -746,38 +728,10 @@ void __init early_ioremap_init(void) } } -#ifdef CONFIG_X86_32 -void __init early_ioremap_clear(void) -{ - pmd_t *pmd; - - if (early_ioremap_debug) - printk(KERN_INFO "early_ioremap_clear()\n"); - - pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); - pmd_clear(pmd); - make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables); - /* paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); */ - __flush_tlb_all(); -} - void __init early_ioremap_reset(void) { - enum fixed_addresses idx; - unsigned long addr, phys; - pte_t *pte; - after_paging_init = 1; - for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) { - addr = fix_to_virt(idx); - pte = early_ioremap_pte(addr); - if (pte_present(*pte)) { - phys = __pte_val(*pte) & PAGE_MASK; - set_fixmap(idx, phys); - } - } } -#endif /* CONFIG_X86_32 */ static void __init __early_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags) --- head-2010-04-29.orig/arch/x86/mm/pageattr-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/pageattr-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -524,22 +524,28 @@ static int split_large_page(pte_t *kpte, set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot)); /* - * Install the new, split up pagetable. Important details here: + * Install the new, split up pagetable. * - * On Intel the NX bit of all levels must be cleared to make a - * page executable. See section 4.13.2 of Intel 64 and IA-32 - * Architectures Software Developer's Manual). - * - * Mark the entry present. The current mapping might be - * set to not present, which we preserved above. + * We use the standard kernel pagetable protections for the new + * pagetable protections, the actual ptes set above control the + * primary protection behavior: */ if (!xen_feature(XENFEAT_writable_page_tables) && HYPERVISOR_update_va_mapping((unsigned long)pbase, mk_pte(base, PAGE_KERNEL_RO), 0)) BUG(); - ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); - pgprot_val(ref_prot) |= _PAGE_PRESENT; - __set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot)); + __set_pmd_pte(kpte, address, level, mk_pte(base, __pgprot(_KERNPG_TABLE))); + + /* + * Intel Atom errata AAH41 workaround. 
+ * + * The real fix should be in hw or in a microcode update, but + * we also probabilistically try to reduce the window of having + * a large TLB mixed with 4K TLBs while instruction fetches are + * going on. + */ + __flush_tlb_all(); + base = NULL; out_unlock: @@ -554,6 +560,36 @@ out_unlock: return 0; } +static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, + int primary) +{ + /* + * Ignore all non primary paths. + */ + if (!primary) + return 0; + + /* + * Ignore the NULL PTE for kernel identity mapping, as it is expected + * to have holes. + * Also set numpages to '1' indicating that we processed cpa req for + * one virtual address page and its pfn. TBD: numpages can be set based + * on the initial value and the level returned by lookup_address(). + */ + if (within(vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { + cpa->numpages = 1; + cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; + return 0; + } else { + WARN(1, KERN_WARNING "CPA: called for zero pte. " + "vaddr = %lx cpa->vaddr = %lx\n", vaddr, + *cpa->vaddr); + + return -EFAULT; + } +} + static int __change_page_attr(struct cpa_data *cpa, int primary) { unsigned long address; @@ -565,21 +601,14 @@ static int __change_page_attr(struct cpa address = cpa->vaddr[cpa->curpage]; else address = *cpa->vaddr; - repeat: kpte = lookup_address(address, &level); if (!kpte) - return 0; + return __cpa_process_fault(cpa, address, primary); old_pte = *kpte; - if (!__pte_val(old_pte)) { - if (!primary) - return 0; - WARN(1, KERN_WARNING "CPA: called for zero pte. " - "vaddr = %lx cpa->vaddr = %lx\n", address, - *cpa->vaddr); - return -EINVAL; - } + if (!__pte_val(old_pte)) + return __cpa_process_fault(cpa, address, primary); if (level == PG_LEVEL_4K) { pte_t new_pte; @@ -678,12 +707,7 @@ static int cpa_process_alias(struct cpa_ vaddr = *cpa->vaddr; if (!(within(vaddr, PAGE_OFFSET, - PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) -#ifdef CONFIG_X86_64 - || within(vaddr, PAGE_OFFSET + (1UL<<32), - PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) -#endif - )) { + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); @@ -814,6 +838,15 @@ static int change_page_attr_set_clr(unsi vm_unmap_aliases(); + /* + * If we're called with lazy mmu updates enabled, the + * in-memory pte state may be stale. Flush pending updates to + * bring them up to date. + * + arch_flush_lazy_mmu_mode();*/ + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(true); + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; @@ -856,6 +889,14 @@ static int change_page_attr_set_clr(unsi } else cpa_flush_all(cache); + /* + * If we've been called with lazy mmu updates enabled, then + * make sure that everything gets flushed out before we + * return. 
+ * + arch_flush_lazy_mmu_mode();*/ + WARN_ON_ONCE(arch_use_lazy_mmu_mode() && !irq_count()); + out: return ret; } --- head-2010-04-29.orig/arch/x86/mm/pat-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/mm/pat-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -167,11 +168,12 @@ struct memtype { static LIST_HEAD(memtype_list); static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end); static inline u8 _mtrr_type_lookup(u64 start, u64 end) { if (is_initial_xendomain()) return mtrr_type_lookup(start, end); - return pagerange_is_ram(start, end) > 0 + return pat_pagerange_is_ram(start, end) > 0 ? MTRR_TYPE_WRCOMB : MTRR_TYPE_UNCACHABLE; } #define mtrr_type_lookup _mtrr_type_lookup @@ -232,6 +234,33 @@ chk_conflict(struct memtype *new, struct static struct memtype *cached_entry; static u64 cached_start; +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) +{ + int ram_page = 0, not_rampage = 0; + unsigned long page_nr; + + for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); + ++page_nr) { + /* + * For legacy reasons, physical address range in the legacy ISA + * region is tracked as non-RAM. This will allow users of + * /dev/mem to map portions of legacy ISA region, even when + * some of those portions are listed(or not even listed) with + * different e820 types(RAM/reserved/..) + */ + if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) && + page_is_ram(mfn_to_local_pfn(page_nr))) + ram_page = 1; + else + not_rampage = 1; + + if (ram_page == not_rampage) + return -1; + } + + return ram_page; +} + /* * For RAM pages, mark the pages as non WB memory type using * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or @@ -360,9 +389,13 @@ int reserve_memtype(u64 start, u64 end, req_type & _PAGE_CACHE_MASK); } - is_range_ram = pagerange_is_ram(start, end); + if (new_type) + *new_type = actual_type; + + is_range_ram = pat_pagerange_is_ram(start, end); if (is_range_ram == 1) - return reserve_ram_pages_type(start, end, req_type, new_type); + return reserve_ram_pages_type(start, end, req_type, + new_type); else if (is_range_ram < 0) return -EINVAL; @@ -374,9 +407,6 @@ int reserve_memtype(u64 start, u64 end, new->end = end; new->type = actual_type; - if (new_type) - *new_type = actual_type; - spin_lock(&memtype_lock); if (cached_entry && start >= cached_start) @@ -464,7 +494,7 @@ int free_memtype(u64 start, u64 end) if (is_ISA_range(start, end - 1)) return 0; - is_range_ram = pagerange_is_ram(start, end); + is_range_ram = pat_pagerange_is_ram(start, end); if (is_range_ram == 1) return free_ram_pages_type(start, end); else if (is_range_ram < 0) @@ -623,6 +653,254 @@ void unmap_devmem(unsigned long mfn, uns free_memtype(addr, addr + size); } +#ifndef CONFIG_XEN +/* + * Internal interface to reserve a range of physical memory with prot. + * Reserved non RAM regions only and after successful reserve_memtype, + * this func also keeps identity mapping (if any) in sync with this new prot. + */ +static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, + int strict_prot) +{ + int is_ram = 0; + int id_sz, ret; + unsigned long flags; + unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + + /* + * reserve_pfn_range() doesn't support RAM pages. 
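pat_pagerange_is_ram(), added above, is deliberately tri-state: 1 for an all-RAM range, 0 for no RAM at all (with the legacy ISA region always counted as non-RAM), and -1 for a mixed range, which the callers reject outright. Stripped of the Xen pfn translation, the control flow is:

/* Tri-state RAM scan, as in pat_pagerange_is_ram() above. */
static int range_is_ram_sketch(unsigned long start_pfn, unsigned long end_pfn)
{
        int ram = 0, not_ram = 0;
        unsigned long pfn;

        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                if (page_is_ram(pfn))
                        ram = 1;
                else
                        not_ram = 1;
                if (ram && not_ram)
                        return -1;      /* mixed range: caller must bail */
        }
        return ram;                     /* 1 = all RAM, 0 = none */
}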
+ */ + if (is_ram != 0) + return -EINVAL; + + ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); + if (ret) + return ret; + + if (flags != want_flags) { + if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR "%s:%d map pfn expected mapping type %s" + " for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + return -EINVAL; + } + /* + * We allow returning different type than the one requested in + * non strict case. + */ + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } + + /* Need to keep identity mapping in sync */ + if (paddr >= __pa(high_memory)) + return 0; + + id_sz = (__pa(high_memory) < paddr + size) ? + __pa(high_memory) - paddr : + size; + + if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR + "%s:%d reserve_pfn_range ioremap_change_attr failed %s " + "for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size)); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to free a range of physical memory. + * Frees non RAM regions only. + */ +static void free_pfn_range(u64 paddr, unsigned long size) +{ + int is_ram; + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + if (is_ram == 0) + free_memtype(paddr, paddr + size); +} + +/* + * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). + * + * If the vma has a linear pfn mapping for the entire range, we get the prot + * from pte and reserve the entire vma range with single reserve_pfn_range call. + * Otherwise, we reserve the entire vma range, by going through the PTEs page + * by page to get physical address and protection. + */ +int track_pfn_vma_copy(struct vm_area_struct *vma) +{ + int retval = 0; + unsigned long i, j; + resource_size_t paddr; + unsigned long prot; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + pgprot_t pgprot; + + if (!pat_enabled) + return 0; + + if (is_linear_pfn_mapping(vma)) { + /* + * reserve the whole chunk covered by vma. We need the + * starting address and protection from pte. + */ + if (follow_phys(vma, vma_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + pgprot = __pgprot(prot); + return reserve_pfn_range(paddr, vma_size, &pgprot, 1); + } + + /* reserve entire vma page by page, using pfn and prot from pte */ + for (i = 0; i < vma_size; i += PAGE_SIZE) { + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) + continue; + + pgprot = __pgprot(prot); + retval = reserve_pfn_range(paddr, PAGE_SIZE, &pgprot, 1); + if (retval) + goto cleanup_ret; + } + return 0; + +cleanup_ret: + /* Reserve error: Cleanup partial reservation and return error */ + for (j = 0; j < i; j += PAGE_SIZE) { + if (follow_phys(vma, vma_start + j, 0, &prot, &paddr)) + continue; + + free_pfn_range(paddr, PAGE_SIZE); + } + + return retval; +} + +/* + * track_pfn_vma_new is called when a _new_ pfn mapping is being established + * for physical range indicated by pfn and size. + * + * prot is passed in as a parameter for the new mapping. If the vma has a + * linear pfn mapping for the entire range reserve the entire vma range with + * single reserve_pfn_range call.
+ * Otherwise, we look at the pfn and size and reserve only the specified range + * page by page. + * + * Note that this function can be called with caller trying to map only a + * subrange/page inside the vma. + */ +int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long size) +{ + int retval = 0; + unsigned long i, j; + resource_size_t base_paddr; + resource_size_t paddr; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return 0; + + if (is_linear_pfn_mapping(vma)) { + /* reserve the whole chunk starting from vm_pgoff */ + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; + return reserve_pfn_range(paddr, vma_size, prot, 0); + } + + /* reserve page by page using pfn and size */ + base_paddr = (resource_size_t)pfn << PAGE_SHIFT; + for (i = 0; i < size; i += PAGE_SIZE) { + paddr = base_paddr + i; + retval = reserve_pfn_range(paddr, PAGE_SIZE, prot, 0); + if (retval) + goto cleanup_ret; + } + return 0; + +cleanup_ret: + /* Reserve error: Cleanup partial reservation and return error */ + for (j = 0; j < i; j += PAGE_SIZE) { + paddr = base_paddr + j; + free_pfn_range(paddr, PAGE_SIZE); + } + + return retval; +} + +/* + * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case size can be zero). + */ +void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) +{ + unsigned long i; + resource_size_t paddr; + unsigned long prot; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return; + + if (is_linear_pfn_mapping(vma)) { + /* free the whole chunk starting from vm_pgoff */ + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; + free_pfn_range(paddr, vma_size); + return; + } + + if (size != 0 && size != vma_size) { + /* free page by page, using pfn and size */ + paddr = (resource_size_t)pfn << PAGE_SHIFT; + for (i = 0; i < size; i += PAGE_SIZE) { + paddr = paddr + i; + free_pfn_range(paddr, PAGE_SIZE); + } + } else { + /* free entire vma, page by page, using the pfn from pte */ + for (i = 0; i < vma_size; i += PAGE_SIZE) { + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) + continue; + + free_pfn_range(paddr, PAGE_SIZE); + } + } +} +#endif /* CONFIG_XEN */ + +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + if (pat_enabled) + return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC); + else + return pgprot_noncached(prot); +} +EXPORT_SYMBOL_GPL(pgprot_writecombine); + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* get Nth element of the linked list */ --- head-2010-04-29.orig/arch/x86/pci/irq-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/arch/x86/pci/irq-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -16,8 +16,7 @@ #include #include #include - -#include "pci.h" +#include #define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) #define PIRQ_VERSION 0x0100 @@ -540,7 +539,7 @@ static int pirq_bios_set(struct pci_dev { struct pci_dev *bridge; int pin = pci_get_interrupt_pin(dev, &bridge); - return pcibios_set_irq_routing(bridge, pin, irq); + return pcibios_set_irq_routing(bridge, pin - 1, irq); } #endif @@ -579,6 +578,7 @@ static __init int intel_router_probe(str case PCI_DEVICE_ID_INTEL_ICH7_1: case PCI_DEVICE_ID_INTEL_ICH7_30: case
PCI_DEVICE_ID_INTEL_ICH7_31: + case PCI_DEVICE_ID_INTEL_TGP_LPC: case PCI_DEVICE_ID_INTEL_ESB2_0: case PCI_DEVICE_ID_INTEL_ICH8_0: case PCI_DEVICE_ID_INTEL_ICH8_1: @@ -894,7 +894,6 @@ static int pcibios_lookup_irq(struct pci dev_dbg(&dev->dev, "no interrupt pin\n"); return 0; } - pin = pin - 1; /* Find IRQ routing entry */ @@ -904,17 +903,17 @@ static int pcibios_lookup_irq(struct pci info = pirq_get_info(dev); if (!info) { dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n", - 'A' + pin); + 'A' + pin - 1); return 0; } - pirq = info->irq[pin].link; - mask = info->irq[pin].bitmap; + pirq = info->irq[pin - 1].link; + mask = info->irq[pin - 1].bitmap; if (!pirq) { - dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin); + dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin - 1); return 0; } dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x", - 'A' + pin, pirq, mask, pirq_table->exclusive_irqs); + 'A' + pin - 1, pirq, mask, pirq_table->exclusive_irqs); mask &= pcibios_irq_mask; /* Work around broken HP Pavilion Notebooks which assign USB to @@ -956,7 +955,7 @@ static int pcibios_lookup_irq(struct pci newirq = i; } } - dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq); + dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin - 1, newirq); /* Check if it is hardcoded */ if ((pirq & 0xf0) == 0xf0) { @@ -984,18 +983,18 @@ static int pcibios_lookup_irq(struct pci return 0; } } - dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq); + dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq); /* Update IRQ for all devices with the same pirq value */ while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); if (!pin) continue; - pin--; + info = pirq_get_info(dev2); if (!info) continue; - if (info->irq[pin].link == pirq) { + if (info->irq[pin - 1].link == pirq) { /* * We refuse to override the dev->irq * information. Give a warning! @@ -1049,6 +1048,9 @@ static void __init pcibios_fixup_irqs(vo dev = NULL; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (!pin) + continue; + #ifdef CONFIG_X86_IO_APIC /* * Recalculate IRQ numbers if we use the I/O APIC. @@ -1056,15 +1058,11 @@ static void __init pcibios_fixup_irqs(vo if (io_apic_assign_pci_irqs) { int irq; - if (!pin) - continue; - /* * interrupt pins are numbered starting from 1 */ - pin--; irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, - PCI_SLOT(dev->devfn), pin); + PCI_SLOT(dev->devfn), pin - 1); /* * Busses behind bridges are typically not listed in the * MP-table. In this case we have to look up the IRQ @@ -1077,22 +1075,22 @@ static void __init pcibios_fixup_irqs(vo struct pci_dev *bridge = dev->bus->self; int bus; - pin = (pin + PCI_SLOT(dev->devfn)) % 4; + pin = pci_swizzle_interrupt_pin(dev, pin); bus = bridge->bus->number; irq = IO_APIC_get_PCI_irq_vector(bus, - PCI_SLOT(bridge->devfn), pin); + PCI_SLOT(bridge->devfn), pin - 1); if (irq >= 0) dev_warn(&dev->dev, "using bridge %s INT %c to " "get IRQ %d\n", pci_name(bridge), - 'A' + pin, irq); + 'A' + pin - 1, irq); } if (irq >= 0) { dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c " "-> IRQ %d\n", - 'A' + pin, irq); + 'A' + pin - 1, irq); dev->irq = irq; } } @@ -1100,7 +1098,7 @@ static void __init pcibios_fixup_irqs(vo /* * Still no IRQ? Try to lookup one... 
*/ - if (pin && !dev->irq) + if (!dev->irq) pcibios_lookup_irq(dev, 0); } } @@ -1227,12 +1225,10 @@ static int pirq_enable_irq(struct pci_de if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { char *msg = ""; - pin--; /* interrupt pins are numbered starting from 1 */ - if (io_apic_assign_pci_irqs) { int irq; - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1); /* * Busses behind bridges are typically not listed in the MP-table. * In this case we have to look up the IRQ based on the parent bus, @@ -1243,20 +1239,20 @@ static int pirq_enable_irq(struct pci_de while (irq < 0 && dev->bus->parent) { /* go back to the bridge */ struct pci_dev *bridge = dev->bus->self; - pin = (pin + PCI_SLOT(dev->devfn)) % 4; + pin = pci_swizzle_interrupt_pin(dev, pin); irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, - PCI_SLOT(bridge->devfn), pin); + PCI_SLOT(bridge->devfn), pin - 1); if (irq >= 0) dev_warn(&dev->dev, "using bridge %s " "INT %c to get IRQ %d\n", - pci_name(bridge), 'A' + pin, + pci_name(bridge), 'A' + pin - 1, irq); dev = bridge; } dev = temp_dev; if (irq >= 0) { dev_info(&dev->dev, "PCI->APIC IRQ transform: " - "INT %c -> IRQ %d\n", 'A' + pin, irq); + "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); dev->irq = irq; return 0; } else @@ -1275,7 +1271,7 @@ static int pirq_enable_irq(struct pci_de return 0; dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n", - 'A' + pin, msg); + 'A' + pin - 1, msg); } return 0; } --- head-2010-04-29.orig/arch/x86/pci/pcifront.c 2009-03-18 10:39:31.000000000 +0100 +++ head-2010-04-29/arch/x86/pci/pcifront.c 2010-03-24 15:17:58.000000000 +0100 @@ -8,8 +8,8 @@ #include #include #include +#include #include -#include "pci.h" static int pcifront_enable_irq(struct pci_dev *dev) { --- head-2010-04-29.orig/arch/x86/vdso/vdso32-setup-xen.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/arch/x86/vdso/vdso32-setup-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -349,7 +349,7 @@ int __init sysenter_setup(void) } /* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; --- head-2010-04-29.orig/drivers/acpi/Kconfig 2010-03-24 14:36:44.000000000 +0100 +++ head-2010-04-29/drivers/acpi/Kconfig 2010-03-24 15:17:58.000000000 +0100 @@ -9,7 +9,7 @@ menuconfig ACPI depends on PCI depends on PM select PNP - select CPU_IDLE + select CPU_IDLE if !PROCESSOR_EXTERNAL_CONTROL default y help Advanced Configuration and Power Interface (ACPI) support for --- head-2010-04-29.orig/drivers/acpi/processor_extcntl.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/acpi/processor_extcntl.c 2010-03-24 15:17:58.000000000 +0100 @@ -230,3 +230,117 @@ err_out: kfree(perf); return ret; } + +/* + * Objects and functions removed in native 2.6.29, and thus moved here. + */ +#ifdef CONFIG_SMP +static void smp_callback(void *v) +{ + /* we already woke the CPU up, nothing more to do */ +} + +/* + * This function gets called when a part of the kernel has a new latency + * requirement. This means we need to get all processors out of their C-state, + * and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that + * wakes them all right up. 
+ */ +static int acpi_processor_latency_notify(struct notifier_block *b, + unsigned long l, void *v) +{ + smp_call_function(smp_callback, NULL, 1); + return NOTIFY_OK; +} + +struct notifier_block acpi_processor_latency_notifier = { + .notifier_call = acpi_processor_latency_notify, +}; +#endif + +/* + * bm_history -- bit-mask with a bit per jiffy of bus-master activity + * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms + * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms + * 100 HZ: 0x0000000F: 4 jiffies = 40ms + * reduce history for more aggressive entry into C3 + */ +static unsigned int bm_history __read_mostly = + (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1)); +module_param(bm_history, uint, 0644); + +int acpi_processor_set_power_policy(struct acpi_processor *pr) +{ + unsigned int i; + unsigned int state_is_set = 0; + struct acpi_processor_cx *lower = NULL; + struct acpi_processor_cx *higher = NULL; + struct acpi_processor_cx *cx; + + + if (!pr) + return -EINVAL; + + /* + * This function sets the default Cx state policy (OS idle handler). + * Our scheme is to promote quickly to C2 but more conservatively + * to C3. We're favoring C2 for its characteristics of low latency + * (quick response), good power savings, and ability to allow bus + * mastering activity. Note that the Cx state policy is completely + * customizable and can be altered dynamically. + */ + + /* startup state */ + for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) { + cx = &pr->power.states[i]; + if (!cx->valid) + continue; + + if (!state_is_set) + pr->power.state = cx; + state_is_set++; + break; + } + + if (!state_is_set) + return -ENODEV; + + /* demotion */ + for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) { + cx = &pr->power.states[i]; + if (!cx->valid) + continue; + + if (lower) { + cx->demotion.state = lower; + cx->demotion.threshold.ticks = cx->latency_ticks; + cx->demotion.threshold.count = 1; + if (cx->type == ACPI_STATE_C3) + cx->demotion.threshold.bm = bm_history; + } + + lower = cx; + } + + /* promotion */ + for (i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i--) { + cx = &pr->power.states[i]; + if (!cx->valid) + continue; + + if (higher) { + cx->promotion.state = higher; + cx->promotion.threshold.ticks = cx->latency_ticks; + if (cx->type >= ACPI_STATE_C2) + cx->promotion.threshold.count = 4; + else + cx->promotion.threshold.count = 10; + if (higher->type == ACPI_STATE_C3) + cx->promotion.threshold.bm = bm_history; + } + + higher = cx; + } + + return 0; +} --- head-2010-04-29.orig/drivers/acpi/processor_idle.c 2010-04-15 09:55:39.000000000 +0200 +++ head-2010-04-29/drivers/acpi/processor_idle.c 2010-04-15 10:06:51.000000000 +0200 @@ -123,6 +123,7 @@ static struct dmi_system_id __cpuinitdat }; +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL /* * Callers should disable interrupts before the call and enable * interrupts after return. 
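/*
 * Standalone sanity check (userspace C, not part of the patch) of the
 * bm_history default added to processor_extcntl.c above: below 800HZ the
 * mask keeps HZ/25 jiffies (40ms) of bus-master history, from 800HZ up
 * all 32 bits are kept (32ms at 1000HZ), matching the table in the
 * comment.
 */
#include <stdio.h>

static unsigned int bm_default(unsigned int hz)
{
	return hz >= 800 ? 0xFFFFFFFFu : ((1u << (hz / 25)) - 1);
}

int main(void)
{
	const unsigned int hz[] = { 100, 250, 1000 };

	for (int i = 0; i < 3; i++) {
		/* jiffies of history == number of set bits in the mask */
		int jiffies = __builtin_popcount(bm_default(hz[i]));

		printf("HZ=%-4u mask=%#010x -> %d jiffies = %dms\n",
		       hz[i], bm_default(hz[i]), jiffies,
		       jiffies * 1000 / (int)hz[i]);
	}
	return 0;
}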
@@ -141,6 +142,7 @@ static void acpi_safe_halt(void) } current_thread_info()->status |= TS_POLLING; } +#endif #ifdef ARCH_APICTIMER_STOPS_ON_C3 @@ -211,7 +213,7 @@ static void lapic_timer_state_broadcast( static void lapic_timer_check_state(int state, struct acpi_processor *pr, struct acpi_processor_cx *cstate) { } static void lapic_timer_propagate_broadcast(struct acpi_processor *pr) { } -static void lapic_timer_state_broadcast(struct acpi_processor *pr, +static inline void lapic_timer_state_broadcast(struct acpi_processor *pr, struct acpi_processor_cx *cx, int broadcast) { @@ -259,7 +261,8 @@ int acpi_processor_resume(struct acpi_de return 0; } -#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86) +#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86) \ + && !defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL) static void tsc_check_state(int state) { switch (boot_cpu_data.x86_vendor) { @@ -600,7 +603,11 @@ static void acpi_processor_power_verify_ */ cx->valid = 1; +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL cx->latency_ticks = cx->latency; +#else + cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); +#endif /* * On older chipsets, BM_RLD needs to be set * in order for Bus Master activity to wake the @@ -633,7 +640,11 @@ static int acpi_processor_power_verify(s if (!cx->address) break; cx->valid = 1; +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL cx->latency_ticks = cx->latency; /* Normalize latency */ +#else + cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); +#endif break; case ACPI_STATE_C3: @@ -676,6 +687,20 @@ static int acpi_processor_get_power_info pr->power.count = acpi_processor_power_verify(pr); +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL + /* + * Set Default Policy + * ------------------ + * Now that we know which states are supported, set the default + * policy. Note that this policy can be changed dynamically + * (e.g. encourage deeper sleeps to conserve battery life when + * not on AC). + */ + result = acpi_processor_set_power_policy(pr); + if (result) + return result; +#endif + /* * if one state of type C2 or C3 is available, mark this * CPU as being "idle manageable" @@ -773,6 +798,7 @@ static const struct file_operations acpi }; #endif +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL /** * acpi_idle_bm_check - checks if bus master activity was detected */ @@ -1142,6 +1168,13 @@ static int acpi_processor_setup_cpuidle( return 0; } +#else /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */ +static inline int acpi_processor_setup_cpuidle(struct acpi_processor *pr) +{ + return 0; +} +#endif /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */ + int acpi_processor_cst_has_changed(struct acpi_processor *pr) { int ret = 0; @@ -1208,6 +1241,10 @@ int __cpuinit acpi_processor_power_init( "ACPI: processor limited to max C-state %d\n", max_cstate); first_run++; +#if defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL) && defined(CONFIG_SMP) + pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, + &acpi_processor_latency_notifier); +#endif } if (!pr) @@ -1267,5 +1304,12 @@ int acpi_processor_power_exit(struct acp acpi_device_dir(device)); #endif +#if defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL) && defined(CONFIG_SMP) + /* Unregister the idle handler when processor #0 is removed. 
*/ + if (pr->id == 0) + pm_qos_remove_notifier(PM_QOS_CPU_DMA_LATENCY, + &acpi_processor_latency_notifier); +#endif + return 0; } --- head-2010-04-29.orig/drivers/gpu/drm/i915/i915_drv.c 2010-04-29 09:29:49.000000000 +0200 +++ head-2010-04-29/drivers/gpu/drm/i915/i915_drv.c 2010-04-29 09:52:19.000000000 +0200 @@ -533,7 +533,7 @@ static struct drm_driver driver = { .open = drm_open, .release = drm_release, .unlocked_ioctl = drm_ioctl, - .mmap = drm_gem_mmap, + .mmap = i915_gem_mmap, .poll = drm_poll, .fasync = drm_fasync, .read = drm_read, --- head-2010-04-29.orig/drivers/gpu/drm/i915/i915_drv.h 2010-04-29 09:29:49.000000000 +0200 +++ head-2010-04-29/drivers/gpu/drm/i915/i915_drv.h 2010-04-29 09:52:20.000000000 +0200 @@ -926,6 +926,11 @@ int i915_gem_idle(struct drm_device *dev uint32_t i915_add_request(struct drm_device *dev, struct drm_file *file_priv, uint32_t flush_domains); int i915_do_wait_request(struct drm_device *dev, uint32_t seqno, int interruptible); +#ifdef CONFIG_XEN +int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma); +#else +#define i915_gem_mmap drm_gem_mmap +#endif int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); int i915_gem_object_set_to_gtt_domain(struct drm_gem_object *obj, int write); --- head-2010-04-29.orig/drivers/gpu/drm/i915/i915_gem.c 2010-04-29 09:29:49.000000000 +0200 +++ head-2010-04-29/drivers/gpu/drm/i915/i915_gem.c 2010-04-15 10:06:44.000000000 +0200 @@ -1146,6 +1146,17 @@ i915_gem_mmap_ioctl(struct drm_device *d return 0; } +#ifdef CONFIG_XEN +int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int ret = drm_gem_mmap(filp, vma); + + pgprot_val(vma->vm_page_prot) |= _PAGE_IOMAP; + + return ret; +} +#endif + /** * i915_gem_fault - fault a page into the GTT * vma: VMA in question --- head-2010-04-29.orig/drivers/oprofile/buffer_sync.c 2010-04-15 09:51:29.000000000 +0200 +++ head-2010-04-29/drivers/oprofile/buffer_sync.c 2010-04-15 10:06:28.000000000 +0200 @@ -537,7 +537,6 @@ void sync_buffer(int cpu) int cpu_mode = CPU_MODE_KERNEL; sync_buffer_state state = sb_buffer_start; unsigned int i; - int domain_switch = 0; unsigned long available; unsigned long flags; struct op_entry entry; @@ -562,15 +561,6 @@ void sync_buffer(int cpu) if (!sample) break; -#ifdef CONFIG_XEN - if (domain_switch) { - cpu_current_domain[cpu] = sample->eip; - add_domain_switch(sample->eip); - domain_switch = 0; - continue; - } -#endif - if (is_code(sample->eip)) { flags = sample->event; if (flags & TRACE_BEGIN) { @@ -596,8 +586,11 @@ void sync_buffer(int cpu) add_user_ctx_switch(new, cookie); } #ifdef CONFIG_XEN - if (flags & DOMAIN_SWITCH) - domain_switch = 1; + if ((flags & DOMAIN_SWITCH) + && op_cpu_buffer_get_data(&entry, &val)) { + cpu_current_domain[cpu] = val; + add_domain_switch(val); + } #endif if (op_cpu_buffer_get_size(&entry)) add_data(&entry, mm); --- head-2010-04-29.orig/drivers/oprofile/cpu_buffer.c 2010-03-24 15:12:36.000000000 +0100 +++ head-2010-04-29/drivers/oprofile/cpu_buffer.c 2010-03-24 15:17:58.000000000 +0100 @@ -444,34 +444,15 @@ void oprofile_add_pc(unsigned long pc, i #ifdef CONFIG_XEN /* - * This is basically log_sample(b, ESCAPE_CODE, cpu_mode, CPU_TRACE_BEGIN), + * This is basically log_sample(b, ESCAPE_CODE, 1, cpu_mode, CPU_TRACE_BEGIN), * as was previously accessible through oprofile_add_pc(). 
*/ void oprofile_add_mode(int cpu_mode) { struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); - struct task_struct *task; - if (nr_available_slots(cpu_buf) < 3) { + if (op_add_code(cpu_buf, 1, cpu_mode, current)) cpu_buf->sample_lost_overflow++; - return; - } - - task = current; - - /* notice a switch from user->kernel or vice versa */ - if (cpu_buf->last_cpu_mode != cpu_mode) { - cpu_buf->last_cpu_mode = cpu_mode; - add_code(cpu_buf, cpu_mode); - } - - /* notice a task switch */ - if (cpu_buf->last_task != task) { - cpu_buf->last_task = task; - add_code(cpu_buf, (unsigned long)task); - } - - add_code(cpu_buf, CPU_TRACE_BEGIN); } #endif @@ -502,17 +483,18 @@ fail: #ifdef CONFIG_XEN int oprofile_add_domain_switch(int32_t domain_id) { - struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); + struct op_entry entry; + struct op_sample *sample; - /* should have space for switching into and out of domain - (2 slots each) plus one sample and one cpu mode switch */ - if (((nr_available_slots(cpu_buf) < 6) && - (domain_id != COORDINATOR_DOMAIN)) || - (nr_available_slots(cpu_buf) < 2)) + sample = op_cpu_buffer_write_reserve(&entry, 1); + if (!sample) return 0; - add_code(cpu_buf, DOMAIN_SWITCH); - add_sample(cpu_buf, domain_id, 0); + sample->eip = ESCAPE_CODE; + sample->event = DOMAIN_SWITCH; + + op_cpu_buffer_add_data(&entry, domain_id); + op_cpu_buffer_write_commit(&entry); current_domain = domain_id; --- head-2010-04-29.orig/drivers/pci/msi-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/pci/msi-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -763,30 +763,21 @@ void pci_no_msi(void) pci_msi_enable = 0; } +/** + * pci_msi_enabled - is MSI enabled? + * + * Returns true if MSI has not been disabled by the command-line option + * pci=nomsi. 
+ **/ +int pci_msi_enabled(void) +{ + return pci_msi_enable; +} +EXPORT_SYMBOL(pci_msi_enabled); + void pci_msi_init_pci_dev(struct pci_dev *dev) { #ifndef CONFIG_XEN INIT_LIST_HEAD(&dev->msi_list); #endif } - -#ifdef CONFIG_ACPI -#include -#include -static void __devinit msi_acpi_init(void) -{ - if (acpi_pci_disabled) - return; - pci_osc_support_set(OSC_MSI_SUPPORT); - pcie_osc_support_set(OSC_MSI_SUPPORT); -} -#else -static inline void msi_acpi_init(void) { } -#endif /* CONFIG_ACPI */ - -void __devinit msi_init(void) -{ - if (!pci_msi_enable) - return; - msi_acpi_init(); -} --- head-2010-04-29.orig/drivers/xen/Kconfig 2010-03-24 15:12:36.000000000 +0100 +++ head-2010-04-29/drivers/xen/Kconfig 2010-03-24 15:18:46.000000000 +0100 @@ -388,6 +388,7 @@ config XEN_DEV_EVTCHN config XENFS tristate "Xen filesystem" + depends on PARAVIRT_XEN default y help The xen filesystem provides a way for domains to share --- head-2010-04-29.orig/drivers/xen/Makefile 2010-04-19 14:52:08.000000000 +0200 +++ head-2010-04-29/drivers/xen/Makefile 2010-04-19 14:52:22.000000000 +0200 @@ -13,6 +13,7 @@ obj-$(CONFIG_XEN) += features.o util.o obj-$(CONFIG_HOTPLUG_CPU) += $(xen-hotplug-y) obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y) +obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/ obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/ obj-$(CONFIG_XEN_BLKDEV_TAP2) += blktap2/ --- head-2010-04-29.orig/drivers/xen/balloon/sysfs.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/drivers/xen/balloon/sysfs.c 2010-03-24 15:17:58.000000000 +0100 @@ -67,7 +67,7 @@ static ssize_t store_target_kb(struct sy struct sysdev_attribute *attr, const char *buf, size_t count) { - char memstring[64], *endchar; + char *endchar; unsigned long long target_bytes; if (!capable(CAP_SYS_ADMIN)) @@ -75,11 +75,8 @@ static ssize_t store_target_kb(struct sy if (count <= 1) return -EBADMSG; /* runt */ - if (count > sizeof(memstring)) - return -EFBIG; /* too long */ - strcpy(memstring, buf); - target_bytes = memparse(memstring, &endchar); + target_bytes = simple_strtoull(buf, &endchar, 0) << 10; balloon_set_new_target(target_bytes >> PAGE_SHIFT); return count; @@ -88,8 +85,40 @@ static ssize_t store_target_kb(struct sy static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, show_target_kb, store_target_kb); +static ssize_t show_target(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)balloon_stats.target_pages + << PAGE_SHIFT); +} + +static ssize_t store_target(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (count <= 1) + return -EBADMSG; /* runt */ + + target_bytes = memparse(buf, &endchar); + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR, + show_target, store_target); + static struct sysdev_attribute *balloon_attrs[] = { &attr_target_kb, + &attr_target, }; static struct attribute *balloon_info_attrs[] = { --- head-2010-04-29.orig/drivers/xen/blkfront/vbd.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/xen/blkfront/vbd.c 2010-03-24 15:17:58.000000000 +0100 @@ -308,6 +308,10 @@ xlvbd_init_blk_queue(struct gendisk *gd, if (rq == NULL) return -1; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) + queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); +#endif + /* Hard sector size and max 
sectors impersonate the equiv. hardware. */ blk_queue_hardsect_size(rq, sector_size); blk_queue_max_sectors(rq, 512); --- head-2010-04-29.orig/drivers/xen/core/cpu_hotplug.c 2009-04-07 13:58:48.000000000 +0200 +++ head-2010-04-29/drivers/xen/core/cpu_hotplug.c 2010-03-24 15:17:58.000000000 +0100 @@ -10,10 +10,10 @@ * Set of CPUs that remote admin software will allow us to bring online. * Notified to us via xenbus. */ -static cpumask_t xenbus_allowed_cpumask; +static cpumask_var_t xenbus_allowed_cpumask; /* Set of CPUs that local admin will allow us to bring online. */ -static cpumask_t local_allowed_cpumask = CPU_MASK_ALL; +static cpumask_var_t local_allowed_cpumask; static int local_cpu_hotplug_request(void) { @@ -40,10 +40,10 @@ static void vcpu_hotplug(unsigned int cp } if (strcmp(state, "online") == 0) { - cpu_set(cpu, xenbus_allowed_cpumask); + cpumask_set_cpu(cpu, xenbus_allowed_cpumask); (void)cpu_up(cpu); } else if (strcmp(state, "offline") == 0) { - cpu_clear(cpu, xenbus_allowed_cpumask); + cpumask_clear_cpu(cpu, xenbus_allowed_cpumask); (void)cpu_down(cpu); } else { printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", @@ -75,7 +75,7 @@ static int smpboot_cpu_notify(struct not * as it's always executed from within a stopmachine kthread. */ if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request()) - cpu_clear(cpu, local_allowed_cpumask); + cpumask_clear_cpu(cpu, local_allowed_cpumask); return NOTIFY_OK; } @@ -156,21 +156,26 @@ int cpu_up_check(unsigned int cpu) int rc = 0; if (local_cpu_hotplug_request()) { - cpu_set(cpu, local_allowed_cpumask); - if (!cpu_isset(cpu, xenbus_allowed_cpumask)) { + cpumask_set_cpu(cpu, local_allowed_cpumask); + if (!cpumask_test_cpu(cpu, xenbus_allowed_cpumask)) { printk("%s: attempt to bring up CPU %u disallowed by " "remote admin.\n", __FUNCTION__, cpu); rc = -EBUSY; } - } else if (!cpu_isset(cpu, local_allowed_cpumask) || - !cpu_isset(cpu, xenbus_allowed_cpumask)) { + } else if (!cpumask_test_cpu(cpu, local_allowed_cpumask) || + !cpumask_test_cpu(cpu, xenbus_allowed_cpumask)) { rc = -EBUSY; } return rc; } -void init_xenbus_allowed_cpumask(void) +void __init init_xenbus_allowed_cpumask(void) { - xenbus_allowed_cpumask = cpu_present_map; + if (!alloc_cpumask_var(&xenbus_allowed_cpumask, GFP_KERNEL)) + BUG(); + cpumask_copy(xenbus_allowed_cpumask, cpu_present_mask); + if (!alloc_cpumask_var(&local_allowed_cpumask, GFP_KERNEL)) + BUG(); + cpumask_setall(local_allowed_cpumask); } --- head-2010-04-29.orig/drivers/xen/core/evtchn.c 2010-04-23 15:18:24.000000000 +0200 +++ head-2010-04-29/drivers/xen/core/evtchn.c 2010-04-23 15:19:25.000000000 +0200 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -57,9 +58,6 @@ static DEFINE_SPINLOCK(irq_mapping_updat static int evtchn_to_irq[NR_EVENT_CHANNELS] = { [0 ... NR_EVENT_CHANNELS-1] = -1 }; -/* Packed IRQ information: binding type, sub-type index, and event channel. */ -static u32 irq_info[NR_IRQS]; - /* Binding types. */ enum { IRQT_UNBOUND, @@ -75,6 +73,30 @@ enum { #define _EVTCHN_BITS 12 #define _INDEX_BITS (32 - _IRQT_BITS - _EVTCHN_BITS) +/* Convenient shorthand for packed representation of an unbound IRQ. */ +#define IRQ_UNBOUND (IRQT_UNBOUND << (32 - _IRQT_BITS)) + +static struct irq_cfg _irq_cfg[] = { + [0 ... 
+#ifdef CONFIG_SPARSE_IRQ + BUILD_BUG_ON_ZERO(PIRQ_BASE) + NR_IRQS_LEGACY +#else + NR_IRQS +#endif + - 1].info = IRQ_UNBOUND +}; + +static inline struct irq_cfg *__pure irq_cfg(unsigned int irq) +{ +#ifdef CONFIG_SPARSE_IRQ + struct irq_desc *desc = irq_to_desc(irq); + + return desc ? desc->chip_data : NULL; +#else + return irq < NR_IRQS ? _irq_cfg + irq : NULL; +#endif +} + /* Constructor for packed IRQ information. */ static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn) { @@ -90,26 +112,30 @@ static inline u32 mk_irq_info(u32 type, return ((type << (32 - _IRQT_BITS)) | (index << _EVTCHN_BITS) | evtchn); } -/* Convenient shorthand for packed representation of an unbound IRQ. */ -#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) - /* * Accessors for packed IRQ information. */ static inline unsigned int evtchn_from_irq(int irq) { - return irq_info[irq] & ((1U << _EVTCHN_BITS) - 1); + const struct irq_cfg *cfg = irq_cfg(irq); + + return cfg ? cfg->info & ((1U << _EVTCHN_BITS) - 1) : 0; } static inline unsigned int index_from_irq(int irq) { - return (irq_info[irq] >> _EVTCHN_BITS) & ((1U << _INDEX_BITS) - 1); + const struct irq_cfg *cfg = irq_cfg(irq); + + return cfg ? (cfg->info >> _EVTCHN_BITS) & ((1U << _INDEX_BITS) - 1) + : 0; } static inline unsigned int type_from_irq(int irq) { - return irq_info[irq] >> (32 - _IRQT_BITS); + const struct irq_cfg *cfg = irq_cfg(irq); + + return cfg ? cfg->info >> (32 - _IRQT_BITS) : IRQT_UNBOUND; } /* IRQ <-> VIRQ mapping. */ @@ -121,9 +147,6 @@ DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS #endif DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1}; -/* Reference counts for bindings to IRQs. */ -static int irq_bindcount[NR_IRQS]; - #ifdef CONFIG_SMP static u8 cpu_evtchn[NR_EVENT_CHANNELS]; @@ -157,8 +180,12 @@ static void init_evtchn_cpu_bindings(voi int i; /* By default all event channels notify CPU#0. */ - for (i = 0; i < NR_IRQS; i++) - irq_to_desc(i)->affinity = cpumask_of_cpu(0); + for (i = 0; i < nr_irqs; i++) { + struct irq_desc *desc = irq_to_desc(i); + + if (desc) + desc->affinity = cpumask_of_cpu(0); + } memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); @@ -232,7 +259,7 @@ static DEFINE_PER_CPU(unsigned int, curr static DEFINE_PER_CPU(unsigned int, current_l2i); /* NB. Interrupts are disabled on entry. 
*/ -asmlinkage void evtchn_do_upcall(struct pt_regs *regs) +asmlinkage void __irq_entry evtchn_do_upcall(struct pt_regs *regs) { unsigned long l1, l2; unsigned long masked_l1, masked_l2; @@ -320,14 +347,25 @@ asmlinkage void evtchn_do_upcall(struct irq_exit(); } -static int find_unbound_irq(void) +static struct irq_chip dynirq_chip; + +static int find_unbound_irq(unsigned int cpu) { static int warned; int irq; - for (irq = DYNIRQ_BASE; irq < (DYNIRQ_BASE + NR_DYNIRQS); irq++) - if (irq_bindcount[irq] == 0) + for (irq = DYNIRQ_BASE; irq < (DYNIRQ_BASE + NR_DYNIRQS); irq++) { + struct irq_desc *desc = irq_to_desc_alloc_cpu(irq, cpu); + struct irq_cfg *cfg = desc->chip_data; + + if (!cfg->bindcount) { + desc->status |= IRQ_NOPROBE; + set_irq_chip_and_handler_name(irq, &dynirq_chip, + handle_level_irq, + "level"); return irq; + } + } if (!warned) { warned = 1; @@ -345,14 +383,15 @@ static int bind_caller_port_to_irq(unsig spin_lock(&irq_mapping_update_lock); if ((irq = evtchn_to_irq[caller_port]) == -1) { - if ((irq = find_unbound_irq()) < 0) + if ((irq = find_unbound_irq(smp_processor_id())) < 0) goto out; evtchn_to_irq[caller_port] = irq; - irq_info[irq] = mk_irq_info(IRQT_CALLER_PORT, 0, caller_port); + irq_cfg(irq)->info = mk_irq_info(IRQT_CALLER_PORT, + 0, caller_port); } - irq_bindcount[irq]++; + irq_cfg(irq)->bindcount++; out: spin_unlock(&irq_mapping_update_lock); @@ -367,7 +406,7 @@ static int bind_local_port_to_irq(unsign BUG_ON(evtchn_to_irq[local_port] != -1); - if ((irq = find_unbound_irq()) < 0) { + if ((irq = find_unbound_irq(smp_processor_id())) < 0) { struct evtchn_close close = { .port = local_port }; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) BUG(); @@ -375,8 +414,8 @@ static int bind_local_port_to_irq(unsign } evtchn_to_irq[local_port] = irq; - irq_info[irq] = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port); - irq_bindcount[irq]++; + irq_cfg(irq)->info = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port); + irq_cfg(irq)->bindcount++; out: spin_unlock(&irq_mapping_update_lock); @@ -420,7 +459,7 @@ static int bind_virq_to_irq(unsigned int spin_lock(&irq_mapping_update_lock); if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) { - if ((irq = find_unbound_irq()) < 0) + if ((irq = find_unbound_irq(cpu)) < 0) goto out; bind_virq.virq = virq; @@ -431,14 +470,14 @@ static int bind_virq_to_irq(unsigned int evtchn = bind_virq.port; evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + irq_cfg(irq)->info = mk_irq_info(IRQT_VIRQ, virq, evtchn); per_cpu(virq_to_irq, cpu)[virq] = irq; bind_evtchn_to_cpu(evtchn, cpu); } - irq_bindcount[irq]++; + irq_cfg(irq)->bindcount++; out: spin_unlock(&irq_mapping_update_lock); @@ -453,7 +492,7 @@ static int bind_ipi_to_irq(unsigned int spin_lock(&irq_mapping_update_lock); if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) { - if ((irq = find_unbound_irq()) < 0) + if ((irq = find_unbound_irq(cpu)) < 0) goto out; bind_ipi.vcpu = cpu; @@ -463,14 +502,14 @@ static int bind_ipi_to_irq(unsigned int evtchn = bind_ipi.port; evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + irq_cfg(irq)->info = mk_irq_info(IRQT_IPI, ipi, evtchn); per_cpu(ipi_to_irq, cpu)[ipi] = irq; bind_evtchn_to_cpu(evtchn, cpu); } - irq_bindcount[irq]++; + irq_cfg(irq)->bindcount++; out: spin_unlock(&irq_mapping_update_lock); @@ -485,7 +524,7 @@ static void unbind_from_irq(unsigned int spin_lock(&irq_mapping_update_lock); - if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) { + if (!--irq_cfg(irq)->bindcount && 
VALID_EVTCHN(evtchn)) { close.port = evtchn; if ((type_from_irq(irq) != IRQT_CALLER_PORT) && HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) @@ -508,11 +547,15 @@ static void unbind_from_irq(unsigned int bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = -1; - irq_info[irq] = IRQ_UNBOUND; + irq_cfg(irq)->info = IRQ_UNBOUND; /* Zap stats across IRQ changes of use. */ for_each_possible_cpu(cpu) +#ifdef CONFIG_SPARSE_IRQ + irq_to_desc(irq)->kstat_irqs[cpu] = 0; +#else kstat_cpu(cpu).irqs[irq] = 0; +#endif } spin_unlock(&irq_mapping_update_lock); @@ -664,10 +707,9 @@ static void rebind_irq_to_cpu(unsigned i rebind_evtchn_to_cpu(evtchn, tcpu); } -static void set_affinity_irq(unsigned int irq, cpumask_t dest) +static void set_affinity_irq(unsigned int irq, const struct cpumask *dest) { - unsigned tcpu = first_cpu(dest); - rebind_irq_to_cpu(irq, tcpu); + rebind_irq_to_cpu(irq, cpumask_first(dest)); } #endif @@ -835,7 +877,7 @@ static void enable_pirq(unsigned int irq evtchn_to_irq[evtchn] = irq; bind_evtchn_to_cpu(evtchn, 0); - irq_info[irq] = mk_irq_info(IRQT_PIRQ, bind_pirq.pirq, evtchn); + irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, bind_pirq.pirq, evtchn); out: pirq_unmask_and_notify(evtchn, irq); @@ -857,7 +899,7 @@ static void disable_pirq(unsigned int ir bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = -1; - irq_info[irq] = mk_irq_info(IRQT_PIRQ, index_from_irq(irq), 0); + irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, index_from_irq(irq), 0); } static unsigned int startup_pirq(unsigned int irq) @@ -1023,7 +1065,7 @@ static void restore_cpu_virqs(unsigned i if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) continue; - BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0)); + BUG_ON(irq_cfg(irq)->info != mk_irq_info(IRQT_VIRQ, virq, 0)); /* Get a new binding from Xen. */ bind_virq.virq = virq; @@ -1035,7 +1077,7 @@ static void restore_cpu_virqs(unsigned i /* Record the new mapping. */ evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + irq_cfg(irq)->info = mk_irq_info(IRQT_VIRQ, virq, evtchn); bind_evtchn_to_cpu(evtchn, cpu); /* Ready for use. */ @@ -1052,7 +1094,7 @@ static void restore_cpu_ipis(unsigned in if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) continue; - BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0)); + BUG_ON(irq_cfg(irq)->info != mk_irq_info(IRQT_IPI, ipi, 0)); /* Get a new binding from Xen. */ bind_ipi.vcpu = cpu; @@ -1063,7 +1105,7 @@ static void restore_cpu_ipis(unsigned in /* Record the new mapping. */ evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + irq_cfg(irq)->info = mk_irq_info(IRQT_IPI, ipi, evtchn); bind_evtchn_to_cpu(evtchn, cpu); /* Ready for use. */ @@ -1075,6 +1117,7 @@ static void restore_cpu_ipis(unsigned in void irq_resume(void) { unsigned int cpu, irq, evtchn; + struct irq_cfg *cfg; init_evtchn_cpu_bindings(); @@ -1091,12 +1134,17 @@ void irq_resume(void) mask_evtchn(evtchn); /* Check that no PIRQs are still bound. */ - for (irq = PIRQ_BASE; irq < (PIRQ_BASE + NR_PIRQS); irq++) - BUG_ON(irq_info[irq] != IRQ_UNBOUND); + for (irq = PIRQ_BASE; irq < (PIRQ_BASE + NR_PIRQS); irq++) { + cfg = irq_cfg(irq); + BUG_ON(cfg && cfg->info != IRQ_UNBOUND); + } /* No IRQ <-> event-channel mappings. 
*/ - for (irq = 0; irq < NR_IRQS; irq++) - irq_info[irq] &= ~((1U << _EVTCHN_BITS) - 1); + for (irq = 0; irq < nr_irqs; irq++) { + cfg = irq_cfg(irq); + if (cfg) + cfg->info &= ~((1U << _EVTCHN_BITS) - 1); + } for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) evtchn_to_irq[evtchn] = -1; @@ -1108,10 +1156,56 @@ void irq_resume(void) } #endif +int __init arch_early_irq_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(_irq_cfg); i++) + irq_to_desc(i)->chip_data = _irq_cfg + i; + + return 0; +} + +#ifdef CONFIG_SPARSE_IRQ +int arch_init_chip_data(struct irq_desc *desc, int cpu) +{ + if (!desc->chip_data) { + /* By default all event channels notify CPU#0. */ + desc->affinity = cpumask_of_cpu(0); + + desc->chip_data = kzalloc(sizeof(struct irq_cfg), GFP_ATOMIC); + } + if (!desc->chip_data) { + printk(KERN_ERR "cannot alloc irq_cfg\n"); + BUG(); + } + + return 0; +} +#endif + #if defined(CONFIG_X86_IO_APIC) +int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +{ + struct physdev_irq irq_op; + + if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS) + return -EINVAL; + + if (cfg->vector) + return 0; + + irq_op.irq = irq; + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) + return -ENOSPC; + + cfg->vector = irq_op.vector; + + return 0; +} #define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE)) #elif defined(CONFIG_X86) -#define identity_mapped_irq(irq) (((irq) - PIRQ_BASE) < 16) +#define identity_mapped_irq(irq) (((irq) - PIRQ_BASE) < NR_IRQS_LEGACY) #else #define identity_mapped_irq(irq) (1) #endif @@ -1121,7 +1215,7 @@ void evtchn_register_pirq(int irq) BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS); if (identity_mapped_irq(irq) || type_from_irq(irq) != IRQT_UNBOUND) return; - irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, 0); + irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, irq, 0); set_irq_chip_and_handler_name(irq, &pirq_chip, handle_level_irq, "level"); } @@ -1134,12 +1228,17 @@ int evtchn_map_pirq(int irq, int xen_pir irq = PIRQ_BASE + NR_PIRQS - 1; spin_lock(&irq_alloc_lock); do { + struct irq_desc *desc; + struct irq_cfg *cfg; + if (identity_mapped_irq(irq)) continue; + desc = irq_to_desc_alloc_cpu(irq, smp_processor_id()); + cfg = desc->chip_data; if (!index_from_irq(irq)) { BUG_ON(type_from_irq(irq) != IRQT_UNBOUND); - irq_info[irq] = mk_irq_info(IRQT_PIRQ, - xen_pirq, 0); + cfg->info = mk_irq_info(IRQT_PIRQ, + xen_pirq, 0); break; } } while (--irq >= PIRQ_BASE); @@ -1158,7 +1257,7 @@ int evtchn_map_pirq(int irq, int xen_pir * then causes a warning in dynamic_irq_cleanup(). */ set_irq_chip_and_handler(irq, NULL, NULL); - irq_info[irq] = IRQ_UNBOUND; + irq_cfg(irq)->info = IRQ_UNBOUND; return 0; } else if (type_from_irq(irq) != IRQT_PIRQ || index_from_irq(irq) != xen_pirq) { @@ -1195,23 +1294,17 @@ void __init xen_init_IRQ(void) for (i = 0; i < NR_EVENT_CHANNELS; i++) mask_evtchn(i); - /* No IRQ -> event-channel mappings. */ - for (i = 0; i < NR_IRQS; i++) - irq_info[i] = IRQ_UNBOUND; - - /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ +#ifndef CONFIG_SPARSE_IRQ for (i = DYNIRQ_BASE; i < (DYNIRQ_BASE + NR_DYNIRQS); i++) { - irq_bindcount[i] = 0; - irq_to_desc(i)->status |= IRQ_NOPROBE; set_irq_chip_and_handler_name(i, &dynirq_chip, handle_level_irq, "level"); } - /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. 
*/ for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_PIRQS); i++) { - irq_bindcount[i] = 1; - +#else + for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_IRQS_LEGACY); i++) { +#endif if (!identity_mapped_irq(i)) continue; --- head-2010-04-29.orig/drivers/xen/core/machine_reboot.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/drivers/xen/core/machine_reboot.c 2010-03-24 15:17:58.000000000 +0100 @@ -19,6 +19,9 @@ #include #if defined(__i386__) || defined(__x86_64__) +#include +/* TBD: Dom0 should propagate the determined value to Xen. */ +bool port_cf9_safe = false; /* * Power off function, if any @@ -84,7 +87,7 @@ static void post_suspend(int suspend_can pfn_to_mfn(xen_start_info->console.domU.mfn); } else { #ifdef CONFIG_SMP - cpu_initialized_map = cpu_online_map; + cpumask_copy(vcpu_initialized_mask, cpu_online_mask); #endif for_each_possible_cpu(i) setup_runstate_area(i); @@ -222,6 +225,12 @@ int __xen_suspend(int fast_suspend, void if (num_possible_cpus() == 1) fast_suspend = 0; + if (fast_suspend) { + err = stop_machine_create(); + if (err) + return err; + } + suspend.fast_suspend = fast_suspend; suspend.resume_notifier = resume_notifier; @@ -248,6 +257,8 @@ int __xen_suspend(int fast_suspend, void if (!fast_suspend) smp_resume(); + else + stop_machine_destroy(); return 0; } --- head-2010-04-29.orig/drivers/xen/core/smpboot.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/xen/core/smpboot.c 2010-03-24 15:17:58.000000000 +0100 @@ -36,11 +36,7 @@ extern void smp_trap_init(trap_info_t *) /* Number of siblings per CPU package */ int smp_num_siblings = 1; -cpumask_t cpu_online_map; -EXPORT_SYMBOL(cpu_online_map); -cpumask_t cpu_possible_map; -EXPORT_SYMBOL(cpu_possible_map); -cpumask_t cpu_initialized_map; +cpumask_var_t vcpu_initialized_mask; DEFINE_PER_CPU(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); @@ -76,10 +72,14 @@ void __init prefill_possible_map(void) #endif rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); if (rc >= 0) { - cpu_set(i, cpu_possible_map); + set_cpu_possible(i, true); nr_cpu_ids = i + 1; } } + total_cpus = num_possible_cpus(); + for (; HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL) >= 0; ++i) + if (i != smp_processor_id()) + ++total_cpus; } static inline void @@ -203,7 +203,7 @@ static void __cpuinit cpu_initialize_con struct task_struct *idle = idle_task(cpu); - if (cpu_test_and_set(cpu, cpu_initialized_map)) + if (cpumask_test_and_set_cpu(cpu, vcpu_initialized_mask)) return; spin_lock(&ctxt_lock); @@ -284,13 +284,15 @@ void __init smp_prepare_cpus(unsigned in if (xen_smp_intr_init(0)) BUG(); - cpu_initialized_map = cpumask_of_cpu(0); + if (!alloc_cpumask_var(&vcpu_initialized_mask, GFP_KERNEL)) + BUG(); + cpumask_copy(vcpu_initialized_mask, cpumask_of(0)); /* Restrict the possible_map according to max_cpus. 
*/ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { - for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--) + for (cpu = nr_cpu_ids-1; !cpumask_test_cpu(cpu, cpu_possible_mask); cpu--) continue; - cpu_clear(cpu, cpu_possible_map); + set_cpu_possible(cpu, false); } for_each_possible_cpu (cpu) { @@ -328,10 +330,8 @@ void __init smp_prepare_cpus(unsigned in #ifdef CONFIG_HOTPLUG_CPU if (is_initial_xendomain()) - cpu_set(cpu, cpu_present_map); -#else - cpu_set(cpu, cpu_present_map); #endif + set_cpu_present(cpu, true); } init_xenbus_allowed_cpumask(); @@ -364,14 +364,17 @@ void __init smp_prepare_boot_cpu(void) */ static int __init initialize_cpu_present_map(void) { - cpu_present_map = cpu_possible_map; + unsigned int cpu; + + for_each_possible_cpu(cpu) + set_cpu_present(cpu, true); + return 0; } core_initcall(initialize_cpu_present_map); int __cpuexit __cpu_disable(void) { - cpumask_t map = cpu_online_map; unsigned int cpu = smp_processor_id(); if (cpu == 0) @@ -379,9 +382,8 @@ int __cpuexit __cpu_disable(void) remove_siblinginfo(cpu); - cpu_clear(cpu, map); - fixup_irqs(map); - cpu_clear(cpu, cpu_online_map); + set_cpu_online(cpu, false); + fixup_irqs(); return 0; } @@ -424,7 +426,7 @@ int __cpuinit __cpu_up(unsigned int cpu) return rc; } - cpu_set(cpu, cpu_online_map); + set_cpu_online(cpu, true); rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); BUG_ON(rc); @@ -436,7 +438,7 @@ void __ref play_dead(void) { idle_task_exit(); local_irq_disable(); - cpu_clear(smp_processor_id(), cpu_initialized); + cpumask_clear_cpu(smp_processor_id(), cpu_initialized_mask); preempt_enable_no_resched(); VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); #ifdef CONFIG_HOTPLUG_CPU --- head-2010-04-29.orig/drivers/xen/core/spinlock.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/xen/core/spinlock.c 2010-03-24 15:17:58.000000000 +0100 @@ -173,7 +173,8 @@ bool xen_spin_wait(raw_spinlock_t *lock, current_vcpu_info()->evtchn_upcall_mask = upcall_mask; rc = !xen_test_irq_pending(irq); - kstat_this_cpu.irqs[irq] += !rc; + if (!rc) + kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); } while (spinning.prev || rc); /* --- head-2010-04-29.orig/drivers/xen/netback/interface.c 2010-03-24 15:09:08.000000000 +0100 +++ head-2010-04-29/drivers/xen/netback/interface.c 2010-03-24 15:17:58.000000000 +0100 @@ -176,6 +176,14 @@ static struct ethtool_ops network_ethtoo .get_strings = netbk_get_strings, }; +static const struct net_device_ops netif_be_netdev_ops = { + .ndo_open = net_open, + .ndo_stop = net_close, + .ndo_start_xmit = netif_be_start_xmit, + .ndo_change_mtu = netbk_change_mtu, + .ndo_get_stats = netif_be_get_stats, +}; + netif_t *netif_alloc(struct device *parent, domid_t domid, unsigned int handle) { int err = 0; @@ -210,11 +218,7 @@ netif_t *netif_alloc(struct device *pare init_timer(&netif->tx_queue_timeout); - dev->hard_start_xmit = netif_be_start_xmit; - dev->get_stats = netif_be_get_stats; - dev->open = net_open; - dev->stop = net_close; - dev->change_mtu = netbk_change_mtu; + dev->netdev_ops = &netif_be_netdev_ops; dev->features = NETIF_F_IP_CSUM; SET_ETHTOOL_OPS(dev, &network_ethtool_ops); --- head-2010-04-29.orig/drivers/xen/netback/loopback.c 2010-03-24 15:10:29.000000000 +0100 +++ head-2010-04-29/drivers/xen/netback/loopback.c 2010-03-24 15:17:58.000000000 +0100 @@ -201,19 +201,21 @@ static void loopback_set_multicast_list( { } +static const struct net_device_ops loopback_netdev_ops = { + .ndo_open = loopback_open, + .ndo_stop = loopback_close, + 
.ndo_start_xmit = loopback_start_xmit, + .ndo_set_multicast_list = loopback_set_multicast_list, + .ndo_change_mtu = NULL, /* allow arbitrary mtu */ + .ndo_get_stats = loopback_get_stats, +}; + static void loopback_construct(struct net_device *dev, struct net_device *lo) { struct net_private *np = netdev_priv(dev); np->loopback_dev = lo; - - dev->open = loopback_open; - dev->stop = loopback_close; - dev->hard_start_xmit = loopback_start_xmit; - dev->get_stats = loopback_get_stats; - dev->set_multicast_list = loopback_set_multicast_list; - dev->change_mtu = NULL; /* allow arbitrary mtu */ - + dev->netdev_ops = &loopback_netdev_ops; dev->tx_queue_len = 0; dev->features = (NETIF_F_HIGHDMA | --- head-2010-04-29.orig/drivers/xen/netback/netback.c 2010-03-24 15:10:29.000000000 +0100 +++ head-2010-04-29/drivers/xen/netback/netback.c 2010-03-24 15:17:58.000000000 +0100 @@ -354,7 +354,7 @@ static void xen_network_done_notify(void static struct net_device *eth0_dev = NULL; if (unlikely(eth0_dev == NULL)) eth0_dev = __dev_get_by_name(&init_net, "eth0"); - netif_rx_schedule(eth0_dev, ???); + netif_rx_schedule(???); } /* * Add following to poll() function in NAPI driver (Tigon3 is example): --- head-2010-04-29.orig/drivers/xen/netfront/netfront.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/xen/netfront/netfront.c 2010-03-24 15:17:58.000000000 +0100 @@ -635,7 +635,7 @@ static int network_open(struct net_devic if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)){ netfront_accelerator_call_stop_napi_irq(np, dev); - netif_rx_schedule(dev, &np->napi); + netif_rx_schedule(&np->napi); } } spin_unlock_bh(&np->rx_lock); @@ -707,7 +707,7 @@ static void rx_refill_timeout(unsigned l netfront_accelerator_call_stop_napi_irq(np, dev); - netif_rx_schedule(dev, &np->napi); + netif_rx_schedule(&np->napi); } static void network_alloc_rx_buffers(struct net_device *dev) @@ -1064,8 +1064,7 @@ static irqreturn_t netif_int(int irq, vo if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) { netfront_accelerator_call_stop_napi_irq(np, dev); - netif_rx_schedule(dev, &np->napi); - dev->last_rx = jiffies; + netif_rx_schedule(&np->napi); } } @@ -1481,7 +1480,6 @@ err: /* Pass it up. */ netif_receive_skb(skb); - dev->last_rx = jiffies; } /* If we get a callback with very few responses, reduce fill target. 
*/ @@ -1523,7 +1521,7 @@ err: } if (!more_to_do && !accel_more_to_do) - __netif_rx_complete(dev, napi); + __netif_rx_complete(napi); local_irq_restore(flags); } @@ -2024,6 +2022,18 @@ static void network_set_multicast_list(s { } +static const struct net_device_ops xennet_netdev_ops = { + .ndo_uninit = netif_uninit, + .ndo_open = network_open, + .ndo_stop = network_close, + .ndo_start_xmit = network_start_xmit, + .ndo_set_multicast_list = network_set_multicast_list, + .ndo_set_mac_address = xennet_set_mac_address, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = xennet_change_mtu, + .ndo_get_stats = network_get_stats, +}; + static struct net_device * __devinit create_netdev(struct xenbus_device *dev) { int i, err = 0; @@ -2080,15 +2090,8 @@ static struct net_device * __devinit cre goto exit_free_tx; } - netdev->open = network_open; - netdev->hard_start_xmit = network_start_xmit; - netdev->stop = network_close; - netdev->get_stats = network_get_stats; + netdev->netdev_ops = &xennet_netdev_ops; netif_napi_add(netdev, &np->napi, netif_poll, 64); - netdev->set_multicast_list = network_set_multicast_list; - netdev->uninit = netif_uninit; - netdev->set_mac_address = xennet_set_mac_address; - netdev->change_mtu = xennet_change_mtu; netdev->features = NETIF_F_IP_CSUM; SET_ETHTOOL_OPS(netdev, &network_ethtool_ops); @@ -2119,7 +2122,7 @@ inetdev_notify(struct notifier_block *th struct net_device *dev = ifa->ifa_dev->dev; /* UP event and is it one of our devices? */ - if (event == NETDEV_UP && dev->open == network_open) + if (event == NETDEV_UP && dev->netdev_ops->ndo_open == network_open) send_fake_arp(dev); return NOTIFY_DONE; --- head-2010-04-29.orig/drivers/xen/sfc_netfront/accel_msg.c 2010-03-24 15:10:29.000000000 +0100 +++ head-2010-04-29/drivers/xen/sfc_netfront/accel_msg.c 2010-03-24 15:17:58.000000000 +0100 @@ -47,7 +47,7 @@ static void vnic_start_interrupts(netfro netfront_accel_disable_net_interrupts(vnic); vnic->irq_enabled = 0; NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_schedule_count++); - netif_rx_schedule(vnic->net_dev, &np->napi); + netif_rx_schedule(&np->napi); } else { /* * Nothing yet, make sure we get interrupts through @@ -532,7 +532,7 @@ irqreturn_t netfront_accel_net_channel_i vnic->stats.event_count_since_irq; vnic->stats.event_count_since_irq = 0; #endif - netif_rx_schedule(net_dev, &np->napi); + netif_rx_schedule(&np->napi); } else { spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags); --- head-2010-04-29.orig/drivers/xen/xenbus/xenbus_client.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/drivers/xen/xenbus/xenbus_client.c 2010-03-24 15:17:58.000000000 +0100 @@ -170,7 +170,6 @@ EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt); /** * xenbus_switch_state * @dev: xenbus device - * @xbt: transaction handle * @state: new state * * Advertise in the store a change of the given driver to the given new_state. @@ -304,7 +303,7 @@ EXPORT_SYMBOL_GPL(xenbus_dev_error); * @fmt: error message format * * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by - * xenbus_switch_state(dev, NULL, XenbusStateClosing) to schedule an orderly + * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly * closedown of this driver and its peer. 
*/ void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, --- head-2010-04-29.orig/drivers/xen/xenbus/xenbus_probe.c 2010-03-24 15:12:46.000000000 +0100 +++ head-2010-04-29/drivers/xen/xenbus/xenbus_probe.c 2010-03-24 15:17:58.000000000 +0100 @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +74,10 @@ #endif int xen_store_evtchn; +#if !defined(CONFIG_XEN) && !defined(MODULE) +EXPORT_SYMBOL(xen_store_evtchn); +#endif + struct xenstore_domain_interface *xen_store_interface; static unsigned long xen_store_mfn; @@ -197,6 +202,12 @@ static int xenbus_uevent_frontend(struct } #endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) +static struct device_attribute xenbus_dev_attrs[] = { + __ATTR_NULL +}; +#endif + /* Bus type for frontend drivers. */ static struct xen_bus_type xenbus_frontend = { .root = "device", @@ -205,13 +216,16 @@ static struct xen_bus_type xenbus_fronte .probe = xenbus_probe_frontend, .error = -ENODEV, .bus = { - .name = "xen", - .match = xenbus_match, + .name = "xen", + .match = xenbus_match, #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) - .probe = xenbus_dev_probe, - .remove = xenbus_dev_remove, - .shutdown = xenbus_dev_shutdown, - .uevent = xenbus_uevent_frontend, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, + .shutdown = xenbus_dev_shutdown, + .uevent = xenbus_uevent_frontend, +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) + .dev_attrs = xenbus_dev_attrs, #endif }, #if defined(CONFIG_XEN) || defined(MODULE) @@ -584,7 +598,17 @@ int xenbus_probe_node(struct xen_bus_typ xendev->dev.bus = &bus->bus; xendev->dev.release = xenbus_dev_release; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) + { + char devname[XEN_BUS_ID_SIZE]; + + err = bus->get_bus_id(devname, xendev->nodename); + if (!err) + dev_set_name(&xendev->dev, devname); + } +#else err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename); +#endif if (err) goto fail; @@ -770,7 +794,7 @@ static int suspend_dev(struct device *de err = drv->suspend(xdev); if (err) printk(KERN_WARNING - "xenbus: suspend %s failed: %i\n", dev->bus_id, err); + "xenbus: suspend %s failed: %i\n", dev_name(dev), err); return 0; } @@ -791,7 +815,7 @@ static int suspend_cancel_dev(struct dev if (err) printk(KERN_WARNING "xenbus: suspend_cancel %s failed: %i\n", - dev->bus_id, err); + dev_name(dev), err); return 0; } @@ -813,7 +837,7 @@ static int resume_dev(struct device *dev if (err) { printk(KERN_WARNING "xenbus: resume (talk_to_otherend) %s failed: %i\n", - dev->bus_id, err); + dev_name(dev), err); return err; } @@ -824,7 +848,7 @@ static int resume_dev(struct device *dev if (err) { printk(KERN_WARNING "xenbus: resume %s failed: %i\n", - dev->bus_id, err); + dev_name(dev), err); return err; } } @@ -833,7 +857,7 @@ static int resume_dev(struct device *dev if (err) { printk(KERN_WARNING "xenbus_probe: resume (watch_otherend) %s failed: " - "%d.\n", dev->bus_id, err); + "%d.\n", dev_name(dev), err); return err; } @@ -1145,6 +1169,14 @@ static int __devinit xenbus_probe_init(v if (!is_initial_xendomain()) xenbus_probe(NULL); +#if defined(CONFIG_XEN_COMPAT_XENFS) && !defined(MODULE) + /* + * Create xenfs mountpoint in /proc for compatibility with + * utilities that expect to find "xenbus" under "/proc/xen". 
+ */ + proc_mkdir("xen", NULL); +#endif + return 0; err: --- head-2010-04-29.orig/drivers/xen/xenbus/xenbus_probe.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/drivers/xen/xenbus/xenbus_probe.h 2010-03-24 15:17:58.000000000 +0100 @@ -45,6 +45,10 @@ #define is_initial_xendomain() xen_initial_domain() #endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) +#define dev_name(dev) ((dev)->bus_id) +#endif + #if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE) extern void xenbus_backend_suspend(int (*fn)(struct device *, void *)); extern void xenbus_backend_resume(int (*fn)(struct device *, void *)); --- head-2010-04-29.orig/drivers/xen/xenbus/xenbus_probe_backend.c 2010-03-24 15:10:29.000000000 +0100 +++ head-2010-04-29/drivers/xen/xenbus/xenbus_probe_backend.c 2010-03-24 15:17:58.000000000 +0100 @@ -36,6 +36,7 @@ __FUNCTION__, __LINE__, ##args) #include +#include #include #include #include @@ -108,6 +109,10 @@ static int backend_bus_id(char bus_id[BU return 0; } +static struct device_attribute xenbus_backend_attrs[] = { + __ATTR_NULL +}; + static struct xen_bus_type xenbus_backend = { .root = "backend", .levels = 3, /* backend/type// */ @@ -115,12 +120,13 @@ static struct xen_bus_type xenbus_backen .probe = xenbus_probe_backend, .error = -ENODEV, .bus = { - .name = "xen-backend", - .match = xenbus_match, - .probe = xenbus_dev_probe, - .remove = xenbus_dev_remove, -// .shutdown = xenbus_dev_shutdown, - .uevent = xenbus_uevent_backend, + .name = "xen-backend", + .match = xenbus_match, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, +// .shutdown = xenbus_dev_shutdown, + .uevent = xenbus_uevent_backend, + .dev_attrs = xenbus_backend_attrs, }, .dev = { .bus_id = "xen-backend", --- head-2010-04-29.orig/drivers/xen/xenbus/xenbus_xs.c 2010-03-24 15:09:22.000000000 +0100 +++ head-2010-04-29/drivers/xen/xenbus/xenbus_xs.c 2010-03-24 15:17:58.000000000 +0100 @@ -227,6 +227,9 @@ void *xenbus_dev_request_and_reply(struc return ret; } +#if !defined(CONFIG_XEN) && !defined(MODULE) +EXPORT_SYMBOL(xenbus_dev_request_and_reply); +#endif /* Send message to xs, get kmalloc'ed reply. ERR_PTR() on error. 
*/ static void *xs_talkv(struct xenbus_transaction t, --- head-2010-04-29.orig/drivers/xen/xenoprof/xenoprofile.c 2010-03-24 15:10:37.000000000 +0100 +++ head-2010-04-29/drivers/xen/xenoprof/xenoprofile.c 2010-03-24 15:17:58.000000000 +0100 @@ -50,7 +50,7 @@ static int xenoprof_enabled = 0; static int xenoprof_is_primary = 0; static int active_defined; -extern unsigned long backtrace_depth; +extern unsigned long oprofile_backtrace_depth; /* Number of buffers in shared area (one per VCPU) */ static int nbuf; @@ -339,11 +339,11 @@ static int xenoprof_setup(void) active_defined = 1; } - if (backtrace_depth > 0) { + if (oprofile_backtrace_depth > 0) { ret = HYPERVISOR_xenoprof_op(XENOPROF_set_backtrace, - &backtrace_depth); + &oprofile_backtrace_depth); if (ret) - backtrace_depth = 0; + oprofile_backtrace_depth = 0; } ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL); --- head-2010-04-29.orig/include/acpi/processor.h 2010-03-24 14:53:41.000000000 +0100 +++ head-2010-04-29/include/acpi/processor.h 2010-03-24 15:17:58.000000000 +0100 @@ -451,6 +451,13 @@ extern int processor_extcntl_prepare(str extern int acpi_processor_get_performance_info(struct acpi_processor *pr); extern int acpi_processor_get_psd(struct acpi_processor *pr); void arch_acpi_processor_init_extcntl(const struct processor_extcntl_ops **); + +/* + * Declarations for objects and functions removed in native 2.6.29, and + * thus moved to drivers/acpi/processor_extcntl.c. + */ +extern struct notifier_block acpi_processor_latency_notifier; +int acpi_processor_set_power_policy(struct acpi_processor *); #else static inline int processor_cntl_external(void) {return 0;} static inline int processor_pm_external(void) {return 0;} --- head-2010-04-29.orig/include/xen/cpu_hotplug.h 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/include/xen/cpu_hotplug.h 2010-03-24 15:17:58.000000000 +0100 @@ -5,7 +5,7 @@ #include #if defined(CONFIG_X86) && defined(CONFIG_SMP) -extern cpumask_t cpu_initialized_map; +extern cpumask_var_t vcpu_initialized_mask; #endif #if defined(CONFIG_HOTPLUG_CPU) --- head-2010-04-29.orig/include/xen/evtchn.h 2010-03-24 15:10:37.000000000 +0100 +++ head-2010-04-29/include/xen/evtchn.h 2010-03-24 15:17:58.000000000 +0100 @@ -48,6 +48,18 @@ * LOW-LEVEL DEFINITIONS */ +struct irq_cfg { + u32 info; + union { + int bindcount; /* for dynamic IRQs */ +#ifdef CONFIG_X86_IO_APIC + u8 vector; /* for physical IRQs */ +#endif + }; +}; + +int assign_irq_vector(int irq, struct irq_cfg *, const struct cpumask *); + /* * Dynamically bind an event source to an IRQ-like callback handler. * On some platforms this may not be implemented via the Linux IRQ subsystem. 
--- head-2010-04-29.orig/include/xen/xenbus.h 2010-03-24 15:10:29.000000000 +0100 +++ head-2010-04-29/include/xen/xenbus.h 2010-03-24 15:17:58.000000000 +0100 @@ -322,7 +322,9 @@ void xenbus_dev_error(struct xenbus_devi void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...); +#if defined(CONFIG_XEN) || defined(MODULE) int xenbus_dev_init(void); +#endif const char *xenbus_strstate(enum xenbus_state state); int xenbus_dev_is_online(struct xenbus_device *dev); --- head-2010-04-29.orig/lib/swiotlb-xen.c 2010-03-24 15:14:47.000000000 +0100 +++ head-2010-04-29/lib/swiotlb-xen.c 2010-03-24 15:17:58.000000000 +0100 @@ -8,6 +8,7 @@ * Copyright (C) 2000, 2003 Hewlett-Packard Co * David Mosberger-Tang * Copyright (C) 2005 Keir Fraser + * 08/12/11 beckyb Add highmem support */ #include @@ -16,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -30,27 +33,11 @@ #include #include -int swiotlb; - #define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1))) -/* - * Maximum allowable number of contiguous slabs to map, - * must be a power of 2. What is the appropriate value ? - * The complexity of {map,unmap}_single is linearly dependent on this value. - */ -#define IO_TLB_SEGSIZE 128 - -/* - * log of the size of each IO TLB slab. The number of slabs is command line - * controllable. - */ -#define IO_TLB_SHIFT 11 - +int swiotlb; int swiotlb_force; -static unsigned long iotlb_nslabs; - /* * Used to do a quick range check in swiotlb_unmap_single and * swiotlb_sync_single_*, to see if the memory was in fact allocated by this @@ -59,6 +46,12 @@ static unsigned long iotlb_nslabs; static char *io_tlb_start, *io_tlb_end; /* + * The number of IO TLB blocks (in groups of 64) between io_tlb_start and + * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. + */ +static unsigned long io_tlb_nslabs; + +/* * When the IOMMU overflows we return a fallback buffer. This sets the size. */ static unsigned long io_tlb_overflow = 32*1024; @@ -76,10 +69,7 @@ static unsigned int io_tlb_index; * We need to save away the original address corresponding to a mapped entry * for the sync operations. */ -static struct phys_addr { - struct page *page; - unsigned int offset; -} *io_tlb_orig_addr; +static phys_addr_t *io_tlb_orig_addr; /* * Protect the above data structures in the map and unmap calls @@ -101,9 +91,9 @@ setup_io_tlb_npages(char *str) { /* Unlike ia64, the size is aperture in megabytes, not 'slabs'! */ if (isdigit(*str)) { - iotlb_nslabs = simple_strtoul(str, &str, 0) << + io_tlb_nslabs = simple_strtoul(str, &str, 0) << (20 - IO_TLB_SHIFT); - iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); } if (*str == ',') ++str; @@ -121,35 +111,17 @@ setup_io_tlb_npages(char *str) __setup("swiotlb=", setup_io_tlb_npages); /* make io_tlb_overflow tunable too? */ -/* - * Statically reserve bounce buffer space and initialize bounce buffer data - * structures for the software IO TLB used to implement the PCI DMA API.
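/*
 * Worked example of the setup_io_tlb_npages() arithmetic above, as a
 * standalone userspace sketch with the kernel constants inlined: each
 * slab is 1 << IO_TLB_SHIFT = 2 KiB, so the megabyte count from the
 * command line is shifted by (20 - IO_TLB_SHIFT) and rounded up to a
 * whole IO_TLB_SEGSIZE.
 */
#include <stdio.h>

#define IO_TLB_SHIFT	11
#define IO_TLB_SEGSIZE	128
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long megabytes = 64;	/* as in swiotlb=64 */
	unsigned long nslabs = megabytes << (20 - IO_TLB_SHIFT);

	nslabs = ALIGN_UP(nslabs, IO_TLB_SEGSIZE);
	printf("%lu MB -> %lu slabs (%lu bytes)\n",
	       megabytes, nslabs, nslabs << IO_TLB_SHIFT);	/* 64 MB -> 32768 slabs */
	return 0;
}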
- */ -void __init -swiotlb_init_with_default_size(size_t default_size) +void *__init swiotlb_alloc_boot(size_t size, unsigned long nslabs) { - unsigned long i, bytes; + void *start = alloc_bootmem_pages(size); + unsigned int i; int rc; - if (!iotlb_nslabs) { - iotlb_nslabs = (default_size >> IO_TLB_SHIFT); - iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE); - } - - bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT); - - /* - * Get IO TLB memory from the low pages - */ - io_tlb_start = alloc_bootmem_pages(bytes); - if (!io_tlb_start) - panic("Cannot allocate SWIOTLB buffer!\n"); - dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; - for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) { + for (i = 0; i < nslabs; i += IO_TLB_SEGSIZE) { do { rc = xen_create_contiguous_region( - (unsigned long)io_tlb_start + (i << IO_TLB_SHIFT), + (unsigned long)start + (i << IO_TLB_SHIFT), get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT), dma_bits); } while (rc && dma_bits++ < max_dma_bits); @@ -158,12 +130,12 @@ swiotlb_init_with_default_size(size_t de panic("No suitable physical memory available for SWIOTLB buffer!\n" "Use dom0_mem Xen boot parameter to reserve\n" "some DMA memory (e.g., dom0_mem=-128M).\n"); - iotlb_nslabs = i; + io_tlb_nslabs = i; i <<= IO_TLB_SHIFT; - free_bootmem(__pa(io_tlb_start + i), bytes - i); - bytes = i; + free_bootmem(__pa(start + i), size - i); + size = i; for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) { - unsigned int bits = fls64(virt_to_bus(io_tlb_start + i - 1)); + unsigned int bits = fls64(virt_to_bus(start + i - 1)); if (bits > dma_bits) dma_bits = bits; @@ -171,18 +143,88 @@ swiotlb_init_with_default_size(size_t de break; } } + + return start; +} + +#ifndef CONFIG_XEN +void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs) +{ + return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); +} +#endif + +dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) +{ + return phys_to_machine(paddr); +} + +phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +{ + return machine_to_phys(baddr); +} + +static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, + volatile void *address) +{ + return swiotlb_phys_to_bus(hwdev, virt_to_phys(address)); +} + +static void *swiotlb_bus_to_virt(dma_addr_t address) +{ + return phys_to_virt(swiotlb_bus_to_phys(address)); +} + +int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size) +{ + return 0; +} + +static void swiotlb_print_info(unsigned long bytes) +{ + printk(KERN_INFO "Software IO TLB enabled: \n" + " Aperture: %lu megabytes\n" + " Address size: %u bits\n" + " Kernel range: %p - %p\n", + bytes >> 20, dma_bits, + io_tlb_start, io_tlb_end); +} + +/* + * Statically reserve bounce buffer space and initialize bounce buffer data + * structures for the software IO TLB used to implement the PCI DMA API. + */ +void __init +swiotlb_init_with_default_size(size_t default_size) +{ + unsigned long i, bytes; + int rc; + + if (!io_tlb_nslabs) { + io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + + bytes = io_tlb_nslabs << IO_TLB_SHIFT; + + /* + * Get IO TLB memory from the low pages + */ + io_tlb_start = swiotlb_alloc_boot(bytes, io_tlb_nslabs); + if (!io_tlb_start) + panic("Cannot allocate SWIOTLB buffer!\n"); + bytes = io_tlb_nslabs << IO_TLB_SHIFT; io_tlb_end = io_tlb_start + bytes; /* * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE. 
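/*
 * Why swiotlb_phys_to_bus()/swiotlb_bus_to_phys() exist: a Xen guest's
 * pseudo-physical addresses differ from the machine addresses that
 * devices actually DMA to, with the p2m table deciding the mapping.
 * Standalone illustration; the p2m table below is entirely made up
 * (the real one is maintained by the hypervisor and tools, and is
 * rarely an identity map).
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12

static const uint64_t p2m[4] = { 0x2a7, 0x09c, 0x113, 0x3f0 };	/* fake pfn -> mfn */

static uint64_t example_phys_to_machine(uint64_t paddr)
{
	uint64_t mfn = p2m[paddr >> PAGE_SHIFT];
	uint64_t offset = paddr & ((1ULL << PAGE_SHIFT) - 1);

	return (mfn << PAGE_SHIFT) | offset;
}

int main(void)
{
	uint64_t paddr = (2ULL << PAGE_SHIFT) + 0x80;

	/* pfn 2 maps to mfn 0x113, so 0x2080 becomes 0x113080 */
	printf("pseudo-physical 0x%llx -> machine 0x%llx\n",
	       (unsigned long long)paddr,
	       (unsigned long long)example_phys_to_machine(paddr));
	return 0;
}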
*/ - io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int)); - for (i = 0; i < iotlb_nslabs; i++) + io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); + for (i = 0; i < io_tlb_nslabs; i++) io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); io_tlb_index = 0; - io_tlb_orig_addr = alloc_bootmem( - iotlb_nslabs * sizeof(*io_tlb_orig_addr)); + io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t)); /* * Get the overflow emergency buffer @@ -200,13 +242,7 @@ swiotlb_init_with_default_size(size_t de if (rc) panic("No suitable physical memory available for SWIOTLB overflow buffer!\n"); - printk(KERN_INFO "Software IO TLB enabled: \n" - " Aperture: %lu megabytes\n" - " Kernel range: %p - %p\n" - " Address size: %u bits\n", - bytes >> 20, - io_tlb_start, io_tlb_end, - dma_bits); + swiotlb_print_info(bytes); } void __init @@ -233,6 +269,11 @@ swiotlb_init(void) printk(KERN_INFO "Software IO TLB disabled\n"); } +static inline int range_needs_mapping(phys_addr_t pa, size_t size) +{ + return range_straddles_page_boundary(pa, size); +} + static int is_swiotlb_buffer(dma_addr_t addr) { unsigned long pfn = mfn_to_local_pfn(PFN_DOWN(addr)); @@ -246,46 +287,50 @@ static int is_swiotlb_buffer(dma_addr_t } /* + * Bounce: copy the swiotlb buffer back to the original dma location + * * We use __copy_to_user_inatomic to transfer to the host buffer because the * buffer may be mapped read-only (e.g, in blkback driver) but lower-level * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an * unnecessary copy from the aperture to the host buffer, and a page fault. */ -static void -__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir) +static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, + enum dma_data_direction dir) { - if (PageHighMem(buffer.page)) { - size_t len, bytes; - char *dev, *host, *kmp; - len = size; - while (len != 0) { - unsigned long flags; - - if (((bytes = len) + buffer.offset) > PAGE_SIZE) - bytes = PAGE_SIZE - buffer.offset; - local_irq_save(flags); /* protects KM_BOUNCE_READ */ - kmp = kmap_atomic(buffer.page, KM_BOUNCE_READ); - dev = dma_addr + size - len; - host = kmp + buffer.offset; - if (dir == DMA_FROM_DEVICE) { - if (__copy_to_user_inatomic(host, dev, bytes)) - /* inaccessible */; - } else - memcpy(dev, host, bytes); - kunmap_atomic(kmp, KM_BOUNCE_READ); + unsigned long pfn = PFN_DOWN(phys); + + if (PageHighMem(pfn_to_page(pfn))) { + /* The buffer does not have a mapping. 
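/*
 * Standalone sketch of the page-boundary test behind the new
 * range_needs_mapping() above: under Xen, pseudo-physically contiguous
 * pages need not be machine-contiguous, so any buffer crossing a page
 * boundary is bounced. The tree's real range_straddles_page_boundary()
 * additionally checks whether the spanned machine frames happen to be
 * contiguous; this sketch shows only the boundary test.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SHIFT	12

static bool crosses_page_boundary(uint64_t paddr, size_t size)
{
	return (paddr >> PAGE_SHIFT) != ((paddr + size - 1) >> PAGE_SHIFT);
}

int main(void)
{
	/* 0xff0 + 0x20 spills into the next page; 0x100 + 0x20 does not */
	return crosses_page_boundary(0xff0, 0x20) &&
	       !crosses_page_boundary(0x100, 0x20) ? 0 : 1;
}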
Map it in and copy */ + unsigned int offset = phys & ~PAGE_MASK; + char *buffer; + unsigned int sz = 0; + unsigned long flags; + + while (size) { + sz = min((size_t)(PAGE_SIZE - offset), size); + + local_irq_save(flags); + buffer = kmap_atomic(pfn_to_page(pfn), + KM_BOUNCE_READ); + if (dir == DMA_TO_DEVICE) + memcpy(dma_addr, buffer + offset, sz); + else if (__copy_to_user_inatomic(buffer + offset, + dma_addr, sz)) + /* inaccessible */; + kunmap_atomic(buffer, KM_BOUNCE_READ); local_irq_restore(flags); - len -= bytes; - buffer.page++; - buffer.offset = 0; + + size -= sz; + pfn++; + dma_addr += sz; + offset = 0; } } else { - char *host = (char *)phys_to_virt( - page_to_pseudophys(buffer.page)) + buffer.offset; - if (dir == DMA_FROM_DEVICE) { - if (__copy_to_user_inatomic(host, dma_addr, size)) - /* inaccessible */; - } else if (dir == DMA_TO_DEVICE) - memcpy(dma_addr, host, size); + if (dir == DMA_TO_DEVICE) + memcpy(dma_addr, phys_to_virt(phys), size); + else if (__copy_to_user_inatomic(phys_to_virt(phys), + dma_addr, size)) + /* inaccessible */; } } @@ -293,12 +338,11 @@ __sync_single(struct phys_addr buffer, c * Allocates bounce buffer and returns its kernel virtual address. */ static void * -map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir) +map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir) { unsigned long flags; char *dma_addr; unsigned int nslots, stride, index, wrap; - struct phys_addr slot_buf; int i; unsigned long mask; unsigned long offset_slots; @@ -306,6 +350,10 @@ map_single(struct device *hwdev, struct mask = dma_get_seg_boundary(hwdev); offset_slots = -IO_TLB_SEGSIZE; + + /* + * Carefully handle integer overflow which can occur when mask == ~0UL. + */ max_slots = mask + 1 ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); @@ -328,7 +376,7 @@ map_single(struct device *hwdev, struct */ spin_lock_irqsave(&io_tlb_lock, flags); index = ALIGN(io_tlb_index, stride); - if (index >= iotlb_nslabs) + if (index >= io_tlb_nslabs) index = 0; wrap = index; @@ -336,7 +384,7 @@ map_single(struct device *hwdev, struct while (iommu_is_span_boundary(index, nslots, offset_slots, max_slots)) { index += stride; - if (index >= iotlb_nslabs) + if (index >= io_tlb_nslabs) index = 0; if (index == wrap) goto not_found; @@ -360,13 +408,13 @@ map_single(struct device *hwdev, struct * Update the indices to avoid searching in the next * round. */ - io_tlb_index = ((index + nslots) < iotlb_nslabs + io_tlb_index = ((index + nslots) < io_tlb_nslabs ? (index + nslots) : 0); goto found; } index += stride; - if (index >= iotlb_nslabs) + if (index >= io_tlb_nslabs) index = 0; } while (index != wrap); @@ -381,29 +429,14 @@ found: * This is needed when we sync the memory. Then we sync the buffer if * needed. 
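/*
 * Worked example of the integer-overflow comment added to map_single()
 * above: for a device mask of ~0UL, mask + 1 wraps to zero and cannot
 * be fed to the boundary arithmetic, so a fallback slot count is used
 * instead. Standalone sketch with the kernel constants inlined.
 */
#include <stdio.h>

#define IO_TLB_SHIFT	11
#define BITS_PER_LONG	((int)(8 * sizeof(long)))
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

static unsigned long max_slots_for(unsigned long mask)
{
	return mask + 1
		? ALIGN_UP(mask + 1, 1UL << IO_TLB_SHIFT) >> IO_TLB_SHIFT
		: 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
}

int main(void)
{
	printf("mask 0x00ffffff -> %lu slots\n", max_slots_for(0x00ffffffUL));	/* 8192 */
	printf("mask ~0UL       -> %lu slots\n", max_slots_for(~0UL));	/* fallback */
	return 0;
}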
*/ - slot_buf = buffer; - for (i = 0; i < nslots; i++) { - slot_buf.page += slot_buf.offset >> PAGE_SHIFT; - slot_buf.offset &= PAGE_SIZE - 1; - io_tlb_orig_addr[index+i] = slot_buf; - slot_buf.offset += 1 << IO_TLB_SHIFT; - } - if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL)) - __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE); + for (i = 0; i < nslots; i++) + io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT); + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); return dma_addr; } -static struct phys_addr dma_addr_to_phys_addr(char *dma_addr) -{ - int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; - struct phys_addr buffer = io_tlb_orig_addr[index]; - buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1); - buffer.page += buffer.offset >> PAGE_SHIFT; - buffer.offset &= PAGE_SIZE - 1; - return buffer; -} - /* * dma_addr is the kernel virtual address of the bounce buffer to unmap. */ @@ -413,13 +446,13 @@ unmap_single(struct device *hwdev, char unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; - struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr); + phys_addr_t phys = io_tlb_orig_addr[index]; /* * First, sync the memory before unmapping the entry */ - if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)) - __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE); + if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) + swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); /* * Return the buffer to the free list by setting the corresponding @@ -453,9 +486,13 @@ unmap_single(struct device *hwdev, char static void sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir) { - struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr); + int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t phys = io_tlb_orig_addr[index]; + + phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1)); + BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE)); - __sync_single(buffer, dma_addr, size, dir); + swiotlb_bounce(phys, dma_addr, size, dir); } static void @@ -469,7 +506,7 @@ swiotlb_full(struct device *dev, size_t * the damage, or panic when the transfer is too big. */ printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %zu bytes at " - "device %s\n", size, dev ? dev->bus_id : "?"); + "device %s\n", size, dev ? dev_name(dev) : "?"); if (size > io_tlb_overflow && do_panic) { if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) @@ -494,7 +531,6 @@ _swiotlb_map_single(struct device *hwdev dma_addr_t dev_addr = gnttab_dma_map_page(page) + offset_in_page(paddr); void *map; - struct phys_addr buffer; BUG_ON(dir == DMA_NONE); @@ -503,23 +539,21 @@ _swiotlb_map_single(struct device *hwdev * we can safely return the device addr and not worry about bounce * buffering it. */ - if (!range_straddles_page_boundary(paddr, size) && - !address_needs_mapping(hwdev, dev_addr, size)) + if (!address_needs_mapping(hwdev, dev_addr, size) && + !range_needs_mapping(paddr, size)) return dev_addr; /* * Oh well, have to allocate and map a bounce buffer. 
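/*
 * Standalone sketch of the per-slot bookkeeping introduced above:
 * map_single() records one phys_addr_t per 2 KiB slot, and the sync
 * path rebuilds the original address from the slot index plus the
 * intra-slab offset. Buffer size and addresses are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define IO_TLB_SHIFT	11
#define NSLABS		8

static uint64_t io_tlb_orig_addr[NSLABS];
static char io_tlb[NSLABS << IO_TLB_SHIFT];

int main(void)
{
	uint64_t phys = 0x12345800;	/* made-up original address */
	int index = 2, nslots = 3, i;

	/* as in map_single(): one entry per occupied slot */
	for (i = 0; i < nslots; i++)
		io_tlb_orig_addr[index + i] = phys + ((uint64_t)i << IO_TLB_SHIFT);

	/* as in sync_single(): recover the original address of a bounce byte */
	char *dma_addr = io_tlb + (index << IO_TLB_SHIFT) + 0x123;
	int slot = (int)((dma_addr - io_tlb) >> IO_TLB_SHIFT);
	uint64_t recovered = io_tlb_orig_addr[slot] +
			     ((dma_addr - io_tlb) & ((1 << IO_TLB_SHIFT) - 1));

	printf("recovered 0x%llx, expected 0x%llx\n",
	       (unsigned long long)recovered,
	       (unsigned long long)(phys + 0x123));
	return 0;
}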
*/ gnttab_dma_unmap_page(dev_addr); - buffer.page = page; - buffer.offset = offset_in_page(paddr); - map = map_single(hwdev, buffer, size, dir); + map = map_single(hwdev, paddr, size, dir); if (!map) { swiotlb_full(hwdev, size, dir, 1); map = io_tlb_overflow_buffer; } - dev_addr = virt_to_bus(map); + dev_addr = swiotlb_virt_to_bus(hwdev, map); return dev_addr; } @@ -536,6 +570,7 @@ swiotlb_map_single(struct device *hwdev, { return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, NULL); } +EXPORT_SYMBOL(swiotlb_map_single); dma_addr_t swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int dir) @@ -555,7 +590,7 @@ void swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir, struct dma_attrs *attrs) { - char *dma_addr = bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(dev_addr)) @@ -571,6 +606,8 @@ swiotlb_unmap_single(struct device *hwde { return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL); } +EXPORT_SYMBOL(swiotlb_unmap_single); + /* * Make physical memory consistent for a single streaming mode DMA translation * after a transfer. @@ -585,48 +622,50 @@ void swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir) { - char *dma_addr = bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(dev_addr)) sync_single(hwdev, dma_addr, size, dir); } +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu); void swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir) { - char *dma_addr = bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(dev_addr)) sync_single(hwdev, dma_addr, size, dir); } +EXPORT_SYMBOL(swiotlb_sync_single_for_device); void swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, unsigned long offset, size_t size, int dir) { - char *dma_addr = bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(dev_addr)) sync_single(hwdev, dma_addr + offset, size, dir); } +EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_cpu); void swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr, unsigned long offset, size_t size, int dir) { - char *dma_addr = bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(dev_addr)) sync_single(hwdev, dma_addr + offset, size, dir); } +EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device); -void swiotlb_unmap_sg_attrs(struct device *, struct scatterlist *, int, int, - struct dma_attrs *); /* * Map a set of buffers described by scatterlist in streaming mode for DMA. 
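/*
 * Caller-side pattern implied by the overflow-buffer fallback above:
 * mapping failures are reported through swiotlb_dma_mapping_error(),
 * not by a null return. Sketch only; example_map and its arguments
 * are illustrative, while the swiotlb_* signatures are the ones
 * defined in this file.
 */
#include <linux/dma-mapping.h>
#include <linux/errno.h>

static int example_map(struct device *hwdev, void *buf, size_t len)
{
	dma_addr_t handle = swiotlb_map_single(hwdev, buf, len, DMA_TO_DEVICE);

	if (swiotlb_dma_mapping_error(hwdev, handle))
		return -ENOMEM;	/* got the overflow buffer, not a usable mapping */
	/* ... program the device with 'handle' ... */
	swiotlb_unmap_single(hwdev, handle, len, DMA_TO_DEVICE);
	return 0;
}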
* This is the scatter-gather version of the above swiotlb_map_single @@ -648,23 +687,23 @@ swiotlb_map_sg_attrs(struct device *hwde int dir, struct dma_attrs *attrs) { struct scatterlist *sg; - struct phys_addr buffer; - dma_addr_t dev_addr; - char *map; int i; BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) { - dev_addr = gnttab_dma_map_page(sg_page(sg)) + sg->offset; + dma_addr_t dev_addr = gnttab_dma_map_page(sg_page(sg)) + + sg->offset; + phys_addr_t paddr = page_to_pseudophys(sg_page(sg)) + + sg->offset; - if (range_straddles_page_boundary(page_to_pseudophys(sg_page(sg)) - + sg->offset, sg->length) + if (range_needs_mapping(paddr, sg->length) || address_needs_mapping(hwdev, dev_addr, sg->length)) { + void *map; + gnttab_dma_unmap_page(dev_addr); - buffer.page = sg_page(sg); - buffer.offset = sg->offset; - map = map_single(hwdev, buffer, sg->length, dir); + map = map_single(hwdev, paddr, + sg->length, dir); if (!map) { /* Don't panic here, we expect map_sg users to do proper error handling. */ @@ -674,7 +713,7 @@ swiotlb_map_sg_attrs(struct device *hwde sgl[0].dma_length = 0; return 0; } - sg->dma_address = virt_to_bus(map); + sg->dma_address = swiotlb_virt_to_bus(hwdev, map); } else sg->dma_address = dev_addr; sg->dma_length = sg->length; @@ -689,6 +728,7 @@ swiotlb_map_sg(struct device *hwdev, str { return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); } +EXPORT_SYMBOL(swiotlb_map_sg); /* * Unmap a set of streaming mode DMA translations. Again, cpu read rules @@ -705,7 +745,7 @@ swiotlb_unmap_sg_attrs(struct device *hw for_each_sg(sgl, sg, nelems, i) { if (sg->dma_address != sg_phys(sg)) - unmap_single(hwdev, bus_to_virt(sg->dma_address), + unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), sg->dma_length, dir); else gnttab_dma_unmap_page(sg->dma_address); @@ -719,6 +759,7 @@ swiotlb_unmap_sg(struct device *hwdev, s { return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); } +EXPORT_SYMBOL(swiotlb_unmap_sg); /* * Make physical memory consistent for a set of streaming mode DMA translations @@ -738,10 +779,11 @@ swiotlb_sync_sg_for_cpu(struct device *h for_each_sg(sgl, sg, nelems, i) { if (sg->dma_address != sg_phys(sg)) - sync_single(hwdev, bus_to_virt(sg->dma_address), + sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), sg->dma_length, dir); } } +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu); void swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sgl, @@ -754,16 +796,18 @@ swiotlb_sync_sg_for_device(struct device for_each_sg(sgl, sg, nelems, i) { if (sg->dma_address != sg_phys(sg)) - sync_single(hwdev, bus_to_virt(sg->dma_address), + sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), sg->dma_length, dir); } } +EXPORT_SYMBOL(swiotlb_sync_sg_for_device); int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) { - return (dma_addr == virt_to_bus(io_tlb_overflow_buffer)); + return (dma_addr == swiotlb_virt_to_bus(hwdev, io_tlb_overflow_buffer)); } +EXPORT_SYMBOL(swiotlb_dma_mapping_error); /* * Return whether the given PCI device DMA address mask can be supported @@ -776,14 +820,4 @@ swiotlb_dma_supported (struct device *hw { return (mask >= ((1UL << dma_bits) - 1)); } - -EXPORT_SYMBOL(swiotlb_map_single); -EXPORT_SYMBOL(swiotlb_unmap_single); -EXPORT_SYMBOL(swiotlb_map_sg); -EXPORT_SYMBOL(swiotlb_unmap_sg); -EXPORT_SYMBOL(swiotlb_sync_single_for_cpu); -EXPORT_SYMBOL(swiotlb_sync_single_for_device); -EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu); -EXPORT_SYMBOL(swiotlb_sync_sg_for_device); 
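/*
 * Matching caller pattern for the scatter-gather path above: on
 * failure swiotlb_map_sg_attrs() zeroes sgl[0].dma_length and returns
 * 0, so callers must test the return value rather than poke at
 * individual entries. Sketch only; example_map_sg is illustrative.
 */
#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/scatterlist.h>

static int example_map_sg(struct device *hwdev,
			  struct scatterlist *sgl, int nelems)
{
	int mapped = swiotlb_map_sg(hwdev, sgl, nelems, DMA_TO_DEVICE);

	if (!mapped)
		return -ENOMEM;	/* nothing was mapped, nothing to unmap */
	/* ... hand sgl[0..mapped-1] dma_address/dma_length to the device ... */
	swiotlb_unmap_sg(hwdev, sgl, mapped, DMA_TO_DEVICE);
	return 0;
}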
-EXPORT_SYMBOL(swiotlb_dma_mapping_error); EXPORT_SYMBOL(swiotlb_dma_supported);