You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
qubes-linux-kernel/patches.xen/xen3-patch-2.6.29

11363 lines
320 KiB

From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: Linux: 2.6.29
Patch-mainline: 2.6.29
This patch contains the differences between 2.6.28 and 2.6.29.
Acked-by: Jeff Mahoney <jeffm@suse.com>
Automatically created from "patches.kernel.org/patch-2.6.29" by xen-port-patches.py
--- head-2011-03-17.orig/arch/x86/Kconfig 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/Kconfig 2011-02-01 14:42:26.000000000 +0100
@@ -311,7 +311,6 @@ config X86_XEN
select X86_PAE
select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST
select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST
- select SWIOTLB
help
Choose this option if you plan to run this kernel on top of the
Xen Hypervisor.
@@ -349,7 +348,6 @@ config X86_64_XEN
bool "Enable Xen compatible kernel"
depends on X86_64
select XEN
- select SWIOTLB
help
This option will compile a kernel compatible with Xen hypervisor
@@ -747,7 +745,7 @@ config AMD_IOMMU_STATS
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
- def_bool y if X86_64
+ def_bool y if X86_64 || XEN
---help---
Support for software bounce buffers used on x86-64 systems
which don't have a hardware IOMMU (e.g. the current generation
@@ -862,7 +860,7 @@ config X86_XEN_GENAPIC
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
bool "Reroute for broken boot IRQs"
- depends on X86_IO_APIC
+ depends on X86_IO_APIC && !XEN
---help---
This option enables a workaround that fixes a source of
spurious interrupts. This is recommended when threaded
--- head-2011-03-17.orig/arch/x86/Makefile 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/Makefile 2011-02-01 14:42:26.000000000 +0100
@@ -158,8 +158,8 @@ BOOT_TARGETS = bzlilo bzdisk fdimage fdi
PHONY += bzImage vmlinuz $(BOOT_TARGETS)
ifdef CONFIG_XEN
-KBUILD_CPPFLAGS := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
- -I$(srctree)/arch/x86/include/mach-xen $(KBUILD_CPPFLAGS)
+LINUXINCLUDE := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
+ -I$(srctree)/arch/x86/include/mach-xen $(LINUXINCLUDE)
ifdef CONFIG_X86_64
LDFLAGS_vmlinux := -e startup_64
--- head-2011-03-17.orig/arch/x86/ia32/ia32entry-xen.S 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/ia32/ia32entry-xen.S 2011-02-01 14:42:26.000000000 +0100
@@ -363,9 +363,9 @@ ENTRY(ia32_syscall)
orl $TS_COMPAT,TI_status(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
jnz ia32_tracesys
-ia32_do_syscall:
cmpl $(IA32_NR_syscalls-1),%eax
- ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
+ ja ia32_badsys
+ia32_do_call:
IA32_ARG_FIXUP
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
@@ -380,7 +380,9 @@ ia32_tracesys:
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
- jmp ia32_do_syscall
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
+ jmp ia32_do_call
END(ia32_syscall)
ia32_badsys:
--- head-2011-03-17.orig/arch/x86/include/asm/hw_irq.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/asm/hw_irq.h 2011-02-01 14:42:26.000000000 +0100
@@ -145,7 +145,9 @@ extern irqreturn_t smp_call_function_sin
#endif
#endif
+#ifndef CONFIG_XEN
extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
+#endif
typedef int vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
--- head-2011-03-17.orig/arch/x86/include/asm/hypervisor.h 2011-03-17 14:35:45.000000000 +0100
+++ head-2011-03-17/arch/x86/include/asm/hypervisor.h 2011-02-01 14:42:26.000000000 +0100
@@ -60,3 +60,7 @@ static inline bool hypervisor_x2apic_ava
}
#endif
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include_next <asm/hypervisor.h>
+#endif
--- head-2011-03-17.orig/arch/x86/include/asm/kexec.h 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/arch/x86/include/asm/kexec.h 2011-02-01 14:42:26.000000000 +0100
@@ -12,13 +12,10 @@
/*
* The hypervisor interface implicitly requires that all entries (except
* for possibly the final one) are arranged in matching PA_/VA_ pairs.
+# define VA_PGD 3
*/
-# define PA_PMD_0 8
-# define VA_PMD_0 9
-# define PA_PMD_1 10
-# define VA_PMD_1 11
-# define PA_SWAP_PAGE 12
-# define PAGES_NR 13
+# define PA_SWAP_PAGE 4
+# define PAGES_NR 5
# endif /* CONFIG_XEN */
#else
# define PA_CONTROL_PAGE 0
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/desc.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/desc.h 2011-02-01 14:42:26.000000000 +0100
@@ -342,16 +342,14 @@ static inline void set_intr_gate(unsigne
_set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
}
-#define SYS_VECTOR_FREE 0
-#define SYS_VECTOR_ALLOCED 1
-
extern int first_system_vector;
-extern char system_vectors[];
+/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
+extern unsigned long used_vectors[];
static inline void alloc_system_vector(int vector)
{
- if (system_vectors[vector] == SYS_VECTOR_FREE) {
- system_vectors[vector] = SYS_VECTOR_ALLOCED;
+ if (!test_bit(vector, used_vectors)) {
+ set_bit(vector, used_vectors);
if (first_system_vector > vector)
first_system_vector = vector;
} else
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/fixmap_64.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/fixmap_64.h 2011-02-01 14:42:26.000000000 +0100
@@ -16,7 +16,6 @@
#include <asm/apicdef.h>
#include <asm/page.h>
#include <asm/vsyscall.h>
-#include <asm/efi.h>
#include <asm/acpi.h>
/*
@@ -52,11 +51,6 @@ enum fixed_addresses {
FIX_ISAMAP_END,
FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
#endif
-#ifdef CONFIG_EFI
- FIX_EFI_IO_MAP_LAST_PAGE,
- FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
- + MAX_EFI_IO_PAGES - 1,
-#endif
#ifdef CONFIG_PARAVIRT
FIX_PARAVIRT_BOOTMAP,
#else
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/highmem.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/highmem.h 2011-02-01 14:42:26.000000000 +0100
@@ -79,6 +79,7 @@ static inline void clear_user_highpage(s
clear_highpage(page);
}
#define __HAVE_ARCH_CLEAR_HIGHPAGE
+#define clear_user_highpage clear_user_highpage
#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE
void copy_highpage(struct page *to, struct page *from);
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/hypervisor.h 2011-02-01 14:42:26.000000000 +0100
@@ -69,6 +69,8 @@ extern start_info_t *xen_start_info;
#define is_initial_xendomain() 0
#endif
+#define init_hypervisor(c) ((void)((c)->x86_hyper_vendor = X86_HYPER_VENDOR_XEN))
+
struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu);
/* arch/xen/kernel/evtchn.c */
@@ -139,7 +141,7 @@ void scrub_pages(void *, unsigned int);
DECLARE_PER_CPU(bool, xen_lazy_mmu);
-int xen_multicall_flush(bool);
+void xen_multicall_flush(bool);
int __must_check xen_multi_update_va_mapping(unsigned long va, pte_t,
unsigned long flags);
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/io.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/io.h 2011-02-01 14:42:26.000000000 +0100
@@ -4,6 +4,7 @@
#define ARCH_HAS_IOREMAP_WC
#include <linux/compiler.h>
+#include <asm-generic/int-ll64.h>
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
@@ -45,21 +46,39 @@ build_mmio_write(__writel, "l", unsigned
#define mmiowb() barrier()
#ifdef CONFIG_X86_64
+
build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
-build_mmio_read(__readq, "q", unsigned long, "=r", )
build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
-build_mmio_write(__writeq, "q", unsigned long, "r", )
-#define readq_relaxed(a) __readq(a)
-#define __raw_readq __readq
-#define __raw_writeq writeq
-
-/* Let people know we have them */
-#define readq readq
-#define writeq writeq
+#else
+
+static inline __u64 readq(const volatile void __iomem *addr)
+{
+ const volatile u32 __iomem *p = addr;
+ u32 low, high;
+
+ low = readl(p);
+ high = readl(p + 1);
+
+ return low + ((u64)high << 32);
+}
+
+static inline void writeq(__u64 val, volatile void __iomem *addr)
+{
+ writel(val, addr);
+ writel(val >> 32, addr+4);
+}
+
#endif
-extern int iommu_bio_merge;
+#define readq_relaxed(a) readq(a)
+
+#define __raw_readq(a) readq(a)
+#define __raw_writeq(val, addr) writeq(val, addr)
+
+/* Let people know that we have them */
+#define readq readq
+#define writeq writeq
#define native_io_delay xen_io_delay
@@ -120,7 +139,6 @@ extern void __iomem *ioremap_wc(unsigned
* A boot-time mapping is currently limited to at most 16 pages.
*/
extern void early_ioremap_init(void);
-extern void early_ioremap_clear(void);
extern void early_ioremap_reset(void);
extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/irq_vectors.h 2011-02-15 17:31:50.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/irq_vectors.h 2011-02-15 17:32:00.000000000 +0100
@@ -24,6 +24,8 @@
#define LAST_VM86_IRQ 15
#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
+#define NR_IRQS_LEGACY 16
+
/*
* The flat IRQ space is divided into two regions:
* 1. A one-to-one mapping of real physical IRQs. This space is only used
@@ -36,8 +38,10 @@
#define PIRQ_BASE 0
#if defined(NR_CPUS) && defined(MAX_IO_APICS)
-# if NR_CPUS < MAX_IO_APICS
+# if !defined(CONFIG_SPARSE_IRQ) && NR_CPUS < MAX_IO_APICS
# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
+# elif defined(CONFIG_SPARSE_IRQ) && 8 * NR_CPUS > 32 * MAX_IO_APICS
+# define NR_PIRQS (NR_VECTORS + 8 * NR_CPUS)
# else
# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
# endif
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/mmu_context_32.h 2011-02-01 14:42:26.000000000 +0100
@@ -3,10 +3,9 @@
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
-#if 0 /* XEN: no lazy tlb */
- unsigned cpu = smp_processor_id();
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
- per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
+ x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}
@@ -38,9 +37,9 @@ static inline void switch_mm(struct mm_s
/* stop flush ipis for the previous mm */
cpu_clear(cpu, prev->cpu_vm_mask);
-#if 0 /* XEN: no lazy tlb */
- per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
- per_cpu(cpu_tlbstate, cpu).active_mm = next;
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+ x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
+ x86_write_percpu(cpu_tlbstate.active_mm, next);
#endif
cpu_set(cpu, next->cpu_vm_mask);
@@ -62,10 +61,10 @@ static inline void switch_mm(struct mm_s
BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
}
-#if 0 /* XEN: no lazy tlb */
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
else {
- per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
- BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
+ x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
/* We were in lazy tlb mode and leave_mm disabled
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pci.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/pci.h 2011-02-01 14:42:26.000000000 +0100
@@ -22,6 +22,8 @@ struct pci_sysdata {
};
extern int pci_routeirq;
+extern int noioapicquirk;
+extern int noioapicreroute;
/* scan a bus after allocating a pci_sysdata for it */
extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
@@ -88,6 +90,8 @@ static inline void pci_dma_burst_advice(
static inline void early_quirks(void) { }
#endif
+extern void pci_iommu_alloc(void);
+
#endif /* __KERNEL__ */
#ifdef CONFIG_X86_32
@@ -104,9 +108,9 @@ static inline void early_quirks(void) {
#ifdef CONFIG_NUMA
/* Returns the node based on pci bus */
-static inline int __pcibus_to_node(struct pci_bus *bus)
+static inline int __pcibus_to_node(const struct pci_bus *bus)
{
- struct pci_sysdata *sd = bus->sysdata;
+ const struct pci_sysdata *sd = bus->sysdata;
return sd->node;
}
@@ -115,6 +119,12 @@ static inline cpumask_t __pcibus_to_cpum
{
return node_to_cpumask(__pcibus_to_node(bus));
}
+
+static inline const struct cpumask *
+cpumask_of_pcibus(const struct pci_bus *bus)
+{
+ return cpumask_of_node(__pcibus_to_node(bus));
+}
#endif
#endif /* _ASM_X86_PCI_H */
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable.h 2011-02-07 15:41:11.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable.h 2011-02-01 14:42:26.000000000 +0100
@@ -22,12 +22,10 @@
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
/* If _PAGE_BIT_PRESENT is clear, we use these: */
-
-/* set: nonlinear file mapping, saved PTE; unset:swap */
-#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
-
-/* if the user mapped it with PROT_NONE; pte_present gives true */
+/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
+/* - set: nonlinear file mapping, saved PTE; unset:swap */
+#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
@@ -176,8 +174,19 @@ extern unsigned int __kernel_page_user;
#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
#endif
+/*
+ * Macro to mark a page protection value as UC-
+ */
+#define pgprot_noncached(prot) \
+ ((boot_cpu_data.x86 > 3) \
+ ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \
+ : (prot))
+
#ifndef __ASSEMBLY__
+#define pgprot_writecombine pgprot_writecombine
+extern pgprot_t pgprot_writecombine(pgprot_t prot);
+
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
@@ -309,41 +318,42 @@ static inline pte_t pte_mkspecial(pte_t
extern pteval_t __supported_pte_mask;
-static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+/*
+ * Mask out unsupported bits in a present pgprot. Non-present pgprots
+ * can use those bits for other purposes, so leave them be.
+ */
+static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
{
- pgprotval_t prot = pgprot_val(pgprot);
+ pgprotval_t protval = pgprot_val(pgprot);
+
+ if (protval & _PAGE_PRESENT)
+ protval &= __supported_pte_mask;
- if (prot & _PAGE_PRESENT)
- prot &= __supported_pte_mask;
- return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
+ return protval;
}
-static inline pte_t pfn_pte_ma(phys_addr_t page_nr, pgprot_t pgprot)
+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
- pgprotval_t prot = pgprot_val(pgprot);
+ return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) |
+ massage_pgprot(pgprot));
+}
- if (prot & _PAGE_PRESENT)
- prot &= __supported_pte_mask;
- return __pte_ma((page_nr << PAGE_SHIFT) | prot);
+static inline pte_t pfn_pte_ma(phys_addr_t page_nr, pgprot_t pgprot)
+{
+ return __pte_ma((page_nr << PAGE_SHIFT) | massage_pgprot(pgprot));
}
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
- pgprotval_t prot = pgprot_val(pgprot);
-
- if (prot & _PAGE_PRESENT)
- prot &= __supported_pte_mask;
- return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
+ return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) |
+ massage_pgprot(pgprot));
}
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
- pgprotval_t prot = pgprot_val(newprot);
pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
- if (prot & _PAGE_PRESENT)
- prot &= __supported_pte_mask;
- val |= prot & ~_PAGE_CHG_MASK;
+ val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK;
return __pte(val);
}
@@ -359,11 +369,33 @@ static inline pgprot_t pgprot_modify(pgp
#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
-#define canon_pgprot(p) __pgprot(pgprot_val(p) & _PAGE_PRESENT \
- ? pgprot_val(p) & __supported_pte_mask \
- : pgprot_val(p))
+#define canon_pgprot(p) __pgprot(massage_pgprot(p))
+
+static inline int is_new_memtype_allowed(unsigned long flags,
+ unsigned long new_flags)
+{
+ /*
+ * Certain new memtypes are not allowed with certain
+ * requested memtype:
+ * - request is uncached, return cannot be write-back
+ * - request is write-combine, return cannot be write-back
+ */
+ if ((flags == _PAGE_CACHE_UC_MINUS &&
+ new_flags == _PAGE_CACHE_WB) ||
+ (flags == _PAGE_CACHE_WC &&
+ new_flags == _PAGE_CACHE_WB)) {
+ return 0;
+ }
+
+ return 1;
+}
#ifndef __ASSEMBLY__
+#ifndef CONFIG_XEN
+/* Indicate that x86 has its own track and untrack pfn vma functions */
+#define __HAVE_PFNMAP_TRACKING
+#endif
+
#define __HAVE_PHYS_MEM_ACCESS_PROT
struct file;
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable-3level.h 2011-02-01 14:42:26.000000000 +0100
@@ -151,6 +151,7 @@ static inline int pte_none(pte_t pte)
#define PTE_FILE_MAX_BITS 32
/* Encode and de-code a swap entry */
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
#define __swp_type(x) (((x).val) & 0x1f)
#define __swp_offset(x) ((x).val >> 5)
#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-01 14:42:26.000000000 +0100
@@ -107,15 +107,6 @@ extern unsigned long pg0[];
#endif
/*
- * Macro to mark a page protection value as "uncacheable".
- * On processors which do not support it, this is a no-op.
- */
-#define pgprot_noncached(prot) \
- ((boot_cpu_data.x86 > 3) \
- ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
- : (prot))
-
-/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
*/
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-01 14:42:26.000000000 +0100
@@ -149,8 +149,8 @@ static inline void xen_pgd_clear(pgd_t *
#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-
-#define MAXMEM _AC(0x000004ffffffffff, UL)
+#define MAX_PHYSMEM_BITS 43
+#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
#define VMALLOC_START _AC(0xffffc20000000000, UL)
#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
#define VMEMMAP_START _AC(0xffffe20000000000, UL)
@@ -183,12 +183,6 @@ static inline int pmd_bad(pmd_t pmd)
#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
/*
- * Macro to mark a page protection value as "uncacheable".
- */
-#define pgprot_noncached(prot) \
- (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
-
-/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
*/
@@ -270,6 +264,8 @@ static inline int pud_large(pud_t pte)
#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
#endif
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+
#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
& ((1U << SWP_TYPE_BITS) - 1))
#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:45:14.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:45:38.000000000 +0100
@@ -121,6 +121,7 @@ struct cpuinfo_x86 {
/* Index into per_cpu list: */
u16 cpu_index;
#endif
+ unsigned int x86_hyper_vendor;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
#define X86_VENDOR_INTEL 0
@@ -134,6 +135,10 @@ struct cpuinfo_x86 {
#define X86_VENDOR_UNKNOWN 0xff
+#define X86_HYPER_VENDOR_NONE 0
+#define X86_HYPER_VENDOR_VMWARE 1
+#define X86_HYPER_VENDOR_XEN 'X'
+
/*
* capabilities of CPUs
*/
@@ -364,7 +369,7 @@ struct i387_soft_struct {
u8 no_update;
u8 rm;
u8 alimit;
- struct info *info;
+ struct math_emu_info *info;
u32 entry_eip;
};
@@ -705,6 +710,19 @@ extern void switch_to_new_gdt(void);
extern void cpu_init(void);
extern void init_gdt(int cpu);
+static inline unsigned long get_debugctlmsr(void)
+{
+ unsigned long debugctlmsr = 0;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return 0;
+#endif
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+
+ return debugctlmsr;
+}
+
static inline void update_debugctlmsr(unsigned long debugctlmsr)
{
#ifndef CONFIG_X86_DEBUGCTLMSR
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/smp.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/smp.h 2011-02-01 14:42:26.000000000 +0100
@@ -18,9 +18,26 @@
#include <asm/pda.h>
#include <asm/thread_info.h>
+#ifdef CONFIG_X86_64
+
+#define cpu_callin_mask cpu_possible_mask
+#define cpu_callout_mask cpu_possible_mask
+extern cpumask_var_t cpu_initialized_mask;
+extern cpumask_var_t cpu_sibling_setup_mask;
+
+#else /* CONFIG_X86_32 */
+
+#define cpu_callin_map cpu_possible_map
#define cpu_callout_map cpu_possible_map
extern cpumask_t cpu_initialized;
-#define cpu_callin_map cpu_possible_map
+extern cpumask_t cpu_sibling_setup_map;
+
+#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map)
+#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map)
+#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized)
+#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map)
+
+#endif /* CONFIG_X86_32 */
extern void (*mtrr_hook)(void);
extern void zap_low_mappings(void);
@@ -28,7 +45,6 @@ extern void zap_low_mappings(void);
extern int __cpuinit get_local_pda(int cpu);
extern unsigned int num_processors;
-extern cpumask_t cpu_initialized;
#ifndef CONFIG_XEN
DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
@@ -39,6 +55,16 @@ DECLARE_PER_CPU(u16, cpu_llc_id);
DECLARE_PER_CPU(int, cpu_number);
#endif
+static inline const struct cpumask *cpu_sibling_mask(int cpu)
+{
+ return cpumask_of(cpu);
+}
+
+static inline const struct cpumask *cpu_core_mask(int cpu)
+{
+ return cpumask_of(cpu);
+}
+
#ifndef CONFIG_XEN
DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
@@ -67,7 +93,7 @@ struct smp_ops {
void (*cpu_die)(unsigned int cpu);
void (*play_dead)(void);
- void (*send_call_func_ipi)(cpumask_t mask);
+ void (*send_call_func_ipi)(const struct cpumask *mask);
void (*send_call_func_single_ipi)(int cpu);
};
@@ -128,7 +154,7 @@ static inline void arch_send_call_functi
static inline void arch_send_call_function_ipi(cpumask_t mask)
{
- smp_ops.send_call_func_ipi(mask);
+ smp_ops.send_call_func_ipi(&mask);
}
void cpu_disable_common(void);
@@ -147,13 +173,13 @@ extern int __cpu_disable(void);
extern void __cpu_die(unsigned int cpu);
void xen_smp_send_stop(void);
void xen_smp_send_reschedule(int cpu);
-void xen_send_call_func_ipi(cpumask_t mask);
+void xen_send_call_func_ipi(const struct cpumask *mask);
void xen_send_call_func_single_ipi(int cpu);
#define smp_send_stop xen_smp_send_stop
#define smp_send_reschedule xen_smp_send_reschedule
#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi
-#define arch_send_call_function_ipi xen_send_call_func_ipi
+#define arch_send_call_function_ipi(m) xen_send_call_func_ipi(&(m))
void play_dead(void);
@@ -167,7 +193,7 @@ void smp_store_cpu_info(int id);
/* We don't mark CPUs online until __cpu_up(), so we need another measure */
static inline int num_booting_cpus(void)
{
- return cpus_weight(cpu_callout_map);
+ return cpumask_weight(cpu_callout_mask);
}
#else
static inline void prefill_possible_map(void)
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/spinlock.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/spinlock.h 2011-02-01 14:42:26.000000000 +0100
@@ -337,6 +337,7 @@ static inline int __raw_spin_is_contende
{
return __raw_spin(is_contended)(lock);
}
+#define __raw_spin_is_contended __raw_spin_is_contended
static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
{
--- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/system.h 2011-03-03 16:01:23.000000000 +0100
+++ head-2011-03-17/arch/x86/include/mach-xen/asm/system.h 2011-03-03 16:05:49.000000000 +0100
@@ -18,12 +18,12 @@
# define AT_VECTOR_SIZE_ARCH 1
#endif
-#ifdef CONFIG_X86_32
-
struct task_struct; /* one of the stranger aspects of C forward declarations */
struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);
+#ifdef CONFIG_X86_32
+
/*
* Saving eflags is important. It switches not only IOPL between tasks,
* it also protects other tasks from NT leaking through sysenter etc.
@@ -300,6 +300,8 @@ extern void free_init_pages(char *what,
void xen_idle(void);
+void stop_this_cpu(void *dummy);
+
/*
* Force strict CPU ordering.
* And yes, this is required on UP too when we're talking
--- head-2011-03-17.orig/arch/x86/include/asm/thread_info.h 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-17/arch/x86/include/asm/thread_info.h 2011-02-01 14:42:26.000000000 +0100
@@ -154,7 +154,7 @@ struct thread_info {
#else
#define _TIF_WORK_CTXSW (_TIF_NOTSC \
- /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/)
+ /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR */)
#endif
#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
--- head-2011-03-17.orig/arch/x86/kernel/acpi/sleep-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/acpi/sleep-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -163,6 +163,8 @@ static int __init acpi_sleep_setup(char
#ifdef CONFIG_HIBERNATION
if (strncmp(str, "s4_nohwsig", 10) == 0)
acpi_no_s4_hw_signature();
+ if (strncmp(str, "s4_nonvs", 8) == 0)
+ acpi_s4_no_nvs();
#endif
if (strncmp(str, "old_ordering", 12) == 0)
acpi_old_suspend_ordering();
--- head-2011-03-17.orig/arch/x86/kernel/apic/apic-xen.c 2011-02-24 15:49:32.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/apic/apic-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -32,7 +32,7 @@ static int __init apic_set_verbosity(cha
else if (strcmp("verbose", arg) == 0)
apic_verbosity = APIC_VERBOSE;
else {
- printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+ pr_warning("APIC Verbosity level %s not recognised"
" use apic=verbose or apic=debug\n", arg);
return -EINVAL;
}
--- head-2011-03-17.orig/arch/x86/kernel/cpu/Makefile 2011-02-03 14:29:46.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/cpu/Makefile 2011-02-01 14:42:26.000000000 +0100
@@ -34,7 +34,7 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
-disabled-obj-$(CONFIG_XEN) := perfctr-watchdog.o
+disabled-obj-$(CONFIG_XEN) := hypervisor.o perfctr-watchdog.o vmware.o
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
--- head-2011-03-17.orig/arch/x86/kernel/cpu/common-xen.c 2011-03-17 14:40:32.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/cpu/common-xen.c 2011-03-17 14:41:35.000000000 +0100
@@ -38,17 +38,45 @@
#include <asm/proto.h>
#include <asm/sections.h>
#include <asm/setup.h>
+#include <asm/hypervisor.h>
#ifdef CONFIG_XEN
#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_LOCAL_APIC)
#define phys_pkg_id(a,b) a
#endif
-#include <asm/hypervisor.h>
#include <xen/interface/callback.h>
#endif
#include "cpu.h"
+#ifdef CONFIG_X86_64
+
+/* all of these masks are initialized in setup_cpu_local_masks() */
+#ifndef CONFIG_XEN
+cpumask_var_t cpu_callin_mask;
+cpumask_var_t cpu_callout_mask;
+#endif
+cpumask_var_t cpu_initialized_mask;
+
+#ifndef CONFIG_XEN
+/* representing cpus for which sibling maps can be computed */
+cpumask_var_t cpu_sibling_setup_mask;
+#endif
+
+#else /* CONFIG_X86_32 */
+
+#ifndef CONFIG_XEN
+cpumask_t cpu_callin_map;
+cpumask_t cpu_callout_map;
+#endif
+cpumask_t cpu_initialized;
+#ifndef CONFIG_XEN
+cpumask_t cpu_sibling_setup_map;
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+
static struct cpu_dev *this_cpu __cpuinitdata;
#ifdef CONFIG_X86_64
@@ -380,7 +408,7 @@ void __cpuinit detect_ht(struct cpuinfo_
printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
} else if (smp_num_siblings > 1) {
- if (smp_num_siblings > NR_CPUS) {
+ if (smp_num_siblings > nr_cpu_ids) {
printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
smp_num_siblings);
smp_num_siblings = 1;
@@ -735,6 +763,7 @@ static void __cpuinit identify_cpu(struc
detect_ht(c);
#endif
+ init_hypervisor(c);
/*
* On SMP, boot_cpu_data holds the common feature set between
* all CPUs; so make sure that we indicate which features are
@@ -886,8 +915,6 @@ static __init int setup_disablecpuid(cha
}
__setup("clearcpuid=", setup_disablecpuid);
-cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
-
#ifdef CONFIG_X86_64
struct x8664_pda **_cpu_pda __read_mostly;
EXPORT_SYMBOL(_cpu_pda);
@@ -896,7 +923,7 @@ EXPORT_SYMBOL(_cpu_pda);
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
#endif
-char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
static void __ref switch_pt(int cpu)
{
@@ -956,8 +983,8 @@ void __cpuinit pda_init(int cpu)
}
#ifndef CONFIG_X86_NO_TSS
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
- DEBUG_STKSZ] __page_aligned_bss;
+static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
+ DEBUG_STKSZ] __page_aligned_bss;
#endif
extern asmlinkage void ignore_sysret(void);
@@ -1045,7 +1072,7 @@ void __cpuinit cpu_init(void)
me = current;
- if (cpu_test_and_set(cpu, cpu_initialized))
+ if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
panic("CPU#%d already initialized!\n", cpu);
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@ -1170,7 +1197,7 @@ void __cpuinit cpu_init(void)
#endif
struct thread_struct *thread = &curr->thread;
- if (cpu_test_and_set(cpu, cpu_initialized)) {
+ if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
for (;;) local_irq_enable();
}
--- head-2011-03-17.orig/arch/x86/kernel/cpu/intel.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/cpu/intel.c 2011-02-01 14:42:26.000000000 +0100
@@ -36,10 +36,15 @@ static void __cpuinit early_init_intel(s
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
+#ifndef CONFIG_XEN
misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
c->cpuid_level = cpuid_eax(0);
get_cpu_cap(c);
+#else
+ pr_warning("CPUID levels are restricted -"
+ " update hypervisor\n");
+#endif
}
}
--- head-2011-03-17.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/cpu/mtrr/main-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = {
struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
unsigned int num_var_ranges;
-unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static u64 tom2;
--- head-2011-03-17.orig/arch/x86/kernel/e820-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/e820-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -719,6 +719,27 @@ void __init e820_mark_nosave_regions(uns
}
}
#endif
+
+#ifdef CONFIG_HIBERNATION
+/**
+ * Mark ACPI NVS memory region, so that we can save/restore it during
+ * hibernation and the subsequent resume.
+ */
+static int __init e820_mark_nvs_memory(void)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (ei->type == E820_NVS)
+ hibernate_nvs_register(ei->addr, ei->size);
+ }
+
+ return 0;
+}
+core_initcall(e820_mark_nvs_memory);
+#endif
#endif
/*
@@ -734,22 +755,6 @@ struct early_res {
static struct early_res early_res[MAX_EARLY_RES] __initdata = {
#ifndef CONFIG_XEN
{ 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
- { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
-#endif
-#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
- /*
- * But first pinch a few for the stack/trampoline stuff
- * FIXME: Don't need the extra page at 4K, but need to fix
- * trampoline before removing it. (see the GDT stuff)
- */
- { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
- /*
- * Has to be in very low memory so we can execute
- * real-mode AP code.
- */
- { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
-#endif
#endif
{}
};
--- head-2011-03-17.orig/arch/x86/kernel/early_printk-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/early_printk-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -904,49 +904,6 @@ static struct console early_dbgp_console
};
#endif
-/* Console interface to a host file on AMD's SimNow! */
-
-static int simnow_fd;
-
-enum {
- MAGIC1 = 0xBACCD00A,
- MAGIC2 = 0xCA110000,
- XOPEN = 5,
- XWRITE = 4,
-};
-
-static noinline long simnow(long cmd, long a, long b, long c)
-{
- long ret;
-
- asm volatile("cpuid" :
- "=a" (ret) :
- "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
- return ret;
-}
-
-static void __init simnow_init(char *str)
-{
- char *fn = "klog";
-
- if (*str == '=')
- fn = ++str;
- /* error ignored */
- simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
-}
-
-static void simnow_write(struct console *con, const char *s, unsigned n)
-{
- simnow(XWRITE, simnow_fd, (unsigned long)s, n);
-}
-
-static struct console simnow_console = {
- .name = "simnow",
- .write = simnow_write,
- .flags = CON_PRINTBUFFER,
- .index = -1,
-};
-
/* Direct interface for emergencies */
static struct console *early_console = &early_vga_console;
static int __initdata early_console_initialized;
@@ -958,7 +915,7 @@ asmlinkage void early_printk(const char
va_list ap;
va_start(ap, fmt);
- n = vscnprintf(buf, 512, fmt, ap);
+ n = vscnprintf(buf, sizeof(buf), fmt, ap);
early_console->write(early_console, buf, n);
va_end(ap);
}
@@ -991,10 +948,6 @@ static int __init setup_early_printk(cha
current_ypos = boot_params.screen_info.orig_y;
#endif
early_console = &early_vga_console;
- } else if (!strncmp(buf, "simnow", 6)) {
- simnow_init(buf + 6);
- early_console = &simnow_console;
- keep_early = 1;
#ifdef CONFIG_EARLY_PRINTK_DBGP
} else if (!strncmp(buf, "dbgp", 4)) {
if (early_dbgp_init(buf+4) < 0)
--- head-2011-03-17.orig/arch/x86/kernel/entry_32-xen.S 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/entry_32-xen.S 2011-02-01 14:42:26.000000000 +0100
@@ -690,28 +690,37 @@ END(syscall_badsys)
27:;
/*
- * Build the entry stubs and pointer table with
- * some assembler magic.
+ * Build the entry stubs and pointer table with some assembler magic.
+ * We pack 7 stubs into a single 32-byte chunk, which will fit in a
+ * single cache line on all modern x86 implementations.
*/
-.section .rodata,"a"
+.section .init.rodata,"a"
ENTRY(interrupt)
.text
-
+ .p2align 5
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
RING0_INT_FRAME
-vector=0
-.rept NR_VECTORS
- ALIGN
- .if vector
+vector=FIRST_EXTERNAL_VECTOR
+.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
+ .balign 32
+ .rept 7
+ .if vector < NR_VECTORS
+ .if vector <> FIRST_EXTERNAL_VECTOR
CFI_ADJUST_CFA_OFFSET -4
- .endif
-1: pushl $~(vector)
+ .endif
+1: pushl $(~vector+0x80) /* Note: always in signed byte range */
CFI_ADJUST_CFA_OFFSET 4
- jmp common_interrupt
- .previous
+ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
+ jmp 2f
+ .endif
+ .previous
.long 1b
- .text
+ .text
vector=vector+1
+ .endif
+ .endr
+2: jmp common_interrupt
.endr
END(irq_entries_start)
@@ -723,8 +732,9 @@ END(interrupt)
* the CPU automatically disables interrupts when executing an IRQ vector,
* so IRQ-flags tracing has to follow that:
*/
- ALIGN
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
+ addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
SAVE_ALL
TRACE_IRQS_OFF
movl %esp,%eax
@@ -751,68 +761,7 @@ ENDPROC(name)
#else
#define UNWIND_ESPFIX_STACK
-#endif
-
-KPROBE_ENTRY(page_fault)
- RING0_EC_FRAME
- pushl $do_page_fault
- CFI_ADJUST_CFA_OFFSET 4
- ALIGN
-error_code:
- /* the function address is in %fs's slot on the stack */
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET es, 0*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET ds, 0*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebx, 0
- cld
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET fs, 0*/
- movl $(__KERNEL_PERCPU), %ecx
- movl %ecx, %fs
- UNWIND_ESPFIX_STACK
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_REGISTER es, ecx*/
- movl PT_FS(%esp), %edi # get the function address
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- mov %ecx, PT_FS(%esp)
- /*CFI_REL_OFFSET fs, ES*/
- movl $(__USER_DS), %ecx
- movl %ecx, %ds
- movl %ecx, %es
- TRACE_IRQS_OFF
- movl %esp,%eax # pt_regs pointer
- call *%edi
- jmp ret_from_exception
- CFI_ENDPROC
-KPROBE_END(page_fault)
-#ifdef CONFIG_XEN
# A note on the "critical region" in our callback handler.
# We want to avoid stacking callback handlers due to events occurring
# during handling of the last event. To do this, we keep events disabled
@@ -981,158 +930,6 @@ ENTRY(device_not_available)
CFI_ENDPROC
END(device_not_available)
-#ifndef CONFIG_XEN
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-#define FIX_STACK(offset, ok, label) \
- cmpw $__KERNEL_CS,4(%esp); \
- jne ok; \
-label: \
- movl SYSENTER_stack_sp0+offset(%esp),%esp; \
- CFI_DEF_CFA esp, 0; \
- CFI_UNDEFINED eip; \
- pushfl; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $__KERNEL_CS; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $sysenter_past_esp; \
- CFI_ADJUST_CFA_OFFSET 4; \
- CFI_REL_OFFSET eip, 0
-#endif /* CONFIG_XEN */
-
-KPROBE_ENTRY(debug)
- RING0_INT_FRAME
-#ifndef CONFIG_XEN
- cmpl $ia32_sysenter_target,(%esp)
- jne debug_stack_correct
- FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
-debug_stack_correct:
-#endif /* !CONFIG_XEN */
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # error code 0
- movl %esp,%eax # pt_regs pointer
- call do_debug
- jmp ret_from_exception
- CFI_ENDPROC
-KPROBE_END(debug)
-
-#ifndef CONFIG_XEN
-/*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
- */
-KPROBE_ENTRY(nmi)
- RING0_INT_FRAME
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl %ss, %eax
- cmpw $__ESPFIX_SS, %ax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- je nmi_espfix_stack
- cmpl $ia32_sysenter_target,(%esp)
- je nmi_stack_fixup
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl %esp,%eax
- /* Do not access memory above the end of our stack page,
- * it might not exist.
- */
- andl $(THREAD_SIZE-1),%eax
- cmpl $(THREAD_SIZE-20),%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- jae nmi_stack_correct
- cmpl $ia32_sysenter_target,12(%esp)
- je nmi_debug_stack_check
-nmi_stack_correct:
- /* We have a RING0_INT_FRAME here */
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_nmi
- jmp restore_nocheck_notrace
- CFI_ENDPROC
-
-nmi_stack_fixup:
- RING0_INT_FRAME
- FIX_STACK(12,nmi_stack_correct, 1)
- jmp nmi_stack_correct
-
-nmi_debug_stack_check:
- /* We have a RING0_INT_FRAME here */
- cmpw $__KERNEL_CS,16(%esp)
- jne nmi_stack_correct
- cmpl $debug,(%esp)
- jb nmi_stack_correct
- cmpl $debug_esp_fix_insn,(%esp)
- ja nmi_stack_correct
- FIX_STACK(24,nmi_stack_correct, 1)
- jmp nmi_stack_correct
-
-nmi_espfix_stack:
- /* We have a RING0_INT_FRAME here.
- *
- * create the pointer to lss back
- */
- pushl %ss
- CFI_ADJUST_CFA_OFFSET 4
- pushl %esp
- CFI_ADJUST_CFA_OFFSET 4
- addw $4, (%esp)
- /* copy the iret frame of 12 bytes */
- .rept 3
- pushl 16(%esp)
- CFI_ADJUST_CFA_OFFSET 4
- .endr
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx,%edx # zero error code
- call do_nmi
- RESTORE_REGS
- lss 12+4(%esp), %esp # back to espfix stack
- CFI_ADJUST_CFA_OFFSET -24
- jmp irq_return
- CFI_ENDPROC
-#else
-KPROBE_ENTRY(nmi)
- RING0_INT_FRAME
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_nmi
- orl $NMI_MASK, PT_EFLAGS(%esp)
- jmp restore_all
- CFI_ENDPROC
-#endif
-KPROBE_END(nmi)
-
#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
iret
@@ -1148,19 +945,6 @@ ENTRY(native_irq_enable_sysexit)
END(native_irq_enable_sysexit)
#endif
-KPROBE_ENTRY(int3)
- RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_int3
- jmp ret_from_exception
- CFI_ENDPROC
-KPROBE_END(int3)
-
ENTRY(overflow)
RING0_INT_FRAME
pushl $0
@@ -1225,14 +1009,6 @@ ENTRY(stack_segment)
CFI_ENDPROC
END(stack_segment)
-KPROBE_ENTRY(general_protection)
- RING0_EC_FRAME
- pushl $do_general_protection
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-KPROBE_END(general_protection)
-
ENTRY(alignment_check)
RING0_EC_FRAME
pushl $do_alignment_check
@@ -1292,6 +1068,7 @@ ENTRY(kernel_thread_helper)
push %eax
CFI_ADJUST_CFA_OFFSET 4
call do_exit
+ ud2 # padding for call trace
CFI_ENDPROC
ENDPROC(kernel_thread_helper)
@@ -1303,6 +1080,9 @@ ENTRY(mcount)
END(mcount)
ENTRY(ftrace_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
pushl %eax
pushl %ecx
pushl %edx
@@ -1317,6 +1097,11 @@ ftrace_call:
popl %edx
popl %ecx
popl %eax
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
.globl ftrace_stub
ftrace_stub:
@@ -1326,8 +1111,18 @@ END(ftrace_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
cmpl $ftrace_stub, ftrace_trace_function
jnz trace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpl $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
.globl ftrace_stub
ftrace_stub:
ret
@@ -1346,12 +1141,43 @@ trace:
popl %edx
popl %ecx
popl %eax
-
jmp ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %edx
+ lea 0x4(%ebp), %eax
+ subl $MCOUNT_INSN_SIZE, %edx
+ call prepare_ftrace_return
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+ pushl $0
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ call ftrace_return_to_handler
+ movl %eax, 0xc(%esp)
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+#endif
+
#include <asm/alternative-asm.h>
# pv syscall call handler stub
@@ -1485,3 +1311,238 @@ mask=0
#undef sys_fork
#undef sys_clone
#undef sys_vfork
+
+/*
+ * Some functions should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
+
+ENTRY(page_fault)
+ RING0_EC_FRAME
+ pushl $do_page_fault
+ CFI_ADJUST_CFA_OFFSET 4
+ ALIGN
+error_code:
+ /* the function address is in %fs's slot on the stack */
+ pushl %es
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET es, 0*/
+ pushl %ds
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ds, 0*/
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eax, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+ pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ cld
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0*/
+ movl $(__KERNEL_PERCPU), %ecx
+ movl %ecx, %fs
+ UNWIND_ESPFIX_STACK
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_REGISTER es, ecx*/
+ movl PT_FS(%esp), %edi # get the function address
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+ mov %ecx, PT_FS(%esp)
+ /*CFI_REL_OFFSET fs, ES*/
+ movl $(__USER_DS), %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
+ TRACE_IRQS_OFF
+ movl %esp,%eax # pt_regs pointer
+ call *%edi
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(page_fault)
+
+#ifndef CONFIG_XEN
+/*
+ * Debug traps and NMI can happen at the one SYSENTER instruction
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+ * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+ * We just load the right stack, and push the three (known) values
+ * by hand onto the new stack - while updating the return eip past
+ * the instruction that would have done it for sysenter.
+ */
+#define FIX_STACK(offset, ok, label) \
+ cmpw $__KERNEL_CS,4(%esp); \
+ jne ok; \
+label: \
+ movl TSS_sysenter_sp0+offset(%esp),%esp; \
+ CFI_DEF_CFA esp, 0; \
+ CFI_UNDEFINED eip; \
+ pushfl; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl $__KERNEL_CS; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl $sysenter_past_esp; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ CFI_REL_OFFSET eip, 0
+#endif /* CONFIG_XEN */
+
+ENTRY(debug)
+ RING0_INT_FRAME
+#ifndef CONFIG_XEN
+ cmpl $ia32_sysenter_target,(%esp)
+ jne debug_stack_correct
+ FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+debug_stack_correct:
+#endif /* !CONFIG_XEN */
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx,%edx # error code 0
+ movl %esp,%eax # pt_regs pointer
+ call do_debug
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(debug)
+
+/*
+ * NMI is doubly nasty. It can happen _while_ we're handling
+ * a debug fault, and the debug fault hasn't yet been able to
+ * clear up the stack. So we first check whether we got an
+ * NMI on the sysenter entry path, but after that we need to
+ * check whether we got an NMI on the debug path where the debug
+ * fault happened on the sysenter path.
+ */
+ENTRY(nmi)
+ RING0_INT_FRAME
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+#ifndef CONFIG_XEN
+ movl %ss, %eax
+ cmpw $__ESPFIX_SS, %ax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ je nmi_espfix_stack
+ cmpl $ia32_sysenter_target,(%esp)
+ je nmi_stack_fixup
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl %esp,%eax
+ /* Do not access memory above the end of our stack page,
+ * it might not exist.
+ */
+ andl $(THREAD_SIZE-1),%eax
+ cmpl $(THREAD_SIZE-20),%eax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ jae nmi_stack_correct
+ cmpl $ia32_sysenter_target,12(%esp)
+ je nmi_debug_stack_check
+nmi_stack_correct:
+ /* We have a RING0_INT_FRAME here */
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_nmi
+ jmp restore_nocheck_notrace
+ CFI_ENDPROC
+
+nmi_stack_fixup:
+ RING0_INT_FRAME
+ FIX_STACK(12,nmi_stack_correct, 1)
+ jmp nmi_stack_correct
+
+nmi_debug_stack_check:
+ /* We have a RING0_INT_FRAME here */
+ cmpw $__KERNEL_CS,16(%esp)
+ jne nmi_stack_correct
+ cmpl $debug,(%esp)
+ jb nmi_stack_correct
+ cmpl $debug_esp_fix_insn,(%esp)
+ ja nmi_stack_correct
+ FIX_STACK(24,nmi_stack_correct, 1)
+ jmp nmi_stack_correct
+
+nmi_espfix_stack:
+ /* We have a RING0_INT_FRAME here.
+ *
+ * create the pointer to lss back
+ */
+ pushl %ss
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %esp
+ CFI_ADJUST_CFA_OFFSET 4
+ addw $4, (%esp)
+ /* copy the iret frame of 12 bytes */
+ .rept 3
+ pushl 16(%esp)
+ CFI_ADJUST_CFA_OFFSET 4
+ .endr
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ FIXUP_ESPFIX_STACK # %eax == %esp
+ xorl %edx,%edx # zero error code
+ call do_nmi
+ RESTORE_REGS
+ lss 12+4(%esp), %esp # back to espfix stack
+ CFI_ADJUST_CFA_OFFSET -24
+ jmp irq_return
+#else
+ SAVE_ALL
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_nmi
+ orl $NMI_MASK, PT_EFLAGS(%esp)
+ jmp restore_all
+#endif
+ CFI_ENDPROC
+END(nmi)
+
+ENTRY(int3)
+ RING0_INT_FRAME
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_int3
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(int3)
+
+ENTRY(general_protection)
+ RING0_EC_FRAME
+ pushl $do_general_protection
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(general_protection)
+
+/*
+ * End of kprobes section
+ */
+ .popsection
--- head-2011-03-17.orig/arch/x86/kernel/entry_64-xen.S 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/entry_64-xen.S 2011-02-01 14:42:26.000000000 +0100
@@ -14,15 +14,15 @@
*
* NOTE: This code handles signal-recognition, which happens every time
* after an interrupt and after each system call.
- *
- * Normal syscalls and interrupts don't save a full stack frame, this is
+ *
+ * Normal syscalls and interrupts don't save a full stack frame, this is
* only done for syscall tracing, signals or fork/exec et.al.
- *
- * A note on terminology:
- * - top of stack: Architecture defined interrupt frame from SS to RIP
- * at the top of the kernel process stack.
+ *
+ * A note on terminology:
+ * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
* - partial stack frame: partially saved registers upto R11.
- * - full stack frame: Like partial stack frame, but all register saved.
+ * - full stack frame: Like partial stack frame, but all register saved.
*
* Some macro usage:
* - CFI macros are used to generate dwarf2 unwind information for better
@@ -65,7 +65,6 @@
#define __AUDIT_ARCH_LE 0x40000000
.code64
-
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
@@ -73,16 +72,10 @@ ENTRY(mcount)
END(mcount)
ENTRY(ftrace_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
- /* taken from glibc */
- subq $0x38, %rsp
- movq %rax, (%rsp)
- movq %rcx, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rsi, 24(%rsp)
- movq %rdi, 32(%rsp)
- movq %r8, 40(%rsp)
- movq %r9, 48(%rsp)
+ MCOUNT_SAVE_FRAME
movq 0x38(%rsp), %rdi
movq 8(%rbp), %rsi
@@ -92,14 +85,13 @@ ENTRY(ftrace_caller)
ftrace_call:
call ftrace_stub
- movq 48(%rsp), %r9
- movq 40(%rsp), %r8
- movq 32(%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 16(%rsp), %rdx
- movq 8(%rsp), %rcx
- movq (%rsp), %rax
- addq $0x38, %rsp
+ MCOUNT_RESTORE_FRAME
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
.globl ftrace_stub
ftrace_stub:
@@ -108,15 +100,63 @@ END(ftrace_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpq $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+
.globl ftrace_stub
ftrace_stub:
retq
trace:
- /* taken from glibc */
- subq $0x38, %rsp
+ MCOUNT_SAVE_FRAME
+
+ movq 0x38(%rsp), %rdi
+ movq 8(%rbp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+ call *ftrace_trace_function
+
+ MCOUNT_RESTORE_FRAME
+
+ jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ MCOUNT_SAVE_FRAME
+
+ leaq 8(%rbp), %rdi
+ movq 0x38(%rsp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rsi
+
+ call prepare_ftrace_return
+
+ MCOUNT_RESTORE_FRAME
+
+ retq
+END(ftrace_graph_caller)
+
+
+.globl return_to_handler
+return_to_handler:
+ subq $80, %rsp
+
movq %rax, (%rsp)
movq %rcx, 8(%rsp)
movq %rdx, 16(%rsp)
@@ -124,13 +164,14 @@ trace:
movq %rdi, 32(%rsp)
movq %r8, 40(%rsp)
movq %r9, 48(%rsp)
+ movq %r10, 56(%rsp)
+ movq %r11, 64(%rsp)
- movq 0x38(%rsp), %rdi
- movq 8(%rbp), %rsi
- subq $MCOUNT_INSN_SIZE, %rdi
-
- call *ftrace_trace_function
+ call ftrace_return_to_handler
+ movq %rax, 72(%rsp)
+ movq 64(%rsp), %r11
+ movq 56(%rsp), %r10
movq 48(%rsp), %r9
movq 40(%rsp), %r8
movq 32(%rsp), %rdi
@@ -138,16 +179,14 @@ trace:
movq 16(%rsp), %rdx
movq 8(%rsp), %rcx
movq (%rsp), %rax
- addq $0x38, %rsp
+ addq $72, %rsp
+ retq
+#endif
- jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
-#endif
+#endif
.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
@@ -162,20 +201,20 @@ END(mcount)
NMI_MASK = 0x80000000
/*
- * C code is not supposed to know about undefined top of stack. Every time
- * a C function with an pt_regs argument is called from the SYSCALL based
+ * C code is not supposed to know about undefined top of stack. Every time
+ * a C function with an pt_regs argument is called from the SYSCALL based
* fast path FIXUP_TOP_OF_STACK is needed.
* RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
* manipulation.
- */
-
- /* %rsp:at FRAMEEND */
- .macro FIXUP_TOP_OF_STACK tmp
- movq $__USER_CS,CS(%rsp)
- movq $-1,RCX(%rsp)
+ */
+
+ /* %rsp:at FRAMEEND */
+ .macro FIXUP_TOP_OF_STACK tmp offset=0
+ movq $__USER_CS,CS+\offset(%rsp)
+ movq $-1,RCX+\offset(%rsp)
.endm
- .macro RESTORE_TOP_OF_STACK tmp,offset=0
+ .macro RESTORE_TOP_OF_STACK tmp offset=0
.endm
.macro FAKE_STACK_FRAME child_rip
@@ -187,7 +226,7 @@ NMI_MASK = 0x80000000
pushq %rax /* rsp */
CFI_ADJUST_CFA_OFFSET 8
CFI_REL_OFFSET rsp,0
- pushq $(1<<9) /* eflags - interrupts on */
+ pushq $X86_EFLAGS_IF /* eflags - interrupts on */
CFI_ADJUST_CFA_OFFSET 8
/*CFI_REL_OFFSET rflags,0*/
pushq $__KERNEL_CS /* cs */
@@ -205,36 +244,80 @@ NMI_MASK = 0x80000000
CFI_ADJUST_CFA_OFFSET -(6*8)
.endm
- .macro CFI_DEFAULT_STACK start=1,adj=0
+/*
+ * initial frame state for interrupts (and exceptions without error code)
+ */
+ .macro EMPTY_FRAME start=1 offset=0
.if \start
- CFI_STARTPROC simple
+ CFI_STARTPROC simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8 - \adj*ARGOFFSET
+ CFI_DEF_CFA rsp,8+\offset
.else
- CFI_DEF_CFA_OFFSET SS+8 - \adj*ARGOFFSET
+ CFI_DEF_CFA_OFFSET 8+\offset
.endif
- .if \adj == 0
- CFI_REL_OFFSET r15,R15
- CFI_REL_OFFSET r14,R14
- CFI_REL_OFFSET r13,R13
- CFI_REL_OFFSET r12,R12
- CFI_REL_OFFSET rbp,RBP
- CFI_REL_OFFSET rbx,RBX
+ .endm
+
+/*
+ * initial frame state for syscall
+ */
+ .macro BASIC_FRAME start=1 offset=0
+ EMPTY_FRAME \start, SS+8+\offset-RIP
+ /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
+ CFI_REL_OFFSET rsp, RSP+\offset-RIP
+ /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
+ /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
+ CFI_REL_OFFSET rip, RIP+\offset-RIP
+ .endm
+
+/*
+ * initial frame state for interrupts (and exceptions without error code)
+ */
+ .macro INTR_FRAME start=1 offset=0
+ .if \start == 1
+ BASIC_FRAME 1, \offset+2*8
+ CFI_REL_OFFSET rcx, 0+\offset
+ CFI_REL_OFFSET r11, 8+\offset
+ .else
+ BASIC_FRAME \start, \offset
.endif
- CFI_REL_OFFSET r11,R11 - \adj*ARGOFFSET
- CFI_REL_OFFSET r10,R10 - \adj*ARGOFFSET
- CFI_REL_OFFSET r9,R9 - \adj*ARGOFFSET
- CFI_REL_OFFSET r8,R8 - \adj*ARGOFFSET
- CFI_REL_OFFSET rax,RAX - \adj*ARGOFFSET
- CFI_REL_OFFSET rcx,RCX - \adj*ARGOFFSET
- CFI_REL_OFFSET rdx,RDX - \adj*ARGOFFSET
- CFI_REL_OFFSET rsi,RSI - \adj*ARGOFFSET
- CFI_REL_OFFSET rdi,RDI - \adj*ARGOFFSET
- CFI_REL_OFFSET rip,RIP - \adj*ARGOFFSET
- /*CFI_REL_OFFSET cs,CS - \adj*ARGOFFSET*/
- /*CFI_REL_OFFSET rflags,EFLAGS - \adj*ARGOFFSET*/
- CFI_REL_OFFSET rsp,RSP - \adj*ARGOFFSET
- /*CFI_REL_OFFSET ss,SS - \adj*ARGOFFSET*/
+ .endm
+
+/*
+ * initial frame state for exceptions with error code (and interrupts
+ * with vector already pushed)
+ */
+ .macro XCPT_FRAME start=1 offset=0
+ INTR_FRAME \start, RIP+\offset-ORIG_RAX
+ /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
+ .endm
+
+/*
+ * frame that enables calling into C.
+ */
+ .macro PARTIAL_FRAME start=1 offset=0
+ XCPT_FRAME 2*\start, ORIG_RAX+\offset-ARGOFFSET
+ CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
+ CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
+ CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
+ CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
+ CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
+ CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
+ CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
+ CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
+ CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
+ .endm
+
+/*
+ * frame that enables passing a complete pt_regs to a C function.
+ */
+ .macro DEFAULT_FRAME start=1 offset=0
+ PARTIAL_FRAME \start, R11+\offset-R15
+ CFI_REL_OFFSET rbx, RBX+\offset
+ CFI_REL_OFFSET rbp, RBP+\offset
+ CFI_REL_OFFSET r12, R12+\offset
+ CFI_REL_OFFSET r13, R13+\offset
+ CFI_REL_OFFSET r14, R14+\offset
+ CFI_REL_OFFSET r15, R15+\offset
.endm
/*
@@ -264,70 +347,149 @@ NMI_MASK = 0x80000000
jmp hypercall_page + (__HYPERVISOR_iret * 32)
.endm
+#ifndef CONFIG_XEN
+/* save partial stack frame */
+ENTRY(save_args)
+ XCPT_FRAME
+ cld
+ movq_cfi rdi, RDI+16-ARGOFFSET
+ movq_cfi rsi, RSI+16-ARGOFFSET
+ movq_cfi rdx, RDX+16-ARGOFFSET
+ movq_cfi rcx, RCX+16-ARGOFFSET
+ movq_cfi rax, RAX+16-ARGOFFSET
+ movq_cfi r8, R8+16-ARGOFFSET
+ movq_cfi r9, R9+16-ARGOFFSET
+ movq_cfi r10, R10+16-ARGOFFSET
+ movq_cfi r11, R11+16-ARGOFFSET
+
+ leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
+ movq_cfi rbp, 8 /* push %rbp */
+ leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
+ testl $3, CS(%rdi)
+ je 1f
+ SWAPGS
+ /*
+ * irqcount is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+ * a little cheaper to use a separate counter in the PDA (short of
+ * moving irq_enter into assembly, which would be too much work)
+ */
+1: incl %gs:pda_irqcount
+ jne 2f
+ popq_cfi %rax /* move return address... */
+ mov %gs:pda_irqstackptr,%rsp
+ EMPTY_FRAME 0
+ pushq_cfi %rbp /* backlink for unwinder */
+ pushq_cfi %rax /* ... to the new stack */
+ /*
+ * We entered an interrupt context - irqs are off:
+ */
+2: TRACE_IRQS_OFF
+ ret
+ CFI_ENDPROC
+END(save_args)
+#endif
+
+ENTRY(save_rest)
+ PARTIAL_FRAME 1 REST_SKIP+8
+ movq 5*8+16(%rsp), %r11 /* save return address */
+ movq_cfi rbx, RBX+16
+ movq_cfi rbp, RBP+16
+ movq_cfi r12, R12+16
+ movq_cfi r13, R13+16
+ movq_cfi r14, R14+16
+ movq_cfi r15, R15+16
+ movq %r11, 8(%rsp) /* return address */
+ FIXUP_TOP_OF_STACK %r11, 16
+ ret
+ CFI_ENDPROC
+END(save_rest)
+
+#ifndef CONFIG_XEN
+/* save complete stack frame */
+ENTRY(save_paranoid)
+ XCPT_FRAME 1 RDI+8
+ cld
+ movq_cfi rdi, RDI+8
+ movq_cfi rsi, RSI+8
+ movq_cfi rdx, RDX+8
+ movq_cfi rcx, RCX+8
+ movq_cfi rax, RAX+8
+ movq_cfi r8, R8+8
+ movq_cfi r9, R9+8
+ movq_cfi r10, R10+8
+ movq_cfi r11, R11+8
+ movq_cfi rbx, RBX+8
+ movq_cfi rbp, RBP+8
+ movq_cfi r12, R12+8
+ movq_cfi r13, R13+8
+ movq_cfi r14, R14+8
+ movq_cfi r15, R15+8
+ movl $1,%ebx
+ movl $MSR_GS_BASE,%ecx
+ rdmsr
+ testl %edx,%edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx,%ebx
+1: ret
+ CFI_ENDPROC
+END(save_paranoid)
+#endif
+
/*
- * A newly forked process directly context switches into this.
- */
-/* rdi: prev */
+ * A newly forked process directly context switches into this address.
+ *
+ * rdi: prev task we switched from
+ */
ENTRY(ret_from_fork)
- CFI_DEFAULT_STACK
+ DEFAULT_FRAME
+
push kernel_eflags(%rip)
CFI_ADJUST_CFA_OFFSET 8
- popf # reset kernel eflags
+ popf # reset kernel eflags
CFI_ADJUST_CFA_OFFSET -8
- call schedule_tail
+
+ call schedule_tail # rdi: 'prev' task parameter
+
GET_THREAD_INFO(%rcx)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
- jnz rff_trace
-rff_action:
+
+ CFI_REMEMBER_STATE
RESTORE_REST
- testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
+
+ testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
je int_ret_from_sys_call
- testl $_TIF_IA32,TI_flags(%rcx)
+
+ testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
jnz int_ret_from_sys_call
- RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
- jmp ret_from_sys_call
-rff_trace:
- movq %rsp,%rdi
- call syscall_trace_leave
- GET_THREAD_INFO(%rcx)
- jmp rff_action
+
+ RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
+ jmp ret_from_sys_call # go to the SYSRET fastpath
+
+ CFI_RESTORE_STATE
CFI_ENDPROC
END(ret_from_fork)
/*
- * initial frame state for interrupts and exceptions
- */
- .macro _frame ref
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-\ref
- /*CFI_REL_OFFSET ss,SS-\ref*/
- CFI_REL_OFFSET rsp,RSP-\ref
- /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
- /*CFI_REL_OFFSET cs,CS-\ref*/
- CFI_REL_OFFSET rip,RIP-\ref
- .endm
-
-/*
* System call entry. Upto 6 arguments in registers are supported.
*
* SYSCALL does not save anything on the stack and does not change the
* stack pointer.
*/
-
+
/*
- * Register setup:
+ * Register setup:
* rax system call number
* rdi arg0
- * rcx return address for syscall/sysret, C arg3
+ * rcx return address for syscall/sysret, C arg3
* rsi arg1
- * rdx arg2
+ * rdx arg2
* r10 arg3 (--> moved to rcx for C)
* r8 arg4
* r9 arg5
* r11 eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.
- *
+ * r12-r15,rbp,rbx saved by C code, not touched.
+ *
* Interrupts are enabled on entry.
* Only called from user space.
*
@@ -337,10 +499,10 @@ END(ret_from_fork)
* When user can change the frames always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
- */
+ */
ENTRY(system_call)
- _frame (RIP-0x10)
+ INTR_FRAME start=2 offset=2*8
SAVE_ARGS -8,0
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
GET_THREAD_INFO(%rcx)
@@ -354,19 +516,19 @@ system_call_fastpath:
movq %rax,RAX-ARGOFFSET(%rsp)
/*
* Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack.
- */
+ * Has incomplete stack frame and undefined top of stack.
+ */
ret_from_sys_call:
movl $_TIF_ALLWORK_MASK,%edi
/* edi: flagmask */
-sysret_check:
+sysret_check:
LOCKDEP_SYS_EXIT
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
movl TI_flags(%rcx),%edx
andl %edi,%edx
- jnz sysret_careful
+ jnz sysret_careful
CFI_REMEMBER_STATE
/*
* sysretq will re-enable interrupts:
@@ -378,7 +540,7 @@ sysret_check:
CFI_RESTORE_STATE
/* Handle reschedules */
- /* edx: work, edi: workmask */
+ /* edx: work, edi: workmask */
sysret_careful:
bt $TIF_NEED_RESCHED,%edx
jnc sysret_signal
@@ -391,7 +553,7 @@ sysret_careful:
CFI_ADJUST_CFA_OFFSET -8
jmp sysret_check
- /* Handle a signal */
+ /* Handle a signal */
sysret_signal:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
@@ -400,17 +562,20 @@ sysret_signal:
jc sysret_audit
#endif
/* edx: work flags (arg3) */
- leaq do_notify_resume(%rip),%rax
leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
xorl %esi,%esi # oldset -> arg2
- call ptregscall_common
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ call do_notify_resume
+ RESTORE_TOP_OF_STACK %r11
+ RESTORE_REST
movl $_TIF_WORK_MASK,%edi
/* Use IRET because user could have changed frame. This
works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
-
+
badsys:
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
jmp ret_from_sys_call
@@ -449,7 +614,7 @@ sysret_audit:
#endif /* CONFIG_AUDITSYSCALL */
/* Do syscall tracing */
-tracesys:
+tracesys:
#ifdef CONFIG_AUDITSYSCALL
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
jz auditsys
@@ -472,8 +637,8 @@ tracesys:
call *sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
/* Use IRET because user could have changed frame */
-
-/*
+
+/*
* Syscall return path ending with IRET.
* Has correct top of stack, but partial stack frame.
*/
@@ -521,18 +686,18 @@ int_very_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
- /* Check for syscall exit trace */
+ /* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
- leaq 8(%rsp),%rdi # &ptregs -> arg1
+ leaq 8(%rsp),%rdi # &ptregs -> arg1
call syscall_trace_leave
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
jmp int_restore_rest
-
+
int_signal:
testl $_TIF_DO_NOTIFY_MASK,%edx
jz 1f
@@ -547,22 +712,24 @@ int_restore_rest:
jmp int_with_check
CFI_ENDPROC
END(system_call)
-
-/*
+
+/*
* Certain special system calls that need to save a complete full stack frame.
- */
-
+ */
.macro PTREGSCALL label,func,arg
- .globl \label
-\label:
- leaq \func(%rip),%rax
- leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
- jmp ptregscall_common
+ENTRY(\label)
+ PARTIAL_FRAME 1 8 /* offset 8: return address */
+ subq $REST_SKIP, %rsp
+ CFI_ADJUST_CFA_OFFSET REST_SKIP
+ call save_rest
+ DEFAULT_FRAME 0 8 /* offset 8: return address */
+ leaq 8(%rsp), \arg /* pt_regs pointer */
+ call \func
+ jmp ptregscall_common
+ CFI_ENDPROC
END(\label)
.endm
- CFI_STARTPROC
-
PTREGSCALL stub_clone, sys_clone, %r8
PTREGSCALL stub_fork, sys_fork, %rdi
PTREGSCALL stub_vfork, sys_vfork, %rdi
@@ -570,25 +737,18 @@ END(\label)
PTREGSCALL stub_iopl, sys_iopl, %rsi
ENTRY(ptregscall_common)
- popq %r11
- CFI_ADJUST_CFA_OFFSET -8
- CFI_REGISTER rip, r11
- SAVE_REST
- movq %r11, %r15
- CFI_REGISTER rip, r15
- FIXUP_TOP_OF_STACK %r11
- call *%rax
- RESTORE_TOP_OF_STACK %r11
- movq %r15, %r11
- CFI_REGISTER rip, r11
- RESTORE_REST
- pushq %r11
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rip, 0
- ret
+ DEFAULT_FRAME 1 8 /* offset 8: return address */
+ RESTORE_TOP_OF_STACK %r11, 8
+ movq_cfi_restore R15+8, r15
+ movq_cfi_restore R14+8, r14
+ movq_cfi_restore R13+8, r13
+ movq_cfi_restore R12+8, r12
+ movq_cfi_restore RBP+8, rbp
+ movq_cfi_restore RBX+8, rbx
+ ret $REST_SKIP /* pop extended registers */
CFI_ENDPROC
END(ptregscall_common)
-
+
ENTRY(stub_execve)
CFI_STARTPROC
popq %r11
@@ -604,11 +764,11 @@ ENTRY(stub_execve)
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_execve)
-
+
/*
* sigreturn is special because it needs to restore all registers on return.
* This cannot be done with SYSRET, so use the IRET return path instead.
- */
+ */
ENTRY(stub_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
@@ -623,24 +783,12 @@ ENTRY(stub_rt_sigreturn)
CFI_ENDPROC
END(stub_rt_sigreturn)
-/* initial frame state for interrupts (and exceptions without error code) */
-#define INTR_FRAME _frame (RIP-0x10); \
- CFI_REL_OFFSET rcx,0; \
- CFI_REL_OFFSET r11,8
-
-/* initial frame state for exceptions with error code (and interrupts with
- vector already pushed) */
-#define XCPT_FRAME _frame (RIP-0x18); \
- CFI_REL_OFFSET rcx,0; \
- CFI_REL_OFFSET r11,8
-
-/*
+/*
* Interrupt exit.
- *
*/
retint_with_reschedule:
- CFI_DEFAULT_STACK adj=1
+ PARTIAL_FRAME
movl $_TIF_WORK_MASK,%edi
retint_check:
LOCKDEP_SYS_EXIT_IRQ
@@ -669,20 +817,20 @@ retint_careful:
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
call schedule
- popq %rdi
+ popq %rdi
CFI_ADJUST_CFA_OFFSET -8
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp retint_check
-
+
retint_signal:
testl $_TIF_DO_NOTIFY_MASK,%edx
jz retint_restore_args
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
- movq $-1,ORIG_RAX(%rsp)
+ movq $-1,ORIG_RAX(%rsp)
xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
call do_notify_resume
@@ -704,324 +852,132 @@ ENTRY(retint_kernel)
jnc retint_restore_args
call preempt_schedule_irq
jmp retint_kernel /* check again */
-#endif
+#endif
CFI_ENDPROC
END(retint_check)
-
+
#ifndef CONFIG_XEN
/*
* APIC interrupts.
- */
- .macro apicinterrupt num,func
+ */
+.macro apicinterrupt num sym do_sym
+ENTRY(\sym)
INTR_FRAME
pushq $~(\num)
CFI_ADJUST_CFA_OFFSET 8
- interrupt \func
+ interrupt \do_sym
jmp error_entry
CFI_ENDPROC
- .endm
+END(\sym)
+.endm
-ENTRY(thermal_interrupt)
- apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
-END(thermal_interrupt)
-
-ENTRY(threshold_interrupt)
- apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
-END(threshold_interrupt)
-
-#ifdef CONFIG_SMP
-ENTRY(reschedule_interrupt)
- apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
-END(reschedule_interrupt)
-
- .macro INVALIDATE_ENTRY num
-ENTRY(invalidate_interrupt\num)
- apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
-END(invalidate_interrupt\num)
- .endm
+#ifdef CONFIG_SMP
+apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
+ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
+#endif
+
+apicinterrupt UV_BAU_MESSAGE \
+ uv_bau_message_intr1 uv_bau_message_interrupt
+apicinterrupt LOCAL_TIMER_VECTOR \
+ apic_timer_interrupt smp_apic_timer_interrupt
+
+#ifdef CONFIG_SMP
+apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
+ invalidate_interrupt0 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
+ invalidate_interrupt1 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
+ invalidate_interrupt2 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
+ invalidate_interrupt3 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
+ invalidate_interrupt4 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
+ invalidate_interrupt5 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
+ invalidate_interrupt6 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
+ invalidate_interrupt7 smp_invalidate_interrupt
+#endif
- INVALIDATE_ENTRY 0
- INVALIDATE_ENTRY 1
- INVALIDATE_ENTRY 2
- INVALIDATE_ENTRY 3
- INVALIDATE_ENTRY 4
- INVALIDATE_ENTRY 5
- INVALIDATE_ENTRY 6
- INVALIDATE_ENTRY 7
-
-ENTRY(call_function_interrupt)
- apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
-END(call_function_interrupt)
-ENTRY(call_function_single_interrupt)
- apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
-END(call_function_single_interrupt)
-ENTRY(irq_move_cleanup_interrupt)
- apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
-END(irq_move_cleanup_interrupt)
+apicinterrupt THRESHOLD_APIC_VECTOR \
+ threshold_interrupt mce_threshold_interrupt
+apicinterrupt THERMAL_APIC_VECTOR \
+ thermal_interrupt smp_thermal_interrupt
+
+#ifdef CONFIG_SMP
+apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
+ call_function_single_interrupt smp_call_function_single_interrupt
+apicinterrupt CALL_FUNCTION_VECTOR \
+ call_function_interrupt smp_call_function_interrupt
+apicinterrupt RESCHEDULE_VECTOR \
+ reschedule_interrupt smp_reschedule_interrupt
#endif
-ENTRY(apic_timer_interrupt)
- apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
-END(apic_timer_interrupt)
-
-ENTRY(uv_bau_message_intr1)
- apicinterrupt 220,uv_bau_message_interrupt
-END(uv_bau_message_intr1)
-
-ENTRY(error_interrupt)
- apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
-END(error_interrupt)
-
-ENTRY(spurious_interrupt)
- apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
-END(spurious_interrupt)
+apicinterrupt ERROR_APIC_VECTOR \
+ error_interrupt smp_error_interrupt
+apicinterrupt SPURIOUS_APIC_VECTOR \
+ spurious_interrupt smp_spurious_interrupt
#endif /* !CONFIG_XEN */
-
+
/*
* Exception entry points.
- */
- .macro zeroentry sym
+ */
+.macro zeroentry sym do_sym
+ENTRY(\sym)
INTR_FRAME
movq (%rsp),%rcx
CFI_RESTORE rcx
movq 8(%rsp),%r11
CFI_RESTORE r11
- addq $0x10,%rsp /* skip rcx and r11 */
- CFI_ADJUST_CFA_OFFSET -0x10
- pushq $0 /* push error code/oldrax */
- CFI_ADJUST_CFA_OFFSET 8
- pushq %rax /* push real oldrax to the rdi slot */
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rax,0
- leaq \sym(%rip),%rax
- jmp error_entry
+ movq $-1,8(%rsp) /* ORIG_RAX: no syscall to restart */
+ subq $(15-1)*8,%rsp
+ CFI_ADJUST_CFA_OFFSET (15-1)*8
+ call error_entry
+ DEFAULT_FRAME 0
+ movq %rsp,%rdi /* pt_regs pointer */
+ xorl %esi,%esi /* no error code */
+ call \do_sym
+ jmp error_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
- .endm
+END(\sym)
+.endm
+
+.macro paranoidzeroentry sym do_sym
+ zeroentry \sym \do_sym
+.endm
+
+.macro paranoidzeroentry_ist sym do_sym ist
+ zeroentry \sym \do_sym
+.endm
- .macro errorentry sym
+.macro errorentry sym do_sym
+ENTRY(\sym)
XCPT_FRAME
movq (%rsp),%rcx
CFI_RESTORE rcx
movq 8(%rsp),%r11
CFI_RESTORE r11
- addq $0x10,%rsp /* rsp points to the error code */
- CFI_ADJUST_CFA_OFFSET -0x10
- pushq %rax
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rax,0
- leaq \sym(%rip),%rax
- jmp error_entry
+ subq $(15-2)*8,%rsp
+ CFI_ADJUST_CFA_OFFSET (15-2)*8
+ call error_entry
+ DEFAULT_FRAME 0
+ movq %rsp,%rdi /* pt_regs pointer */
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ call \do_sym
+ jmp error_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
- .endm
+END(\sym)
+.endm
-#if 0 /* not XEN */
/* error code is on the stack already */
- /* handle NMI like exceptions that can happen everywhere */
- .macro paranoidentry sym, ist=0, irqtrace=1
- movq (%rsp),%rcx
- movq 8(%rsp),%r11
- addq $0x10,%rsp /* skip rcx and r11 */
- SAVE_ALL
- cld
-#if 0 /* not XEN */
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
- rdmsr
- testl %edx,%edx
- js 1f
- SWAPGS
- xorl %ebx,%ebx
-1:
-#endif
- .if \ist
- movq %gs:pda_data_offset, %rbp
- .endif
- .if \irqtrace
- TRACE_IRQS_OFF
- .endif
- movq %rsp,%rdi
- movq ORIG_RAX(%rsp),%rsi
- movq $-1,ORIG_RAX(%rsp)
- .if \ist
- subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
- .endif
- call \sym
- .if \ist
- addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
- .endif
- DISABLE_INTERRUPTS(CLBR_NONE)
- .if \irqtrace
- TRACE_IRQS_OFF
- .endif
- .endm
-
- /*
- * "Paranoid" exit path from exception stack.
- * Paranoid because this is used by NMIs and cannot take
- * any kernel state for granted.
- * We don't do kernel preemption checks here, because only
- * NMI should be common and it does not enable IRQs and
- * cannot get reschedule ticks.
- *
- * "trace" is 0 for the NMI handler only, because irq-tracing
- * is fundamentally NMI-unsafe. (we cannot change the soft and
- * hard flags at once, atomically)
- */
- .macro paranoidexit trace=1
- /* ebx: no swapgs flag */
-paranoid_exit\trace:
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore\trace
- testl $3,CS(%rsp)
- jnz paranoid_userspace\trace
-paranoid_swapgs\trace:
- .if \trace
- TRACE_IRQS_IRETQ 0
- .endif
- SWAPGS_UNSAFE_STACK
-paranoid_restore\trace:
- RESTORE_ALL 8
- jmp irq_return
-paranoid_userspace\trace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs\trace
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz paranoid_schedule\trace
- movl %ebx,%edx /* arg3: thread flags */
- .if \trace
- TRACE_IRQS_ON
- .endif
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- .if \trace
- TRACE_IRQS_OFF
- .endif
- jmp paranoid_userspace\trace
-paranoid_schedule\trace:
- .if \trace
- TRACE_IRQS_ON
- .endif
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- .if \trace
- TRACE_IRQS_OFF
- .endif
- jmp paranoid_userspace\trace
- CFI_ENDPROC
- .endm
-#endif
+.macro paranoiderrorentry sym do_sym
+ errorentry \sym \do_sym
+.endm
/*
- * Exception entry point. This expects an error code/orig_rax on the stack
- * and the exception handler in %rax.
- */
-KPROBE_ENTRY(error_entry)
- _frame RDI
- CFI_REL_OFFSET rax,0
- /* rdi slot contains rax, oldrax contains error code */
- cld
- subq $14*8,%rsp
- CFI_ADJUST_CFA_OFFSET (14*8)
- movq %rsi,13*8(%rsp)
- CFI_REL_OFFSET rsi,RSI
- movq 14*8(%rsp),%rsi /* load rax from rdi slot */
- CFI_REGISTER rax,rsi
- movq %rdx,12*8(%rsp)
- CFI_REL_OFFSET rdx,RDX
- movq %rcx,11*8(%rsp)
- CFI_REL_OFFSET rcx,RCX
- movq %rsi,10*8(%rsp) /* store rax */
- CFI_REL_OFFSET rax,RAX
- movq %r8, 9*8(%rsp)
- CFI_REL_OFFSET r8,R8
- movq %r9, 8*8(%rsp)
- CFI_REL_OFFSET r9,R9
- movq %r10,7*8(%rsp)
- CFI_REL_OFFSET r10,R10
- movq %r11,6*8(%rsp)
- CFI_REL_OFFSET r11,R11
- movq %rbx,5*8(%rsp)
- CFI_REL_OFFSET rbx,RBX
- movq %rbp,4*8(%rsp)
- CFI_REL_OFFSET rbp,RBP
- movq %r12,3*8(%rsp)
- CFI_REL_OFFSET r12,R12
- movq %r13,2*8(%rsp)
- CFI_REL_OFFSET r13,R13
- movq %r14,1*8(%rsp)
- CFI_REL_OFFSET r14,R14
- movq %r15,(%rsp)
- CFI_REL_OFFSET r15,R15
-#if 0
- cmpl $__KERNEL_CS,CS(%rsp)
- CFI_REMEMBER_STATE
- je error_kernelspace
-#endif
-error_call_handler:
- movq %rdi, RDI(%rsp)
- CFI_REL_OFFSET rdi,RDI
- movq %rsp,%rdi
- movq ORIG_RAX(%rsp),%rsi # get error code
- movq $-1,ORIG_RAX(%rsp)
- call *%rax
-error_exit:
- RESTORE_REST
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- testb $3,CS-ARGOFFSET(%rsp)
- jz retint_kernel
- LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- movl $_TIF_WORK_MASK,%edi
- andl %edi,%edx
- jnz retint_careful
- jmp retint_restore_args
-
-#if 0
- /*
- * We need to re-write the logic here because we don't do iretq to
- * to return to user mode. It's still possible that we get trap/fault
- * in the kernel (when accessing buffers pointed to by system calls,
- * for example).
- *
- */
- CFI_RESTORE_STATE
-error_kernelspace:
- incl %ebx
- /* There are two places in the kernel that can potentially fault with
- usergs. Handle them here. The exception handlers after
- iret run with kernel gs again, so don't set the user space flag.
- B stepping K8s sometimes report an truncated RIP for IRET
- exceptions returning to compat mode. Check for these here too. */
- leaq irq_return(%rip),%rcx
- cmpq %rcx,RIP(%rsp)
- je error_swapgs
- movl %ecx,%ecx /* zero extend */
- cmpq %rcx,RIP(%rsp)
- je error_swapgs
- cmpq $gs_change,RIP(%rsp)
- je error_swapgs
- jmp error_sti
-#endif
- CFI_ENDPROC
-KPROBE_END(error_entry)
-
-ENTRY(hypervisor_callback)
- zeroentry do_hypervisor_callback
-END(hypervisor_callback)
-
-/*
* Copied from arch/xen/i386/kernel/entry.S
*/
# A note on the "critical region" in our callback handler.
@@ -1041,7 +997,7 @@ ENTRY(do_hypervisor_callback) # do_hyp
# see the correct pointer to the pt_regs
movq %rdi, %rsp # we don't return, adjust the stack frame
CFI_ENDPROC
- CFI_DEFAULT_STACK
+ DEFAULT_FRAME
11: incl %gs:pda_irqcount
movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
@@ -1057,7 +1013,7 @@ END(do_hypervisor_callback)
ALIGN
restore_all_enable_events:
- CFI_DEFAULT_STACK adj=1
+ PARTIAL_FRAME
TRACE_IRQS_ON
__ENABLE_INTERRUPTS
@@ -1093,9 +1049,7 @@ ecrit: /**** END OF CRITICAL REGION ***
# We distinguish between categories by comparing each saved segment register
# with its current contents: any discrepancy means we in category 1.
ENTRY(failsafe_callback)
- _frame (RIP-0x30)
- CFI_REL_OFFSET rcx, 0
- CFI_REL_OFFSET r11, 8
+ INTR_FRAME offset=4*8
movw %ds,%cx
cmpw %cx,0x10(%rsp)
CFI_REMEMBER_STATE
@@ -1131,20 +1085,19 @@ ENTRY(failsafe_callback)
SAVE_ALL
jmp error_exit
CFI_ENDPROC
-#if 0
- .section __ex_table,"a"
- .align 8
- .quad gs_change,bad_gs
- .previous
- .section .fixup,"ax"
- /* running with kernelgs */
-bad_gs:
-/* swapgs */ /* switch back to user gs */
- xorl %eax,%eax
- movl %eax,%gs
- jmp 2b
- .previous
-#endif
+
+zeroentry divide_error do_divide_error
+zeroentry overflow do_overflow
+zeroentry bounds do_bounds
+zeroentry invalid_op do_invalid_op
+zeroentry device_not_available do_device_not_available
+zeroentry hypervisor_callback do_hypervisor_callback
+zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
+errorentry invalid_TSS do_invalid_TSS
+errorentry segment_not_present do_segment_not_present
+zeroentry coprocessor_error do_coprocessor_error
+errorentry alignment_check do_alignment_check
+zeroentry simd_coprocessor_error do_simd_coprocessor_error
/*
* Create a kernel thread.
@@ -1168,7 +1121,7 @@ ENTRY(kernel_thread)
xorl %r8d,%r8d
xorl %r9d,%r9d
-
+
# clone now
call do_fork
movq %rax,RAX(%rsp)
@@ -1179,15 +1132,15 @@ ENTRY(kernel_thread)
* so internally to the x86_64 port you can rely on kernel_thread()
* not to reschedule the child before returning, this avoids the need
* of hacks for example to fork off the per-CPU idle tasks.
- * [Hopefully no generic code relies on the reschedule -AK]
+ * [Hopefully no generic code relies on the reschedule -AK]
*/
RESTORE_ALL
UNFAKE_STACK_FRAME
ret
CFI_ENDPROC
-ENDPROC(kernel_thread)
-
-child_rip:
+END(kernel_thread)
+
+ENTRY(child_rip)
pushq $0 # fake return address
CFI_STARTPROC
/*
@@ -1200,8 +1153,9 @@ child_rip:
# exit
mov %eax, %edi
call do_exit
+ ud2 # padding for call trace
CFI_ENDPROC
-ENDPROC(child_rip)
+END(child_rip)
/*
* execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1221,10 +1175,10 @@ ENDPROC(child_rip)
ENTRY(kernel_execve)
CFI_STARTPROC
FAKE_STACK_FRAME $0
- SAVE_ALL
+ SAVE_ALL
movq %rsp,%rcx
call sys_execve
- movq %rax, RAX(%rsp)
+ movq %rax, RAX(%rsp)
RESTORE_REST
testq %rax,%rax
jne 1f
@@ -1233,132 +1187,7 @@ ENTRY(kernel_execve)
UNFAKE_STACK_FRAME
ret
CFI_ENDPROC
-ENDPROC(kernel_execve)
-
-KPROBE_ENTRY(page_fault)
- errorentry do_page_fault
-KPROBE_END(page_fault)
-
-ENTRY(coprocessor_error)
- zeroentry do_coprocessor_error
-END(coprocessor_error)
-
-ENTRY(simd_coprocessor_error)
- zeroentry do_simd_coprocessor_error
-END(simd_coprocessor_error)
-
-ENTRY(device_not_available)
- zeroentry do_device_not_available
-END(device_not_available)
-
- /* runs on exception stack */
-KPROBE_ENTRY(debug)
-/* INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8 */
- zeroentry do_debug
-/* paranoidexit
- CFI_ENDPROC */
-KPROBE_END(debug)
-
-KPROBE_ENTRY(nmi)
- zeroentry do_nmi_callback
-KPROBE_END(nmi)
-do_nmi_callback:
- CFI_STARTPROC
- addq $8, %rsp
- CFI_ENDPROC
- CFI_DEFAULT_STACK
- call do_nmi
- orl $NMI_MASK,EFLAGS(%rsp)
- RESTORE_REST
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- jmp retint_restore_args
- CFI_ENDPROC
-END(do_nmi_callback)
-
-KPROBE_ENTRY(int3)
-/* INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8 */
- zeroentry do_int3
-/* jmp paranoid_exit1
- CFI_ENDPROC */
-KPROBE_END(int3)
-
-ENTRY(overflow)
- zeroentry do_overflow
-END(overflow)
-
-ENTRY(bounds)
- zeroentry do_bounds
-END(bounds)
-
-ENTRY(invalid_op)
- zeroentry do_invalid_op
-END(invalid_op)
-
-ENTRY(coprocessor_segment_overrun)
- zeroentry do_coprocessor_segment_overrun
-END(coprocessor_segment_overrun)
-
-#if 0
- /* runs on exception stack */
-ENTRY(double_fault)
- XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- paranoidentry do_double_fault
- jmp paranoid_exit1
- CFI_ENDPROC
-END(double_fault)
-#endif
-
-ENTRY(invalid_TSS)
- errorentry do_invalid_TSS
-END(invalid_TSS)
-
-ENTRY(segment_not_present)
- errorentry do_segment_not_present
-END(segment_not_present)
-
- /* runs on exception stack */
-ENTRY(stack_segment)
-/* XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- paranoidentry do_stack_segment */
- errorentry do_stack_segment
-/* jmp paranoid_exit1
- CFI_ENDPROC */
-END(stack_segment)
-
-KPROBE_ENTRY(general_protection)
- errorentry do_general_protection
-KPROBE_END(general_protection)
-
-ENTRY(alignment_check)
- errorentry do_alignment_check
-END(alignment_check)
-
-ENTRY(divide_error)
- zeroentry do_divide_error
-END(divide_error)
-
-#ifndef CONFIG_XEN
-ENTRY(spurious_interrupt_bug)
- zeroentry do_spurious_interrupt_bug
-END(spurious_interrupt_bug)
-#endif
-
-#ifdef CONFIG_X86_MCE
- /* runs on exception stack */
-KPROBE_ENTRY(machine_check)
- zeroentry do_machine_check
-END(machine_check)
-#endif
+END(kernel_execve)
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
@@ -1378,24 +1207,191 @@ ENTRY(call_softirq)
decl %gs:pda_irqcount
ret
CFI_ENDPROC
-ENDPROC(call_softirq)
+END(call_softirq)
+
+/*
+ * Some functions should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
+
+paranoidzeroentry_ist debug do_debug DEBUG_STACK
+zeroentry nmi do_nmi_callback
+paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
+paranoiderrorentry stack_segment do_stack_segment
+errorentry general_protection do_general_protection
+errorentry page_fault do_page_fault
+#ifdef CONFIG_X86_MCE
+paranoidzeroentry machine_check do_machine_check
+#endif
+
+#ifndef CONFIG_XEN
+ /*
+ * "Paranoid" exit path from exception stack.
+ * Paranoid because this is used by NMIs and cannot take
+ * any kernel state for granted.
+ * We don't do kernel preemption checks here, because only
+ * NMI should be common and it does not enable IRQs and
+ * cannot get reschedule ticks.
+ *
+ * "trace" is 0 for the NMI handler only, because irq-tracing
+ * is fundamentally NMI-unsafe. (we cannot change the soft and
+ * hard flags at once, atomically)
+ */
+
+ /* ebx: no swapgs flag */
+ENTRY(paranoid_exit)
+ INTR_FRAME
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz paranoid_restore
+ testl $3,CS(%rsp)
+ jnz paranoid_userspace
+paranoid_swapgs:
+ TRACE_IRQS_IRETQ 0
+ SWAPGS_UNSAFE_STACK
+paranoid_restore:
+ RESTORE_ALL 8
+ jmp irq_return
+paranoid_userspace:
+ GET_THREAD_INFO(%rcx)
+ movl TI_flags(%rcx),%ebx
+ andl $_TIF_WORK_MASK,%ebx
+ jz paranoid_swapgs
+ movq %rsp,%rdi /* &pt_regs */
+ call sync_regs
+ movq %rax,%rsp /* switch stack for scheduling */
+ testl $_TIF_NEED_RESCHED,%ebx
+ jnz paranoid_schedule
+ movl %ebx,%edx /* arg3: thread flags */
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ xorl %esi,%esi /* arg2: oldset */
+ movq %rsp,%rdi /* arg1: &pt_regs */
+ call do_notify_resume
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp paranoid_userspace
+paranoid_schedule:
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ call schedule
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ jmp paranoid_userspace
+ CFI_ENDPROC
+END(paranoid_exit)
+#endif
+
+/*
+ * Exception entry point. This expects an error code/orig_rax on the stack.
+ * returns in "no swapgs flag" in %ebx.
+ */
+ENTRY(error_entry)
+ XCPT_FRAME 2
+ CFI_ADJUST_CFA_OFFSET 15*8
+ /* oldrax contains error code */
+ cld
+ movq_cfi rdi, RDI+8
+ movq_cfi rsi, RSI+8
+ movq_cfi rdx, RDX+8
+ movq_cfi rcx, RCX+8
+ movq_cfi rax, RAX+8
+ movq_cfi r8, R8+8
+ movq_cfi r9, R9+8
+ movq_cfi r10, R10+8
+ movq_cfi r11, R11+8
+ movq_cfi rbx, RBX+8
+ movq_cfi rbp, RBP+8
+ movq_cfi r12, R12+8
+ movq_cfi r13, R13+8
+ movq_cfi r14, R14+8
+ movq_cfi r15, R15+8
+#ifndef CONFIG_XEN
+ xorl %ebx,%ebx
+ testl $3,CS+8(%rsp)
+ je error_kernelspace
+error_swapgs:
+ SWAPGS
+error_sti:
+#endif
+ TRACE_IRQS_OFF
+ ret
+ CFI_ENDPROC
+
+#ifndef CONFIG_XEN
+/*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. The exception handlers after iret run with
+ * kernel gs again, so don't set the user space flag. B stepping K8s
+ * sometimes report an truncated RIP for IRET exceptions returning to
+ * compat mode. Check for these here too.
+ */
+error_kernelspace:
+ incl %ebx
+ leaq irq_return(%rip),%rcx
+ cmpq %rcx,RIP+8(%rsp)
+ je error_swapgs
+ movl %ecx,%ecx /* zero extend */
+ cmpq %rcx,RIP+8(%rsp)
+ je error_swapgs
+ cmpq $gs_change,RIP+8(%rsp)
+ je error_swapgs
+ jmp error_sti
+#endif
+END(error_entry)
+
+
+ENTRY(error_exit)
+ DEFAULT_FRAME
+ RESTORE_REST
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ GET_THREAD_INFO(%rcx)
+ testb $3,CS-ARGOFFSET(%rsp)
+ jz retint_kernel
+ LOCKDEP_SYS_EXIT_IRQ
+ movl TI_flags(%rcx),%edx
+ movl $_TIF_WORK_MASK,%edi
+ andl %edi,%edx
+ jnz retint_careful
+ jmp retint_restore_args
+ CFI_ENDPROC
+END(error_exit)
+
+
+do_nmi_callback:
+ CFI_STARTPROC
+ addq $8, %rsp
+ CFI_ENDPROC
+ DEFAULT_FRAME
+ call do_nmi
+ orl $NMI_MASK,EFLAGS(%rsp)
+ RESTORE_REST
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ GET_THREAD_INFO(%rcx)
+ jmp retint_restore_args
+ CFI_ENDPROC
+END(do_nmi_callback)
+
#ifndef CONFIG_IA32_EMULATION
-KPROBE_ENTRY(ignore_sysret)
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-RIP+16
-/* CFI_REL_OFFSET ss,SS-RIP+16 */
- CFI_REL_OFFSET rsp,RSP-RIP+16
-/* CFI_REL_OFFSET rflags,EFLAGS-RIP+16 */
-/* CFI_REL_OFFSET cs,CS-RIP+16 */
- CFI_REL_OFFSET rip,RIP-RIP+16
+ENTRY(ignore_sysret)
+ INTR_FRAME
popq %rcx
CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rcx
popq %r11
CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE r11
mov $-ENOSYS,%eax
HYPERVISOR_IRET 0
CFI_ENDPROC
-ENDPROC(ignore_sysret)
+END(ignore_sysret)
#endif
+
+/*
+ * End of kprobes section
+ */
+ .popsection
--- head-2011-03-17.orig/arch/x86/kernel/head-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/head-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -36,7 +36,6 @@ void __init reserve_ebda_region(void)
/* start of EBDA area */
ebda_addr = get_bios_ebda();
- printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
/* Fixup: bios puts an EBDA in the top 64K segment */
/* of conventional memory, but does not adjust lowmem. */
--- head-2011-03-17.orig/arch/x86/kernel/head32-xen.c 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/head32-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -12,9 +12,12 @@
#include <asm/sections.h>
#include <asm/e820.h>
#include <asm/bios_ebda.h>
+#include <asm/trampoline.h>
void __init i386_start_kernel(void)
{
+ reserve_trampoline_memory();
+
reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
#ifndef CONFIG_XEN
--- head-2011-03-17.orig/arch/x86/kernel/head64-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/head64-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -31,9 +31,10 @@
#include <asm/kdebug.h>
#include <asm/e820.h>
#include <asm/bios_ebda.h>
+#include <asm/trampoline.h>
/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
+static struct x8664_pda _boot_cpu_pda;
#ifdef CONFIG_SMP
/*
@@ -163,6 +164,8 @@ void __init x86_64_start_reservations(ch
{
copy_bootdata(__va(real_mode_data));
+ reserve_trampoline_memory();
+
reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
--- head-2011-03-17.orig/arch/x86/kernel/apic/io_apic-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/apic/io_apic-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -112,102 +112,276 @@ static int __init parse_noapic(char *str
}
early_param("noapic", parse_noapic);
+#ifndef CONFIG_XEN
struct irq_pin_list;
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+struct irq_pin_list {
+ int apic, pin;
+ struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+ struct irq_pin_list *pin;
+ int node;
+
+ node = cpu_to_node(cpu);
+
+ pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
+
+ return pin;
+}
+
struct irq_cfg {
-#ifndef CONFIG_XEN
- unsigned int irq;
struct irq_pin_list *irq_2_pin;
- cpumask_t domain;
- cpumask_t old_domain;
+ cpumask_var_t domain;
+ cpumask_var_t old_domain;
unsigned move_cleanup_count;
-#endif
u8 vector;
-#ifndef CONFIG_XEN
u8 move_in_progress : 1;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+ u8 move_desc_pending : 1;
#endif
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
static struct irq_cfg irq_cfgx[NR_IRQS] = {
- [0] = { .irq = 0 },
- [1] = { .irq = 1 },
- [2] = { .irq = 2 },
- [3] = { .irq = 3 },
- [4] = { .irq = 4 },
- [5] = { .irq = 5 },
- [6] = { .irq = 6 },
- [7] = { .irq = 7 },
- [8] = { .irq = 8 },
- [9] = { .irq = 9 },
- [10] = { .irq = 10 },
- [11] = { .irq = 11 },
- [12] = { .irq = 12 },
- [13] = { .irq = 13 },
- [14] = { .irq = 14 },
- [15] = { .irq = 15 },
+#endif
+ [0] = { .vector = IRQ0_VECTOR, },
+ [1] = { .vector = IRQ1_VECTOR, },
+ [2] = { .vector = IRQ2_VECTOR, },
+ [3] = { .vector = IRQ3_VECTOR, },
+ [4] = { .vector = IRQ4_VECTOR, },
+ [5] = { .vector = IRQ5_VECTOR, },
+ [6] = { .vector = IRQ6_VECTOR, },
+ [7] = { .vector = IRQ7_VECTOR, },
+ [8] = { .vector = IRQ8_VECTOR, },
+ [9] = { .vector = IRQ9_VECTOR, },
+ [10] = { .vector = IRQ10_VECTOR, },
+ [11] = { .vector = IRQ11_VECTOR, },
+ [12] = { .vector = IRQ12_VECTOR, },
+ [13] = { .vector = IRQ13_VECTOR, },
+ [14] = { .vector = IRQ14_VECTOR, },
+ [15] = { .vector = IRQ15_VECTOR, },
};
-#define for_each_irq_cfg(irq, cfg) \
- for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+int __init arch_early_irq_init(void)
+{
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+ int count;
+ int i;
+
+ cfg = irq_cfgx;
+ count = ARRAY_SIZE(irq_cfgx);
+ for (i = 0; i < count; i++) {
+ desc = irq_to_desc(i);
+ desc->chip_data = &cfg[i];
+ alloc_bootmem_cpumask_var(&cfg[i].domain);
+ alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+ if (i < NR_IRQS_LEGACY)
+ cpumask_setall(cfg[i].domain);
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_SPARSE_IRQ
static struct irq_cfg *irq_cfg(unsigned int irq)
{
- return irq < nr_irqs ? irq_cfgx + irq : NULL;
+ struct irq_cfg *cfg = NULL;
+ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+ if (desc)
+ cfg = desc->chip_data;
+
+ return cfg;
}
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
{
- return irq_cfg(irq);
+ struct irq_cfg *cfg;
+ int node;
+
+ node = cpu_to_node(cpu);
+
+ cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+ if (cfg) {
+ if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
+ kfree(cfg);
+ cfg = NULL;
+ } else if (!alloc_cpumask_var_node(&cfg->old_domain,
+ GFP_ATOMIC, node)) {
+ free_cpumask_var(cfg->domain);
+ kfree(cfg);
+ cfg = NULL;
+ } else {
+ cpumask_clear(cfg->domain);
+ cpumask_clear(cfg->old_domain);
+ }
+ }
+
+ return cfg;
}
-#ifdef CONFIG_XEN
-#define irq_2_pin_init()
-#define add_pin_to_irq(irq, apic, pin)
-#else
-/*
- * Rough estimation of how many shared IRQs there are, can be changed
- * anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+int arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+ struct irq_cfg *cfg;
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
+ cfg = desc->chip_data;
+ if (!cfg) {
+ desc->chip_data = get_one_free_irq_cfg(cpu);
+ if (!desc->chip_data) {
+ printk(KERN_ERR "can not alloc irq_cfg\n");
+ BUG_ON(1);
+ }
+ }
-struct irq_pin_list {
- int apic, pin;
- struct irq_pin_list *next;
-};
+ return 0;
+}
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
-static struct irq_pin_list *irq_2_pin_ptr;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-static void __init irq_2_pin_init(void)
+static void
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
{
- struct irq_pin_list *pin = irq_2_pin_head;
- int i;
+ struct irq_pin_list *old_entry, *head, *tail, *entry;
+
+ cfg->irq_2_pin = NULL;
+ old_entry = old_cfg->irq_2_pin;
+ if (!old_entry)
+ return;
+
+ entry = get_one_free_irq_2_pin(cpu);
+ if (!entry)
+ return;
+
+ entry->apic = old_entry->apic;
+ entry->pin = old_entry->pin;
+ head = entry;
+ tail = entry;
+ old_entry = old_entry->next;
+ while (old_entry) {
+ entry = get_one_free_irq_2_pin(cpu);
+ if (!entry) {
+ entry = head;
+ while (entry) {
+ head = entry->next;
+ kfree(entry);
+ entry = head;
+ }
+ /* still use the old one */
+ return;
+ }
+ entry->apic = old_entry->apic;
+ entry->pin = old_entry->pin;
+ tail->next = entry;
+ tail = entry;
+ old_entry = old_entry->next;
+ }
- for (i = 1; i < PIN_MAP_SIZE; i++)
- pin[i-1].next = &pin[i];
+ tail->next = NULL;
+ cfg->irq_2_pin = head;
+}
+
+static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
+{
+ struct irq_pin_list *entry, *next;
+
+ if (old_cfg->irq_2_pin == cfg->irq_2_pin)
+ return;
+
+ entry = old_cfg->irq_2_pin;
- irq_2_pin_ptr = &pin[0];
+ while (entry) {
+ next = entry->next;
+ kfree(entry);
+ entry = next;
+ }
+ old_cfg->irq_2_pin = NULL;
}
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+ struct irq_desc *desc, int cpu)
{
- struct irq_pin_list *pin = irq_2_pin_ptr;
+ struct irq_cfg *cfg;
+ struct irq_cfg *old_cfg;
- if (!pin)
- panic("can not get more irq_2_pin\n");
+ cfg = get_one_free_irq_cfg(cpu);
- irq_2_pin_ptr = pin->next;
- pin->next = NULL;
- return pin;
+ if (!cfg)
+ return;
+
+ desc->chip_data = cfg;
+
+ old_cfg = old_desc->chip_data;
+
+ memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+ init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *old_cfg)
+{
+ kfree(old_cfg);
+}
+
+void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+ struct irq_cfg *old_cfg, *cfg;
+
+ old_cfg = old_desc->chip_data;
+ cfg = desc->chip_data;
+
+ if (old_cfg == cfg)
+ return;
+
+ if (old_cfg) {
+ free_irq_2_pin(old_cfg, cfg);
+ free_irq_cfg(old_cfg);
+ old_desc->chip_data = NULL;
+ }
+}
+
+static void
+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+ struct irq_cfg *cfg = desc->chip_data;
+
+ if (!cfg->move_in_progress) {
+ /* it means that domain is not changed */
+ if (!cpumask_intersects(&desc->affinity, mask))
+ cfg->move_desc_pending = 1;
+ }
+}
+#endif
+
+#else
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+
+#endif
+
+#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
+static inline void
+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
}
+#endif
struct io_apic {
unsigned int index;
@@ -220,7 +394,7 @@ static __attribute_const__ struct io_api
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+ (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
}
-#endif
+#endif /* !CONFIG_XEN */
static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
{
@@ -275,11 +449,10 @@ static inline void io_apic_modify(unsign
writel(value, &io_apic->data);
}
-static bool io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
{
struct irq_pin_list *entry;
unsigned long flags;
- struct irq_cfg *cfg = irq_cfg(irq);
spin_lock_irqsave(&ioapic_lock, flags);
entry = cfg->irq_2_pin;
@@ -365,13 +538,32 @@ static void ioapic_mask_entry(int apic,
}
#ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+static void send_cleanup_vector(struct irq_cfg *cfg)
+{
+ cpumask_var_t cleanup_mask;
+
+ if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+ unsigned int i;
+ cfg->move_cleanup_count = 0;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ cfg->move_cleanup_count++;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+ } else {
+ cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+ cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+ send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ free_cpumask_var(cleanup_mask);
+ }
+ cfg->move_in_progress = 0;
+}
+
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
{
int apic, pin;
- struct irq_cfg *cfg;
struct irq_pin_list *entry;
+ u8 vector = cfg->vector;
- cfg = irq_cfg(irq);
entry = cfg->irq_2_pin;
for (;;) {
unsigned int reg;
@@ -401,36 +593,61 @@ static void __target_IO_APIC_irq(unsigne
}
}
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
+
+/*
+ * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
+ * of that, or returns BAD_APICID and leaves desc->affinity untouched.
+ */
+static unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+{
+ struct irq_cfg *cfg;
+ unsigned int irq;
+
+ if (!cpumask_intersects(mask, cpu_online_mask))
+ return BAD_APICID;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
+ return BAD_APICID;
+
+ cpumask_and(&desc->affinity, cfg->domain, mask);
+ set_extra_move_desc(desc, mask);
+ return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+}
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void
+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
unsigned long flags;
unsigned int dest;
- cpumask_t tmp;
- struct irq_desc *desc;
+ unsigned int irq;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- return;
+ irq = desc->irq;
+ cfg = desc->chip_data;
- cfg = irq_cfg(irq);
- if (assign_irq_vector(irq, mask))
- return;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ dest = set_desc_affinity(desc, mask);
+ if (dest != BAD_APICID) {
+ /* Only the high 8 bits are valid. */
+ dest = SET_APIC_LOGICAL_ID(dest);
+ __target_IO_APIC_irq(irq, dest, cfg);
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
- /*
- * Only the high 8 bits are valid.
- */
- dest = SET_APIC_LOGICAL_ID(dest);
+static void
+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
+{
+ struct irq_desc *desc;
desc = irq_to_desc(irq);
- spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, cfg->vector);
- desc->affinity = mask;
- spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ set_ioapic_affinity_irq_desc(desc, mask);
}
#endif /* CONFIG_SMP */
@@ -439,16 +656,18 @@ static void set_ioapic_affinity_irq(unsi
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
{
- struct irq_cfg *cfg;
struct irq_pin_list *entry;
- /* first time to refer irq_cfg, so with new */
- cfg = irq_cfg_alloc(irq);
entry = cfg->irq_2_pin;
if (!entry) {
- entry = get_one_free_irq_2_pin();
+ entry = get_one_free_irq_2_pin(cpu);
+ if (!entry) {
+ printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
+ apic, pin);
+ return;
+ }
cfg->irq_2_pin = entry;
entry->apic = apic;
entry->pin = pin;
@@ -463,7 +682,7 @@ static void add_pin_to_irq(unsigned int
entry = entry->next;
}
- entry->next = get_one_free_irq_2_pin();
+ entry->next = get_one_free_irq_2_pin(cpu);
entry = entry->next;
entry->apic = apic;
entry->pin = pin;
@@ -472,11 +691,10 @@ static void add_pin_to_irq(unsigned int
/*
* Reroute an IRQ to a different pin.
*/
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
int oldapic, int oldpin,
int newapic, int newpin)
{
- struct irq_cfg *cfg = irq_cfg(irq);
struct irq_pin_list *entry = cfg->irq_2_pin;
int replaced = 0;
@@ -493,18 +711,16 @@ static void __init replace_pin_at_irq(un
/* why? call replace before add? */
if (!replaced)
- add_pin_to_irq(irq, newapic, newpin);
+ add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
}
-static inline void io_apic_modify_irq(unsigned int irq,
+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
int mask_and, int mask_or,
void (*final)(struct irq_pin_list *entry))
{
int pin;
- struct irq_cfg *cfg;
struct irq_pin_list *entry;
- cfg = irq_cfg(irq);
for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
unsigned int reg;
pin = entry->pin;
@@ -517,13 +733,13 @@ static inline void io_apic_modify_irq(un
}
}
-static void __unmask_IO_APIC_irq(unsigned int irq)
+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
}
#ifdef CONFIG_X86_64
-void io_apic_sync(struct irq_pin_list *entry)
+static void io_apic_sync(struct irq_pin_list *entry)
{
/*
* Synchronize the IO-APIC and the CPU by doing
@@ -534,47 +750,64 @@ void io_apic_sync(struct irq_pin_list *e
readl(&io_apic->data);
}
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
}
#else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
}
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
IO_APIC_REDIR_MASKED, NULL);
}
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
}
#endif /* CONFIG_X86_32 */
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
{
+ struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
+ BUG_ON(!cfg);
+
spin_lock_irqsave(&ioapic_lock, flags);
- __mask_IO_APIC_irq(irq);
+ __mask_IO_APIC_irq(cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
{
+ struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_IO_APIC_irq(irq);
+ __unmask_IO_APIC_irq(cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
+static void mask_IO_APIC_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ mask_IO_APIC_irq_desc(desc);
+}
+static void unmask_IO_APIC_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ unmask_IO_APIC_irq_desc(desc);
+}
+
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
@@ -614,6 +847,8 @@ void send_IPI_self(int vector)
apic_write(APIC_ICR, cfg);
}
#endif /* !CONFIG_SMP && CONFIG_X86_32*/
+#else
+#define add_pin_to_irq_cpu(cfg, cpu, apic, pin)
#endif /* !CONFIG_XEN */
#ifdef CONFIG_X86_32
@@ -854,7 +1089,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector
*/
static int EISA_ELCR(unsigned int irq)
{
- if (irq < 16) {
+ if (irq < NR_IRQS_LEGACY) {
unsigned int port = 0x4d0 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
@@ -1079,52 +1314,118 @@ void unlock_vector_lock(void)
{
spin_unlock(&vector_lock);
}
-#endif
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int
+__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
- struct physdev_irq irq_op;
- struct irq_cfg *cfg;
-
- if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS)
- return -EINVAL;
+ /*
+ * NOTE! The local APIC isn't very good at handling
+ * multiple interrupts at the same interrupt level.
+ * As the interrupt level is determined by taking the
+ * vector number and shifting that right by 4, we
+ * want to spread these out a bit so that they don't
+ * all fall in the same interrupt level.
+ *
+ * Also, we've got to be careful not to trash gate
+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
+ */
+ static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+ unsigned int old_vector;
+ int cpu, err;
+ cpumask_var_t tmp_mask;
+
+ if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+ return -EBUSY;
+
+ if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+ return -ENOMEM;
+
+ old_vector = cfg->vector;
+ if (old_vector) {
+ cpumask_and(tmp_mask, mask, cpu_online_mask);
+ cpumask_and(tmp_mask, cfg->domain, tmp_mask);
+ if (!cpumask_empty(tmp_mask)) {
+ free_cpumask_var(tmp_mask);
+ return 0;
+ }
+ }
- cfg = irq_cfg(irq);
+ /* Only try and allocate irqs on cpus that are present */
+ err = -ENOSPC;
+ for_each_cpu_and(cpu, mask, cpu_online_mask) {
+ int new_cpu;
+ int vector, offset;
+
+ vector_allocation_domain(cpu, tmp_mask);
+
+ vector = current_vector;
+ offset = current_offset;
+next:
+ vector += 8;
+ if (vector >= first_system_vector) {
+ /* If out of vectors on large boxen, must share them. */
+ offset = (offset + 1) % 8;
+ vector = FIRST_DEVICE_VECTOR + offset;
+ }
+ if (unlikely(current_vector == vector))
+ continue;
- if (cfg->vector)
- return 0;
+ if (test_bit(vector, used_vectors))
+ goto next;
- irq_op.irq = irq;
- if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
- return -ENOSPC;
+#ifdef CONFIG_KDB
+ if (vector == KDBENTER_VECTOR)
+ goto next;
+#endif /* CONFIG_KDB */
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+ if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+ goto next;
+ /* Found one! */
+ current_vector = vector;
+ current_offset = offset;
+ if (old_vector) {
+ cfg->move_in_progress = 1;
+ cpumask_copy(cfg->old_domain, cfg->domain);
+ }
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+ per_cpu(vector_irq, new_cpu)[vector] = irq;
+ cfg->vector = vector;
+ cpumask_copy(cfg->domain, tmp_mask);
+ err = 0;
+ break;
+ }
+ free_cpumask_var(tmp_mask);
+ return err;
+}
- cfg->vector = irq_op.vector;
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+{
+ int err;
+ unsigned long flags;
- return 0;
+ spin_lock_irqsave(&vector_lock, flags);
+ err = __assign_irq_vector(irq, cfg, mask);
+ spin_unlock_irqrestore(&vector_lock, flags);
+ return err;
}
-#ifndef CONFIG_XEN
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
{
- struct irq_cfg *cfg;
- cpumask_t mask;
int cpu, vector;
- cfg = irq_cfg(irq);
BUG_ON(!cfg->vector);
vector = cfg->vector;
- cpus_and(mask, cfg->domain, cpu_online_map);
- for_each_cpu_mask_nr(cpu, mask)
+ for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
per_cpu(vector_irq, cpu)[vector] = -1;
cfg->vector = 0;
- cpus_clear(cfg->domain);
+ cpumask_clear(cfg->domain);
if (likely(!cfg->move_in_progress))
return;
- cpus_and(mask, cfg->old_domain, cpu_online_map);
- for_each_cpu_mask_nr(cpu, mask) {
+ for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
vector++) {
if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -1142,10 +1443,12 @@ void __setup_vector_irq(int cpu)
/* This function must be called with vector_lock held */
int irq, vector;
struct irq_cfg *cfg;
+ struct irq_desc *desc;
/* Mark the inuse vectors */
- for_each_irq_cfg(irq, cfg) {
- if (!cpu_isset(cpu, cfg->domain))
+ for_each_irq_desc(irq, desc) {
+ cfg = desc->chip_data;
+ if (!cpumask_test_cpu(cpu, cfg->domain))
continue;
vector = cfg->vector;
per_cpu(vector_irq, cpu)[vector] = irq;
@@ -1157,7 +1460,7 @@ void __setup_vector_irq(int cpu)
continue;
cfg = irq_cfg(irq);
- if (!cpu_isset(cpu, cfg->domain))
+ if (!cpumask_test_cpu(cpu, cfg->domain))
per_cpu(vector_irq, cpu)[vector] = -1;
}
}
@@ -1195,11 +1498,8 @@ static inline int IO_APIC_irq_trigger(in
}
#endif
-static void ioapic_register_intr(int irq, unsigned long trigger)
+static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
trigger == IOAPIC_LEVEL)
@@ -1230,8 +1530,8 @@ static void ioapic_register_intr(int irq
handle_edge_irq, "edge");
}
#else /* !CONFIG_XEN */
-#define __clear_irq_vector(irq) ((void)(irq))
-#define ioapic_register_intr(irq, trigger) evtchn_register_pirq(irq)
+#define __clear_irq_vector(irq, cfg) ((void)0)
+#define ioapic_register_intr(irq, desc, trigger) evtchn_register_pirq(irq)
#endif
static int setup_ioapic_entry(int apic, int irq,
@@ -1295,24 +1595,25 @@ static int setup_ioapic_entry(int apic,
return 0;
}
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
int trigger, int polarity)
{
struct irq_cfg *cfg;
struct IO_APIC_route_entry entry;
- cpumask_t mask;
+ unsigned int dest;
if (!IO_APIC_IRQ(irq))
return;
- cfg = irq_cfg(irq);
+ cfg = desc->chip_data;
- mask = TARGET_CPUS;
- if (assign_irq_vector(irq, mask))
+ if (assign_irq_vector(irq, cfg, TARGET_CPUS))
return;
#ifndef CONFIG_XEN
- cpus_and(mask, cfg->domain, mask);
+ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
+#else
+ dest = cpu_mask_to_apicid(TARGET_CPUS);
#endif
apic_printk(APIC_VERBOSE,KERN_DEBUG
@@ -1323,16 +1624,15 @@ static void setup_IO_APIC_irq(int apic,
if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
- cpu_mask_to_apicid(mask), trigger, polarity,
- cfg->vector)) {
+ dest, trigger, polarity, cfg->vector)) {
printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
mp_ioapics[apic].mp_apicid, pin);
- __clear_irq_vector(irq);
+ __clear_irq_vector(irq, cfg);
return;
}
- ioapic_register_intr(irq, trigger);
- if (irq < 16)
+ ioapic_register_intr(irq, desc, trigger);
+ if (irq < NR_IRQS_LEGACY)
disable_8259A_irq(irq);
ioapic_write_entry(apic, pin, entry);
@@ -1342,6 +1642,9 @@ static void __init setup_IO_APIC_irqs(vo
{
int apic, pin, idx, irq;
int notcon = 0;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ int cpu = boot_cpu_id;
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
@@ -1376,9 +1679,15 @@ static void __init setup_IO_APIC_irqs(vo
if (multi_timer_check(apic, irq))
continue;
#endif
- add_pin_to_irq(irq, apic, pin);
+ desc = irq_to_desc_alloc_cpu(irq, cpu);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ continue;
+ }
+ cfg = desc->chip_data;
+ add_pin_to_irq_cpu(cfg, cpu, apic, pin);
- setup_IO_APIC_irq(apic, pin, irq,
+ setup_IO_APIC_irq(apic, pin, irq, desc,
irq_trigger(idx), irq_polarity(idx));
}
}
@@ -1438,6 +1747,7 @@ __apicdebuginit(void) print_IO_APIC(void
union IO_APIC_reg_03 reg_03;
unsigned long flags;
struct irq_cfg *cfg;
+ struct irq_desc *desc;
unsigned int irq;
if (apic_verbosity == APIC_QUIET)
@@ -1527,8 +1837,11 @@ __apicdebuginit(void) print_IO_APIC(void
}
}
printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for_each_irq_cfg(irq, cfg) {
- struct irq_pin_list *entry = cfg->irq_2_pin;
+ for_each_irq_desc(irq, desc) {
+ struct irq_pin_list *entry;
+
+ cfg = desc->chip_data;
+ entry = cfg->irq_2_pin;
if (!entry)
continue;
printk(KERN_DEBUG "IRQ%d ", irq);
@@ -2018,14 +2331,16 @@ static unsigned int startup_ioapic_irq(u
{
int was_pending = 0;
unsigned long flags;
+ struct irq_cfg *cfg;
spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < 16) {
+ if (irq < NR_IRQS_LEGACY) {
disable_8259A_irq(irq);
if (i8259A_irq_pending(irq))
was_pending = 1;
}
- __unmask_IO_APIC_irq(irq);
+ cfg = irq_cfg(irq);
+ __unmask_IO_APIC_irq(cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
@@ -2039,7 +2354,7 @@ static int ioapic_retrigger_irq(unsigned
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
+ send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
spin_unlock_irqrestore(&vector_lock, flags);
return 1;
@@ -2088,35 +2403,35 @@ static DECLARE_DELAYED_WORK(ir_migration
* as simple as edge triggered migration and we can do the irq migration
* with a simple atomic update to IO-APIC RTE.
*/
-static void migrate_ioapic_irq(int irq, cpumask_t mask)
+static void
+migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
- struct irq_desc *desc;
- cpumask_t tmp, cleanup_mask;
struct irte irte;
int modify_ioapic_rte;
unsigned int dest;
unsigned long flags;
+ unsigned int irq;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ if (!cpumask_intersects(mask, cpu_online_mask))
return;
+ irq = desc->irq;
if (get_irte(irq, &irte))
return;
- if (assign_irq_vector(irq, mask))
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
return;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ set_extra_move_desc(desc, mask);
+
+ dest = cpu_mask_to_apicid_and(cfg->domain, mask);
- desc = irq_to_desc(irq);
modify_ioapic_rte = desc->status & IRQ_LEVEL;
if (modify_ioapic_rte) {
spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, cfg->vector);
+ __target_IO_APIC_irq(irq, dest, cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -2128,24 +2443,20 @@ static void migrate_ioapic_irq(int irq,
*/
modify_irte(irq, &irte);
- if (cfg->move_in_progress) {
- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- cfg->move_in_progress = 0;
- }
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
- desc->affinity = mask;
+ cpumask_copy(&desc->affinity, mask);
}
-static int migrate_irq_remapped_level(int irq)
+static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
{
int ret = -1;
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg = desc->chip_data;
- mask_IO_APIC_irq(irq);
+ mask_IO_APIC_irq_desc(desc);
- if (io_apic_level_ack_pending(irq)) {
+ if (io_apic_level_ack_pending(cfg)) {
/*
* Interrupt in progress. Migrating irq now will change the
* vector information in the IO-APIC RTE and that will confuse
@@ -2157,14 +2468,15 @@ static int migrate_irq_remapped_level(in
}
/* everthing is clear. we have right of way */
- migrate_ioapic_irq(irq, desc->pending_mask);
+ migrate_ioapic_irq_desc(desc, &desc->pending_mask);
ret = 0;
desc->status &= ~IRQ_MOVE_PENDING;
- cpus_clear(desc->pending_mask);
+ cpumask_clear(&desc->pending_mask);
unmask:
- unmask_IO_APIC_irq(irq);
+ unmask_IO_APIC_irq_desc(desc);
+
return ret;
}
@@ -2185,7 +2497,7 @@ static void ir_irq_migration(struct work
continue;
}
- desc->chip->set_affinity(irq, desc->pending_mask);
+ desc->chip->set_affinity(irq, &desc->pending_mask);
spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -2194,28 +2506,33 @@ static void ir_irq_migration(struct work
/*
* Migrates the IRQ destination in the process context.
*/
-static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+ const struct cpumask *mask)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
if (desc->status & IRQ_LEVEL) {
desc->status |= IRQ_MOVE_PENDING;
- desc->pending_mask = mask;
- migrate_irq_remapped_level(irq);
+ cpumask_copy(&desc->pending_mask, mask);
+ migrate_irq_remapped_level_desc(desc);
return;
}
- migrate_ioapic_irq(irq, mask);
+ migrate_ioapic_irq_desc(desc, mask);
+}
+static void set_ir_ioapic_affinity_irq(unsigned int irq,
+ const struct cpumask *mask)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ set_ir_ioapic_affinity_irq_desc(desc, mask);
}
#endif
asmlinkage void smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
+
ack_APIC_irq();
-#ifdef CONFIG_X86_64
exit_idle();
-#endif
irq_enter();
me = smp_processor_id();
@@ -2225,6 +2542,9 @@ asmlinkage void smp_irq_move_cleanup_int
struct irq_cfg *cfg;
irq = __get_cpu_var(vector_irq)[vector];
+ if (irq == -1)
+ continue;
+
desc = irq_to_desc(irq);
if (!desc)
continue;
@@ -2234,7 +2554,7 @@ asmlinkage void smp_irq_move_cleanup_int
if (!cfg->move_cleanup_count)
goto unlock;
- if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
goto unlock;
__get_cpu_var(vector_irq)[vector] = -1;
@@ -2246,28 +2566,45 @@ unlock:
irq_exit();
}
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
{
- struct irq_cfg *cfg = irq_cfg(irq);
+ struct irq_desc *desc = *descp;
+ struct irq_cfg *cfg = desc->chip_data;
unsigned vector, me;
- if (likely(!cfg->move_in_progress))
+ if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+ if (likely(!cfg->move_desc_pending))
+ return;
+
+ /* domain has not changed, but affinity did */
+ me = smp_processor_id();
+ if (cpu_isset(me, desc->affinity)) {
+ *descp = desc = move_irq_desc(desc, me);
+ /* get the new one */
+ cfg = desc->chip_data;
+ cfg->move_desc_pending = 0;
+ }
+#endif
return;
+ }
vector = ~get_irq_regs()->orig_ax;
me = smp_processor_id();
- if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
- cpumask_t cleanup_mask;
- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- cfg->move_in_progress = 0;
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) {
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+ *descp = desc = move_irq_desc(desc, me);
+ /* get the new one */
+ cfg = desc->chip_data;
+#endif
+ send_cleanup_vector(cfg);
}
}
#else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
#endif
+
#ifdef CONFIG_INTR_REMAP
static void ack_x2apic_level(unsigned int irq)
{
@@ -2278,11 +2615,14 @@ static void ack_x2apic_edge(unsigned int
{
ack_x2APIC_irq();
}
+
#endif
static void ack_apic_edge(unsigned int irq)
{
- irq_complete_move(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ irq_complete_move(&desc);
move_native_irq(irq);
ack_APIC_irq();
}
@@ -2291,18 +2631,21 @@ atomic_t irq_mis_count;
static void ack_apic_level(unsigned int irq)
{
+ struct irq_desc *desc = irq_to_desc(irq);
+
#ifdef CONFIG_X86_32
unsigned long v;
int i;
#endif
+ struct irq_cfg *cfg;
int do_unmask_irq = 0;
- irq_complete_move(irq);
+ irq_complete_move(&desc);
#ifdef CONFIG_GENERIC_PENDING_IRQ
/* If we are moving the irq we need to mask it */
- if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+ if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
do_unmask_irq = 1;
- mask_IO_APIC_irq(irq);
+ mask_IO_APIC_irq_desc(desc);
}
#endif
@@ -2326,7 +2669,8 @@ static void ack_apic_level(unsigned int
* operation to prevent an edge-triggered interrupt escaping meanwhile.
* The idea is from Manfred Spraul. --macro
*/
- i = irq_cfg(irq)->vector;
+ cfg = desc->chip_data;
+ i = cfg->vector;
v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
#endif
@@ -2365,17 +2709,18 @@ static void ack_apic_level(unsigned int
* accurate and is causing problems then it is a hardware bug
* and you can go talk to the chipset vendor about it.
*/
- if (!io_apic_level_ack_pending(irq))
+ cfg = desc->chip_data;
+ if (!io_apic_level_ack_pending(cfg))
move_masked_irq(irq);
- unmask_IO_APIC_irq(irq);
+ unmask_IO_APIC_irq_desc(desc);
}
#ifdef CONFIG_X86_32
if (!(v & (1 << (i & 0x1f)))) {
atomic_inc(&irq_mis_count);
spin_lock(&ioapic_lock);
- __mask_and_edge_IO_APIC_irq(irq);
- __unmask_and_level_IO_APIC_irq(irq);
+ __mask_and_edge_IO_APIC_irq(cfg);
+ __unmask_and_level_IO_APIC_irq(cfg);
spin_unlock(&ioapic_lock);
}
#endif
@@ -2427,24 +2772,23 @@ static inline void init_IO_APIC_traps(vo
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- for_each_irq_cfg(irq, cfg) {
+ for_each_irq_desc(irq, desc) {
#ifdef CONFIG_XEN
if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS)
continue;
#endif
- if (IO_APIC_IRQ(irq) && !cfg->vector) {
+ cfg = desc->chip_data;
+ if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
* interrupt if we can..
*/
- if (irq < 16)
+ if (irq < NR_IRQS_LEGACY)
make_8259A_irq(irq);
- else {
- desc = irq_to_desc(irq);
+ else
/* Strange. Oh, well.. */
desc->chip = &no_irq_chip;
- }
}
}
}
@@ -2470,7 +2814,7 @@ static void unmask_lapic_irq(unsigned in
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
-static void ack_lapic_irq (unsigned int irq)
+static void ack_lapic_irq(unsigned int irq)
{
ack_APIC_irq();
}
@@ -2482,11 +2826,8 @@ static struct irq_chip lapic_chip __read
.ack = ack_lapic_irq,
};
-static void lapic_register_intr(int irq)
+static void lapic_register_intr(int irq, struct irq_desc *desc)
{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
desc->status &= ~IRQ_LEVEL;
set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
"edge");
@@ -2590,7 +2931,9 @@ int timer_through_8259 __initdata;
*/
static inline void __init check_timer(void)
{
- struct irq_cfg *cfg = irq_cfg(0);
+ struct irq_desc *desc = irq_to_desc(0);
+ struct irq_cfg *cfg = desc->chip_data;
+ int cpu = boot_cpu_id;
int apic1, pin1, apic2, pin2;
unsigned long flags;
unsigned int ver;
@@ -2605,7 +2948,7 @@ static inline void __init check_timer(vo
* get/set the timer IRQ vector:
*/
disable_8259A_irq(0);
- assign_irq_vector(0, TARGET_CPUS);
+ assign_irq_vector(0, cfg, TARGET_CPUS);
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2656,10 +2999,10 @@ static inline void __init check_timer(vo
* Ok, does IRQ0 through the IOAPIC work?
*/
if (no_pin1) {
- add_pin_to_irq(0, apic1, pin1);
+ add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
}
- unmask_IO_APIC_irq(0);
+ unmask_IO_APIC_irq_desc(desc);
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
@@ -2685,9 +3028,9 @@ static inline void __init check_timer(vo
/*
* legacy devices should be connected to IO APIC #0
*/
- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- unmask_IO_APIC_irq(0);
+ unmask_IO_APIC_irq_desc(desc);
enable_8259A_irq(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2719,7 +3062,7 @@ static inline void __init check_timer(vo
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
- lapic_register_intr(0);
+ lapic_register_intr(0, desc);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
enable_8259A_irq(0);
@@ -2918,22 +3261,26 @@ unsigned int create_irq_nr(unsigned int
unsigned int irq;
unsigned int new;
unsigned long flags;
- struct irq_cfg *cfg_new;
-
- irq_want = nr_irqs - 1;
+ struct irq_cfg *cfg_new = NULL;
+ int cpu = boot_cpu_id;
+ struct irq_desc *desc_new = NULL;
irq = 0;
spin_lock_irqsave(&vector_lock, flags);
- for (new = irq_want; new > 0; new--) {
+ for (new = irq_want; new < NR_IRQS; new++) {
if (platform_legacy_irq(new))
continue;
- cfg_new = irq_cfg(new);
- if (cfg_new && cfg_new->vector != 0)
+
+ desc_new = irq_to_desc_alloc_cpu(new, cpu);
+ if (!desc_new) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", new);
+ continue;
+ }
+ cfg_new = desc_new->chip_data;
+
+ if (cfg_new->vector != 0)
continue;
- /* check if need to create one */
- if (!cfg_new)
- cfg_new = irq_cfg_alloc(new);
- if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+ if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
irq = new;
break;
}
@@ -2941,15 +3288,21 @@ unsigned int create_irq_nr(unsigned int
if (irq > 0) {
dynamic_irq_init(irq);
+ /* restore it, in case dynamic_irq_init clear it */
+ if (desc_new)
+ desc_new->chip_data = cfg_new;
}
return irq;
}
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
int create_irq(void)
{
+ unsigned int irq_want;
int irq;
- irq = create_irq_nr(nr_irqs - 1);
+ irq_want = nr_irqs_gsi;
+ irq = create_irq_nr(irq_want);
if (irq == 0)
irq = -1;
@@ -2960,14 +3313,22 @@ int create_irq(void)
void destroy_irq(unsigned int irq)
{
unsigned long flags;
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+ /* store it, in case dynamic_irq_cleanup clear it */
+ desc = irq_to_desc(irq);
+ cfg = desc->chip_data;
dynamic_irq_cleanup(irq);
+ /* connect back irq_cfg */
+ if (desc)
+ desc->chip_data = cfg;
#ifdef CONFIG_INTR_REMAP
free_irte(irq);
#endif
spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq);
+ __clear_irq_vector(irq, cfg);
spin_unlock_irqrestore(&vector_lock, flags);
}
#endif /* !CONFIG_XEN */
@@ -2981,16 +3342,13 @@ static int msi_compose_msg(struct pci_de
struct irq_cfg *cfg;
int err;
unsigned dest;
- cpumask_t tmp;
- tmp = TARGET_CPUS;
- err = assign_irq_vector(irq, tmp);
+ cfg = irq_cfg(irq);
+ err = assign_irq_vector(irq, cfg, TARGET_CPUS);
if (err)
return err;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, tmp);
- dest = cpu_mask_to_apicid(tmp);
+ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
@@ -3044,64 +3402,48 @@ static int msi_compose_msg(struct pci_de
}
#ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
- cpumask_t tmp;
- struct irq_desc *desc;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- if (assign_irq_vector(irq, mask))
- return;
-
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ cfg = desc->chip_data;
- read_msi_msg(irq, &msg);
+ read_msi_msg_desc(desc, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- write_msi_msg(irq, &msg);
- desc = irq_to_desc(irq);
- desc->affinity = mask;
+ write_msi_msg_desc(desc, &msg);
}
-
#ifdef CONFIG_INTR_REMAP
/*
* Migrate the MSI irq to another cpumask. This migration is
* done in the process context using interrupt-remapping hardware.
*/
-static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void
+ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
- struct irq_cfg *cfg;
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg = desc->chip_data;
unsigned int dest;
- cpumask_t tmp, cleanup_mask;
struct irte irte;
- struct irq_desc *desc;
-
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- return;
if (get_irte(irq, &irte))
return;
- if (assign_irq_vector(irq, mask))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
-
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -3115,16 +3457,10 @@ static void ir_set_msi_irq_affinity(unsi
* at the new destination. So, time to cleanup the previous
* vector allocation.
*/
- if (cfg->move_in_progress) {
- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- cfg->move_in_progress = 0;
- }
-
- desc = irq_to_desc(irq);
- desc->affinity = mask;
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
}
+
#endif
#endif /* CONFIG_SMP */
@@ -3183,7 +3519,7 @@ static int msi_alloc_irte(struct pci_dev
}
#endif
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
int ret;
struct msi_msg msg;
@@ -3192,7 +3528,7 @@ static int setup_msi_irq(struct pci_dev
if (ret < 0)
return ret;
- set_irq_msi(irq, desc);
+ set_irq_msi(irq, msidesc);
write_msi_msg(irq, &msg);
#ifdef CONFIG_INTR_REMAP
@@ -3212,26 +3548,13 @@ static int setup_msi_irq(struct pci_dev
return 0;
}
-static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
-{
- unsigned int irq;
-
- irq = dev->bus->number;
- irq <<= 8;
- irq |= dev->devfn;
- irq <<= 12;
-
- return irq;
-}
-
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
{
unsigned int irq;
int ret;
unsigned int irq_want;
- irq_want = build_irq_for_pci_dev(dev) + 0x100;
-
+ irq_want = nr_irqs_gsi;
irq = create_irq_nr(irq_want);
if (irq == 0)
return -1;
@@ -3245,7 +3568,7 @@ int arch_setup_msi_irq(struct pci_dev *d
goto error;
no_ir:
#endif
- ret = setup_msi_irq(dev, desc, irq);
+ ret = setup_msi_irq(dev, msidesc, irq);
if (ret < 0) {
destroy_irq(irq);
return ret;
@@ -3263,7 +3586,7 @@ int arch_setup_msi_irqs(struct pci_dev *
{
unsigned int irq;
int ret, sub_handle;
- struct msi_desc *desc;
+ struct msi_desc *msidesc;
unsigned int irq_want;
#ifdef CONFIG_INTR_REMAP
@@ -3271,10 +3594,11 @@ int arch_setup_msi_irqs(struct pci_dev *
int index = 0;
#endif
- irq_want = build_irq_for_pci_dev(dev) + 0x100;
+ irq_want = nr_irqs_gsi;
sub_handle = 0;
- list_for_each_entry(desc, &dev->msi_list, list) {
- irq = create_irq_nr(irq_want--);
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = create_irq_nr(irq_want);
+ irq_want++;
if (irq == 0)
return -1;
#ifdef CONFIG_INTR_REMAP
@@ -3306,7 +3630,7 @@ int arch_setup_msi_irqs(struct pci_dev *
}
no_ir:
#endif
- ret = setup_msi_irq(dev, desc, irq);
+ ret = setup_msi_irq(dev, msidesc, irq);
if (ret < 0)
goto error;
sub_handle++;
@@ -3325,24 +3649,18 @@ void arch_teardown_msi_irq(unsigned int
#ifdef CONFIG_DMAR
#ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
- cpumask_t tmp;
- struct irq_desc *desc;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- if (assign_irq_vector(irq, mask))
- return;
-
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ cfg = desc->chip_data;
dmar_msi_read(irq, &msg);
@@ -3352,9 +3670,8 @@ static void dmar_msi_set_affinity(unsign
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
dmar_msi_write(irq, &msg);
- desc = irq_to_desc(irq);
- desc->affinity = mask;
}
+
#endif /* CONFIG_SMP */
struct irq_chip dmar_msi_type = {
@@ -3386,24 +3703,18 @@ int arch_setup_dmar_msi(unsigned int irq
#ifdef CONFIG_HPET_TIMER
#ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
- struct irq_desc *desc;
struct msi_msg msg;
unsigned int dest;
- cpumask_t tmp;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- if (assign_irq_vector(irq, mask))
- return;
-
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ cfg = desc->chip_data;
hpet_msi_read(irq, &msg);
@@ -3413,9 +3724,8 @@ static void hpet_msi_set_affinity(unsign
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
hpet_msi_write(irq, &msg);
- desc = irq_to_desc(irq);
- desc->affinity = mask;
}
+
#endif /* CONFIG_SMP */
struct irq_chip hpet_msi_type = {
@@ -3468,28 +3778,21 @@ static void target_ht_irq(unsigned int i
write_ht_irq_msg(irq, &msg);
}
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
unsigned int dest;
- cpumask_t tmp;
- struct irq_desc *desc;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- if (assign_irq_vector(irq, mask))
- return;
-
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ cfg = desc->chip_data;
target_ht_irq(irq, dest, cfg->vector);
- desc = irq_to_desc(irq);
- desc->affinity = mask;
}
+
#endif
static struct irq_chip ht_irq_chip = {
@@ -3507,17 +3810,14 @@ int arch_setup_ht_irq(unsigned int irq,
{
struct irq_cfg *cfg;
int err;
- cpumask_t tmp;
- tmp = TARGET_CPUS;
- err = assign_irq_vector(irq, tmp);
+ cfg = irq_cfg(irq);
+ err = assign_irq_vector(irq, cfg, TARGET_CPUS);
if (!err) {
struct ht_irq_msg msg;
unsigned dest;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, tmp);
- dest = cpu_mask_to_apicid(tmp);
+ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
@@ -3553,7 +3853,7 @@ int arch_setup_ht_irq(unsigned int irq,
int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
unsigned long mmr_offset)
{
- const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
+ const struct cpumask *eligible_cpu = cpumask_of(cpu);
struct irq_cfg *cfg;
int mmr_pnode;
unsigned long mmr_value;
@@ -3561,7 +3861,9 @@ int arch_enable_uv_irq(char *irq_name, u
unsigned long flags;
int err;
- err = assign_irq_vector(irq, *eligible_cpu);
+ cfg = irq_cfg(irq);
+
+ err = assign_irq_vector(irq, cfg, eligible_cpu);
if (err != 0)
return err;
@@ -3570,8 +3872,6 @@ int arch_enable_uv_irq(char *irq_name, u
irq_name);
spin_unlock_irqrestore(&vector_lock, flags);
- cfg = irq_cfg(irq);
-
mmr_value = 0;
entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3582,7 +3882,7 @@ int arch_enable_uv_irq(char *irq_name, u
entry->polarity = 0;
entry->trigger = 0;
entry->mask = 0;
- entry->dest = cpu_mask_to_apicid(*eligible_cpu);
+ entry->dest = cpu_mask_to_apicid(eligible_cpu);
mmr_pnode = uv_blade_to_pnode(mmr_blade);
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3623,10 +3923,29 @@ int __init io_apic_get_redir_entries (in
return reg_01.bits.entries;
}
-int __init probe_nr_irqs(void)
+#ifndef CONFIG_XEN
+void __init probe_nr_irqs_gsi(void)
{
- return NR_IRQS;
+ int nr = 0;
+
+ nr = acpi_probe_gsi();
+ if (nr > nr_irqs_gsi) {
+ nr_irqs_gsi = nr;
+ } else {
+ /* for acpi=off or acpi is not compiled in */
+ int idx;
+
+ nr = 0;
+ for (idx = 0; idx < nr_ioapics; idx++)
+ nr += io_apic_get_redir_entries(idx) + 1;
+
+ if (nr > nr_irqs_gsi)
+ nr_irqs_gsi = nr;
+ }
+
+ printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
}
+#endif
/* --------------------------------------------------------------------------
ACPI-based IOAPIC Configuration
@@ -3726,6 +4045,10 @@ int __init io_apic_get_version(int ioapi
int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
{
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ int cpu = boot_cpu_id;
+
#ifdef CONFIG_XEN
if (irq < PIRQ_BASE || irq >= PIRQ_BASE + NR_PIRQS) {
apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n",
@@ -3740,13 +4063,21 @@ int io_apic_set_pci_routing (int ioapic,
return -EINVAL;
}
+ desc = irq_to_desc_alloc_cpu(irq, cpu);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc %d\n", irq);
+ return 0;
+ }
+
/*
* IRQs < 16 are already in the irq_2_pin[] map
*/
- if (irq >= 16)
- add_pin_to_irq(irq, ioapic, pin);
+ if (irq >= NR_IRQS_LEGACY) {
+ cfg = desc->chip_data;
+ add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
+ }
- setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+ setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
return 0;
}
@@ -3785,7 +4116,7 @@ void __init setup_ioapic_dest(void)
int pin, ioapic, irq, irq_entry;
struct irq_desc *desc;
struct irq_cfg *cfg;
- cpumask_t mask;
+ const struct cpumask *mask;
if (skip_ioapic_setup == 1)
return;
@@ -3801,9 +4132,10 @@ void __init setup_ioapic_dest(void)
* when you have too many devices, because at that time only boot
* cpu is online.
*/
- cfg = irq_cfg(irq);
+ desc = irq_to_desc(irq);
+ cfg = desc->chip_data;
if (!cfg->vector) {
- setup_IO_APIC_irq(ioapic, pin, irq,
+ setup_IO_APIC_irq(ioapic, pin, irq, desc,
irq_trigger(irq_entry),
irq_polarity(irq_entry));
continue;
@@ -3813,19 +4145,18 @@ void __init setup_ioapic_dest(void)
/*
* Honour affinities which have been set in early boot
*/
- desc = irq_to_desc(irq);
if (desc->status &
(IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- mask = desc->affinity;
+ mask = &desc->affinity;
else
mask = TARGET_CPUS;
#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
- set_ir_ioapic_affinity_irq(irq, mask);
+ set_ir_ioapic_affinity_irq_desc(desc, mask);
else
#endif
- set_ioapic_affinity_irq(irq, mask);
+ set_ioapic_affinity_irq_desc(desc, mask);
}
}
@@ -3874,7 +4205,6 @@ void __init ioapic_init_mappings(void)
struct resource *ioapic_res;
int i;
- irq_2_pin_init();
ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
--- head-2011-03-17.orig/arch/x86/kernel/ioport-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/ioport-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -36,7 +36,7 @@ static void set_bitmap(unsigned long *bi
*/
asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
{
- struct thread_struct * t = &current->thread;
+ struct thread_struct *t = &current->thread;
struct physdev_set_iobitmap set_iobitmap;
if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
--- head-2011-03-17.orig/arch/x86/kernel/apic/ipi-xen.c 2011-02-21 13:56:51.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/apic/ipi-xen.c 2011-02-21 13:56:59.000000000 +0100
@@ -40,21 +40,29 @@ void send_IPI_self(int vector)
__send_IPI_shortcut(APIC_DEST_SELF, vector);
}
-void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
{
- cpumask_t mask;
unsigned int cpu;
- cpus_andnot(mask, cpumask, cpu_online_map);
- WARN_ON(!cpus_empty(mask));
- for_each_online_cpu(cpu)
- if (cpu_isset(cpu, cpumask))
- __send_IPI_one(cpu, vector);
+ WARN_ON(!cpumask_subset(cpumask, cpu_online_mask));
+ for_each_cpu_and(cpu, cpumask, cpu_online_mask)
+ __send_IPI_one(cpu, vector);
}
-void send_IPI_mask_sequence(cpumask_t mask, int vector)
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
{
send_IPI_mask_bitmask(mask, vector);
}
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ unsigned int query_cpu;
+ unsigned int this_cpu = smp_processor_id();
+
+ WARN_ON(!cpumask_subset(mask, cpu_online_mask));
+ for_each_cpu_and(query_cpu, mask, cpu_online_mask)
+ if (query_cpu != this_cpu)
+ __send_IPI_one(query_cpu, vector);
+}
+
#endif
--- head-2011-03-17.orig/arch/x86/kernel/irq-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/irq-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -5,10 +5,11 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/seq_file.h>
+#include <linux/smp.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
-#include <asm/smp.h>
+#include <asm/irq.h>
atomic_t irq_err_count;
@@ -43,62 +44,62 @@ void ack_bad_irq(unsigned int irq)
/*
* /proc/interrupts printing:
*/
-static int show_other_interrupts(struct seq_file *p)
+static int show_other_interrupts(struct seq_file *p, int prec)
{
int j;
- seq_printf(p, "NMI: ");
+ seq_printf(p, "%*s: ", prec, "NMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
seq_printf(p, " Non-maskable interrupts\n");
#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "LOC: ");
+ seq_printf(p, "%*s: ", prec, "LOC");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
seq_printf(p, " Local timer interrupts\n");
#endif
#ifdef CONFIG_SMP
- seq_printf(p, "RES: ");
+ seq_printf(p, "%*s: ", prec, "RES");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
seq_printf(p, " Rescheduling interrupts\n");
- seq_printf(p, "CAL: ");
+ seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
seq_printf(p, " Function call interrupts\n");
#ifndef CONFIG_XEN
- seq_printf(p, "TLB: ");
+ seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
seq_printf(p, " TLB shootdowns\n");
#else
- seq_printf(p, "LCK: ");
+ seq_printf(p, "%*s: ", prec, "LCK");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_lock_count);
seq_printf(p, " Spinlock wakeups\n");
#endif
#endif
#ifdef CONFIG_X86_MCE
- seq_printf(p, "TRM: ");
+ seq_printf(p, "%*s: ", prec, "TRM");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
seq_printf(p, " Thermal event interrupts\n");
# ifdef CONFIG_X86_64
- seq_printf(p, "THR: ");
+ seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
seq_printf(p, " Threshold APIC interrupts\n");
# endif
#endif
#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "SPU: ");
+ seq_printf(p, "%*s: ", prec, "SPU");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
seq_printf(p, " Spurious interrupts\n");
#endif
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+ seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+ seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
#endif
return 0;
}
@@ -106,25 +107,31 @@ static int show_other_interrupts(struct
int show_interrupts(struct seq_file *p, void *v)
{
unsigned long flags, any_count = 0;
- int i = *(loff_t *) v, j;
+ int i = *(loff_t *) v, j, prec;
struct irqaction *action;
struct irq_desc *desc;
if (i > nr_irqs)
return 0;
+ for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+ j *= 10;
+
if (i == nr_irqs)
- return show_other_interrupts(p);
+ return show_other_interrupts(p, prec);
/* print header */
if (i == 0) {
- seq_printf(p, " ");
+ seq_printf(p, "%*s", prec + 8, "");
for_each_online_cpu(j)
seq_printf(p, "CPU%-8d", j);
seq_putc(p, '\n');
}
desc = irq_to_desc(i);
+ if (!desc)
+ return 0;
+
spin_lock_irqsave(&desc->lock, flags);
#ifndef CONFIG_SMP
any_count = kstat_irqs(i);
@@ -136,7 +143,7 @@ int show_interrupts(struct seq_file *p,
if (!action && !any_count)
goto out;
- seq_printf(p, "%3d: ", i);
+ seq_printf(p, "%*d: ", prec, i);
#ifndef CONFIG_SMP
seq_printf(p, "%10u ", kstat_irqs(i));
#else
--- head-2011-03-17.orig/arch/x86/kernel/ldt-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/ldt-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -12,8 +12,8 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
-#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/desc.h>
--- head-2011-03-17.orig/arch/x86/kernel/machine_kexec_32.c 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/machine_kexec_32.c 2011-02-01 14:42:26.000000000 +0100
@@ -46,6 +46,17 @@ static int machine_kexec_alloc_page_tabl
{
image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
#ifdef CONFIG_X86_PAE
+#ifdef CONFIG_XEN /* machine address must fit into xki->page_list[PA_PGD] */
+ if (image->arch.pgd) {
+ struct page *pg = virt_to_page(image->arch.pgd);
+
+ if (xen_limit_pages_to_max_mfn(pg, 0, BITS_PER_LONG) < 0) {
+ image->arch.pgd = NULL;
+ __free_page(pg);
+ return -ENOMEM;
+ }
+ }
+#endif
image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
#endif
@@ -123,13 +134,7 @@ void machine_kexec_setup_load_arg(xen_ke
memcpy(control_page, relocate_kernel, PAGE_SIZE);
xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
- xki->page_list[PA_PGD] = __ma(kexec_pgd);
-#ifdef CONFIG_X86_PAE
- xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
- xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
-#endif
- xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
- xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
+ xki->page_list[PA_PGD] = __ma(image->arch.pgd);
if (image->type == KEXEC_TYPE_DEFAULT)
xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
--- head-2011-03-17.orig/arch/x86/kernel/mpparse-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/mpparse-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -2,7 +2,7 @@
* Intel Multiprocessor Specification 1.1 and 1.4
* compliant MP-table parsing routines.
*
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
* (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
* (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
*/
@@ -16,18 +16,18 @@
#include <linux/bitops.h>
#include <linux/acpi.h>
#include <linux/module.h>
+#include <linux/smp.h>
-#include <asm/smp.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
#include <asm/pgalloc.h>
#include <asm/io_apic.h>
#include <asm/proto.h>
-#include <asm/acpi.h>
#include <asm/bios_ebda.h>
#include <asm/e820.h>
#include <asm/trampoline.h>
#include <asm/setup.h>
+#include <asm/smp.h>
#include <mach_apic.h>
#ifdef CONFIG_X86_32
@@ -54,13 +54,13 @@ static int __init mpf_checksum(unsigned
return sum & 0xFF;
}
-static void __init MP_processor_info(struct mpc_config_processor *m)
+static void __init MP_processor_info(struct mpc_cpu *m)
{
#ifndef CONFIG_XEN
int apicid;
char *bootup_cpu = "";
- if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+ if (!(m->cpuflag & CPU_ENABLED)) {
disabled_cpus++;
return;
}
@@ -68,57 +68,57 @@ static void __init MP_processor_info(str
if (x86_quirks->mpc_apic_id)
apicid = x86_quirks->mpc_apic_id(m);
else
- apicid = m->mpc_apicid;
+ apicid = m->apicid;
- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+ if (m->cpuflag & CPU_BOOTPROCESSOR) {
bootup_cpu = " (Bootup-CPU)";
- boot_cpu_physical_apicid = m->mpc_apicid;
+ boot_cpu_physical_apicid = m->apicid;
}
- printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
- generic_processor_info(apicid, m->mpc_apicver);
+ printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu);
+ generic_processor_info(apicid, m->apicver);
#else /* CONFIG_XEN */
num_processors++;
#endif
}
#ifdef CONFIG_X86_IO_APIC
-static void __init MP_bus_info(struct mpc_config_bus *m)
+static void __init MP_bus_info(struct mpc_bus *m)
{
char str[7];
- memcpy(str, m->mpc_bustype, 6);
+ memcpy(str, m->bustype, 6);
str[6] = 0;
if (x86_quirks->mpc_oem_bus_info)
x86_quirks->mpc_oem_bus_info(m, str);
else
- apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
+ apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
#if MAX_MP_BUSSES < 256
- if (m->mpc_busid >= MAX_MP_BUSSES) {
+ if (m->busid >= MAX_MP_BUSSES) {
printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
" is too large, max. supported is %d\n",
- m->mpc_busid, str, MAX_MP_BUSSES - 1);
+ m->busid, str, MAX_MP_BUSSES - 1);
return;
}
#endif
if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
- set_bit(m->mpc_busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
+ set_bit(m->busid, mp_bus_not_pci);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+ mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
if (x86_quirks->mpc_oem_pci_bus)
x86_quirks->mpc_oem_pci_bus(m);
- clear_bit(m->mpc_busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+ clear_bit(m->busid, mp_bus_not_pci);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+ mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+ mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
} else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
+ mp_bus_id_to_type[m->busid] = MP_BUS_MCA;
#endif
} else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
@@ -142,32 +142,31 @@ static int bad_ioapic(unsigned long addr
return 0;
}
-static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
+static void __init MP_ioapic_info(struct mpc_ioapic *m)
{
- if (!(m->mpc_flags & MPC_APIC_USABLE))
+ if (!(m->flags & MPC_APIC_USABLE))
return;
printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
- m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
+ m->apicid, m->apicver, m->apicaddr);
- if (bad_ioapic(m->mpc_apicaddr))
+ if (bad_ioapic(m->apicaddr))
return;
- mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
- mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
- mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
- mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
- mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
+ mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr;
+ mp_ioapics[nr_ioapics].mp_apicid = m->apicid;
+ mp_ioapics[nr_ioapics].mp_type = m->type;
+ mp_ioapics[nr_ioapics].mp_apicver = m->apicver;
+ mp_ioapics[nr_ioapics].mp_flags = m->flags;
nr_ioapics++;
}
-static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
+static void print_MP_intsrc_info(struct mpc_intsrc *m)
{
apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
- m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
+ m->srcbusirq, m->dstapic, m->dstirq);
}
static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
@@ -179,52 +178,52 @@ static void __init print_mp_irq_info(str
mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
}
-static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
+static void __init assign_to_mp_irq(struct mpc_intsrc *m,
struct mp_config_intsrc *mp_irq)
{
- mp_irq->mp_dstapic = m->mpc_dstapic;
- mp_irq->mp_type = m->mpc_type;
- mp_irq->mp_irqtype = m->mpc_irqtype;
- mp_irq->mp_irqflag = m->mpc_irqflag;
- mp_irq->mp_srcbus = m->mpc_srcbus;
- mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
- mp_irq->mp_dstirq = m->mpc_dstirq;
+ mp_irq->mp_dstapic = m->dstapic;
+ mp_irq->mp_type = m->type;
+ mp_irq->mp_irqtype = m->irqtype;
+ mp_irq->mp_irqflag = m->irqflag;
+ mp_irq->mp_srcbus = m->srcbus;
+ mp_irq->mp_srcbusirq = m->srcbusirq;
+ mp_irq->mp_dstirq = m->dstirq;
}
static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
- struct mpc_config_intsrc *m)
+ struct mpc_intsrc *m)
{
- m->mpc_dstapic = mp_irq->mp_dstapic;
- m->mpc_type = mp_irq->mp_type;
- m->mpc_irqtype = mp_irq->mp_irqtype;
- m->mpc_irqflag = mp_irq->mp_irqflag;
- m->mpc_srcbus = mp_irq->mp_srcbus;
- m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
- m->mpc_dstirq = mp_irq->mp_dstirq;
+ m->dstapic = mp_irq->mp_dstapic;
+ m->type = mp_irq->mp_type;
+ m->irqtype = mp_irq->mp_irqtype;
+ m->irqflag = mp_irq->mp_irqflag;
+ m->srcbus = mp_irq->mp_srcbus;
+ m->srcbusirq = mp_irq->mp_srcbusirq;
+ m->dstirq = mp_irq->mp_dstirq;
}
static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
- struct mpc_config_intsrc *m)
+ struct mpc_intsrc *m)
{
- if (mp_irq->mp_dstapic != m->mpc_dstapic)
+ if (mp_irq->mp_dstapic != m->dstapic)
return 1;
- if (mp_irq->mp_type != m->mpc_type)
+ if (mp_irq->mp_type != m->type)
return 2;
- if (mp_irq->mp_irqtype != m->mpc_irqtype)
+ if (mp_irq->mp_irqtype != m->irqtype)
return 3;
- if (mp_irq->mp_irqflag != m->mpc_irqflag)
+ if (mp_irq->mp_irqflag != m->irqflag)
return 4;
- if (mp_irq->mp_srcbus != m->mpc_srcbus)
+ if (mp_irq->mp_srcbus != m->srcbus)
return 5;
- if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
+ if (mp_irq->mp_srcbusirq != m->srcbusirq)
return 6;
- if (mp_irq->mp_dstirq != m->mpc_dstirq)
+ if (mp_irq->mp_dstirq != m->dstirq)
return 7;
return 0;
}
-static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+static void __init MP_intsrc_info(struct mpc_intsrc *m)
{
int i;
@@ -242,59 +241,57 @@ static void __init MP_intsrc_info(struct
#endif
-static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
+static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
{
apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC LINT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid,
+ m->srcbusirq, m->destapic, m->destapiclint);
}
/*
* Read/parse the MPC
*/
-static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
- char *str)
+static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
{
- if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
+ if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) {
printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
- mpc->mpc_signature[0], mpc->mpc_signature[1],
- mpc->mpc_signature[2], mpc->mpc_signature[3]);
+ mpc->signature[0], mpc->signature[1],
+ mpc->signature[2], mpc->signature[3]);
return 0;
}
- if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) {
+ if (mpf_checksum((unsigned char *)mpc, mpc->length)) {
printk(KERN_ERR "MPTABLE: checksum error!\n");
return 0;
}
- if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) {
+ if (mpc->spec != 0x01 && mpc->spec != 0x04) {
printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
- mpc->mpc_spec);
+ mpc->spec);
return 0;
}
- if (!mpc->mpc_lapic) {
+ if (!mpc->lapic) {
printk(KERN_ERR "MPTABLE: null local APIC address!\n");
return 0;
}
- memcpy(oem, mpc->mpc_oem, 8);
+ memcpy(oem, mpc->oem, 8);
oem[8] = 0;
printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
- memcpy(str, mpc->mpc_productid, 12);
+ memcpy(str, mpc->productid, 12);
str[12] = 0;
printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
#ifndef CONFIG_XEN
- printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
+ printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic);
#endif
return 1;
}
-static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
{
char str[16];
char oem[10];
@@ -320,15 +317,15 @@ static int __init smp_read_mpc(struct mp
#ifndef CONFIG_XEN
/* save the local APIC address, it might be non-default */
if (!acpi_lapic)
- mp_lapic_addr = mpc->mpc_lapic;
+ mp_lapic_addr = mpc->lapic;
#endif
if (early)
return 1;
- if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
- struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
- x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
+ if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) {
+ struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr;
+ x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize);
}
/*
@@ -337,12 +334,11 @@ static int __init smp_read_mpc(struct mp
if (x86_quirks->mpc_record)
*x86_quirks->mpc_record = 0;
- while (count < mpc->mpc_length) {
+ while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
{
- struct mpc_config_processor *m =
- (struct mpc_config_processor *)mpt;
+ struct mpc_cpu *m = (struct mpc_cpu *)mpt;
/* ACPI may have already provided this data */
if (!acpi_lapic)
MP_processor_info(m);
@@ -352,8 +348,7 @@ static int __init smp_read_mpc(struct mp
}
case MP_BUS:
{
- struct mpc_config_bus *m =
- (struct mpc_config_bus *)mpt;
+ struct mpc_bus *m = (struct mpc_bus *)mpt;
#ifdef CONFIG_X86_IO_APIC
MP_bus_info(m);
#endif
@@ -364,30 +359,28 @@ static int __init smp_read_mpc(struct mp
case MP_IOAPIC:
{
#ifdef CONFIG_X86_IO_APIC
- struct mpc_config_ioapic *m =
- (struct mpc_config_ioapic *)mpt;
+ struct mpc_ioapic *m = (struct mpc_ioapic *)mpt;
MP_ioapic_info(m);
#endif
- mpt += sizeof(struct mpc_config_ioapic);
- count += sizeof(struct mpc_config_ioapic);
+ mpt += sizeof(struct mpc_ioapic);
+ count += sizeof(struct mpc_ioapic);
break;
}
case MP_INTSRC:
{
#ifdef CONFIG_X86_IO_APIC
- struct mpc_config_intsrc *m =
- (struct mpc_config_intsrc *)mpt;
+ struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
MP_intsrc_info(m);
#endif
- mpt += sizeof(struct mpc_config_intsrc);
- count += sizeof(struct mpc_config_intsrc);
+ mpt += sizeof(struct mpc_intsrc);
+ count += sizeof(struct mpc_intsrc);
break;
}
case MP_LINTSRC:
{
- struct mpc_config_lintsrc *m =
- (struct mpc_config_lintsrc *)mpt;
+ struct mpc_lintsrc *m =
+ (struct mpc_lintsrc *)mpt;
MP_lintsrc_info(m);
mpt += sizeof(*m);
count += sizeof(*m);
@@ -398,8 +391,8 @@ static int __init smp_read_mpc(struct mp
printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
printk(KERN_ERR "type %x\n", *mpt);
print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
- 1, mpc, mpc->mpc_length, 1);
- count = mpc->mpc_length;
+ 1, mpc, mpc->length, 1);
+ count = mpc->length;
break;
}
if (x86_quirks->mpc_record)
@@ -430,16 +423,16 @@ static int __init ELCR_trigger(unsigned
static void __init construct_default_ioirq_mptable(int mpc_default_type)
{
- struct mpc_config_intsrc intsrc;
+ struct mpc_intsrc intsrc;
int i;
int ELCR_fallback = 0;
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* conforming */
- intsrc.mpc_srcbus = 0;
- intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
+ intsrc.type = MP_INTSRC;
+ intsrc.irqflag = 0; /* conforming */
+ intsrc.srcbus = 0;
+ intsrc.dstapic = mp_ioapics[0].mp_apicid;
- intsrc.mpc_irqtype = mp_INT;
+ intsrc.irqtype = mp_INT;
/*
* If true, we have an ISA/PCI system with no IRQ entries
@@ -482,30 +475,30 @@ static void __init construct_default_ioi
* irqflag field (level sensitive, active high polarity).
*/
if (ELCR_trigger(i))
- intsrc.mpc_irqflag = 13;
+ intsrc.irqflag = 13;
else
- intsrc.mpc_irqflag = 0;
+ intsrc.irqflag = 0;
}
- intsrc.mpc_srcbusirq = i;
- intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
+ intsrc.srcbusirq = i;
+ intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
MP_intsrc_info(&intsrc);
}
- intsrc.mpc_irqtype = mp_ExtINT;
- intsrc.mpc_srcbusirq = 0;
- intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
+ intsrc.irqtype = mp_ExtINT;
+ intsrc.srcbusirq = 0;
+ intsrc.dstirq = 0; /* 8259A to INTIN0 */
MP_intsrc_info(&intsrc);
}
static void __init construct_ioapic_table(int mpc_default_type)
{
- struct mpc_config_ioapic ioapic;
- struct mpc_config_bus bus;
+ struct mpc_ioapic ioapic;
+ struct mpc_bus bus;
- bus.mpc_type = MP_BUS;
- bus.mpc_busid = 0;
+ bus.type = MP_BUS;
+ bus.busid = 0;
switch (mpc_default_type) {
default:
printk(KERN_ERR "???\nUnknown standard configuration %d\n",
@@ -513,29 +506,29 @@ static void __init construct_ioapic_tabl
/* fall through */
case 1:
case 5:
- memcpy(bus.mpc_bustype, "ISA ", 6);
+ memcpy(bus.bustype, "ISA ", 6);
break;
case 2:
case 6:
case 3:
- memcpy(bus.mpc_bustype, "EISA ", 6);
+ memcpy(bus.bustype, "EISA ", 6);
break;
case 4:
case 7:
- memcpy(bus.mpc_bustype, "MCA ", 6);
+ memcpy(bus.bustype, "MCA ", 6);
}
MP_bus_info(&bus);
if (mpc_default_type > 4) {
- bus.mpc_busid = 1;
- memcpy(bus.mpc_bustype, "PCI ", 6);
+ bus.busid = 1;
+ memcpy(bus.bustype, "PCI ", 6);
MP_bus_info(&bus);
}
- ioapic.mpc_type = MP_IOAPIC;
- ioapic.mpc_apicid = 2;
- ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
- ioapic.mpc_flags = MPC_APIC_USABLE;
- ioapic.mpc_apicaddr = 0xFEC00000;
+ ioapic.type = MP_IOAPIC;
+ ioapic.apicid = 2;
+ ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ ioapic.flags = MPC_APIC_USABLE;
+ ioapic.apicaddr = 0xFEC00000;
MP_ioapic_info(&ioapic);
/*
@@ -549,8 +542,8 @@ static inline void __init construct_ioap
static inline void __init construct_default_ISA_mptable(int mpc_default_type)
{
- struct mpc_config_processor processor;
- struct mpc_config_lintsrc lintsrc;
+ struct mpc_cpu processor;
+ struct mpc_lintsrc lintsrc;
int linttypes[2] = { mp_ExtINT, mp_NMI };
int i;
@@ -564,30 +557,30 @@ static inline void __init construct_defa
/*
* 2 CPUs, numbered 0 & 1.
*/
- processor.mpc_type = MP_PROCESSOR;
+ processor.type = MP_PROCESSOR;
/* Either an integrated APIC or a discrete 82489DX. */
- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
- processor.mpc_cpuflag = CPU_ENABLED;
- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+ processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ processor.cpuflag = CPU_ENABLED;
+ processor.cpufeature = (boot_cpu_data.x86 << 8) |
(boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
- processor.mpc_reserved[0] = 0;
- processor.mpc_reserved[1] = 0;
+ processor.featureflag = boot_cpu_data.x86_capability[0];
+ processor.reserved[0] = 0;
+ processor.reserved[1] = 0;
for (i = 0; i < 2; i++) {
- processor.mpc_apicid = i;
+ processor.apicid = i;
MP_processor_info(&processor);
}
construct_ioapic_table(mpc_default_type);
- lintsrc.mpc_type = MP_LINTSRC;
- lintsrc.mpc_irqflag = 0; /* conforming */
- lintsrc.mpc_srcbusid = 0;
- lintsrc.mpc_srcbusirq = 0;
- lintsrc.mpc_destapic = MP_APIC_ALL;
+ lintsrc.type = MP_LINTSRC;
+ lintsrc.irqflag = 0; /* conforming */
+ lintsrc.srcbusid = 0;
+ lintsrc.srcbusirq = 0;
+ lintsrc.destapic = MP_APIC_ALL;
for (i = 0; i < 2; i++) {
- lintsrc.mpc_irqtype = linttypes[i];
- lintsrc.mpc_destapiclint = i;
+ lintsrc.irqtype = linttypes[i];
+ lintsrc.destapiclint = i;
MP_lintsrc_info(&lintsrc);
}
}
@@ -606,26 +599,23 @@ void __init get_smp_config(void)
{
struct intel_mp_floating *mpf = mpf_found;
- if (x86_quirks->mach_get_smp_config) {
- if (x86_quirks->mach_get_smp_config(early))
- return;
- }
+ if (!mpf)
+ return;
+
if (acpi_lapic && early)
return;
+
/*
- * ACPI supports both logical (e.g. Hyper-Threading) and physical
- * processors, where MPS only supports physical.
+ * MPS doesn't support hyperthreading, aka only have
+ * thread 0 apic id in MPS table
*/
- if (acpi_lapic && acpi_ioapic) {
- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
- "information\n");
+ if (acpi_lapic && acpi_ioapic)
return;
- } else if (acpi_lapic)
- printk(KERN_INFO "Using ACPI for processor (LAPIC) "
- "configuration information\n");
- if (!mpf)
- return;
+ if (x86_quirks->mach_get_smp_config) {
+ if (x86_quirks->mach_get_smp_config(early))
+ return;
+ }
printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
mpf->mpf_specification);
@@ -682,15 +672,15 @@ void __init get_smp_config(void)
* ISA defaults and hope it will work.
*/
if (!mp_irq_entries) {
- struct mpc_config_bus bus;
+ struct mpc_bus bus;
printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
"using default mptable. "
"(tell your hw vendor)\n");
- bus.mpc_type = MP_BUS;
- bus.mpc_busid = 0;
- memcpy(bus.mpc_bustype, "ISA ", 6);
+ bus.type = MP_BUS;
+ bus.busid = 0;
+ memcpy(bus.bustype, "ISA ", 6);
MP_bus_info(&bus);
construct_default_ioirq_mptable(0);
@@ -839,14 +829,14 @@ void __init find_smp_config(void)
#ifdef CONFIG_X86_IO_APIC
static u8 __initdata irq_used[MAX_IRQ_SOURCES];
-static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
+static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
{
int i;
- if (m->mpc_irqtype != mp_INT)
+ if (m->irqtype != mp_INT)
return 0;
- if (m->mpc_irqflag != 0x0f)
+ if (m->irqflag != 0x0f)
return 0;
/* not legacy */
@@ -858,9 +848,9 @@ static int __init get_MP_intsrc_index(s
if (mp_irqs[i].mp_irqflag != 0x0f)
continue;
- if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
+ if (mp_irqs[i].mp_srcbus != m->srcbus)
continue;
- if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
+ if (mp_irqs[i].mp_srcbusirq != m->srcbusirq)
continue;
if (irq_used[i]) {
/* already claimed */
@@ -876,10 +866,10 @@ static int __init get_MP_intsrc_index(s
#define SPARE_SLOT_NUM 20
-static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
+static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
#endif
-static int __init replace_intsrc_all(struct mp_config_table *mpc,
+static int __init replace_intsrc_all(struct mpc_table *mpc,
unsigned long mpc_new_phys,
unsigned long mpc_new_length)
{
@@ -891,36 +881,33 @@ static int __init replace_intsrc_all(st
int count = sizeof(*mpc);
unsigned char *mpt = ((unsigned char *)mpc) + count;
- printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
- while (count < mpc->mpc_length) {
+ printk(KERN_INFO "mpc_length %x\n", mpc->length);
+ while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
{
- struct mpc_config_processor *m =
- (struct mpc_config_processor *)mpt;
+ struct mpc_cpu *m = (struct mpc_cpu *)mpt;
mpt += sizeof(*m);
count += sizeof(*m);
break;
}
case MP_BUS:
{
- struct mpc_config_bus *m =
- (struct mpc_config_bus *)mpt;
+ struct mpc_bus *m = (struct mpc_bus *)mpt;
mpt += sizeof(*m);
count += sizeof(*m);
break;
}
case MP_IOAPIC:
{
- mpt += sizeof(struct mpc_config_ioapic);
- count += sizeof(struct mpc_config_ioapic);
+ mpt += sizeof(struct mpc_ioapic);
+ count += sizeof(struct mpc_ioapic);
break;
}
case MP_INTSRC:
{
#ifdef CONFIG_X86_IO_APIC
- struct mpc_config_intsrc *m =
- (struct mpc_config_intsrc *)mpt;
+ struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
apic_printk(APIC_VERBOSE, "OLD ");
print_MP_intsrc_info(m);
@@ -941,14 +928,14 @@ static int __init replace_intsrc_all(st
nr_m_spare++;
}
#endif
- mpt += sizeof(struct mpc_config_intsrc);
- count += sizeof(struct mpc_config_intsrc);
+ mpt += sizeof(struct mpc_intsrc);
+ count += sizeof(struct mpc_intsrc);
break;
}
case MP_LINTSRC:
{
- struct mpc_config_lintsrc *m =
- (struct mpc_config_lintsrc *)mpt;
+ struct mpc_lintsrc *m =
+ (struct mpc_lintsrc *)mpt;
mpt += sizeof(*m);
count += sizeof(*m);
break;
@@ -958,7 +945,7 @@ static int __init replace_intsrc_all(st
printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
printk(KERN_ERR "type %x\n", *mpt);
print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
- 1, mpc, mpc->mpc_length, 1);
+ 1, mpc, mpc->length, 1);
goto out;
}
}
@@ -980,9 +967,8 @@ static int __init replace_intsrc_all(st
assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
m_spare[nr_m_spare] = NULL;
} else {
- struct mpc_config_intsrc *m =
- (struct mpc_config_intsrc *)mpt;
- count += sizeof(struct mpc_config_intsrc);
+ struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
+ count += sizeof(struct mpc_intsrc);
if (!mpc_new_phys) {
printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
} else {
@@ -994,17 +980,16 @@ static int __init replace_intsrc_all(st
}
}
assign_to_mpc_intsrc(&mp_irqs[i], m);
- mpc->mpc_length = count;
- mpt += sizeof(struct mpc_config_intsrc);
+ mpc->length = count;
+ mpt += sizeof(struct mpc_intsrc);
}
print_mp_irq_info(&mp_irqs[i]);
}
#endif
out:
/* update checksum */
- mpc->mpc_checksum = 0;
- mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
- mpc->mpc_length);
+ mpc->checksum = 0;
+ mpc->checksum -= mpf_checksum((unsigned char *)mpc, mpc->length);
return 0;
}
@@ -1050,8 +1035,7 @@ static int __init update_mp_table(void)
char str[16];
char oem[10];
struct intel_mp_floating *mpf;
- struct mp_config_table *mpc;
- struct mp_config_table *mpc_new;
+ struct mpc_table *mpc, *mpc_new;
if (!enable_update_mptable)
return 0;
@@ -1077,7 +1061,7 @@ static int __init update_mp_table(void)
printk(KERN_INFO "mpf: %lx\n", (long)arbitrary_virt_to_machine(mpf));
printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
- if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
+ if (mpc_new_phys && mpc->length > mpc_new_length) {
mpc_new_phys = 0;
printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
mpc_new_length);
@@ -1086,10 +1070,10 @@ static int __init update_mp_table(void)
if (!mpc_new_phys) {
unsigned char old, new;
/* check if we can change the postion */
- mpc->mpc_checksum = 0;
- old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
- mpc->mpc_checksum = 0xff;
- new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+ mpc->checksum = 0;
+ old = mpf_checksum((unsigned char *)mpc, mpc->length);
+ mpc->checksum = 0xff;
+ new = mpf_checksum((unsigned char *)mpc, mpc->length);
if (old == new) {
printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
return 0;
@@ -1101,7 +1085,7 @@ static int __init update_mp_table(void)
mpc_new_bus = phys_to_machine(mpc_new_phys);
mpf->mpf_physptr = mpc_new_bus;
mpc_new = phys_to_virt(mpc_new_phys);
- memcpy(mpc_new, mpc, mpc->mpc_length);
+ memcpy(mpc_new, mpc, mpc->length);
mpc = mpc_new;
/* check if we can modify that */
if (mpc_new_bus - mpf->mpf_physptr) {
--- head-2011-03-17.orig/arch/x86/kernel/pci-dma-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/pci-dma-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -6,6 +6,7 @@
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/iommu.h>
+#include <asm/gart.h>
#include <asm/calgary.h>
#include <asm/amd_iommu.h>
@@ -30,11 +31,6 @@ int no_iommu __read_mostly;
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;
-/* This tells the BIO block layer to assume merging. Default to off
- because we cannot guarantee merging later. */
-int iommu_bio_merge __read_mostly = 0;
-EXPORT_SYMBOL(iommu_bio_merge);
-
dma_addr_t bad_dma_address __read_mostly = 0;
EXPORT_SYMBOL(bad_dma_address);
@@ -42,7 +38,7 @@ EXPORT_SYMBOL(bad_dma_address);
be probably a smaller DMA mask, but this is bug-to-bug compatible
to older i386. */
struct device x86_dma_fallback_dev = {
- .bus_id = "fallback device",
+ .init_name = "fallback device",
.coherent_dma_mask = DMA_32BIT_MASK,
.dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
};
@@ -105,8 +101,6 @@ static void __init dma32_free_bootmem(vo
dma32_bootmem_ptr = NULL;
dma32_bootmem_size = 0;
}
-#else
-#define dma32_free_bootmem() ((void)0)
#endif
static struct dma_mapping_ops swiotlb_dma_ops = {
@@ -128,8 +122,11 @@ static struct dma_mapping_ops swiotlb_dm
void __init pci_iommu_alloc(void)
{
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
+#endif
+
/*
* The order of these functions is important for
* fall-back/fail-over reasons
@@ -149,16 +146,6 @@ void __init pci_iommu_alloc(void)
}
}
-#ifndef CONFIG_XEN
-unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
-{
- unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
-
- return size >> PAGE_SHIFT;
-}
-EXPORT_SYMBOL(iommu_nr_pages);
-#endif
-
void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag)
{
@@ -246,7 +233,6 @@ static __init int iommu_setup(char *p)
}
if (!strncmp(p, "biomerge", 8)) {
- iommu_bio_merge = 4096;
iommu_merge = 1;
force_iommu = 1;
}
@@ -385,8 +371,8 @@ fs_initcall(pci_iommu_init);
static __devinit void via_no_dac(struct pci_dev *dev)
{
if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
- printk(KERN_INFO "PCI: VIA PCI bridge detected."
- "Disabling DAC.\n");
+ printk(KERN_INFO
+ "PCI: VIA PCI bridge detected. Disabling DAC.\n");
forbid_dac = 1;
}
}
--- head-2011-03-17.orig/arch/x86/kernel/process-xen.c 2011-03-03 16:00:33.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/process-xen.c 2011-03-03 16:05:57.000000000 +0100
@@ -1,13 +1,17 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <asm/idle.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
+#include <linux/ftrace.h>
#include <asm/system.h>
+#include <asm/apic.h>
+#include <xen/evtchn.h>
unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt);
@@ -70,6 +74,9 @@ EXPORT_SYMBOL(pm_idle);
*/
void xen_idle(void)
{
+ struct power_trace it;
+
+ trace_power_start(&it, POWER_CSTATE, 1);
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -82,11 +89,27 @@ void xen_idle(void)
else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
+ trace_power_end(&it);
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif
+void stop_this_cpu(void *dummy)
+{
+ local_irq_disable();
+ /*
+ * Remove this CPU:
+ */
+ cpu_clear(smp_processor_id(), cpu_online_map);
+ disable_all_local_evtchn();
+
+ for (;;) {
+ if (hlt_works(smp_processor_id()))
+ halt();
+ }
+}
+
static void do_nothing(void *unused)
{
}
@@ -120,24 +143,37 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
+ struct power_trace it;
+
+ trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
if (!need_resched()) {
+ if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
+ clflush((void *)&current_thread_info()->flags);
+
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__mwait(ax, cx);
}
+ trace_power_end(&it);
}
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
+ struct power_trace it;
if (!need_resched()) {
+ trace_power_start(&it, POWER_CSTATE, 1);
+ if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
+ clflush((void *)&current_thread_info()->flags);
+
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__sti_mwait(0, 0);
else
local_irq_enable();
+ trace_power_end(&it);
} else
local_irq_enable();
}
@@ -150,9 +186,13 @@ static void mwait_idle(void)
*/
static void poll_idle(void)
{
+ struct power_trace it;
+
+ trace_power_start(&it, POWER_CSTATE, 0);
local_irq_enable();
while (!need_resched())
cpu_relax();
+ trace_power_end(&it);
}
#ifndef CONFIG_XEN
@@ -238,7 +278,7 @@ static void c1e_idle(void)
rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
if (lo & K8_INTP_C1E_ACTIVE_MASK) {
c1e_detected = 1;
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halt in AMD C1E");
printk(KERN_INFO "System has AMD C1E enabled\n");
set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
--- head-2011-03-17.orig/arch/x86/kernel/process_32-xen.c 2011-02-02 08:34:28.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/process_32-xen.c 2011-02-02 08:36:38.000000000 +0100
@@ -38,11 +38,13 @@
#include <linux/percpu.h>
#include <linux/prctl.h>
#include <linux/dmi.h>
+#include <linux/ftrace.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/kdebug.h>
-#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
-#include <asm/io.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/i387.h>
@@ -57,10 +59,9 @@
#include <asm/tlbflush.h>
#include <asm/cpu.h>
-#include <asm/kdebug.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
-#include <asm/smp.h>
+#include <asm/ds.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
@@ -106,9 +107,6 @@ void cpu_idle(void)
check_pgt_cache();
rmb();
- if (rcu_pending(cpu))
- rcu_check_callbacks(cpu, 0);
-
if (cpu_is_offline(cpu))
play_dead();
@@ -206,7 +204,7 @@ extern void kernel_thread_helper(void);
/*
* Create a kernel thread
*/
-int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
struct pt_regs regs;
@@ -245,14 +243,8 @@ void exit_thread(void)
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);
}
-#ifdef CONFIG_X86_DS
- /* Free any DS contexts that have not been properly released. */
- if (unlikely(current->thread.ds_ctx)) {
- /* we clear debugctl to make sure DS is not used. */
- update_debugctlmsr(0);
- ds_free(current->thread.ds_ctx);
- }
-#endif /* CONFIG_X86_DS */
+
+ ds_exit_thread(current);
}
void flush_thread(void)
@@ -265,7 +257,7 @@ void flush_thread(void)
tsk->thread.debugreg3 = 0;
tsk->thread.debugreg6 = 0;
tsk->thread.debugreg7 = 0;
- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
clear_tsk_thread_flag(tsk, TIF_DEBUG);
/*
* Forget coprocessor state..
@@ -292,9 +284,9 @@ void prepare_to_copy(struct task_struct
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
unsigned long unused,
- struct task_struct * p, struct pt_regs * regs)
+ struct task_struct *p, struct pt_regs *regs)
{
- struct pt_regs * childregs;
+ struct pt_regs *childregs;
struct task_struct *tsk;
int err;
@@ -338,13 +330,19 @@ int copy_thread(int nr, unsigned long cl
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
+
+ ds_copy_thread(p, current);
+
+ clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
+ p->thread.debugctlmsr = 0;
+
return err;
}
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- __asm__("movl %0, %%gs" :: "r"(0));
+ __asm__("movl %0, %%gs" : : "r"(0));
regs->fs = 0;
set_fs(USER_DS);
regs->ds = __USER_DS;
@@ -418,47 +416,18 @@ int set_tsc_mode(unsigned int val)
return 0;
}
-#ifdef CONFIG_X86_DS
-static int update_debugctl(struct thread_struct *prev,
- struct thread_struct *next, unsigned long debugctl)
-{
- unsigned long ds_prev = 0;
- unsigned long ds_next = 0;
-
- if (prev->ds_ctx)
- ds_prev = (unsigned long)prev->ds_ctx->ds;
- if (next->ds_ctx)
- ds_next = (unsigned long)next->ds_ctx->ds;
-
- if (ds_next != ds_prev) {
- /* we clear debugctl to make sure DS
- * is not in use when we change it */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
- }
- return debugctl;
-}
-#else
-static int update_debugctl(struct thread_struct *prev,
- struct thread_struct *next, unsigned long debugctl)
-{
- return debugctl;
-}
-#endif /* CONFIG_X86_DS */
-
static noinline void
__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev, *next;
- unsigned long debugctl;
prev = &prev_p->thread;
next = &next_p->thread;
- debugctl = update_debugctl(prev, next, prev->debugctlmsr);
-
- if (next->debugctlmsr != debugctl)
+ if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+ test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+ ds_switch_to(prev_p, next_p);
+ else if (next->debugctlmsr != prev->debugctlmsr)
update_debugctlmsr(next->debugctlmsr);
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -479,14 +448,6 @@ __switch_to_xtra(struct task_struct *pre
else
hard_enable_TSC();
}
-
-#ifdef CONFIG_X86_PTRACE_BTS
- if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
-
- if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
}
/*
@@ -516,7 +477,8 @@ __switch_to_xtra(struct task_struct *pre
* the task-switch, and shows up in ret_from_fork in entry.S,
* for example.
*/
-struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+__notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
@@ -696,7 +658,7 @@ asmlinkage int sys_vfork(struct pt_regs
asmlinkage int sys_execve(struct pt_regs regs)
{
int error;
- char * filename;
+ char *filename;
filename = getname((char __user *) regs.bx);
error = PTR_ERR(filename);
--- head-2011-03-17.orig/arch/x86/kernel/process_64-xen.c 2011-02-02 08:34:22.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/process_64-xen.c 2011-02-02 08:36:43.000000000 +0100
@@ -42,6 +42,8 @@
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
+#include <linux/ftrace.h>
+#include <linux/dmi.h>
#include <asm/pgtable.h>
#include <asm/system.h>
@@ -57,6 +59,7 @@
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
+#include <asm/ds.h>
asmlinkage extern void ret_from_fork(void);
@@ -154,14 +157,18 @@ void __show_regs(struct pt_regs *regs, i
unsigned long d0, d1, d2, d3, d6, d7;
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
+ const char *board;
printk("\n");
print_modules();
- printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
+ board = dmi_get_system_info(DMI_PRODUCT_NAME);
+ if (!board)
+ board = "";
+ printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
+ init_utsname()->version, board);
printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
printk_address(regs->ip, 1);
printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
@@ -252,14 +259,8 @@ void exit_thread(void)
#endif
t->io_bitmap_max = 0;
}
-#ifdef CONFIG_X86_DS
- /* Free any DS contexts that have not been properly released. */
- if (unlikely(t->ds_ctx)) {
- /* we clear debugctl to make sure DS is not used. */
- update_debugctlmsr(0);
- ds_free(t->ds_ctx);
- }
-#endif /* CONFIG_X86_DS */
+
+ ds_exit_thread(current);
}
void xen_load_gs_index(unsigned gs)
@@ -395,6 +396,11 @@ int copy_thread(int nr, unsigned long cl
}
p->thread.iopl = current->thread.iopl;
+ ds_copy_thread(p, me);
+
+ clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
+ p->thread.debugctlmsr = 0;
+
err = 0;
out:
if (err && p->thread.io_bitmap_ptr) {
@@ -491,35 +497,14 @@ static inline void __switch_to_xtra(stru
struct task_struct *next_p)
{
struct thread_struct *prev, *next;
- unsigned long debugctl;
prev = &prev_p->thread,
next = &next_p->thread;
- debugctl = prev->debugctlmsr;
-
-#ifdef CONFIG_X86_DS
- {
- unsigned long ds_prev = 0, ds_next = 0;
-
- if (prev->ds_ctx)
- ds_prev = (unsigned long)prev->ds_ctx->ds;
- if (next->ds_ctx)
- ds_next = (unsigned long)next->ds_ctx->ds;
-
- if (ds_next != ds_prev) {
- /*
- * We clear debugctl to make sure DS
- * is not in use when we change it:
- */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsrl(MSR_IA32_DS_AREA, ds_next);
- }
- }
-#endif /* CONFIG_X86_DS */
-
- if (next->debugctlmsr != debugctl)
+ if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+ test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+ ds_switch_to(prev_p, next_p);
+ else if (next->debugctlmsr != prev->debugctlmsr)
update_debugctlmsr(next->debugctlmsr);
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -540,14 +525,6 @@ static inline void __switch_to_xtra(stru
else
hard_enable_TSC();
}
-
-#ifdef CONFIG_X86_PTRACE_BTS
- if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
-
- if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
}
/*
@@ -558,8 +535,9 @@ static inline void __switch_to_xtra(stru
* - could test fs/gs bitsliced
*
* Kprobes not supported here. Set the probe on schedule instead.
+ * Function graph tracer not supported too.
*/
-struct task_struct *
+__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread;
--- head-2011-03-17.orig/arch/x86/kernel/setup-xen.c 2011-03-03 16:22:12.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/setup-xen.c 2011-03-03 16:22:27.000000000 +0100
@@ -93,11 +93,13 @@
#include <asm/desc.h>
#include <asm/dma.h>
#include <asm/iommu.h>
+#include <asm/gart.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <mach_apic.h>
#include <asm/paravirt.h>
+#include <asm/hypervisor.h>
#include <asm/percpu.h>
#include <asm/topology.h>
@@ -508,6 +510,7 @@ static void __init reserve_early_setup_d
* @size: Size of the crashkernel memory to reserve.
* Returns the base address on success, and -1ULL on failure.
*/
+static
unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
{
const unsigned long long alignment = 16<<20; /* 16M */
@@ -650,165 +653,32 @@ static int __init setup_elfcorehdr(char
early_param("elfcorehdr", setup_elfcorehdr);
#endif
-static struct x86_quirks default_x86_quirks __initdata;
-
-struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
-
-/*
- * Some BIOSes seem to corrupt the low 64k of memory during events
- * like suspend/resume and unplugging an HDMI cable. Reserve all
- * remaining free memory in that area and fill it with a distinct
- * pattern.
- */
-#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
-#define MAX_SCAN_AREAS 8
-
-static int __read_mostly memory_corruption_check = -1;
-
-static unsigned __read_mostly corruption_check_size = 64*1024;
-static unsigned __read_mostly corruption_check_period = 60; /* seconds */
-
-static struct e820entry scan_areas[MAX_SCAN_AREAS];
-static int num_scan_areas;
-
-
-static int set_corruption_check(char *arg)
-{
- char *end;
-
- memory_corruption_check = simple_strtol(arg, &end, 10);
-
- return (*end == 0) ? 0 : -EINVAL;
-}
-early_param("memory_corruption_check", set_corruption_check);
-
-static int set_corruption_check_period(char *arg)
-{
- char *end;
-
- corruption_check_period = simple_strtoul(arg, &end, 10);
-
- return (*end == 0) ? 0 : -EINVAL;
-}
-early_param("memory_corruption_check_period", set_corruption_check_period);
-
-static int set_corruption_check_size(char *arg)
+#ifndef CONFIG_XEN
+static int __init default_update_genapic(void)
{
- char *end;
- unsigned size;
-
- size = memparse(arg, &end);
-
- if (*end == '\0')
- corruption_check_size = size;
+#ifdef CONFIG_X86_SMP
+# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
+ genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
+# endif
+#endif
- return (size == corruption_check_size) ? 0 : -EINVAL;
+ return 0;
}
-early_param("memory_corruption_check_size", set_corruption_check_size);
-
-
-static void __init setup_bios_corruption_check(void)
-{
- u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
-
- if (memory_corruption_check == -1) {
- memory_corruption_check =
-#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
- 1
#else
- 0
+#define default_update_genapic NULL
#endif
- ;
- }
-
- if (corruption_check_size == 0)
- memory_corruption_check = 0;
-
- if (!memory_corruption_check)
- return;
-
- corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
-
- while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
- u64 size;
- addr = find_e820_area_size(addr, &size, PAGE_SIZE);
-
- if (addr == 0)
- break;
-
- if ((addr + size) > corruption_check_size)
- size = corruption_check_size - addr;
- if (size == 0)
- break;
-
- e820_update_range(addr, size, E820_RAM, E820_RESERVED);
- scan_areas[num_scan_areas].addr = addr;
- scan_areas[num_scan_areas].size = size;
- num_scan_areas++;
-
- /* Assume we've already mapped this early memory */
- memset(__va(addr), 0, size);
-
- addr += size;
- }
-
- printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
- num_scan_areas);
- update_e820();
-}
-
-static struct timer_list periodic_check_timer;
-
-void check_for_bios_corruption(void)
-{
- int i;
- int corruption = 0;
-
- if (!memory_corruption_check)
- return;
-
- for(i = 0; i < num_scan_areas; i++) {
- unsigned long *addr = __va(scan_areas[i].addr);
- unsigned long size = scan_areas[i].size;
-
- for(; size; addr++, size -= sizeof(unsigned long)) {
- if (!*addr)
- continue;
- printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
- addr, __pa(addr), *addr);
- corruption = 1;
- *addr = 0;
- }
- }
-
- WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
-}
-
-static void periodic_check_for_corruption(unsigned long data)
-{
- check_for_bios_corruption();
- mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
-}
-
-void start_periodic_check_for_corruption(void)
-{
- if (!memory_corruption_check || corruption_check_period == 0)
- return;
-
- printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
- corruption_check_period);
+static struct x86_quirks default_x86_quirks __initdata = {
+ .update_genapic = default_update_genapic,
+};
- init_timer(&periodic_check_timer);
- periodic_check_timer.function = &periodic_check_for_corruption;
- periodic_check_for_corruption(0);
-}
-#endif
+struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
+#ifdef CONFIG_X86_RESERVE_LOW_64K
static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
{
printk(KERN_NOTICE
- "%s detected: BIOS may corrupt low RAM, working it around.\n",
+ "%s detected: BIOS may corrupt low RAM, working around it.\n",
d->ident);
e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
@@ -816,6 +686,7 @@ static int __init dmi_low_memory_corrupt
return 0;
}
+#endif
/* List of systems that have known low memory corruption BIOS problems */
static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
@@ -1025,15 +896,25 @@ void __init setup_arch(char **cmdline_p)
finish_e820_parsing();
+ if (efi_enabled)
+ efi_init();
+
if (is_initial_xendomain()) {
dmi_scan_machine();
dmi_check_system(bad_bios_dmi_table);
+ }
+
+ /*
+ * VMware detection requires dmi to be available, so this
+ * needs to be done after dmi_scan_machine, for the BP.
+ */
+ init_hypervisor(&boot_cpu_data);
#ifdef CONFIG_X86_32
+ if (is_initial_xendomain())
probe_roms();
#endif
- }
#ifndef CONFIG_XEN
/* after parse_early_param, so could debug it */
@@ -1041,8 +922,6 @@ void __init setup_arch(char **cmdline_p)
insert_resource(&iomem_resource, &data_resource);
insert_resource(&iomem_resource, &bss_resource);
- if (efi_enabled)
- efi_init();
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
@@ -1297,7 +1176,7 @@ void __init setup_arch(char **cmdline_p)
ioapic_init_mappings();
/* need to wait for io_apic is mapped */
- nr_irqs = probe_nr_irqs();
+ probe_nr_irqs_gsi();
kvm_guest_init();
--- head-2011-03-17.orig/arch/x86/kernel/smp-xen.c 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/smp-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -1,7 +1,7 @@
/*
* Intel SMP support routines.
*
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
* (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
* (c) 2002,2003 Andi Kleen, SuSE Labs.
*
@@ -118,30 +118,17 @@ void xen_smp_send_reschedule(int cpu)
WARN_ON(1);
return;
}
- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+ send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
}
void xen_send_call_func_single_ipi(int cpu)
{
- send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_VECTOR);
+ send_IPI_mask(cpumask_of(cpu), CALL_FUNC_SINGLE_VECTOR);
}
-void xen_send_call_func_ipi(cpumask_t mask)
+void xen_send_call_func_ipi(const struct cpumask *mask)
{
- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
-}
-
-static void stop_this_cpu(void *dummy)
-{
- local_irq_disable();
- /*
- * Remove this CPU:
- */
- cpu_clear(smp_processor_id(), cpu_online_map);
- disable_all_local_evtchn();
- if (hlt_works(smp_processor_id()))
- for (;;) halt();
- for (;;);
+ send_IPI_mask_allbutself(mask, CALL_FUNCTION_VECTOR);
}
/*
@@ -165,22 +152,14 @@ void xen_smp_send_stop(void)
*/
irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
{
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_resched_count++;
-#else
- add_pda(irq_resched_count, 1);
-#endif
+ inc_irq_stat(irq_resched_count);
return IRQ_HANDLED;
}
irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
{
generic_smp_call_function_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
return IRQ_HANDLED;
}
@@ -188,11 +167,7 @@ irqreturn_t smp_call_function_interrupt(
irqreturn_t smp_call_function_single_interrupt(int irq, void *dev_id)
{
generic_smp_call_function_single_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
return IRQ_HANDLED;
}
--- head-2011-03-17.orig/arch/x86/kernel/time-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/time-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -455,11 +455,7 @@ irqreturn_t timer_interrupt(int irq, voi
struct vcpu_runstate_info runstate;
/* Keep nmi watchdog up to date */
-#ifdef __i386__
- x86_add_percpu(irq_stat.irq0_irqs, 1);
-#else
- add_pda(irq0_irqs, 1);
-#endif
+ inc_irq_stat(irq0_irqs);
/*
* Here we are in the timer irq handler. We just have irqs locally
@@ -521,7 +517,6 @@ irqreturn_t timer_interrupt(int irq, voi
/*
* Account stolen ticks.
- * HACK: Passing NULL to account_steal_time()
* ensures that the ticks are accounted as stolen.
*/
stolen = runstate.time[RUNSTATE_runnable]
@@ -534,12 +529,11 @@ irqreturn_t timer_interrupt(int irq, voi
do_div(stolen, NS_PER_TICK);
per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
- account_steal_time(NULL, (cputime_t)stolen);
+ account_steal_ticks(stolen);
}
/*
* Account blocked ticks.
- * HACK: Passing idle_task to account_steal_time()
* ensures that the ticks are accounted as idle/wait.
*/
blocked = runstate.time[RUNSTATE_blocked]
@@ -551,18 +545,23 @@ irqreturn_t timer_interrupt(int irq, voi
do_div(blocked, NS_PER_TICK);
per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
- account_steal_time(idle_task(cpu), (cputime_t)blocked);
+ account_idle_ticks(blocked);
}
/* Account user/system ticks. */
if (delta_cpu > 0) {
+ cputime_t ct;
+
do_div(delta_cpu, NS_PER_TICK);
per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
+ ct = jiffies_to_cputime(delta_cpu);
if (user_mode_vm(get_irq_regs()))
- account_user_time(current, (cputime_t)delta_cpu);
- else
+ account_user_time(current, ct, cputime_to_scaled(ct));
+ else if (current != idle_task(cpu))
account_system_time(current, HARDIRQ_OFFSET,
- (cputime_t)delta_cpu);
+ ct, cputime_to_scaled(ct));
+ else
+ account_idle_ticks(delta_cpu);
}
/* Offlined for more than a few seconds? Avoid lockup warnings. */
@@ -791,7 +790,7 @@ static void stop_hz_timer(void)
unsigned long j;
int rc;
- cpu_set(cpu, nohz_cpu_mask);
+ cpumask_set_cpu(cpu, nohz_cpu_mask);
/* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */
/* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */
@@ -807,7 +806,7 @@ static void stop_hz_timer(void)
local_softirq_pending() ||
(j = get_next_timer_interrupt(jiffies),
time_before_eq(j, jiffies))) {
- cpu_clear(cpu, nohz_cpu_mask);
+ cpumask_clear_cpu(cpu, nohz_cpu_mask);
j = jiffies + 1;
}
@@ -838,7 +837,7 @@ static void start_hz_timer(void)
}
#endif
BUG_ON(rc);
- cpu_clear(cpu, nohz_cpu_mask);
+ cpumask_clear_cpu(cpu, nohz_cpu_mask);
}
void xen_safe_halt(void)
@@ -848,14 +847,12 @@ void xen_safe_halt(void)
HYPERVISOR_block();
start_hz_timer();
}
-EXPORT_SYMBOL(xen_safe_halt);
void xen_halt(void)
{
if (irqs_disabled())
VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
}
-EXPORT_SYMBOL(xen_halt);
/* No locking required. Interrupts are disabled on all CPUs. */
void time_resume(void)
--- head-2011-03-17.orig/arch/x86/kernel/traps-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/traps-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -20,7 +20,6 @@
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/string.h>
-#include <linux/unwind.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/kexec.h>
@@ -51,7 +50,6 @@
#include <asm/debugreg.h>
#include <asm/atomic.h>
#include <asm/system.h>
-#include <asm/unwind.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/i387.h>
@@ -65,18 +63,10 @@
#else
#include <asm/processor-flags.h>
#include <asm/arch_hooks.h>
-#include <asm/nmi.h>
-#include <asm/smp.h>
-#include <asm/io.h>
#include <asm/traps.h>
#include "cpu/mcheck/mce.h"
-#ifndef CONFIG_XEN
-DECLARE_BITMAP(used_vectors, NR_VECTORS);
-EXPORT_SYMBOL_GPL(used_vectors);
-#endif
-
asmlinkage int system_call(void);
/* Do we ignore FPU interrupts ? */
@@ -93,6 +83,11 @@ gate_desc idt_table[256]
#endif
#endif
+#ifndef CONFIG_XEN
+DECLARE_BITMAP(used_vectors, NR_VECTORS);
+EXPORT_SYMBOL_GPL(used_vectors);
+#endif
+
static int ignore_nmis;
static inline void conditional_sti(struct pt_regs *regs)
@@ -108,6 +103,12 @@ static inline void preempt_conditional_s
local_irq_enable();
}
+static inline void conditional_cli(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_disable();
+}
+
static inline void preempt_conditional_cli(struct pt_regs *regs)
{
if (regs->flags & X86_EFLAGS_IF)
@@ -298,8 +299,10 @@ dotraplinkage void do_double_fault(struc
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 8;
- /* This is always a kernel trap and never fixable (and thus must
- never return). */
+ /*
+ * This is always a kernel trap and never fixable (and thus must
+ * never return).
+ */
for (;;)
die(str, regs, error_code);
}
@@ -476,11 +479,7 @@ do_nmi(struct pt_regs *regs, long error_
{
nmi_enter();
-#ifdef CONFIG_X86_32
- { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
-#else
- add_pda(__nmi_count, 1);
-#endif
+ inc_irq_stat(__nmi_count);
if (!ignore_nmis)
default_do_nmi(regs);
@@ -519,9 +518,11 @@ dotraplinkage void __kprobes do_int3(str
}
#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
-/* Help handler running on IST stack to switch back to user stack
- for scheduling or signal handling. The actual stack switch is done in
- entry.S */
+/*
+ * Help handler running on IST stack to switch back to user stack
+ * for scheduling or signal handling. The actual stack switch is done in
+ * entry.S
+ */
asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = eregs;
@@ -531,8 +532,10 @@ asmlinkage __kprobes struct pt_regs *syn
/* Exception from user space */
else if (user_mode(eregs))
regs = task_pt_regs(current);
- /* Exception from kernel and interrupts are enabled. Move to
- kernel process stack. */
+ /*
+ * Exception from kernel and interrupts are enabled. Move to
+ * kernel process stack.
+ */
else if (eregs->flags & X86_EFLAGS_IF)
regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
if (eregs != regs)
@@ -624,8 +627,10 @@ clear_dr7:
#ifdef CONFIG_X86_32
debug_vm86:
+ /* reenable preemption: handle_vm86_trap() might sleep */
+ dec_preempt_count();
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
- preempt_conditional_cli(regs);
+ conditional_cli(regs);
return;
#endif
@@ -659,7 +664,7 @@ void math_error(void __user *ip)
{
struct task_struct *task;
siginfo_t info;
- unsigned short cwd, swd;
+ unsigned short cwd, swd, err;
/*
* Save the info for the exception handler and clear the error.
@@ -670,7 +675,6 @@ void math_error(void __user *ip)
task->thread.error_code = 0;
info.si_signo = SIGFPE;
info.si_errno = 0;
- info.si_code = __SI_FAULT;
info.si_addr = ip;
/*
* (~cwd & swd) will mask out exceptions that are not set to unmasked
@@ -684,34 +688,30 @@ void math_error(void __user *ip)
*/
cwd = get_fpu_cwd(task);
swd = get_fpu_swd(task);
- switch (swd & ~cwd & 0x3f) {
- case 0x000: /* No unmasked exception */
-#ifdef CONFIG_X86_32
- return;
-#endif
- default: /* Multiple exceptions */
- break;
- case 0x001: /* Invalid Op */
+
+ err = swd & ~cwd;
+
+ if (err & 0x001) { /* Invalid op */
/*
* swd & 0x240 == 0x040: Stack Underflow
* swd & 0x240 == 0x240: Stack Overflow
* User must clear the SF bit (0x40) if set
*/
info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
+ } else if (err & 0x004) { /* Divide by Zero */
info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
+ } else if (err & 0x008) { /* Overflow */
info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
+ } else if (err & 0x012) { /* Denormal, Underflow */
+ info.si_code = FPE_FLTUND;
+ } else if (err & 0x020) { /* Precision */
info.si_code = FPE_FLTRES;
- break;
+ } else {
+ /*
+ * If we're using IRQ 13, or supposedly even some trap 16
+ * implementations, it's possible we get a spurious trap...
+ */
+ return; /* Spurious trap, no error */
}
force_sig_info(SIGFPE, &info, task);
}
@@ -901,7 +901,7 @@ asmlinkage void math_state_restore(void)
EXPORT_SYMBOL_GPL(math_state_restore);
#ifndef CONFIG_MATH_EMULATION
-asmlinkage void math_emulate(long arg)
+void math_emulate(struct math_emu_info *info)
{
printk(KERN_EMERG
"math-emulation not enabled and no coprocessor found.\n");
@@ -911,16 +911,19 @@ asmlinkage void math_emulate(long arg)
}
#endif /* CONFIG_MATH_EMULATION */
-dotraplinkage void __kprobes
-do_device_not_available(struct pt_regs *regs, long error)
+dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs)
{
#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
if (read_cr0() & X86_CR0_EM) {
- conditional_sti(regs);
- math_emulate(0);
+ struct math_emu_info info = { };
+
+ conditional_sti(&regs);
+
+ info.regs = &regs;
+ math_emulate(&info);
} else {
math_state_restore(); /* interrupts still off */
- conditional_sti(regs);
+ conditional_sti(&regs);
}
#else
math_state_restore();
--- head-2011-03-17.orig/arch/x86/kernel/vsyscall_64-xen.c 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/vsyscall_64-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -17,6 +17,9 @@
* want per guest time just set the kernel.vsyscall64 sysctl to 0.
*/
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -128,7 +131,16 @@ static __always_inline void do_vgettimeo
gettimeofday(tv,NULL);
return;
}
+
+ /*
+ * Surround the RDTSC by barriers, to make sure it's not
+ * speculated to outside the seqlock critical section and
+ * does not cause time warps:
+ */
+ rdtsc_barrier();
now = vread();
+ rdtsc_barrier();
+
base = __vsyscall_gtod_data.clock.cycle_last;
mask = __vsyscall_gtod_data.clock.mask;
mult = __vsyscall_gtod_data.clock.mult;
--- head-2011-03-17.orig/arch/x86/mm/fault-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/fault-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -53,7 +53,7 @@
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
-#ifdef CONFIG_MMIOTRACE_HOOKS
+#ifdef CONFIG_MMIOTRACE
if (unlikely(is_kmmio_active()))
if (kmmio_handler(regs, addr) == 1)
return -1;
@@ -406,7 +406,7 @@ static void show_fault_oops(struct pt_re
if (pte && pte_present(*pte) && !pte_exec(*pte))
printk(KERN_CRIT "kernel tried to execute "
"NX-protected page - exploit attempt? "
- "(uid: %d)\n", current->uid);
+ "(uid: %d)\n", current_uid());
}
#endif
@@ -426,6 +426,7 @@ static noinline void pgtable_bad(unsigne
unsigned long error_code)
{
unsigned long flags = oops_begin();
+ int sig = SIGKILL;
struct task_struct *tsk;
printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
@@ -436,8 +437,8 @@ static noinline void pgtable_bad(unsigne
tsk->thread.trap_no = 14;
tsk->thread.error_code = error_code;
if (__die("Bad pagetable", regs, error_code))
- regs = NULL;
- oops_end(flags, regs, SIGKILL);
+ sig = 0;
+ oops_end(flags, regs, sig);
}
#endif
@@ -546,10 +547,7 @@ static int vmalloc_fault(unsigned long a
happen within a race in page table update. In the later
case just flush. */
- /* On Xen the line below does not always work. Needs investigating! */
- /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
- pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
- pgd += pgd_index(address);
+ pgd = pgd_offset(current->active_mm, address);
pgd_ref = pgd_offset_k(address);
if (pgd_none(*pgd_ref))
return -1;
@@ -606,6 +604,7 @@ void __kprobes do_page_fault(struct pt_r
int fault;
#ifdef CONFIG_X86_64
unsigned long flags;
+ int sig;
#endif
/* Set the "privileged fault" bit to something sane. */
@@ -623,8 +622,6 @@ void __kprobes do_page_fault(struct pt_r
si_code = SEGV_MAPERR;
- if (notify_page_fault(regs))
- return;
if (unlikely(kmmio_fault(regs, address)))
return;
@@ -663,6 +660,9 @@ void __kprobes do_page_fault(struct pt_r
if (spurious_fault(address, error_code))
return;
+ /* kprobes don't want to hook the spurious faults. */
+ if (notify_page_fault(regs))
+ return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock.
@@ -670,6 +670,9 @@ void __kprobes do_page_fault(struct pt_r
goto bad_area_nosemaphore;
}
+ /* kprobes don't want to hook the spurious faults. */
+ if (notify_page_fault(regs))
+ return;
/*
* It's safe to allow irq's after cr2 has been saved and the
@@ -696,7 +699,6 @@ void __kprobes do_page_fault(struct pt_r
if (unlikely(in_atomic() || !mm))
goto bad_area_nosemaphore;
-again:
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
@@ -880,32 +882,22 @@ no_context:
bust_spinlocks(0);
do_exit(SIGKILL);
#else
+ sig = SIGKILL;
if (__die("Oops", regs, error_code))
- regs = NULL;
+ sig = 0;
/* Executive summary in case the body of the oops scrolled away */
printk(KERN_EMERG "CR2: %016lx\n", address);
- oops_end(flags, regs, SIGKILL);
+ oops_end(flags, regs, sig);
#endif
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
out_of_memory:
+ /*
+ * We ran out of memory, call the OOM killer, and return the userspace
+ * (which will retry the fault, or kill us if we got oom-killed).
+ */
up_read(&mm->mmap_sem);
- if (is_global_init(tsk)) {
- yield();
- /*
- * Re-lookup the vma - in theory the vma tree might
- * have changed:
- */
- goto again;
- }
-
- printk("VM: killing process %s\n", tsk->comm);
- if (error_code & PF_USER)
- do_group_exit(SIGKILL);
- goto no_context;
+ pagefault_out_of_memory();
+ return;
do_sigbus:
up_read(&mm->mmap_sem);
--- head-2011-03-17.orig/arch/x86/mm/hypervisor.c 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/hypervisor.c 2011-02-01 14:42:26.000000000 +0100
@@ -78,12 +78,12 @@ static void multicall_failed(const multi
BUG();
}
-int xen_multicall_flush(bool ret_last) {
+static int _xen_multicall_flush(bool ret_last) {
struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
multicall_entry_t *mc = lazy->mc;
unsigned int count = lazy->nr_mc;
- if (!count || !use_lazy_mmu_mode())
+ if (!count)
return 0;
lazy->nr_mc = 0;
@@ -112,6 +112,11 @@ int xen_multicall_flush(bool ret_last) {
return 0;
}
+void xen_multicall_flush(bool force) {
+ if (force || use_lazy_mmu_mode())
+ _xen_multicall_flush(false);
+}
+
int xen_multi_update_va_mapping(unsigned long va, pte_t pte,
unsigned long uvmf)
{
@@ -128,7 +133,7 @@ int xen_multi_update_va_mapping(unsigned
#endif
if (unlikely(lazy->nr_mc == NR_MC))
- xen_multicall_flush(false);
+ _xen_multicall_flush(false);
mc = lazy->mc + lazy->nr_mc++;
mc->op = __HYPERVISOR_update_va_mapping;
@@ -167,7 +172,7 @@ int xen_multi_mmu_update(mmu_update_t *s
merge = lazy->nr_mc && !commit
&& mmu_may_merge(mc - 1, __HYPERVISOR_mmu_update, domid);
if (unlikely(lazy->nr_mc == NR_MC) && !merge) {
- xen_multicall_flush(false);
+ _xen_multicall_flush(false);
mc = lazy->mc;
commit = count > NR_MMU || success_count;
}
@@ -205,7 +210,7 @@ int xen_multi_mmu_update(mmu_update_t *s
break;
}
- return commit ? xen_multicall_flush(true) : 0;
+ return commit ? _xen_multicall_flush(true) : 0;
}
int xen_multi_mmuext_op(struct mmuext_op *src, unsigned int count,
@@ -289,7 +294,7 @@ int xen_multi_mmuext_op(struct mmuext_op
merge = lazy->nr_mc && !commit
&& mmu_may_merge(mc - 1, __HYPERVISOR_mmuext_op, domid);
if (unlikely(lazy->nr_mc == NR_MC) && !merge) {
- xen_multicall_flush(false);
+ _xen_multicall_flush(false);
mc = lazy->mc;
commit = count > NR_MMUEXT || success_count;
}
@@ -336,7 +341,7 @@ int xen_multi_mmuext_op(struct mmuext_op
break;
}
- return commit ? xen_multicall_flush(true) : 0;
+ return commit ? _xen_multicall_flush(true) : 0;
}
void xen_l1_entry_update(pte_t *ptr, pte_t val)
--- head-2011-03-17.orig/arch/x86/mm/init_32-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/init_32-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -71,7 +71,7 @@ static unsigned long __initdata table_to
static int __initdata after_init_bootmem;
-static __init void *alloc_low_page(unsigned long *phys)
+static __init void *alloc_low_page(void)
{
unsigned long pfn = table_end++;
void *adr;
@@ -81,7 +81,6 @@ static __init void *alloc_low_page(unsig
adr = __va(pfn * PAGE_SIZE);
memset(adr, 0, PAGE_SIZE);
- *phys = pfn * PAGE_SIZE;
return adr;
}
@@ -96,17 +95,18 @@ static pmd_t * __init one_md_table_init(
pmd_t *pmd_table;
#ifdef CONFIG_X86_PAE
- unsigned long phys;
if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
if (after_init_bootmem)
pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
else
- pmd_table = (pmd_t *)alloc_low_page(&phys);
+ pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
+
+ return pmd_table;
}
#endif
pud = pud_offset(pgd, 0);
@@ -135,10 +135,8 @@ static pte_t * __init one_page_table_ini
if (!page_table)
page_table =
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
- } else {
- unsigned long phys;
- page_table = (pte_t *)alloc_low_page(&phys);
- }
+ } else
+ page_table = (pte_t *)alloc_low_page();
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
make_lowmem_page_readonly(page_table,
@@ -150,6 +148,51 @@ static pte_t * __init one_page_table_ini
return pte_offset_kernel(pmd, 0);
}
+static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
+ unsigned long vaddr, pte_t *lastpte)
+{
+#ifdef CONFIG_HIGHMEM
+ /*
+ * Something (early fixmap) may already have put a pte
+ * page here, which causes the page table allocation
+ * to become nonlinear. Attempt to fix it, and if it
+ * is still nonlinear then we have to bug.
+ */
+ int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+ int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+
+ if (pmd_idx_kmap_begin != pmd_idx_kmap_end
+ && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
+ && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
+ && ((__pa(pte) >> PAGE_SHIFT) < table_start
+ || (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+ pte_t *newpte;
+ int i;
+
+ BUG_ON(after_init_bootmem);
+ newpte = alloc_low_page();
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ set_pte(newpte + i, pte[i]);
+
+ paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
+ make_lowmem_page_readonly(newpte,
+ XENFEAT_writable_page_tables);
+ set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
+ BUG_ON(newpte != pte_offset_kernel(pmd, 0));
+ __flush_tlb_all();
+
+ paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
+ make_lowmem_page_writable(pte,
+ XENFEAT_writable_page_tables);
+ pte = newpte;
+ }
+ BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
+ && vaddr > fix_to_virt(FIX_KMAP_END)
+ && lastpte && lastpte + PTRS_PER_PTE != pte);
+#endif
+ return pte;
+}
+
/*
* This function initializes a certain range of kernel virtual memory
* with new bootmem page tables, everywhere page tables are missing in
@@ -166,6 +209,7 @@ page_table_range_init(unsigned long star
unsigned long vaddr;
pgd_t *pgd;
pmd_t *pmd;
+ pte_t *pte = NULL;
vaddr = start;
pgd_idx = pgd_index(vaddr);
@@ -177,8 +221,10 @@ page_table_range_init(unsigned long star
pmd = pmd + pmd_index(vaddr);
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
pmd++, pmd_idx++) {
- if (vaddr < hypervisor_virt_start)
- one_page_table_init(pmd);
+ if (vaddr >= hypervisor_virt_start)
+ break;
+ pte = page_table_kmap_check(one_page_table_init(pmd),
+ pmd, vaddr, pte);
vaddr += PMD_SIZE;
}
@@ -361,6 +407,8 @@ int devmem_is_allowed(unsigned long page
{
if (pagenr <= 256)
return 1;
+ if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+ return 0;
if (mfn_to_local_pfn(pagenr) >= max_pfn)
return 1;
return 0;
@@ -476,8 +524,12 @@ static void __init set_highmem_pages_ini
#endif /* !CONFIG_NUMA */
#else
-# define permanent_kmaps_init(pgd_base) do { } while (0)
-# define set_highmem_pages_init() do { } while (0)
+static inline void permanent_kmaps_init(pgd_t *pgd_base)
+{
+}
+static inline void set_highmem_pages_init(void)
+{
+}
#endif /* CONFIG_HIGHMEM */
pgd_t *swapper_pg_dir;
@@ -509,7 +561,6 @@ static void __init early_ioremap_page_ta
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
*/
- early_ioremap_clear();
vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
page_table_range_init(vaddr, end, pgd_base);
@@ -856,10 +907,7 @@ static void __init find_early_table_spac
tables += PAGE_ALIGN(ptes * sizeof(pte_t));
/* for fixmap */
- tables += PAGE_SIZE
- * ((((FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK)
- - (__fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK))
- >> PMD_SHIFT);
+ tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t));
table_start = extend_init_mapping(tables);
@@ -1023,8 +1071,6 @@ void __init mem_init(void)
pci_iommu_alloc();
- start_periodic_check_for_corruption();
-
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
@@ -1099,11 +1145,25 @@ void __init mem_init(void)
(unsigned long)&_text, (unsigned long)&_etext,
((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+ /*
+ * Check boundaries twice: Some fundamental inconsistencies can
+ * be detected at build time already.
+ */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#ifdef CONFIG_HIGHMEM
+ BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+ BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
+#endif
+#define high_memory (-128UL << 20)
+ BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
+#undef high_memory
+#undef __FIXADDR_TOP
+
#ifdef CONFIG_HIGHMEM
BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
- BUG_ON(VMALLOC_START > VMALLOC_END);
+ BUG_ON(VMALLOC_START >= VMALLOC_END);
BUG_ON((unsigned long)high_memory > VMALLOC_START);
if (boot_cpu_data.wp_works_ok < 0)
@@ -1123,7 +1183,7 @@ int arch_add_memory(int nid, u64 start,
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- return __add_pages(zone, start_pfn, nr_pages);
+ return __add_pages(nid, zone, start_pfn, nr_pages);
}
#endif
--- head-2011-03-17.orig/arch/x86/mm/init_64-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/init_64-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -841,7 +841,7 @@ static void __init init_gbpages(void)
#endif
}
-static unsigned long __init kernel_physical_mapping_init(unsigned long start,
+static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
unsigned long end,
unsigned long page_size_mask)
{
@@ -966,6 +966,8 @@ unsigned long __init_refok init_memory_m
pos = start_pfn << PAGE_SHIFT;
end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
<< (PMD_SHIFT - PAGE_SHIFT);
+ if (end_pfn > (end >> PAGE_SHIFT))
+ end_pfn = end >> PAGE_SHIFT;
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
pos = end_pfn << PAGE_SHIFT;
@@ -1146,7 +1148,7 @@ int arch_add_memory(int nid, u64 start,
if (last_mapped_pfn > max_pfn_mapped)
max_pfn_mapped = last_mapped_pfn;
- ret = __add_pages(zone, start_pfn, nr_pages);
+ ret = __add_pages(nid, zone, start_pfn, nr_pages);
WARN_ON_ONCE(ret);
return ret;
@@ -1177,6 +1179,8 @@ int devmem_is_allowed(unsigned long page
{
if (pagenr <= 256)
return 1;
+ if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+ return 0;
if (mfn_to_local_pfn(pagenr) >= max_pfn)
return 1;
return 0;
@@ -1192,8 +1196,6 @@ void __init mem_init(void)
unsigned long absent_pages;
unsigned long pfn;
- start_periodic_check_for_corruption();
-
pci_iommu_alloc();
/* clear_bss() already clear the empty_zero_page */
--- head-2011-03-17.orig/arch/x86/mm/iomap_32-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/iomap_32-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -17,9 +17,21 @@
*/
#include <asm/iomap.h>
+#include <asm/pat.h>
#include <linux/bitops.h>
#include <linux/module.h>
+int is_io_mapping_possible(resource_size_t base, unsigned long size)
+{
+#ifndef CONFIG_X86_PAE
+ /* There is no way to map greater than 1 << 32 address without PAE */
+ if (base + size > 0x100000000ULL)
+ return 0;
+#endif
+ return 1;
+}
+EXPORT_SYMBOL_GPL(is_io_mapping_possible);
+
/* Map 'mfn' using fixed map 'type' and protections 'prot'
*/
void *
@@ -30,6 +42,15 @@ iomap_atomic_prot_pfn(unsigned long mfn,
pagefault_disable();
+ /*
+ * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
+ * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
+ * MTRR is UC or WC. UC_MINUS gets the real intention, of the
+ * user, which is "WC if the MTRR is WC, UC if you can't do that."
+ */
+ if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
+ prot = PAGE_KERNEL_UC_MINUS;
+
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
pgprot_val(prot) |= _PAGE_IOMAP;
--- head-2011-03-17.orig/arch/x86/mm/ioremap-xen.c 2011-02-07 15:41:07.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/ioremap-xen.c 2011-02-07 15:41:20.000000000 +0100
@@ -274,25 +274,6 @@ int page_is_ram(unsigned long pagenr)
return 0;
}
-int pagerange_is_ram(unsigned long start, unsigned long end)
-{
- int ram_page = 0, not_rampage = 0;
- unsigned long page_nr;
-
- for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
- ++page_nr) {
- if (page_is_ram(mfn_to_local_pfn(page_nr)))
- ram_page = 1;
- else
- not_rampage = 1;
-
- if (ram_page == not_rampage)
- return -1;
- }
-
- return ram_page;
-}
-
/*
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
@@ -383,7 +364,8 @@ static void __iomem *__ioremap_caller(re
* Check if the request spans more than any BAR in the iomem resource
* tree.
*/
- WARN_ON(iomem_map_sanity_check(phys_addr, size));
+ WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
+ KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
/*
* Don't allow anybody to remap normal RAM that we're using..
@@ -727,38 +709,10 @@ void __init early_ioremap_init(void)
}
}
-#ifdef CONFIG_X86_32
-void __init early_ioremap_clear(void)
-{
- pmd_t *pmd;
-
- if (early_ioremap_debug)
- printk(KERN_INFO "early_ioremap_clear()\n");
-
- pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
- pmd_clear(pmd);
- make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables);
- /* paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); */
- __flush_tlb_all();
-}
-
void __init early_ioremap_reset(void)
{
- enum fixed_addresses idx;
- unsigned long addr, phys;
- pte_t *pte;
-
after_paging_init = 1;
- for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
- addr = fix_to_virt(idx);
- pte = early_ioremap_pte(addr);
- if (pte_present(*pte)) {
- phys = __pte_val(*pte) & PAGE_MASK;
- set_fixmap(idx, phys);
- }
- }
}
-#endif /* CONFIG_X86_32 */
static void __init __early_set_fixmap(enum fixed_addresses idx,
unsigned long phys, pgprot_t flags)
--- head-2011-03-17.orig/arch/x86/mm/pageattr-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/pageattr-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -524,22 +524,28 @@ static int split_large_page(pte_t *kpte,
set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot));
/*
- * Install the new, split up pagetable. Important details here:
+ * Install the new, split up pagetable.
*
- * On Intel the NX bit of all levels must be cleared to make a
- * page executable. See section 4.13.2 of Intel 64 and IA-32
- * Architectures Software Developer's Manual).
- *
- * Mark the entry present. The current mapping might be
- * set to not present, which we preserved above.
+ * We use the standard kernel pagetable protections for the new
+ * pagetable protections, the actual ptes set above control the
+ * primary protection behavior:
*/
if (!xen_feature(XENFEAT_writable_page_tables) &&
HYPERVISOR_update_va_mapping((unsigned long)pbase,
mk_pte(base, PAGE_KERNEL_RO), 0))
BUG();
- ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
- pgprot_val(ref_prot) |= _PAGE_PRESENT;
- __set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot));
+ __set_pmd_pte(kpte, address, level, mk_pte(base, __pgprot(_KERNPG_TABLE)));
+
+ /*
+ * Intel Atom errata AAH41 workaround.
+ *
+ * The real fix should be in hw or in a microcode update, but
+ * we also probabilistically try to reduce the window of having
+ * a large TLB mixed with 4K TLBs while instruction fetches are
+ * going on.
+ */
+ __flush_tlb_all();
+
base = NULL;
out_unlock:
@@ -554,6 +560,36 @@ out_unlock:
return 0;
}
+static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
+ int primary)
+{
+ /*
+ * Ignore all non primary paths.
+ */
+ if (!primary)
+ return 0;
+
+ /*
+ * Ignore the NULL PTE for kernel identity mapping, as it is expected
+ * to have holes.
+ * Also set numpages to '1' indicating that we processed cpa req for
+ * one virtual address page and its pfn. TBD: numpages can be set based
+ * on the initial value and the level returned by lookup_address().
+ */
+ if (within(vaddr, PAGE_OFFSET,
+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+ cpa->numpages = 1;
+ cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
+ return 0;
+ } else {
+ WARN(1, KERN_WARNING "CPA: called for zero pte. "
+ "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
+ *cpa->vaddr);
+
+ return -EFAULT;
+ }
+}
+
static int __change_page_attr(struct cpa_data *cpa, int primary)
{
unsigned long address;
@@ -565,21 +601,14 @@ static int __change_page_attr(struct cpa
address = cpa->vaddr[cpa->curpage];
else
address = *cpa->vaddr;
-
repeat:
kpte = lookup_address(address, &level);
if (!kpte)
- return 0;
+ return __cpa_process_fault(cpa, address, primary);
old_pte = *kpte;
- if (!__pte_val(old_pte)) {
- if (!primary)
- return 0;
- WARN(1, KERN_WARNING "CPA: called for zero pte. "
- "vaddr = %lx cpa->vaddr = %lx\n", address,
- *cpa->vaddr);
- return -EINVAL;
- }
+ if (!__pte_val(old_pte))
+ return __cpa_process_fault(cpa, address, primary);
if (level == PG_LEVEL_4K) {
pte_t new_pte;
@@ -678,12 +707,7 @@ static int cpa_process_alias(struct cpa_
vaddr = *cpa->vaddr;
if (!(within(vaddr, PAGE_OFFSET,
- PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
-#ifdef CONFIG_X86_64
- || within(vaddr, PAGE_OFFSET + (1UL<<32),
- PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
-#endif
- )) {
+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
alias_cpa = *cpa;
temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
@@ -814,6 +838,15 @@ static int change_page_attr_set_clr(unsi
vm_unmap_aliases();
+ /*
+ * If we're called with lazy mmu updates enabled, the
+ * in-memory pte state may be stale. Flush pending updates to
+ * bring them up to date.
+ *
+ arch_flush_lazy_mmu_mode();*/
+ if (arch_use_lazy_mmu_mode())
+ xen_multicall_flush(true);
+
cpa.vaddr = addr;
cpa.numpages = numpages;
cpa.mask_set = mask_set;
@@ -856,6 +889,14 @@ static int change_page_attr_set_clr(unsi
} else
cpa_flush_all(cache);
+ /*
+ * If we've been called with lazy mmu updates enabled, then
+ * make sure that everything gets flushed out before we
+ * return.
+ *
+ arch_flush_lazy_mmu_mode();*/
+ WARN_ON_ONCE(arch_use_lazy_mmu_mode() && !irq_count());
+
out:
return ret;
}
--- head-2011-03-17.orig/arch/x86/mm/pat-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/pat-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -11,6 +11,7 @@
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/fs.h>
@@ -167,11 +168,12 @@ struct memtype {
static LIST_HEAD(memtype_list);
static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
+static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end);
static inline u8 _mtrr_type_lookup(u64 start, u64 end)
{
if (is_initial_xendomain())
return mtrr_type_lookup(start, end);
- return pagerange_is_ram(start, end) > 0
+ return pat_pagerange_is_ram(start, end) > 0
? MTRR_TYPE_WRCOMB : MTRR_TYPE_UNCACHABLE;
}
#define mtrr_type_lookup _mtrr_type_lookup
@@ -232,6 +234,33 @@ chk_conflict(struct memtype *new, struct
static struct memtype *cached_entry;
static u64 cached_start;
+static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
+{
+ int ram_page = 0, not_rampage = 0;
+ unsigned long page_nr;
+
+ for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
+ ++page_nr) {
+ /*
+ * For legacy reasons, physical address range in the legacy ISA
+ * region is tracked as non-RAM. This will allow users of
+ * /dev/mem to map portions of legacy ISA region, even when
+ * some of those portions are listed(or not even listed) with
+ * different e820 types(RAM/reserved/..)
+ */
+ if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
+ page_is_ram(mfn_to_local_pfn(page_nr)))
+ ram_page = 1;
+ else
+ not_rampage = 1;
+
+ if (ram_page == not_rampage)
+ return -1;
+ }
+
+ return ram_page;
+}
+
/*
* For RAM pages, mark the pages as non WB memory type using
* PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
@@ -360,9 +389,13 @@ int reserve_memtype(u64 start, u64 end,
req_type & _PAGE_CACHE_MASK);
}
- is_range_ram = pagerange_is_ram(start, end);
+ if (new_type)
+ *new_type = actual_type;
+
+ is_range_ram = pat_pagerange_is_ram(start, end);
if (is_range_ram == 1)
- return reserve_ram_pages_type(start, end, req_type, new_type);
+ return reserve_ram_pages_type(start, end, req_type,
+ new_type);
else if (is_range_ram < 0)
return -EINVAL;
@@ -374,9 +407,6 @@ int reserve_memtype(u64 start, u64 end,
new->end = end;
new->type = actual_type;
- if (new_type)
- *new_type = actual_type;
-
spin_lock(&memtype_lock);
if (cached_entry && start >= cached_start)
@@ -464,7 +494,7 @@ int free_memtype(u64 start, u64 end)
if (is_ISA_range(start, end - 1))
return 0;
- is_range_ram = pagerange_is_ram(start, end);
+ is_range_ram = pat_pagerange_is_ram(start, end);
if (is_range_ram == 1)
return free_ram_pages_type(start, end);
else if (is_range_ram < 0)
@@ -623,6 +653,254 @@ void unmap_devmem(unsigned long mfn, uns
free_memtype(addr, addr + size);
}
+#ifndef CONFIG_XEN
+/*
+ * Internal interface to reserve a range of physical memory with prot.
+ * Reserved non RAM regions only and after successful reserve_memtype,
+ * this func also keeps identity mapping (if any) in sync with this new prot.
+ */
+static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
+ int strict_prot)
+{
+ int is_ram = 0;
+ int id_sz, ret;
+ unsigned long flags;
+ unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
+
+ is_ram = pat_pagerange_is_ram(paddr, paddr + size);
+
+ /*
+ * reserve_pfn_range() doesn't support RAM pages.
+ */
+ if (is_ram != 0)
+ return -EINVAL;
+
+ ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
+ if (ret)
+ return ret;
+
+ if (flags != want_flags) {
+ if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) {
+ free_memtype(paddr, paddr + size);
+ printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
+ " for %Lx-%Lx, got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_flags),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size),
+ cattr_name(flags));
+ return -EINVAL;
+ }
+ /*
+ * We allow returning different type than the one requested in
+ * non strict case.
+ */
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) &
+ (~_PAGE_CACHE_MASK)) |
+ flags);
+ }
+
+ /* Need to keep identity mapping in sync */
+ if (paddr >= __pa(high_memory))
+ return 0;
+
+ id_sz = (__pa(high_memory) < paddr + size) ?
+ __pa(high_memory) - paddr :
+ size;
+
+ if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) {
+ free_memtype(paddr, paddr + size);
+ printk(KERN_ERR
+ "%s:%d reserve_pfn_range ioremap_change_attr failed %s "
+ "for %Lx-%Lx\n",
+ current->comm, current->pid,
+ cattr_name(flags),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size));
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Internal interface to free a range of physical memory.
+ * Frees non RAM regions only.
+ */
+static void free_pfn_range(u64 paddr, unsigned long size)
+{
+ int is_ram;
+
+ is_ram = pat_pagerange_is_ram(paddr, paddr + size);
+ if (is_ram == 0)
+ free_memtype(paddr, paddr + size);
+}
+
+/*
+ * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
+ * copied through copy_page_range().
+ *
+ * If the vma has a linear pfn mapping for the entire range, we get the prot
+ * from pte and reserve the entire vma range with single reserve_pfn_range call.
+ * Otherwise, we reserve the entire vma range, my ging through the PTEs page
+ * by page to get physical address and protection.
+ */
+int track_pfn_vma_copy(struct vm_area_struct *vma)
+{
+ int retval = 0;
+ unsigned long i, j;
+ resource_size_t paddr;
+ unsigned long prot;
+ unsigned long vma_start = vma->vm_start;
+ unsigned long vma_end = vma->vm_end;
+ unsigned long vma_size = vma_end - vma_start;
+ pgprot_t pgprot;
+
+ if (!pat_enabled)
+ return 0;
+
+ if (is_linear_pfn_mapping(vma)) {
+ /*
+ * reserve the whole chunk covered by vma. We need the
+ * starting address and protection from pte.
+ */
+ if (follow_phys(vma, vma_start, 0, &prot, &paddr)) {
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ pgprot = __pgprot(prot);
+ return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+ }
+
+ /* reserve entire vma page by page, using pfn and prot from pte */
+ for (i = 0; i < vma_size; i += PAGE_SIZE) {
+ if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
+ continue;
+
+ pgprot = __pgprot(prot);
+ retval = reserve_pfn_range(paddr, PAGE_SIZE, &pgprot, 1);
+ if (retval)
+ goto cleanup_ret;
+ }
+ return 0;
+
+cleanup_ret:
+ /* Reserve error: Cleanup partial reservation and return error */
+ for (j = 0; j < i; j += PAGE_SIZE) {
+ if (follow_phys(vma, vma_start + j, 0, &prot, &paddr))
+ continue;
+
+ free_pfn_range(paddr, PAGE_SIZE);
+ }
+
+ return retval;
+}
+
+/*
+ * track_pfn_vma_new is called when a _new_ pfn mapping is being established
+ * for physical range indicated by pfn and size.
+ *
+ * prot is passed in as a parameter for the new mapping. If the vma has a
+ * linear pfn mapping for the entire range reserve the entire vma range with
+ * single reserve_pfn_range call.
+ * Otherwise, we look t the pfn and size and reserve only the specified range
+ * page by page.
+ *
+ * Note that this function can be called with caller trying to map only a
+ * subrange/page inside the vma.
+ */
+int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
+ unsigned long pfn, unsigned long size)
+{
+ int retval = 0;
+ unsigned long i, j;
+ resource_size_t base_paddr;
+ resource_size_t paddr;
+ unsigned long vma_start = vma->vm_start;
+ unsigned long vma_end = vma->vm_end;
+ unsigned long vma_size = vma_end - vma_start;
+
+ if (!pat_enabled)
+ return 0;
+
+ if (is_linear_pfn_mapping(vma)) {
+ /* reserve the whole chunk starting from vm_pgoff */
+ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
+ return reserve_pfn_range(paddr, vma_size, prot, 0);
+ }
+
+ /* reserve page by page using pfn and size */
+ base_paddr = (resource_size_t)pfn << PAGE_SHIFT;
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ paddr = base_paddr + i;
+ retval = reserve_pfn_range(paddr, PAGE_SIZE, prot, 0);
+ if (retval)
+ goto cleanup_ret;
+ }
+ return 0;
+
+cleanup_ret:
+ /* Reserve error: Cleanup partial reservation and return error */
+ for (j = 0; j < i; j += PAGE_SIZE) {
+ paddr = base_paddr + j;
+ free_pfn_range(paddr, PAGE_SIZE);
+ }
+
+ return retval;
+}
+
+/*
+ * untrack_pfn_vma is called while unmapping a pfnmap for a region.
+ * untrack can be called for a specific region indicated by pfn and size or
+ * can be for the entire vma (in which case size can be zero).
+ */
+void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
+ unsigned long size)
+{
+ unsigned long i;
+ resource_size_t paddr;
+ unsigned long prot;
+ unsigned long vma_start = vma->vm_start;
+ unsigned long vma_end = vma->vm_end;
+ unsigned long vma_size = vma_end - vma_start;
+
+ if (!pat_enabled)
+ return;
+
+ if (is_linear_pfn_mapping(vma)) {
+ /* free the whole chunk starting from vm_pgoff */
+ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
+ free_pfn_range(paddr, vma_size);
+ return;
+ }
+
+ if (size != 0 && size != vma_size) {
+ /* free page by page, using pfn and size */
+ paddr = (resource_size_t)pfn << PAGE_SHIFT;
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ paddr = paddr + i;
+ free_pfn_range(paddr, PAGE_SIZE);
+ }
+ } else {
+ /* free entire vma, page by page, using the pfn from pte */
+ for (i = 0; i < vma_size; i += PAGE_SIZE) {
+ if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
+ continue;
+
+ free_pfn_range(paddr, PAGE_SIZE);
+ }
+ }
+}
+#endif /* CONFIG_XEN */
+
+pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+ if (pat_enabled)
+ return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
+ else
+ return pgprot_noncached(prot);
+}
+EXPORT_SYMBOL_GPL(pgprot_writecombine);
+
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
/* get Nth element of the linked list */
--- head-2011-03-17.orig/arch/x86/pci/irq-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/arch/x86/pci/irq-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -16,8 +16,7 @@
#include <asm/io_apic.h>
#include <linux/irq.h>
#include <linux/acpi.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
#define PIRQ_VERSION 0x0100
@@ -540,7 +539,7 @@ static int pirq_bios_set(struct pci_dev
{
struct pci_dev *bridge;
int pin = pci_get_interrupt_pin(dev, &bridge);
- return pcibios_set_irq_routing(bridge, pin, irq);
+ return pcibios_set_irq_routing(bridge, pin - 1, irq);
}
#endif
@@ -579,6 +578,7 @@ static __init int intel_router_probe(str
case PCI_DEVICE_ID_INTEL_ICH7_1:
case PCI_DEVICE_ID_INTEL_ICH7_30:
case PCI_DEVICE_ID_INTEL_ICH7_31:
+ case PCI_DEVICE_ID_INTEL_TGP_LPC:
case PCI_DEVICE_ID_INTEL_ESB2_0:
case PCI_DEVICE_ID_INTEL_ICH8_0:
case PCI_DEVICE_ID_INTEL_ICH8_1:
@@ -894,7 +894,6 @@ static int pcibios_lookup_irq(struct pci
dev_dbg(&dev->dev, "no interrupt pin\n");
return 0;
}
- pin = pin - 1;
/* Find IRQ routing entry */
@@ -904,17 +903,17 @@ static int pcibios_lookup_irq(struct pci
info = pirq_get_info(dev);
if (!info) {
dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
- 'A' + pin);
+ 'A' + pin - 1);
return 0;
}
- pirq = info->irq[pin].link;
- mask = info->irq[pin].bitmap;
+ pirq = info->irq[pin - 1].link;
+ mask = info->irq[pin - 1].bitmap;
if (!pirq) {
- dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
+ dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin - 1);
return 0;
}
dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
- 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
+ 'A' + pin - 1, pirq, mask, pirq_table->exclusive_irqs);
mask &= pcibios_irq_mask;
/* Work around broken HP Pavilion Notebooks which assign USB to
@@ -956,7 +955,7 @@ static int pcibios_lookup_irq(struct pci
newirq = i;
}
}
- dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
+ dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin - 1, newirq);
/* Check if it is hardcoded */
if ((pirq & 0xf0) == 0xf0) {
@@ -984,18 +983,18 @@ static int pcibios_lookup_irq(struct pci
return 0;
}
}
- dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
+ dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq);
/* Update IRQ for all devices with the same pirq value */
while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
if (!pin)
continue;
- pin--;
+
info = pirq_get_info(dev2);
if (!info)
continue;
- if (info->irq[pin].link == pirq) {
+ if (info->irq[pin - 1].link == pirq) {
/*
* We refuse to override the dev->irq
* information. Give a warning!
@@ -1049,6 +1048,9 @@ static void __init pcibios_fixup_irqs(vo
dev = NULL;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+ if (!pin)
+ continue;
+
#ifdef CONFIG_X86_IO_APIC
/*
* Recalculate IRQ numbers if we use the I/O APIC.
@@ -1056,15 +1058,11 @@ static void __init pcibios_fixup_irqs(vo
if (io_apic_assign_pci_irqs) {
int irq;
- if (!pin)
- continue;
-
/*
* interrupt pins are numbered starting from 1
*/
- pin--;
irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
- PCI_SLOT(dev->devfn), pin);
+ PCI_SLOT(dev->devfn), pin - 1);
/*
* Busses behind bridges are typically not listed in the
* MP-table. In this case we have to look up the IRQ
@@ -1077,22 +1075,22 @@ static void __init pcibios_fixup_irqs(vo
struct pci_dev *bridge = dev->bus->self;
int bus;
- pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+ pin = pci_swizzle_interrupt_pin(dev, pin);
bus = bridge->bus->number;
irq = IO_APIC_get_PCI_irq_vector(bus,
- PCI_SLOT(bridge->devfn), pin);
+ PCI_SLOT(bridge->devfn), pin - 1);
if (irq >= 0)
dev_warn(&dev->dev,
"using bridge %s INT %c to "
"get IRQ %d\n",
pci_name(bridge),
- 'A' + pin, irq);
+ 'A' + pin - 1, irq);
}
if (irq >= 0) {
dev_info(&dev->dev,
"PCI->APIC IRQ transform: INT %c "
"-> IRQ %d\n",
- 'A' + pin, irq);
+ 'A' + pin - 1, irq);
dev->irq = irq;
}
}
@@ -1100,7 +1098,7 @@ static void __init pcibios_fixup_irqs(vo
/*
* Still no IRQ? Try to lookup one...
*/
- if (pin && !dev->irq)
+ if (!dev->irq)
pcibios_lookup_irq(dev, 0);
}
}
@@ -1227,12 +1225,10 @@ static int pirq_enable_irq(struct pci_de
if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
char *msg = "";
- pin--; /* interrupt pins are numbered starting from 1 */
-
if (io_apic_assign_pci_irqs) {
int irq;
- irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1);
/*
* Busses behind bridges are typically not listed in the MP-table.
* In this case we have to look up the IRQ based on the parent bus,
@@ -1243,20 +1239,20 @@ static int pirq_enable_irq(struct pci_de
while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
struct pci_dev *bridge = dev->bus->self;
- pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+ pin = pci_swizzle_interrupt_pin(dev, pin);
irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
- PCI_SLOT(bridge->devfn), pin);
+ PCI_SLOT(bridge->devfn), pin - 1);
if (irq >= 0)
dev_warn(&dev->dev, "using bridge %s "
"INT %c to get IRQ %d\n",
- pci_name(bridge), 'A' + pin,
+ pci_name(bridge), 'A' + pin - 1,
irq);
dev = bridge;
}
dev = temp_dev;
if (irq >= 0) {
dev_info(&dev->dev, "PCI->APIC IRQ transform: "
- "INT %c -> IRQ %d\n", 'A' + pin, irq);
+ "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
dev->irq = irq;
return 0;
} else
@@ -1275,7 +1271,7 @@ static int pirq_enable_irq(struct pci_de
return 0;
dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
- 'A' + pin, msg);
+ 'A' + pin - 1, msg);
}
return 0;
}
--- head-2011-03-17.orig/arch/x86/pci/pcifront.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-17/arch/x86/pci/pcifront.c 2011-02-01 14:42:26.000000000 +0100
@@ -8,8 +8,8 @@
#include <linux/init.h>
#include <linux/pci.h>
#include <asm/acpi.h>
+#include <asm/pci_x86.h>
#include <xen/evtchn.h>
-#include "pci.h"
static int pcifront_enable_irq(struct pci_dev *dev)
{
--- head-2011-03-17.orig/arch/x86/vdso/vdso32-setup-xen.c 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/arch/x86/vdso/vdso32-setup-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -349,7 +349,7 @@ int __init sysenter_setup(void)
}
/* Setup a VMA at program startup for the vsyscall page */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
struct mm_struct *mm = current->mm;
unsigned long addr;
--- head-2011-03-17.orig/drivers/acpi/Kconfig 2011-01-31 14:42:03.000000000 +0100
+++ head-2011-03-17/drivers/acpi/Kconfig 2011-02-01 14:42:26.000000000 +0100
@@ -196,7 +196,7 @@ config ACPI_DOCK
config ACPI_PROCESSOR
tristate "Processor"
select THERMAL
- select CPU_IDLE
+ select CPU_IDLE if !PROCESSOR_EXTERNAL_CONTROL
default y
help
This driver installs ACPI as the idle handler for Linux and uses
--- head-2011-03-17.orig/drivers/acpi/processor_core.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/drivers/acpi/processor_core.c 2011-02-01 14:42:26.000000000 +0100
@@ -192,7 +192,7 @@ int acpi_get_cpuid(acpi_handle handle, i
* stub enforcing a 1:1 mapping, we keep it undefined to catch bad
* uses. Return as if there was a 1:1 mapping.
*/
- if (apic_id < NR_CPUS && cpu_possible(apic_id))
+ if (apic_id < nr_cpu_ids && cpu_possible(apic_id))
return apic_id;
#endif
return -1;
--- head-2011-03-17.orig/drivers/acpi/processor_idle.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-17/drivers/acpi/processor_idle.c 2011-02-01 14:42:26.000000000 +0100
@@ -125,6 +125,7 @@ static struct dmi_system_id __cpuinitdat
};
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
/*
* Callers should disable interrupts before the call and enable
* interrupts after return.
@@ -143,6 +144,7 @@ static void acpi_safe_halt(void)
}
current_thread_info()->status |= TS_POLLING;
}
+#endif
#ifdef ARCH_APICTIMER_STOPS_ON_C3
@@ -213,7 +215,7 @@ static void lapic_timer_state_broadcast(
static void lapic_timer_check_state(int state, struct acpi_processor *pr,
struct acpi_processor_cx *cstate) { }
static void lapic_timer_propagate_broadcast(struct acpi_processor *pr) { }
-static void lapic_timer_state_broadcast(struct acpi_processor *pr,
+static inline void lapic_timer_state_broadcast(struct acpi_processor *pr,
struct acpi_processor_cx *cx,
int broadcast)
{
@@ -261,7 +263,7 @@ int acpi_processor_resume(struct acpi_de
return 0;
}
-#if defined(CONFIG_X86)
+#if defined(CONFIG_X86) && !defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
static void tsc_check_state(int state)
{
switch (boot_cpu_data.x86_vendor) {
@@ -621,7 +623,9 @@ static int acpi_processor_power_verify(s
unsigned int i;
unsigned int working = 0;
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
pr->power.timer_broadcast_on_state = INT_MAX;
+#endif
for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) {
struct acpi_processor_cx *cx = &pr->power.states[i];
@@ -693,6 +697,7 @@ static int acpi_processor_get_power_info
return 0;
}
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
/**
* acpi_idle_bm_check - checks if bus master activity was detected
*/
@@ -1064,6 +1069,13 @@ static int acpi_processor_setup_cpuidle(
return 0;
}
+#else /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */
+static inline int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
+{
+ return 0;
+}
+#endif /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */
+
int acpi_processor_cst_has_changed(struct acpi_processor *pr)
{
int ret = 0;
--- head-2011-03-17.orig/drivers/gpu/drm/i915/i915_drv.c 2011-03-17 14:35:45.000000000 +0100
+++ head-2011-03-17/drivers/gpu/drm/i915/i915_drv.c 2011-03-17 14:13:15.000000000 +0100
@@ -722,7 +722,7 @@ static struct drm_driver driver = {
.open = drm_open,
.release = drm_release,
.unlocked_ioctl = drm_ioctl,
- .mmap = drm_gem_mmap,
+ .mmap = i915_gem_mmap,
.poll = drm_poll,
.fasync = drm_fasync,
.read = drm_read,
--- head-2011-03-17.orig/drivers/gpu/drm/i915/i915_drv.h 2011-03-17 14:35:45.000000000 +0100
+++ head-2011-03-17/drivers/gpu/drm/i915/i915_drv.h 2011-03-17 14:13:13.000000000 +0100
@@ -1162,6 +1162,11 @@ int __must_check i915_do_wait_request(st
uint32_t seqno,
bool interruptible,
struct intel_ring_buffer *ring);
+#ifdef CONFIG_XEN
+int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma);
+#else
+#define i915_gem_mmap drm_gem_mmap
+#endif
int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
int __must_check
i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
--- head-2011-03-17.orig/drivers/gpu/drm/i915/i915_gem.c 2011-03-17 14:35:45.000000000 +0100
+++ head-2011-03-17/drivers/gpu/drm/i915/i915_gem.c 2011-02-08 10:05:05.000000000 +0100
@@ -1152,6 +1152,17 @@ i915_gem_mmap_ioctl(struct drm_device *d
return 0;
}
+#ifdef CONFIG_XEN
+int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ int ret = drm_gem_mmap(filp, vma);
+
+ pgprot_val(vma->vm_page_prot) |= _PAGE_IOMAP;
+
+ return ret;
+}
+#endif
+
/**
* i915_gem_fault - fault a page into the GTT
* vma: VMA in question
--- head-2011-03-17.orig/drivers/oprofile/buffer_sync.c 2011-01-31 17:01:49.000000000 +0100
+++ head-2011-03-17/drivers/oprofile/buffer_sync.c 2011-02-01 14:42:26.000000000 +0100
@@ -538,7 +538,6 @@ void sync_buffer(int cpu)
int cpu_mode = CPU_MODE_KERNEL;
sync_buffer_state state = sb_buffer_start;
unsigned int i;
- int domain_switch = 0;
unsigned long available;
unsigned long flags;
struct op_entry entry;
@@ -563,15 +562,6 @@ void sync_buffer(int cpu)
if (!sample)
break;
-#ifdef CONFIG_XEN
- if (domain_switch) {
- cpu_current_domain[cpu] = sample->eip;
- add_domain_switch(sample->eip);
- domain_switch = 0;
- continue;
- }
-#endif
-
if (is_code(sample->eip)) {
flags = sample->event;
if (flags & TRACE_BEGIN) {
@@ -597,8 +587,11 @@ void sync_buffer(int cpu)
add_user_ctx_switch(new, cookie);
}
#ifdef CONFIG_XEN
- if (flags & DOMAIN_SWITCH)
- domain_switch = 1;
+ if ((flags & DOMAIN_SWITCH)
+ && op_cpu_buffer_get_data(&entry, &val)) {
+ cpu_current_domain[cpu] = val;
+ add_domain_switch(val);
+ }
#endif
if (op_cpu_buffer_get_size(&entry))
add_data(&entry, mm);
--- head-2011-03-17.orig/drivers/oprofile/cpu_buffer.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-17/drivers/oprofile/cpu_buffer.c 2011-02-01 14:42:26.000000000 +0100
@@ -417,34 +417,15 @@ void oprofile_add_pc(unsigned long pc, i
#ifdef CONFIG_XEN
/*
- * This is basically log_sample(b, ESCAPE_CODE, cpu_mode, CPU_TRACE_BEGIN),
+ * This is basically log_sample(b, ESCAPE_CODE, 1, cpu_mode, CPU_TRACE_BEGIN),
* as was previously accessible through oprofile_add_pc().
*/
void oprofile_add_mode(int cpu_mode)
{
struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
- struct task_struct *task;
- if (nr_available_slots(cpu_buf) < 3) {
+ if (op_add_code(cpu_buf, 1, cpu_mode, current))
cpu_buf->sample_lost_overflow++;
- return;
- }
-
- task = current;
-
- /* notice a switch from user->kernel or vice versa */
- if (cpu_buf->last_cpu_mode != cpu_mode) {
- cpu_buf->last_cpu_mode = cpu_mode;
- add_code(cpu_buf, cpu_mode);
- }
-
- /* notice a task switch */
- if (cpu_buf->last_task != task) {
- cpu_buf->last_task = task;
- add_code(cpu_buf, (unsigned long)task);
- }
-
- add_code(cpu_buf, CPU_TRACE_BEGIN);
}
#endif
@@ -475,17 +456,18 @@ fail:
#ifdef CONFIG_XEN
int oprofile_add_domain_switch(int32_t domain_id)
{
- struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
+ struct op_entry entry;
+ struct op_sample *sample;
- /* should have space for switching into and out of domain
- (2 slots each) plus one sample and one cpu mode switch */
- if (((nr_available_slots(cpu_buf) < 6) &&
- (domain_id != COORDINATOR_DOMAIN)) ||
- (nr_available_slots(cpu_buf) < 2))
+ sample = op_cpu_buffer_write_reserve(&entry, 1);
+ if (!sample)
return 0;
- add_code(cpu_buf, DOMAIN_SWITCH);
- add_sample(cpu_buf, domain_id, 0);
+ sample->eip = ESCAPE_CODE;
+ sample->event = DOMAIN_SWITCH;
+
+ op_cpu_buffer_add_data(&entry, domain_id);
+ op_cpu_buffer_write_commit(&entry);
current_domain = domain_id;
--- head-2011-03-17.orig/drivers/pci/msi-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/drivers/pci/msi-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -724,30 +724,21 @@ void pci_no_msi(void)
pci_msi_enable = 0;
}
+/**
+ * pci_msi_enabled - is MSI enabled?
+ *
+ * Returns true if MSI has not been disabled by the command-line option
+ * pci=nomsi.
+ **/
+int pci_msi_enabled(void)
+{
+ return pci_msi_enable;
+}
+EXPORT_SYMBOL(pci_msi_enabled);
+
void pci_msi_init_pci_dev(struct pci_dev *dev)
{
#ifndef CONFIG_XEN
INIT_LIST_HEAD(&dev->msi_list);
#endif
}
-
-#ifdef CONFIG_ACPI
-#include <linux/acpi.h>
-#include <linux/pci-acpi.h>
-static void __devinit msi_acpi_init(void)
-{
- if (acpi_pci_disabled)
- return;
- pci_osc_support_set(OSC_MSI_SUPPORT);
- pcie_osc_support_set(OSC_MSI_SUPPORT);
-}
-#else
-static inline void msi_acpi_init(void) { }
-#endif /* CONFIG_ACPI */
-
-void __devinit msi_init(void)
-{
- if (!pci_msi_enable)
- return;
- msi_acpi_init();
-}
--- head-2011-03-17.orig/drivers/xen/Kconfig 2011-02-02 15:36:33.000000000 +0100
+++ head-2011-03-17/drivers/xen/Kconfig 2011-02-02 15:37:07.000000000 +0100
@@ -393,6 +393,7 @@ config XEN_BACKEND
config XENFS
tristate "Xen filesystem"
+ depends on PARAVIRT_XEN
default y
help
The xen filesystem provides a way for domains to share
--- head-2011-03-17.orig/drivers/xen/Makefile 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/drivers/xen/Makefile 2011-02-24 14:09:54.000000000 +0100
@@ -15,6 +15,7 @@ obj-$(CONFIG_XEN) += features.o $(xen-
obj-$(CONFIG_HOTPLUG_CPU) += $(xen-hotplug-y)
obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
obj-$(CONFIG_XEN_BALLOON) += $(xen-balloon-y)
+obj-$(CONFIG_XENFS) += xenfs/
obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
obj-$(CONFIG_XEN_BLKDEV_TAP2) += blktap2/ blktap2-new/
--- head-2011-03-17.orig/drivers/xen/balloon/sysfs.c 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/drivers/xen/balloon/sysfs.c 2011-02-01 14:42:26.000000000 +0100
@@ -67,7 +67,7 @@ static ssize_t store_target_kb(struct sy
struct sysdev_attribute *attr,
const char *buf, size_t count)
{
- char memstring[64], *endchar;
+ char *endchar;
unsigned long long target_bytes;
if (!capable(CAP_SYS_ADMIN))
@@ -75,11 +75,8 @@ static ssize_t store_target_kb(struct sy
if (count <= 1)
return -EBADMSG; /* runt */
- if (count > sizeof(memstring))
- return -EFBIG; /* too long */
- strcpy(memstring, buf);
- target_bytes = memparse(memstring, &endchar);
+ target_bytes = simple_strtoull(buf, &endchar, 0) << 10;
balloon_set_new_target(target_bytes >> PAGE_SHIFT);
return count;
@@ -88,8 +85,40 @@ static ssize_t store_target_kb(struct sy
static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
show_target_kb, store_target_kb);
+static ssize_t show_target(struct sys_device *dev,
+ struct sysdev_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%llu\n",
+ (unsigned long long)balloon_stats.target_pages
+ << PAGE_SHIFT);
+}
+
+static ssize_t store_target(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ char *endchar;
+ unsigned long long target_bytes;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (count <= 1)
+ return -EBADMSG; /* runt */
+
+ target_bytes = memparse(buf, &endchar);
+ balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+
+ return count;
+}
+
+static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR,
+ show_target, store_target);
+
static struct sysdev_attribute *balloon_attrs[] = {
&attr_target_kb,
+ &attr_target,
};
static struct attribute *balloon_info_attrs[] = {
--- head-2011-03-17.orig/drivers/xen/blkfront/vbd.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-17/drivers/xen/blkfront/vbd.c 2011-02-01 14:42:26.000000000 +0100
@@ -305,6 +305,10 @@ xlvbd_init_blk_queue(struct gendisk *gd,
if (rq == NULL)
return -1;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29)
+ queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
+#endif
+
/* Hard sector size and max sectors impersonate the equiv. hardware. */
blk_queue_hardsect_size(rq, sector_size);
blk_queue_max_sectors(rq, 512);
--- head-2011-03-17.orig/drivers/xen/core/cpu_hotplug.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-17/drivers/xen/core/cpu_hotplug.c 2011-02-01 14:42:26.000000000 +0100
@@ -11,10 +11,10 @@
* Set of CPUs that remote admin software will allow us to bring online.
* Notified to us via xenbus.
*/
-static cpumask_t xenbus_allowed_cpumask;
+static cpumask_var_t xenbus_allowed_cpumask;
/* Set of CPUs that local admin will allow us to bring online. */
-static cpumask_t local_allowed_cpumask = CPU_MASK_ALL;
+static cpumask_var_t local_allowed_cpumask;
static int local_cpu_hotplug_request(void)
{
@@ -41,11 +41,11 @@ static void vcpu_hotplug(unsigned int cp
}
if (strcmp(state, "online") == 0) {
- cpu_set(cpu, xenbus_allowed_cpumask);
+ cpumask_set_cpu(cpu, xenbus_allowed_cpumask);
if (!cpu_up(cpu) && dev)
kobject_uevent(&dev->kobj, KOBJ_ONLINE);
} else if (strcmp(state, "offline") == 0) {
- cpu_clear(cpu, xenbus_allowed_cpumask);
+ cpumask_clear_cpu(cpu, xenbus_allowed_cpumask);
if (!cpu_down(cpu) && dev)
kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
} else {
@@ -78,7 +78,7 @@ static int smpboot_cpu_notify(struct not
* as it's always executed from within a stopmachine kthread.
*/
if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request())
- cpu_clear(cpu, local_allowed_cpumask);
+ cpumask_clear_cpu(cpu, local_allowed_cpumask);
return NOTIFY_OK;
}
@@ -157,21 +157,26 @@ int cpu_up_check(unsigned int cpu)
int rc = 0;
if (local_cpu_hotplug_request()) {
- cpu_set(cpu, local_allowed_cpumask);
- if (!cpu_isset(cpu, xenbus_allowed_cpumask)) {
+ cpumask_set_cpu(cpu, local_allowed_cpumask);
+ if (!cpumask_test_cpu(cpu, xenbus_allowed_cpumask)) {
pr_warning("%s: attempt to bring up CPU %u disallowed "
"by remote admin.\n", __FUNCTION__, cpu);
rc = -EBUSY;
}
- } else if (!cpu_isset(cpu, local_allowed_cpumask) ||
- !cpu_isset(cpu, xenbus_allowed_cpumask)) {
+ } else if (!cpumask_test_cpu(cpu, local_allowed_cpumask) ||
+ !cpumask_test_cpu(cpu, xenbus_allowed_cpumask)) {
rc = -EBUSY;
}
return rc;
}
-void init_xenbus_allowed_cpumask(void)
+void __init init_xenbus_allowed_cpumask(void)
{
- xenbus_allowed_cpumask = cpu_present_map;
+ if (!alloc_cpumask_var(&xenbus_allowed_cpumask, GFP_KERNEL))
+ BUG();
+ cpumask_copy(xenbus_allowed_cpumask, cpu_present_mask);
+ if (!alloc_cpumask_var(&local_allowed_cpumask, GFP_KERNEL))
+ BUG();
+ cpumask_setall(local_allowed_cpumask);
}
--- head-2011-03-17.orig/drivers/xen/core/evtchn.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/drivers/xen/core/evtchn.c 2011-02-01 14:42:26.000000000 +0100
@@ -36,6 +36,7 @@
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/bootmem.h>
+#include <linux/ftrace.h>
#include <linux/version.h>
#include <asm/atomic.h>
#include <asm/system.h>
@@ -57,9 +58,6 @@ static DEFINE_SPINLOCK(irq_mapping_updat
static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
[0 ... NR_EVENT_CHANNELS-1] = -1 };
-/* Packed IRQ information: binding type, sub-type index, and event channel. */
-static u32 irq_info[NR_IRQS];
-
/* Binding types. */
enum {
IRQT_UNBOUND,
@@ -75,6 +73,30 @@ enum {
#define _EVTCHN_BITS 12
#define _INDEX_BITS (32 - _IRQT_BITS - _EVTCHN_BITS)
+/* Convenient shorthand for packed representation of an unbound IRQ. */
+#define IRQ_UNBOUND (IRQT_UNBOUND << (32 - _IRQT_BITS))
+
+static struct irq_cfg _irq_cfg[] = {
+ [0 ...
+#ifdef CONFIG_SPARSE_IRQ
+ BUILD_BUG_ON_ZERO(PIRQ_BASE) + NR_IRQS_LEGACY
+#else
+ NR_IRQS
+#endif
+ - 1].info = IRQ_UNBOUND
+};
+
+static inline struct irq_cfg *__pure irq_cfg(unsigned int irq)
+{
+#ifdef CONFIG_SPARSE_IRQ
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return desc ? desc->chip_data : NULL;
+#else
+ return irq < NR_IRQS ? _irq_cfg + irq : NULL;
+#endif
+}
+
/* Constructor for packed IRQ information. */
static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn)
{
@@ -90,26 +112,30 @@ static inline u32 mk_irq_info(u32 type,
return ((type << (32 - _IRQT_BITS)) | (index << _EVTCHN_BITS) | evtchn);
}
-/* Convenient shorthand for packed representation of an unbound IRQ. */
-#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
-
/*
* Accessors for packed IRQ information.
*/
static inline unsigned int evtchn_from_irq(int irq)
{
- return irq_info[irq] & ((1U << _EVTCHN_BITS) - 1);
+ const struct irq_cfg *cfg = irq_cfg(irq);
+
+ return cfg ? cfg->info & ((1U << _EVTCHN_BITS) - 1) : 0;
}
static inline unsigned int index_from_irq(int irq)
{
- return (irq_info[irq] >> _EVTCHN_BITS) & ((1U << _INDEX_BITS) - 1);
+ const struct irq_cfg *cfg = irq_cfg(irq);
+
+ return cfg ? (cfg->info >> _EVTCHN_BITS) & ((1U << _INDEX_BITS) - 1)
+ : 0;
}
static inline unsigned int type_from_irq(int irq)
{
- return irq_info[irq] >> (32 - _IRQT_BITS);
+ const struct irq_cfg *cfg = irq_cfg(irq);
+
+ return cfg ? cfg->info >> (32 - _IRQT_BITS) : IRQT_UNBOUND;
}
/* IRQ <-> VIRQ mapping. */
@@ -121,9 +147,6 @@ DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS
#endif
DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1};
-/* Reference counts for bindings to IRQs. */
-static int irq_bindcount[NR_IRQS];
-
#ifdef CONFIG_SMP
#if CONFIG_NR_CPUS <= 256
@@ -161,8 +184,12 @@ static void init_evtchn_cpu_bindings(voi
int i;
/* By default all event channels notify CPU#0. */
- for (i = 0; i < NR_IRQS; i++)
- irq_to_desc(i)->affinity = cpumask_of_cpu(0);
+ for (i = 0; i < nr_irqs; i++) {
+ struct irq_desc *desc = irq_to_desc(i);
+
+ if (desc)
+ desc->affinity = cpumask_of_cpu(0);
+ }
memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
for_each_possible_cpu(i)
@@ -239,7 +266,7 @@ static DEFINE_PER_CPU(unsigned int, curr
static DEFINE_PER_CPU(unsigned int, current_l2i);
/* NB. Interrupts are disabled on entry. */
-asmlinkage void evtchn_do_upcall(struct pt_regs *regs)
+asmlinkage void __irq_entry evtchn_do_upcall(struct pt_regs *regs)
{
unsigned long l1, l2;
unsigned long masked_l1, masked_l2;
@@ -341,14 +368,25 @@ asmlinkage void evtchn_do_upcall(struct
irq_exit();
}
-static int find_unbound_irq(void)
+static struct irq_chip dynirq_chip;
+
+static int find_unbound_irq(unsigned int cpu)
{
static int warned;
int irq;
- for (irq = DYNIRQ_BASE; irq < (DYNIRQ_BASE + NR_DYNIRQS); irq++)
- if (irq_bindcount[irq] == 0)
+ for (irq = DYNIRQ_BASE; irq < (DYNIRQ_BASE + NR_DYNIRQS); irq++) {
+ struct irq_desc *desc = irq_to_desc_alloc_cpu(irq, cpu);
+ struct irq_cfg *cfg = desc->chip_data;
+
+ if (!cfg->bindcount) {
+ desc->status |= IRQ_NOPROBE;
+ set_irq_chip_and_handler_name(irq, &dynirq_chip,
+ handle_fasteoi_irq,
+ "fasteoi");
return irq;
+ }
+ }
if (!warned) {
warned = 1;
@@ -366,14 +404,15 @@ static int bind_caller_port_to_irq(unsig
spin_lock(&irq_mapping_update_lock);
if ((irq = evtchn_to_irq[caller_port]) == -1) {
- if ((irq = find_unbound_irq()) < 0)
+ if ((irq = find_unbound_irq(smp_processor_id())) < 0)
goto out;
evtchn_to_irq[caller_port] = irq;
- irq_info[irq] = mk_irq_info(IRQT_CALLER_PORT, 0, caller_port);
+ irq_cfg(irq)->info = mk_irq_info(IRQT_CALLER_PORT,
+ 0, caller_port);
}
- irq_bindcount[irq]++;
+ irq_cfg(irq)->bindcount++;
out:
spin_unlock(&irq_mapping_update_lock);
@@ -388,7 +427,7 @@ static int bind_local_port_to_irq(unsign
BUG_ON(evtchn_to_irq[local_port] != -1);
- if ((irq = find_unbound_irq()) < 0) {
+ if ((irq = find_unbound_irq(smp_processor_id())) < 0) {
struct evtchn_close close = { .port = local_port };
if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
BUG();
@@ -396,8 +435,8 @@ static int bind_local_port_to_irq(unsign
}
evtchn_to_irq[local_port] = irq;
- irq_info[irq] = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port);
- irq_bindcount[irq]++;
+ irq_cfg(irq)->info = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port);
+ irq_cfg(irq)->bindcount++;
out:
spin_unlock(&irq_mapping_update_lock);
@@ -441,7 +480,7 @@ static int bind_virq_to_irq(unsigned int
spin_lock(&irq_mapping_update_lock);
if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
- if ((irq = find_unbound_irq()) < 0)
+ if ((irq = find_unbound_irq(cpu)) < 0)
goto out;
bind_virq.virq = virq;
@@ -452,14 +491,14 @@ static int bind_virq_to_irq(unsigned int
evtchn = bind_virq.port;
evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+ irq_cfg(irq)->info = mk_irq_info(IRQT_VIRQ, virq, evtchn);
per_cpu(virq_to_irq, cpu)[virq] = irq;
bind_evtchn_to_cpu(evtchn, cpu);
}
- irq_bindcount[irq]++;
+ irq_cfg(irq)->bindcount++;
out:
spin_unlock(&irq_mapping_update_lock);
@@ -474,7 +513,7 @@ static int bind_ipi_to_irq(unsigned int
spin_lock(&irq_mapping_update_lock);
if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) {
- if ((irq = find_unbound_irq()) < 0)
+ if ((irq = find_unbound_irq(cpu)) < 0)
goto out;
bind_ipi.vcpu = cpu;
@@ -484,14 +523,14 @@ static int bind_ipi_to_irq(unsigned int
evtchn = bind_ipi.port;
evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
+ irq_cfg(irq)->info = mk_irq_info(IRQT_IPI, ipi, evtchn);
per_cpu(ipi_to_irq, cpu)[ipi] = irq;
bind_evtchn_to_cpu(evtchn, cpu);
}
- irq_bindcount[irq]++;
+ irq_cfg(irq)->bindcount++;
out:
spin_unlock(&irq_mapping_update_lock);
@@ -506,7 +545,7 @@ static void unbind_from_irq(unsigned int
spin_lock(&irq_mapping_update_lock);
- if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) {
+ if (!--irq_cfg(irq)->bindcount && VALID_EVTCHN(evtchn)) {
close.port = evtchn;
if ((type_from_irq(irq) != IRQT_CALLER_PORT) &&
HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
@@ -529,11 +568,15 @@ static void unbind_from_irq(unsigned int
bind_evtchn_to_cpu(evtchn, 0);
evtchn_to_irq[evtchn] = -1;
- irq_info[irq] = IRQ_UNBOUND;
+ irq_cfg(irq)->info = IRQ_UNBOUND;
/* Zap stats across IRQ changes of use. */
for_each_possible_cpu(cpu)
+#ifdef CONFIG_SPARSE_IRQ
+ irq_to_desc(irq)->kstat_irqs[cpu] = 0;
+#else
kstat_cpu(cpu).irqs[irq] = 0;
+#endif
}
spin_unlock(&irq_mapping_update_lock);
@@ -685,10 +728,9 @@ static void rebind_irq_to_cpu(unsigned i
rebind_evtchn_to_cpu(evtchn, tcpu);
}
-static void set_affinity_irq(unsigned int irq, cpumask_t dest)
+static void set_affinity_irq(unsigned int irq, const struct cpumask *dest)
{
- unsigned tcpu = first_cpu(dest);
- rebind_irq_to_cpu(irq, tcpu);
+ rebind_irq_to_cpu(irq, cpumask_first(dest));
}
#endif
@@ -854,7 +896,7 @@ static void enable_pirq(unsigned int irq
evtchn_to_irq[evtchn] = irq;
bind_evtchn_to_cpu(evtchn, 0);
- irq_info[irq] = mk_irq_info(IRQT_PIRQ, bind_pirq.pirq, evtchn);
+ irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, bind_pirq.pirq, evtchn);
out:
pirq_unmask_and_notify(evtchn, irq);
@@ -884,7 +926,7 @@ static void shutdown_pirq(unsigned int i
bind_evtchn_to_cpu(evtchn, 0);
evtchn_to_irq[evtchn] = -1;
- irq_info[irq] = mk_irq_info(IRQT_PIRQ, index_from_irq(irq), 0);
+ irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, index_from_irq(irq), 0);
}
static void unmask_pirq(unsigned int irq)
@@ -1009,7 +1051,7 @@ static void restore_cpu_virqs(unsigned i
if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1)
continue;
- BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0));
+ BUG_ON(irq_cfg(irq)->info != mk_irq_info(IRQT_VIRQ, virq, 0));
/* Get a new binding from Xen. */
bind_virq.virq = virq;
@@ -1021,7 +1063,7 @@ static void restore_cpu_virqs(unsigned i
/* Record the new mapping. */
evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+ irq_cfg(irq)->info = mk_irq_info(IRQT_VIRQ, virq, evtchn);
bind_evtchn_to_cpu(evtchn, cpu);
/* Ready for use. */
@@ -1038,7 +1080,7 @@ static void restore_cpu_ipis(unsigned in
if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
continue;
- BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0));
+ BUG_ON(irq_cfg(irq)->info != mk_irq_info(IRQT_IPI, ipi, 0));
/* Get a new binding from Xen. */
bind_ipi.vcpu = cpu;
@@ -1049,7 +1091,7 @@ static void restore_cpu_ipis(unsigned in
/* Record the new mapping. */
evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
+ irq_cfg(irq)->info = mk_irq_info(IRQT_IPI, ipi, evtchn);
bind_evtchn_to_cpu(evtchn, cpu);
/* Ready for use. */
@@ -1061,6 +1103,7 @@ static void restore_cpu_ipis(unsigned in
void irq_resume(void)
{
unsigned int cpu, irq, evtchn;
+ struct irq_cfg *cfg;
init_evtchn_cpu_bindings();
@@ -1077,12 +1120,17 @@ void irq_resume(void)
mask_evtchn(evtchn);
/* Check that no PIRQs are still bound. */
- for (irq = PIRQ_BASE; irq < (PIRQ_BASE + NR_PIRQS); irq++)
- BUG_ON(irq_info[irq] != IRQ_UNBOUND);
+ for (irq = PIRQ_BASE; irq < (PIRQ_BASE + NR_PIRQS); irq++) {
+ cfg = irq_cfg(irq);
+ BUG_ON(cfg && cfg->info != IRQ_UNBOUND);
+ }
/* No IRQ <-> event-channel mappings. */
- for (irq = 0; irq < NR_IRQS; irq++)
- irq_info[irq] &= ~((1U << _EVTCHN_BITS) - 1);
+ for (irq = 0; irq < nr_irqs; irq++) {
+ cfg = irq_cfg(irq);
+ if (cfg)
+ cfg->info &= ~((1U << _EVTCHN_BITS) - 1);
+ }
for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
evtchn_to_irq[evtchn] = -1;
@@ -1094,10 +1142,56 @@ void irq_resume(void)
}
#endif
+int __init arch_early_irq_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(_irq_cfg); i++)
+ irq_to_desc(i)->chip_data = _irq_cfg + i;
+
+ return 0;
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+int arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+ if (!desc->chip_data) {
+ /* By default all event channels notify CPU#0. */
+ desc->affinity = cpumask_of_cpu(0);
+
+ desc->chip_data = kzalloc(sizeof(struct irq_cfg), GFP_ATOMIC);
+ }
+ if (!desc->chip_data) {
+ pr_emerg("cannot alloc irq_cfg\n");
+ BUG();
+ }
+
+ return 0;
+}
+#endif
+
#if defined(CONFIG_X86_IO_APIC)
+int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+{
+ struct physdev_irq irq_op;
+
+ if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS)
+ return -EINVAL;
+
+ if (cfg->vector)
+ return 0;
+
+ irq_op.irq = irq;
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
+ return -ENOSPC;
+
+ cfg->vector = irq_op.vector;
+
+ return 0;
+}
#define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE))
#elif defined(CONFIG_X86)
-#define identity_mapped_irq(irq) (((irq) - PIRQ_BASE) < 16)
+#define identity_mapped_irq(irq) (((irq) - PIRQ_BASE) < NR_IRQS_LEGACY)
#else
#define identity_mapped_irq(irq) (1)
#endif
@@ -1107,7 +1201,7 @@ void evtchn_register_pirq(int irq)
BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS);
if (identity_mapped_irq(irq) || type_from_irq(irq) != IRQT_UNBOUND)
return;
- irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, 0);
+ irq_cfg(irq)->info = mk_irq_info(IRQT_PIRQ, irq, 0);
set_irq_chip_and_handler_name(irq, &pirq_chip, handle_fasteoi_irq,
"fasteoi");
}
@@ -1120,12 +1214,17 @@ int evtchn_map_pirq(int irq, int xen_pir
irq = PIRQ_BASE + NR_PIRQS - 1;
spin_lock(&irq_alloc_lock);
do {
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+
if (identity_mapped_irq(irq))
continue;
+ desc = irq_to_desc_alloc_cpu(irq, smp_processor_id());
+ cfg = desc->chip_data;
if (!index_from_irq(irq)) {
BUG_ON(type_from_irq(irq) != IRQT_UNBOUND);
- irq_info[irq] = mk_irq_info(IRQT_PIRQ,
- xen_pirq, 0);
+ cfg->info = mk_irq_info(IRQT_PIRQ,
+ xen_pirq, 0);
break;
}
} while (--irq >= PIRQ_BASE);
@@ -1144,7 +1243,7 @@ int evtchn_map_pirq(int irq, int xen_pir
* then causes a warning in dynamic_irq_cleanup().
*/
set_irq_chip_and_handler(irq, NULL, NULL);
- irq_info[irq] = IRQ_UNBOUND;
+ irq_cfg(irq)->info = IRQ_UNBOUND;
return 0;
} else if (type_from_irq(irq) != IRQT_PIRQ
|| index_from_irq(irq) != xen_pirq) {
@@ -1181,23 +1280,17 @@ void __init xen_init_IRQ(void)
for (i = 0; i < NR_EVENT_CHANNELS; i++)
mask_evtchn(i);
- /* No IRQ -> event-channel mappings. */
- for (i = 0; i < NR_IRQS; i++)
- irq_info[i] = IRQ_UNBOUND;
-
- /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
+#ifndef CONFIG_SPARSE_IRQ
for (i = DYNIRQ_BASE; i < (DYNIRQ_BASE + NR_DYNIRQS); i++) {
- irq_bindcount[i] = 0;
-
irq_to_desc(i)->status |= IRQ_NOPROBE;
set_irq_chip_and_handler_name(i, &dynirq_chip,
handle_fasteoi_irq, "fasteoi");
}
- /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_PIRQS); i++) {
- irq_bindcount[i] = 1;
-
+#else
+ for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_IRQS_LEGACY); i++) {
+#endif
if (!identity_mapped_irq(i))
continue;
--- head-2011-03-17.orig/drivers/xen/core/machine_reboot.c 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/drivers/xen/core/machine_reboot.c 2011-02-01 14:42:26.000000000 +0100
@@ -19,6 +19,9 @@
#include <xen/interface/vcpu.h>
#if defined(__i386__) || defined(__x86_64__)
+#include <asm/pci_x86.h>
+/* TBD: Dom0 should propagate the determined value to Xen. */
+bool port_cf9_safe = false;
/*
* Power off function, if any
@@ -79,7 +82,7 @@ static void post_suspend(int suspend_can
pfn_to_mfn(xen_start_info->console.domU.mfn);
} else {
#ifdef CONFIG_SMP
- cpu_initialized_map = cpu_online_map;
+ cpumask_copy(vcpu_initialized_mask, cpu_online_mask);
#endif
for_each_possible_cpu(i)
setup_runstate_area(i);
@@ -219,6 +222,12 @@ int __xen_suspend(int fast_suspend, void
if (num_possible_cpus() == 1)
fast_suspend = 0;
+ if (fast_suspend) {
+ err = stop_machine_create();
+ if (err)
+ return err;
+ }
+
suspend.fast_suspend = fast_suspend;
suspend.resume_notifier = resume_notifier;
@@ -245,6 +254,8 @@ int __xen_suspend(int fast_suspend, void
if (!fast_suspend)
smp_resume();
+ else
+ stop_machine_destroy();
return 0;
}
--- head-2011-03-17.orig/drivers/xen/core/smpboot.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/drivers/xen/core/smpboot.c 2011-02-01 14:42:26.000000000 +0100
@@ -33,11 +33,7 @@ extern void failsafe_callback(void);
extern void system_call(void);
extern void smp_trap_init(trap_info_t *);
-cpumask_t cpu_online_map;
-EXPORT_SYMBOL(cpu_online_map);
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
-cpumask_t cpu_initialized_map;
+cpumask_var_t vcpu_initialized_mask;
DEFINE_PER_CPU(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
@@ -64,10 +60,14 @@ void __init prefill_possible_map(void)
#endif
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
if (rc >= 0) {
- cpu_set(i, cpu_possible_map);
+ set_cpu_possible(i, true);
nr_cpu_ids = i + 1;
}
}
+ total_cpus = num_possible_cpus();
+ for (; HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL) >= 0; ++i)
+ if (i != smp_processor_id())
+ ++total_cpus;
}
static int __cpuinit xen_smp_intr_init(unsigned int cpu)
@@ -167,7 +167,7 @@ static void __cpuinit cpu_initialize_con
struct task_struct *idle = idle_task(cpu);
- if (cpu_test_and_set(cpu, cpu_initialized_map))
+ if (cpumask_test_and_set_cpu(cpu, vcpu_initialized_mask))
return;
spin_lock(&ctxt_lock);
@@ -237,13 +237,15 @@ void __init smp_prepare_cpus(unsigned in
if (xen_smp_intr_init(0))
BUG();
- cpu_initialized_map = cpumask_of_cpu(0);
+ if (!alloc_cpumask_var(&vcpu_initialized_mask, GFP_KERNEL))
+ BUG();
+ cpumask_copy(vcpu_initialized_mask, cpumask_of(0));
/* Restrict the possible_map according to max_cpus. */
while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
- for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
+ for (cpu = nr_cpu_ids-1; !cpu_possible(cpu); cpu--)
continue;
- cpu_clear(cpu, cpu_possible_map);
+ set_cpu_possible(cpu, false);
}
for_each_possible_cpu (cpu) {
@@ -278,10 +280,8 @@ void __init smp_prepare_cpus(unsigned in
#ifdef CONFIG_HOTPLUG_CPU
if (is_initial_xendomain())
- cpu_set(cpu, cpu_present_map);
-#else
- cpu_set(cpu, cpu_present_map);
#endif
+ set_cpu_present(cpu, true);
}
init_xenbus_allowed_cpumask();
@@ -314,22 +314,24 @@ void __init smp_prepare_boot_cpu(void)
*/
static int __init initialize_cpu_present_map(void)
{
- cpu_present_map = cpu_possible_map;
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ set_cpu_present(cpu, true);
+
return 0;
}
core_initcall(initialize_cpu_present_map);
int __cpuinit __cpu_disable(void)
{
- cpumask_t map = cpu_online_map;
unsigned int cpu = smp_processor_id();
if (cpu == 0)
return -EBUSY;
- cpu_clear(cpu, map);
- fixup_irqs(map);
- cpu_clear(cpu, cpu_online_map);
+ set_cpu_online(cpu, false);
+ fixup_irqs();
return 0;
}
@@ -369,7 +371,7 @@ int __cpuinit __cpu_up(unsigned int cpu)
if (rc)
return rc;
- cpu_set(cpu, cpu_online_map);
+ set_cpu_online(cpu, true);
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
@@ -381,7 +383,7 @@ void __ref play_dead(void)
{
idle_task_exit();
local_irq_disable();
- cpu_clear(smp_processor_id(), cpu_initialized);
+ cpumask_clear_cpu(smp_processor_id(), cpu_initialized_mask);
preempt_enable_no_resched();
VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
#ifdef CONFIG_HOTPLUG_CPU
--- head-2011-03-17.orig/drivers/xen/netback/interface.c 2011-03-17 14:12:41.000000000 +0100
+++ head-2011-03-17/drivers/xen/netback/interface.c 2011-02-17 10:15:18.000000000 +0100
@@ -222,6 +222,15 @@ static struct ethtool_ops network_ethtoo
.get_strings = netbk_get_strings,
};
+static const struct net_device_ops netif_be_netdev_ops = {
+ .ndo_open = net_open,
+ .ndo_stop = net_close,
+ .ndo_start_xmit = netif_be_start_xmit,
+ .ndo_change_mtu = netbk_change_mtu,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
netif_t *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
{
int err = 0;
@@ -258,10 +267,7 @@ netif_t *netif_alloc(struct device *pare
init_timer(&netif->tx_queue_timeout);
- dev->hard_start_xmit = netif_be_start_xmit;
- dev->open = net_open;
- dev->stop = net_close;
- dev->change_mtu = netbk_change_mtu;
+ dev->netdev_ops = &netif_be_netdev_ops;
netif_set_features(netif);
--- head-2011-03-17.orig/drivers/xen/netback/loopback.c 2011-01-31 17:32:29.000000000 +0100
+++ head-2011-03-17/drivers/xen/netback/loopback.c 2011-03-01 11:52:05.000000000 +0100
@@ -155,7 +155,6 @@ static int loopback_start_xmit(struct sk
skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */
skb->protocol = eth_type_trans(skb, dev);
- dev->last_rx = jiffies;
/* Flush netfilter context: rx'ed skbuffs not expected to have any. */
nf_reset(skb);
@@ -194,6 +193,14 @@ static void loopback_set_multicast_list(
{
}
+static const struct net_device_ops loopback_netdev_ops = {
+ .ndo_open = loopback_open,
+ .ndo_stop = loopback_close,
+ .ndo_start_xmit = loopback_start_xmit,
+ .ndo_set_multicast_list = loopback_set_multicast_list,
+ .ndo_change_mtu = NULL, /* allow arbitrary mtu */
+};
+
static void loopback_construct(struct net_device *dev, struct net_device *lo,
int loop_idx)
{
@@ -202,12 +209,7 @@ static void loopback_construct(struct ne
np->loopback_dev = lo;
np->loop_idx = loop_idx;
- dev->open = loopback_open;
- dev->stop = loopback_close;
- dev->hard_start_xmit = loopback_start_xmit;
- dev->set_multicast_list = loopback_set_multicast_list;
- dev->change_mtu = NULL; /* allow arbitrary mtu */
-
+ dev->netdev_ops = &loopback_netdev_ops;
dev->tx_queue_len = 0;
dev->features = (NETIF_F_HIGHDMA |
--- head-2011-03-17.orig/drivers/xen/netback/netback.c 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/drivers/xen/netback/netback.c 2011-03-01 11:52:09.000000000 +0100
@@ -363,7 +363,7 @@ static void xen_network_done_notify(void
static struct net_device *eth0_dev = NULL;
if (unlikely(eth0_dev == NULL))
eth0_dev = __dev_get_by_name(&init_net, "eth0");
- netif_rx_schedule(eth0_dev, ???);
+ netif_rx_schedule(???);
}
/*
* Add following to poll() function in NAPI driver (Tigon3 is example):
@@ -1495,7 +1495,6 @@ static void net_tx_action(unsigned long
dev->stats.rx_packets++;
netif_rx(skb);
- dev->last_rx = jiffies;
}
out:
--- head-2011-03-17.orig/drivers/xen/netfront/netfront.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/drivers/xen/netfront/netfront.c 2011-02-09 16:04:26.000000000 +0100
@@ -632,7 +632,7 @@ static int network_open(struct net_devic
if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)){
netfront_accelerator_call_stop_napi_irq(np, dev);
- netif_rx_schedule(dev, &np->napi);
+ netif_rx_schedule(&np->napi);
}
}
spin_unlock_bh(&np->rx_lock);
@@ -703,7 +703,7 @@ static void rx_refill_timeout(unsigned l
netfront_accelerator_call_stop_napi_irq(np, dev);
- netif_rx_schedule(dev, &np->napi);
+ netif_rx_schedule(&np->napi);
}
static void network_alloc_rx_buffers(struct net_device *dev)
@@ -1057,8 +1057,7 @@ static irqreturn_t netif_int(int irq, vo
if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) {
netfront_accelerator_call_stop_napi_irq(np, dev);
- netif_rx_schedule(dev, &np->napi);
- dev->last_rx = jiffies;
+ netif_rx_schedule(&np->napi);
}
}
@@ -1474,7 +1473,6 @@ err:
/* Pass it up. */
netif_receive_skb(skb);
- dev->last_rx = jiffies;
}
/* If we get a callback with very few responses, reduce fill target. */
@@ -1516,7 +1514,7 @@ err:
}
if (!more_to_do && !accel_more_to_do)
- __netif_rx_complete(dev, napi);
+ __netif_rx_complete(napi);
local_irq_restore(flags);
}
@@ -2069,6 +2067,18 @@ static void network_set_multicast_list(s
{
}
+static const struct net_device_ops xennet_netdev_ops = {
+ .ndo_uninit = netif_uninit,
+ .ndo_open = network_open,
+ .ndo_stop = network_close,
+ .ndo_start_xmit = network_start_xmit,
+ .ndo_set_multicast_list = network_set_multicast_list,
+ .ndo_set_mac_address = xennet_set_mac_address,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_change_mtu = xennet_change_mtu,
+ .ndo_get_stats = network_get_stats,
+};
+
static struct net_device * __devinit create_netdev(struct xenbus_device *dev)
{
int i, err = 0;
@@ -2124,15 +2134,8 @@ static struct net_device * __devinit cre
goto exit_free_tx;
}
- netdev->open = network_open;
- netdev->hard_start_xmit = network_start_xmit;
- netdev->stop = network_close;
- netdev->get_stats = network_get_stats;
+ netdev->netdev_ops = &xennet_netdev_ops;
netif_napi_add(netdev, &np->napi, netif_poll, 64);
- netdev->set_multicast_list = network_set_multicast_list;
- netdev->uninit = netif_uninit;
- netdev->set_mac_address = xennet_set_mac_address;
- netdev->change_mtu = xennet_change_mtu;
netdev->features = NETIF_F_IP_CSUM;
SET_ETHTOOL_OPS(netdev, &network_ethtool_ops);
@@ -2163,7 +2166,7 @@ inetdev_notify(struct notifier_block *th
struct net_device *dev = ifa->ifa_dev->dev;
/* UP event and is it one of our devices? */
- if (event == NETDEV_UP && dev->open == network_open)
+ if (event == NETDEV_UP && dev->netdev_ops->ndo_open == network_open)
send_fake_arp(dev);
return NOTIFY_DONE;
--- head-2011-03-17.orig/drivers/xen/sfc_netfront/accel_msg.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-17/drivers/xen/sfc_netfront/accel_msg.c 2011-02-01 14:42:26.000000000 +0100
@@ -47,7 +47,7 @@ static void vnic_start_interrupts(netfro
netfront_accel_disable_net_interrupts(vnic);
vnic->irq_enabled = 0;
NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_schedule_count++);
- netif_rx_schedule(vnic->net_dev, &np->napi);
+ netif_rx_schedule(&np->napi);
} else {
/*
* Nothing yet, make sure we get interrupts through
@@ -532,7 +532,7 @@ irqreturn_t netfront_accel_net_channel_i
vnic->stats.event_count_since_irq;
vnic->stats.event_count_since_irq = 0;
#endif
- netif_rx_schedule(net_dev, &np->napi);
+ netif_rx_schedule(&np->napi);
}
else {
spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
--- head-2011-03-17.orig/drivers/xen/xenbus/xenbus_client.c 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/drivers/xen/xenbus/xenbus_client.c 2011-02-01 14:42:26.000000000 +0100
@@ -169,7 +169,6 @@ EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
/**
* xenbus_switch_state
* @dev: xenbus device
- * @xbt: transaction handle
* @state: new state
*
* Advertise in the store a change of the given driver to the given new_state.
@@ -302,7 +301,7 @@ EXPORT_SYMBOL_GPL(xenbus_dev_error);
* @fmt: error message format
*
* Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
- * xenbus_switch_state(dev, NULL, XenbusStateClosing) to schedule an orderly
+ * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
* closedown of this driver and its peer.
*/
void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
--- head-2011-03-17.orig/drivers/xen/xenbus/xenbus_probe.c 2011-02-01 14:38:38.000000000 +0100
+++ head-2011-03-17/drivers/xen/xenbus/xenbus_probe.c 2011-02-01 14:42:26.000000000 +0100
@@ -42,6 +42,7 @@
#include <linux/ctype.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
+#include <linux/proc_fs.h>
#include <linux/notifier.h>
#include <linux/mutex.h>
#include <linux/io.h>
@@ -73,6 +74,10 @@
#endif
int xen_store_evtchn;
+#if !defined(CONFIG_XEN) && !defined(MODULE)
+EXPORT_SYMBOL(xen_store_evtchn);
+#endif
+
struct xenstore_domain_interface *xen_store_interface;
static unsigned long xen_store_mfn;
@@ -198,6 +203,12 @@ static int xenbus_uevent_frontend(struct
}
#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29)
+static struct device_attribute xenbus_dev_attrs[] = {
+ __ATTR_NULL
+};
+#endif
+
/* Bus type for frontend drivers. */
static struct xen_bus_type xenbus_frontend = {
.root = "device",
@@ -206,13 +217,16 @@ static struct xen_bus_type xenbus_fronte
.probe = xenbus_probe_frontend,
.error = -ENODEV,
.bus = {
- .name = "xen",
- .match = xenbus_match,
+ .name = "xen",
+ .match = xenbus_match,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
- .probe = xenbus_dev_probe,
- .remove = xenbus_dev_remove,
- .shutdown = xenbus_dev_shutdown,
- .uevent = xenbus_uevent_frontend,
+ .probe = xenbus_dev_probe,
+ .remove = xenbus_dev_remove,
+ .shutdown = xenbus_dev_shutdown,
+ .uevent = xenbus_uevent_frontend,
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29)
+ .dev_attrs = xenbus_dev_attrs,
#endif
},
#if defined(CONFIG_XEN) || defined(MODULE)
@@ -586,7 +600,17 @@ int xenbus_probe_node(struct xen_bus_typ
xendev->dev.bus = &bus->bus;
xendev->dev.release = xenbus_dev_release;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
+ {
+ char devname[XEN_BUS_ID_SIZE];
+
+ err = bus->get_bus_id(devname, xendev->nodename);
+ if (!err)
+ dev_set_name(&xendev->dev, devname);
+ }
+#else
err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
+#endif
if (err)
goto fail;
@@ -774,7 +798,7 @@ static int suspend_dev(struct device *de
err = drv->suspend(xdev);
if (err)
pr_warning("xenbus: suspend %s failed: %i\n",
- dev->bus_id, err);
+ dev_name(dev), err);
return 0;
}
@@ -794,7 +818,7 @@ static int suspend_cancel_dev(struct dev
err = drv->suspend_cancel(xdev);
if (err)
pr_warning("xenbus: suspend_cancel %s failed: %i\n",
- dev->bus_id, err);
+ dev_name(dev), err);
return 0;
}
@@ -815,7 +839,7 @@ static int resume_dev(struct device *dev
err = talk_to_otherend(xdev);
if (err) {
pr_warning("xenbus: resume (talk_to_otherend) %s failed: %i\n",
- dev->bus_id, err);
+ dev_name(dev), err);
return err;
}
@@ -825,7 +849,7 @@ static int resume_dev(struct device *dev
err = drv->resume(xdev);
if (err) {
pr_warning("xenbus: resume %s failed: %i\n",
- dev->bus_id, err);
+ dev_name(dev), err);
return err;
}
}
@@ -833,7 +857,7 @@ static int resume_dev(struct device *dev
err = watch_otherend(xdev);
if (err) {
pr_warning("xenbus_probe: resume (watch_otherend) %s failed:"
- " %d\n", dev->bus_id, err);
+ " %d\n", dev_name(dev), err);
return err;
}
@@ -1143,6 +1167,14 @@ static int __devinit xenbus_probe_init(v
if (!is_initial_xendomain())
xenbus_probe(NULL);
+#if defined(CONFIG_XEN_COMPAT_XENFS) && !defined(MODULE)
+ /*
+ * Create xenfs mountpoint in /proc for compatibility with
+ * utilities that expect to find "xenbus" under "/proc/xen".
+ */
+ proc_mkdir("xen", NULL);
+#endif
+
return 0;
err:
--- head-2011-03-17.orig/drivers/xen/xenbus/xenbus_probe.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/drivers/xen/xenbus/xenbus_probe.h 2011-02-07 14:42:39.000000000 +0100
@@ -43,6 +43,8 @@
#ifdef CONFIG_PARAVIRT_XEN
#define is_running_on_xen() xen_domain()
#define is_initial_xendomain() xen_initial_domain()
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
+#define dev_name(dev) ((dev)->bus_id)
#endif
#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
--- head-2011-03-17.orig/drivers/xen/xenbus/xenbus_probe_backend.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-17/drivers/xen/xenbus/xenbus_probe_backend.c 2011-02-01 14:42:26.000000000 +0100
@@ -36,6 +36,7 @@
__FUNCTION__, __LINE__, ##args)
#include <linux/kernel.h>
+#include <linux/version.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/ctype.h>
@@ -108,6 +109,10 @@ static int backend_bus_id(char bus_id[XE
return 0;
}
+static struct device_attribute xenbus_backend_attrs[] = {
+ __ATTR_NULL
+};
+
static struct xen_bus_type xenbus_backend = {
.root = "backend",
.levels = 3, /* backend/type/<frontend>/<id> */
@@ -115,12 +120,13 @@ static struct xen_bus_type xenbus_backen
.probe = xenbus_probe_backend,
.error = -ENODEV,
.bus = {
- .name = "xen-backend",
- .match = xenbus_match,
- .probe = xenbus_dev_probe,
- .remove = xenbus_dev_remove,
-// .shutdown = xenbus_dev_shutdown,
- .uevent = xenbus_uevent_backend,
+ .name = "xen-backend",
+ .match = xenbus_match,
+ .probe = xenbus_dev_probe,
+ .remove = xenbus_dev_remove,
+// .shutdown = xenbus_dev_shutdown,
+ .uevent = xenbus_uevent_backend,
+ .dev_attrs = xenbus_backend_attrs,
},
.dev = {
.bus_id = "xen-backend",
--- head-2011-03-17.orig/drivers/xen/xenbus/xenbus_xs.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-17/drivers/xen/xenbus/xenbus_xs.c 2011-02-01 14:42:26.000000000 +0100
@@ -226,6 +226,9 @@ void *xenbus_dev_request_and_reply(struc
return ret;
}
+#if !defined(CONFIG_XEN) && !defined(MODULE)
+EXPORT_SYMBOL(xenbus_dev_request_and_reply);
+#endif
/* Send message to xs, get kmalloc'ed reply. ERR_PTR() on error. */
static void *xs_talkv(struct xenbus_transaction t,
--- head-2011-03-17.orig/drivers/xen/xenoprof/xenoprofile.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-17/drivers/xen/xenoprof/xenoprofile.c 2011-02-01 14:42:26.000000000 +0100
@@ -49,7 +49,7 @@ static int xenoprof_enabled = 0;
static int xenoprof_is_primary = 0;
static int active_defined;
-extern unsigned long backtrace_depth;
+extern unsigned long oprofile_backtrace_depth;
/* Number of buffers in shared area (one per VCPU) */
static int nbuf;
@@ -338,11 +338,11 @@ static int xenoprof_setup(void)
active_defined = 1;
}
- if (backtrace_depth > 0) {
+ if (oprofile_backtrace_depth > 0) {
ret = HYPERVISOR_xenoprof_op(XENOPROF_set_backtrace,
- &backtrace_depth);
+ &oprofile_backtrace_depth);
if (ret)
- backtrace_depth = 0;
+ oprofile_backtrace_depth = 0;
}
ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL);
--- head-2011-03-17.orig/include/acpi/processor.h 2011-01-31 14:53:38.000000000 +0100
+++ head-2011-03-17/include/acpi/processor.h 2011-02-01 14:42:26.000000000 +0100
@@ -91,13 +91,24 @@ struct acpi_processor_cx {
};
struct acpi_processor_power {
+#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
+ union { /* 'dev' is actually only used for taking its address. */
+#endif
struct cpuidle_device dev;
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
struct acpi_processor_cx *state;
unsigned long bm_check_timestamp;
u32 default_state;
+#else
+ struct {
+#endif
int count;
struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
int timer_broadcast_on_state;
+#else
+ }; };
+#endif
};
/* Performance Management */
--- head-2011-03-17.orig/include/xen/cpu_hotplug.h 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/include/xen/cpu_hotplug.h 2011-02-01 14:42:26.000000000 +0100
@@ -5,7 +5,7 @@
#include <linux/cpumask.h>
#if defined(CONFIG_X86) && defined(CONFIG_SMP)
-extern cpumask_t cpu_initialized_map;
+extern cpumask_var_t vcpu_initialized_mask;
#endif
#if defined(CONFIG_HOTPLUG_CPU)
--- head-2011-03-17.orig/include/xen/evtchn.h 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-17/include/xen/evtchn.h 2011-02-01 14:42:26.000000000 +0100
@@ -48,6 +48,18 @@
* LOW-LEVEL DEFINITIONS
*/
+struct irq_cfg {
+ u32 info;
+ union {
+ int bindcount; /* for dynamic IRQs */
+#ifdef CONFIG_X86_IO_APIC
+ u8 vector; /* for physical IRQs */
+#endif
+ };
+};
+
+int assign_irq_vector(int irq, struct irq_cfg *, const struct cpumask *);
+
/*
* Dynamically bind an event source to an IRQ-like callback handler.
* On some platforms this may not be implemented via the Linux IRQ subsystem.
--- head-2011-03-17.orig/include/xen/xenbus.h 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-17/include/xen/xenbus.h 2011-02-02 16:58:42.000000000 +0100
@@ -325,7 +325,9 @@ void xenbus_dev_error(struct xenbus_devi
void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
...);
+#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
int xenbus_dev_init(void);
+#endif
const char *xenbus_strstate(enum xenbus_state state);
int xenbus_dev_is_online(struct xenbus_device *dev);
--- head-2011-03-17.orig/lib/swiotlb-xen.c 2011-02-01 14:39:24.000000000 +0100
+++ head-2011-03-17/lib/swiotlb-xen.c 2011-02-01 14:42:26.000000000 +0100
@@ -8,6 +8,7 @@
* Copyright (C) 2000, 2003 Hewlett-Packard Co
* David Mosberger-Tang <davidm@hpl.hp.com>
* Copyright (C) 2005 Keir Fraser <keir@xensource.com>
+ * 08/12/11 beckyb Add highmem support
*/
#include <linux/cache.h>
@@ -16,6 +17,8 @@
#include <linux/pci.h>
#include <linux/spinlock.h>
#include <linux/string.h>
+#include <linux/swiotlb.h>
+#include <linux/pfn.h>
#include <linux/types.h>
#include <linux/ctype.h>
#include <linux/init.h>
@@ -30,24 +33,9 @@
#include <xen/interface/memory.h>
#include <asm/gnttab_dma.h>
-int swiotlb;
-
#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
/*
- * Maximum allowable number of contiguous slabs to map,
- * must be a power of 2. What is the appropriate value ?
- * The complexity of {map,unmap}_single is linearly dependent on this value.
- */
-#define IO_TLB_SEGSIZE 128
-
-/*
- * log of the size of each IO TLB slab. The number of slabs is command line
- * controllable.
- */
-#define IO_TLB_SHIFT 11
-
-/*
* Enumeration for sync targets
*/
enum dma_sync_target {
@@ -55,10 +43,9 @@ enum dma_sync_target {
SYNC_FOR_DEVICE = 1,
};
+int swiotlb;
int swiotlb_force;
-static unsigned long iotlb_nslabs;
-
/*
* Used to do a quick range check in swiotlb_unmap_single and
* swiotlb_sync_single_*, to see if the memory was in fact allocated by this
@@ -67,6 +54,12 @@ static unsigned long iotlb_nslabs;
static char *io_tlb_start, *io_tlb_end;
/*
+ * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and
+ * io_tlb_end. This is command line adjustable via setup_io_tlb_npages.
+ */
+static unsigned long io_tlb_nslabs;
+
+/*
* When the IOMMU overflows we return a fallback buffer. This sets the size.
*/
static unsigned long io_tlb_overflow = 32*1024;
@@ -84,10 +77,7 @@ static unsigned int io_tlb_index;
* We need to save away the original address corresponding to a mapped entry
* for the sync operations.
*/
-static struct phys_addr {
- struct page *page;
- unsigned int offset;
-} *io_tlb_orig_addr;
+static phys_addr_t *io_tlb_orig_addr;
/*
* Protect the above data structures in the map and unmap calls
@@ -109,9 +99,9 @@ setup_io_tlb_npages(char *str)
{
/* Unlike ia64, the size is aperture in megabytes, not 'slabs'! */
if (isdigit(*str)) {
- iotlb_nslabs = simple_strtoul(str, &str, 0) <<
+ io_tlb_nslabs = simple_strtoul(str, &str, 0) <<
(20 - IO_TLB_SHIFT);
- iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
+ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
}
if (*str == ',')
++str;
@@ -129,35 +119,17 @@ setup_io_tlb_npages(char *str)
__setup("swiotlb=", setup_io_tlb_npages);
/* make io_tlb_overflow tunable too? */
-/*
- * Statically reserve bounce buffer space and initialize bounce buffer data
- * structures for the software IO TLB used to implement the PCI DMA API.
- */
-void __init
-swiotlb_init_with_default_size(size_t default_size)
+void *__init swiotlb_alloc_boot(size_t size, unsigned long nslabs)
{
- unsigned long i, bytes;
+ void *start = alloc_bootmem_pages(size);
+ unsigned int i;
int rc;
- if (!iotlb_nslabs) {
- iotlb_nslabs = (default_size >> IO_TLB_SHIFT);
- iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
- }
-
- bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT);
-
- /*
- * Get IO TLB memory from the low pages
- */
- io_tlb_start = alloc_bootmem_pages(bytes);
- if (!io_tlb_start)
- panic("Cannot allocate SWIOTLB buffer!\n");
-
dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
- for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) {
+ for (i = 0; i < nslabs; i += IO_TLB_SEGSIZE) {
do {
rc = xen_create_contiguous_region(
- (unsigned long)io_tlb_start + (i << IO_TLB_SHIFT),
+ (unsigned long)start + (i << IO_TLB_SHIFT),
get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT),
dma_bits);
} while (rc && dma_bits++ < max_dma_bits);
@@ -166,12 +138,12 @@ swiotlb_init_with_default_size(size_t de
panic("No suitable physical memory available for SWIOTLB buffer!\n"
"Use dom0_mem Xen boot parameter to reserve\n"
"some DMA memory (e.g., dom0_mem=-128M).\n");
- iotlb_nslabs = i;
+ io_tlb_nslabs = i;
i <<= IO_TLB_SHIFT;
- free_bootmem(__pa(io_tlb_start + i), bytes - i);
- bytes = i;
+ free_bootmem(__pa(start + i), size - i);
+ size = i;
for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) {
- unsigned int bits = fls64(virt_to_bus(io_tlb_start + i - 1));
+ unsigned int bits = fls64(virt_to_bus(start + i - 1));
if (bits > dma_bits)
dma_bits = bits;
@@ -179,18 +151,88 @@ swiotlb_init_with_default_size(size_t de
break;
}
}
+
+ return start;
+}
+
+#ifndef CONFIG_XEN
+void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs)
+{
+ return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
+}
+#endif
+
+dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
+{
+ return phys_to_machine(paddr);
+}
+
+phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
+{
+ return machine_to_phys(baddr);
+}
+
+static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
+ volatile void *address)
+{
+ return swiotlb_phys_to_bus(hwdev, virt_to_phys(address));
+}
+
+static void *swiotlb_bus_to_virt(dma_addr_t address)
+{
+ return phys_to_virt(swiotlb_bus_to_phys(address));
+}
+
+int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
+{
+ return 0;
+}
+
+static void swiotlb_print_info(unsigned long bytes)
+{
+ printk(KERN_INFO "Software IO TLB enabled: \n"
+ " Aperture: %lu megabytes\n"
+ " Address size: %u bits\n"
+ " Kernel range: %p - %p\n",
+ bytes >> 20, dma_bits,
+ io_tlb_start, io_tlb_end);
+}
+
+/*
+ * Statically reserve bounce buffer space and initialize bounce buffer data
+ * structures for the software IO TLB used to implement the PCI DMA API.
+ */
+void __init
+swiotlb_init_with_default_size(size_t default_size)
+{
+ unsigned long i, bytes;
+ int rc;
+
+ if (!io_tlb_nslabs) {
+ io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+ }
+
+ bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+
+ /*
+ * Get IO TLB memory from the low pages
+ */
+ io_tlb_start = swiotlb_alloc_boot(bytes, io_tlb_nslabs);
+ if (!io_tlb_start)
+ panic("Cannot allocate SWIOTLB buffer!\n");
+ bytes = io_tlb_nslabs << IO_TLB_SHIFT;
io_tlb_end = io_tlb_start + bytes;
/*
* Allocate and initialize the free list array. This array is used
* to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
*/
- io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int));
- for (i = 0; i < iotlb_nslabs; i++)
+ io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
+ for (i = 0; i < io_tlb_nslabs; i++)
io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
io_tlb_index = 0;
- io_tlb_orig_addr = alloc_bootmem(
- iotlb_nslabs * sizeof(*io_tlb_orig_addr));
+ io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t));
/*
* Get the overflow emergency buffer
@@ -208,13 +250,7 @@ swiotlb_init_with_default_size(size_t de
if (rc)
panic("No suitable physical memory available for SWIOTLB overflow buffer!\n");
- printk(KERN_INFO "Software IO TLB enabled: \n"
- " Aperture: %lu megabytes\n"
- " Kernel range: %p - %p\n"
- " Address size: %u bits\n",
- bytes >> 20,
- io_tlb_start, io_tlb_end,
- dma_bits);
+ swiotlb_print_info(bytes);
}
void __init
@@ -241,6 +277,11 @@ swiotlb_init(void)
printk(KERN_INFO "Software IO TLB disabled\n");
}
+static inline int range_needs_mapping(phys_addr_t pa, size_t size)
+{
+ return range_straddles_page_boundary(pa, size);
+}
+
static int is_swiotlb_buffer(dma_addr_t addr)
{
unsigned long pfn = mfn_to_local_pfn(PFN_DOWN(addr));
@@ -254,46 +295,50 @@ static int is_swiotlb_buffer(dma_addr_t
}
/*
+ * Bounce: copy the swiotlb buffer back to the original dma location
+ *
* We use __copy_to_user_inatomic to transfer to the host buffer because the
* buffer may be mapped read-only (e.g, in blkback driver) but lower-level
* drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
* unnecessary copy from the aperture to the host buffer, and a page fault.
*/
-static void
-__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir)
+static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
+ enum dma_data_direction dir)
{
- if (PageHighMem(buffer.page)) {
- size_t len, bytes;
- char *dev, *host, *kmp;
- len = size;
- while (len != 0) {
- unsigned long flags;
-
- if (((bytes = len) + buffer.offset) > PAGE_SIZE)
- bytes = PAGE_SIZE - buffer.offset;
- local_irq_save(flags); /* protects KM_BOUNCE_READ */
- kmp = kmap_atomic(buffer.page, KM_BOUNCE_READ);
- dev = dma_addr + size - len;
- host = kmp + buffer.offset;
- if (dir == DMA_FROM_DEVICE) {
- if (__copy_to_user_inatomic(host, dev, bytes))
- /* inaccessible */;
- } else
- memcpy(dev, host, bytes);
- kunmap_atomic(kmp, KM_BOUNCE_READ);
+ unsigned long pfn = PFN_DOWN(phys);
+
+ if (PageHighMem(pfn_to_page(pfn))) {
+ /* The buffer does not have a mapping. Map it in and copy */
+ unsigned int offset = phys & ~PAGE_MASK;
+ char *buffer;
+ unsigned int sz = 0;
+ unsigned long flags;
+
+ while (size) {
+ sz = min((size_t)(PAGE_SIZE - offset), size);
+
+ local_irq_save(flags);
+ buffer = kmap_atomic(pfn_to_page(pfn),
+ KM_BOUNCE_READ);
+ if (dir == DMA_TO_DEVICE)
+ memcpy(dma_addr, buffer + offset, sz);
+ else if (__copy_to_user_inatomic(buffer + offset,
+ dma_addr, sz))
+ /* inaccessible */;
+ kunmap_atomic(buffer, KM_BOUNCE_READ);
local_irq_restore(flags);
- len -= bytes;
- buffer.page++;
- buffer.offset = 0;
+
+ size -= sz;
+ pfn++;
+ dma_addr += sz;
+ offset = 0;
}
} else {
- char *host = (char *)phys_to_virt(
- page_to_pseudophys(buffer.page)) + buffer.offset;
- if (dir == DMA_FROM_DEVICE) {
- if (__copy_to_user_inatomic(host, dma_addr, size))
- /* inaccessible */;
- } else if (dir == DMA_TO_DEVICE)
- memcpy(dma_addr, host, size);
+ if (dir == DMA_TO_DEVICE)
+ memcpy(dma_addr, phys_to_virt(phys), size);
+ else if (__copy_to_user_inatomic(phys_to_virt(phys),
+ dma_addr, size))
+ /* inaccessible */;
}
}
@@ -301,12 +346,11 @@ __sync_single(struct phys_addr buffer, c
* Allocates bounce buffer and returns its kernel virtual address.
*/
static void *
-map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir)
+map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir)
{
unsigned long flags;
char *dma_addr;
unsigned int nslots, stride, index, wrap;
- struct phys_addr slot_buf;
int i;
unsigned long mask;
unsigned long offset_slots;
@@ -314,6 +358,10 @@ map_single(struct device *hwdev, struct
mask = dma_get_seg_boundary(hwdev);
offset_slots = -IO_TLB_SEGSIZE;
+
+ /*
+ * Carefully handle integer overflow which can occur when mask == ~0UL.
+ */
max_slots = mask + 1
? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
: 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
@@ -336,7 +384,7 @@ map_single(struct device *hwdev, struct
*/
spin_lock_irqsave(&io_tlb_lock, flags);
index = ALIGN(io_tlb_index, stride);
- if (index >= iotlb_nslabs)
+ if (index >= io_tlb_nslabs)
index = 0;
wrap = index;
@@ -344,7 +392,7 @@ map_single(struct device *hwdev, struct
while (iommu_is_span_boundary(index, nslots, offset_slots,
max_slots)) {
index += stride;
- if (index >= iotlb_nslabs)
+ if (index >= io_tlb_nslabs)
index = 0;
if (index == wrap)
goto not_found;
@@ -368,13 +416,13 @@ map_single(struct device *hwdev, struct
* Update the indices to avoid searching in the next
* round.
*/
- io_tlb_index = ((index + nslots) < iotlb_nslabs
+ io_tlb_index = ((index + nslots) < io_tlb_nslabs
? (index + nslots) : 0);
goto found;
}
index += stride;
- if (index >= iotlb_nslabs)
+ if (index >= io_tlb_nslabs)
index = 0;
} while (index != wrap);
@@ -389,29 +437,14 @@ found:
* This is needed when we sync the memory. Then we sync the buffer if
* needed.
*/
- slot_buf = buffer;
- for (i = 0; i < nslots; i++) {
- slot_buf.page += slot_buf.offset >> PAGE_SHIFT;
- slot_buf.offset &= PAGE_SIZE - 1;
- io_tlb_orig_addr[index+i] = slot_buf;
- slot_buf.offset += 1 << IO_TLB_SHIFT;
- }
- if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL))
- __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
+ for (i = 0; i < nslots; i++)
+ io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT);
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+ swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
return dma_addr;
}
-static struct phys_addr dma_addr_to_phys_addr(char *dma_addr)
-{
- int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
- struct phys_addr buffer = io_tlb_orig_addr[index];
- buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1);
- buffer.page += buffer.offset >> PAGE_SHIFT;
- buffer.offset &= PAGE_SIZE - 1;
- return buffer;
-}
-
/*
* dma_addr is the kernel virtual address of the bounce buffer to unmap.
*/
@@ -421,13 +454,13 @@ unmap_single(struct device *hwdev, char
unsigned long flags;
int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
- struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr);
+ phys_addr_t phys = io_tlb_orig_addr[index];
/*
* First, sync the memory before unmapping the entry
*/
- if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
- __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
+ if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
+ swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
/*
* Return the buffer to the free list by setting the corresponding
@@ -462,17 +495,21 @@ static void
sync_single(struct device *hwdev, char *dma_addr, size_t size,
int dir, int target)
{
- struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr);
+ int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+ phys_addr_t phys = io_tlb_orig_addr[index];
+
+ phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
+
switch (target) {
case SYNC_FOR_CPU:
if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
- __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
+ swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
else
BUG_ON(dir != DMA_TO_DEVICE);
break;
case SYNC_FOR_DEVICE:
if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
+ swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
else
BUG_ON(dir != DMA_FROM_DEVICE);
break;
@@ -492,7 +529,7 @@ swiotlb_full(struct device *dev, size_t
* the damage, or panic when the transfer is too big.
*/
printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %zu bytes at "
- "device %s\n", size, dev ? dev->bus_id : "?");
+ "device %s\n", size, dev ? dev_name(dev) : "?");
if (size > io_tlb_overflow && do_panic) {
if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
@@ -517,7 +554,6 @@ _swiotlb_map_single(struct device *hwdev
dma_addr_t dev_addr = gnttab_dma_map_page(page) +
offset_in_page(paddr);
void *map;
- struct phys_addr buffer;
BUG_ON(dir == DMA_NONE);
@@ -526,23 +562,21 @@ _swiotlb_map_single(struct device *hwdev
* we can safely return the device addr and not worry about bounce
* buffering it.
*/
- if (!range_straddles_page_boundary(paddr, size) &&
- !address_needs_mapping(hwdev, dev_addr, size))
+ if (!address_needs_mapping(hwdev, dev_addr, size) &&
+ !range_needs_mapping(paddr, size))
return dev_addr;
/*
* Oh well, have to allocate and map a bounce buffer.
*/
gnttab_dma_unmap_page(dev_addr);
- buffer.page = page;
- buffer.offset = offset_in_page(paddr);
- map = map_single(hwdev, buffer, size, dir);
+ map = map_single(hwdev, paddr, size, dir);
if (!map) {
swiotlb_full(hwdev, size, dir, 1);
map = io_tlb_overflow_buffer;
}
- dev_addr = virt_to_bus(map);
+ dev_addr = swiotlb_virt_to_bus(hwdev, map);
return dev_addr;
}
@@ -559,6 +593,7 @@ swiotlb_map_single(struct device *hwdev,
{
return _swiotlb_map_single(hwdev, virt_to_phys(ptr), size, dir, NULL);
}
+EXPORT_SYMBOL(swiotlb_map_single);
dma_addr_t
swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
@@ -578,7 +613,7 @@ void
swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr,
size_t size, int dir, struct dma_attrs *attrs)
{
- char *dma_addr = bus_to_virt(dev_addr);
+ char *dma_addr = swiotlb_bus_to_virt(dev_addr);
BUG_ON(dir == DMA_NONE);
if (is_swiotlb_buffer(dev_addr))
@@ -594,6 +629,8 @@ swiotlb_unmap_single(struct device *hwde
{
return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL);
}
+EXPORT_SYMBOL(swiotlb_unmap_single);
+
/*
* Make physical memory consistent for a single streaming mode DMA translation
* after a transfer.
@@ -608,7 +645,7 @@ static void
swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
size_t size, int dir, int target)
{
- char *dma_addr = bus_to_virt(dev_addr);
+ char *dma_addr = swiotlb_bus_to_virt(dev_addr);
BUG_ON(dir == DMA_NONE);
if (is_swiotlb_buffer(dev_addr))
@@ -621,6 +658,7 @@ swiotlb_sync_single_for_cpu(struct devic
{
swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
}
+EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
void
swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
@@ -628,6 +666,7 @@ swiotlb_sync_single_for_device(struct de
{
swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
}
+EXPORT_SYMBOL(swiotlb_sync_single_for_device);
/*
* Same as above, but for a sub-range of the mapping.
@@ -637,7 +676,7 @@ swiotlb_sync_single_range(struct device
unsigned long offset, size_t size,
int dir, int target)
{
- char *dma_addr = bus_to_virt(dev_addr);
+ char *dma_addr = swiotlb_bus_to_virt(dev_addr);
BUG_ON(dir == DMA_NONE);
if (is_swiotlb_buffer(dev_addr))
@@ -651,6 +690,7 @@ swiotlb_sync_single_range_for_cpu(struct
swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
SYNC_FOR_CPU);
}
+EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_cpu);
void
swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
@@ -659,9 +699,8 @@ swiotlb_sync_single_range_for_device(str
swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
SYNC_FOR_DEVICE);
}
+EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
-void swiotlb_unmap_sg_attrs(struct device *, struct scatterlist *, int, int,
- struct dma_attrs *);
/*
* Map a set of buffers described by scatterlist in streaming mode for DMA.
* This is the scatter-gather version of the above swiotlb_map_single
@@ -683,23 +722,23 @@ swiotlb_map_sg_attrs(struct device *hwde
int dir, struct dma_attrs *attrs)
{
struct scatterlist *sg;
- struct phys_addr buffer;
- dma_addr_t dev_addr;
- char *map;
int i;
BUG_ON(dir == DMA_NONE);
for_each_sg(sgl, sg, nelems, i) {
- dev_addr = gnttab_dma_map_page(sg_page(sg)) + sg->offset;
+ dma_addr_t dev_addr = gnttab_dma_map_page(sg_page(sg))
+ + sg->offset;
+ phys_addr_t paddr = page_to_pseudophys(sg_page(sg))
+ + sg->offset;
- if (range_straddles_page_boundary(page_to_pseudophys(sg_page(sg))
- + sg->offset, sg->length)
+ if (range_needs_mapping(paddr, sg->length)
|| address_needs_mapping(hwdev, dev_addr, sg->length)) {
+ void *map;
+
gnttab_dma_unmap_page(dev_addr);
- buffer.page = sg_page(sg);
- buffer.offset = sg->offset;
- map = map_single(hwdev, buffer, sg->length, dir);
+ map = map_single(hwdev, paddr,
+ sg->length, dir);
if (!map) {
/* Don't panic here, we expect map_sg users
to do proper error handling. */
@@ -709,7 +748,7 @@ swiotlb_map_sg_attrs(struct device *hwde
sgl[0].dma_length = 0;
return 0;
}
- sg->dma_address = virt_to_bus(map);
+ sg->dma_address = swiotlb_virt_to_bus(hwdev, map);
} else
sg->dma_address = dev_addr;
sg->dma_length = sg->length;
@@ -724,6 +763,7 @@ swiotlb_map_sg(struct device *hwdev, str
{
return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
}
+EXPORT_SYMBOL(swiotlb_map_sg);
/*
* Unmap a set of streaming mode DMA translations. Again, cpu read rules
@@ -740,7 +780,7 @@ swiotlb_unmap_sg_attrs(struct device *hw
for_each_sg(sgl, sg, nelems, i) {
if (sg->dma_address != sg_phys(sg))
- unmap_single(hwdev, bus_to_virt(sg->dma_address),
+ unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
sg->dma_length, dir);
else
gnttab_dma_unmap_page(sg->dma_address);
@@ -754,6 +794,7 @@ swiotlb_unmap_sg(struct device *hwdev, s
{
return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
}
+EXPORT_SYMBOL(swiotlb_unmap_sg);
/*
* Make physical memory consistent for a set of streaming mode DMA translations
@@ -773,7 +814,7 @@ swiotlb_sync_sg(struct device *hwdev, st
for_each_sg(sgl, sg, nelems, i) {
if (sg->dma_address != sg_phys(sg))
- sync_single(hwdev, bus_to_virt(sg->dma_address),
+ sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
sg->dma_length, dir, target);
}
}
@@ -784,6 +825,7 @@ swiotlb_sync_sg_for_cpu(struct device *h
{
swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
}
+EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
void
swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
@@ -791,12 +833,14 @@ swiotlb_sync_sg_for_device(struct device
{
swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
}
+EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
int
swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
{
- return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
+ return (dma_addr == swiotlb_virt_to_bus(hwdev, io_tlb_overflow_buffer));
}
+EXPORT_SYMBOL(swiotlb_dma_mapping_error);
/*
* Return whether the given PCI device DMA address mask can be supported
@@ -809,14 +853,4 @@ swiotlb_dma_supported (struct device *hw
{
return (mask >= ((1UL << dma_bits) - 1));
}
-
-EXPORT_SYMBOL(swiotlb_map_single);
-EXPORT_SYMBOL(swiotlb_unmap_single);
-EXPORT_SYMBOL(swiotlb_map_sg);
-EXPORT_SYMBOL(swiotlb_unmap_sg);
-EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
-EXPORT_SYMBOL(swiotlb_sync_single_for_device);
-EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
-EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
-EXPORT_SYMBOL(swiotlb_dma_mapping_error);
EXPORT_SYMBOL(swiotlb_dma_supported);