qubes-linux-kernel/patches.xen/xen3-patch-2.6.27

From: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: [PATCH] Linux: Update to 2.6.27
Patch-mainline: 2.6.27
This patch contains the differences between Linux 2.6.26 and 2.6.27.
Acked-by: Jeff Mahoney <jeffm@suse.com>
Automatically created from "patches.kernel.org/patch-2.6.27" by xen-port-patches.py
Removed the NO_HZ -> NO_HZ || NO_IDLE_HZ adjustments from kernel/{hr,}timer.c,
as they would get removed again by xen-clockevents (and really shouldn't
have been needed - see SLE11 SPn).
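
For reference, the dropped NO_HZ adjustment usually amounts to widening a
CONFIG_NO_HZ preprocessor guard. The fragment below is an illustrative sketch
only, assuming the common guard pattern; it is not part of the patch that
follows.

    /* Sketch of the kind of change xen-clockevents makes redundant in
     * kernel/timer.c and kernel/hrtimer.c; identifiers other than the
     * CONFIG_* symbols are placeholders.
     */
    #if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)  /* was: #ifdef CONFIG_NO_HZ */
    /* ... tick-skipping helpers (e.g. get_next_timer_interrupt()) ... */
    #endif
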
--- head-2011-03-11.orig/arch/x86/Kconfig 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/Kconfig 2011-02-01 14:38:38.000000000 +0100
@@ -723,7 +723,7 @@ config AMD_IOMMU
bool "AMD IOMMU support"
select SWIOTLB
select PCI_MSI
- depends on X86_64 && PCI && ACPI
+ depends on X86_64 && PCI && ACPI && !XEN
---help---
With this option you can enable support for AMD IOMMU hardware in
your system. An IOMMU is a hardware component which provides
@@ -1465,7 +1465,7 @@ config MTRR
config MTRR_SANITIZER
def_bool y
prompt "MTRR cleanup support"
- depends on MTRR
+ depends on MTRR && !XEN
---help---
Convert MTRR layout from continuous to discrete, so X drivers can
add writeback entries.
--- head-2011-03-11.orig/arch/x86/Kconfig.debug 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/Kconfig.debug 2011-02-01 14:38:38.000000000 +0100
@@ -25,6 +25,7 @@ config STRICT_DEVMEM
config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
+ depends on !XEN
---help---
Enables the informational output from the decompression stage
(e.g. bzImage) of the boot. If you disable this you will still
@@ -179,6 +180,7 @@ config IOMMU_LEAK
config HAVE_MMIOTRACE_SUPPORT
def_bool y
+ depends on !XEN
config X86_DECODER_SELFTEST
bool "x86 instruction decoder selftest"
--- head-2011-03-11.orig/arch/x86/Makefile 2011-02-01 14:11:04.000000000 +0100
+++ head-2011-03-11/arch/x86/Makefile 2011-02-01 14:38:38.000000000 +0100
@@ -117,8 +117,8 @@ endif
KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
# Xen subarch support
-mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-x86/mach-xen
-mcore-$(CONFIG_X86_XEN) := arch/x86/mach-xen/
+mflags-$(CONFIG_XEN) := -Iinclude/asm-x86/mach-xen
+mcore-$(CONFIG_XEN) := arch/x86/mach-xen/
KBUILD_CFLAGS += $(mflags-y)
KBUILD_AFLAGS += $(mflags-y)
--- head-2011-03-11.orig/arch/x86/ia32/ia32entry-xen.S 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/ia32/ia32entry-xen.S 2011-02-01 14:38:38.000000000 +0100
@@ -15,6 +15,16 @@
#include <asm/irqflags.h>
#include <linux/linkage.h>
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE 0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+#define sysexit_audit int_ret_from_sys_call
+#define sysretl_audit int_ret_from_sys_call
+#endif
+
#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
.macro IA32_ARG_FIXUP noebp=0
@@ -37,6 +47,11 @@
movq %rax,R8(%rsp)
.endm
+ /*
+ * Reload arg registers from stack in case ptrace changed them.
+ * We don't reload %eax because syscall_trace_enter() returned
+ * the value it wants us to use in the table lookup.
+ */
.macro LOAD_ARGS32 offset
movl \offset(%rsp),%r11d
movl \offset+8(%rsp),%r10d
@@ -46,7 +61,6 @@
movl \offset+48(%rsp),%edx
movl \offset+56(%rsp),%esi
movl \offset+64(%rsp),%edi
- movl \offset+72(%rsp),%eax
.endm
.macro CFI_STARTPROC32 simple
@@ -61,6 +75,19 @@
CFI_UNDEFINED r15
.endm
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret32)
+ swapgs
+ sysretl
+ENDPROC(native_usergs_sysret32)
+
+ENTRY(native_irq_enable_sysexit)
+ swapgs
+ sti
+ sysexit
+ENDPROC(native_irq_enable_sysexit)
+#endif
+
/*
* 32bit SYSENTER instruction entry.
*
@@ -98,7 +125,7 @@ ENTRY(ia32_sysenter_target)
CFI_RESTORE rcx
movl %ebp,%ebp /* zero extension */
movl %eax,%eax
- movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d
+ movl 48-THREAD_SIZE+TI_sysenter_return(%rsp),%r10d
movl $__USER32_DS,40(%rsp)
movq %rbp,32(%rsp)
movl $__USER32_CS,16(%rsp)
@@ -113,19 +140,75 @@ ENTRY(ia32_sysenter_target)
.quad 1b,ia32_badarg
.previous
GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,threadinfo_status(%r10)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ orl $TS_COMPAT,TI_status(%r10)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
jnz sysenter_tracesys
-sysenter_do_call:
cmpl $(IA32_NR_syscalls-1),%eax
ja ia32_badsys
+sysenter_do_call:
IA32_ARG_FIXUP 1
+sysenter_dispatch:
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
+ GET_THREAD_INFO(%r10)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
+ jnz sysexit_audit
jmp int_ret_from_sys_call
+#ifdef CONFIG_AUDITSYSCALL
+ .macro auditsys_entry_common
+ movl %esi,%r9d /* 6th arg: 4th syscall arg */
+ movl %edx,%r8d /* 5th arg: 3rd syscall arg */
+ /* (already in %ecx) 4th arg: 2nd syscall arg */
+ movl %ebx,%edx /* 3rd arg: 1st syscall arg */
+ movl %eax,%esi /* 2nd arg: syscall number */
+ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
+ call audit_syscall_entry
+ movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja ia32_badsys
+ movl %ebx,%edi /* reload 1st syscall arg */
+ movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
+ movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
+ movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
+ movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
+ .endm
+
+ .macro auditsys_exit exit,ebpsave=RBP
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+ jnz int_ret_from_sys_call
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ movl %eax,%esi /* second arg, syscall return value */
+ cmpl $0,%eax /* is it < 0? */
+ setl %al /* 1 if so, 0 if not */
+ movzbl %al,%edi /* zero-extend that into %edi */
+ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
+ call audit_syscall_exit
+ movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
+ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp int_with_check
+ .endm
+
+sysenter_auditsys:
+ auditsys_entry_common
+ movl %ebp,%r9d /* reload 6th syscall arg */
+ jmp sysenter_dispatch
+
+sysexit_audit:
+ auditsys_exit sysexit_from_sys_call
+#endif
+
sysenter_tracesys:
xchgl %r9d,%ebp
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+ jz sysenter_auditsys
+#endif
SAVE_REST
CLEAR_RREGS
movq %r9,R9(%rsp)
@@ -186,18 +269,38 @@ ENTRY(ia32_cstar_target)
.quad 1b,ia32_badarg
.previous
GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,threadinfo_status(%r10)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ orl $TS_COMPAT,TI_status(%r10)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
jnz cstar_tracesys
cstar_do_call:
cmpl $IA32_NR_syscalls-1,%eax
ja ia32_badsys
IA32_ARG_FIXUP 1
+cstar_dispatch:
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
+ GET_THREAD_INFO(%r10)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
+ jnz sysretl_audit
jmp int_ret_from_sys_call
-cstar_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+cstar_auditsys:
+ movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
+ auditsys_entry_common
+ movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
+ jmp cstar_dispatch
+
+sysretl_audit:
+ auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
+#endif
+
+cstar_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+ jz cstar_auditsys
+#endif
xchgl %r9d,%ebp
SAVE_REST
CLEAR_RREGS
@@ -263,8 +366,8 @@ ENTRY(ia32_syscall)
this could be a problem. */
SAVE_ARGS 0,0,1
GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,threadinfo_status(%r10)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ orl $TS_COMPAT,TI_status(%r10)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
jnz ia32_tracesys
ia32_do_syscall:
cmpl $(IA32_NR_syscalls-1),%eax
@@ -309,13 +412,11 @@ quiet_ni_syscall:
PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
- PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
PTREGSCALL stub32_execve, sys32_execve, %rcx
PTREGSCALL stub32_fork, sys_fork, %rdi
PTREGSCALL stub32_clone, sys32_clone, %rdx
PTREGSCALL stub32_vfork, sys_vfork, %rdi
PTREGSCALL stub32_iopl, sys_iopl, %rsi
- PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
ENTRY(ia32_ptregs_common)
popq %r11
@@ -415,7 +516,7 @@ ia32_sys_call_table:
.quad sys_ssetmask
.quad sys_setreuid16 /* 70 */
.quad sys_setregid16
- .quad stub32_sigsuspend
+ .quad sys32_sigsuspend
.quad compat_sys_sigpending
.quad sys_sethostname
.quad compat_sys_setrlimit /* 75 */
@@ -522,7 +623,7 @@ ia32_sys_call_table:
.quad sys32_rt_sigpending
.quad compat_sys_rt_sigtimedwait
.quad sys32_rt_sigqueueinfo
- .quad stub32_rt_sigsuspend
+ .quad sys_rt_sigsuspend
.quad sys32_pread /* 180 */
.quad sys32_pwrite
.quad sys_chown16
@@ -670,4 +771,10 @@ ia32_sys_call_table:
.quad sys32_fallocate
.quad compat_sys_timerfd_settime /* 325 */
.quad compat_sys_timerfd_gettime
+ .quad compat_sys_signalfd4
+ .quad sys_eventfd2
+ .quad sys_epoll_create1
+ .quad sys_dup3 /* 330 */
+ .quad sys_pipe2
+ .quad sys_inotify_init1
ia32_syscall_end:
--- head-2011-03-11.orig/arch/x86/kernel/Makefile 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/Makefile 2011-02-01 14:38:38.000000000 +0100
@@ -125,9 +125,11 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
- obj-$(CONFIG_XEN) += nmi_64.o
+ obj-$(CONFIG_XEN) += nmi.o
time_64-$(CONFIG_XEN) += time_32.o
endif
-disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259_$(BITS).o \
- pci-swiotlb_64.o reboot.o smpboot.o tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o
+disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \
+ i8259.o irqinit_$(BITS).o pci-swiotlb_64.o reboot.o smpboot.o \
+ tlb_$(BITS).o tsc.o tsc_sync.o vsmp_64.o
+disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms_32.o
--- head-2011-03-11.orig/arch/x86/kernel/acpi/boot.c 2011-03-11 10:59:02.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/acpi/boot.c 2011-03-11 10:59:30.000000000 +0100
@@ -1349,6 +1349,7 @@ static int __init dmi_disable_acpi(const
return 0;
}
+#ifndef CONFIG_XEN
/*
* Force ignoring BIOS IRQ0 pin2 override
*/
@@ -1366,6 +1367,7 @@ static int __init dmi_ignore_irq0_timer_
}
return 0;
}
+#endif
static int __init force_acpi_rsdt(const struct dmi_system_id *d)
{
@@ -1486,6 +1488,7 @@ static struct dmi_system_id __initdata a
{}
};
+#ifndef CONFIG_XEN
/* second table for DMI checks that should run after early-quirks */
static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
/*
@@ -1532,6 +1535,7 @@ static struct dmi_system_id __initdata a
},
{}
};
+#endif
/*
* acpi_boot_table_init() and acpi_boot_init()
@@ -1604,8 +1608,10 @@ int __init early_acpi_boot_init(void)
int __init acpi_boot_init(void)
{
+#ifndef CONFIG_XEN
/* those are executed after early-quirks are executed */
dmi_check_system(acpi_dmi_table_late);
+#endif
/*
* If acpi_disabled, bail out
--- head-2011-03-11.orig/arch/x86/kernel/acpi/sleep-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/acpi/sleep-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -9,6 +9,7 @@
#include <linux/bootmem.h>
#include <linux/dmi.h>
#include <linux/cpumask.h>
+#include <asm/segment.h>
#include "realmode/wakeup.h"
#include "sleep.h"
@@ -20,7 +21,7 @@ unsigned long acpi_realmode_flags;
/* address in low memory of the wakeup routine. */
static unsigned long acpi_realmode;
-#ifdef CONFIG_64BIT
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
static char temp_stack[10240];
#endif
#endif
@@ -54,18 +55,27 @@ int acpi_save_state_mem(void)
header->video_mode = saved_video_mode;
header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
+
+ /*
+ * Set up the wakeup GDT. We set these up as Big Real Mode,
+ * that is, with limits set to 4 GB. At least the Lenovo
+ * Thinkpad X61 is known to need this for the video BIOS
+ * initialization quirk to work; this is likely to also
+ * be the case for other laptops or integrated video devices.
+ */
+
/* GDT[0]: GDT self-pointer */
header->wakeup_gdt[0] =
(u64)(sizeof(header->wakeup_gdt) - 1) +
((u64)(acpi_wakeup_address +
((char *)&header->wakeup_gdt - (char *)acpi_realmode))
<< 16);
- /* GDT[1]: real-mode-like code segment */
- header->wakeup_gdt[1] = (0x009bULL << 40) +
- ((u64)acpi_wakeup_address << 16) + 0xffff;
- /* GDT[2]: real-mode-like data segment */
- header->wakeup_gdt[2] = (0x0093ULL << 40) +
- ((u64)acpi_wakeup_address << 16) + 0xffff;
+ /* GDT[1]: big real mode-like code segment */
+ header->wakeup_gdt[1] =
+ GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
+ /* GDT[2]: big real mode-like data segment */
+ header->wakeup_gdt[2] =
+ GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
#ifndef CONFIG_64BIT
store_gdt((struct desc_ptr *)&header->pmode_gdt);
@@ -79,7 +89,7 @@ int acpi_save_state_mem(void)
#endif /* !CONFIG_64BIT */
header->pmode_cr0 = read_cr0();
- header->pmode_cr4 = read_cr4();
+ header->pmode_cr4 = read_cr4_safe();
header->realmode_flags = acpi_realmode_flags;
header->real_magic = 0x12345678;
@@ -89,7 +99,9 @@ int acpi_save_state_mem(void)
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
header->trampoline_segment = setup_trampoline() >> 4;
- init_rsp = (unsigned long)temp_stack + 4096;
+#ifdef CONFIG_SMP
+ stack_start.sp = temp_stack + 4096;
+#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
#endif /* CONFIG_64BIT */
@@ -145,6 +157,12 @@ static int __init acpi_sleep_setup(char
acpi_realmode_flags |= 2;
if (strncmp(str, "s3_beep", 7) == 0)
acpi_realmode_flags |= 4;
+#ifdef CONFIG_HIBERNATION
+ if (strncmp(str, "s4_nohwsig", 10) == 0)
+ acpi_no_s4_hw_signature();
+#endif
+ if (strncmp(str, "old_ordering", 12) == 0)
+ acpi_old_suspend_ordering();
str = strchr(str, ',');
if (str != NULL)
str += strspn(str, ", \t");
--- head-2011-03-11.orig/arch/x86/kernel/amd_nb.c 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/amd_nb.c 2011-02-01 14:38:38.000000000 +0100
@@ -15,6 +15,10 @@ static u32 *flush_words;
struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+#ifdef CONFIG_XEN
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, /* Fam12, Fam14 */
+#endif
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
{}
};
--- head-2011-03-11.orig/arch/x86/kernel/apic/apic-xen.c 2011-02-24 15:45:13.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/apic/apic-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -59,7 +59,10 @@ static cpumask_t timer_bcast_ipi;
/*
* Debug level, exported for io_apic.c
*/
-int apic_verbosity;
+unsigned int apic_verbosity;
+
+/* Have we found an MP table */
+int smp_found_config;
#ifndef CONFIG_XEN
static int modern_apic(void)
--- head-2011-03-11.orig/arch/x86/kernel/asm-offsets_64.c 2010-01-19 16:00:48.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/asm-offsets_64.c 2011-02-01 14:38:38.000000000 +0100
@@ -132,7 +132,7 @@ int main(void)
BLANK();
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
BLANK();
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
--- head-2011-03-11.orig/arch/x86/kernel/cpu/amd.c 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/cpu/amd.c 2011-02-01 14:38:38.000000000 +0100
@@ -575,6 +575,7 @@ static void __cpuinit init_amd(struct cp
fam10h_check_enable_mmcfg();
}
+#ifndef CONFIG_XEN
if (c == &boot_cpu_data && c->x86 >= 0xf) {
unsigned long long tseg;
@@ -594,6 +595,7 @@ static void __cpuinit init_amd(struct cp
}
}
#endif
+#endif
}
#ifdef CONFIG_X86_32
--- head-2011-03-11.orig/arch/x86/kernel/cpu/bugs_64.c 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/cpu/bugs_64.c 2011-02-01 14:38:38.000000000 +0100
@@ -20,6 +20,7 @@ void __init check_bugs(void)
#endif
alternative_instructions();
+#ifndef CONFIG_XEN
/*
* Make sure the first 2MB area is not mapped by huge pages
* There are typically fixed size MTRRs in there and overlapping
@@ -30,4 +31,5 @@ void __init check_bugs(void)
*/
if (!direct_gbpages)
set_memory_4k((unsigned long)__va(0), 1);
+#endif
}
--- head-2011-03-11.orig/arch/x86/kernel/cpu/common-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/cpu/common-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -13,6 +13,7 @@
#include <asm/mtrr.h>
#include <asm/mce.h>
#include <asm/pat.h>
+#include <asm/asm.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/mpspec.h>
#include <asm/apic.h>
@@ -341,11 +342,24 @@ static void __init early_cpu_detect(void
get_cpu_vendor(c, 1);
+ early_get_cap(c);
+
if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
cpu_devs[c->x86_vendor]->c_early_init)
cpu_devs[c->x86_vendor]->c_early_init(c);
+}
- early_get_cap(c);
+/*
+ * The NOPL instruction is supposed to exist on all CPUs with
+ * family >= 6; unfortunately, that's not true in practice because
+ * of early VIA chips and (more importantly) broken virtualizers that
+ * are not easy to detect. In the latter case it doesn't even *fail*
+ * reliably, so probing for it doesn't even work. Disable it completely
+ * unless we can find a reliable way to detect all the broken cases.
+ */
+static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
+{
+ clear_cpu_cap(c, X86_FEATURE_NOPL);
}
static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -404,8 +418,8 @@ static void __cpuinit generic_identify(s
}
init_scattered_cpuid_features(c);
+ detect_nopl(c);
}
-
}
static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
@@ -436,7 +450,7 @@ __setup("serialnumber", x86_serial_nr_se
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
{
int i;
@@ -452,6 +466,8 @@ void __cpuinit identify_cpu(struct cpuin
#endif
c->x86_clflush_size = 32;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ if (boot_cpu_has(X86_FEATURE_SYSCALL32))
+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
if (!have_cpuid_p()) {
/*
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-11/arch/x86/kernel/cpu/common_64-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -0,0 +1,777 @@
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/kgdb.h>
+#include <linux/topology.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <asm/i387.h>
+#include <asm/msr.h>
+#include <asm/io.h>
+#include <asm/linkage.h>
+#include <asm/mmu_context.h>
+#include <asm/mtrr.h>
+#include <asm/mce.h>
+#include <asm/pat.h>
+#include <asm/asm.h>
+#include <asm/numa.h>
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <mach_apic.h>
+#elif defined(CONFIG_XEN)
+#include <mach_apic.h>
+#endif
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/genapic.h>
+
+#include "cpu.h"
+
+/* We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ */
+/* The TLS descriptors are currently at a different place compared to i386.
+ Hopefully nobody expects them at a fixed place (Wine?) */
+DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+} };
+EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+void switch_to_new_gdt(void)
+{
+#ifndef CONFIG_XEN
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+#else
+ void *va, *gdt_addr = get_cpu_gdt_table(smp_processor_id());
+ unsigned long frames[16];
+ unsigned int f = 0;
+
+ for (va = gdt_addr; va < gdt_addr + GDT_SIZE; va += PAGE_SIZE) {
+ frames[f++] = virt_to_mfn(va);
+ make_page_readonly(va, XENFEAT_writable_descriptor_tables);
+ }
+ if (HYPERVISOR_set_gdt(frames, GDT_SIZE / sizeof(struct desc_struct)))
+ BUG();
+#endif
+}
+
+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+ display_cacheinfo(c);
+}
+
+static struct cpu_dev __cpuinitdata default_cpu = {
+ .c_init = default_init,
+ .c_vendor = "Unknown",
+};
+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
+int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+{
+ unsigned int *v;
+
+ if (c->extended_cpuid_level < 0x80000004)
+ return 0;
+
+ v = (unsigned int *) c->x86_model_id;
+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+ c->x86_model_id[48] = 0;
+ return 1;
+}
+
+
+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+{
+ unsigned int n, dummy, ebx, ecx, edx;
+
+ n = c->extended_cpuid_level;
+
+ if (n >= 0x80000005) {
+ cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
+ "D cache %dK (%d bytes/line)\n",
+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+ c->x86_cache_size = (ecx>>24) + (edx>>24);
+ /* On K8 L1 TLB is inclusive, so don't count it */
+ c->x86_tlbsize = 0;
+ }
+
+ if (n >= 0x80000006) {
+ cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
+ ecx = cpuid_ecx(0x80000006);
+ c->x86_cache_size = ecx >> 16;
+ c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+
+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+ c->x86_cache_size, ecx & 0xFF);
+ }
+}
+
+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+{
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+ u32 eax, ebx, ecx, edx;
+ int index_msb, core_bits;
+
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+
+
+ if (!cpu_has(c, X86_FEATURE_HT))
+ return;
+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+ goto out;
+
+ smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+ if (smp_num_siblings == 1) {
+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
+ } else if (smp_num_siblings > 1) {
+
+ if (smp_num_siblings > NR_CPUS) {
+ printk(KERN_WARNING "CPU: Unsupported number of "
+ "siblings %d", smp_num_siblings);
+ smp_num_siblings = 1;
+ return;
+ }
+
+ index_msb = get_count_order(smp_num_siblings);
+ c->phys_proc_id = phys_pkg_id(index_msb);
+
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+ index_msb = get_count_order(smp_num_siblings);
+
+ core_bits = get_count_order(c->x86_max_cores);
+
+ c->cpu_core_id = phys_pkg_id(index_msb) &
+ ((1 << core_bits) - 1);
+ }
+out:
+ if ((c->x86_max_cores * smp_num_siblings) > 1) {
+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
+ c->phys_proc_id);
+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
+ c->cpu_core_id);
+ }
+
+#endif
+}
+
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+{
+ char *v = c->x86_vendor_id;
+ int i;
+ static int printed;
+
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ if (cpu_devs[i]) {
+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+ (cpu_devs[i]->c_ident[1] &&
+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+ c->x86_vendor = i;
+ this_cpu = cpu_devs[i];
+ return;
+ }
+ }
+ }
+ if (!printed) {
+ printed++;
+ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
+ printk(KERN_ERR "CPU: Your system may be unstable.\n");
+ }
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+}
+
+static void __init early_cpu_support_print(void)
+{
+ int i,j;
+ struct cpu_dev *cpu_devx;
+
+ printk("KERNEL supported cpus:\n");
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ cpu_devx = cpu_devs[i];
+ if (!cpu_devx)
+ continue;
+ for (j = 0; j < 2; j++) {
+ if (!cpu_devx->c_ident[j])
+ continue;
+ printk(" %s %s\n", cpu_devx->c_vendor,
+ cpu_devx->c_ident[j]);
+ }
+ }
+}
+
+/*
+ * The NOPL instruction is supposed to exist on all CPUs with
+ * family >= 6, unfortunately, that's not true in practice because
+ * of early VIA chips and (more importantly) broken virtualizers that
+ * are not easy to detect. Hence, probe for it based on first
+ * principles.
+ *
+ * Note: no 64-bit chip is known to lack these, but put the code here
+ * for consistency with 32 bits, and to make it utterly trivial to
+ * diagnose the problem should it ever surface.
+ */
+static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
+{
+ const u32 nopl_signature = 0x888c53b1; /* Random number */
+ u32 has_nopl = nopl_signature;
+
+ clear_cpu_cap(c, X86_FEATURE_NOPL);
+ if (c->x86 >= 6) {
+ asm volatile("\n"
+ "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
+ "2:\n"
+ " .section .fixup,\"ax\"\n"
+ "3: xor %0,%0\n"
+ " jmp 2b\n"
+ " .previous\n"
+ _ASM_EXTABLE(1b,3b)
+ : "+a" (has_nopl));
+
+ if (has_nopl == nopl_signature)
+ set_cpu_cap(c, X86_FEATURE_NOPL);
+ }
+}
+
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
+
+void __init early_cpu_init(void)
+{
+ struct cpu_vendor_dev *cvdev;
+
+ for (cvdev = __x86cpuvendor_start ;
+ cvdev < __x86cpuvendor_end ;
+ cvdev++)
+ cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+ early_cpu_support_print();
+ early_identify_cpu(&boot_cpu_data);
+}
+
+/* Do some early cpuid on the boot CPU to get some parameter that are
+ needed before check_bugs. Everything advanced is in identify_cpu
+ below. */
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
+{
+ u32 tfms, xlvl;
+
+ c->loops_per_jiffy = loops_per_jiffy;
+ c->x86_cache_size = -1;
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+ c->x86_model = c->x86_mask = 0; /* So far unknown... */
+ c->x86_vendor_id[0] = '\0'; /* Unset */
+ c->x86_model_id[0] = '\0'; /* Unset */
+ c->x86_clflush_size = 64;
+ c->x86_cache_alignment = c->x86_clflush_size;
+#ifndef CONFIG_XEN
+ c->x86_max_cores = 1;
+ c->x86_coreid_bits = 0;
+#endif
+ c->extended_cpuid_level = 0;
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+ /* Get vendor name */
+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+ (unsigned int *)&c->x86_vendor_id[0],
+ (unsigned int *)&c->x86_vendor_id[8],
+ (unsigned int *)&c->x86_vendor_id[4]);
+
+ get_cpu_vendor(c);
+
+ /* Initialize the standard set of capabilities */
+ /* Note that the vendor-specific code below might override */
+
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ __u32 misc;
+ cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
+ &c->x86_capability[0]);
+ c->x86 = (tfms >> 8) & 0xf;
+ c->x86_model = (tfms >> 4) & 0xf;
+ c->x86_mask = tfms & 0xf;
+ if (c->x86 == 0xf)
+ c->x86 += (tfms >> 20) & 0xff;
+ if (c->x86 >= 0x6)
+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+ } else {
+ /* Have CPUID level 0 only - unheard of */
+ c->x86 = 4;
+ }
+
+#ifndef CONFIG_XEN
+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
+#ifdef CONFIG_SMP
+ c->phys_proc_id = c->initial_apicid;
+#endif
+#endif
+ /* AMD-defined flags: level 0x80000001 */
+ xlvl = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = xlvl;
+ if ((xlvl & 0xffff0000) == 0x80000000) {
+ if (xlvl >= 0x80000001) {
+ c->x86_capability[1] = cpuid_edx(0x80000001);
+ c->x86_capability[6] = cpuid_ecx(0x80000001);
+ }
+ if (xlvl >= 0x80000004)
+ get_model_name(c); /* Default name */
+ }
+
+ /* Transmeta-defined flags: level 0x80860001 */
+ xlvl = cpuid_eax(0x80860000);
+ if ((xlvl & 0xffff0000) == 0x80860000) {
+ /* Don't set x86_cpuid_level here for now to not confuse. */
+ if (xlvl >= 0x80860001)
+ c->x86_capability[2] = cpuid_edx(0x80860001);
+ }
+
+ if (c->extended_cpuid_level >= 0x80000007)
+ c->x86_power = cpuid_edx(0x80000007);
+
+ if (c->extended_cpuid_level >= 0x80000008) {
+ u32 eax = cpuid_eax(0x80000008);
+
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ }
+
+ detect_nopl(c);
+
+ if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
+ cpu_devs[c->x86_vendor]->c_early_init)
+ cpu_devs[c->x86_vendor]->c_early_init(c);
+
+ validate_pat_support(c);
+}
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ early_identify_cpu(c);
+
+ init_scattered_cpuid_features(c);
+
+#ifndef CONFIG_XEN
+ c->apicid = phys_pkg_id(0);
+#endif
+
+ /*
+ * Vendor-specific initialization. In this section we
+ * canonicalize the feature flags, meaning if there are
+ * features a certain CPU supports which CPUID doesn't
+ * tell us, CPUID claiming incorrect flags, or other bugs,
+ * we handle them here.
+ *
+ * At the end of this section, c->x86_capability better
+ * indicate the features this CPU genuinely supports!
+ */
+ if (this_cpu->c_init)
+ this_cpu->c_init(c);
+
+ detect_ht(c);
+
+ /*
+ * On SMP, boot_cpu_data holds the common feature set between
+ * all CPUs; so make sure that we indicate which features are
+ * common between the CPUs. The first time this routine gets
+ * executed, c == &boot_cpu_data.
+ */
+ if (c != &boot_cpu_data) {
+ /* AND the already accumulated flags with these */
+ for (i = 0; i < NCAPINTS; i++)
+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+ }
+
+ /* Clear all flags overriden by options */
+ for (i = 0; i < NCAPINTS; i++)
+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
+
+#ifdef CONFIG_X86_MCE
+ mcheck_init(c);
+#endif
+ select_idle_routine(c);
+
+#ifdef CONFIG_NUMA
+ numa_add_cpu(smp_processor_id());
+#endif
+
+}
+
+void __cpuinit identify_boot_cpu(void)
+{
+ identify_cpu(&boot_cpu_data);
+}
+
+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+ BUG_ON(c == &boot_cpu_data);
+ identify_cpu(c);
+ mtrr_ap_init();
+}
+
+static __init int setup_noclflush(char *arg)
+{
+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+ return 1;
+}
+__setup("noclflush", setup_noclflush);
+
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+{
+ if (c->x86_model_id[0])
+ printk(KERN_CONT "%s", c->x86_model_id);
+
+ if (c->x86_mask || c->cpuid_level >= 0)
+ printk(KERN_CONT " stepping %02x\n", c->x86_mask);
+ else
+ printk(KERN_CONT "\n");
+}
+
+static __init int setup_disablecpuid(char *arg)
+{
+ int bit;
+ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+ setup_clear_cpu_cap(bit);
+ else
+ return 0;
+ return 1;
+}
+__setup("clearcpuid=", setup_disablecpuid);
+
+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
+
+struct x8664_pda **_cpu_pda __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
+
+#ifndef CONFIG_X86_NO_IDT
+struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
+#endif
+
+char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+
+unsigned long __supported_pte_mask __read_mostly = ~0UL;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+static int do_not_nx __cpuinitdata;
+
+/* noexec=on|off
+Control non executable mappings for 64bit processes.
+
+on Enable(default)
+off Disable
+*/
+static int __init nonx_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!strncmp(str, "on", 2)) {
+ __supported_pte_mask |= _PAGE_NX;
+ do_not_nx = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ do_not_nx = 1;
+ __supported_pte_mask &= ~_PAGE_NX;
+ }
+ return 0;
+}
+early_param("noexec", nonx_setup);
+
+int force_personality32;
+
+/* noexec32=on|off
+Control non executable heap for 32bit processes.
+To control the stack too use noexec=off
+
+on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
+off PROT_READ implies PROT_EXEC
+*/
+static int __init nonx32_setup(char *str)
+{
+ if (!strcmp(str, "on"))
+ force_personality32 &= ~READ_IMPLIES_EXEC;
+ else if (!strcmp(str, "off"))
+ force_personality32 |= READ_IMPLIES_EXEC;
+ return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
+static void __init_refok switch_pt(int cpu)
+{
+#ifdef CONFIG_XEN
+ if (cpu == 0)
+ xen_init_pt();
+ xen_pt_switch(__pa_symbol(init_level4_pgt));
+ xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
+#endif
+}
+
+void pda_init(int cpu)
+{
+ struct x8664_pda *pda = cpu_pda(cpu);
+
+ /* Setup up data that may be needed in __get_free_pages early */
+ loadsegment(fs, 0);
+ loadsegment(gs, 0);
+#ifndef CONFIG_XEN
+ /* Memory clobbers used to order PDA accessed */
+ mb();
+ wrmsrl(MSR_GS_BASE, pda);
+ mb();
+#else
+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
+ (unsigned long)pda))
+ BUG();
+#endif
+
+ pda->cpunumber = cpu;
+ pda->irqcount = -1;
+ pda->kernelstack = (unsigned long)stack_thread_info() -
+ PDA_STACKOFFSET + THREAD_SIZE;
+ pda->active_mm = &init_mm;
+ pda->mmu_state = 0;
+
+ if (cpu == 0) {
+ /* others are initialized in smpboot.c */
+ pda->pcurrent = &init_task;
+ pda->irqstackptr = boot_cpu_stack;
+ pda->irqstackptr += IRQSTACKSIZE - 64;
+ } else {
+ if (!pda->irqstackptr) {
+ pda->irqstackptr = (char *)
+ __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
+ if (!pda->irqstackptr)
+ panic("cannot allocate irqstack for cpu %d",
+ cpu);
+ pda->irqstackptr += IRQSTACKSIZE - 64;
+ }
+
+ if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
+ pda->nodenumber = cpu_to_node(cpu);
+ }
+
+ switch_pt(cpu);
+}
+
+#ifndef CONFIG_X86_NO_TSS
+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
+ DEBUG_STKSZ] __page_aligned_bss;
+#endif
+
+extern asmlinkage void ignore_sysret(void);
+
+void __cpuinit syscall_init(void)
+{
+#ifndef CONFIG_XEN
+ /*
+ * LSTAR and STAR live in a bit strange symbiosis.
+ * They both write to the same internal register. STAR allows to
+ * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
+ */
+ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
+ wrmsrl(MSR_LSTAR, system_call);
+ wrmsrl(MSR_CSTAR, ignore_sysret);
+
+ /* Flags to clear on syscall */
+ wrmsrl(MSR_SYSCALL_MASK,
+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+#endif
+#ifdef CONFIG_IA32_EMULATION
+ syscall32_cpu_init();
+#else
+ static const struct callback_register __cpuinitconst cstar = {
+ .type = CALLBACKTYPE_syscall32,
+ .address = (unsigned long)ignore_sysret
+ };
+
+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
+ printk(KERN_WARNING "Unable to register CSTAR callback\n");
+#endif
+}
+
+void __cpuinit check_efer(void)
+{
+ unsigned long efer;
+
+ rdmsrl(MSR_EFER, efer);
+ if (!(efer & EFER_NX) || do_not_nx)
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+
+unsigned long kernel_eflags;
+
+#ifndef CONFIG_X86_NO_TSS
+/*
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+#endif
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init.
+ */
+void __cpuinit cpu_init(void)
+{
+ int cpu = stack_smp_processor_id();
+#ifndef CONFIG_X86_NO_TSS
+ struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
+ unsigned long v;
+ char *estacks = NULL;
+ int i;
+#endif
+ struct task_struct *me;
+
+ /* CPU 0 is initialised in head64.c */
+ if (cpu != 0)
+ pda_init(cpu);
+#ifndef CONFIG_X86_NO_TSS
+ else
+ estacks = boot_exception_stacks;
+#endif
+
+ me = current;
+
+ if (cpu_test_and_set(cpu, cpu_initialized))
+ panic("CPU#%d already initialized!\n", cpu);
+
+ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+ /*
+ * Initialize the per-CPU GDT with the boot GDT,
+ * and set up the GDT descriptor:
+ */
+
+ switch_to_new_gdt();
+#ifndef CONFIG_X86_NO_IDT
+ load_idt((const struct desc_ptr *)&idt_descr);
+#endif
+
+ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
+
+ wrmsrl(MSR_FS_BASE, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ barrier();
+
+ check_efer();
+
+#ifndef CONFIG_X86_NO_TSS
+ /*
+ * set up and load the per-CPU TSS
+ */
+ if (!orig_ist->ist[0]) {
+ static const unsigned int order[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+ };
+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+ if (cpu) {
+ estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
+ if (!estacks)
+ panic("Cannot allocate exception "
+ "stack %ld %d\n", v, cpu);
+ }
+ estacks += PAGE_SIZE << order[v];
+ orig_ist->ist[v] = t->x86_tss.ist[v] =
+ (unsigned long)estacks;
+ }
+ }
+
+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ /*
+ * <= is required because the CPU will access up to
+ * 8 bits beyond the end of the IO permission bitmap.
+ */
+ for (i = 0; i <= IO_BITMAP_LONGS; i++)
+ t->io_bitmap[i] = ~0UL;
+#endif
+
+ atomic_inc(&init_mm.mm_count);
+ me->active_mm = &init_mm;
+ if (me->mm)
+ BUG();
+ enter_lazy_tlb(&init_mm, me);
+
+ load_sp0(t, &current->thread);
+#ifndef CONFIG_X86_NO_TSS
+ set_tss_desc(cpu, t);
+ load_TR_desc();
+#endif
+ load_LDT(&init_mm.context);
+
+#ifdef CONFIG_KGDB
+ /*
+ * If the kgdb is connected no debug regs should be altered. This
+ * is only applicable when KGDB and a KGDB I/O module are built
+ * into the kernel and you are using early debugging with
+ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
+ */
+ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ else {
+#endif
+ /*
+ * Clear all 6 debug registers:
+ */
+
+ set_debugreg(0UL, 0);
+ set_debugreg(0UL, 1);
+ set_debugreg(0UL, 2);
+ set_debugreg(0UL, 3);
+ set_debugreg(0UL, 6);
+ set_debugreg(0UL, 7);
+#ifdef CONFIG_KGDB
+ /* If the kgdb is connected no debug regs should be altered. */
+ }
+#endif
+
+ fpu_init();
+
+ asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
+ if (raw_irqs_disabled())
+ kernel_eflags &= ~X86_EFLAGS_IF;
+
+ if (is_uv_system())
+ uv_cpu_init();
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-11/arch/x86/kernel/e820-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -0,0 +1,1553 @@
+/*
+ * Handle the memory map.
+ * The functions here do the job until bootmem takes over.
+ *
+ * Getting sanitize_e820_map() in sync with i386 version by applying change:
+ * - Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach <xela@slit.de>, December 2002.
+ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/suspend.h>
+#include <linux/firmware-map.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <xen/interface/memory.h>
+
+/*
+ * The e820 map is the map that gets modified e.g. with command line parameters
+ * and that is also registered with modifications in the kernel resource tree
+ * with the iomem_resource as parent.
+ *
+ * The e820_saved is directly saved after the BIOS-provided memory map is
+ * copied. It doesn't get modified afterwards. It's registered for the
+ * /sys/firmware/memmap interface.
+ *
+ * That memory map is not modified and is used as base for kexec. The kexec'd
+ * kernel should get the same memory map as the firmware provides. Then the
+ * user can e.g. boot the original kernel with mem=1G while still booting the
+ * next kernel with full memory.
+ */
+struct e820map e820;
+#ifndef CONFIG_XEN
+struct e820map e820_saved;
+#else
+static struct e820map machine_e820;
+#define e820_saved machine_e820
+#endif
+
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0xaeedbabe;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
+
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+int
+e820_any_mapped(u64 start, u64 end, unsigned type)
+{
+ int i;
+
+#ifndef CONFIG_XEN
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+#else
+ if (!is_initial_xendomain())
+ return 0;
+ for (i = 0; i < machine_e820.nr_map; ++i) {
+ const struct e820entry *ei = &machine_e820.map[i];
+#endif
+
+ if (type && ei->type != type)
+ continue;
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_mapped);
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ *
+ * Note: this function only works correct if the e820 table is sorted and
+ * not-overlapping, which is the case
+ */
+int __init e820_all_mapped(u64 start, u64 end, unsigned type)
+{
+ int i;
+
+#ifndef CONFIG_XEN
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+#else
+ if (!is_initial_xendomain())
+ return 0;
+ for (i = 0; i < machine_e820.nr_map; ++i) {
+ const struct e820entry *ei = &machine_e820.map[i];
+#endif
+
+ if (type && ei->type != type)
+ continue;
+ /* is the region (part) in overlap with the current region ?*/
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+
+ /* if the region is at the beginning of <start,end> we move
+ * start to the end of the region since it's ok until there
+ */
+ if (ei->addr <= start)
+ start = ei->addr + ei->size;
+ /*
+ * if start is now at or beyond end, we're done, full
+ * coverage
+ */
+ if (start >= end)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Add a memory region to the kernel e820 map.
+ */
+void __init e820_add_region(u64 start, u64 size, int type)
+{
+ int x = e820.nr_map;
+
+ if (x == ARRAY_SIZE(e820.map)) {
+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ return;
+ }
+
+ e820.map[x].addr = start;
+ e820.map[x].size = size;
+ e820.map[x].type = type;
+ e820.nr_map++;
+}
+
+static void __init _e820_print_map(const struct e820map *e820, const char *who)
+{
+ int i;
+
+ for (i = 0; i < e820->nr_map; i++) {
+ printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+ (unsigned long long) e820->map[i].addr,
+ (unsigned long long)
+ (e820->map[i].addr + e820->map[i].size));
+ switch (e820->map[i].type) {
+ case E820_RAM:
+ case E820_RESERVED_KERN:
+ printk(KERN_CONT "(usable)\n");
+ break;
+ case E820_RESERVED:
+ printk(KERN_CONT "(reserved)\n");
+ break;
+ case E820_ACPI:
+ printk(KERN_CONT "(ACPI data)\n");
+ break;
+ case E820_NVS:
+ printk(KERN_CONT "(ACPI NVS)\n");
+ break;
+ default:
+ printk(KERN_CONT "type %u\n", e820->map[i].type);
+ break;
+ }
+ }
+}
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries. The following
+ * replaces the original e820 map with a new one, removing overlaps,
+ * and resolving conflicting memory types in favor of highest
+ * numbered type.
+ *
+ * The input parameter biosmap points to an array of 'struct
+ * e820entry' which on entry has elements in the range [0, *pnr_map)
+ * valid, and which has space for up to max_nr_map entries.
+ * On return, the resulting sanitized e820 map entries will be in
+ * overwritten in the same location, starting at biosmap.
+ *
+ * The integer pointed to by pnr_map must be valid on entry (the
+ * current number of valid entries located at biosmap) and will
+ * be updated on return, with the new number of valid entries
+ * (something no more than max_nr_map.)
+ *
+ * The return value from sanitize_e820_map() is zero if it
+ * successfully 'sanitized' the map entries passed in, and is -1
+ * if it did nothing, which can happen if either of (1) it was
+ * only passed one map entry, or (2) any of the input map entries
+ * were invalid (start + size < start, meaning that the size was
+ * so big the described memory range wrapped around through zero.)
+ *
+ * Visually we're performing the following
+ * (1,2,3,4 = memory types)...
+ *
+ * Sample memory map (w/overlaps):
+ * ____22__________________
+ * ______________________4_
+ * ____1111________________
+ * _44_____________________
+ * 11111111________________
+ * ____________________33__
+ * ___________44___________
+ * __________33333_________
+ * ______________22________
+ * ___________________2222_
+ * _________111111111______
+ * _____________________11_
+ * _________________4______
+ *
+ * Sanitized equivalent (no overlap):
+ * 1_______________________
+ * _44_____________________
+ * ___1____________________
+ * ____22__________________
+ * ______11________________
+ * _________1______________
+ * __________3_____________
+ * ___________44___________
+ * _____________33_________
+ * _______________2________
+ * ________________1_______
+ * _________________4______
+ * ___________________2____
+ * ____________________33__
+ * ______________________4_
+ */
+
+int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
+ int *pnr_map)
+{
+ struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+ unsigned long long addr; /* address for this change point */
+ };
+ static struct change_member change_point_list[2*E820_X_MAX] __initdata;
+ static struct change_member *change_point[2*E820_X_MAX] __initdata;
+ static struct e820entry *overlap_list[E820_X_MAX] __initdata;
+ static struct e820entry new_bios[E820_X_MAX] __initdata;
+ struct change_member *change_tmp;
+ unsigned long current_type, last_type;
+ unsigned long long last_addr;
+ int chgidx, still_changing;
+ int overlap_entries;
+ int new_bios_entry;
+ int old_nr, new_nr, chg_nr;
+ int i;
+
+ /* if there's only one memory region, don't bother */
+#ifdef CONFIG_XEN
+ if (*pnr_map == 1)
+ return 0;
+#endif
+ if (*pnr_map < 2)
+ return -1;
+
+ old_nr = *pnr_map;
+ BUG_ON(old_nr > max_nr_map);
+
+ /* bail out if we find any unreasonable addresses in bios map */
+ for (i = 0; i < old_nr; i++)
+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+ return -1;
+
+ /* create pointers for initial change-point information (for sorting) */
+ for (i = 0; i < 2 * old_nr; i++)
+ change_point[i] = &change_point_list[i];
+
+ /* record all known change-points (starting and ending addresses),
+ omitting those that are for empty memory regions */
+ chgidx = 0;
+ for (i = 0; i < old_nr; i++) {
+ if (biosmap[i].size != 0) {
+ change_point[chgidx]->addr = biosmap[i].addr;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ change_point[chgidx]->addr = biosmap[i].addr +
+ biosmap[i].size;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ }
+ }
+ chg_nr = chgidx;
+
+ /* sort change-point list by memory addresses (low -> high) */
+ still_changing = 1;
+ while (still_changing) {
+ still_changing = 0;
+ for (i = 1; i < chg_nr; i++) {
+ unsigned long long curaddr, lastaddr;
+ unsigned long long curpbaddr, lastpbaddr;
+
+ curaddr = change_point[i]->addr;
+ lastaddr = change_point[i - 1]->addr;
+ curpbaddr = change_point[i]->pbios->addr;
+ lastpbaddr = change_point[i - 1]->pbios->addr;
+
+ /*
+ * swap entries, when:
+ *
+ * curaddr > lastaddr or
+ * curaddr == lastaddr and curaddr == curpbaddr and
+ * lastaddr != lastpbaddr
+ */
+ if (curaddr < lastaddr ||
+ (curaddr == lastaddr && curaddr == curpbaddr &&
+ lastaddr != lastpbaddr)) {
+ change_tmp = change_point[i];
+ change_point[i] = change_point[i-1];
+ change_point[i-1] = change_tmp;
+ still_changing = 1;
+ }
+ }
+ }
+
+ /* create a new bios memory map, removing overlaps */
+ overlap_entries = 0; /* number of entries in the overlap table */
+ new_bios_entry = 0; /* index for creating new bios map entries */
+ last_type = 0; /* start with undefined memory type */
+ last_addr = 0; /* start with 0 as last starting address */
+
+ /* loop through change-points, determining affect on the new bios map */
+ for (chgidx = 0; chgidx < chg_nr; chgidx++) {
+ /* keep track of all overlapping bios entries */
+ if (change_point[chgidx]->addr ==
+ change_point[chgidx]->pbios->addr) {
+ /*
+ * add map entry to overlap list (> 1 entry
+ * implies an overlap)
+ */
+ overlap_list[overlap_entries++] =
+ change_point[chgidx]->pbios;
+ } else {
+ /*
+ * remove entry from list (order independent,
+ * so swap with last)
+ */
+ for (i = 0; i < overlap_entries; i++) {
+ if (overlap_list[i] ==
+ change_point[chgidx]->pbios)
+ overlap_list[i] =
+ overlap_list[overlap_entries-1];
+ }
+ overlap_entries--;
+ }
+ /*
+ * if there are overlapping entries, decide which
+ * "type" to use (larger value takes precedence --
+ * 1=usable, 2,3,4,4+=unusable)
+ */
+ current_type = 0;
+ for (i = 0; i < overlap_entries; i++)
+ if (overlap_list[i]->type > current_type)
+ current_type = overlap_list[i]->type;
+ /*
+ * continue building up new bios map based on this
+ * information
+ */
+ if (current_type != last_type) {
+ if (last_type != 0) {
+ new_bios[new_bios_entry].size =
+ change_point[chgidx]->addr - last_addr;
+ /*
+ * move forward only if the new size
+ * was non-zero
+ */
+ if (new_bios[new_bios_entry].size != 0)
+ /*
+ * no more space left for new
+ * bios entries ?
+ */
+ if (++new_bios_entry >= max_nr_map)
+ break;
+ }
+ if (current_type != 0) {
+ new_bios[new_bios_entry].addr =
+ change_point[chgidx]->addr;
+ new_bios[new_bios_entry].type = current_type;
+ last_addr = change_point[chgidx]->addr;
+ }
+ last_type = current_type;
+ }
+ }
+ /* retain count for new bios entries */
+ new_nr = new_bios_entry;
+
+ /* copy new bios mapping into original location */
+ memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
+ *pnr_map = new_nr;
+
+ return 0;
+}
+
+static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
+{
+ while (nr_map) {
+ u64 start = biosmap->addr;
+ u64 size = biosmap->size;
+ u64 end = start + size;
+ u32 type = biosmap->type;
+
+ /* Overflow in 64 bits? Ignore the memory map. */
+ if (start > end)
+ return -1;
+
+ e820_add_region(start, size, type);
+
+ biosmap++;
+ nr_map--;
+ }
+ return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory. If we aren't, we'll fake a memory map.
+ */
+static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
+{
+#ifndef CONFIG_XEN
+ /* Only one memory region (or negative)? Ignore it */
+ if (nr_map < 2)
+ return -1;
+#else
+ BUG_ON(nr_map < 1);
+#endif
+
+ return __append_e820_map(biosmap, nr_map);
+}
+
+static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
+ u64 size, unsigned old_type,
+ unsigned new_type)
+{
+ unsigned int i, x;
+ u64 real_updated_size = 0;
+
+ BUG_ON(old_type == new_type);
+
+ if (size > (ULLONG_MAX - start))
+ size = ULLONG_MAX - start;
+
+ for (i = 0; i < e820x->nr_map; i++) {
+ struct e820entry *ei = &e820x->map[i];
+ u64 final_start, final_end;
+ if (ei->type != old_type)
+ continue;
+ /* totally covered? */
+ if (ei->addr >= start &&
+ (ei->addr + ei->size) <= (start + size)) {
+ ei->type = new_type;
+ real_updated_size += ei->size;
+ continue;
+ }
+ /* partially covered */
+ final_start = max(start, ei->addr);
+ final_end = min(start + size, ei->addr + ei->size);
+ if (final_start >= final_end)
+ continue;
+
+ x = e820x->nr_map;
+ if (x == ARRAY_SIZE(e820x->map)) {
+ printk(KERN_ERR "Too many memory map entries!\n");
+ break;
+ }
+ e820x->map[x].addr = final_start;
+ e820x->map[x].size = final_end - final_start;
+ e820x->map[x].type = new_type;
+ e820x->nr_map++;
+
+ real_updated_size += final_end - final_start;
+
+ if (ei->addr < final_start)
+ continue;
+ ei->addr = final_end;
+ ei->size -= final_end - final_start;
+ }
+ return real_updated_size;
+}
+
+u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
+ unsigned new_type)
+{
+ return e820_update_range_map(&e820, start, size, old_type, new_type);
+}
+
+static u64 __init e820_update_range_saved(u64 start, u64 size,
+ unsigned old_type, unsigned new_type)
+{
+#ifdef CONFIG_XEN
+ if (is_initial_xendomain())
+ return e820_update_range_map(&machine_e820,
+ phys_to_machine(start), size,
+ old_type, new_type);
+#endif
+ return e820_update_range_map(&e820_saved, start, size, old_type,
+ new_type);
+}
+
+/* make e820 not cover the range */
+u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
+ int checktype)
+{
+ int i;
+ u64 real_removed_size = 0;
+
+ if (size > (ULLONG_MAX - start))
+ size = ULLONG_MAX - start;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 final_start, final_end;
+
+ if (checktype && ei->type != old_type)
+ continue;
+ /* totally covered? */
+ if (ei->addr >= start &&
+ (ei->addr + ei->size) <= (start + size)) {
+ real_removed_size += ei->size;
+ memset(ei, 0, sizeof(struct e820entry));
+ continue;
+ }
+ /* partially covered */
+ final_start = max(start, ei->addr);
+ final_end = min(start + size, ei->addr + ei->size);
+ if (final_start >= final_end)
+ continue;
+ real_removed_size += final_end - final_start;
+
+ ei->size -= final_end - final_start;
+ if (ei->addr < final_start)
+ continue;
+ ei->addr = final_end;
+ }
+ return real_removed_size;
+}
+
+void __init update_e820(void)
+{
+ int nr_map;
+
+ nr_map = e820.nr_map;
+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
+ return;
+ e820.nr_map = nr_map;
+ printk(KERN_INFO "modified physical RAM map:\n");
+ _e820_print_map(&e820, "modified");
+}
+static void __init update_e820_saved(void)
+{
+ int nr_map;
+
+ nr_map = e820_saved.nr_map;
+ if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
+ return;
+ e820_saved.nr_map = nr_map;
+}
+
+#ifdef CONFIG_XEN
+#define e820 machine_e820
+#endif
+
+#define MAX_GAP_END 0x100000000ull
+/*
+ * Search for a gap in the e820 memory space from start_addr to end_addr.
+ */
+__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
+ unsigned long start_addr, unsigned long long end_addr)
+{
+ unsigned long long last;
+ int i = e820.nr_map;
+ int found = 0;
+
+ last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
+#ifdef CONFIG_X86_64
+ if (start_addr >= MAX_GAP_END)
+ last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits);
+#endif
+
+ while (--i >= 0) {
+ unsigned long long start = e820.map[i].addr;
+ unsigned long long end = start + e820.map[i].size;
+
+ if (end < start_addr)
+ continue;
+
+ /*
+ * Since "last" is at most 4GB, we know we'll
+ * fit in 32 bits if this condition is true
+ */
+ if (last > end) {
+ unsigned long gap = last - end;
+
+ if (gap >= *gapsize) {
+ *gapsize = gap;
+ *gapstart = end;
+ found = 1;
+ }
+ }
+ if (start < last)
+ last = start;
+ }
+ return found;
+}
+
+/*
+ * Search for the biggest gap in the low 32 bits of the e820
+ * memory space. We pass this space to PCI to assign MMIO resources
+ * for hotplug or unconfigured devices in.
+ * Hopefully the BIOS let enough space left.
+ */
+__init void e820_setup_gap(void)
+{
+ unsigned long gapstart, gapsize, round;
+ int found;
+
+ gapstart = 0x10000000;
+ gapsize = 0x400000;
+ found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
+
+#ifdef CONFIG_X86_64
+ if (!found) {
+ printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
+ "address range\n"
+ KERN_ERR "PCI: Unassigned devices with 32bit resource "
+ "registers may break!\n");
+ found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0);
+ WARN_ON(!found);
+ }
+#endif
+
+ /*
+ * See how much we want to round up: start off with
+ * rounding to the next 1MB area.
+ */
+ round = 0x100000;
+ while ((gapsize >> 4) > round)
+ round += round;
+ /* Fun with two's complement */
+ pci_mem_start = (gapstart + round) & -round;
+
+ printk(KERN_INFO
+ "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
+ pci_mem_start, gapstart, gapsize);
+}
+
+#undef e820
+
+#ifndef CONFIG_XEN
+/**
+ * Because of the size limitation of struct boot_params, only the first
+ * 128 E820 memory entries are passed to the kernel via
+ * boot_params.e820_map; the rest are passed via the SETUP_E820_EXT node
+ * of the linked list of struct setup_data, which is parsed here.
+ */
+void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
+{
+ u32 map_len;
+ int entries;
+ struct e820entry *extmap;
+
+ entries = sdata->len / sizeof(struct e820entry);
+ map_len = sdata->len + sizeof(struct setup_data);
+ if (map_len > PAGE_SIZE)
+ sdata = early_ioremap(pa_data, map_len);
+ extmap = (struct e820entry *)(sdata->data);
+ __append_e820_map(extmap, entries);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ if (map_len > PAGE_SIZE)
+ early_iounmap(sdata, map_len);
+ printk(KERN_INFO "extended physical RAM map:\n");
+ _e820_print_map(&e820, "extended");
+}
+
+#if defined(CONFIG_X86_64) || \
+ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
+/**
+ * Find the ranges of physical addresses that do not correspond to
+ * e820 RAM areas and mark the corresponding pages as nosave for
+ * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(unsigned long limit_pfn)
+{
+ int i;
+ unsigned long pfn;
+
+ pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
+ for (i = 1; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (pfn < PFN_UP(ei->addr))
+ register_nosave_region(pfn, PFN_UP(ei->addr));
+
+ pfn = PFN_DOWN(ei->addr + ei->size);
+ if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
+ register_nosave_region(PFN_UP(ei->addr), pfn);
+
+ if (pfn >= limit_pfn)
+ break;
+ }
+}
+#endif
+#endif
+
+/*
+ * Early reserved memory areas.
+ */
+#define MAX_EARLY_RES 20
+
+struct early_res {
+ u64 start, end;
+ char name[16];
+ char overlap_ok;
+};
+static struct early_res early_res[MAX_EARLY_RES] __initdata = {
+#ifndef CONFIG_XEN
+ { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
+#endif
+#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
+ /*
+ * But first pinch a few for the stack/trampoline stuff
+ * FIXME: Don't need the extra page at 4K, but need to fix
+ * trampoline before removing it. (see the GDT stuff)
+ */
+ { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
+ /*
+ * Has to be in very low memory so we can execute
+ * real-mode AP code.
+ */
+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
+#endif
+#endif
+ {}
+};
+
+static int __init find_overlapped_early(u64 start, u64 end)
+{
+ int i;
+ struct early_res *r;
+
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ r = &early_res[i];
+ if (end > r->start && start < r->end)
+ break;
+ }
+
+ return i;
+}
+
+/*
+ * Drop the i-th range from the early reservation map,
+ * by copying any higher ranges down one over it, and
+ * clearing what had been the last slot.
+ */
+static void __init drop_range(int i)
+{
+ int j;
+
+ for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
+ ;
+
+ memmove(&early_res[i], &early_res[i + 1],
+ (j - 1 - i) * sizeof(struct early_res));
+
+ early_res[j - 1].end = 0;
+}
+
+/*
+ * Split any existing ranges that:
+ * 1) are marked 'overlap_ok', and
+ * 2) overlap with the stated range [start, end)
+ * into whatever portion (if any) of the existing range is entirely
+ * below or entirely above the stated range. Drop the portion
+ * of the existing range that overlaps with the stated range,
+ * which will allow the caller of this routine to then add that
+ * stated range without conflicting with any existing range.
+ */
+static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
+{
+ int i;
+ struct early_res *r;
+ u64 lower_start, lower_end;
+ u64 upper_start, upper_end;
+ char name[16];
+
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ r = &early_res[i];
+
+ /* Continue past non-overlapping ranges */
+ if (end <= r->start || start >= r->end)
+ continue;
+
+ /*
+ * Leave non-ok overlaps as is; let caller
+ * panic "Overlapping early reservations"
+ * when it hits this overlap.
+ */
+ if (!r->overlap_ok)
+ return;
+
+ /*
+ * We have an ok overlap. We will drop it from the early
+ * reservation map, and add back in any non-overlapping
+ * portions (lower or upper) as separate, overlap_ok,
+ * non-overlapping ranges.
+ */
+
+ /* 1. Note any non-overlapping (lower or upper) ranges. */
+ strncpy(name, r->name, sizeof(name) - 1);
+
+ lower_start = lower_end = 0;
+ upper_start = upper_end = 0;
+ if (r->start < start) {
+ lower_start = r->start;
+ lower_end = start;
+ }
+ if (r->end > end) {
+ upper_start = end;
+ upper_end = r->end;
+ }
+
+ /* 2. Drop the original ok overlapping range */
+ drop_range(i);
+
+ i--; /* resume for-loop on copied down entry */
+
+ /* 3. Add back in any non-overlapping ranges. */
+ if (lower_end)
+ reserve_early_overlap_ok(lower_start, lower_end, name);
+ if (upper_end)
+ reserve_early_overlap_ok(upper_start, upper_end, name);
+ }
+}
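
The lower/upper split noted in step 1 above can be seen in isolation in this small sketch (the variable names and values are invented for illustration):

#include <stdio.h>

int main(void)
{
	/* existing overlap_ok reservation and the new range colliding with it */
	unsigned long long r_start = 0x1000, r_end = 0x9000;
	unsigned long long start = 0x3000, end = 0x5000;
	unsigned long long lower_start = 0, lower_end = 0;
	unsigned long long upper_start = 0, upper_end = 0;

	if (r_start < start) {		/* piece below the new range survives */
		lower_start = r_start;
		lower_end = start;
	}
	if (r_end > end) {		/* piece above the new range survives */
		upper_start = end;
		upper_end = r_end;
	}
	printf("lower: [%#llx-%#llx)  upper: [%#llx-%#llx)\n",
	       lower_start, lower_end, upper_start, upper_end);
	return 0;
}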
+
+static void __init __reserve_early(u64 start, u64 end, char *name,
+ int overlap_ok)
+{
+ int i;
+ struct early_res *r;
+
+ i = find_overlapped_early(start, end);
+ if (i >= MAX_EARLY_RES)
+ panic("Too many early reservations");
+ r = &early_res[i];
+ if (r->end)
+ panic("Overlapping early reservations "
+ "%llx-%llx %s to %llx-%llx %s\n",
+ start, end - 1, name?name:"", r->start,
+ r->end - 1, r->name);
+ r->start = start;
+ r->end = end;
+ r->overlap_ok = overlap_ok;
+ if (name)
+ strncpy(r->name, name, sizeof(r->name) - 1);
+}
+
+/*
+ * A few early reservations come here.
+ *
+ * The 'overlap_ok' in the name of this routine does -not- mean it
+ * is ok for these reservations to overlap an earlier reservation.
+ * Rather it means that it is ok for subsequent reservations to
+ * overlap this one.
+ *
+ * Use this entry point to reserve early ranges when you are doing
+ * so out of "Paranoia", reserving perhaps more memory than you need,
+ * just in case, and don't mind a subsequent overlapping reservation
+ * that is known to be needed.
+ *
+ * The drop_overlaps_that_are_ok() call here isn't really needed.
+ * It would be needed if we had two colliding 'overlap_ok'
+ * reservations, so that the second such would not panic on the
+ * overlap with the first. No such reservations exist as of this
+ * writing, but we may as well tolerate them if they appear in
+ * the future.
+ */
+void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
+{
+ drop_overlaps_that_are_ok(start, end);
+ __reserve_early(start, end, name, 1);
+}
+
+/*
+ * Most early reservations come here.
+ *
+ * We first have drop_overlaps_that_are_ok() drop any pre-existing
+ * 'overlap_ok' ranges, so that we can then reserve this memory
+ * range without risk of panicking on an overlapping overlap_ok
+ * early reservation.
+ */
+void __init reserve_early(u64 start, u64 end, char *name)
+{
+ drop_overlaps_that_are_ok(start, end);
+ __reserve_early(start, end, name, 0);
+}
+
+void __init free_early(u64 start, u64 end)
+{
+ struct early_res *r;
+ int i;
+
+ i = find_overlapped_early(start, end);
+ r = &early_res[i];
+ if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
+ panic("free_early on not reserved area: %llx-%llx!",
+ start, end - 1);
+
+ drop_range(i);
+}
+
+void __init early_res_to_bootmem(u64 start, u64 end)
+{
+ int i, count;
+ u64 final_start, final_end;
+
+ count = 0;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
+ count++;
+
+ printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
+ count, start, end);
+ for (i = 0; i < count; i++) {
+ struct early_res *r = &early_res[i];
+ printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
+ r->start, r->end, r->name);
+ final_start = max(start, r->start);
+ final_end = min(end, r->end);
+ if (final_start >= final_end) {
+ printk(KERN_CONT "\n");
+ continue;
+ }
+ printk(KERN_CONT " ==> [%010llx - %010llx]\n",
+ final_start, final_end);
+ reserve_bootmem_generic(final_start, final_end - final_start,
+ BOOTMEM_DEFAULT);
+ }
+}
+
+/* Check for already reserved areas */
+static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
+{
+ int i;
+ u64 addr = *addrp;
+ int changed = 0;
+ struct early_res *r;
+again:
+ i = find_overlapped_early(addr, addr + size);
+ r = &early_res[i];
+ if (i < MAX_EARLY_RES && r->end) {
+ *addrp = addr = round_up(r->end, align);
+ changed = 1;
+ goto again;
+ }
+ return changed;
+}
+
+/* Check for already reserved areas */
+static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
+{
+ int i;
+ u64 addr = *addrp, last;
+ u64 size = *sizep;
+ int changed = 0;
+again:
+ last = addr + size;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ struct early_res *r = &early_res[i];
+ if (last > r->start && addr < r->start) {
+ size = r->start - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last > r->end && addr < r->end) {
+ addr = round_up(r->end, align);
+ size = last - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last <= r->end && addr >= r->start) {
+ (*sizep)++;
+ return 0;
+ }
+ }
+ if (changed) {
+ *addrp = addr;
+ *sizep = size;
+ }
+ return changed;
+}
+
+/*
+ * Find a free area with specified alignment in a specific range.
+ */
+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr, last;
+ u64 ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ while (bad_addr(&addr, size, align) && addr+size <= ei_last)
+ ;
+ last = addr + size;
+ if (last > ei_last)
+ continue;
+ if (last > end)
+ continue;
+ return addr;
+ }
+ return -1ULL;
+}
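
find_e820_area() leans on round_up() for alignment; a tiny standalone sketch of the power-of-two rounding it assumes (round_up_p2/round_down_p2 are helper names invented here, not the kernel macros):

#include <stdio.h>

static unsigned long long round_up_p2(unsigned long long x,
				      unsigned long long align)
{
	return (x + align - 1) & ~(align - 1);
}

static unsigned long long round_down_p2(unsigned long long x,
					unsigned long long align)
{
	return x & ~(align - 1);
}

int main(void)
{
	/* 0x1234 rounded to a 0x1000 (4 KiB) boundary */
	printf("up: %#llx  down: %#llx\n",
	       round_up_p2(0x1234, 0x1000), round_down_p2(0x1234, 0x1000));
	return 0;
}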
+
+/*
+ * Find next free range after *start
+ */
+u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr, last;
+ u64 ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ *sizep = ei_last - addr;
+ while (bad_addr_size(&addr, sizep, align) &&
+ addr + *sizep <= ei_last)
+ ;
+ last = addr + *sizep;
+ if (last > ei_last)
+ continue;
+ return addr;
+ }
+
+ return -1ULL;
+}
+
+/*
+ * Pre-allocate a memory area and reserve it in the e820 map.
+ */
+u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
+{
+ u64 size = 0;
+ u64 addr;
+ u64 start;
+#ifdef CONFIG_XEN
+ unsigned int order = get_order(sizet);
+
+ if (is_initial_xendomain()) {
+ sizet = PAGE_SIZE << order;
+ if (align < PAGE_SIZE)
+ align = PAGE_SIZE;
+ }
+#endif
+ for (start = startt; ; start += size) {
+ start = find_e820_area_size(start, &size, align);
+ if (!(start + 1))
+ return 0;
+ if (size >= sizet)
+ break;
+ }
+
+#ifdef CONFIG_X86_32
+ if (start >= MAXMEM)
+ return 0;
+ if (start + size > MAXMEM)
+ size = MAXMEM - start;
+#endif
+#ifdef CONFIG_XEN
+ if ((start >> PAGE_SHIFT) >= xen_start_info->nr_pages)
+ return 0;
+ if (PFN_UP(start + size) > xen_start_info->nr_pages)
+ size = ((u64)xen_start_info->nr_pages << PAGE_SHIFT) - start;
+#endif
+
+ addr = round_down(start + size - sizet, align);
+ if (addr < start)
+ return 0;
+#ifdef CONFIG_XEN
+ if (is_initial_xendomain()) {
+ int rc;
+ unsigned long max_initmap_pfn;
+
+ max_initmap_pfn = ALIGN(PFN_UP(__pa(xen_start_info->pt_base))
+ + xen_start_info->nr_pt_frames
+ + 1 + (1 << (19 - PAGE_SHIFT)),
+ 1UL << (22 - PAGE_SHIFT));
+#ifdef CONFIG_X86_32
+ if ((addr >> PAGE_SHIFT)
+ < max(max_initmap_pfn, max_pfn_mapped))
+ rc = xen_create_contiguous_region((unsigned long)
+ __va(addr),
+ order, 32);
+#else
+ if ((addr >> PAGE_SHIFT) < max_pfn_mapped)
+ rc = xen_create_contiguous_region((unsigned long)
+ __va(addr),
+ order, 32);
+ else if ((addr >> PAGE_SHIFT) < max_initmap_pfn)
+ rc = xen_create_contiguous_region(__START_KERNEL_map
+ + addr,
+ order, 32);
+#endif
+ else
+ rc = early_create_contiguous_region(addr >> PAGE_SHIFT,
+ order, 32);
+ if (rc)
+ return 0;
+ }
+#endif
+ e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
+ e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
+ printk(KERN_INFO "update e820 for early_reserve_e820\n");
+ update_e820();
+ update_e820_saved();
+
+ return addr;
+}
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_PAE
+# define MAX_ARCH_PFN (1ULL<<(40-PAGE_SHIFT))
+# else
+# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
+# endif
+#else /* CONFIG_X86_32 */
+# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
+#endif
+
+/*
+ * Find the highest page frame number we have available
+ */
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+{
+ int i;
+ unsigned long last_pfn = 0;
+ unsigned long max_arch_pfn = MAX_ARCH_PFN;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+
+ if (ei->type != type)
+ continue;
+
+ start_pfn = ei->addr >> PAGE_SHIFT;
+ end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
+
+ if (start_pfn >= limit_pfn)
+ continue;
+ if (end_pfn > limit_pfn) {
+ last_pfn = limit_pfn;
+ break;
+ }
+ if (end_pfn > last_pfn)
+ last_pfn = end_pfn;
+ }
+
+ if (last_pfn > max_arch_pfn)
+ last_pfn = max_arch_pfn;
+
+ printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
+ last_pfn, max_arch_pfn);
+ return last_pfn;
+}
+unsigned long __init e820_end_of_ram_pfn(void)
+{
+ return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+}
+
+unsigned long __init e820_end_of_low_ram_pfn(void)
+{
+ return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
+}
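
The PFN conversions used throughout (PFN_UP/PFN_DOWN and the explicit >> PAGE_SHIFT shifts) reduce to the arithmetic below, shown as a standalone sketch assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long addr = 0x12345;

	/* 0x12345 lies inside page 0x12: round up to 0x13, down to 0x12 */
	printf("PFN_UP=%#lx PFN_DOWN=%#lx\n", PFN_UP(addr), PFN_DOWN(addr));
	return 0;
}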
+/*
+ * Finds an active region in the address range from start_pfn to last_pfn and
+ * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
+ */
+int __init e820_find_active_region(const struct e820entry *ei,
+ unsigned long start_pfn,
+ unsigned long last_pfn,
+ unsigned long *ei_startpfn,
+ unsigned long *ei_endpfn)
+{
+ u64 align = PAGE_SIZE;
+
+#ifdef CONFIG_XEN
+ if (last_pfn > xen_start_info->nr_pages)
+ last_pfn = xen_start_info->nr_pages;
+#endif
+
+ *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
+ *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
+
+ /* Skip map entries smaller than a page */
+ if (*ei_startpfn >= *ei_endpfn)
+ return 0;
+
+ /* Skip if map is outside the node */
+ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
+ *ei_startpfn >= last_pfn)
+ return 0;
+
+ /* Check for overlaps */
+ if (*ei_startpfn < start_pfn)
+ *ei_startpfn = start_pfn;
+ if (*ei_endpfn > last_pfn)
+ *ei_endpfn = last_pfn;
+
+ return 1;
+}
+
+/* Walk the e820 map and register active regions within a node */
+void __init e820_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long last_pfn)
+{
+ unsigned long ei_startpfn;
+ unsigned long ei_endpfn;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++)
+ if (e820_find_active_region(&e820.map[i],
+ start_pfn, last_pfn,
+ &ei_startpfn, &ei_endpfn))
+ add_active_range(nid, ei_startpfn, ei_endpfn);
+#ifdef CONFIG_XEN
+ BUG_ON(nid);
+ add_active_range(nid, last_pfn, last_pfn);
+#endif
+}
+
+/*
+ * Find the hole size (in bytes) in the memory range.
+ * @start: starting address of the memory range to scan
+ * @end: ending address of the memory range to scan
+ */
+u64 __init e820_hole_size(u64 start, u64 end)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long last_pfn = end >> PAGE_SHIFT;
+ unsigned long ei_startpfn, ei_endpfn, ram = 0;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ if (e820_find_active_region(&e820.map[i],
+ start_pfn, last_pfn,
+ &ei_startpfn, &ei_endpfn))
+ ram += ei_endpfn - ei_startpfn;
+ }
+ return end - start - ((u64)ram << PAGE_SHIFT);
+}
+
+static void early_panic(char *msg)
+{
+ early_printk(msg);
+ panic(msg);
+}
+
+static int userdef __initdata;
+
+/* "mem=nopentium" disables the 4MB page tables. */
+static int __init parse_memopt(char *p)
+{
+ u64 mem_size, current_end;
+ unsigned int i;
+
+ if (!p)
+ return -EINVAL;
+
+#ifdef CONFIG_X86_32
+ if (!strcmp(p, "nopentium")) {
+ setup_clear_cpu_cap(X86_FEATURE_PSE);
+ return 0;
+ }
+#endif
+
+ userdef = 1;
+ mem_size = memparse(p, &p);
+ e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+
+ i = e820.nr_map - 1;
+ current_end = e820.map[i].addr + e820.map[i].size;
+ if (current_end < mem_size) {
+ /*
+ * The e820 map ends before our requested size so
+ * extend the final entry to the requested address.
+ */
+ if (e820.map[i].type == E820_RAM)
+ e820.map[i].size = mem_size - e820.map[i].addr;
+ else
+ e820_add_region(current_end, mem_size - current_end, E820_RAM);
+ }
+
+ return 0;
+}
+early_param("mem", parse_memopt);
+
+#ifndef CONFIG_XEN
+static int __init parse_memmap_opt(char *p)
+{
+ char *oldp;
+ u64 start_at, mem_size;
+
+ if (!p)
+ return -EINVAL;
+
+ if (!strncmp(p, "exactmap", 8)) {
+#ifdef CONFIG_CRASH_DUMP
+ /*
+ * If we are doing a crash dump, we still need to know
+ * the real mem size before original memory map is
+ * reset.
+ */
+ saved_max_pfn = e820_end_of_ram_pfn();
+#endif
+ e820.nr_map = 0;
+ userdef = 1;
+ return 0;
+ }
+
+ oldp = p;
+ mem_size = memparse(p, &p);
+ if (p == oldp)
+ return -EINVAL;
+
+ userdef = 1;
+ if (*p == '@') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_RAM);
+ } else if (*p == '#') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_ACPI);
+ } else if (*p == '$') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_RESERVED);
+ } else
+ e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+
+ return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", parse_memmap_opt);
+#endif
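
parse_memopt() and parse_memmap_opt() both rely on memparse() to accept sizes with K/M/G suffixes, e.g. "mem=512M" or "memmap=64M@0x1000000". A simplified standalone sketch of that suffix handling (parse_size() is an invented stand-in, not the kernel helper):

#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *p, char **retp)
{
	unsigned long long v = strtoull(p, retp, 0);

	switch (**retp) {
	case 'G': case 'g':
		v <<= 10;		/* fall through */
	case 'M': case 'm':
		v <<= 10;		/* fall through */
	case 'K': case 'k':
		v <<= 10;
		(*retp)++;
	}
	return v;
}

int main(void)
{
	char *rest;
	/* the "64M@0x1000000" form handled by parse_memmap_opt() above */
	unsigned long long size = parse_size("64M@0x1000000", &rest);

	printf("size=%#llx rest=%s\n", size, rest);
	return 0;
}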
+
+void __init finish_e820_parsing(void)
+{
+ if (userdef) {
+ int nr = e820.nr_map;
+
+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
+ early_panic("Invalid user supplied memory map");
+ e820.nr_map = nr;
+
+ printk(KERN_INFO "user-defined physical RAM map:\n");
+ _e820_print_map(&e820, "user");
+ }
+}
+
+static inline const char *e820_type_to_string(int e820_type)
+{
+ switch (e820_type) {
+ case E820_RESERVED_KERN:
+ case E820_RAM: return "System RAM";
+ case E820_ACPI: return "ACPI Tables";
+ case E820_NVS: return "ACPI Non-volatile Storage";
+ default: return "reserved";
+ }
+}
+
+#ifdef CONFIG_XEN
+#define e820 machine_e820
+#endif
+
+/*
+ * Mark e820 reserved areas as busy for the resource manager.
+ */
+void __init e820_reserve_resources(void)
+{
+ int i;
+ struct resource *res;
+ u64 end;
+
+ res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
+ for (i = 0; i < e820.nr_map; i++) {
+ end = e820.map[i].addr + e820.map[i].size - 1;
+#ifndef CONFIG_RESOURCES_64BIT
+ if (end > 0x100000000ULL) {
+ res++;
+ continue;
+ }
+#endif
+ res->name = e820_type_to_string(e820.map[i].type);
+ res->start = e820.map[i].addr;
+ res->end = end;
+
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ insert_resource(&iomem_resource, res);
+ res++;
+ }
+
+ for (i = 0; i < e820_saved.nr_map; i++) {
+ struct e820entry *entry = &e820_saved.map[i];
+ firmware_map_add_early(entry->addr,
+ entry->addr + entry->size - 1,
+ e820_type_to_string(entry->type));
+ }
+}
+
+#undef e820
+
+#ifndef CONFIG_XEN
+char *__init default_machine_specific_memory_setup(void)
+{
+ char *who = "BIOS-e820";
+ int new_nr;
+ /*
+ * Try to copy the BIOS-supplied E820-map.
+ *
+ * Otherwise fake a memory map; one section from 0k->640k,
+ * the next section from 1mb->appropriate_mem_k
+ */
+ new_nr = boot_params.e820_entries;
+ sanitize_e820_map(boot_params.e820_map,
+ ARRAY_SIZE(boot_params.e820_map),
+ &new_nr);
+ boot_params.e820_entries = new_nr;
+ if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
+ < 0) {
+ u64 mem_size;
+
+ /* compare results from other methods and take the greater */
+ if (boot_params.alt_mem_k
+ < boot_params.screen_info.ext_mem_k) {
+ mem_size = boot_params.screen_info.ext_mem_k;
+ who = "BIOS-88";
+ } else {
+ mem_size = boot_params.alt_mem_k;
+ who = "BIOS-e801";
+ }
+
+ e820.nr_map = 0;
+ e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+ e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
+ }
+
+ /* In case someone cares... */
+ return who;
+}
+
+char *__init __attribute__((weak)) machine_specific_memory_setup(void)
+{
+ if (x86_quirks->arch_memory_setup) {
+ char *who = x86_quirks->arch_memory_setup();
+
+ if (who)
+ return who;
+ }
+ return default_machine_specific_memory_setup();
+}
+#endif
+
+static char * __init _memory_setup(void)
+{
+ int rc, nr_map;
+ struct xen_memory_map memmap;
+ static struct e820entry __initdata map[E820MAX];
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+ if (rc == -ENOSYS) {
+ memmap.nr_entries = 1;
+ map[0].addr = 0ULL;
+ map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
+ /* 8MB slack (to balance backend allocations). */
+ map[0].size += 8ULL << 20;
+ map[0].type = E820_RAM;
+ rc = 0;
+ }
+ BUG_ON(rc);
+
+ nr_map = memmap.nr_entries;
+ sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map);
+
+ if (append_e820_map(map, nr_map) < 0)
+ BUG();
+
+#ifdef CONFIG_XEN
+ if (is_initial_xendomain()) {
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, machine_e820.map);
+
+ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
+ BUG();
+ machine_e820.nr_map = memmap.nr_entries;
+ }
+#endif
+
+ return "Xen";
+}
+
+void __init setup_memory_map(void)
+{
+ char *who;
+
+ who = _memory_setup();
+#ifdef CONFIG_XEN
+ if (is_initial_xendomain()) {
+ printk(KERN_INFO "Xen-provided machine memory map:\n");
+ _e820_print_map(&machine_e820, "BIOS");
+ } else
+#endif
+ memcpy(&e820_saved, &e820, sizeof(struct e820map));
+ printk(KERN_INFO "Xen-provided physical RAM map:\n");
+ _e820_print_map(&e820, who);
+}
--- head-2011-03-11.orig/arch/x86/kernel/e820_32-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,860 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/pfn.h>
-#include <linux/uaccess.h>
-#include <linux/suspend.h>
-
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-#include <xen/interface/memory.h>
-
-struct e820map e820;
-struct change_member {
- struct e820entry *pbios; /* pointer to original bios entry */
- unsigned long long addr; /* address for this change point */
-};
-static struct change_member change_point_list[2*E820MAX] __initdata;
-static struct change_member *change_point[2*E820MAX] __initdata;
-static struct e820entry *overlap_list[E820MAX] __initdata;
-static struct e820entry new_bios[E820MAX] __initdata;
-/* For PCI or other memory-mapped resources */
-unsigned long pci_mem_start = 0x10000000;
-#ifdef CONFIG_PCI
-EXPORT_SYMBOL(pci_mem_start);
-#endif
-extern int user_defined_memmap;
-
-static struct resource system_rom_resource = {
- .name = "System ROM",
- .start = 0xf0000,
- .end = 0xfffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource extension_rom_resource = {
- .name = "Extension ROM",
- .start = 0xe0000,
- .end = 0xeffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource adapter_rom_resources[] = { {
- .name = "Adapter ROM",
- .start = 0xc8000,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-} };
-
-static struct resource video_rom_resource = {
- .name = "Video ROM",
- .start = 0xc0000,
- .end = 0xc7fff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-#define ROMSIGNATURE 0xaa55
-
-static int __init romsignature(const unsigned char *rom)
-{
- const unsigned short * const ptr = (const unsigned short *)rom;
- unsigned short sig;
-
- return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
-}
-
-static int __init romchecksum(const unsigned char *rom, unsigned long length)
-{
- unsigned char sum, c;
-
- for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
- sum += c;
- return !length && !sum;
-}
-
-static void __init probe_roms(void)
-{
- const unsigned char *rom;
- unsigned long start, length, upper;
- unsigned char c;
- int i;
-
-#ifdef CONFIG_XEN
- /* Nothing to do if not running in dom0. */
- if (!is_initial_xendomain())
- return;
-#endif
-
- /* video rom */
- upper = adapter_rom_resources[0].start;
- for (start = video_rom_resource.start; start < upper; start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- video_rom_resource.start = start;
-
- if (probe_kernel_address(rom + 2, c) != 0)
- continue;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = c * 512;
-
- /* if checksum okay, trust length byte */
- if (length && romchecksum(rom, length))
- video_rom_resource.end = start + length - 1;
-
- request_resource(&iomem_resource, &video_rom_resource);
- break;
- }
-
- start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
- if (start < upper)
- start = upper;
-
- /* system rom */
- request_resource(&iomem_resource, &system_rom_resource);
- upper = system_rom_resource.start;
-
- /* check for extension rom (ignore length byte!) */
- rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
- if (romsignature(rom)) {
- length = extension_rom_resource.end - extension_rom_resource.start + 1;
- if (romchecksum(rom, length)) {
- request_resource(&iomem_resource, &extension_rom_resource);
- upper = extension_rom_resource.start;
- }
- }
-
- /* check for adapter roms on 2k boundaries */
- for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- if (probe_kernel_address(rom + 2, c) != 0)
- continue;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = c * 512;
-
- /* but accept any length that fits if checksum okay */
- if (!length || start + length > upper || !romchecksum(rom, length))
- continue;
-
- adapter_rom_resources[i].start = start;
- adapter_rom_resources[i].end = start + length - 1;
- request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
- start = adapter_rom_resources[i++].end & ~2047UL;
- }
-}
-
-#ifdef CONFIG_XEN
-static struct e820map machine_e820;
-#define e820 machine_e820
-#endif
-
-/*
- * Request address space for all standard RAM and ROM resources
- * and also for regions reported as reserved by the e820.
- */
-void __init init_iomem_resources(struct resource *code_resource,
- struct resource *data_resource,
- struct resource *bss_resource)
-{
- int i;
-
- probe_roms();
- for (i = 0; i < e820.nr_map; i++) {
- struct resource *res;
-#ifndef CONFIG_RESOURCES_64BIT
- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
- continue;
-#endif
- res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
- switch (e820.map[i].type) {
- case E820_RAM: res->name = "System RAM"; break;
- case E820_ACPI: res->name = "ACPI Tables"; break;
- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
- default: res->name = "reserved";
- }
- res->start = e820.map[i].addr;
- res->end = res->start + e820.map[i].size - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- if (request_resource(&iomem_resource, res)) {
- kfree(res);
- continue;
- }
- if (e820.map[i].type == E820_RAM) {
- /*
- * We don't know which RAM region contains kernel data,
- * so we try it repeatedly and let the resource manager
- * test it.
- */
-#ifndef CONFIG_XEN
- request_resource(res, code_resource);
- request_resource(res, data_resource);
- request_resource(res, bss_resource);
-#endif
-#ifdef CONFIG_KEXEC
- if (crashk_res.start != crashk_res.end)
- request_resource(res, &crashk_res);
-#ifdef CONFIG_XEN
- xen_machine_kexec_register_resources(res);
-#endif
-#endif
- }
- }
-}
-
-#undef e820
-
-#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
-/**
- * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
- * correspond to e820 RAM areas and mark the corresponding pages as nosave for
- * hibernation.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
- int i;
- unsigned long pfn;
-
- pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
- for (i = 1; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-
- if (pfn < PFN_UP(ei->addr))
- register_nosave_region(pfn, PFN_UP(ei->addr));
-
- pfn = PFN_DOWN(ei->addr + ei->size);
- if (ei->type != E820_RAM)
- register_nosave_region(PFN_UP(ei->addr), pfn);
-
- if (pfn >= max_low_pfn)
- break;
- }
-}
-#endif
-
-void __init add_memory_region(unsigned long long start,
- unsigned long long size, int type)
-{
- int x;
-
- x = e820.nr_map;
-
- if (x == E820MAX) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
- return;
- }
-
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
-} /* add_memory_region */
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
-{
- struct change_member *change_tmp;
- unsigned long current_type, last_type;
- unsigned long long last_addr;
- int chgidx, still_changing;
- int overlap_entries;
- int new_bios_entry;
- int old_nr, new_nr, chg_nr;
- int i;
-
- /*
- Visually we're performing the following (1,2,3,4 = memory types)...
-
- Sample memory map (w/overlaps):
- ____22__________________
- ______________________4_
- ____1111________________
- _44_____________________
- 11111111________________
- ____________________33__
- ___________44___________
- __________33333_________
- ______________22________
- ___________________2222_
- _________111111111______
- _____________________11_
- _________________4______
-
- Sanitized equivalent (no overlap):
- 1_______________________
- _44_____________________
- ___1____________________
- ____22__________________
- ______11________________
- _________1______________
- __________3_____________
- ___________44___________
- _____________33_________
- _______________2________
- ________________1_______
- _________________4______
- ___________________2____
- ____________________33__
- ______________________4_
- */
- /* if there's only one memory region, don't bother */
- if (*pnr_map < 2) {
- return -1;
- }
-
- old_nr = *pnr_map;
-
- /* bail out if we find any unreasonable addresses in bios map */
- for (i=0; i<old_nr; i++)
- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
- return -1;
- }
-
- /* create pointers for initial change-point information (for sorting) */
- for (i=0; i < 2*old_nr; i++)
- change_point[i] = &change_point_list[i];
-
- /* record all known change-points (starting and ending addresses),
- omitting those that are for empty memory regions */
- chgidx = 0;
- for (i=0; i < old_nr; i++) {
- if (biosmap[i].size != 0) {
- change_point[chgidx]->addr = biosmap[i].addr;
- change_point[chgidx++]->pbios = &biosmap[i];
- change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
- change_point[chgidx++]->pbios = &biosmap[i];
- }
- }
- chg_nr = chgidx; /* true number of change-points */
-
- /* sort change-point list by memory addresses (low -> high) */
- still_changing = 1;
- while (still_changing) {
- still_changing = 0;
- for (i=1; i < chg_nr; i++) {
- /* if <current_addr> > <last_addr>, swap */
- /* or, if current=<start_addr> & last=<end_addr>, swap */
- if ((change_point[i]->addr < change_point[i-1]->addr) ||
- ((change_point[i]->addr == change_point[i-1]->addr) &&
- (change_point[i]->addr == change_point[i]->pbios->addr) &&
- (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
- )
- {
- change_tmp = change_point[i];
- change_point[i] = change_point[i-1];
- change_point[i-1] = change_tmp;
- still_changing=1;
- }
- }
- }
-
- /* create a new bios memory map, removing overlaps */
- overlap_entries=0; /* number of entries in the overlap table */
- new_bios_entry=0; /* index for creating new bios map entries */
- last_type = 0; /* start with undefined memory type */
- last_addr = 0; /* start with 0 as last starting address */
- /* loop through change-points, determining affect on the new bios map */
- for (chgidx=0; chgidx < chg_nr; chgidx++)
- {
- /* keep track of all overlapping bios entries */
- if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
- {
- /* add map entry to overlap list (> 1 entry implies an overlap) */
- overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
- }
- else
- {
- /* remove entry from list (order independent, so swap with last) */
- for (i=0; i<overlap_entries; i++)
- {
- if (overlap_list[i] == change_point[chgidx]->pbios)
- overlap_list[i] = overlap_list[overlap_entries-1];
- }
- overlap_entries--;
- }
- /* if there are overlapping entries, decide which "type" to use */
- /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
- current_type = 0;
- for (i=0; i<overlap_entries; i++)
- if (overlap_list[i]->type > current_type)
- current_type = overlap_list[i]->type;
- /* continue building up new bios map based on this information */
- if (current_type != last_type) {
- if (last_type != 0) {
- new_bios[new_bios_entry].size =
- change_point[chgidx]->addr - last_addr;
- /* move forward only if the new size was non-zero */
- if (new_bios[new_bios_entry].size != 0)
- if (++new_bios_entry >= E820MAX)
- break; /* no more space left for new bios entries */
- }
- if (current_type != 0) {
- new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
- new_bios[new_bios_entry].type = current_type;
- last_addr=change_point[chgidx]->addr;
- }
- last_type = current_type;
- }
- }
- new_nr = new_bios_entry; /* retain count for new bios entries */
-
- /* copy new bios mapping into original location */
- memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
- *pnr_map = new_nr;
-
- return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory. If we aren't, we'll fake a memory map.
- *
- * We check to see that the memory map contains at least 2 elements
- * before we'll use it, because the detection code in setup.S may
- * not be perfect and most every PC known to man has two memory
- * regions: one from 0 to 640k, and one from 1mb up. (The IBM
- * thinkpad 560x, for example, does not cooperate with the memory
- * detection code.)
- */
-int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
-{
-#ifndef CONFIG_XEN
- /* Only one memory region (or negative)? Ignore it */
- if (nr_map < 2)
- return -1;
-#else
- BUG_ON(nr_map < 1);
-#endif
-
- do {
- u64 start = biosmap->addr;
- u64 size = biosmap->size;
- u64 end = start + size;
- u32 type = biosmap->type;
-
- /* Overflow in 64 bits? Ignore the memory map. */
- if (start > end)
- return -1;
-
- add_memory_region(start, size, type);
- } while (biosmap++, --nr_map);
-
-#ifdef CONFIG_XEN
- if (is_initial_xendomain()) {
- struct xen_memory_map memmap;
-
- memmap.nr_entries = E820MAX;
- set_xen_guest_handle(memmap.buffer, machine_e820.map);
-
- if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
- BUG();
- machine_e820.nr_map = memmap.nr_entries;
- } else
- machine_e820 = e820;
-#endif
-
- return 0;
-}
-
-/*
- * Find the highest page frame number we have available
- */
-void __init propagate_e820_map(void)
-{
- int i;
-
- max_pfn = 0;
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long start, end;
- /* RAM? */
- if (e820.map[i].type != E820_RAM)
- continue;
- start = PFN_UP(e820.map[i].addr);
- end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
- if (start >= end)
- continue;
- if (end > max_pfn)
- max_pfn = end;
- memory_present(0, start, end);
- }
-}
-
-/*
- * Register fully available low RAM pages with the bootmem allocator.
- */
-void __init register_bootmem_low_pages(unsigned long max_low_pfn)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long curr_pfn, last_pfn, size;
- /*
- * Reserve usable low memory
- */
- if (e820.map[i].type != E820_RAM)
- continue;
- /*
- * We are rounding up the start address of usable memory:
- */
- curr_pfn = PFN_UP(e820.map[i].addr);
- if (curr_pfn >= max_low_pfn)
- continue;
- /*
- * ... and at the end of the usable range downwards:
- */
- last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-
-#ifdef CONFIG_XEN
- /*
- * Truncate to the number of actual pages currently
- * present.
- */
- if (last_pfn > xen_start_info->nr_pages)
- last_pfn = xen_start_info->nr_pages;
-#endif
-
- if (last_pfn > max_low_pfn)
- last_pfn = max_low_pfn;
-
- /*
- * .. finally, did all the rounding and playing
- * around just make the area go away?
- */
- if (last_pfn <= curr_pfn)
- continue;
-
- size = last_pfn - curr_pfn;
- free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
- }
-}
-
-void __init e820_register_memory(void)
-{
- unsigned long gapstart, gapsize, round;
- unsigned long long last;
- int i;
-
-#ifdef CONFIG_XEN
-#define e820 machine_e820
-#endif
- /*
- * Search for the biggest gap in the low 32 bits of the e820
- * memory space.
- */
- last = 0x100000000ull;
- gapstart = 0x10000000;
- gapsize = 0x400000;
- i = e820.nr_map;
- while (--i >= 0) {
- unsigned long long start = e820.map[i].addr;
- unsigned long long end = start + e820.map[i].size;
-
- /*
- * Since "last" is at most 4GB, we know we'll
- * fit in 32 bits if this condition is true
- */
- if (last > end) {
- unsigned long gap = last - end;
-
- if (gap > gapsize) {
- gapsize = gap;
- gapstart = end;
- }
- }
- if (start < last)
- last = start;
- }
-#undef e820
-
- /*
- * See how much we want to round up: start off with
- * rounding to the next 1MB area.
- */
- round = 0x100000;
- while ((gapsize >> 4) > round)
- round += round;
- /* Fun with two's complement */
- pci_mem_start = (gapstart + round) & -round;
-
- printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
- pci_mem_start, gapstart, gapsize);
-}
-
-void __init print_memory_map(char *who)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- printk(" %s: %016Lx - %016Lx ", who,
- e820.map[i].addr,
- e820.map[i].addr + e820.map[i].size);
- switch (e820.map[i].type) {
- case E820_RAM: printk("(usable)\n");
- break;
- case E820_RESERVED:
- printk("(reserved)\n");
- break;
- case E820_ACPI:
- printk("(ACPI data)\n");
- break;
- case E820_NVS:
- printk("(ACPI NVS)\n");
- break;
- default: printk("type %u\n", e820.map[i].type);
- break;
- }
- }
-}
-
-void __init limit_regions(unsigned long long size)
-{
- unsigned long long current_addr = 0;
- int i;
-
- print_memory_map("limit_regions start");
- for (i = 0; i < e820.nr_map; i++) {
- current_addr = e820.map[i].addr + e820.map[i].size;
- if (current_addr < size)
- continue;
-
- if (e820.map[i].type != E820_RAM)
- continue;
-
- if (e820.map[i].addr >= size) {
- /*
- * This region starts past the end of the
- * requested size, skip it completely.
- */
- e820.nr_map = i;
- } else {
- e820.nr_map = i + 1;
- e820.map[i].size -= current_addr - size;
- }
- print_memory_map("limit_regions endfor");
- return;
- }
-#ifdef CONFIG_XEN
- if (current_addr < size) {
- /*
- * The e820 map finished before our requested size so
- * extend the final entry to the requested address.
- */
- --i;
- if (e820.map[i].type == E820_RAM)
- e820.map[i].size -= current_addr - size;
- else
- add_memory_region(current_addr, size - current_addr, E820_RAM);
- }
-#endif
- print_memory_map("limit_regions endfunc");
-}
-
-/*
- * This function checks if any part of the range <start,end> is mapped
- * with type.
- */
-int
-e820_any_mapped(u64 start, u64 end, unsigned type)
-{
- int i;
-
-#ifndef CONFIG_XEN
- for (i = 0; i < e820.nr_map; i++) {
- const struct e820entry *ei = &e820.map[i];
-#else
- if (!is_initial_xendomain())
- return 0;
- for (i = 0; i < machine_e820.nr_map; ++i) {
- const struct e820entry *ei = &machine_e820.map[i];
-#endif
-
- if (type && ei->type != type)
- continue;
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
-
- /*
- * This function checks if the entire range <start,end> is mapped with type.
- *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
- */
-int __init
-e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
-{
- u64 start = s;
- u64 end = e;
- int i;
-
-#ifndef CONFIG_XEN
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-#else
- if (!is_initial_xendomain())
- return 0;
- for (i = 0; i < machine_e820.nr_map; ++i) {
- const struct e820entry *ei = &machine_e820.map[i];
-#endif
-
- if (type && ei->type != type)
- continue;
- /* is the region (part) in overlap with the current region ?*/
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
- /* if the region is at the beginning of <start,end> we move
- * start to the end of the region since it's ok until there
- */
- if (ei->addr <= start)
- start = ei->addr + ei->size;
- /* if start is now at or beyond end, we're done, full
- * coverage */
- if (start >= end)
- return 1; /* we're done */
- }
- return 0;
-}
-
-static int __init parse_memmap(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp(arg, "exactmap") == 0) {
-#ifdef CONFIG_CRASH_DUMP
- /* If we are doing a crash dump, we
- * still need to know the real mem
- * size before original memory map is
- * reset.
- */
- propagate_e820_map();
- saved_max_pfn = max_pfn;
-#endif
- e820.nr_map = 0;
- user_defined_memmap = 1;
- } else {
- /* If the user specifies memory size, we
- * limit the BIOS-provided memory map to
- * that size. exactmap can be used to specify
- * the exact map. mem=number can be used to
- * trim the existing memory map.
- */
- unsigned long long start_at, mem_size;
-
- mem_size = memparse(arg, &arg);
- if (*arg == '@') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_RAM);
- } else if (*arg == '#') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_ACPI);
- } else if (*arg == '$') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_RESERVED);
- } else {
- limit_regions(mem_size);
- user_defined_memmap = 1;
- }
- }
- return 0;
-}
-early_param("memmap", parse_memmap);
-
-#ifndef CONFIG_XEN
-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
- unsigned new_type)
-{
- int i;
-
- BUG_ON(old_type == new_type);
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 final_start, final_end;
- if (ei->type != old_type)
- continue;
- /* totally covered? */
- if (ei->addr >= start && ei->size <= size) {
- ei->type = new_type;
- continue;
- }
- /* partially covered */
- final_start = max(start, ei->addr);
- final_end = min(start + size, ei->addr + ei->size);
- if (final_start >= final_end)
- continue;
- add_memory_region(final_start, final_end - final_start,
- new_type);
- }
-}
-
-void __init update_e820(void)
-{
- u8 nr_map;
-
- nr_map = e820.nr_map;
- if (sanitize_e820_map(e820.map, &nr_map))
- return;
- e820.nr_map = nr_map;
- printk(KERN_INFO "modified physical RAM map:\n");
- print_memory_map("modified");
-}
-#endif
--- head-2011-03-11.orig/arch/x86/kernel/e820_64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,1052 +0,0 @@
-/*
- * Handle the memory map.
- * The functions here do the job until bootmem takes over.
- *
- * Getting sanitize_e820_map() in sync with i386 version by applying change:
- * - Provisions for empty E820 memory regions (reported by certain BIOSes).
- * Alex Achenbach <xela@slit.de>, December 2002.
- * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- *
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/suspend.h>
-#include <linux/pfn.h>
-
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/proto.h>
-#include <asm/setup.h>
-#include <asm/sections.h>
-#include <asm/kdebug.h>
-#include <xen/interface/memory.h>
-
-struct e820map e820 __initdata;
-#ifdef CONFIG_XEN
-struct e820map machine_e820;
-#endif
-
-/*
- * PFN of last memory page.
- */
-unsigned long end_pfn;
-
-#ifndef CONFIG_XEN
-/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
- */
-unsigned long max_pfn_mapped;
-#endif
-
-/*
- * Last pfn which the user wants to use.
- */
-static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
-
-/*
- * Early reserved memory areas.
- */
-#define MAX_EARLY_RES 20
-
-struct early_res {
- unsigned long start, end;
- char name[16];
-};
-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
-#ifndef CONFIG_XEN
- { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
-#ifdef CONFIG_X86_TRAMPOLINE
- { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
-#endif
-#endif
- {}
-};
-
-void __init reserve_early(unsigned long start, unsigned long end, char *name)
-{
- int i;
- struct early_res *r;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
- if (end > r->start && start < r->end)
- panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
- start, end - 1, name?name:"", r->start, r->end - 1, r->name);
- }
- if (i >= MAX_EARLY_RES)
- panic("Too many early reservations");
- r = &early_res[i];
- r->start = start;
- r->end = end;
- if (name)
- strncpy(r->name, name, sizeof(r->name) - 1);
-}
-
-void __init free_early(unsigned long start, unsigned long end)
-{
- struct early_res *r;
- int i, j;
-
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
- if (start == r->start && end == r->end)
- break;
- }
- if (i >= MAX_EARLY_RES || !early_res[i].end)
- panic("free_early on not reserved area: %lx-%lx!", start, end);
-
- for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
- ;
-
- memmove(&early_res[i], &early_res[i + 1],
- (j - 1 - i) * sizeof(struct early_res));
-
- early_res[j - 1].end = 0;
-}
-
-void __init early_res_to_bootmem(unsigned long start, unsigned long end)
-{
- int i;
- unsigned long final_start, final_end;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- final_start = max(start, r->start);
- final_end = min(end, r->end);
- if (final_start >= final_end)
- continue;
- printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
- final_start, final_end - 1, r->name);
- reserve_bootmem_generic(final_start, final_end - final_start);
- }
-}
-
-/* Check for already reserved areas */
-static inline int __init
-bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
-{
- int i;
- unsigned long addr = *addrp, last;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last >= r->start && addr < r->end) {
- *addrp = addr = round_up(r->end, align);
- changed = 1;
- goto again;
- }
- }
- return changed;
-}
-
-/* Check for already reserved areas */
-static inline int __init
-bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
-{
- int i;
- unsigned long addr = *addrp, last;
- unsigned long size = *sizep;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last > r->start && addr < r->start) {
- size = r->start - addr;
- changed = 1;
- goto again;
- }
- if (last > r->end && addr < r->end) {
- addr = round_up(r->end, align);
- size = last - addr;
- changed = 1;
- goto again;
- }
- if (last <= r->end && addr >= r->start) {
- (*sizep)++;
- return 0;
- }
- }
- if (changed) {
- *addrp = addr;
- *sizep = size;
- }
- return changed;
-}
-/*
- * This function checks if any part of the range <start,end> is mapped
- * with type.
- */
-int
-e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
-{
- int i;
-
-#ifndef CONFIG_XEN
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-#else
- if (!is_initial_xendomain())
- return 0;
- for (i = 0; i < machine_e820.nr_map; i++) {
- const struct e820entry *ei = &machine_e820.map[i];
-#endif
-
- if (type && ei->type != type)
- continue;
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
-
-/*
- * This function checks if the entire range <start,end> is mapped with type.
- *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
- */
-int __init e820_all_mapped(unsigned long start, unsigned long end,
- unsigned type)
-{
- int i;
-
-#ifndef CONFIG_XEN
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-#else
- if (!is_initial_xendomain())
- return 0;
- for (i = 0; i < machine_e820.nr_map; i++) {
- const struct e820entry *ei = &machine_e820.map[i];
-#endif
-
- if (type && ei->type != type)
- continue;
- /* is the region (part) in overlap with the current region ?*/
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
-
- /* if the region is at the beginning of <start,end> we move
- * start to the end of the region since it's ok until there
- */
- if (ei->addr <= start)
- start = ei->addr + ei->size;
- /*
- * if start is now at or beyond end, we're done, full
- * coverage
- */
- if (start >= end)
- return 1;
- }
- return 0;
-}
-
-/*
- * Find a free area with specified alignment in a specific range.
- */
-unsigned long __init find_e820_area(unsigned long start, unsigned long end,
- unsigned long size, unsigned long align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long addr, last;
- unsigned long ei_last;
-
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
- ;
- last = addr + size;
- if (last > ei_last)
- continue;
- if (last > end)
- continue;
- return addr;
- }
- return -1UL;
-}
-
-/*
- * Find next free range after *start
- */
-unsigned long __init find_e820_area_size(unsigned long start,
- unsigned long *sizep,
- unsigned long align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long addr, last;
- unsigned long ei_last;
-
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- *sizep = ei_last - addr;
- while (bad_addr_size(&addr, sizep, align) &&
- addr + *sizep <= ei_last)
- ;
- last = addr + *sizep;
- if (last > ei_last)
- continue;
- return addr;
- }
- return -1UL;
-
-}
-/*
- * Find the highest page frame number we have available
- */
-unsigned long __init e820_end_of_ram(void)
-{
- unsigned long end_pfn;
-
- end_pfn = find_max_pfn_with_active_regions();
-
- if (end_pfn > max_pfn_mapped)
- max_pfn_mapped = end_pfn;
- if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
- max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
- if (end_pfn > end_user_pfn)
- end_pfn = end_user_pfn;
- if (end_pfn > max_pfn_mapped)
- end_pfn = max_pfn_mapped;
-
- printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
- return end_pfn;
-}
-
-/*
- * Mark e820 reserved areas as busy for the resource manager.
- */
-void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
-{
- int i;
- struct resource *res;
-
- res = alloc_bootmem_low(sizeof(struct resource) * nr_map);
- for (i = 0; i < nr_map; i++) {
- switch (e820[i].type) {
- case E820_RAM: res->name = "System RAM"; break;
- case E820_ACPI: res->name = "ACPI Tables"; break;
- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
- default: res->name = "reserved";
- }
- res->start = e820[i].addr;
- res->end = res->start + e820[i].size - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- insert_resource(&iomem_resource, res);
- res++;
- }
-}
-
-#ifndef CONFIG_XEN
-/*
- * Find the ranges of physical addresses that do not correspond to
- * e820 RAM areas and mark the corresponding pages as nosave for software
- * suspend and suspend to RAM.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
- int i;
- unsigned long paddr;
-
- paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
- for (i = 1; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-
- if (paddr < ei->addr)
- register_nosave_region(PFN_DOWN(paddr),
- PFN_UP(ei->addr));
-
- paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (ei->type != E820_RAM)
- register_nosave_region(PFN_UP(ei->addr),
- PFN_DOWN(paddr));
-
- if (paddr >= (end_pfn << PAGE_SHIFT))
- break;
- }
-}
-#endif
-
-/*
- * Finds an active region in the address range from start_pfn to end_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
- */
-static int __init e820_find_active_region(const struct e820entry *ei,
- unsigned long start_pfn,
- unsigned long end_pfn,
- unsigned long *ei_startpfn,
- unsigned long *ei_endpfn)
-{
-#ifdef CONFIG_XEN
- if (end_pfn > xen_start_info->nr_pages)
- end_pfn = xen_start_info->nr_pages;
-#endif
-
- *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
- *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
-
- /* Skip map entries smaller than a page */
- if (*ei_startpfn >= *ei_endpfn)
- return 0;
-
- /* Check if max_pfn_mapped should be updated */
- if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
- max_pfn_mapped = *ei_endpfn;
-
- /* Skip if map is outside the node */
- if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
- *ei_startpfn >= end_pfn)
- return 0;
-
- /* Check for overlaps */
- if (*ei_startpfn < start_pfn)
- *ei_startpfn = start_pfn;
- if (*ei_endpfn > end_pfn)
- *ei_endpfn = end_pfn;
-
- /* Obey end_user_pfn to save on memmap */
- if (*ei_startpfn >= end_user_pfn)
- return 0;
- if (*ei_endpfn > end_user_pfn)
- *ei_endpfn = end_user_pfn;
-
- return 1;
-}
-
-/* Walk the e820 map and register active regions within a node */
-void __init
-e820_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long ei_startpfn;
- unsigned long ei_endpfn;
- int i;
-
- for (i = 0; i < e820.nr_map; i++)
- if (e820_find_active_region(&e820.map[i],
- start_pfn, end_pfn,
- &ei_startpfn, &ei_endpfn))
- add_active_range(nid, ei_startpfn, ei_endpfn);
-#ifdef CONFIG_XEN
- BUG_ON(nid);
- add_active_range(nid, end_pfn, end_pfn);
-#endif
-}
-
-/*
- * Add a memory region to the kernel e820 map.
- */
-void __init add_memory_region(unsigned long start, unsigned long size, int type)
-{
- int x = e820.nr_map;
-
- if (x == E820MAX) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
- return;
- }
-
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
-}
-
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long end_pfn = end >> PAGE_SHIFT;
- unsigned long ei_startpfn, ei_endpfn, ram = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- if (e820_find_active_region(&e820.map[i],
- start_pfn, end_pfn,
- &ei_startpfn, &ei_endpfn))
- ram += ei_endpfn - ei_startpfn;
- }
- return end - start - (ram << PAGE_SHIFT);
-}
-
-static void __init e820_print_map(char *who)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
- (unsigned long long) e820.map[i].addr,
- (unsigned long long)
- (e820.map[i].addr + e820.map[i].size));
- switch (e820.map[i].type) {
- case E820_RAM:
- printk(KERN_CONT "(usable)\n");
- break;
- case E820_RESERVED:
- printk(KERN_CONT "(reserved)\n");
- break;
- case E820_ACPI:
- printk(KERN_CONT "(ACPI data)\n");
- break;
- case E820_NVS:
- printk(KERN_CONT "(ACPI NVS)\n");
- break;
- default:
- printk(KERN_CONT "type %u\n", e820.map[i].type);
- break;
- }
- }
-}
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
-{
- struct change_member {
- struct e820entry *pbios; /* pointer to original bios entry */
- unsigned long long addr; /* address for this change point */
- };
- static struct change_member change_point_list[2*E820MAX] __initdata;
- static struct change_member *change_point[2*E820MAX] __initdata;
- static struct e820entry *overlap_list[E820MAX] __initdata;
- static struct e820entry new_bios[E820MAX] __initdata;
- struct change_member *change_tmp;
- unsigned long current_type, last_type;
- unsigned long long last_addr;
- int chgidx, still_changing;
- int overlap_entries;
- int new_bios_entry;
- int old_nr, new_nr, chg_nr;
- int i;
-
- /*
- Visually we're performing the following
- (1,2,3,4 = memory types)...
-
- Sample memory map (w/overlaps):
- ____22__________________
- ______________________4_
- ____1111________________
- _44_____________________
- 11111111________________
- ____________________33__
- ___________44___________
- __________33333_________
- ______________22________
- ___________________2222_
- _________111111111______
- _____________________11_
- _________________4______
-
- Sanitized equivalent (no overlap):
- 1_______________________
- _44_____________________
- ___1____________________
- ____22__________________
- ______11________________
- _________1______________
- __________3_____________
- ___________44___________
- _____________33_________
- _______________2________
- ________________1_______
- _________________4______
- ___________________2____
- ____________________33__
- ______________________4_
- */
-
- /* if there's only one memory region, don't bother */
- if (*pnr_map < 2)
- return -1;
-
- old_nr = *pnr_map;
-
- /* bail out if we find any unreasonable addresses in bios map */
- for (i = 0; i < old_nr; i++)
- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
- return -1;
-
- /* create pointers for initial change-point information (for sorting) */
- for (i = 0; i < 2 * old_nr; i++)
- change_point[i] = &change_point_list[i];
-
- /* record all known change-points (starting and ending addresses),
- omitting those that are for empty memory regions */
- chgidx = 0;
- for (i = 0; i < old_nr; i++) {
- if (biosmap[i].size != 0) {
- change_point[chgidx]->addr = biosmap[i].addr;
- change_point[chgidx++]->pbios = &biosmap[i];
- change_point[chgidx]->addr = biosmap[i].addr +
- biosmap[i].size;
- change_point[chgidx++]->pbios = &biosmap[i];
- }
- }
- chg_nr = chgidx;
-
- /* sort change-point list by memory addresses (low -> high) */
- still_changing = 1;
- while (still_changing) {
- still_changing = 0;
- for (i = 1; i < chg_nr; i++) {
- unsigned long long curaddr, lastaddr;
- unsigned long long curpbaddr, lastpbaddr;
-
- curaddr = change_point[i]->addr;
- lastaddr = change_point[i - 1]->addr;
- curpbaddr = change_point[i]->pbios->addr;
- lastpbaddr = change_point[i - 1]->pbios->addr;
-
- /*
- * swap entries, when:
- *
- * curaddr > lastaddr or
- * curaddr == lastaddr and curaddr == curpbaddr and
- * lastaddr != lastpbaddr
- */
- if (curaddr < lastaddr ||
- (curaddr == lastaddr && curaddr == curpbaddr &&
- lastaddr != lastpbaddr)) {
- change_tmp = change_point[i];
- change_point[i] = change_point[i-1];
- change_point[i-1] = change_tmp;
- still_changing = 1;
- }
- }
- }
-
- /* create a new bios memory map, removing overlaps */
- overlap_entries = 0; /* number of entries in the overlap table */
- new_bios_entry = 0; /* index for creating new bios map entries */
- last_type = 0; /* start with undefined memory type */
- last_addr = 0; /* start with 0 as last starting address */
-
- /* loop through change-points, determining affect on the new bios map */
- for (chgidx = 0; chgidx < chg_nr; chgidx++) {
- /* keep track of all overlapping bios entries */
- if (change_point[chgidx]->addr ==
- change_point[chgidx]->pbios->addr) {
- /*
- * add map entry to overlap list (> 1 entry
- * implies an overlap)
- */
- overlap_list[overlap_entries++] =
- change_point[chgidx]->pbios;
- } else {
- /*
- * remove entry from list (order independent,
- * so swap with last)
- */
- for (i = 0; i < overlap_entries; i++) {
- if (overlap_list[i] ==
- change_point[chgidx]->pbios)
- overlap_list[i] =
- overlap_list[overlap_entries-1];
- }
- overlap_entries--;
- }
- /*
- * if there are overlapping entries, decide which
- * "type" to use (larger value takes precedence --
- * 1=usable, 2,3,4,4+=unusable)
- */
- current_type = 0;
- for (i = 0; i < overlap_entries; i++)
- if (overlap_list[i]->type > current_type)
- current_type = overlap_list[i]->type;
- /*
- * continue building up new bios map based on this
- * information
- */
- if (current_type != last_type) {
- if (last_type != 0) {
- new_bios[new_bios_entry].size =
- change_point[chgidx]->addr - last_addr;
- /*
- * move forward only if the new size
- * was non-zero
- */
- if (new_bios[new_bios_entry].size != 0)
- /*
- * no more space left for new
- * bios entries ?
- */
- if (++new_bios_entry >= E820MAX)
- break;
- }
- if (current_type != 0) {
- new_bios[new_bios_entry].addr =
- change_point[chgidx]->addr;
- new_bios[new_bios_entry].type = current_type;
- last_addr = change_point[chgidx]->addr;
- }
- last_type = current_type;
- }
- }
- /* retain count for new bios entries */
- new_nr = new_bios_entry;
-
- /* copy new bios mapping into original location */
- memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
- *pnr_map = new_nr;
-
- return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory. If we aren't, we'll fake a memory map.
- */
-static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
-{
-#ifndef CONFIG_XEN
- /* Only one memory region (or negative)? Ignore it */
- if (nr_map < 2)
- return -1;
-#else
- BUG_ON(nr_map < 1);
-#endif
-
- do {
- u64 start = biosmap->addr;
- u64 size = biosmap->size;
- u64 end = start + size;
- u32 type = biosmap->type;
-
- /* Overflow in 64 bits? Ignore the memory map. */
- if (start > end)
- return -1;
-
- add_memory_region(start, size, type);
- } while (biosmap++, --nr_map);
-
-#ifdef CONFIG_XEN
- if (is_initial_xendomain()) {
- struct xen_memory_map memmap;
-
- memmap.nr_entries = E820MAX;
- set_xen_guest_handle(memmap.buffer, machine_e820.map);
-
- if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
- BUG();
- machine_e820.nr_map = memmap.nr_entries;
- } else
- machine_e820 = e820;
-#endif
-
- return 0;
-}
-
-static void early_panic(char *msg)
-{
- early_printk(msg);
- panic(msg);
-}
-
-/* We're not void only for x86 32-bit compat */
-char * __init machine_specific_memory_setup(void)
-{
-#ifndef CONFIG_XEN
- char *who = "BIOS-e820";
- /*
- * Try to copy the BIOS-supplied E820-map.
- *
- * Otherwise fake a memory map; one section from 0k->640k,
- * the next section from 1mb->appropriate_mem_k
- */
- sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
- if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
- early_panic("Cannot find a valid memory map");
-#else /* CONFIG_XEN */
- char *who = "Xen";
- int rc;
- struct xen_memory_map memmap;
- static struct e820entry __initdata map[E820MAX];
-
- memmap.nr_entries = E820MAX;
- set_xen_guest_handle(memmap.buffer, map);
-
- rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
- if ( rc == -ENOSYS ) {
- memmap.nr_entries = 1;
- map[0].addr = 0ULL;
- map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
- /* 8MB slack (to balance backend allocations). */
- map[0].size += 8 << 20;
- map[0].type = E820_RAM;
- rc = 0;
- }
- BUG_ON(rc);
-
- sanitize_e820_map(map, (char *)&memmap.nr_entries);
-
- if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
- early_panic("Cannot find a valid memory map");
-#endif
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- e820_print_map(who);
-
- /* In case someone cares... */
- return who;
-}
-
-static int __init parse_memopt(char *p)
-{
- int i;
- unsigned long current_end;
- unsigned long end;
-
- if (!p)
- return -EINVAL;
- end_user_pfn = memparse(p, &p);
- end_user_pfn >>= PAGE_SHIFT;
-
- end = end_user_pfn<<PAGE_SHIFT;
- i = e820.nr_map-1;
- current_end = e820.map[i].addr + e820.map[i].size;
-
- if (current_end < end) {
- /*
- * The e820 map ends before our requested size so
- * extend the final entry to the requested address.
- */
- if (e820.map[i].type == E820_RAM)
- e820.map[i].size = end - e820.map[i].addr;
- else
- add_memory_region(current_end, end - current_end, E820_RAM);
- }
-
- return 0;
-}
-early_param("mem", parse_memopt);
-
-static int userdef __initdata;
-
-static int __init parse_memmap_opt(char *p)
-{
- char *oldp;
- unsigned long long start_at, mem_size;
-
- if (!strcmp(p, "exactmap")) {
-#ifdef CONFIG_CRASH_DUMP
- /*
- * If we are doing a crash dump, we still need to know
- * the real mem size before original memory map is
- * reset.
- */
- e820_register_active_regions(0, 0, -1UL);
- saved_max_pfn = e820_end_of_ram();
- remove_all_active_ranges();
-#endif
- max_pfn_mapped = 0;
- e820.nr_map = 0;
- userdef = 1;
- return 0;
- }
-
- oldp = p;
- mem_size = memparse(p, &p);
- if (p == oldp)
- return -EINVAL;
-
- userdef = 1;
- if (*p == '@') {
- start_at = memparse(p+1, &p);
- add_memory_region(start_at, mem_size, E820_RAM);
- } else if (*p == '#') {
- start_at = memparse(p+1, &p);
- add_memory_region(start_at, mem_size, E820_ACPI);
- } else if (*p == '$') {
- start_at = memparse(p+1, &p);
- add_memory_region(start_at, mem_size, E820_RESERVED);
- } else {
- end_user_pfn = (mem_size >> PAGE_SHIFT);
- }
- return *p == '\0' ? 0 : -EINVAL;
-}
-early_param("memmap", parse_memmap_opt);
-
-void __init finish_e820_parsing(void)
-{
- if (userdef) {
- char nr = e820.nr_map;
-
- if (sanitize_e820_map(e820.map, &nr) < 0)
- early_panic("Invalid user supplied memory map");
- e820.nr_map = nr;
-
- printk(KERN_INFO "user-defined physical RAM map:\n");
- e820_print_map("user");
- }
-}
-
-#ifndef CONFIG_XEN
-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
- unsigned new_type)
-{
- int i;
-
- BUG_ON(old_type == new_type);
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 final_start, final_end;
- if (ei->type != old_type)
- continue;
- /* totally covered? */
- if (ei->addr >= start && ei->size <= size) {
- ei->type = new_type;
- continue;
- }
- /* partially covered */
- final_start = max(start, ei->addr);
- final_end = min(start + size, ei->addr + ei->size);
- if (final_start >= final_end)
- continue;
- add_memory_region(final_start, final_end - final_start,
- new_type);
- }
-}
-
-void __init update_e820(void)
-{
- u8 nr_map;
-
- nr_map = e820.nr_map;
- if (sanitize_e820_map(e820.map, &nr_map))
- return;
- e820.nr_map = nr_map;
- printk(KERN_INFO "modified physical RAM map:\n");
- e820_print_map("modified");
-}
-#endif
-
-unsigned long pci_mem_start = 0xaeedbabe;
-EXPORT_SYMBOL(pci_mem_start);
-
-/*
- * Search for the biggest gap in the low 32 bits of the e820
- * memory space. We pass this space to PCI to assign MMIO resources
- * for hotplug or unconfigured devices in.
- * Hopefully the BIOS let enough space left.
- */
-__init void e820_setup_gap(struct e820entry *e820, int nr_map)
-{
- unsigned long gapstart, gapsize, round;
- unsigned long last;
- int i;
- int found = 0;
-
- last = 0x100000000ull;
- gapstart = 0x10000000;
- gapsize = 0x400000;
- i = nr_map;
- while (--i >= 0) {
- unsigned long long start = e820[i].addr;
- unsigned long long end = start + e820[i].size;
-
- /*
- * Since "last" is at most 4GB, we know we'll
- * fit in 32 bits if this condition is true
- */
- if (last > end) {
- unsigned long gap = last - end;
-
- if (gap > gapsize) {
- gapsize = gap;
- gapstart = end;
- found = 1;
- }
- }
- if (start < last)
- last = start;
- }
-
- if (!found) {
- gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
- printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
- "address range\n"
- KERN_ERR "PCI: Unassigned devices with 32bit resource "
- "registers may break!\n");
- }
-
- /*
- * See how much we want to round up: start off with
- * rounding to the next 1MB area.
- */
- round = 0x100000;
- while ((gapsize >> 4) > round)
- round += round;
- /* Fun with two's complement */
- pci_mem_start = (gapstart + round) & -round;
-
- printk(KERN_INFO
- "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
- pci_mem_start, gapstart, gapsize);
-}
-
-int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
-{
- int i;
-
- if (slot < 0 || slot >= e820.nr_map)
- return -1;
- for (i = slot; i < e820.nr_map; i++) {
- if (e820.map[i].type != E820_RAM)
- continue;
- break;
- }
- if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
- return -1;
- *addr = e820.map[i].addr;
- *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
- max_pfn << PAGE_SHIFT) - *addr;
- return i + 1;
-}
--- head-2011-03-11.orig/arch/x86/kernel/early_printk-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/early_printk-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -225,7 +225,7 @@ static struct console simnow_console = {
static struct console *early_console = &early_vga_console;
static int early_console_initialized;
-void early_printk(const char *fmt, ...)
+asmlinkage void early_printk(const char *fmt, ...)
{
char buf[512];
int n;
--- head-2011-03-11.orig/arch/x86/kernel/entry_32-xen.S 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/entry_32-xen.S 2011-02-01 14:38:38.000000000 +0100
@@ -51,15 +51,26 @@
#include <asm/percpu.h>
#include <asm/dwarf2.h>
#include <asm/processor-flags.h>
-#include "irq_vectors.h"
+#include <asm/ftrace.h>
+#include <asm/irq_vectors.h>
#include <xen/interface/xen.h>
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE 0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+#define sysenter_audit syscall_trace_entry
+#define sysexit_audit syscall_exit_work
+#endif
+
/*
* We use macros for low-level operations which need to be overridden
* for paravirtualization. The following will never clobber any registers:
* INTERRUPT_RETURN (aka. "iret")
* GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
*
* For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
* specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
@@ -277,11 +288,6 @@ END(resume_kernel)
#endif
CFI_ENDPROC
- .macro test_tif ti_reg # system call tracing in operation / emulation
- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
- testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg)
- .endm
-
/* SYSENTER_RETURN points to after the "sysenter" instruction in
the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -338,8 +344,9 @@ sysenter_past_esp:
.previous
GET_THREAD_INFO(%ebp)
- test_tif %ebp
- jnz syscall_trace_entry
+ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ jnz sysenter_audit
+sysenter_do_call:
cmpl $(nr_syscalls), %eax
jae syscall_badsys
call *sys_call_table(,%eax,4)
@@ -349,14 +356,54 @@ sysenter_past_esp:
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx
- jne syscall_exit_work
+ jne sysexit_audit
+sysenter_exit:
/* if something modifies registers it must also disable sysexit */
movl PT_EIP(%esp), %edx
movl PT_OLDESP(%esp), %ecx
xorl %ebp,%ebp
TRACE_IRQS_ON
1: mov PT_FS(%esp), %fs
- ENABLE_INTERRUPTS_SYSCALL_RET
+ ENABLE_INTERRUPTS_SYSEXIT
+
+#ifdef CONFIG_AUDITSYSCALL
+sysenter_audit:
+ testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+ jnz syscall_trace_entry
+ addl $4,%esp
+ CFI_ADJUST_CFA_OFFSET -4
+ /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
+ /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
+ /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
+ movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
+ movl %eax,%edx /* 2nd arg: syscall number */
+ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
+ call audit_syscall_entry
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ movl PT_EAX(%esp),%eax /* reload syscall number */
+ jmp sysenter_do_call
+
+sysexit_audit:
+ testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ jne syscall_exit_work
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ movl %eax,%edx /* second arg, syscall return value */
+ cmpl $0,%eax /* is it < 0? */
+ setl %al /* 1 if so, 0 if not */
+ movzbl %al,%eax /* zero-extend that */
+ inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
+ call audit_syscall_exit
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ jne syscall_exit_work
+ movl PT_EAX(%esp),%eax /* reload syscall return value */
+ jmp sysenter_exit
+#endif
+
CFI_ENDPROC
.pushsection .fixup,"ax"
2: movl $0,PT_FS(%esp)
@@ -400,7 +447,7 @@ ENTRY(system_call)
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
- test_tif %ebp
+ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
@@ -413,10 +460,6 @@ syscall_exit:
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
- testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
- jz no_singlestep
- orl $_TIF_SINGLESTEP,TI_flags(%ebp)
-no_singlestep:
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx # current->work
jne syscall_exit_work
@@ -588,12 +631,8 @@ END(work_pending)
syscall_trace_entry:
movl $-ENOSYS,PT_EAX(%esp)
movl %esp, %eax
- xorl %edx,%edx
- call do_syscall_trace
- cmpl $0, %eax
- jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
- # so must skip actual syscall
- movl PT_ORIG_EAX(%esp), %eax
+ call syscall_trace_enter
+ /* What it returned is what we'll actually use. */
cmpl $(nr_syscalls), %eax
jnae syscall_call
jmp syscall_exit
@@ -602,14 +641,13 @@ END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
syscall_exit_work:
- testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
+ testb $_TIF_WORK_SYSCALL_EXIT, %cl
jz work_pending
TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
+ ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
# schedule() instead
movl %esp, %eax
- movl $1, %edx
- call do_syscall_trace
+ call syscall_trace_leave
jmp resume_userspace
END(syscall_exit_work)
CFI_ENDPROC
@@ -1113,10 +1151,10 @@ ENTRY(native_iret)
.previous
END(native_iret)
-ENTRY(native_irq_enable_syscall_ret)
+ENTRY(native_irq_enable_sysexit)
sti
sysexit
-END(native_irq_enable_syscall_ret)
+END(native_irq_enable_sysexit)
#endif
KPROBE_ENTRY(int3)
@@ -1265,6 +1303,77 @@ ENTRY(kernel_thread_helper)
CFI_ENDPROC
ENDPROC(kernel_thread_helper)
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(mcount)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ subl $MCOUNT_INSN_SIZE, %eax
+
+.globl mcount_call
+mcount_call:
+ call ftrace_stub
+
+ popl %edx
+ popl %ecx
+ popl %eax
+
+ ret
+END(mcount)
+
+ENTRY(ftrace_caller)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ movl 0x4(%ebp), %edx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub
+
+ popl %edx
+ popl %ecx
+ popl %eax
+
+.globl ftrace_stub
+ftrace_stub:
+ ret
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(mcount)
+ cmpl $ftrace_stub, ftrace_trace_function
+ jnz trace
+.globl ftrace_stub
+ftrace_stub:
+ ret
+
+ /* taken from glibc */
+trace:
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ movl 0x4(%ebp), %edx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+ call *ftrace_trace_function
+
+ popl %edx
+ popl %ecx
+ popl %eax
+
+ jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
+
#include <asm/alternative-asm.h>
# pv syscall call handler stub
@@ -1290,7 +1399,7 @@ ENTRY(ia32pv_cstar_target)
.previous
SAVE_ALL
GET_THREAD_INFO(%ebp)
- test_tif %ebp
+ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz cstar_trace_entry
cmpl $nr_syscalls,%eax
jae cstar_badsys
@@ -1324,29 +1433,21 @@ cstar_trace_entry:
btl %eax,cstar_special
jc .Lcstar_trace_special
1: movl %esp,%eax
- xorl %edx,%edx
LOCK_PREFIX
orl $_TIF_CSTAR,TI_flags(%ebp)
- call do_syscall_trace
+ call syscall_trace_enter
LOCK_PREFIX
andl $~_TIF_CSTAR,TI_flags(%ebp)
- testl %eax,%eax
- jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU,
- # so must skip actual syscall
- movl PT_ORIG_EAX(%esp),%eax
+ /* What it returned is what we'll actually use. */
cmpl $nr_syscalls,%eax
jb .Lcstar_call
jmp .Lcstar_exit
.Lcstar_trace_special:
movl PT_ECX(%esp),%ecx
movl %esp,%eax
- xorl %edx,%edx
movl %ecx,PT_EBP(%esp) # put user EBP back in place
- call do_syscall_trace
- testl %eax,%eax
- jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
- # so must skip actual syscall
- movl PT_ORIG_EAX(%esp),%eax
+ call syscall_trace_enter
+ /* What it returned is what we'll actually use. */
cmpl $nr_syscalls,%eax
jb syscall_call
jmp syscall_exit
--- head-2011-03-11.orig/arch/x86/kernel/entry_64.S 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/entry_64.S 2011-02-16 16:02:30.000000000 +0100
@@ -1253,7 +1253,7 @@ ENTRY(arch_unwind_init_running)
END(arch_unwind_init_running)
#endif
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
/*
@@ -1353,7 +1353,7 @@ END(xen_failsafe_callback)
apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
xen_hvm_callback_vector xen_evtchn_do_upcall
-#endif /* CONFIG_XEN */
+#endif /* CONFIG_PARAVIRT_XEN */
/*
* Some functions should be protected against kprobes
--- head-2011-03-11.orig/arch/x86/kernel/entry_64-xen.S 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/entry_64-xen.S 2011-02-01 14:38:38.000000000 +0100
@@ -53,12 +53,124 @@
#include <asm/hw_irq.h>
#include <asm/page.h>
#include <asm/irqflags.h>
+#include <asm/ftrace.h>
#include <asm/errno.h>
#include <xen/interface/xen.h>
#include <xen/interface/features.h>
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_64BIT 0x80000000
+#define __AUDIT_ARCH_LE 0x40000000
+
.code64
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+ENTRY(mcount)
+
+ subq $0x38, %rsp
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+
+ movq 0x38(%rsp), %rdi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+.globl mcount_call
+mcount_call:
+ call ftrace_stub
+
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $0x38, %rsp
+
+ retq
+END(mcount)
+
+ENTRY(ftrace_caller)
+
+ /* taken from glibc */
+ subq $0x38, %rsp
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+
+ movq 0x38(%rsp), %rdi
+ movq 8(%rbp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub
+
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $0x38, %rsp
+
+.globl ftrace_stub
+ftrace_stub:
+ retq
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+ENTRY(mcount)
+ cmpq $ftrace_stub, ftrace_trace_function
+ jnz trace
+.globl ftrace_stub
+ftrace_stub:
+ retq
+
+trace:
+ /* taken from glibc */
+ subq $0x38, %rsp
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+
+ movq 0x38(%rsp), %rdi
+ movq 8(%rbp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+ call *ftrace_trace_function
+
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $0x38, %rsp
+
+ jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
+
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif
@@ -95,7 +207,7 @@ NMI_MASK = 0x80000000
.macro FAKE_STACK_FRAME child_rip
/* push in order ss, rsp, eflags, cs, rip */
xorl %eax, %eax
- pushq %rax /* ss */
+ pushq $__KERNEL_DS /* ss */
CFI_ADJUST_CFA_OFFSET 8
/*CFI_REL_OFFSET ss,0*/
pushq %rax /* rsp */
@@ -190,13 +302,13 @@ ENTRY(ret_from_fork)
CFI_ADJUST_CFA_OFFSET -4
call schedule_tail
GET_THREAD_INFO(%rcx)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
jnz rff_trace
rff_action:
RESTORE_REST
testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
je int_ret_from_sys_call
- testl $_TIF_IA32,threadinfo_flags(%rcx)
+ testl $_TIF_IA32,TI_flags(%rcx)
jnz int_ret_from_sys_call
RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
jmp ret_from_sys_call
@@ -258,8 +370,9 @@ ENTRY(system_call)
SAVE_ARGS -8,0
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
GET_THREAD_INFO(%rcx)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
jnz tracesys
+system_call_fastpath:
cmpq $__NR_syscall_max,%rax
ja badsys
movq %r10,%rcx
@@ -277,7 +390,7 @@ sysret_check:
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl threadinfo_flags(%rcx),%edx
+ movl TI_flags(%rcx),%edx
andl %edi,%edx
jnz sysret_careful
CFI_REMEMBER_STATE
@@ -308,16 +421,16 @@ sysret_careful:
sysret_signal:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- testl $_TIF_DO_NOTIFY_MASK,%edx
- jz 1f
-
- /* Really a signal */
+#ifdef CONFIG_AUDITSYSCALL
+ bt $TIF_SYSCALL_AUDIT,%edx
+ jc sysret_audit
+#endif
/* edx: work flags (arg3) */
leaq do_notify_resume(%rip),%rax
leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
xorl %esi,%esi # oldset -> arg2
call ptregscall_common
-1: movl $_TIF_NEED_RESCHED,%edi
+ movl $_TIF_WORK_MASK,%edi
/* Use IRET because user could have changed frame. This
works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -328,14 +441,56 @@ badsys:
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
jmp ret_from_sys_call
+#ifdef CONFIG_AUDITSYSCALL
+ /*
+ * Fast path for syscall audit without full syscall trace.
+ * We just call audit_syscall_entry() directly, and then
+ * jump back to the normal fast path.
+ */
+auditsys:
+ movq %r10,%r9 /* 6th arg: 4th syscall arg */
+ movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
+ movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
+ movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
+ movq %rax,%rsi /* 2nd arg: syscall number */
+ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
+ call audit_syscall_entry
+ LOAD_ARGS 0 /* reload call-clobbered registers */
+ jmp system_call_fastpath
+
+ /*
+ * Return fast path for syscall audit. Call audit_syscall_exit()
+ * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
+ * masked off.
+ */
+sysret_audit:
+ movq %rax,%rsi /* second arg, syscall return value */
+ cmpq $0,%rax /* is it < 0? */
+ setl %al /* 1 if so, 0 if not */
+ movzbl %al,%edi /* zero-extend that into %edi */
+ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
+ call audit_syscall_exit
+ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
+ jmp sysret_check
+#endif /* CONFIG_AUDITSYSCALL */
+
/* Do syscall tracing */
tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+ jz auditsys
+#endif
SAVE_REST
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
FIXUP_TOP_OF_STACK %rdi
movq %rsp,%rdi
call syscall_trace_enter
- LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
+ /*
+ * Reload arg registers from stack in case ptrace changed them.
+ * We don't reload %rax because syscall_trace_enter() returned
+ * the value it wants us to use in the table lookup.
+ */
+ LOAD_ARGS ARGOFFSET, 1
RESTORE_REST
cmpq $__NR_syscall_max,%rax
ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
@@ -349,6 +504,7 @@ tracesys:
* Has correct top of stack, but partial stack frame.
*/
.globl int_ret_from_sys_call
+ .globl int_with_check
int_ret_from_sys_call:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -363,10 +519,10 @@ int_ret_from_sys_call:
int_with_check:
LOCKDEP_SYS_EXIT_IRQ
GET_THREAD_INFO(%rcx)
- movl threadinfo_flags(%rcx),%edx
+ movl TI_flags(%rcx),%edx
andl %edi,%edx
jnz int_careful
- andl $~TS_COMPAT,threadinfo_status(%rcx)
+ andl $~TS_COMPAT,TI_status(%rcx)
jmp retint_restore_args
/* Either reschedule or signal or syscall exit tracking needed. */
@@ -392,7 +548,7 @@ int_very_careful:
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
/* Check for syscall exit trace */
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
+ testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
@@ -400,7 +556,7 @@ int_very_careful:
call syscall_trace_leave
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
- andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
+ andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
jmp int_restore_rest
int_signal:
@@ -409,7 +565,7 @@ int_signal:
movq %rsp,%rdi # &ptregs -> arg1
xorl %esi,%esi # oldset -> arg2
call do_notify_resume
-1: movl $_TIF_NEED_RESCHED,%edi
+1: movl $_TIF_WORK_MASK,%edi
int_restore_rest:
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -436,7 +592,6 @@ END(\label)
PTREGSCALL stub_clone, sys_clone, %r8
PTREGSCALL stub_fork, sys_fork, %rdi
PTREGSCALL stub_vfork, sys_vfork, %rdi
- PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
PTREGSCALL stub_iopl, sys_iopl, %rsi
@@ -510,10 +665,12 @@ END(stub_rt_sigreturn)
*
*/
-retint_check:
+retint_with_reschedule:
CFI_DEFAULT_STACK adj=1
+ movl $_TIF_WORK_MASK,%edi
+retint_check:
LOCKDEP_SYS_EXIT_IRQ
- movl threadinfo_flags(%rcx),%edx
+ movl TI_flags(%rcx),%edx
andl %edi,%edx
CFI_REMEMBER_STATE
jnz retint_careful
@@ -558,17 +715,16 @@ retint_signal:
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl $_TIF_NEED_RESCHED,%edi
GET_THREAD_INFO(%rcx)
- jmp retint_check
+ jmp retint_with_reschedule
#ifdef CONFIG_PREEMPT
/* Returning to kernel space. Check if we need preemption */
/* rcx: threadinfo. interrupts off. */
ENTRY(retint_kernel)
- cmpl $0,threadinfo_preempt_count(%rcx)
+ cmpl $0,TI_preempt_count(%rcx)
jnz retint_restore_args
- bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
+ bt $TIF_NEED_RESCHED,TI_flags(%rcx)
jnc retint_restore_args
bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
jnc retint_restore_args
@@ -623,6 +779,9 @@ END(invalidate_interrupt\num)
ENTRY(call_function_interrupt)
apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
END(call_function_interrupt)
+ENTRY(call_function_single_interrupt)
+ apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
+END(call_function_single_interrupt)
ENTRY(irq_move_cleanup_interrupt)
apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
END(irq_move_cleanup_interrupt)
@@ -632,6 +791,10 @@ ENTRY(apic_timer_interrupt)
apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
END(apic_timer_interrupt)
+ENTRY(uv_bau_message_intr1)
+ apicinterrupt 220,uv_bau_message_interrupt
+END(uv_bau_message_intr1)
+
ENTRY(error_interrupt)
apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
END(error_interrupt)
@@ -745,7 +908,7 @@ paranoid_restore\trace:
jmp irq_return
paranoid_userspace\trace:
GET_THREAD_INFO(%rcx)
- movl threadinfo_flags(%rcx),%ebx
+ movl TI_flags(%rcx),%ebx
andl $_TIF_WORK_MASK,%ebx
jz paranoid_swapgs\trace
movq %rsp,%rdi /* &pt_regs */
@@ -842,7 +1005,7 @@ error_exit:
testb $3,CS-ARGOFFSET(%rsp)
jz retint_kernel
LOCKDEP_SYS_EXIT_IRQ
- movl threadinfo_flags(%rcx),%edx
+ movl TI_flags(%rcx),%edx
movl $_TIF_WORK_MASK,%edi
andl %edi,%edx
jnz retint_careful
@@ -864,11 +1027,11 @@ error_kernelspace:
iret run with kernel gs again, so don't set the user space flag.
B stepping K8s sometimes report an truncated RIP for IRET
exceptions returning to compat mode. Check for these here too. */
- leaq irq_return(%rip),%rbp
- cmpq %rbp,RIP(%rsp)
+ leaq irq_return(%rip),%rcx
+ cmpq %rcx,RIP(%rsp)
je error_swapgs
- movl %ebp,%ebp /* zero extend */
- cmpq %rbp,RIP(%rsp)
+ movl %ecx,%ecx /* zero extend */
+ cmpq %rcx,RIP(%rsp)
je error_swapgs
cmpq $gs_change,RIP(%rsp)
je error_swapgs
@@ -1114,6 +1277,7 @@ END(device_not_available)
/* runs on exception stack */
KPROBE_ENTRY(debug)
/* INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
CFI_ADJUST_CFA_OFFSET 8 */
zeroentry do_debug
@@ -1141,6 +1305,7 @@ END(do_nmi_callback)
KPROBE_ENTRY(int3)
/* INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
CFI_ADJUST_CFA_OFFSET 8 */
zeroentry do_int3
@@ -1164,14 +1329,11 @@ ENTRY(coprocessor_segment_overrun)
zeroentry do_coprocessor_segment_overrun
END(coprocessor_segment_overrun)
-ENTRY(reserved)
- zeroentry do_reserved
-END(reserved)
-
#if 0
/* runs on exception stack */
ENTRY(double_fault)
XCPT_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
paranoidentry do_double_fault
jmp paranoid_exit1
CFI_ENDPROC
@@ -1189,6 +1351,7 @@ END(segment_not_present)
/* runs on exception stack */
ENTRY(stack_segment)
/* XCPT_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
paranoidentry do_stack_segment */
errorentry do_stack_segment
/* jmp paranoid_exit1
--- head-2011-03-11.orig/arch/x86/kernel/fixup.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/fixup.c 2011-02-01 14:38:38.000000000 +0100
@@ -33,6 +33,7 @@
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/version.h>
+#include <asm/traps.h>
#define DP(_f, _args...) pr_alert(" " _f "\n" , ## _args )
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-11/arch/x86/kernel/head-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -0,0 +1,57 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+#define BIOS_LOWMEM_KILOBYTES 0x413
+
+/*
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
+ */
+void __init reserve_ebda_region(void)
+{
+#ifndef CONFIG_XEN
+ unsigned int lowmem, ebda_addr;
+
+ /* To determine the position of the EBDA and the */
+ /* end of conventional memory, we need to look at */
+ /* the BIOS data area. In a paravirtual environment */
+ /* that area is absent. We'll just have to assume */
+ /* that the paravirt case can handle memory setup */
+ /* correctly, without our help. */
+ if (paravirt_enabled())
+ return;
+
+ /* end of low (conventional) memory */
+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+ lowmem <<= 10;
+
+ /* start of EBDA area */
+ ebda_addr = get_bios_ebda();
+
+ /* Fixup: bios puts an EBDA in the top 64K segment */
+ /* of conventional memory, but does not adjust lowmem. */
+ if ((lowmem - ebda_addr) <= 0x10000)
+ lowmem = ebda_addr;
+
+ /* Fixup: bios does not report an EBDA at all. */
+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+ lowmem = 0x9f000;
+
+ /* Paranoia: should never happen, but... */
+ if ((lowmem == 0) || (lowmem >= 0x100000))
+ lowmem = 0x9f000;
+
+ /* reserve all memory between lowmem and the 1MB mark */
+ reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
+#endif
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-11/arch/x86/kernel/head32-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -0,0 +1,57 @@
+/*
+ * linux/arch/i386/kernel/head32.c -- prepare to run common code
+ *
+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#include <linux/init.h>
+#include <linux/start_kernel.h>
+
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/e820.h>
+#include <asm/bios_ebda.h>
+
+void __init i386_start_kernel(void)
+{
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+
+#ifndef CONFIG_XEN
+#ifdef CONFIG_BLK_DEV_INITRD
+ /* Reserve INITRD */
+ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_end = ramdisk_image + ramdisk_size;
+ reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+ }
+#endif
+ reserve_early(init_pg_tables_start, init_pg_tables_end,
+ "INIT_PG_TABLE");
+#else
+ reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
+ __pa(xen_start_info->pt_base)
+ + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
+ "Xen provided");
+
+ {
+ int max_cmdline;
+
+ if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
+ max_cmdline = COMMAND_LINE_SIZE;
+ memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
+ boot_command_line[max_cmdline-1] = '\0';
+ }
+#endif
+
+ reserve_ebda_region();
+
+ /*
+ * At this point everything still needed from the boot loader
+ * or BIOS or kernel text should be early reserved or marked not
+ * RAM in e820. All other memory is free game.
+ */
+
+ start_kernel();
+}
--- head-2011-03-11.orig/arch/x86/kernel/head64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/head64-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -32,7 +32,26 @@
#include <asm/e820.h>
#include <asm/bios_ebda.h>
-unsigned long start_pfn;
+/* boot cpu pda */
+static struct x8664_pda _boot_cpu_pda __read_mostly;
+
+#ifdef CONFIG_SMP
+/*
+ * We install an empty cpu_pda pointer table to indicate to early users
+ * (numa_set_node) that the cpu_pda pointer table for cpus other than
+ * the boot cpu is not yet setup.
+ */
+static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
+#else
+static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
+#endif
+
+void __init x86_64_init_pda(void)
+{
+ _cpu_pda = __cpu_pda;
+ cpu_pda(0) = &_boot_cpu_pda;
+ pda_init(0);
+}
#ifndef CONFIG_XEN
static void __init zap_identity_mappings(void)
@@ -77,83 +96,10 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
unsigned int machine_to_phys_order;
EXPORT_SYMBOL(machine_to_phys_order);
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
-#ifndef CONFIG_XEN
- unsigned int lowmem, ebda_addr;
-
- /* To determine the position of the EBDA and the */
- /* end of conventional memory, we need to look at */
- /* the BIOS data area. In a paravirtual environment */
- /* that area is absent. We'll just have to assume */
- /* that the paravirt case can handle memory setup */
- /* correctly, without our help. */
- if (paravirt_enabled())
- return;
-
- /* end of low (conventional) memory */
- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
- lowmem <<= 10;
-
- /* start of EBDA area */
- ebda_addr = get_bios_ebda();
-
- /* Fixup: bios puts an EBDA in the top 64K segment */
- /* of conventional memory, but does not adjust lowmem. */
- if ((lowmem - ebda_addr) <= 0x10000)
- lowmem = ebda_addr;
-
- /* Fixup: bios does not report an EBDA at all. */
- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
- lowmem = 0x9f000;
-
- /* Paranoia: should never happen, but... */
- if ((lowmem == 0) || (lowmem >= 0x100000))
- lowmem = 0x9f000;
-
- /* reserve all memory between lowmem and the 1MB mark */
- reserve_early(lowmem, 0x100000, "BIOS reserved");
-#endif
-}
-
-static void __init reserve_setup_data(void)
-{
-#ifndef CONFIG_XEN
- struct setup_data *data;
- unsigned long pa_data;
- char buf[32];
-
- if (boot_params.hdr.version < 0x0209)
- return;
- pa_data = boot_params.hdr.setup_data;
- while (pa_data) {
- data = early_ioremap(pa_data, sizeof(*data));
- sprintf(buf, "setup data %x", data->type);
- reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
- pa_data = data->next;
- early_iounmap(data, sizeof(*data));
- }
-#endif
-}
-
void __init x86_64_start_kernel(char * real_mode_data)
{
struct xen_machphys_mapping mapping;
unsigned long machine_to_phys_nr_ents;
- int i;
/*
* Build-time sanity checks on the kernel image and module
@@ -167,6 +113,7 @@ void __init x86_64_start_kernel(char * r
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
+ BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
xen_setup_features();
@@ -174,8 +121,6 @@ void __init x86_64_start_kernel(char * r
if (!xen_feature(XENFEAT_auto_translated_physmap))
phys_to_machine_mapping =
(unsigned long *)xen_start_info->mfn_list;
- start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
- xen_start_info->nr_pt_frames;
machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
@@ -208,19 +153,23 @@ void __init x86_64_start_kernel(char * r
early_printk("Kernel alive\n");
- for (i = 0; i < NR_CPUS; i++)
- cpu_pda(i) = &boot_cpu_pda[i];
+ x86_64_init_pda();
- pda_init(0);
+ early_printk("Kernel really alive\n");
+
+ x86_64_start_reservations(real_mode_data);
+}
+
+void __init x86_64_start_reservations(char *real_mode_data)
+{
copy_bootdata(__va(real_mode_data));
reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE),
- start_pfn << PAGE_SHIFT, "Xen provided");
-
- reserve_ebda_region();
- reserve_setup_data();
+ __pa(xen_start_info->pt_base)
+ + (xen_start_info->nr_pt_frames << PAGE_SHIFT),
+ "Xen provided");
/*
* At this point everything still needed from the boot loader
--- head-2011-03-11.orig/arch/x86/kernel/head_32-xen.S 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/head_32-xen.S 2011-03-03 16:19:21.000000000 +0100
@@ -61,8 +61,6 @@ ENTRY(startup_32)
movb %cl,X86_MASK
movl %edx,X86_CAPABILITY
- movb $1,X86_HARD_MATH
-
xorl %eax,%eax # Clear GS
movl %eax,%gs
--- head-2011-03-11.orig/arch/x86/kernel/head_64-xen.S 2011-01-31 17:49:31.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/head_64-xen.S 2011-02-01 14:38:38.000000000 +0100
@@ -92,53 +92,6 @@ NEXT_PAGE(hypercall_page)
#undef NEXT_PAGE
- .data
-
- .align 16
- .globl cpu_gdt_descr
-cpu_gdt_descr:
- .word gdt_end-cpu_gdt_table-1
-gdt:
- .quad cpu_gdt_table
-#ifdef CONFIG_SMP
- .rept NR_CPUS-1
- .word 0
- .quad 0
- .endr
-#endif
-
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types kkeil 2000/10/28
- * Also sysret mandates a special GDT layout
- */
-
- .section .data.page_aligned, "aw"
- .align PAGE_SIZE
-
-/* The TLS descriptors are currently at a different place compared to i386.
- Hopefully nobody expects them at a fixed place (Wine?) */
-
-ENTRY(cpu_gdt_table)
- .quad 0x0000000000000000 /* NULL descriptor */
- .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
- .quad 0x00af9b000000ffff /* __KERNEL_CS */
- .quad 0x00cf93000000ffff /* __KERNEL_DS */
- .quad 0x00cffb000000ffff /* __USER32_CS */
- .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
- .quad 0x00affb000000ffff /* __USER_CS */
- .quad 0x0 /* unused */
- .quad 0,0 /* TSS */
- .quad 0,0 /* LDT */
- .quad 0,0,0 /* three TLS descriptors */
- .quad 0x0000f40000000000 /* node/CPU stored in limit */
-gdt_end:
- /* asm/segment.h:GDT_ENTRIES must match this */
- /* This should be a multiple of the cache line size */
- /* GDTs of other CPUs are now dynamically allocated */
-
- /* zero the remaining page */
- .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
-
.section .bss.page_aligned, "aw", @nobits
.align PAGE_SIZE
ENTRY(empty_zero_page)
--- head-2011-03-11.orig/arch/x86/kernel/io_apic_32-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/io_apic_32-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -25,6 +25,7 @@
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/sched.h>
+#include <linux/bootmem.h>
#include <linux/mc146818rtc.h>
#include <linux/compiler.h>
#include <linux/acpi.h>
@@ -75,7 +76,7 @@ static struct { int pin, apic; } ioapic_
static DEFINE_SPINLOCK(ioapic_lock);
static DEFINE_SPINLOCK(vector_lock);
-int timer_over_8254 __initdata = 1;
+int timer_through_8259 __initdata;
/*
* Is the SiS APIC rmw bug present ?
@@ -89,15 +90,21 @@ int sis_apic_bug = -1;
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
/* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
static int disable_timer_pin_1 __initdata;
/*
@@ -128,7 +135,7 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
}
#endif
@@ -142,7 +149,7 @@ static inline unsigned int io_apic_read(
struct physdev_apic apic_op;
int ret;
- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
apic_op.reg = reg;
ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
if (ret)
@@ -160,7 +167,7 @@ static inline void io_apic_write(unsigne
#else
struct physdev_apic apic_op;
- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
apic_op.reg = reg;
apic_op.value = value;
WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
@@ -288,7 +295,7 @@ static void __init replace_pin_at_irq(un
}
}
-static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
+static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
{
struct irq_pin_list *entry = irq_2_pin + irq;
unsigned int pin, reg;
@@ -308,30 +315,32 @@ static void __modify_IO_APIC_irq (unsign
}
/* mask = 1 */
-static void __mask_IO_APIC_irq (unsigned int irq)
+static void __mask_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0x00010000, 0);
+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
}
/* mask = 0 */
-static void __unmask_IO_APIC_irq (unsigned int irq)
+static void __unmask_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0, 0x00010000);
+ __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
}
/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
+ IO_APIC_REDIR_LEVEL_TRIGGER);
}
/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
+ IO_APIC_REDIR_MASKED);
}
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq(unsigned int irq)
{
unsigned long flags;
@@ -340,7 +349,7 @@ static void mask_IO_APIC_irq (unsigned i
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq(unsigned int irq)
{
unsigned long flags;
@@ -352,7 +361,7 @@ static void unmask_IO_APIC_irq (unsigned
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
-
+
/* Check delivery_mode to be sure we're not clearing an SMI pin */
entry = ioapic_read_entry(apic, pin);
if (entry.delivery_mode == dest_SMI)
@@ -364,7 +373,7 @@ static void clear_IO_APIC_pin(unsigned i
ioapic_mask_entry(apic, pin);
}
-static void clear_IO_APIC (void)
+static void clear_IO_APIC(void)
{
int apic, pin;
@@ -381,7 +390,7 @@ static void set_ioapic_affinity_irq(unsi
struct irq_pin_list *entry = irq_2_pin + irq;
unsigned int apicid_value;
cpumask_t tmp;
-
+
cpus_and(tmp, cpumask, cpu_online_map);
if (cpus_empty(tmp))
tmp = TARGET_CPUS;
@@ -410,7 +419,7 @@ static void set_ioapic_affinity_irq(unsi
# include <linux/kernel_stat.h> /* kstat */
# include <linux/slab.h> /* kmalloc() */
# include <linux/timer.h>
-
+
#define IRQBALANCE_CHECK_ARCH -999
#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
@@ -422,14 +431,14 @@ static int physical_balance __read_mostl
static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
static struct irq_cpu_info {
- unsigned long * last_irq;
- unsigned long * irq_delta;
+ unsigned long *last_irq;
+ unsigned long *irq_delta;
unsigned long irq;
} irq_cpu_data[NR_CPUS];
#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
-#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
-#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
+#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
+#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
#define IDLE_ENOUGH(cpu,now) \
(idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
@@ -468,8 +477,8 @@ inside:
if (cpu == -1)
cpu = NR_CPUS-1;
}
- } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
- (search_idle && !IDLE_ENOUGH(cpu,now)));
+ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
+ (search_idle && !IDLE_ENOUGH(cpu, now)));
return cpu;
}
@@ -479,15 +488,14 @@ static inline void balance_irq(int cpu,
unsigned long now = jiffies;
cpumask_t allowed_mask;
unsigned int new_cpu;
-
+
if (irqbalance_disabled)
- return;
+ return;
cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
new_cpu = move(cpu, allowed_mask, now, 1);
- if (cpu != new_cpu) {
+ if (cpu != new_cpu)
set_pending_irq(irq, cpumask_of_cpu(new_cpu));
- }
}
static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
@@ -499,14 +507,14 @@ static inline void rotate_irqs_among_cpu
if (!irq_desc[j].action)
continue;
/* Is it a significant load ? */
- if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
useful_load_threshold)
continue;
balance_irq(i, j);
}
}
balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
return;
}
@@ -535,22 +543,22 @@ static void do_irq_balance(void)
/* Is this an active IRQ or balancing disabled ? */
if (!irq_desc[j].action || irq_balancing_disabled(j))
continue;
- if ( package_index == i )
- IRQ_DELTA(package_index,j) = 0;
+ if (package_index == i)
+ IRQ_DELTA(package_index, j) = 0;
/* Determine the total count per processor per IRQ */
value_now = (unsigned long) kstat_cpu(i).irqs[j];
/* Determine the activity per processor per IRQ */
- delta = value_now - LAST_CPU_IRQ(i,j);
+ delta = value_now - LAST_CPU_IRQ(i, j);
/* Update last_cpu_irq[][] for the next time */
- LAST_CPU_IRQ(i,j) = value_now;
+ LAST_CPU_IRQ(i, j) = value_now;
/* Ignore IRQs whose rate is less than the clock */
if (delta < useful_load_threshold)
continue;
/* update the load for the processor or package total */
- IRQ_DELTA(package_index,j) += delta;
+ IRQ_DELTA(package_index, j) += delta;
/* Keep track of the higher numbered sibling as well */
if (i != package_index)
@@ -576,7 +584,8 @@ static void do_irq_balance(void)
max_cpu_irq = ULONG_MAX;
tryanothercpu:
- /* Look for heaviest loaded processor.
+ /*
+ * Look for heaviest loaded processor.
* We may come back to get the next heaviest loaded processor.
* Skip processors with trivial loads.
*/
@@ -585,7 +594,7 @@ tryanothercpu:
for_each_online_cpu(i) {
if (i != CPU_TO_PACKAGEINDEX(i))
continue;
- if (max_cpu_irq <= CPU_IRQ(i))
+ if (max_cpu_irq <= CPU_IRQ(i))
continue;
if (tmp_cpu_irq < CPU_IRQ(i)) {
tmp_cpu_irq = CPU_IRQ(i);
@@ -594,8 +603,9 @@ tryanothercpu:
}
if (tmp_loaded == -1) {
- /* In the case of small number of heavy interrupt sources,
- * loading some of the cpus too much. We use Ingo's original
+ /*
+ * In the case of small number of heavy interrupt sources,
+ * loading some of the cpus too much. We use Ingo's original
* approach to rotate them around.
*/
if (!first_attempt && imbalance >= useful_load_threshold) {
@@ -604,13 +614,14 @@ tryanothercpu:
}
goto not_worth_the_effort;
}
-
+
first_attempt = 0; /* heaviest search */
max_cpu_irq = tmp_cpu_irq; /* load */
max_loaded = tmp_loaded; /* processor */
imbalance = (max_cpu_irq - min_cpu_irq) / 2;
-
- /* if imbalance is less than approx 10% of max load, then
+
+ /*
+ * if imbalance is less than approx 10% of max load, then
* observe diminishing returns action. - quit
*/
if (imbalance < (max_cpu_irq >> 3))
@@ -626,26 +637,25 @@ tryanotherirq:
/* Is this an active IRQ? */
if (!irq_desc[j].action)
continue;
- if (imbalance <= IRQ_DELTA(max_loaded,j))
+ if (imbalance <= IRQ_DELTA(max_loaded, j))
continue;
/* Try to find the IRQ that is closest to the imbalance
* without going over.
*/
- if (move_this_load < IRQ_DELTA(max_loaded,j)) {
- move_this_load = IRQ_DELTA(max_loaded,j);
+ if (move_this_load < IRQ_DELTA(max_loaded, j)) {
+ move_this_load = IRQ_DELTA(max_loaded, j);
selected_irq = j;
}
}
- if (selected_irq == -1) {
+ if (selected_irq == -1)
goto tryanothercpu;
- }
imbalance = move_this_load;
-
+
/* For physical_balance case, we accumulated both load
* values in the one of the siblings cpu_irq[],
* to use the same code for physical and logical processors
- * as much as possible.
+ * as much as possible.
*
* NOTE: the cpu_irq[] array holds the sum of the load for
* sibling A and sibling B in the slot for the lowest numbered
@@ -674,11 +684,11 @@ tryanotherirq:
/* mark for change destination */
set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
- /* Since we made a change, come back sooner to
+ /* Since we made a change, come back sooner to
* check for more variation.
*/
balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
return;
}
goto tryanotherirq;
@@ -689,7 +699,7 @@ not_worth_the_effort:
* upward
*/
balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
return;
}
@@ -728,13 +738,13 @@ static int __init balanced_irq_init(void
cpumask_t tmp;
cpus_shift_right(tmp, cpu_online_map, 2);
- c = &boot_cpu_data;
+ c = &boot_cpu_data;
/* When not overwritten by the command line ask subarchitecture. */
if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
irqbalance_disabled = NO_BALANCE_IRQ;
if (irqbalance_disabled)
return 0;
-
+
/* disable irqbalance completely if there is only one processor online */
if (num_online_cpus() < 2) {
irqbalance_disabled = 1;
@@ -748,16 +758,14 @@ static int __init balanced_irq_init(void
physical_balance = 1;
for_each_online_cpu(i) {
- irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
- irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
printk(KERN_ERR "balanced_irq_init: out of memory");
goto failed;
}
- memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
- memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
}
-
+
printk(KERN_INFO "Starting balanced_irq\n");
if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
return 0;
@@ -799,7 +807,7 @@ void send_IPI_self(int vector)
/*
* Send the IPI. The write to APIC_ICR fires this off.
*/
- apic_write_around(APIC_ICR, cfg);
+ apic_write(APIC_ICR, cfg);
#endif
}
#endif /* !CONFIG_SMP */
@@ -853,10 +861,10 @@ static int find_irq_entry(int apic, int
int i;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == type &&
- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mpc_dstirq == pin)
+ if (mp_irqs[i].mp_irqtype == type &&
+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mp_dstirq == pin)
return i;
return -1;
@@ -871,13 +879,13 @@ static int __init find_isa_irq_pin(int i
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
- return mp_irqs[i].mpc_dstirq;
+ return mp_irqs[i].mp_dstirq;
}
return -1;
}
@@ -887,17 +895,17 @@ static int __init find_isa_irq_apic(int
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
break;
}
if (i < mp_irq_entries) {
int apic;
- for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
return apic;
}
}
@@ -918,28 +926,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
"slot:%d, pin:%d.\n", bus, slot, pin);
- if (mp_bus_id_to_pci_bus[bus] == -1) {
+ if (test_bit(bus, mp_bus_not_pci)) {
printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
break;
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mpc_irqtype &&
+ !mp_irqs[i].mp_irqtype &&
(bus == lbus) &&
- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
continue;
- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
return irq;
/*
* Use the first all-but-pin matching entry as a
@@ -954,7 +962,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
/*
- * This function currently is only a helper for the i386 smp boot process where
+ * This function currently is only a helper for the i386 smp boot process where
* we need to reprogram the ioredtbls to cater for the cpus which have come online
* so mask in all cases should simply be TARGET_CPUS
*/
@@ -1008,7 +1016,7 @@ static int EISA_ELCR(unsigned int irq)
* EISA conforming in the MP table, that means its trigger type must
* be read in from the ELCR */
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
#define default_EISA_polarity(idx) default_ISA_polarity(idx)
/* PCI interrupts are always polarity one level triggered,
@@ -1025,118 +1033,115 @@ static int EISA_ELCR(unsigned int irq)
static int MPBIOS_polarity(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].mpc_irqflag & 3)
+ switch (mp_irqs[idx].mp_irqflag & 3) {
+ case 0: /* conforms, ie. bus-type dependent polarity */
{
- case 0: /* conforms, ie. bus-type dependent polarity */
- {
- polarity = test_bit(bus, mp_bus_not_pci)?
- default_ISA_polarity(idx):
- default_PCI_polarity(idx);
- break;
- }
- case 1: /* high active */
- {
- polarity = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- case 3: /* low active */
- {
- polarity = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
+ polarity = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_polarity(idx):
+ default_PCI_polarity(idx);
+ break;
+ }
+ case 1: /* high active */
+ {
+ polarity = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ case 3: /* low active */
+ {
+ polarity = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
}
return polarity;
}
static int MPBIOS_trigger(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int trigger;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
+ case 0: /* conforms, ie. bus-type dependent */
{
- case 0: /* conforms, ie. bus-type dependent */
- {
- trigger = test_bit(bus, mp_bus_not_pci)?
- default_ISA_trigger(idx):
- default_PCI_trigger(idx);
+ trigger = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_trigger(idx):
+ default_PCI_trigger(idx);
#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- trigger = default_EISA_trigger(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- }
-#endif
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ /* set before the switch */
break;
}
- case 1: /* edge */
+ case MP_BUS_EISA: /* EISA pin */
{
- trigger = 0;
+ trigger = default_EISA_trigger(idx);
break;
}
- case 2: /* reserved */
+ case MP_BUS_PCI: /* PCI pin */
{
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
+ /* set before the switch */
break;
}
- case 3: /* level */
+ case MP_BUS_MCA: /* MCA pin */
{
- trigger = 1;
+ trigger = default_MCA_trigger(idx);
break;
}
- default: /* invalid */
+ default:
{
printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 0;
+ trigger = 1;
break;
}
}
+#endif
+ break;
+ }
+ case 1: /* edge */
+ {
+ trigger = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ case 3: /* level */
+ {
+ trigger = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 0;
+ break;
+ }
+ }
return trigger;
}
@@ -1153,16 +1158,16 @@ static inline int irq_trigger(int idx)
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
/*
* Debugging check, we are in big trouble if this message pops up!
*/
- if (mp_irqs[idx].mpc_dstirq != pin)
+ if (mp_irqs[idx].mp_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci))
- irq = mp_irqs[idx].mpc_srcbusirq;
+ irq = mp_irqs[idx].mp_srcbusirq;
else {
/*
* PCI IRQs are mapped in order
@@ -1204,8 +1209,8 @@ static inline int IO_APIC_irq_trigger(in
for (apic = 0; apic < nr_ioapics; apic++) {
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- idx = find_irq_entry(apic,pin,mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
return irq_trigger(idx);
}
}
@@ -1291,25 +1296,25 @@ static void __init setup_IO_APIC_irqs(vo
/*
* add it to the IO-APIC irq-routing table:
*/
- memset(&entry,0,sizeof(entry));
+ memset(&entry, 0, sizeof(entry));
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
entry.mask = 0; /* enable IRQ */
- entry.dest.logical.logical_dest =
+ entry.dest.logical.logical_dest =
cpu_mask_to_apicid(TARGET_CPUS);
- idx = find_irq_entry(apic,pin,mp_INT);
+ idx = find_irq_entry(apic, pin, mp_INT);
if (idx == -1) {
if (first_notcon) {
apic_printk(APIC_VERBOSE, KERN_DEBUG
" IO-APIC (apicid-pin) %d-%d",
- mp_ioapics[apic].mpc_apicid,
+ mp_ioapics[apic].mp_apicid,
pin);
first_notcon = 0;
} else
apic_printk(APIC_VERBOSE, ", %d-%d",
- mp_ioapics[apic].mpc_apicid, pin);
+ mp_ioapics[apic].mp_apicid, pin);
continue;
}
@@ -1343,7 +1348,7 @@ static void __init setup_IO_APIC_irqs(vo
vector = assign_irq_vector(irq);
entry.vector = vector;
ioapic_register_intr(irq, vector, IOAPIC_AUTO);
-
+
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
@@ -1355,27 +1360,23 @@ static void __init setup_IO_APIC_irqs(vo
apic_printk(APIC_VERBOSE, " not connected.\n");
}
+#ifndef CONFIG_XEN
/*
- * Set up the 8259A-master output pin:
+ * Set up the timer pin, possibly with the 8259A-master behind.
*/
-#ifndef CONFIG_XEN
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+ int vector)
{
struct IO_APIC_route_entry entry;
- memset(&entry,0,sizeof(entry));
-
- disable_8259A_irq(0);
-
- /* mask LVT0 */
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ memset(&entry, 0, sizeof(entry));
/*
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
entry.dest_mode = INT_DEST_MODE;
- entry.mask = 0; /* unmask IRQ now */
+ entry.mask = 1; /* mask IRQ now */
entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.delivery_mode = INT_DELIVERY_MODE;
entry.polarity = 0;
@@ -1384,17 +1385,14 @@ static void __init setup_ExtINT_IRQ0_pin
/*
* The timer IRQ doesn't have to know that behind the
- * scene we have a 8259A-master in AEOI mode ...
+ * scene we may have a 8259A-master in AEOI mode ...
*/
- irq_desc[0].chip = &ioapic_chip;
- set_irq_handler(0, handle_edge_irq);
+ ioapic_register_intr(0, vector, IOAPIC_EDGE);
/*
* Add it to the IO-APIC irq-routing table:
*/
ioapic_write_entry(apic, pin, entry);
-
- enable_8259A_irq(0);
}
void __init print_IO_APIC(void)
@@ -1409,10 +1407,10 @@ void __init print_IO_APIC(void)
if (apic_verbosity == APIC_QUIET)
return;
- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
@@ -1431,7 +1429,7 @@ void __init print_IO_APIC(void)
reg_03.raw = io_apic_read(apic, 3);
spin_unlock_irqrestore(&ioapic_lock, flags);
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1512,7 +1510,7 @@ void __init print_IO_APIC(void)
return;
}
-static void print_APIC_bitfield (int base)
+static void print_APIC_bitfield(int base)
{
unsigned int v;
int i, j;
@@ -1533,7 +1531,7 @@ static void print_APIC_bitfield (int bas
}
}
-void /*__init*/ print_local_APIC(void * dummy)
+void /*__init*/ print_local_APIC(void *dummy)
{
unsigned int v, ver, maxlvt;
@@ -1542,6 +1540,7 @@ void /*__init*/ print_local_APIC(void *
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
@@ -1616,9 +1615,9 @@ void /*__init*/ print_local_APIC(void *
printk("\n");
}
-void print_all_local_APICs (void)
+void print_all_local_APICs(void)
{
- on_each_cpu(print_local_APIC, NULL, 1, 1);
+ on_each_cpu(print_local_APIC, NULL, 1);
}
void /*__init*/ print_PIC(void)
@@ -1639,11 +1638,11 @@ void /*__init*/ print_PIC(void)
v = inb(0xa0) << 8 | inb(0x20);
printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
- outb(0x0b,0xa0);
- outb(0x0b,0x20);
+ outb(0x0b, 0xa0);
+ outb(0x0b, 0x20);
v = inb(0xa0) << 8 | inb(0x20);
- outb(0x0a,0xa0);
- outb(0x0a,0x20);
+ outb(0x0a, 0xa0);
+ outb(0x0a, 0x20);
spin_unlock_irqrestore(&i8259A_lock, flags);
@@ -1652,6 +1651,8 @@ void /*__init*/ print_PIC(void)
v = inb(0x4d1) << 8 | inb(0x4d0);
printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
}
+#else
+void __init print_IO_APIC(void) {}
#endif /* !CONFIG_XEN */
static void __init enable_IO_APIC(void)
@@ -1681,7 +1682,7 @@ static void __init enable_IO_APIC(void)
nr_ioapic_registers[apic] = reg_01.bits.entries+1;
}
#ifndef CONFIG_XEN
- for(apic = 0; apic < nr_ioapics; apic++) {
+ for (apic = 0; apic < nr_ioapics; apic++) {
int pin;
/* See if any of the pins is in ExtINT mode */
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
@@ -1774,7 +1775,7 @@ void disable_IO_APIC(void)
* by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
*/
-#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
+#ifndef CONFIG_XEN
static void __init setup_ioapic_ids_from_mpc(void)
{
union IO_APIC_reg_00 reg_00;
@@ -1784,6 +1785,11 @@ static void __init setup_ioapic_ids_from
unsigned char old_id;
unsigned long flags;
+#ifdef CONFIG_X86_NUMAQ
+ if (found_numaq)
+ return;
+#endif
+
/*
* Don't check I/O APIC IDs for xAPIC systems. They have
* no meaning without the serial APIC bus.
@@ -1806,15 +1812,15 @@ static void __init setup_ioapic_ids_from
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
-
- old_id = mp_ioapics[apic].mpc_apicid;
- if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
+ old_id = mp_ioapics[apic].mp_apicid;
+
+ if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic, mp_ioapics[apic].mpc_apicid);
+ apic, mp_ioapics[apic].mp_apicid);
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
reg_00.bits.ID);
- mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
+ mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
}
/*
@@ -1823,9 +1829,9 @@ static void __init setup_ioapic_ids_from
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
if (check_apicid_used(phys_id_present_map,
- mp_ioapics[apic].mpc_apicid)) {
+ mp_ioapics[apic].mp_apicid)) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic, mp_ioapics[apic].mpc_apicid);
+ apic, mp_ioapics[apic].mp_apicid);
for (i = 0; i < get_physical_broadcast(); i++)
if (!physid_isset(i, phys_id_present_map))
break;
@@ -1834,13 +1840,13 @@ static void __init setup_ioapic_ids_from
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
i);
physid_set(i, phys_id_present_map);
- mp_ioapics[apic].mpc_apicid = i;
+ mp_ioapics[apic].mp_apicid = i;
} else {
physid_mask_t tmp;
- tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
- mp_ioapics[apic].mpc_apicid);
+ mp_ioapics[apic].mp_apicid);
physids_or(phys_id_present_map, phys_id_present_map, tmp);
}
@@ -1849,21 +1855,21 @@ static void __init setup_ioapic_ids_from
* We need to adjust the IRQ routing table
* if the ID changed.
*/
- if (old_id != mp_ioapics[apic].mpc_apicid)
+ if (old_id != mp_ioapics[apic].mp_apicid)
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_dstapic == old_id)
- mp_irqs[i].mpc_dstapic
- = mp_ioapics[apic].mpc_apicid;
+ if (mp_irqs[i].mp_dstapic == old_id)
+ mp_irqs[i].mp_dstapic
+ = mp_ioapics[apic].mp_apicid;
/*
* Read the right value from the MPC table and
* write it into the ID register.
- */
+ */
apic_printk(APIC_VERBOSE, KERN_INFO
"...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mpc_apicid);
+ mp_ioapics[apic].mp_apicid);
- reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+ reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0, reg_00.raw);
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1874,17 +1880,13 @@ static void __init setup_ioapic_ids_from
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+ if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
printk("could not set ID!\n");
else
apic_printk(APIC_VERBOSE, " ok.\n");
}
}
-#else
-static void __init setup_ioapic_ids_from_mpc(void) { }
-#endif
-#ifndef CONFIG_XEN
int no_timer_check __initdata;
static int __init notimercheck(char *s)
@@ -2077,45 +2079,53 @@ static inline void init_IO_APIC_traps(vo
* The local APIC irq-chip implementation:
*/
-static void ack_apic(unsigned int irq)
+static void ack_lapic_irq(unsigned int irq)
{
ack_APIC_irq();
}
-static void mask_lapic_irq (unsigned int irq)
+static void mask_lapic_irq(unsigned int irq)
{
unsigned long v;
v = apic_read(APIC_LVT0);
- apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
}
-static void unmask_lapic_irq (unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
{
unsigned long v;
v = apic_read(APIC_LVT0);
- apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
static struct irq_chip lapic_chip __read_mostly = {
- .name = "local-APIC-edge",
+ .name = "local-APIC",
.mask = mask_lapic_irq,
.unmask = unmask_lapic_irq,
- .eoi = ack_apic,
+ .ack = ack_lapic_irq,
};
+static void lapic_register_intr(int irq, int vector)
+{
+ irq_desc[irq].status &= ~IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+ "edge");
+ set_intr_gate(vector, interrupt[irq]);
+}
+
static void __init setup_nmi(void)
{
/*
- * Dirty trick to enable the NMI watchdog ...
+ * Dirty trick to enable the NMI watchdog ...
* We put the 8259A master into AEOI mode and
* unmask on all local APICs LVT0 as NMI.
*
* The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
* is from Maciej W. Rozycki - so we do not have to EOI from
* the NMI handler or the timer interrupt.
- */
+ */
apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
enable_NMI_through_LVT0();
@@ -2191,11 +2201,16 @@ static inline void __init unlock_ExtINT_
static inline void __init check_timer(void)
{
int apic1, pin1, apic2, pin2;
+ int no_pin1 = 0;
int vector;
+ unsigned int ver;
unsigned long flags;
local_irq_save(flags);
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
+
/*
* get/set the timer IRQ vector:
*/
@@ -2204,34 +2219,54 @@ static inline void __init check_timer(vo
set_intr_gate(vector, interrupt[0]);
/*
- * Subtle, code in do_timer_interrupt() expects an AEOI
- * mode for the 8259A whenever interrupts are routed
- * through I/O APICs. Also IRQ0 has to be enabled in
- * the 8259A which implies the virtual wire has to be
- * disabled in the local APIC.
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+ * wire has to be disabled in the local APIC. Also
+ * timer interrupts need to be acknowledged manually in
+ * the 8259A for the i82489DX when using the NMI
+ * watchdog as that APIC treats NMIs as level-triggered.
+ * The AEOI mode will finish them in the 8259A
+ * automatically.
*/
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
- timer_ack = 1;
- if (timer_over_8254 > 0)
- enable_8259A_irq(0);
+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
- printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
- vector, apic1, pin1, apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+ "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ vector, apic1, pin1, apic2, pin2);
+
+ /*
+ * Some BIOS writers are clueless and report the ExtINTA
+ * I/O APIC input from the cascaded 8259A as the timer
+ * interrupt input. So just in case, if only one pin
+ * was found above, try it both directly and through the
+ * 8259A.
+ */
+ if (pin1 == -1) {
+ pin1 = pin2;
+ apic1 = apic2;
+ no_pin1 = 1;
+ } else if (pin2 == -1) {
+ pin2 = pin1;
+ apic2 = apic1;
+ }
if (pin1 != -1) {
/*
* Ok, does IRQ0 through the IOAPIC work?
*/
+ if (no_pin1) {
+ add_pin_to_irq(0, apic1, pin1);
+ setup_timer_IRQ0_pin(apic1, pin1, vector);
+ }
unmask_IO_APIC_irq(0);
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
setup_nmi();
enable_8259A_irq(0);
}
@@ -2240,71 +2275,77 @@ static inline void __init check_timer(vo
goto out;
}
clear_IO_APIC_pin(apic1, pin1);
- printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
- "IO-APIC\n");
- }
-
- printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
- if (pin2 != -1) {
- printk("\n..... (found pin %d) ...", pin2);
+ if (!no_pin1)
+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+ "8254 timer not connected to IO-APIC\n");
+
+ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+ "(IRQ0) through the 8259A ...\n");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "..... (found apic %d pin %d) ...\n", apic2, pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
- setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ setup_timer_IRQ0_pin(apic2, pin2, vector);
+ unmask_IO_APIC_irq(0);
+ enable_8259A_irq(0);
if (timer_irq_works()) {
- printk("works.\n");
- if (pin1 != -1)
- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
- else
- add_pin_to_irq(0, apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+ timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
setup_nmi();
+ enable_8259A_irq(0);
}
goto out;
}
/*
* Cleanup, just in case ...
*/
+ disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
- printk(" failed.\n");
if (nmi_watchdog == NMI_IO_APIC) {
- printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = 0;
+ apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+ "through the IO-APIC - disabling NMI Watchdog!\n");
+ nmi_watchdog = NMI_NONE;
}
+ timer_ack = 0;
- printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as Virtual Wire IRQ...\n");
- disable_8259A_irq(0);
- set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
- "fasteoi");
- apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
+ lapic_register_intr(0, vector);
+ apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
enable_8259A_irq(0);
if (timer_irq_works()) {
- printk(" works.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
- printk(" failed.\n");
+ disable_8259A_irq(0);
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
- printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as ExtINT IRQ...\n");
- timer_ack = 0;
init_8259A(0);
make_8259A_irq(0);
- apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
unlock_ExtINT_logic();
if (timer_irq_works()) {
- printk(" works.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
- printk(" failed :(.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
- "report. Then try booting with the 'noapic' option");
+ "report. Then try booting with the 'noapic' option.\n");
out:
local_irq_restore(flags);
}
@@ -2314,11 +2355,21 @@ int timer_uses_ioapic_pin_0 = 0;
#endif
/*
- *
- * IRQ's that are handled by the PIC in the MPS IOAPIC case.
- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
- * Linux doesn't really care, as it's not actually used
- * for any interrupt handling anyway.
+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+ * to devices. However there may be an I/O APIC pin available for
+ * this interrupt regardless. The pin may be left unconnected, but
+ * typically it will be reused as an ExtINT cascade interrupt for
+ * the master 8259A. In the MPS case such a pin will normally be
+ * reported as an ExtINT interrupt in the MP table. With ACPI
+ * there is no provision for ExtINT interrupts, and in the absence
+ * of an override it would be treated as an ordinary ISA I/O APIC
+ * interrupt, that is edge-triggered and unmasked by default. We
+ * used to do this, but it caused problems on some systems because
+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+ * the same ExtINT cascade interrupt to drive the local APIC of the
+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
+ * the I/O APIC in all cases now. No actual device should request
+ * it anyway. --macro
*/
#define PIC_IRQS (1 << PIC_CASCADE_IR)
@@ -2328,25 +2379,22 @@ void __init setup_IO_APIC(void)
int i;
/* Reserve all the system vectors. */
- for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
+ for (i = first_system_vector; i < NR_VECTORS; i++)
set_bit(i, used_vectors);
#endif
enable_IO_APIC();
- if (acpi_ioapic)
- io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
- else
- io_apic_irqs = ~PIC_IRQS;
+ io_apic_irqs = ~PIC_IRQS;
printk("ENABLING IO-APIC IRQs\n");
+#ifndef CONFIG_XEN
/*
* Set up IO-APIC IRQ routing.
*/
if (!acpi_ioapic)
setup_ioapic_ids_from_mpc();
-#ifndef CONFIG_XEN
sync_Arb_IDs();
#endif
setup_IO_APIC_irqs();
@@ -2356,28 +2404,14 @@ void __init setup_IO_APIC(void)
print_IO_APIC();
}
-static int __init setup_disable_8254_timer(char *s)
-{
- timer_over_8254 = -1;
- return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
- timer_over_8254 = 2;
- return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
/*
* Called after all the initialization is done. If we didnt find any
* APIC bugs then we can allow the modify fast path
*/
-
+
static int __init io_apic_bug_finalize(void)
{
- if(sis_apic_bug == -1)
+ if (sis_apic_bug == -1)
sis_apic_bug = 0;
if (is_initial_xendomain()) {
struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
@@ -2396,17 +2430,17 @@ struct sysfs_ioapic_data {
struct sys_device dev;
struct IO_APIC_route_entry entry[0];
};
-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
{
struct IO_APIC_route_entry *entry;
struct sysfs_ioapic_data *data;
int i;
-
+
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
entry[i] = ioapic_read_entry(dev->id, i);
return 0;
@@ -2419,18 +2453,18 @@ static int ioapic_resume(struct sys_devi
unsigned long flags;
union IO_APIC_reg_00 reg_00;
int i;
-
+
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+ if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
ioapic_write_entry(dev->id, i, entry[i]);
return 0;
@@ -2444,24 +2478,23 @@ static struct sysdev_class ioapic_sysdev
static int __init ioapic_init_sysfs(void)
{
- struct sys_device * dev;
+ struct sys_device *dev;
int i, size, error = 0;
error = sysdev_class_register(&ioapic_sysdev_class);
if (error)
return error;
- for (i = 0; i < nr_ioapics; i++ ) {
- size = sizeof(struct sys_device) + nr_ioapic_registers[i]
+ for (i = 0; i < nr_ioapics; i++) {
+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
* sizeof(struct IO_APIC_route_entry);
- mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+ mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
if (!mp_ioapic_data[i]) {
printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
continue;
}
- memset(mp_ioapic_data[i], 0, size);
dev = &mp_ioapic_data[i]->dev;
- dev->id = i;
+ dev->id = i;
dev->cls = &ioapic_sysdev_class;
error = sysdev_register(dev);
if (error) {
@@ -2538,7 +2571,7 @@ static int msi_compose_msg(struct pci_de
msg->address_lo =
MSI_ADDR_BASE_LO |
((INT_DEST_MODE == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL:
+MSI_ADDR_DEST_MODE_PHYSICAL:
MSI_ADDR_DEST_MODE_LOGICAL) |
((INT_DELIVERY_MODE != dest_LowestPrio) ?
MSI_ADDR_REDIRECTION_CPU:
@@ -2549,7 +2582,7 @@ static int msi_compose_msg(struct pci_de
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
((INT_DELIVERY_MODE != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED:
+MSI_DATA_DELIVERY_FIXED:
MSI_DATA_DELIVERY_LOWPRI) |
MSI_DATA_VECTOR(vector);
}
@@ -2720,13 +2753,13 @@ int arch_setup_ht_irq(unsigned int irq,
#endif /* CONFIG_HT_IRQ */
/* --------------------------------------------------------------------------
- ACPI-based IOAPIC Configuration
+ ACPI-based IOAPIC Configuration
-------------------------------------------------------------------------- */
#ifdef CONFIG_ACPI
#ifndef CONFIG_XEN
-int __init io_apic_get_unique_id (int ioapic, int apic_id)
+int __init io_apic_get_unique_id(int ioapic, int apic_id)
{
union IO_APIC_reg_00 reg_00;
static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -2735,10 +2768,10 @@ int __init io_apic_get_unique_id (int io
int i = 0;
/*
- * The P4 platform supports up to 256 APIC IDs on two separate APIC
- * buses (one for LAPICs, one for IOAPICs), where predecessors only
+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
* supports up to 16 on one shared APIC bus.
- *
+ *
* TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
* advantage of new APIC bus architecture.
*/
@@ -2757,7 +2790,7 @@ int __init io_apic_get_unique_id (int io
}
/*
- * Every APIC in a system must have a unique ID or we get lots of nice
+ * Every APIC in a system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
if (check_apicid_used(apic_id_map, apic_id)) {
@@ -2774,7 +2807,7 @@ int __init io_apic_get_unique_id (int io
"trying %d\n", ioapic, apic_id, i);
apic_id = i;
- }
+ }
tmp = apicid_to_cpu_present(apic_id);
physids_or(apic_id_map, apic_id_map, tmp);
@@ -2802,7 +2835,7 @@ int __init io_apic_get_unique_id (int io
#endif /* !CONFIG_XEN */
-int __init io_apic_get_version (int ioapic)
+int __init io_apic_get_version(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -2815,7 +2848,7 @@ int __init io_apic_get_version (int ioap
}
-int __init io_apic_get_redir_entries (int ioapic)
+int __init io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -2828,7 +2861,7 @@ int __init io_apic_get_redir_entries (in
}
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
{
struct IO_APIC_route_entry entry;
@@ -2844,7 +2877,7 @@ int io_apic_set_pci_routing (int ioapic,
* corresponding device driver registers for this IRQ.
*/
- memset(&entry,0,sizeof(entry));
+ memset(&entry, 0, sizeof(entry));
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
@@ -2863,7 +2896,7 @@ int io_apic_set_pci_routing (int ioapic,
apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
- mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+ mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
edge_level, active_high_low);
ioapic_register_intr(irq, entry.vector, edge_level);
@@ -2884,8 +2917,8 @@ int acpi_get_override_irq(int bus_irq, i
return -1;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == mp_INT &&
- mp_irqs[i].mpc_srcbusirq == bus_irq)
+ if (mp_irqs[i].mp_irqtype == mp_INT &&
+ mp_irqs[i].mp_srcbusirq == bus_irq)
break;
if (i >= mp_irq_entries)
return -1;
@@ -2918,3 +2951,35 @@ static int __init parse_noapic(char *arg
return 0;
}
early_param("noapic", parse_noapic);
+
+#ifndef CONFIG_XEN
+void __init ioapic_init_mappings(void)
+{
+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+ int i;
+
+ for (i = 0; i < nr_ioapics; i++) {
+ if (smp_found_config) {
+ ioapic_phys = mp_ioapics[i].mp_apicaddr;
+ if (!ioapic_phys) {
+ printk(KERN_ERR
+ "WARNING: bogus zero IO-APIC "
+ "address found in MPTABLE, "
+ "disabling IO/APIC support!\n");
+ smp_found_config = 0;
+ skip_ioapic_setup = 1;
+ goto fake_ioapic_page;
+ }
+ } else {
+fake_ioapic_page:
+ ioapic_phys = (unsigned long)
+ alloc_bootmem_pages(PAGE_SIZE);
+ ioapic_phys = __pa(ioapic_phys);
+ }
+ set_fixmap_nocache(idx, ioapic_phys);
+ printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx), ioapic_phys);
+ idx++;
+ }
+}
+#endif
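
The io_apic_32-xen.c hunks above are dominated by the 2.6.27 rename of the MP-table accessor fields from mpc_* to mp_* (mp_irqs[], mp_ioapics[]). The standalone C sketch below models the find_irq_entry() lookup shown in those hunks using the new field names; the struct layout, field types, and sample table are assumptions made only for this illustration and are not the kernel's definitions.

/*
 * Toy model of find_irq_entry() after the mpc_* -> mp_* rename.
 * Not kernel code: struct layout and table contents are made up.
 */
#include <stdio.h>

#define MP_APIC_ALL 0xff

struct mp_config_intsrc {
	int mp_irqtype;   /* interrupt type (e.g. mp_INT) */
	int mp_dstapic;   /* destination I/O APIC id, or MP_APIC_ALL */
	int mp_dstirq;    /* destination I/O APIC pin */
};

static struct mp_config_intsrc mp_irqs[] = {
	{ .mp_irqtype = 0, .mp_dstapic = 2,           .mp_dstirq = 1 },
	{ .mp_irqtype = 0, .mp_dstapic = MP_APIC_ALL, .mp_dstirq = 4 },
};

/* Mirrors the lookup loop in the hunks: match type, APIC id (or wildcard), pin. */
static int find_irq_entry(int apicid, int pin, int type)
{
	int i, n = sizeof(mp_irqs) / sizeof(mp_irqs[0]);

	for (i = 0; i < n; i++)
		if (mp_irqs[i].mp_irqtype == type &&
		    (mp_irqs[i].mp_dstapic == apicid ||
		     mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
		    mp_irqs[i].mp_dstirq == pin)
			return i;
	return -1;
}

int main(void)
{
	printf("entry for apic 2 pin 1: %d\n", find_irq_entry(2, 1, 0));
	printf("entry for apic 5 pin 4: %d\n", find_irq_entry(5, 4, 0));
	return 0;
}
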
--- head-2011-03-11.orig/arch/x86/kernel/io_apic_64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/io_apic_64-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -45,6 +45,7 @@
#include <asm/proto.h>
#include <asm/acpi.h>
#include <asm/dma.h>
+#include <asm/i8259.h>
#include <asm/nmi.h>
#include <asm/msidef.h>
#include <asm/hypertransport.h>
@@ -63,10 +64,16 @@ struct irq_cfg {
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
+static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly;
static int assign_irq_vector(int irq, cpumask_t mask);
+#ifndef CONFIG_XEN
+int first_system_vector = 0xfe;
+
+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
+#endif
+
#define __apicdebuginit __init
int sis_apic_bug; /* not actually supported, dummy for compile */
@@ -89,14 +96,14 @@ unsigned long io_apic_irqs;
#define clear_IO_APIC() ((void)0)
#else
-int timer_over_8254 __initdata = 1;
+int timer_through_8259 __initdata;
/* Where if anywhere is the i8259 connect in external int mode */
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
#endif
static DEFINE_SPINLOCK(ioapic_lock);
-DEFINE_SPINLOCK(vector_lock);
+static DEFINE_SPINLOCK(vector_lock);
/*
* # of IRQ routing registers
@@ -104,15 +111,17 @@ DEFINE_SPINLOCK(vector_lock);
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
/* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
/*
* Rough estimation of how many shared IRQs there are, can
* be changed anytime.
@@ -141,7 +150,7 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
}
#endif
@@ -155,7 +164,7 @@ static inline unsigned int io_apic_read(
struct physdev_apic apic_op;
int ret;
- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
apic_op.reg = reg;
ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
if (ret)
@@ -173,7 +182,7 @@ static inline void io_apic_write(unsigne
#else
struct physdev_apic apic_op;
- apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
+ apic_op.apic_physbase = mp_ioapics[apic].mp_apicaddr;
apic_op.reg = reg;
apic_op.value = value;
WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
@@ -209,7 +218,7 @@ static bool io_apic_level_ack_pending(un
break;
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
- if ((reg >> 14) & 1) {
+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
spin_unlock_irqrestore(&ioapic_lock, flags);
return true;
}
@@ -308,7 +317,7 @@ static void __target_IO_APIC_irq(unsigne
break;
io_apic_write(apic, 0x11 + pin*2, dest);
reg = io_apic_read(apic, 0x10 + pin*2);
- reg &= ~0x000000ff;
+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
reg |= vector;
io_apic_modify(apic, reg);
if (!entry->next)
@@ -372,6 +381,26 @@ static void add_pin_to_irq(unsigned int
}
#ifndef CONFIG_XEN
+/*
+ * Reroute an IRQ to a different pin.
+ */
+static void __init replace_pin_at_irq(unsigned int irq,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry = irq_2_pin + irq;
+
+ while (1) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ }
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+}
+
#define __DO_ACTION(R, ACTION, FINAL) \
\
{ \
@@ -399,10 +428,11 @@ static void add_pin_to_irq(unsigned int
static void name##_IO_APIC_irq (unsigned int irq) \
__DO_ACTION(R, ACTION, FINAL)
-DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
- /* mask = 1 */
-DO_ACTION( __unmask, 0, &= 0xfffeffff, )
- /* mask = 0 */
+/* mask = 1 */
+DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
+
+/* mask = 0 */
+DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
static void mask_IO_APIC_irq (unsigned int irq)
{
@@ -465,22 +495,6 @@ static int __init disable_timer_pin_setu
}
__setup("disable_timer_pin_1", disable_timer_pin_setup);
-#ifndef CONFIG_XEN
-static int __init setup_disable_8254_timer(char *s)
-{
- timer_over_8254 = -1;
- return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
- timer_over_8254 = 2;
- return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-#endif /* !CONFIG_XEN */
-
/*
* Find the IRQ entry number of a certain pin.
@@ -490,10 +504,10 @@ static int find_irq_entry(int apic, int
int i;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == type &&
- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mpc_dstirq == pin)
+ if (mp_irqs[i].mp_irqtype == type &&
+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mp_dstirq == pin)
return i;
return -1;
@@ -508,13 +522,13 @@ static int __init find_isa_irq_pin(int i
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
- return mp_irqs[i].mpc_dstirq;
+ return mp_irqs[i].mp_dstirq;
}
return -1;
}
@@ -524,17 +538,17 @@ static int __init find_isa_irq_apic(int
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
break;
}
if (i < mp_irq_entries) {
int apic;
for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
return apic;
}
}
@@ -555,28 +569,28 @@ int IO_APIC_get_PCI_irq_vector(int bus,
apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
bus, slot, pin);
- if (mp_bus_id_to_pci_bus[bus] == -1) {
+ if (test_bit(bus, mp_bus_not_pci)) {
apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
break;
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mpc_irqtype &&
+ !mp_irqs[i].mp_irqtype &&
(bus == lbus) &&
- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
continue;
- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
return irq;
/*
* Use the first all-but-pin matching entry as a
@@ -604,13 +618,13 @@ int IO_APIC_get_PCI_irq_vector(int bus,
static int MPBIOS_polarity(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].mpc_irqflag & 3)
+ switch (mp_irqs[idx].mp_irqflag & 3)
{
case 0: /* conforms, ie. bus-type dependent polarity */
if (test_bit(bus, mp_bus_not_pci))
@@ -646,13 +660,13 @@ static int MPBIOS_polarity(int idx)
static int MPBIOS_trigger(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int trigger;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
{
case 0: /* conforms, ie. bus-type dependent */
if (test_bit(bus, mp_bus_not_pci))
@@ -699,16 +713,16 @@ static inline int irq_trigger(int idx)
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
/*
* Debugging check, we are in big trouble if this message pops up!
*/
- if (mp_irqs[idx].mpc_dstirq != pin)
+ if (mp_irqs[idx].mp_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci)) {
- irq = mp_irqs[idx].mpc_srcbusirq;
+ irq = mp_irqs[idx].mp_srcbusirq;
} else {
/*
* PCI IRQs are mapped in order
@@ -722,6 +736,19 @@ static int pin_2_irq(int idx, int apic,
return irq;
}
+void lock_vector_lock(void)
+{
+ /* Used to the online set of cpus does not change
+ * during assign_irq_vector.
+ */
+ spin_lock(&vector_lock);
+}
+
+void unlock_vector_lock(void)
+{
+ spin_unlock(&vector_lock);
+}
+
static int __assign_irq_vector(int irq, cpumask_t mask)
{
struct physdev_irq irq_op;
@@ -773,7 +800,7 @@ static void __clear_irq_vector(int irq)
vector = cfg->vector;
cpus_and(mask, cfg->domain, cpu_online_map);
- for_each_cpu_mask(cpu, mask)
+ for_each_cpu_mask_nr(cpu, mask)
per_cpu(vector_irq, cpu)[vector] = -1;
cfg->vector = 0;
@@ -842,7 +869,7 @@ static void setup_IO_APIC_irq(int apic,
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
"IRQ %d Mode:%i Active:%i)\n",
- apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
+ apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
irq, trigger, polarity);
/*
@@ -883,10 +910,10 @@ static void __init setup_IO_APIC_irqs(vo
idx = find_irq_entry(apic,pin,mp_INT);
if (idx == -1) {
if (first_notcon) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
first_notcon = 0;
} else
- apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
continue;
}
if (!first_notcon) {
@@ -908,26 +935,21 @@ static void __init setup_IO_APIC_irqs(vo
#ifndef CONFIG_XEN
/*
- * Set up the 8259A-master output pin as broadcast to all
- * CPUs.
+ * Set up the timer pin, possibly with the 8259A-master behind.
*/
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+ int vector)
{
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
- disable_8259A_irq(0);
-
- /* mask LVT0 */
- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
-
/*
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
entry.dest_mode = INT_DEST_MODE;
- entry.mask = 0; /* unmask IRQ now */
+ entry.mask = 1; /* mask IRQ now */
entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.delivery_mode = INT_DELIVERY_MODE;
entry.polarity = 0;
@@ -936,7 +958,7 @@ static void __init setup_ExtINT_IRQ0_pin
/*
* The timer IRQ doesn't have to know that behind the
- * scene we have a 8259A-master in AEOI mode ...
+ * scene we may have a 8259A-master in AEOI mode ...
*/
set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
@@ -944,8 +966,6 @@ static void __init setup_ExtINT_IRQ0_pin
* Add it to the IO-APIC irq-routing table:
*/
ioapic_write_entry(apic, pin, entry);
-
- enable_8259A_irq(0);
}
void __apicdebuginit print_IO_APIC(void)
@@ -962,7 +982,7 @@ void __apicdebuginit print_IO_APIC(void)
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
@@ -980,7 +1000,7 @@ void __apicdebuginit print_IO_APIC(void)
spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
@@ -1072,6 +1092,7 @@ void __apicdebuginit print_local_APIC(vo
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
printk(KERN_INFO "... APIC VERSION: %08x\n", v);
@@ -1141,7 +1162,7 @@ void __apicdebuginit print_local_APIC(vo
void print_all_local_APICs (void)
{
- on_each_cpu(print_local_APIC, NULL, 1, 1);
+ on_each_cpu(print_local_APIC, NULL, 1);
}
void __apicdebuginit print_PIC(void)
@@ -1175,6 +1196,8 @@ void __apicdebuginit print_PIC(void)
v = inb(0x4d1) << 8 | inb(0x4d0);
printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
}
+#else
+void __apicdebuginit print_IO_APIC(void) {}
#endif /* !CONFIG_XEN */
void __init enable_IO_APIC(void)
@@ -1359,12 +1382,10 @@ static unsigned int startup_ioapic_irq(u
static int ioapic_retrigger_irq(unsigned int irq)
{
struct irq_cfg *cfg = &irq_cfg[irq];
- cpumask_t mask;
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- mask = cpumask_of_cpu(first_cpu(cfg->domain));
- send_IPI_mask(mask, cfg->vector);
+ send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
spin_unlock_irqrestore(&vector_lock, flags);
return 1;
@@ -1545,7 +1566,7 @@ static inline void init_IO_APIC_traps(vo
}
#ifndef CONFIG_XEN
-static void enable_lapic_irq (unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
{
unsigned long v;
@@ -1553,7 +1574,7 @@ static void enable_lapic_irq (unsigned i
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
-static void disable_lapic_irq (unsigned int irq)
+static void mask_lapic_irq(unsigned int irq)
{
unsigned long v;
@@ -1566,19 +1587,20 @@ static void ack_lapic_irq (unsigned int
ack_APIC_irq();
}
-static void end_lapic_irq (unsigned int i) { /* nothing */ }
-
-static struct hw_interrupt_type lapic_irq_type __read_mostly = {
- .name = "local-APIC",
- .typename = "local-APIC-edge",
- .startup = NULL, /* startup_irq() not used for IRQ0 */
- .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
- .enable = enable_lapic_irq,
- .disable = disable_lapic_irq,
- .ack = ack_lapic_irq,
- .end = end_lapic_irq,
+static struct irq_chip lapic_chip __read_mostly = {
+ .name = "local-APIC",
+ .mask = mask_lapic_irq,
+ .unmask = unmask_lapic_irq,
+ .ack = ack_lapic_irq,
};
+static void lapic_register_intr(int irq)
+{
+ irq_desc[irq].status &= ~IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+ "edge");
+}
+
static void __init setup_nmi(void)
{
/*
@@ -1664,6 +1686,7 @@ static inline void __init check_timer(vo
struct irq_cfg *cfg = irq_cfg + 0;
int apic1, pin1, apic2, pin2;
unsigned long flags;
+ int no_pin1 = 0;
local_irq_save(flags);
@@ -1674,34 +1697,48 @@ static inline void __init check_timer(vo
assign_irq_vector(0, TARGET_CPUS);
/*
- * Subtle, code in do_timer_interrupt() expects an AEOI
- * mode for the 8259A whenever interrupts are routed
- * through I/O APICs. Also IRQ0 has to be enabled in
- * the 8259A which implies the virtual wire has to be
- * disabled in the local APIC.
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+ * wire has to be disabled in the local APIC.
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
- if (timer_over_8254 > 0)
- enable_8259A_irq(0);
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
- apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
- cfg->vector, apic1, pin1, apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+ "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ cfg->vector, apic1, pin1, apic2, pin2);
+
+ /*
+ * Some BIOS writers are clueless and report the ExtINTA
+ * I/O APIC input from the cascaded 8259A as the timer
+ * interrupt input. So just in case, if only one pin
+ * was found above, try it both directly and through the
+ * 8259A.
+ */
+ if (pin1 == -1) {
+ pin1 = pin2;
+ apic1 = apic2;
+ no_pin1 = 1;
+ } else if (pin2 == -1) {
+ pin2 = pin1;
+ apic2 = apic1;
+ }
if (pin1 != -1) {
/*
* Ok, does IRQ0 through the IOAPIC work?
*/
+ if (no_pin1) {
+ add_pin_to_irq(0, apic1, pin1);
+ setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ }
unmask_IO_APIC_irq(0);
if (!no_timer_check && timer_irq_works()) {
- nmi_watchdog_default();
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
setup_nmi();
enable_8259A_irq(0);
}
@@ -1710,54 +1747,62 @@ static inline void __init check_timer(vo
goto out;
}
clear_IO_APIC_pin(apic1, pin1);
- apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
- "connected to IO-APIC\n");
- }
-
- apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
- "through the 8259A ... ");
- if (pin2 != -1) {
- apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
- apic2, pin2);
+ if (!no_pin1)
+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+ "8254 timer not connected to IO-APIC\n");
+
+ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+ "(IRQ0) through the 8259A ...\n");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "..... (found apic %d pin %d) ...\n", apic2, pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
- setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+ unmask_IO_APIC_irq(0);
+ enable_8259A_irq(0);
if (timer_irq_works()) {
- apic_printk(APIC_VERBOSE," works.\n");
- nmi_watchdog_default();
+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+ timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
setup_nmi();
+ enable_8259A_irq(0);
}
goto out;
}
/*
* Cleanup, just in case ...
*/
+ disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
- apic_printk(APIC_VERBOSE," failed.\n");
if (nmi_watchdog == NMI_IO_APIC) {
- printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = 0;
+ apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+ "through the IO-APIC - disabling NMI Watchdog!\n");
+ nmi_watchdog = NMI_NONE;
}
- apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as Virtual Wire IRQ...\n");
- disable_8259A_irq(0);
- irq_desc[0].chip = &lapic_irq_type;
+ lapic_register_intr(0);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
enable_8259A_irq(0);
if (timer_irq_works()) {
- apic_printk(APIC_VERBOSE," works.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
+ disable_8259A_irq(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
- apic_printk(APIC_VERBOSE," failed.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
- apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as ExtINT IRQ...\n");
init_8259A(0);
make_8259A_irq(0);
@@ -1766,11 +1811,12 @@ static inline void __init check_timer(vo
unlock_ExtINT_logic();
if (timer_irq_works()) {
- apic_printk(APIC_VERBOSE," works.\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
- apic_printk(APIC_VERBOSE," failed :(.\n");
- panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
+ "report. Then try booting with the 'noapic' option.\n");
out:
local_irq_restore(flags);
}
@@ -1788,10 +1834,21 @@ __setup("no_timer_check", notimercheck);
/*
*
- * IRQs that are handled by the PIC in the MPS IOAPIC case.
- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
- * Linux doesn't really care, as it's not actually used
- * for any interrupt handling anyway.
+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+ * to devices. However there may be an I/O APIC pin available for
+ * this interrupt regardless. The pin may be left unconnected, but
+ * typically it will be reused as an ExtINT cascade interrupt for
+ * the master 8259A. In the MPS case such a pin will normally be
+ * reported as an ExtINT interrupt in the MP table. With ACPI
+ * there is no provision for ExtINT interrupts, and in the absence
+ * of an override it would be treated as an ordinary ISA I/O APIC
+ * interrupt, that is edge-triggered and unmasked by default. We
+ * used to do this, but it caused problems on some systems because
+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+ * the same ExtINT cascade interrupt to drive the local APIC of the
+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
+ * the I/O APIC in all cases now. No actual device should request
+ * it anyway. --macro
*/
#define PIC_IRQS (1<<2)
@@ -1799,10 +1856,7 @@ void __init setup_IO_APIC(void)
{
enable_IO_APIC();
- if (acpi_ioapic)
- io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
- else
- io_apic_irqs = ~PIC_IRQS;
+ io_apic_irqs = ~PIC_IRQS;
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
@@ -1851,8 +1905,8 @@ static int ioapic_resume(struct sys_devi
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+ if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2254,8 +2308,8 @@ int acpi_get_override_irq(int bus_irq, i
return -1;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == mp_INT &&
- mp_irqs[i].mpc_srcbusirq == bus_irq)
+ if (mp_irqs[i].mp_irqtype == mp_INT &&
+ mp_irqs[i].mp_srcbusirq == bus_irq)
break;
if (i >= mp_irq_entries)
return -1;
@@ -2349,7 +2403,7 @@ void __init ioapic_init_mappings(void)
ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+ ioapic_phys = mp_ioapics[i].mp_apicaddr;
} else {
ioapic_phys = (unsigned long)
alloc_bootmem_pages(PAGE_SIZE);
--- head-2011-03-11.orig/arch/x86/kernel/ldt-xen.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/ldt-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -20,9 +20,9 @@
#include <asm/mmu_context.h>
#ifdef CONFIG_SMP
-static void flush_ldt(void *null)
+static void flush_ldt(void *current_mm)
{
- if (current->active_mm)
+ if (current->active_mm == current_mm)
load_LDT(&current->active_mm->context);
}
#endif
@@ -62,8 +62,6 @@ static int alloc_ldt(mm_context_t *pc, i
if (reload) {
#ifdef CONFIG_SMP
- cpumask_t mask;
-
preempt_disable();
#endif
make_pages_readonly(newldt,
@@ -71,9 +69,9 @@ static int alloc_ldt(mm_context_t *pc, i
XENFEAT_writable_descriptor_tables);
load_LDT(pc);
#ifdef CONFIG_SMP
- mask = cpumask_of_cpu(smp_processor_id());
- if (!cpus_equal(current->mm->cpu_vm_mask, mask))
- smp_call_function(flush_ldt, NULL, 1, 1);
+ if (!cpus_equal(current->mm->cpu_vm_mask,
+ cpumask_of_cpu(smp_processor_id())))
+ smp_call_function(flush_ldt, current->mm, 1);
preempt_enable();
#endif
}
--- head-2011-03-11.orig/arch/x86/kernel/machine_kexec_32.c 2011-01-31 14:54:00.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/machine_kexec_32.c 2011-02-01 14:38:38.000000000 +0100
@@ -131,6 +131,8 @@ void machine_kexec_setup_load_arg(xen_ke
xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
}
int __init machine_kexec_setup_resources(struct resource *hypervisor,
--- head-2011-03-11.orig/arch/x86/kernel/microcode-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/microcode-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -5,13 +5,14 @@
* 2006 Shaohua Li <shaohua.li@intel.com>
*
* This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
+ * belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
- * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
- * Order Number 245472 or free download from:
- *
- * http://developer.intel.com/design/pentium4/manuals/245472.htm
+ * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
+ * Software Developer's Manual
+ * Order Number 253668 or free download from:
+ *
+ * http://developer.intel.com/design/pentium4/manuals/253668.htm
*
* For more information, go to http://www.urbanmyth.org/microcode
*
@@ -26,6 +27,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/sched.h>
+#include <linux/smp_lock.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -86,6 +88,7 @@ static int do_microcode_update (const vo
static int microcode_open (struct inode *unused1, struct file *unused2)
{
+ cycle_kernel_lock();
return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
}
@@ -162,7 +165,7 @@ static int request_microcode(void)
c->x86, c->x86_model, c->x86_mask);
error = request_firmware(&firmware, name, &microcode_pdev->dev);
if (error) {
- pr_debug("microcode: ucode data file %s load failed\n", name);
+ pr_debug("microcode: data file %s load failed\n", name);
return error;
}
@@ -183,6 +186,9 @@ static int __init microcode_init (void)
{
int error;
+ printk(KERN_INFO
+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
+
error = microcode_dev_init();
if (error)
return error;
@@ -195,8 +201,6 @@ static int __init microcode_init (void)
request_microcode();
- printk(KERN_INFO
- "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
return 0;
}
--- head-2011-03-11.orig/arch/x86/kernel/mpparse-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/mpparse-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -25,6 +25,9 @@
#include <asm/proto.h>
#include <asm/acpi.h>
#include <asm/bios_ebda.h>
+#include <asm/e820.h>
+#include <asm/trampoline.h>
+#include <asm/setup.h>
#include <mach_apic.h>
#ifdef CONFIG_X86_32
@@ -32,27 +35,10 @@
#include <mach_mpparse.h>
#endif
-/* Have we found an MP table */
-int smp_found_config;
-
-/*
- * Various Linux-internal data structures created from the
- * MP-table.
- */
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-int mp_bus_id_to_type[MAX_MP_BUSSES];
-#endif
-
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
-
-static int mp_current_pci_id;
-
-int pic_mode;
-
-/*
- * Intel MP BIOS table parsing routines:
- */
+static void *_bus_to_virt(unsigned long ma)
+{
+ return is_ISA_range(ma, ma) ? isa_bus_to_virt(ma) : bus_to_virt(ma);
+}
/*
* Checksum an MP configuration block.
@@ -68,19 +54,7 @@ static int __init mpf_checksum(unsigned
return sum & 0xFF;
}
-#ifdef CONFIG_X86_NUMAQ
-/*
- * Have to match translation table entries to main table entries by counter
- * hence the mpc_record variable .... can't see a less disgusting way of
- * doing this ....
- */
-
-static int mpc_record;
-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
- __cpuinitdata;
-#endif
-
-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
+static void __init MP_processor_info(struct mpc_config_processor *m)
{
#ifndef CONFIG_XEN
int apicid;
@@ -90,11 +64,12 @@ static void __cpuinit MP_processor_info(
disabled_cpus++;
return;
}
-#ifdef CONFIG_X86_NUMAQ
- apicid = mpc_apic_id(m, translation_table[mpc_record]);
-#else
- apicid = m->mpc_apicid;
-#endif
+
+ if (x86_quirks->mpc_apic_id)
+ apicid = x86_quirks->mpc_apic_id(m);
+ else
+ apicid = m->mpc_apicid;
+
if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
bootup_cpu = " (Bootup-CPU)";
boot_cpu_physical_apicid = m->mpc_apicid;
@@ -107,18 +82,17 @@ static void __cpuinit MP_processor_info(
#endif
}
+#ifdef CONFIG_X86_IO_APIC
static void __init MP_bus_info(struct mpc_config_bus *m)
{
char str[7];
-
memcpy(str, m->mpc_bustype, 6);
str[6] = 0;
-#ifdef CONFIG_X86_NUMAQ
- mpc_oem_bus_info(m, str, translation_table[mpc_record]);
-#else
- Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
-#endif
+ if (x86_quirks->mpc_oem_bus_info)
+ x86_quirks->mpc_oem_bus_info(m, str);
+ else
+ apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
#if MAX_MP_BUSSES < 256
if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -135,12 +109,10 @@ static void __init MP_bus_info(struct mp
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
-#ifdef CONFIG_X86_NUMAQ
- mpc_oem_pci_bus(m, translation_table[mpc_record]);
-#endif
+ if (x86_quirks->mpc_oem_pci_bus)
+ x86_quirks->mpc_oem_pci_bus(m);
+
clear_bit(m->mpc_busid, mp_bus_not_pci);
- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
- mp_current_pci_id++;
#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
@@ -151,6 +123,7 @@ static void __init MP_bus_info(struct mp
} else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
}
+#endif
#ifdef CONFIG_X86_IO_APIC
@@ -180,117 +153,111 @@ static void __init MP_ioapic_info(struct
if (bad_ioapic(m->mpc_apicaddr))
return;
- mp_ioapics[nr_ioapics] = *m;
+ mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
+ mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
+ mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
+ mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
+ mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
nr_ioapics++;
}
-static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
{
- mp_irqs[mp_irq_entries] = *m;
- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
}
-#endif
-
-static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
+static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
{
- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
+ (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
+ mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
}
-#ifdef CONFIG_X86_NUMAQ
-static void __init MP_translation_info(struct mpc_config_translation *m)
+static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
+ struct mp_config_intsrc *mp_irq)
{
- printk(KERN_INFO
- "Translation: record %d, type %d, quad %d, global %d, local %d\n",
- mpc_record, m->trans_type, m->trans_quad, m->trans_global,
- m->trans_local);
+ mp_irq->mp_dstapic = m->mpc_dstapic;
+ mp_irq->mp_type = m->mpc_type;
+ mp_irq->mp_irqtype = m->mpc_irqtype;
+ mp_irq->mp_irqflag = m->mpc_irqflag;
+ mp_irq->mp_srcbus = m->mpc_srcbus;
+ mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
+ mp_irq->mp_dstirq = m->mpc_dstirq;
+}
- if (mpc_record >= MAX_MPC_ENTRY)
- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
- else
- translation_table[mpc_record] = m; /* stash this for later */
- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
- node_set_online(m->trans_quad);
+static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
+ struct mpc_config_intsrc *m)
+{
+ m->mpc_dstapic = mp_irq->mp_dstapic;
+ m->mpc_type = mp_irq->mp_type;
+ m->mpc_irqtype = mp_irq->mp_irqtype;
+ m->mpc_irqflag = mp_irq->mp_irqflag;
+ m->mpc_srcbus = mp_irq->mp_srcbus;
+ m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
+ m->mpc_dstirq = mp_irq->mp_dstirq;
}
-/*
- * Read/parse the MPC oem tables
- */
+static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
+ struct mpc_config_intsrc *m)
+{
+ if (mp_irq->mp_dstapic != m->mpc_dstapic)
+ return 1;
+ if (mp_irq->mp_type != m->mpc_type)
+ return 2;
+ if (mp_irq->mp_irqtype != m->mpc_irqtype)
+ return 3;
+ if (mp_irq->mp_irqflag != m->mpc_irqflag)
+ return 4;
+ if (mp_irq->mp_srcbus != m->mpc_srcbus)
+ return 5;
+ if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
+ return 6;
+ if (mp_irq->mp_dstirq != m->mpc_dstirq)
+ return 7;
+
+ return 0;
+}
-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
- unsigned short oemsize)
+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
{
- int count = sizeof(*oemtable); /* the header size */
- unsigned char *oemptr = ((unsigned char *)oemtable) + count;
+ int i;
- mpc_record = 0;
- printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
- oemtable);
- if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
- printk(KERN_WARNING
- "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
- oemtable->oem_signature[0], oemtable->oem_signature[1],
- oemtable->oem_signature[2], oemtable->oem_signature[3]);
- return;
- }
- if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
- printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
- return;
- }
- while (count < oemtable->oem_length) {
- switch (*oemptr) {
- case MP_TRANSLATION:
- {
- struct mpc_config_translation *m =
- (struct mpc_config_translation *)oemptr;
- MP_translation_info(m);
- oemptr += sizeof(*m);
- count += sizeof(*m);
- ++mpc_record;
- break;
- }
- default:
- {
- printk(KERN_WARNING
- "Unrecognised OEM table entry type! - %d\n",
- (int)*oemptr);
- return;
- }
- }
+ print_MP_intsrc_info(m);
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
+ return;
}
+
+ assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
}
-static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid)
+#endif
+
+static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
{
- if (strncmp(oem, "IBM NUMA", 8))
- printk("Warning! May not be a NUMA-Q system!\n");
- if (mpc->mpc_oemptr)
- smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
- mpc->mpc_oemsize);
+ apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
+ " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+ m->mpc_irqtype, m->mpc_irqflag & 3,
+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
}
-#endif /* CONFIG_X86_NUMAQ */
/*
* Read/parse the MPC
*/
-static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
+ char *str)
{
- char str[16];
- char oem[10];
- int count = sizeof(*mpc);
- unsigned char *mpt = ((unsigned char *)mpc) + count;
if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
@@ -313,20 +280,44 @@ static int __init smp_read_mpc(struct mp
}
memcpy(oem, mpc->mpc_oem, 8);
oem[8] = 0;
- printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
+ printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
memcpy(str, mpc->mpc_productid, 12);
str[12] = 0;
- printk("Product ID: %s ", str);
-#ifdef CONFIG_X86_32
- mps_oem_check(mpc, oem, str);
-#endif
- printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
+ printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
#ifndef CONFIG_XEN
printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
+#endif
+
+ return 1;
+}
+
+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+{
+ char str[16];
+ char oem[10];
+
+ int count = sizeof(*mpc);
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+ if (!smp_check_mpc(mpc, oem, str))
+ return 0;
+#ifdef CONFIG_X86_32
+ /*
+ * need to make sure summit and es7000's mps_oem_check is safe to be
+ * called early via genericarch's mps_oem_check
+ */
+ if (early) {
+#ifdef CONFIG_X86_NUMAQ
+ numaq_mps_oem_check(mpc, oem, str);
+#endif
+ } else
+ mps_oem_check(mpc, oem, str);
+#endif
+#ifndef CONFIG_XEN
/* save the local APIC address, it might be non-default */
if (!acpi_lapic)
mp_lapic_addr = mpc->mpc_lapic;
@@ -335,12 +326,17 @@ static int __init smp_read_mpc(struct mp
if (early)
return 1;
+ if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
+ struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
+ x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
+ }
+
/*
* Now process the configuration blocks.
*/
-#ifdef CONFIG_X86_NUMAQ
- mpc_record = 0;
-#endif
+ if (x86_quirks->mpc_record)
+ *x86_quirks->mpc_record = 0;
+
while (count < mpc->mpc_length) {
switch (*mpt) {
case MP_PROCESSOR:
@@ -358,7 +354,9 @@ static int __init smp_read_mpc(struct mp
{
struct mpc_config_bus *m =
(struct mpc_config_bus *)mpt;
+#ifdef CONFIG_X86_IO_APIC
MP_bus_info(m);
+#endif
mpt += sizeof(*m);
count += sizeof(*m);
break;
@@ -404,10 +402,14 @@ static int __init smp_read_mpc(struct mp
count = mpc->mpc_length;
break;
}
-#ifdef CONFIG_X86_NUMAQ
- ++mpc_record;
-#endif
+ if (x86_quirks->mpc_record)
+ (*x86_quirks->mpc_record)++;
}
+
+#ifdef CONFIG_X86_GENERICARCH
+ generic_bigsmp_probe();
+#endif
+
setup_apic_routing();
if (!num_processors)
printk(KERN_ERR "MPTABLE: no processors registered!\n");
@@ -433,7 +435,7 @@ static void __init construct_default_ioi
intsrc.mpc_type = MP_INTSRC;
intsrc.mpc_irqflag = 0; /* conforming */
intsrc.mpc_srcbus = 0;
- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+ intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
intsrc.mpc_irqtype = mp_INT;
@@ -494,42 +496,11 @@ static void __init construct_default_ioi
MP_intsrc_info(&intsrc);
}
-#endif
-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+static void __init construct_ioapic_table(int mpc_default_type)
{
- struct mpc_config_processor processor;
- struct mpc_config_bus bus;
-#ifdef CONFIG_X86_IO_APIC
struct mpc_config_ioapic ioapic;
-#endif
- struct mpc_config_lintsrc lintsrc;
- int linttypes[2] = { mp_ExtINT, mp_NMI };
- int i;
-
-#ifndef CONFIG_XEN
- /*
- * local APIC has default address
- */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-#endif
-
- /*
- * 2 CPUs, numbered 0 & 1.
- */
- processor.mpc_type = MP_PROCESSOR;
- /* Either an integrated APIC or a discrete 82489DX. */
- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
- processor.mpc_cpuflag = CPU_ENABLED;
- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
- processor.mpc_reserved[0] = 0;
- processor.mpc_reserved[1] = 0;
- for (i = 0; i < 2; i++) {
- processor.mpc_apicid = i;
- MP_processor_info(&processor);
- }
+ struct mpc_config_bus bus;
bus.mpc_type = MP_BUS;
bus.mpc_busid = 0;
@@ -558,7 +529,6 @@ static inline void __init construct_defa
MP_bus_info(&bus);
}
-#ifdef CONFIG_X86_IO_APIC
ioapic.mpc_type = MP_IOAPIC;
ioapic.mpc_apicid = 2;
ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
@@ -570,7 +540,44 @@ static inline void __init construct_defa
* We set up most of the low 16 IO-APIC pins according to MPS rules.
*/
construct_default_ioirq_mptable(mpc_default_type);
+}
+#else
+static inline void __init construct_ioapic_table(int mpc_default_type) { }
+#endif
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+ struct mpc_config_processor processor;
+ struct mpc_config_lintsrc lintsrc;
+ int linttypes[2] = { mp_ExtINT, mp_NMI };
+ int i;
+
+#ifndef CONFIG_XEN
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
#endif
+
+ /*
+ * 2 CPUs, numbered 0 & 1.
+ */
+ processor.mpc_type = MP_PROCESSOR;
+ /* Either an integrated APIC or a discrete 82489DX. */
+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ processor.mpc_cpuflag = CPU_ENABLED;
+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+ processor.mpc_reserved[0] = 0;
+ processor.mpc_reserved[1] = 0;
+ for (i = 0; i < 2; i++) {
+ processor.mpc_apicid = i;
+ MP_processor_info(&processor);
+ }
+
+ construct_ioapic_table(mpc_default_type);
+
lintsrc.mpc_type = MP_LINTSRC;
lintsrc.mpc_irqflag = 0; /* conforming */
lintsrc.mpc_srcbusid = 0;
@@ -589,7 +596,7 @@ static struct intel_mp_floating *mpf_fou
* Scan the memory blocks for an SMP configuration block.
*/
#ifndef CONFIG_XEN
-static void __init __get_smp_config(unsigned early)
+static void __init __get_smp_config(unsigned int early)
#else
void __init get_smp_config(void)
#define early 0
@@ -597,6 +604,10 @@ void __init get_smp_config(void)
{
struct intel_mp_floating *mpf = mpf_found;
+ if (x86_quirks->mach_get_smp_config) {
+ if (x86_quirks->mach_get_smp_config(early))
+ return;
+ }
if (acpi_lapic && early)
return;
/*
@@ -613,7 +624,7 @@ void __init get_smp_config(void)
printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
mpf->mpf_specification);
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
if (mpf->mpf_feature2 & (1 << 7)) {
printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
pic_mode = 1;
@@ -646,8 +657,10 @@ void __init get_smp_config(void)
* Read the physical hardware table. Anything here will
* override the defaults.
*/
- if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) {
+ if (!smp_read_mpc(_bus_to_virt(mpf->mpf_physptr), early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 0;
+#endif
printk(KERN_ERR
"BIOS bug, MP table errors detected!...\n");
printk(KERN_ERR "... disabling SMP support. "
@@ -704,10 +717,11 @@ void __init get_smp_config(void)
static int __init smp_scan_config(unsigned long base, unsigned long length,
unsigned reserve)
{
- unsigned int *bp = isa_bus_to_virt(base);
+ unsigned int *bp = _bus_to_virt(base);
struct intel_mp_floating *mpf;
- Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
+ apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
+ bp, length);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
@@ -717,16 +731,22 @@ static int __init smp_scan_config(unsign
!mpf_checksum((unsigned char *)bp, 16) &&
((mpf->mpf_specification == 1)
|| (mpf->mpf_specification == 4))) {
-
+#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 1;
+#endif
mpf_found = mpf;
-#ifdef CONFIG_X86_32
+
#ifndef CONFIG_XEN
printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
mpf, virt_to_phys(mpf));
- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
+
+ if (!reserve)
+ return 1;
+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
BOOTMEM_DEFAULT);
if (mpf->mpf_physptr) {
+ unsigned long size = PAGE_SIZE;
+#ifdef CONFIG_X86_32
/*
* We cannot access to MPC table to compute
* table size yet, as only few megabytes from
@@ -736,27 +756,18 @@ static int __init smp_scan_config(unsign
* PAGE_SIZE from mpg->mpf_physptr yields BUG()
* in reserve_bootmem.
*/
- unsigned long size = PAGE_SIZE;
unsigned long end = max_low_pfn * PAGE_SIZE;
if (mpf->mpf_physptr + size > end)
size = end - mpf->mpf_physptr;
- reserve_bootmem(mpf->mpf_physptr, size,
+#endif
+ reserve_bootmem_generic(mpf->mpf_physptr, size,
BOOTMEM_DEFAULT);
}
#else
printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
- mpf, ((void *)bp - isa_bus_to_virt(base)) + base);
-#endif
-#elif !defined(CONFIG_XEN)
- if (!reserve)
- return 1;
-
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
- if (mpf->mpf_physptr)
- reserve_bootmem_generic(mpf->mpf_physptr,
- PAGE_SIZE);
+ mpf, ((void *)bp - _bus_to_virt(base)) + base);
#endif
- return 1;
+ return 1;
}
bp += 4;
length -= 16;
@@ -764,12 +775,16 @@ static int __init smp_scan_config(unsign
return 0;
}
-static void __init __find_smp_config(unsigned reserve)
+static void __init __find_smp_config(unsigned int reserve)
{
#ifndef CONFIG_XEN
unsigned int address;
#endif
+ if (x86_quirks->mach_find_smp_config) {
+ if (x86_quirks->mach_find_smp_config(reserve))
+ return;
+ }
/*
* FIXME: Linux assumes you have 640K of base ram..
* this continues the error...
@@ -816,302 +831,297 @@ void __init find_smp_config(void)
__find_smp_config(1);
}
-/* --------------------------------------------------------------------------
- ACPI-based MP Configuration
- -------------------------------------------------------------------------- */
-
-/*
- * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
- */
-int es7000_plat;
-
-#ifdef CONFIG_ACPI
+#ifdef CONFIG_X86_IO_APIC
+static u8 __initdata irq_used[MAX_IRQ_SOURCES];
-#ifdef CONFIG_X86_IO_APIC
+static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
+{
+ int i;
-#define MP_ISA_BUS 0
+ if (m->mpc_irqtype != mp_INT)
+ return 0;
-extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
+ if (m->mpc_irqflag != 0x0f)
+ return 0;
-static int mp_find_ioapic(int gsi)
-{
- int i = 0;
+ /* not legacy */
- /* Find the IOAPIC that manages this GSI. */
- for (i = 0; i < nr_ioapics; i++) {
- if ((gsi >= mp_ioapic_routing[i].gsi_base)
- && (gsi <= mp_ioapic_routing[i].gsi_end))
- return i;
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (mp_irqs[i].mp_irqtype != mp_INT)
+ continue;
+
+ if (mp_irqs[i].mp_irqflag != 0x0f)
+ continue;
+
+ if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
+ continue;
+ if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
+ continue;
+ if (irq_used[i]) {
+ /* already claimed */
+ return -2;
+ }
+ irq_used[i] = 1;
+ return i;
}
- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ /* not found */
return -1;
}
-static u8 __init uniq_ioapic_id(u8 id)
-{
-#ifdef CONFIG_X86_32
-#ifndef CONFIG_XEN
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return io_apic_get_unique_id(nr_ioapics, id);
- else
-#endif
- return id;
-#else
- int i;
- DECLARE_BITMAP(used, 256);
- bitmap_zero(used, 256);
- for (i = 0; i < nr_ioapics; i++) {
- struct mpc_config_ioapic *ia = &mp_ioapics[i];
- __set_bit(ia->mpc_apicid, used);
- }
- if (!test_bit(id, used))
- return id;
- return find_first_zero_bit(used, 256);
+#define SPARE_SLOT_NUM 20
+
+static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
#endif
-}
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+static int __init replace_intsrc_all(struct mp_config_table *mpc,
+ unsigned long mpc_new_phys,
+ unsigned long mpc_new_length)
{
- int idx = 0;
-
- if (bad_ioapic(address))
- return;
+#ifdef CONFIG_X86_IO_APIC
+ int i;
+ int nr_m_spare = 0;
+#endif
- idx = nr_ioapics;
+ int count = sizeof(*mpc);
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
- mp_ioapics[idx].mpc_type = MP_IOAPIC;
- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
- mp_ioapics[idx].mpc_apicaddr = address;
+ printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
+ while (count < mpc->mpc_length) {
+ switch (*mpt) {
+ case MP_PROCESSOR:
+ {
+ struct mpc_config_processor *m =
+ (struct mpc_config_processor *)mpt;
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_BUS:
+ {
+ struct mpc_config_bus *m =
+ (struct mpc_config_bus *)mpt;
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_IOAPIC:
+ {
+ mpt += sizeof(struct mpc_config_ioapic);
+ count += sizeof(struct mpc_config_ioapic);
+ break;
+ }
+ case MP_INTSRC:
+ {
+#ifdef CONFIG_X86_IO_APIC
+ struct mpc_config_intsrc *m =
+ (struct mpc_config_intsrc *)mpt;
-#ifndef CONFIG_XEN
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+ apic_printk(APIC_VERBOSE, "OLD ");
+ print_MP_intsrc_info(m);
+ i = get_MP_intsrc_index(m);
+ if (i > 0) {
+ assign_to_mpc_intsrc(&mp_irqs[i], m);
+ apic_printk(APIC_VERBOSE, "NEW ");
+ print_mp_irq_info(&mp_irqs[i]);
+ } else if (!i) {
+ /* legacy, do nothing */
+ } else if (nr_m_spare < SPARE_SLOT_NUM) {
+ /*
+ * not found (-1) or duplicated (-2)
+ * entries are invalid;
+ * remember the slot so it can be reused later
+ */
+ m_spare[nr_m_spare] = m;
+ nr_m_spare++;
+ }
#endif
- mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
- mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
-#else
- mp_ioapics[idx].mpc_apicver = 0;
+ mpt += sizeof(struct mpc_config_intsrc);
+ count += sizeof(struct mpc_config_intsrc);
+ break;
+ }
+ case MP_LINTSRC:
+ {
+ struct mpc_config_lintsrc *m =
+ (struct mpc_config_lintsrc *)mpt;
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ default:
+ /* wrong mptable */
+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
+ printk(KERN_ERR "type %x\n", *mpt);
+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
+ 1, mpc, mpc->mpc_length, 1);
+ goto out;
+ }
+ }
+
+#ifdef CONFIG_X86_IO_APIC
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (irq_used[i])
+ continue;
+
+ if (mp_irqs[i].mp_irqtype != mp_INT)
+ continue;
+
+ if (mp_irqs[i].mp_irqflag != 0x0f)
+ continue;
+
+ if (nr_m_spare > 0) {
+ apic_printk(APIC_VERBOSE, "*NEW* found\n");
+ nr_m_spare--;
+ assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
+ m_spare[nr_m_spare] = NULL;
+ } else {
+ struct mpc_config_intsrc *m =
+ (struct mpc_config_intsrc *)mpt;
+ count += sizeof(struct mpc_config_intsrc);
+ if (!mpc_new_phys) {
+ printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
+ } else {
+ if (count <= mpc_new_length)
+ printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
+ else {
+ printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
+ goto out;
+ }
+ }
+ assign_to_mpc_intsrc(&mp_irqs[i], m);
+ mpc->mpc_length = count;
+ mpt += sizeof(struct mpc_config_intsrc);
+ }
+ print_mp_irq_info(&mp_irqs[i]);
+ }
#endif
- /*
- * Build basic GSI lookup table to facilitate gsi->io_apic lookups
- * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
- */
- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
- mp_ioapic_routing[idx].gsi_base = gsi_base;
- mp_ioapic_routing[idx].gsi_end = gsi_base +
- io_apic_get_redir_entries(idx);
-
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
- mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+out:
+ /* update checksum */
+ mpc->mpc_checksum = 0;
+ mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
+ mpc->mpc_length);
- nr_ioapics++;
+ return 0;
}
-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
- struct mpc_config_intsrc intsrc;
- int ioapic = -1;
- int pin = -1;
-
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return;
- pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+static int __initdata enable_update_mptable;
- /*
- * TBD: This check is for faulty timer entries, where the override
- * erroneously sets the trigger to level, resulting in a HUGE
- * increase of timer interrupts!
- */
- if ((bus_irq == 0) && (trigger == 3))
- trigger = 1;
+static int __init update_mptable_setup(char *str)
+{
+ enable_update_mptable = 1;
+ return 0;
+}
+early_param("update_mptable", update_mptable_setup);
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_irqflag = (trigger << 2) | polarity;
- intsrc.mpc_srcbus = MP_ISA_BUS;
- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
- intsrc.mpc_dstirq = pin; /* INTIN# */
+static unsigned long __initdata mpc_new_phys;
+static unsigned long mpc_new_length __initdata = 4096;
- MP_intsrc_info(&intsrc);
+/* alloc_mptable or alloc_mptable=4k */
+static int __initdata alloc_mptable;
+static int __init parse_alloc_mptable_opt(char *p)
+{
+ enable_update_mptable = 1;
+ alloc_mptable = 1;
+ if (!p)
+ return 0;
+ mpc_new_length = memparse(p, &p);
+ return 0;
}
+early_param("alloc_mptable", parse_alloc_mptable_opt);
-void __init mp_config_acpi_legacy_irqs(void)
+void __init early_reserve_e820_mpc_new(void)
{
- struct mpc_config_intsrc intsrc;
- int i = 0;
- int ioapic = -1;
-
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
- /*
- * Fabricate the legacy ISA bus (bus #31).
- */
- mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
-#endif
- set_bit(MP_ISA_BUS, mp_bus_not_pci);
- Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
-
- /*
- * Older generations of ES7000 have no legacy identity mappings
- */
- if (es7000_plat == 1)
- return;
-
- /*
- * Locate the IOAPIC that manages the ISA IRQs (0-15).
- */
- ioapic = mp_find_ioapic(0);
- if (ioapic < 0)
- return;
-
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* Conforming */
- intsrc.mpc_srcbus = MP_ISA_BUS;
-#ifdef CONFIG_X86_IO_APIC
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+ if (enable_update_mptable && alloc_mptable) {
+ u64 startt = 0;
+#ifdef CONFIG_X86_TRAMPOLINE
+ startt = TRAMPOLINE_BASE;
#endif
- /*
- * Use the default configuration for the IRQs 0-15. Unless
- * overridden by (MADT) interrupt source override entries.
- */
- for (i = 0; i < 16; i++) {
- int idx;
-
- for (idx = 0; idx < mp_irq_entries; idx++) {
- struct mpc_config_intsrc *irq = mp_irqs + idx;
-
- /* Do we already have a mapping for this ISA IRQ? */
- if (irq->mpc_srcbus == MP_ISA_BUS
- && irq->mpc_srcbusirq == i)
- break;
-
- /* Do we already have a mapping for this IOAPIC pin */
- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
- (irq->mpc_dstirq == i))
- break;
- }
-
- if (idx != mp_irq_entries) {
- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
- continue; /* IRQ already used */
- }
-
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_srcbusirq = i; /* Identity mapped */
- intsrc.mpc_dstirq = i;
-
- MP_intsrc_info(&intsrc);
+ mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
}
}
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+static int __init update_mp_table(void)
{
- int ioapic;
- int ioapic_pin;
-#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
-#define MAX_GSI_NUM 4096
-#define IRQ_COMPRESSION_START 64
+ char str[16];
+ char oem[10];
+ struct intel_mp_floating *mpf;
+ struct mp_config_table *mpc;
+ struct mp_config_table *mpc_new;
+
+ if (!enable_update_mptable)
+ return 0;
+
+ mpf = mpf_found;
+ if (!mpf)
+ return 0;
- static int pci_irq = IRQ_COMPRESSION_START;
/*
- * Mapping between Global System Interrupts, which
- * represent all possible interrupts, and IRQs
- * assigned to actual devices.
+ * Now see if we need to go further.
*/
- static int gsi_to_irq[MAX_GSI_NUM];
-#else
+ if (mpf->mpf_feature1 != 0)
+ return 0;
- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
- return gsi;
-#endif
+ if (!mpf->mpf_physptr)
+ return 0;
- /* Don't set up the ACPI SCI because it's already set up */
- if (acpi_gbl_FADT.sci_interrupt == gsi)
- return gsi;
+ mpc = _bus_to_virt(mpf->mpf_physptr);
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0) {
- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
- return gsi;
- }
+ if (!smp_check_mpc(mpc, oem, str))
+ return 0;
- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+ printk(KERN_INFO "mpf: %lx\n", (long)arbitrary_virt_to_machine(mpf));
+ printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
-#ifndef CONFIG_X86_32
- if (ioapic_renumber_irq)
- gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
+ if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
+ mpc_new_phys = 0;
+ printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
+ mpc_new_length);
+ }
+
+ if (!mpc_new_phys) {
+ unsigned char old, new;
+ /* check if we can change the position */
+ mpc->mpc_checksum = 0;
+ old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+ mpc->mpc_checksum = 0xff;
+ new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+ if (old == new) {
+ printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
+ return 0;
+ }
+ printk(KERN_INFO "use in-position replacing\n");
+ } else {
+ maddr_t mpc_new_bus;
- /*
- * Avoid pin reprogramming. PRTs typically include entries
- * with redundant pin->gsi mappings (but unique PCI devices);
- * we only program the IOAPIC on the first.
- */
- if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
- printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
- ioapic_pin);
- return gsi;
- }
- if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
- return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
- return gsi;
-#endif
+ mpc_new_bus = phys_to_machine(mpc_new_phys);
+ mpf->mpf_physptr = mpc_new_bus;
+ mpc_new = phys_to_virt(mpc_new_phys);
+ memcpy(mpc_new, mpc, mpc->mpc_length);
+ mpc = mpc_new;
+ /* check if we can modify that */
+ if (mpc_new_bus - mpf->mpf_physptr) {
+ struct intel_mp_floating *mpf_new;
+ /* steal 16 bytes from [0, 1k) */
+ printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
+ mpf_new = isa_bus_to_virt(0x400 - 16);
+ memcpy(mpf_new, mpf, 16);
+ mpf = mpf_new;
+ mpf->mpf_physptr = mpc_new_bus;
+ }
+ mpf->mpf_checksum = 0;
+ mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
+ printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
}
- set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
/*
- * For GSI >= 64, use IRQ compression
+ * only replace the one with mp_INT and
+ * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
+ * already in mp_irqs, stored by ... and mp_config_acpi_gsi,
+ * may need pci=routeirq for all coverage
*/
- if ((gsi >= IRQ_COMPRESSION_START)
- && (triggering == ACPI_LEVEL_SENSITIVE)) {
- /*
- * For PCI devices assign IRQs in order, avoiding gaps
- * due to unused I/O APIC pins.
- */
- int irq = gsi;
- if (gsi < MAX_GSI_NUM) {
- /*
- * Retain the VIA chipset work-around (gsi > 15), but
- * avoid a problem where the 8254 timer (IRQ0) is setup
- * via an override (so it's not on pin 0 of the ioapic),
- * and at the same time, the pin 0 interrupt is a PCI
- * type. The gsi > 15 test could cause these two pins
- * to be shared as IRQ0, and they are not shareable.
- * So test for this condition, and if necessary, avoid
- * the pin collision.
- */
- gsi = pci_irq++;
- /*
- * Don't assign IRQ used by ACPI SCI
- */
- if (gsi == acpi_gbl_FADT.sci_interrupt)
- gsi = pci_irq++;
- gsi_to_irq[irq] = gsi;
- } else {
- printk(KERN_ERR "GSI %u is too high\n", gsi);
- return gsi;
- }
- }
-#endif
- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- return gsi;
+ replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+
+ return 0;
}
-#endif /* CONFIG_X86_IO_APIC */
-#endif /* CONFIG_ACPI */
+late_initcall(update_mp_table);
--- head-2011-03-11.orig/arch/x86/kernel/pci-dma-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/pci-dma-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -5,13 +5,13 @@
#include <asm/proto.h>
#include <asm/dma.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
#include <asm/calgary.h>
+#include <asm/amd_iommu.h>
-int forbid_dac __read_mostly;
-EXPORT_SYMBOL(forbid_dac);
+static int forbid_dac __read_mostly;
-const struct dma_mapping_ops *dma_ops;
+struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
static int iommu_sac_force __read_mostly;
@@ -74,13 +74,17 @@ early_param("dma32_size", parse_dma32_si
void __init dma32_reserve_bootmem(void)
{
unsigned long size, align;
- if (end_pfn <= MAX_DMA32_PFN)
+ if (max_pfn <= MAX_DMA32_PFN)
return;
+ /*
+ * see aperture_64.c allocate_aperture() for the reason why
+ * 512M is used as the goal
+ */
align = 64ULL<<20;
size = round_up(dma32_bootmem_size, align);
dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
- __pa(MAX_DMA_ADDRESS));
+ 512ULL<<20);
if (dma32_bootmem_ptr)
dma32_bootmem_size = size;
else
@@ -88,17 +92,14 @@ void __init dma32_reserve_bootmem(void)
}
static void __init dma32_free_bootmem(void)
{
- int node;
- if (end_pfn <= MAX_DMA32_PFN)
+ if (max_pfn <= MAX_DMA32_PFN)
return;
if (!dma32_bootmem_ptr)
return;
- for_each_online_node(node)
- free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
- dma32_bootmem_size);
+ free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
dma32_bootmem_ptr = NULL;
dma32_bootmem_size = 0;
@@ -107,7 +108,7 @@ static void __init dma32_free_bootmem(vo
#define dma32_free_bootmem() ((void)0)
#endif
-static const struct dma_mapping_ops swiotlb_dma_ops = {
+static struct dma_mapping_ops swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
.map_single = swiotlb_map_single_phys,
.unmap_single = swiotlb_unmap_single,
@@ -130,25 +131,31 @@ void __init pci_iommu_alloc(void)
* The order of these functions is important for
* fall-back/fail-over reasons
*/
-#ifdef CONFIG_GART_IOMMU
gart_iommu_hole_init();
-#endif
-#ifdef CONFIG_CALGARY_IOMMU
detect_calgary();
-#endif
detect_intel_iommu();
-#ifdef CONFIG_SWIOTLB
+ amd_iommu_detect();
+
swiotlb_init();
if (swiotlb) {
printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
dma_ops = &swiotlb_dma_ops;
}
-#endif
}
+#ifndef CONFIG_XEN
+unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
+{
+ unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
+
+ return size >> PAGE_SHIFT;
+}
+EXPORT_SYMBOL(iommu_num_pages);
+#endif
+
/*
* See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
* documentation.
@@ -201,9 +208,7 @@ static __init int iommu_setup(char *p)
swiotlb = 1;
#endif
-#ifdef CONFIG_GART_IOMMU
gart_parse_options(p);
-#endif
#ifdef CONFIG_CALGARY_IOMMU
if (!strncmp(p, "calgary", 7))
@@ -245,136 +250,19 @@ int range_straddles_page_boundary(paddr_
!check_pages_physically_contiguous(pfn, offset, size));
}
-#ifdef CONFIG_X86_32
-int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
- dma_addr_t device_addr, size_t size, int flags)
-{
- void __iomem *mem_base = NULL;
- int pages = size >> PAGE_SHIFT;
- int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
-
- if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
- goto out;
- if (!size)
- goto out;
- if (dev->dma_mem)
- goto out;
-
- /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
-
- mem_base = ioremap(bus_addr, size);
- if (!mem_base)
- goto out;
-
- dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
- if (!dev->dma_mem)
- goto out;
- dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!dev->dma_mem->bitmap)
- goto free1_out;
-
- dev->dma_mem->virt_base = mem_base;
- dev->dma_mem->device_base = device_addr;
- dev->dma_mem->size = pages;
- dev->dma_mem->flags = flags;
-
- if (flags & DMA_MEMORY_MAP)
- return DMA_MEMORY_MAP;
-
- return DMA_MEMORY_IO;
-
- free1_out:
- kfree(dev->dma_mem);
- out:
- if (mem_base)
- iounmap(mem_base);
- return 0;
-}
-EXPORT_SYMBOL(dma_declare_coherent_memory);
-
-void dma_release_declared_memory(struct device *dev)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
-
- if (!mem)
- return;
- dev->dma_mem = NULL;
- iounmap(mem->virt_base);
- kfree(mem->bitmap);
- kfree(mem);
-}
-EXPORT_SYMBOL(dma_release_declared_memory);
-
-void *dma_mark_declared_memory_occupied(struct device *dev,
- dma_addr_t device_addr, size_t size)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
- int pos, err;
- int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
-
- pages >>= PAGE_SHIFT;
-
- if (!mem)
- return ERR_PTR(-EINVAL);
-
- pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
- err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
- if (err != 0)
- return ERR_PTR(err);
- return mem->virt_base + (pos << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
-
-static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
- dma_addr_t *dma_handle, void **ret)
-{
- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
- int order = get_order(size);
-
- if (mem) {
- int page = bitmap_find_free_region(mem->bitmap, mem->size,
- order);
- if (page >= 0) {
- *dma_handle = mem->device_base + (page << PAGE_SHIFT);
- *ret = mem->virt_base + (page << PAGE_SHIFT);
- memset(*ret, 0, size);
- }
- if (mem->flags & DMA_MEMORY_EXCLUSIVE)
- *ret = NULL;
- }
- return (mem != NULL);
-}
-
-static int dma_release_coherent(struct device *dev, int order, void *vaddr)
-{
- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-
- if (mem && vaddr >= mem->virt_base && vaddr <
- (mem->virt_base + (mem->size << PAGE_SHIFT))) {
- int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
-
- bitmap_release_region(mem->bitmap, page, order);
- return 1;
- }
- return 0;
-}
-#else
-#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
-#define dma_release_coherent(dev, order, vaddr) (0)
-#endif /* CONFIG_X86_32 */
-
int dma_supported(struct device *dev, u64 mask)
{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
#ifdef CONFIG_PCI
if (mask > 0xffffffff && forbid_dac > 0) {
- printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
- dev->bus_id);
+ dev_info(dev, "PCI: Disallowing DAC for device\n");
return 0;
}
#endif
- if (dma_ops->dma_supported)
- return dma_ops->dma_supported(dev, mask);
+ if (ops->dma_supported)
+ return ops->dma_supported(dev, mask);
/* Copied from i386. Doesn't make much sense, because it will
only work for pci_alloc_coherent.
@@ -395,8 +283,7 @@ int dma_supported(struct device *dev, u6
type. Normally this doesn't make any difference, but gives
more gentle handling of IOMMU overflow. */
if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
- printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
- dev->bus_id, mask);
+ dev_info(dev, "Force SAC with mask %Lx\n", mask);
return 0;
}
@@ -422,6 +309,9 @@ void *
dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp)
{
+#ifndef CONFIG_XEN
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+#endif
void *memory = NULL;
struct page *page;
unsigned long dma_mask = 0;
@@ -431,7 +321,7 @@ dma_alloc_coherent(struct device *dev, s
/* ignore region specifiers */
gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
- if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
+ if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
return memory;
if (!dev) {
@@ -491,8 +381,8 @@ dma_alloc_coherent(struct device *dev, s
/* Let low level make its own zone decisions */
gfp &= ~(GFP_DMA32|GFP_DMA);
- if (dma_ops->alloc_coherent)
- return dma_ops->alloc_coherent(dev, size,
+ if (ops->alloc_coherent)
+ return ops->alloc_coherent(dev, size,
dma_handle, gfp);
return NULL;
}
@@ -504,14 +394,14 @@ dma_alloc_coherent(struct device *dev, s
}
}
- if (dma_ops->alloc_coherent) {
+ if (ops->alloc_coherent) {
free_pages((unsigned long)memory, order);
gfp &= ~(GFP_DMA|GFP_DMA32);
- return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
+ return ops->alloc_coherent(dev, size, dma_handle, gfp);
}
- if (dma_ops->map_simple) {
- *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory),
+ if (ops->map_simple) {
+ *dma_handle = ops->map_simple(dev, virt_to_bus(memory),
size,
PCI_DMA_BIDIRECTIONAL);
if (*dma_handle != bad_dma_address)
@@ -542,13 +432,17 @@ EXPORT_SYMBOL(dma_alloc_coherent);
void dma_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t bus)
{
+#ifndef CONFIG_XEN
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+#endif
+
int order = get_order(size);
WARN_ON(irqs_disabled()); /* for portability */
- if (dma_release_coherent(dev, order, vaddr))
+ if (dma_release_from_coherent(dev, order, vaddr))
return;
#ifndef CONFIG_XEN
- if (dma_ops->unmap_single)
- dma_ops->unmap_single(dev, bus, size, 0);
+ if (ops->unmap_single)
+ ops->unmap_single(dev, bus, size, 0);
#endif
xen_destroy_contiguous_region((unsigned long)vaddr, order);
free_pages((unsigned long)vaddr, order);
@@ -557,15 +451,13 @@ EXPORT_SYMBOL(dma_free_coherent);
static int __init pci_iommu_init(void)
{
-#ifdef CONFIG_CALGARY_IOMMU
calgary_iommu_init();
-#endif
intel_iommu_init();
-#ifdef CONFIG_GART_IOMMU
+ amd_iommu_init();
+
gart_iommu_init();
-#endif
no_iommu_init();
return 0;
--- head-2011-03-11.orig/arch/x86/kernel/pci-nommu-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/pci-nommu-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -84,18 +84,12 @@ static int nommu_dma_supported(struct de
return 1;
}
-static int nommu_mapping_error(dma_addr_t dma_addr)
-{
- return (dma_addr == bad_dma_address);
-}
-
-static const struct dma_mapping_ops nommu_dma_ops = {
+static struct dma_mapping_ops nommu_dma_ops = {
.map_single = gnttab_map_single,
.unmap_single = gnttab_unmap_single,
.map_sg = gnttab_map_sg,
.unmap_sg = gnttab_unmap_sg,
.dma_supported = nommu_dma_supported,
- .mapping_error = nommu_mapping_error
};
void __init no_iommu_init(void)
--- head-2011-03-11.orig/arch/x86/kernel/probe_roms_32.c 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/probe_roms_32.c 2011-02-01 14:38:38.000000000 +0100
@@ -131,7 +131,7 @@ void __init probe_roms(void)
upper = system_rom_resource.start;
/* check for extension rom (ignore length byte!) */
- rom = isa_bus_to_virt(extension_rom_resource.start);
+ rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
if (romsignature(rom)) {
length = extension_rom_resource.end - extension_rom_resource.start + 1;
if (romchecksum(rom, length)) {
--- head-2011-03-11.orig/arch/x86/kernel/process-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/process-xen.c 2011-03-03 15:59:49.000000000 +0100
@@ -6,6 +6,13 @@
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
+#include <linux/clockchips.h>
+#include <asm/system.h>
+
+unsigned long idle_halt;
+EXPORT_SYMBOL(idle_halt);
+unsigned long idle_nomwait;
+EXPORT_SYMBOL(idle_nomwait);
struct kmem_cache *task_xstate_cachep;
@@ -45,6 +52,41 @@ void arch_task_cache_init(void)
SLAB_PANIC, NULL);
}
+/*
+ * Idle related variables and functions
+ */
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+/*
+ * Power management idle function, if any.
+ */
+void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
+
+/*
+ * We use this if we don't have any better
+ * idle routine..
+ */
+void xen_idle(void)
+{
+ current_thread_info()->status &= ~TS_POLLING;
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
+
+ if (!need_resched())
+ safe_halt(); /* enables interrupts racelessly */
+ else
+ local_irq_enable();
+ current_thread_info()->status |= TS_POLLING;
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
+
static void do_nothing(void *unused)
{
}
@@ -61,7 +103,7 @@ void cpu_idle_wait(void)
{
smp_mb();
/* kick all the CPUs so that they exit out of pm_idle */
- smp_call_function(do_nothing, NULL, 0, 1);
+ smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
@@ -125,60 +167,175 @@ static void poll_idle(void)
*
* idle=mwait overrides this decision and forces the usage of mwait.
*/
+static int __cpuinitdata force_mwait;
+
+#define MWAIT_INFO 0x05
+#define MWAIT_ECX_EXTENDED_INFO 0x01
+#define MWAIT_EDX_C1 0xf0
+
static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
{
+ u32 eax, ebx, ecx, edx;
+
if (force_mwait)
return 1;
- if (c->x86_vendor == X86_VENDOR_AMD) {
- switch(c->x86) {
- case 0x10:
- case 0x11:
- return 0;
+ if (c->cpuid_level < MWAIT_INFO)
+ return 0;
+
+ cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
+ /* Check whether EDX has extended info about MWAIT */
+ if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
+ return 1;
+
+ /*
+ * edx enumerates MONITOR/MWAIT extensions. Check whether
+ * C1 supports MWAIT
+ */
+ return (edx & MWAIT_EDX_C1);
+}
+
+/*
+ * Check for AMD CPUs, which have potentially C1E support
+ * Check for AMD CPUs, which potentially have C1E support
+static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor != X86_VENDOR_AMD)
+ return 0;
+
+ if (c->x86 < 0x0F)
+ return 0;
+
+ /* Family 0x0f models < rev F do not have C1E */
+ if (c->x86 == 0x0f && c->x86_model < 0x40)
+ return 0;
+
+ return 1;
+}
+
+static cpumask_t c1e_mask = CPU_MASK_NONE;
+static int c1e_detected;
+
+void c1e_remove_cpu(int cpu)
+{
+ cpu_clear(cpu, c1e_mask);
+}
+
+/*
+ * C1E aware idle routine. We check for C1E active in the interrupt
+ * pending message MSR. If we detect C1E, then we handle it the same
+ * way as C3 power states (local apic timer and TSC stop)
+ */
+static void c1e_idle(void)
+{
+ if (need_resched())
+ return;
+
+ if (!c1e_detected) {
+ u32 lo, hi;
+
+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+ if (lo & K8_INTP_C1E_ACTIVE_MASK) {
+ c1e_detected = 1;
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ mark_tsc_unstable("TSC halt in AMD C1E");
+ printk(KERN_INFO "System has AMD C1E enabled\n");
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
}
}
- return 1;
+
+ if (c1e_detected) {
+ int cpu = smp_processor_id();
+
+ if (!cpu_isset(cpu, c1e_mask)) {
+ cpu_set(cpu, c1e_mask);
+ /*
+ * Force broadcast so ACPI cannot interfere. Needs
+ * to run with interrupts enabled as it uses
+ * smp_call_function().
+ */
+ local_irq_enable();
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
+ &cpu);
+ printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
+ cpu);
+ local_irq_disable();
+ }
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+
+ default_idle();
+
+ /*
+ * The switch back from broadcast mode needs to be
+ * called with interrupts disabled.
+ */
+ local_irq_disable();
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+ local_irq_enable();
+ } else
+ default_idle();
}
#endif
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifndef CONFIG_XEN
- static int selected;
-
- if (selected)
- return;
#ifdef CONFIG_X86_SMP
if (pm_idle == poll_idle && smp_num_siblings > 1) {
printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
" performance may degrade.\n");
}
#endif
+ if (pm_idle)
+ return;
+
if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
/*
- * Skip, if setup has overridden idle.
* One CPU supports mwait => All CPUs supports mwait
*/
- if (!pm_idle) {
- printk(KERN_INFO "using mwait in idle threads.\n");
- pm_idle = mwait_idle;
- }
- }
- selected = 1;
+ printk(KERN_INFO "using mwait in idle threads.\n");
+ pm_idle = mwait_idle;
+ } else if (check_c1e_idle(c)) {
+ printk(KERN_INFO "using C1E aware idle routine\n");
+ pm_idle = c1e_idle;
+ } else
+ pm_idle = default_idle;
#endif
}
static int __init idle_setup(char *str)
{
+ if (!str)
+ return -EINVAL;
+
if (!strcmp(str, "poll")) {
printk("using polling idle threads.\n");
pm_idle = poll_idle;
- }
#ifndef CONFIG_XEN
- else if (!strcmp(str, "mwait"))
+ } else if (!strcmp(str, "mwait"))
force_mwait = 1;
+ else if (!strcmp(str, "halt")) {
+ /*
+ * When the boot option of idle=halt is added, halt is
+ * When the boot option idle=halt is given, halt is
+ * forced to be used for CPU idle. In that case the CPU C2/C3
+ * states won't be entered again.
+ * So that the CPU idle driver can still be loaded, don't touch
+ * boot_option_idle_override.
+ pm_idle = default_idle;
+ idle_halt = 1;
+ return 0;
+ } else if (!strcmp(str, "nomwait")) {
+ /*
+ * If the boot option "idle=nomwait" is given,
+ * mwait is disabled for the CPU C2/C3
+ * states. In that case the variable
+ * boot_option_idle_override is left untouched.
+ */
+ idle_nomwait = 1;
+ return 0;
#endif
- else
+ } else
return -1;
boot_option_idle_override = 1;
--- head-2011-03-11.orig/arch/x86/kernel/process_32-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/process_32-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -59,15 +59,11 @@
#include <asm/tlbflush.h>
#include <asm/cpu.h>
#include <asm/kdebug.h>
+#include <asm/idle.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
-static int hlt_counter;
-
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
@@ -82,46 +78,27 @@ unsigned long thread_saved_pc(struct tas
return ((unsigned long *)tsk->thread.sp)[3];
}
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
+#ifdef CONFIG_HOTPLUG_CPU
+#ifndef CONFIG_XEN
+#include <asm/nmi.h>
-void disable_hlt(void)
+static void cpu_exit_clear(void)
{
- hlt_counter++;
-}
+ int cpu = raw_smp_processor_id();
-EXPORT_SYMBOL(disable_hlt);
-
-void enable_hlt(void)
-{
- hlt_counter--;
-}
+ idle_task_exit();
-EXPORT_SYMBOL(enable_hlt);
+ cpu_uninit();
+ irq_ctx_exit(cpu);
-static void xen_idle(void)
-{
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
+ cpu_clear(cpu, cpu_callout_map);
+ cpu_clear(cpu, cpu_callin_map);
- if (!need_resched())
- safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
+ numa_remove_cpu(cpu);
+ c1e_remove_cpu(cpu);
}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(default_idle);
#endif
-#ifdef CONFIG_HOTPLUG_CPU
static inline void play_dead(void)
{
idle_task_exit();
@@ -152,13 +129,11 @@ void cpu_idle(void)
/* endless idle loop with no priority at all */
while (1) {
- tick_nohz_stop_sched_tick();
+ tick_nohz_stop_sched_tick(1);
while (!need_resched()) {
- void (*idle)(void);
check_pgt_cache();
rmb();
- idle = xen_idle; /* no alternatives */
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, 0);
@@ -168,7 +143,10 @@ void cpu_idle(void)
local_irq_disable();
__get_cpu_var(irq_stat).idle_timestamp = jiffies;
- idle();
+ /* Don't trace irqs off for idle */
+ stop_critical_timings();
+ xen_idle();
+ start_critical_timings();
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
--- head-2011-03-11.orig/arch/x86/kernel/process_64-xen.c 2011-02-02 08:32:46.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/process_64-xen.c 2011-02-02 08:34:01.000000000 +0100
@@ -64,15 +64,6 @@ asmlinkage extern void ret_from_fork(voi
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
-
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
@@ -102,25 +93,13 @@ void exit_idle(void)
__exit_idle();
}
-static void xen_idle(void)
-{
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
- if (!need_resched())
- safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
-}
-
#ifdef CONFIG_HOTPLUG_CPU
static inline void play_dead(void)
{
idle_task_exit();
+#ifndef CONFIG_XEN
+ c1e_remove_cpu(raw_smp_processor_id());
+#endif
local_irq_disable();
cpu_clear(smp_processor_id(), cpu_initialized);
preempt_enable_no_resched();
@@ -145,12 +124,11 @@ void cpu_idle(void)
current_thread_info()->status |= TS_POLLING;
/* endless idle loop with no priority at all */
while (1) {
- tick_nohz_stop_sched_tick();
+ tick_nohz_stop_sched_tick(1);
while (!need_resched()) {
- void (*idle)(void);
rmb();
- idle = xen_idle; /* no alternatives */
+
if (cpu_is_offline(smp_processor_id()))
play_dead();
/*
@@ -160,7 +138,10 @@ void cpu_idle(void)
*/
local_irq_disable();
enter_idle();
- idle();
+ /* Don't trace irqs off for idle */
+ stop_critical_timings();
+ xen_idle();
+ start_critical_timings();
/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
loops can be woken up without interrupt. */
@@ -270,7 +251,7 @@ void exit_thread(void)
}
}
-void load_gs_index(unsigned gs)
+void xen_load_gs_index(unsigned gs)
{
WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
}
@@ -371,10 +352,10 @@ int copy_thread(int nr, unsigned long cl
p->thread.fs = me->thread.fs;
p->thread.gs = me->thread.gs;
- asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
- asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
- asm("mov %%es,%0" : "=m" (p->thread.es));
- asm("mov %%ds,%0" : "=m" (p->thread.ds));
+ savesegment(gs, p->thread.gsindex);
+ savesegment(fs, p->thread.fsindex);
+ savesegment(es, p->thread.es);
+ savesegment(ds, p->thread.ds);
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
@@ -415,7 +396,9 @@ out:
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
+ loadsegment(fs, 0);
+ loadsegment(es, 0);
+ loadsegment(ds, 0);
load_gs_index(0);
regs->ip = new_ip;
regs->sp = new_sp;
@@ -554,8 +537,8 @@ static inline void __switch_to_xtra(stru
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
- struct thread_struct *prev = &prev_p->thread,
- *next = &next_p->thread;
+ struct thread_struct *prev = &prev_p->thread;
+ struct thread_struct *next = &next_p->thread;
int cpu = smp_processor_id();
#ifndef CONFIG_X86_NO_TSS
struct tss_struct *tss = &per_cpu(init_tss, cpu);
@@ -658,12 +641,25 @@ __switch_to(struct task_struct *prev_p,
*/
if (unlikely(next->es))
loadsegment(es, next->es);
-
+
if (unlikely(next->ds))
loadsegment(ds, next->ds);
+ /*
+ * Leave lazy mode, flushing any hypercalls made here.
+ * This must be done before restoring TLS segments so
+ * the GDT and LDT are properly updated, and must be
+ * done before math_state_restore, so the TS bit is up
+ * to date.
+ */
+ arch_leave_lazy_cpu_mode();
+
/*
* Switch FS and GS.
+ *
+ * Segment register != 0 always requires a reload. Also
+ * reload when it has changed. When prev process used 64bit
+ * base always reload to avoid an information leak.
*/
if (unlikely(next->fsindex))
loadsegment(fs, next->fsindex);
@@ -682,7 +678,8 @@ __switch_to(struct task_struct *prev_p,
*/
write_pda(pcurrent, next_p);
write_pda(kernelstack,
- (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
write_pda(stack_canary, next_p->stack_canary);
@@ -843,7 +840,7 @@ long do_arch_prctl(struct task_struct *t
set_32bit_tls(task, FS_TLS, addr);
if (doit) {
load_TLS(&task->thread, cpu);
- asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
+ loadsegment(fs, FS_TLS_SEL);
}
task->thread.fsindex = FS_TLS_SEL;
task->thread.fs = 0;
@@ -853,7 +850,7 @@ long do_arch_prctl(struct task_struct *t
if (doit) {
/* set the selector to 0 to not confuse
__switch_to */
- asm volatile("movl %0,%%fs" :: "r" (0));
+ loadsegment(fs, 0);
ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
addr);
}
@@ -877,7 +874,7 @@ long do_arch_prctl(struct task_struct *t
if (task->thread.gsindex == GS_TLS_SEL)
base = read_32bit_tls(task, GS_TLS);
else if (doit) {
- asm("movl %%gs,%0" : "=r" (gsindex));
+ savesegment(gs, gsindex);
if (gsindex)
rdmsrl(MSR_KERNEL_GS_BASE, base);
else
--- head-2011-03-11.orig/arch/x86/kernel/setup-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/setup-xen.c 2011-03-04 15:09:03.000000000 +0100
@@ -1,143 +1,1132 @@
-#include <linux/kernel.h>
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ * Memory region support
+ * David Parsons <orc@pell.chi.il.us>, July-August 1999
+ *
+ * Added E820 sanitization routine (removes overlapping memory regions);
+ * Brian Moyle <bmoyle@mvista.com>, February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ * Patrick Mochel <mochel@osdl.org>, March 2002
+ *
+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach <xela@slit.de>, December 2002.
+ *
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/screen_info.h>
+#include <linux/ioport.h>
+#include <linux/acpi.h>
+#include <linux/apm_bios.h>
+#include <linux/initrd.h>
+#include <linux/bootmem.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/mca.h>
+#include <linux/root_dev.h>
+#include <linux/highmem.h>
#include <linux/module.h>
+#include <linux/efi.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
+#include <linux/nodemask.h>
+#include <linux/kexec.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+#include <linux/pci.h>
+#include <asm/pci-direct.h>
+#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/delay.h>
+
+#include <linux/kallsyms.h>
+#include <linux/cpufreq.h>
+#include <linux/dma-mapping.h>
+#include <linux/ctype.h>
+#include <linux/uaccess.h>
+
#include <linux/percpu.h>
-#include <asm/smp.h>
-#include <asm/percpu.h>
+#include <linux/crash_dump.h>
+
+#include <video/edid.h>
+
+#include <asm/mtrr.h>
+#include <asm/apic.h>
+#include <asm/e820.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/efi.h>
#include <asm/sections.h>
+#include <asm/dmi.h>
+#include <asm/io_apic.h>
+#include <asm/ist.h>
+#include <asm/vmi.h>
+#include <setup_arch.h>
+#include <asm/bios_ebda.h>
+#include <asm/cacheflush.h>
#include <asm/processor.h>
-#include <asm/setup.h>
+#include <asm/bugs.h>
+
+#include <asm/system.h>
+#include <asm/vsyscall.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/dma.h>
+#include <asm/iommu.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+
+#include <mach_apic.h>
+#include <asm/paravirt.h>
+
+#include <asm/percpu.h>
#include <asm/topology.h>
-#include <asm/mpspec.h>
#include <asm/apicdef.h>
+#ifdef CONFIG_X86_64
+#include <asm/numa_64.h>
+#endif
+
+#ifdef CONFIG_XEN
+#include <asm/hypervisor.h>
+#include <xen/interface/kexec.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/nmi.h>
+#include <xen/interface/physdev.h>
+#include <xen/features.h>
+#include <xen/firmware.h>
+#include <xen/xencons.h>
+
+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+EXPORT_SYMBOL(HYPERVISOR_shared_info);
+
+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xen_panic_block = {
+ xen_panic_event, NULL, 0 /* try to go last */
+};
+
+unsigned long *phys_to_machine_mapping;
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+unsigned long *pfn_to_mfn_frame_list_list,
+#ifdef CONFIG_X86_64
+ *pfn_to_mfn_frame_list[512];
+#else
+ *pfn_to_mfn_frame_list[128];
+#endif
+
+/* Raw start-of-day parameters from the hypervisor. */
+start_info_t *xen_start_info;
+EXPORT_SYMBOL(xen_start_info);
+#endif
+
+#ifndef ARCH_SETUP
+#define ARCH_SETUP
+#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-unsigned int num_processors;
-unsigned disabled_cpus __cpuinitdata;
#ifndef CONFIG_XEN
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
-EXPORT_SYMBOL(boot_cpu_physical_apicid);
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif
+#endif
+
+/*
+ * Machine setup..
+ */
+static struct resource data_resource = {
+ .name = "Kernel data",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource code_resource = {
+ .name = "Kernel code",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource bss_resource = {
+ .name = "Kernel bss",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
-DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map;
+#ifdef CONFIG_X86_32
+#ifndef CONFIG_XEN
+/* This value is set up by the early boot code to point to the value
+ immediately after the boot time page tables. It contains a *physical*
+ address, and must not be in the .bss segment! */
+unsigned long init_pg_tables_start __initdata = ~0UL;
+unsigned long init_pg_tables_end __initdata = ~0UL;
#endif
+
+static struct resource video_ram_resource = {
+ .name = "Video RAM area",
+ .start = 0xa0000,
+ .end = 0xbffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+/* cpu data as detected by the assembly code in head.S */
+struct cpuinfo_x86 new_cpu_data __cpuinitdata = { .wp_works_ok = 1, .hard_math = 1 };
+/* common cpu data for all cpus */
+struct cpuinfo_x86 boot_cpu_data __read_mostly = { .wp_works_ok = 1, .hard_math = 1 };
+EXPORT_SYMBOL(boot_cpu_data);
+#ifndef CONFIG_XEN
+static void set_mca_bus(int x)
+{
+#ifdef CONFIG_MCA
+ MCA_bus = x;
+#endif
+}
+
+unsigned int def_to_bigsmp;
+
+/* for MCA, but anyone else can use it if they want */
+unsigned int machine_id;
+unsigned int machine_submodel_id;
+unsigned int BIOS_revision;
+
+struct apm_info apm_info;
+EXPORT_SYMBOL(apm_info);
+#endif
+
+#if defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+struct ist_info ist_info;
+EXPORT_SYMBOL(ist_info);
+#elif defined(CONFIG_X86_SPEEDSTEP_SMI)
+struct ist_info ist_info;
#endif
-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
+#else
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
+#endif
+
+
+#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+unsigned long mmu_cr4_features;
+#else
+unsigned long mmu_cr4_features = X86_CR4_PAE;
+#endif
+
+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
+int bootloader_type;
+
/*
- * Copy data used in early init routines from the initial arrays to the
- * per cpu data areas. These arrays then become expendable and the
- * *_early_ptr's are zeroed indicating that the static arrays are gone.
+ * Early DMI memory
+ */
+int dmi_alloc_index;
+char dmi_alloc_data[DMI_MAX_DATA];
+
+/*
+ * Setup options
+ */
+struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
+struct edid_info edid_info;
+EXPORT_SYMBOL_GPL(edid_info);
+
+extern int root_mountflags;
+
+unsigned long saved_video_mode;
+
+#define RAMDISK_IMAGE_START_MASK 0x07FF
+#define RAMDISK_PROMPT_FLAG 0x8000
+#define RAMDISK_LOAD_FLAG 0x4000
+
+static char __initdata command_line[COMMAND_LINE_SIZE];
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+#ifndef CONFIG_XEN
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ * from boot_params into a safe place.
+ *
*/
-static void __init setup_per_cpu_maps(void)
+static inline void copy_edd(void)
+{
+ memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
+ sizeof(edd.mbr_signature));
+ memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
+ edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
+ edd.edd_info_nr = boot_params.eddbuf_entries;
+}
+#endif
+#else
+static inline void copy_edd(void)
+{
+}
+#endif
+
+#ifdef CONFIG_BLK_DEV_INITRD
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+
+#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
+static void __init relocate_initrd(void)
+{
+
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ u64 ramdisk_here;
+ unsigned long slop, clen, mapaddr;
+ char *p, *q;
+
+ /* We need to move the initrd down into lowmem */
+ ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
+ PAGE_SIZE);
+
+ if (ramdisk_here == -1ULL)
+ panic("Cannot find place for new RAMDISK of size %lld\n",
+ ramdisk_size);
+
+ /* Note: this includes all the lowmem currently occupied by
+ the initrd, we rely on that fact to keep the data intact. */
+ reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
+ "NEW RAMDISK");
+ initrd_start = ramdisk_here + PAGE_OFFSET;
+ initrd_end = initrd_start + ramdisk_size;
+ printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
+ ramdisk_here, ramdisk_here + ramdisk_size);
+
+ q = (char *)initrd_start;
+
+ /* Copy any lowmem portion of the initrd */
+ if (ramdisk_image < end_of_lowmem) {
+ clen = end_of_lowmem - ramdisk_image;
+ p = (char *)__va(ramdisk_image);
+ memcpy(q, p, clen);
+ q += clen;
+ ramdisk_image += clen;
+ ramdisk_size -= clen;
+ }
+
+ /* Copy the highmem portion of the initrd */
+ while (ramdisk_size) {
+ slop = ramdisk_image & ~PAGE_MASK;
+ clen = ramdisk_size;
+ if (clen > MAX_MAP_CHUNK-slop)
+ clen = MAX_MAP_CHUNK-slop;
+ mapaddr = ramdisk_image & PAGE_MASK;
+ p = early_ioremap(mapaddr, clen+slop);
+ memcpy(q, p+slop, clen);
+ early_iounmap(p, clen+slop);
+ q += clen;
+ ramdisk_image += clen;
+ ramdisk_size -= clen;
+ }
+	/* high pages are not converted by early_res_to_bootmem */
+ ramdisk_image = boot_params.hdr.ramdisk_image;
+ ramdisk_size = boot_params.hdr.ramdisk_size;
+ printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
+ " %08llx - %08llx\n",
+ ramdisk_image, ramdisk_image + ramdisk_size - 1,
+ ramdisk_here, ramdisk_here + ramdisk_size - 1);
+}
+#endif
+
+static void __init reserve_initrd(void)
{
#ifndef CONFIG_XEN
- int cpu;
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_end = ramdisk_image + ramdisk_size;
+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
- for_each_possible_cpu(cpu) {
- per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
- per_cpu(x86_bios_cpu_apicid, cpu) =
- x86_bios_cpu_apicid_init[cpu];
-#ifdef CONFIG_NUMA
- per_cpu(x86_cpu_to_node_map, cpu) =
- x86_cpu_to_node_map_init[cpu];
+ if (!boot_params.hdr.type_of_loader ||
+ !ramdisk_image || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
+#else
+ unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
+ unsigned long ramdisk_size = xen_start_info->mod_len;
+ unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
+ unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+
+ if (!xen_start_info->mod_start || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
#endif
+
+ initrd_start = 0;
+
+ if (ramdisk_size >= (end_of_lowmem>>1)) {
+ free_early(ramdisk_image, ramdisk_end);
+ printk(KERN_ERR "initrd too large to handle, "
+ "disabling initrd\n");
+ return;
}
- /* indicate the early static arrays will soon be gone */
- x86_cpu_to_apicid_early_ptr = NULL;
- x86_bios_cpu_apicid_early_ptr = NULL;
-#ifdef CONFIG_NUMA
- x86_cpu_to_node_map_early_ptr = NULL;
+ printk(KERN_INFO "RAMDISK: %08lx - %08lx\n", ramdisk_image,
+ ramdisk_end);
+
+
+ if (ramdisk_end <= end_of_lowmem) {
+ /* All in lowmem, easy case */
+ /*
+ * don't need to reserve again, already reserved early
+ * in i386_start_kernel
+ */
+ initrd_start = ramdisk_image + PAGE_OFFSET;
+ initrd_end = initrd_start + ramdisk_size;
+#ifdef CONFIG_X86_64_XEN
+ initrd_below_start_ok = 1;
#endif
+ return;
+ }
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+ relocate_initrd();
+#else
+ printk(KERN_ERR "initrd extends beyond end of memory "
+ "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+ ramdisk_end, end_of_lowmem);
+ initrd_start = 0;
#endif
+ free_early(ramdisk_image, ramdisk_end);
+}
+#else
+static void __init reserve_initrd(void)
+{
}
+#endif /* CONFIG_BLK_DEV_INITRD */
-#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-cpumask_t *cpumask_of_cpu_map __read_mostly;
-EXPORT_SYMBOL(cpumask_of_cpu_map);
+static void __init parse_setup_data(void)
+{
+#ifndef CONFIG_XEN
+ struct setup_data *data;
+ u64 pa_data;
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, PAGE_SIZE);
+ switch (data->type) {
+ case SETUP_E820_EXT:
+ parse_e820_ext(data, pa_data);
+ break;
+ default:
+ break;
+ }
+ pa_data = data->next;
+ early_iounmap(data, PAGE_SIZE);
+ }
+#endif
+}
-/* requires nr_cpu_ids to be initialized */
-static void __init setup_cpumask_of_cpu(void)
+static void __init e820_reserve_setup_data(void)
{
- int i;
+#ifndef CONFIG_XEN
+ struct setup_data *data;
+ u64 pa_data;
+ int found = 0;
- /* alloc_bootmem zeroes memory */
- cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
- for (i = 0; i < nr_cpu_ids; i++)
- cpu_set(i, cpumask_of_cpu_map[i]);
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, sizeof(*data));
+ e820_update_range(pa_data, sizeof(*data)+data->len,
+ E820_RAM, E820_RESERVED_KERN);
+ found = 1;
+ pa_data = data->next;
+ early_iounmap(data, sizeof(*data));
+ }
+ if (!found)
+ return;
+
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ memcpy(&e820_saved, &e820, sizeof(struct e820map));
+ printk(KERN_INFO "extended physical RAM map:\n");
+ e820_print_map("reserve setup_data");
+#endif
}
-#else
-static inline void setup_cpumask_of_cpu(void) { }
+
+static void __init reserve_early_setup_data(void)
+{
+#ifndef CONFIG_XEN
+ struct setup_data *data;
+ u64 pa_data;
+ char buf[32];
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, sizeof(*data));
+ sprintf(buf, "setup data %x", data->type);
+ reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ pa_data = data->next;
+ early_iounmap(data, sizeof(*data));
+ }
#endif
+}
-#ifdef CONFIG_X86_32
/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
+ * --------- Crashkernel reservation ------------------------------
*/
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(__per_cpu_offset);
+
+#ifdef CONFIG_KEXEC
+
+#ifndef CONFIG_XEN
+/**
+ * Reserve @size bytes of crashkernel memory at any suitable offset.
+ *
+ * @size: Size of the crashkernel memory to reserve.
+ * Returns the base address on success, and -1ULL on failure.
+ */
+unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
+{
+ const unsigned long long alignment = 16<<20; /* 16M */
+ unsigned long long start = 0LL;
+
+ while (1) {
+ int ret;
+
+ start = find_e820_area(start, ULONG_MAX, size, alignment);
+ if (start == -1ULL)
+ return start;
+
+ /* try to reserve it */
+ ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
+ if (ret >= 0)
+ return start;
+
+ start += alignment;
+ }
+}
+
+static inline unsigned long long get_total_mem(void)
+{
+ unsigned long long total;
+
+ total = max_low_pfn - min_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ total += highend_pfn - highstart_pfn;
#endif
+ return total << PAGE_SHIFT;
+}
+
+static void __init reserve_crashkernel(void)
+{
+ unsigned long long total_mem;
+ unsigned long long crash_size, crash_base;
+ int ret;
+
+ total_mem = get_total_mem();
+
+ ret = parse_crashkernel(boot_command_line, total_mem,
+ &crash_size, &crash_base);
+ if (ret != 0 || crash_size <= 0)
+ return;
+
+ /* 0 means: find the address automatically */
+ if (crash_base <= 0) {
+ crash_base = find_and_reserve_crashkernel(crash_size);
+ if (crash_base == -1ULL) {
+ pr_info("crashkernel reservation failed. "
+ "No suitable area found.\n");
+ return;
+ }
+ } else {
+ ret = reserve_bootmem_generic(crash_base, crash_size,
+ BOOTMEM_EXCLUSIVE);
+ if (ret < 0) {
+ pr_info("crashkernel reservation failed - "
+ "memory is in use\n");
+ return;
+ }
+ }
+
+ printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+ "for crashkernel (System RAM: %ldMB)\n",
+ (unsigned long)(crash_size >> 20),
+ (unsigned long)(crash_base >> 20),
+ (unsigned long)(total_mem >> 20));
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+ insert_resource(&iomem_resource, &crashk_res);
+}
+#else
+#define reserve_crashkernel xen_machine_kexec_setup_resources
+#endif
+#else
+static void __init reserve_crashkernel(void)
+{
+}
+#endif
+
+static struct resource standard_io_resources[] = {
+ { .name = "dma1", .start = 0x00, .end = 0x1f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic1", .start = 0x20, .end = 0x21,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer0", .start = 0x40, .end = 0x43,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer1", .start = 0x50, .end = 0x53,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x60, .end = 0x60,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x64, .end = 0x64,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma page reg", .start = 0x80, .end = 0x8f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic2", .start = 0xa0, .end = 0xa1,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma2", .start = 0xc0, .end = 0xdf,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "fpu", .start = 0xf0, .end = 0xff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO }
+};
+
+static void __init reserve_standard_io_resources(void)
+{
+ int i;
+
+ /* Nothing to do if not running in dom0. */
+ if (!is_initial_xendomain())
+ return;
+
+ /* request I/O space for devices used on all i[345]86 PCs */
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+ request_resource(&ioport_resource, &standard_io_resources[i]);
+
+}
+
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel. This option will be passed
+ * by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+ char *end;
+ if (!arg)
+ return -EINVAL;
+ elfcorehdr_addr = memparse(arg, &end);
+ return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
+#endif
+
+static struct x86_quirks default_x86_quirks __initdata;
+
+struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
+
/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
+ * Determine if we were loaded by an EFI loader. If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+ * for initialization. Note, the efi init code path is determined by the
+ * global efi_enabled. This allows the same kernel image to be used on existing
+ * systems (with a traditional BIOS) as well as on EFI systems.
*/
-void __init setup_per_cpu_areas(void)
+/*
+ * setup_arch - architecture-specific boot-time initializations
+ *
+ * Note: On x86_64, fixmaps are ready for use even before this is called.
+ */
+
+void __init setup_arch(char **cmdline_p)
{
- int i, highest_cpu = 0;
- unsigned long size;
+#ifdef CONFIG_XEN
+ unsigned int i;
+ unsigned long p2m_pages;
+ struct physdev_set_iopl set_iopl;
-#ifdef CONFIG_HOTPLUG_CPU
- prefill_possible_map();
+#ifdef CONFIG_X86_32
+ /* Force a quick death if the kernel panics (not domain 0). */
+ extern int panic_timeout;
+ if (!panic_timeout && !is_initial_xendomain())
+ panic_timeout = 1;
#endif
- /* Copy section for each CPU (we discard the original) */
- size = PERCPU_ENOUGH_ROOM;
- printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
- size);
-
- for_each_possible_cpu(i) {
- char *ptr;
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- ptr = alloc_bootmem_pages(size);
-#else
- int node = early_cpu_to_node(i);
- if (!node_online(node) || !NODE_DATA(node)) {
- ptr = alloc_bootmem_pages(size);
- printk(KERN_INFO
- "cpu %d has no node or node-local memory\n", i);
- }
- else
- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+ /* Register a call for panic conditions. */
+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+
+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_writable_pagetables));
+#ifdef CONFIG_X86_32
+ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_4gb_segments));
+#endif
+ set_iopl.iopl = 1;
+ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+#endif /* CONFIG_XEN */
+
+#ifdef CONFIG_X86_32
+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+ visws_early_detect();
+ pre_setup_arch_hook();
+#else
+ printk(KERN_INFO "Command line: %s\n", boot_command_line);
+#endif
+
+ early_cpu_init();
+ early_ioremap_init();
+
+#ifndef CONFIG_XEN
+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+ screen_info = boot_params.screen_info;
+ edid_info = boot_params.edid_info;
+#ifdef CONFIG_X86_32
+ apm_info.bios = boot_params.apm_bios_info;
+ ist_info = boot_params.ist_info;
+ if (boot_params.sys_desc_table.length != 0) {
+ set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
+ machine_id = boot_params.sys_desc_table.table[0];
+ machine_submodel_id = boot_params.sys_desc_table.table[1];
+ BIOS_revision = boot_params.sys_desc_table.table[2];
+ }
+#endif
+ saved_video_mode = boot_params.hdr.vid_mode;
+ bootloader_type = boot_params.hdr.type_of_loader;
+
+#ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+ rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
+ rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
+#endif
+#ifdef CONFIG_EFI
+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+#ifdef CONFIG_X86_32
+ "EL32",
+#else
+ "EL64",
#endif
- if (!ptr)
- panic("Cannot allocate cpu data for CPU %d\n", i);
+ 4)) {
+ efi_enabled = 1;
+ efi_reserve_early();
+ }
+#endif
+#else /* CONFIG_XEN */
+#ifdef CONFIG_X86_32
+ /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
+ properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
+ */
+ ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
+#else
+ ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
+#endif
+ if (is_initial_xendomain()) {
+ const struct dom0_vga_console_info *info =
+ (void *)((char *)xen_start_info +
+ xen_start_info->console.dom0.info_off);
+
+ dom0_init_screen_info(info,
+ xen_start_info->console.dom0.info_size);
+ xen_start_info->console.domU.mfn = 0;
+ xen_start_info->console.domU.evtchn = 0;
+ } else
+ screen_info.orig_video_isVGA = 0;
+ copy_edid();
+#endif /* CONFIG_XEN */
+
+ ARCH_SETUP
+
+ setup_memory_map();
+ parse_setup_data();
+ /* update the e820_saved too */
+ e820_reserve_setup_data();
+
+ copy_edd();
+
+#ifndef CONFIG_XEN
+ if (!boot_params.hdr.root_flags)
+ root_mountflags &= ~MS_RDONLY;
+#endif
+ init_mm.start_code = (unsigned long) _text;
+ init_mm.end_code = (unsigned long) _etext;
+ init_mm.end_data = (unsigned long) _edata;
+#ifdef CONFIG_X86_32
+#ifndef CONFIG_XEN
+ init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
+#else
+ init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
+ xen_start_info->nr_pt_frames) << PAGE_SHIFT;
+#endif
+#else
+ init_mm.brk = (unsigned long) &_end;
+#endif
+
+ code_resource.start = virt_to_phys(_text);
+ code_resource.end = virt_to_phys(_etext)-1;
+ data_resource.start = virt_to_phys(_etext);
+ data_resource.end = virt_to_phys(_edata)-1;
+ bss_resource.start = virt_to_phys(&__bss_start);
+ bss_resource.end = virt_to_phys(&__bss_stop)-1;
+
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ parse_early_param();
+
#ifdef CONFIG_X86_64
- cpu_pda(i)->data_offset = ptr - __per_cpu_start;
+ check_efer();
+#endif
+
+#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
+ /*
+ * Must be before kernel pagetables are setup
+ * or fixmap area is touched.
+ */
+ vmi_init();
+#endif
+
+ /* after early param, so could get panic from serial */
+ reserve_early_setup_data();
+
+ if (acpi_mps_check()) {
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
+ disable_apic = 1;
+#endif
+ setup_clear_cpu_cap(X86_FEATURE_APIC);
+ }
+
+#ifdef CONFIG_PCI
+ if (pci_early_dump_regs)
+ early_dump_pci_devices();
+#endif
+
+ finish_e820_parsing();
+
+#ifdef CONFIG_X86_32
+ if (is_initial_xendomain())
+ probe_roms();
+#endif
+
+#ifndef CONFIG_XEN
+ /* after parse_early_param, so could debug it */
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
+
+ if (efi_enabled)
+ efi_init();
+
+#ifdef CONFIG_X86_32
+ if (ppro_with_ram_bug()) {
+ e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
+ E820_RESERVED);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ printk(KERN_INFO "fixed physical RAM map:\n");
+ e820_print_map("bad_ppro");
+ }
#else
- __per_cpu_offset[i] = ptr - __per_cpu_start;
+ early_gart_iommu_check();
#endif
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+#endif /* CONFIG_XEN */
- highest_cpu = i;
+ /*
+ * partially used pages are not usable - thus
+ * we are rounding upwards:
+ */
+ max_pfn = e820_end_of_ram_pfn();
+
+ /* preallocate 4k for mptable mpc */
+ early_reserve_e820_mpc_new();
+ /* update e820 for memory not covered by WB MTRRs */
+ mtrr_bp_init();
+#ifndef CONFIG_XEN
+ if (mtrr_trim_uncached_memory(max_pfn))
+ max_pfn = e820_end_of_ram_pfn();
+#endif
+
+#ifdef CONFIG_X86_32
+ /* max_low_pfn get updated here */
+ find_low_pfn_range();
+#else
+ num_physpages = max_pfn;
+ max_mapnr = max_pfn;
+
+
+ /* How many end-of-memory variables you have, grandma! */
+ /* need this before calling reserve_initrd */
+ if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
+ max_low_pfn = e820_end_of_low_ram_pfn();
+ else
+ max_low_pfn = max_pfn;
+
+ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+#endif
+
+ /* max_pfn_mapped is updated here */
+ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
+ max_pfn_mapped = max_low_pfn_mapped;
+
+#ifdef CONFIG_X86_64
+ if (max_pfn > max_low_pfn) {
+ max_pfn_mapped = init_memory_mapping(1UL<<32,
+ max_pfn<<PAGE_SHIFT);
+		/* can we preserve max_low_pfn? */
+ max_low_pfn = max_pfn;
}
+#endif
- nr_cpu_ids = highest_cpu + 1;
- printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
+ /*
+ * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
+ */
- /* Setup percpu data maps */
- setup_per_cpu_maps();
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ if (init_ohci1394_dma_early)
+ init_ohci1394_dma_on_all_controllers();
+#endif
- /* Setup cpumask_of_cpu map */
- setup_cpumask_of_cpu();
-}
+ reserve_initrd();
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+ vsmp_init();
+#endif
+
+ if (is_initial_xendomain())
+ dmi_scan_machine();
+
+ io_delay_init();
+
+#ifdef CONFIG_ACPI
+ if (!is_initial_xendomain()) {
+ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+ disable_acpi();
+ }
+#endif
+
+ /*
+ * Parse the ACPI tables for possible boot-time SMP configuration.
+ */
+ acpi_boot_table_init();
+
+#ifdef CONFIG_ACPI_NUMA
+ /*
+ * Parse SRAT to discover nodes.
+ */
+ acpi_numa_init();
+#endif
+
+ initmem_init(0, max_pfn);
+
+#ifdef CONFIG_ACPI_SLEEP
+ /*
+ * Reserve low memory region for sleep support.
+ */
+ acpi_reserve_bootmem();
+#endif
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
+ /*
+ * Find and reserve possible boot-time SMP configuration:
+ */
+ find_smp_config();
+#endif
+ reserve_crashkernel();
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+ /*
+ * dma32_reserve_bootmem() allocates bootmem which may conflict
+ * with the crashkernel command line, so do that after
+ * reserve_crashkernel()
+ */
+ dma32_reserve_bootmem();
+#endif
+
+ reserve_ibft_region();
+
+#ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+#endif
+
+ xen_pagetable_setup_start(swapper_pg_dir);
+ paging_init();
+ xen_pagetable_setup_done(swapper_pg_dir);
+ paravirt_post_allocator_init();
+
+#ifdef CONFIG_X86_64
+ map_vsyscall();
+#endif
+
+#ifdef CONFIG_XEN
+ p2m_pages = max_pfn;
+ if (xen_start_info->nr_pages > max_pfn) {
+ /*
+ * the max_pfn was shrunk (probably by mem= or highmem=
+ * kernel parameter); shrink reservation with the HV
+ */
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ unsigned int difference;
+ int ret;
+
+ difference = xen_start_info->nr_pages - max_pfn;
+ set_xen_guest_handle(reservation.extent_start,
+ ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
+ reservation.nr_extents = difference;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ BUG_ON(ret != difference);
+ }
+ else if (max_pfn > xen_start_info->nr_pages)
+ p2m_pages = xen_start_info->nr_pages;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ unsigned long i, j;
+ unsigned int k, fpp;
+
+ /* Make sure we have a large enough P->M table. */
+ phys_to_machine_mapping = alloc_bootmem_pages(
+ max_pfn * sizeof(unsigned long));
+ memset(phys_to_machine_mapping, ~0,
+ max_pfn * sizeof(unsigned long));
+ memcpy(phys_to_machine_mapping,
+ (unsigned long *)xen_start_info->mfn_list,
+ p2m_pages * sizeof(unsigned long));
+ free_bootmem(
+ __pa(xen_start_info->mfn_list),
+ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+ sizeof(unsigned long))));
+
+ /*
+ * Initialise the list of the frames that specify the list of
+ * frames that make up the p2m table. Used by save/restore.
+ */
+ pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
+
+ fpp = PAGE_SIZE/sizeof(unsigned long);
+ for (i = j = 0, k = -1; i < max_pfn; i += fpp, j++) {
+ if (j == fpp)
+ j = 0;
+ if (j == 0) {
+ k++;
+ BUG_ON(k>=ARRAY_SIZE(pfn_to_mfn_frame_list));
+ pfn_to_mfn_frame_list[k] =
+ alloc_bootmem_pages(PAGE_SIZE);
+ pfn_to_mfn_frame_list_list[k] =
+ virt_to_mfn(pfn_to_mfn_frame_list[k]);
+ }
+ pfn_to_mfn_frame_list[k][j] =
+ virt_to_mfn(&phys_to_machine_mapping[i]);
+ }
+ HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(pfn_to_mfn_frame_list_list);
+ }
+
+ /* Mark all ISA DMA channels in-use - using them wouldn't work. */
+ for (i = 0; i < MAX_DMA_CHANNELS; ++i)
+ if (i != 4 && request_dma(i, "xen") != 0)
+ BUG();
+#endif /* CONFIG_XEN */
+
+#ifdef CONFIG_X86_GENERICARCH
+ generic_apic_probe();
#endif
+
+#ifndef CONFIG_XEN
+ early_quirks();
+#endif
+
+ /*
+ * Read APIC and some other early information from ACPI tables.
+ */
+ acpi_boot_init();
+
+#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
+ /*
+ * get boot-time SMP configuration:
+ */
+ if (smp_found_config)
+ get_smp_config();
+#endif
+
+ prefill_possible_map();
+#ifdef CONFIG_X86_64
+ init_cpu_to_node();
+#endif
+
+#ifndef CONFIG_XEN
+ init_apic_mappings();
+ ioapic_init_mappings();
+
+ kvm_guest_init();
+
+ e820_reserve_resources();
+ e820_mark_nosave_regions(max_low_pfn);
+#else
+ if (is_initial_xendomain())
+ e820_reserve_resources();
+#endif
+
+#ifdef CONFIG_X86_32
+ if (is_initial_xendomain())
+ request_resource(&iomem_resource, &video_ram_resource);
+#endif
+ reserve_standard_io_resources();
+
+#ifndef CONFIG_XEN
+ e820_setup_gap();
+
+#ifdef CONFIG_VT
+#if defined(CONFIG_VGA_CONSOLE)
+ if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+ conswitchp = &dummy_con;
+#endif
+#endif
+#else /* CONFIG_XEN */
+ if (is_initial_xendomain())
+ e820_setup_gap();
+
+#ifdef CONFIG_VT
+#ifdef CONFIG_DUMMY_CONSOLE
+ conswitchp = &dummy_con;
+#endif
+#ifdef CONFIG_VGA_CONSOLE
+ if (is_initial_xendomain())
+ conswitchp = &vga_con;
+#endif
+#endif
+#endif /* CONFIG_XEN */
+}
+
+#ifdef CONFIG_XEN
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ HYPERVISOR_shutdown(SHUTDOWN_crash);
+ /* we're never actually going to get here... */
+ return NOTIFY_DONE;
+}
+#endif /* CONFIG_XEN */
--- head-2011-03-11.orig/arch/x86/kernel/setup64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,370 +0,0 @@
-/*
- * X86-64 specific CPU setup.
- * Copyright (C) 1995 Linus Torvalds
- * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
- * See setup.c for older changelog.
- *
- * Jun Nakajima <jun.nakajima@intel.com>
- * Modified for Xen
- *
- */
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/bootmem.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/kgdb.h>
-#include <asm/pda.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/mmu_context.h>
-#include <asm/smp.h>
-#include <asm/i387.h>
-#include <asm/percpu.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/genapic.h>
-#ifdef CONFIG_XEN
-#include <asm/hypervisor.h>
-#endif
-
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
-struct boot_params boot_params;
-#endif
-
-cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
-
-struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
-
-#ifndef CONFIG_X86_NO_IDT
-struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-#endif
-
-char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
-
-unsigned long __supported_pte_mask __read_mostly = ~0UL;
-EXPORT_SYMBOL(__supported_pte_mask);
-
-static int do_not_nx __cpuinitdata = 0;
-
-/* noexec=on|off
-Control non executable mappings for 64bit processes.
-
-on Enable(default)
-off Disable
-*/
-static int __init nonx_setup(char *str)
-{
- if (!str)
- return -EINVAL;
- if (!strncmp(str, "on", 2)) {
- __supported_pte_mask |= _PAGE_NX;
- do_not_nx = 0;
- } else if (!strncmp(str, "off", 3)) {
- do_not_nx = 1;
- __supported_pte_mask &= ~_PAGE_NX;
- }
- return 0;
-}
-early_param("noexec", nonx_setup);
-
-int force_personality32 = 0;
-
-/* noexec32=on|off
-Control non executable heap for 32bit processes.
-To control the stack too use noexec=off
-
-on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
-off PROT_READ implies PROT_EXEC
-*/
-static int __init nonx32_setup(char *str)
-{
- if (!strcmp(str, "on"))
- force_personality32 &= ~READ_IMPLIES_EXEC;
- else if (!strcmp(str, "off"))
- force_personality32 |= READ_IMPLIES_EXEC;
- return 1;
-}
-__setup("noexec32=", nonx32_setup);
-
-#ifdef CONFIG_XEN
-static void __init_refok switch_pt(int cpu)
-{
- if (cpu == 0)
- xen_init_pt();
- xen_pt_switch(__pa_symbol(init_level4_pgt));
- xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt)));
-}
-#define switch_pt() switch_pt(cpu)
-
-static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
-{
- unsigned long frames[16];
- unsigned long va;
- int f;
-
- for (va = gdt_descr->address, f = 0;
- va < gdt_descr->address + gdt_descr->size;
- va += PAGE_SIZE, f++) {
- frames[f] = virt_to_mfn(va);
- make_page_readonly(
- (void *)va, XENFEAT_writable_descriptor_tables);
- }
- if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) /
- sizeof (struct desc_struct)))
- BUG();
-}
-#else
-static void switch_pt(void)
-{
- asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
-}
-
-static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
-{
- load_gdt(gdt_descr);
- load_idt(idt_descr);
-}
-#endif
-
-void pda_init(int cpu)
-{
- struct x8664_pda *pda = cpu_pda(cpu);
-
- /* Setup up data that may be needed in __get_free_pages early */
- asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
-#ifndef CONFIG_XEN
- /* Memory clobbers used to order PDA accessed */
- mb();
- wrmsrl(MSR_GS_BASE, pda);
- mb();
-#else
- if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
- (unsigned long)pda))
- BUG();
-#endif
- pda->cpunumber = cpu;
- pda->irqcount = -1;
- pda->kernelstack =
- (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
- pda->active_mm = &init_mm;
- pda->mmu_state = 0;
-
- if (cpu == 0) {
- /* others are initialized in smpboot.c */
- pda->pcurrent = &init_task;
- pda->irqstackptr = boot_cpu_stack;
- } else {
- pda->irqstackptr = (char *)
- __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
- if (!pda->irqstackptr)
- panic("cannot allocate irqstack for cpu %d", cpu);
- }
-
- switch_pt();
-
- pda->irqstackptr += IRQSTACKSIZE-64;
-}
-
-#ifndef CONFIG_X86_NO_TSS
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
-__attribute__((section(".bss.page_aligned")));
-#endif
-
-extern asmlinkage void ignore_sysret(void);
-
-/* May not be marked __init: used by software suspend */
-void syscall_init(void)
-{
-#ifndef CONFIG_XEN
- /*
- * LSTAR and STAR live in a bit strange symbiosis.
- * They both write to the same internal register. STAR allows to set CS/DS
- * but only a 32bit target. LSTAR sets the 64bit rip.
- */
- wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
- wrmsrl(MSR_LSTAR, system_call);
- wrmsrl(MSR_CSTAR, ignore_sysret);
-
- /* Flags to clear on syscall */
- wrmsrl(MSR_SYSCALL_MASK,
- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
-#endif
-#ifdef CONFIG_IA32_EMULATION
- syscall32_cpu_init ();
-#else
- {
- static const struct callback_register cstar = {
- .type = CALLBACKTYPE_syscall32,
- .address = (unsigned long)ignore_sysret
- };
- if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
- printk(KERN_WARNING "Unable to register CSTAR callback\n");
- }
-#endif
-}
-
-void __cpuinit check_efer(void)
-{
- unsigned long efer;
-
- rdmsrl(MSR_EFER, efer);
- if (!(efer & EFER_NX) || do_not_nx) {
- __supported_pte_mask &= ~_PAGE_NX;
- }
-}
-
-unsigned long kernel_eflags;
-
-#ifndef CONFIG_X86_NO_TSS
-/*
- * Copies of the original ist values from the tss are only accessed during
- * debugging, no special alignment required.
- */
-DEFINE_PER_CPU(struct orig_ist, orig_ist);
-#endif
-
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- * A lot of state is already set up in PDA init.
- */
-void __cpuinit cpu_init (void)
-{
- int cpu = stack_smp_processor_id();
-#ifndef CONFIG_X86_NO_TSS
- struct tss_struct *t = &per_cpu(init_tss, cpu);
- struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
- unsigned long v;
- char *estacks = NULL;
- unsigned i;
-#endif
- struct task_struct *me;
-
- /* CPU 0 is initialised in head64.c */
- if (cpu != 0) {
- pda_init(cpu);
- }
-#ifndef CONFIG_X86_NO_TSS
- else
- estacks = boot_exception_stacks;
-#endif
-
- me = current;
-
- if (cpu_test_and_set(cpu, cpu_initialized))
- panic("CPU#%d already initialized!\n", cpu);
-
- printk("Initializing CPU#%d\n", cpu);
-
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-
- /*
- * Initialize the per-CPU GDT with the boot GDT,
- * and set up the GDT descriptor:
- */
-#ifndef CONFIG_XEN
- if (cpu)
- memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
-#endif
-
- cpu_gdt_descr[cpu].size = GDT_SIZE;
- cpu_gdt_init(&cpu_gdt_descr[cpu]);
-
- memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
- syscall_init();
-
- wrmsrl(MSR_FS_BASE, 0);
- wrmsrl(MSR_KERNEL_GS_BASE, 0);
- barrier();
-
- check_efer();
-
-#ifndef CONFIG_X86_NO_TSS
- /*
- * set up and load the per-CPU TSS
- */
- for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- static const unsigned int order[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
- };
- if (cpu) {
- estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
- if (!estacks)
- panic("Cannot allocate exception stack %ld %d\n",
- v, cpu);
- }
- estacks += PAGE_SIZE << order[v];
- orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
- }
-
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
- /*
- * <= is required because the CPU will access up to
- * 8 bits beyond the end of the IO permission bitmap.
- */
- for (i = 0; i <= IO_BITMAP_LONGS; i++)
- t->io_bitmap[i] = ~0UL;
-#endif
-
- atomic_inc(&init_mm.mm_count);
- me->active_mm = &init_mm;
- if (me->mm)
- BUG();
- enter_lazy_tlb(&init_mm, me);
-
-#ifndef CONFIG_X86_NO_TSS
- set_tss_desc(cpu, t);
-#endif
-#ifndef CONFIG_XEN
- load_TR_desc();
-#endif
- load_LDT(&init_mm.context);
-
-#ifdef CONFIG_KGDB
- /*
- * If the kgdb is connected no debug regs should be altered. This
- * is only applicable when KGDB and a KGDB I/O module are built
- * into the kernel and you are using early debugging with
- * kgdbwait. KGDB will control the kernel HW breakpoint registers.
- */
- if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
- arch_kgdb_ops.correct_hw_break();
- else {
-#endif
- /*
- * Clear all 6 debug registers:
- */
-
- set_debugreg(0UL, 0);
- set_debugreg(0UL, 1);
- set_debugreg(0UL, 2);
- set_debugreg(0UL, 3);
- set_debugreg(0UL, 6);
- set_debugreg(0UL, 7);
-#ifdef CONFIG_KGDB
- /* If the kgdb is connected no debug regs should be altered. */
- }
-#endif
-
- fpu_init();
-
- asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
- if (raw_irqs_disabled())
- kernel_eflags &= ~X86_EFLAGS_IF;
-
- if (is_uv_system())
- uv_cpu_init();
-}
--- head-2011-03-11.orig/arch/x86/kernel/setup_32-xen.c 2011-03-04 15:07:31.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,1153 +0,0 @@
-/*
- * Copyright (C) 1995 Linus Torvalds
- *
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- *
- * Memory region support
- * David Parsons <orc@pell.chi.il.us>, July-August 1999
- *
- * Added E820 sanitization routine (removes overlapping memory regions);
- * Brian Moyle <bmoyle@mvista.com>, February 2001
- *
- * Moved CPU detection code to cpu/${cpu}.c
- * Patrick Mochel <mochel@osdl.org>, March 2002
- *
- * Provisions for empty E820 memory regions (reported by certain BIOSes).
- * Alex Achenbach <xela@slit.de>, December 2002.
- *
- */
-
-/*
- * This file handles the architecture-dependent parts of initialization
- */
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
-#include <linux/acpi.h>
-#include <linux/apm_bios.h>
-#include <linux/initrd.h>
-#include <linux/bootmem.h>
-#include <linux/seq_file.h>
-#include <linux/console.h>
-#include <linux/mca.h>
-#include <linux/root_dev.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/efi.h>
-#include <linux/init.h>
-#include <linux/edd.h>
-#include <linux/iscsi_ibft.h>
-#include <linux/nodemask.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/kexec.h>
-#include <linux/crash_dump.h>
-#include <linux/dmi.h>
-#include <linux/pfn.h>
-#include <linux/pci.h>
-#include <linux/init_ohci1394_dma.h>
-#include <linux/kvm_para.h>
-
-#include <video/edid.h>
-
-#include <asm/mtrr.h>
-#include <asm/apic.h>
-#include <asm/e820.h>
-#include <asm/mpspec.h>
-#include <asm/mmzone.h>
-#include <asm/setup.h>
-#include <asm/arch_hooks.h>
-#include <asm/sections.h>
-#include <asm/io_apic.h>
-#include <asm/ist.h>
-#include <asm/io.h>
-#include <asm/hypervisor.h>
-#include <xen/interface/physdev.h>
-#include <xen/interface/memory.h>
-#include <xen/features.h>
-#include <xen/firmware.h>
-#include <xen/xencons.h>
-#include <setup_arch.h>
-#include <asm/bios_ebda.h>
-#include <asm/cacheflush.h>
-#include <asm/processor.h>
-
-#ifdef CONFIG_XEN
-#include <xen/interface/kexec.h>
-#endif
-
-static int xen_panic_event(struct notifier_block *, unsigned long, void *);
-static struct notifier_block xen_panic_block = {
- xen_panic_event, NULL, 0 /* try to go last */
-};
-
-/*
- * Machine setup..
- */
-static struct resource data_resource = {
- .name = "Kernel data",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource code_resource = {
- .name = "Kernel code",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource bss_resource = {
- .name = "Kernel bss",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource video_ram_resource = {
- .name = "Video RAM area",
- .start = 0xa0000,
- .end = 0xbffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource standard_io_resources[] = { {
- .name = "dma1",
- .start = 0x0000,
- .end = 0x001f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "pic1",
- .start = 0x0020,
- .end = 0x0021,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "timer0",
- .start = 0x0040,
- .end = 0x0043,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "timer1",
- .start = 0x0050,
- .end = 0x0053,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "keyboard",
- .start = 0x0060,
- .end = 0x0060,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "keyboard",
- .start = 0x0064,
- .end = 0x0064,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "dma page reg",
- .start = 0x0080,
- .end = 0x008f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "pic2",
- .start = 0x00a0,
- .end = 0x00a1,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "dma2",
- .start = 0x00c0,
- .end = 0x00df,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "fpu",
- .start = 0x00f0,
- .end = 0x00ff,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-} };
-
-/* cpu data as detected by the assembly code in head.S */
-struct cpuinfo_x86 new_cpu_data __cpuinitdata = { .wp_works_ok = 1 };
-/* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data __read_mostly = { .wp_works_ok = 1 };
-EXPORT_SYMBOL(boot_cpu_data);
-
-unsigned int def_to_bigsmp;
-
-#ifndef CONFIG_X86_PAE
-unsigned long mmu_cr4_features;
-#else
-unsigned long mmu_cr4_features = X86_CR4_PAE;
-#endif
-
-/* for MCA, but anyone else can use it if they want */
-unsigned int machine_id;
-unsigned int machine_submodel_id;
-unsigned int BIOS_revision;
-
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
-
-/* user-defined highmem size */
-static unsigned int highmem_pages = -1;
-
-/*
- * Setup options
- */
-struct screen_info screen_info;
-EXPORT_SYMBOL(screen_info);
-struct apm_info apm_info;
-EXPORT_SYMBOL(apm_info);
-struct edid_info edid_info;
-EXPORT_SYMBOL_GPL(edid_info);
-#ifndef CONFIG_XEN
-#define copy_edid() (edid_info = boot_params.edid_info)
-#endif
-struct ist_info ist_info;
-#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
- defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
-EXPORT_SYMBOL(ist_info);
-#endif
-
-extern void early_cpu_init(void);
-extern int root_mountflags;
-
-unsigned long saved_video_mode;
-
-#define RAMDISK_IMAGE_START_MASK 0x07FF
-#define RAMDISK_PROMPT_FLAG 0x8000
-#define RAMDISK_LOAD_FLAG 0x4000
-
-static char __initdata command_line[COMMAND_LINE_SIZE];
-
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
-struct boot_params boot_params;
-#endif
-
-/*
- * Point at the empty zero page to start with. We map the real shared_info
- * page as soon as fixmap is up and running.
- */
-shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
-EXPORT_SYMBOL(HYPERVISOR_shared_info);
-
-unsigned long *phys_to_machine_mapping;
-unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
-EXPORT_SYMBOL(phys_to_machine_mapping);
-
-/* Raw start-of-day parameters from the hypervisor. */
-start_info_t *xen_start_info;
-EXPORT_SYMBOL(xen_start_info);
-
-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
-struct edd edd;
-#ifdef CONFIG_EDD_MODULE
-EXPORT_SYMBOL(edd);
-#endif
-#ifndef CONFIG_XEN
-/**
- * copy_edd() - Copy the BIOS EDD information
- * from boot_params into a safe place.
- *
- */
-static inline void copy_edd(void)
-{
- memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
- sizeof(edd.mbr_signature));
- memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
- edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
- edd.edd_info_nr = boot_params.eddbuf_entries;
-}
-#endif
-#else
-static inline void copy_edd(void)
-{
-}
-#endif
-
-int __initdata user_defined_memmap;
-
-/*
- * "mem=nopentium" disables the 4MB page tables.
- * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
- * to <mem>, overriding the bios size.
- * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
- * <start> to <start>+<mem>, overriding the bios size.
- *
- * HPA tells me bootloaders need to parse mem=, so no new
- * option should be mem= [also see Documentation/i386/boot.txt]
- */
-static int __init parse_mem(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp(arg, "nopentium") == 0) {
- setup_clear_cpu_cap(X86_FEATURE_PSE);
- } else {
- /* If the user specifies memory size, we
- * limit the BIOS-provided memory map to
- * that size. exactmap can be used to specify
- * the exact map. mem=number can be used to
- * trim the existing memory map.
- */
- unsigned long long mem_size;
-
- mem_size = memparse(arg, &arg);
- limit_regions(mem_size);
- user_defined_memmap = 1;
- }
- return 0;
-}
-early_param("mem", parse_mem);
-
-#ifdef CONFIG_PROC_VMCORE
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel.
- */
-static int __init parse_elfcorehdr(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- elfcorehdr_addr = memparse(arg, &arg);
- return 0;
-}
-early_param("elfcorehdr", parse_elfcorehdr);
-#endif /* CONFIG_PROC_VMCORE */
-
-/*
- * highmem=size forces highmem to be exactly 'size' bytes.
- * This works even on boxes that have no highmem otherwise.
- * This also works to reduce highmem size on bigger boxes.
- */
-static int __init parse_highmem(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
- return 0;
-}
-early_param("highmem", parse_highmem);
-
-/*
- * vmalloc=size forces the vmalloc area to be exactly 'size'
- * bytes. This can be used to increase (or decrease) the
- * vmalloc area - the default is 128m.
- */
-static int __init parse_vmalloc(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- __VMALLOC_RESERVE = memparse(arg, &arg);
- return 0;
-}
-early_param("vmalloc", parse_vmalloc);
-
-#ifndef CONFIG_XEN
-/*
- * reservetop=size reserves a hole at the top of the kernel address space which
- * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
- * so relocating the fixmap can be done before paging initialization.
- */
-static int __init parse_reservetop(char *arg)
-{
- unsigned long address;
-
- if (!arg)
- return -EINVAL;
-
- address = memparse(arg, &arg);
- reserve_top_address(address);
- return 0;
-}
-early_param("reservetop", parse_reservetop);
-#endif
-
-/*
- * Determine low and high memory ranges:
- */
-unsigned long __init find_max_low_pfn(void)
-{
- unsigned long max_low_pfn;
-
- max_low_pfn = max_pfn;
- if (max_low_pfn > MAXMEM_PFN) {
- if (highmem_pages == -1)
- highmem_pages = max_pfn - MAXMEM_PFN;
- if (highmem_pages + MAXMEM_PFN < max_pfn)
- max_pfn = MAXMEM_PFN + highmem_pages;
- if (highmem_pages + MAXMEM_PFN > max_pfn) {
- printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
- highmem_pages = 0;
- }
- max_low_pfn = MAXMEM_PFN;
-#ifndef CONFIG_HIGHMEM
- /* Maximum memory usable is what is directly addressable */
- printk(KERN_WARNING "Warning only %ldMB will be used.\n",
- MAXMEM>>20);
- if (max_pfn > MAX_NONPAE_PFN)
- printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
- else
- printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
- max_pfn = MAXMEM_PFN;
-#else /* !CONFIG_HIGHMEM */
-#ifndef CONFIG_HIGHMEM64G
- if (max_pfn > MAX_NONPAE_PFN) {
- max_pfn = MAX_NONPAE_PFN;
- printk(KERN_WARNING "Warning only 4GB will be used.\n");
- printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
- }
-#endif /* !CONFIG_HIGHMEM64G */
-#endif /* !CONFIG_HIGHMEM */
- } else {
- if (highmem_pages == -1)
- highmem_pages = 0;
-#ifdef CONFIG_HIGHMEM
- if (highmem_pages >= max_pfn) {
- printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
- highmem_pages = 0;
- }
- if (highmem_pages) {
- if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
- printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
- highmem_pages = 0;
- }
- max_low_pfn -= highmem_pages;
- }
-#else
- if (highmem_pages)
- printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
-#endif
- }
- return max_low_pfn;
-}
-
-#ifndef CONFIG_XEN
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
- unsigned int lowmem, ebda_addr;
-
- /* To determine the position of the EBDA and the */
- /* end of conventional memory, we need to look at */
- /* the BIOS data area. In a paravirtual environment */
- /* that area is absent. We'll just have to assume */
- /* that the paravirt case can handle memory setup */
- /* correctly, without our help. */
- if (paravirt_enabled())
- return;
-
- /* end of low (conventional) memory */
- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
- lowmem <<= 10;
-
- /* start of EBDA area */
- ebda_addr = get_bios_ebda();
-
- /* Fixup: bios puts an EBDA in the top 64K segment */
- /* of conventional memory, but does not adjust lowmem. */
- if ((lowmem - ebda_addr) <= 0x10000)
- lowmem = ebda_addr;
-
- /* Fixup: bios does not report an EBDA at all. */
- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
- lowmem = 0x9f000;
-
- /* Paranoia: should never happen, but... */
- if ((lowmem == 0) || (lowmem >= 0x100000))
- lowmem = 0x9f000;
-
- /* reserve all memory between lowmem and the 1MB mark */
- reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
-}
-#endif
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-static void __init setup_bootmem_allocator(void);
-static unsigned long __init setup_memory(void)
-{
- /*
- * partially used pages are not usable - thus
- * we are rounding upwards:
- */
- min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
- xen_start_info->nr_pt_frames;
-
- max_low_pfn = find_max_low_pfn();
-
-#ifdef CONFIG_HIGHMEM
- highstart_pfn = highend_pfn = max_pfn;
- if (max_pfn > max_low_pfn) {
- highstart_pfn = max_low_pfn;
- }
- printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
- pages_to_mb(highend_pfn - highstart_pfn));
- num_physpages = highend_pfn;
- high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
-#else
- num_physpages = max_low_pfn;
- high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
-#endif
-#ifdef CONFIG_FLATMEM
- max_mapnr = num_physpages;
-#endif
- printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
- pages_to_mb(max_low_pfn));
-
- setup_bootmem_allocator();
-
- return max_low_pfn;
-}
-
-static void __init zone_sizes_init(void)
-{
- unsigned long max_zone_pfns[MAX_NR_ZONES];
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
- max_zone_pfns[ZONE_DMA] =
- virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
- max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
- add_active_range(0, 0, min(xen_start_info->nr_pages, highend_pfn));
- add_active_range(0, highend_pfn, highend_pfn);
-#else
- add_active_range(0, 0, min(xen_start_info->nr_pages, max_low_pfn));
- add_active_range(0, max_low_pfn, max_low_pfn);
-#endif
-
- free_area_init_nodes(max_zone_pfns);
-}
-#else
-extern unsigned long __init setup_memory(void);
-extern void zone_sizes_init(void);
-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
-
-static inline unsigned long long get_total_mem(void)
-{
- unsigned long long total;
-
- total = max_low_pfn - min_low_pfn;
-#ifdef CONFIG_HIGHMEM
- total += highend_pfn - highstart_pfn;
-#endif
-
- return total << PAGE_SHIFT;
-}
-
-#ifdef CONFIG_KEXEC
-#ifndef CONFIG_XEN
-static void __init reserve_crashkernel(void)
-{
- unsigned long long total_mem;
- unsigned long long crash_size, crash_base;
- int ret;
-
- total_mem = get_total_mem();
-
- ret = parse_crashkernel(boot_command_line, total_mem,
- &crash_size, &crash_base);
- if (ret == 0 && crash_size > 0) {
- if (crash_base > 0) {
- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
- "for crashkernel (System RAM: %ldMB)\n",
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
-
- if (reserve_bootmem(crash_base, crash_size,
- BOOTMEM_EXCLUSIVE) < 0) {
- printk(KERN_INFO "crashkernel reservation "
- "failed - memory is in use\n");
- return;
- }
-
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
- } else
- printk(KERN_INFO "crashkernel reservation failed - "
- "you have to specify a base address\n");
- }
-}
-#else
-#define reserve_crashkernel xen_machine_kexec_setup_resources
-#endif
-#else
-static inline void __init reserve_crashkernel(void)
-{}
-#endif
-
-#ifdef CONFIG_BLK_DEV_INITRD
-
-static bool do_relocate_initrd = false;
-
-static void __init reserve_initrd(void)
-{
- unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
- unsigned long ramdisk_size = xen_start_info->mod_len;
- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
- unsigned long ramdisk_here;
-
- initrd_start = 0;
-
- if (!xen_start_info->mod_start || !ramdisk_size)
- return; /* No initrd provided by bootloader */
-
- if (ramdisk_end < ramdisk_image) {
- printk(KERN_ERR "initrd wraps around end of memory, "
- "disabling initrd\n");
- return;
- }
- if (ramdisk_size >= end_of_lowmem/2) {
- printk(KERN_ERR "initrd too large to handle, "
- "disabling initrd\n");
- return;
- }
- if (ramdisk_end <= end_of_lowmem) {
- /* All in lowmem, easy case */
- reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
- initrd_start = ramdisk_image + PAGE_OFFSET;
- initrd_end = initrd_start+ramdisk_size;
- return;
- }
-
- /* We need to move the initrd down into lowmem */
- ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
-
- /* Note: this includes all the lowmem currently occupied by
- the initrd, we rely on that fact to keep the data intact. */
- reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
- initrd_start = ramdisk_here + PAGE_OFFSET;
- initrd_end = initrd_start + ramdisk_size;
-
- do_relocate_initrd = true;
-}
-
-#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
-
-static void __init relocate_initrd(void)
-{
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
- unsigned long ramdisk_here;
- unsigned long slop, clen, mapaddr;
- char *p, *q;
-
- if (!do_relocate_initrd)
- return;
-
- ramdisk_here = initrd_start - PAGE_OFFSET;
-
- q = (char *)initrd_start;
-
- /* Copy any lowmem portion of the initrd */
- if (ramdisk_image < end_of_lowmem) {
- clen = end_of_lowmem - ramdisk_image;
- p = (char *)__va(ramdisk_image);
- memcpy(q, p, clen);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
-
- /* Copy the highmem portion of the initrd */
- while (ramdisk_size) {
- slop = ramdisk_image & ~PAGE_MASK;
- clen = ramdisk_size;
- if (clen > MAX_MAP_CHUNK-slop)
- clen = MAX_MAP_CHUNK-slop;
- mapaddr = ramdisk_image & PAGE_MASK;
- p = early_ioremap(mapaddr, clen+slop);
- memcpy(q, p+slop, clen);
- early_iounmap(p, clen+slop);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
-}
-
-#endif /* CONFIG_BLK_DEV_INITRD */
-
-void __init setup_bootmem_allocator(void)
-{
- unsigned long bootmap_size;
- /*
- * Initialize the boot-time allocator (with low memory only):
- */
- bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
-
- register_bootmem_low_pages(max_low_pfn);
-
- /*
- * Reserve the bootmem bitmap itself as well. We do this in two
- * steps (first step was init_bootmem()) because this catches
- * the (very unlikely) case of us accidentally initializing the
- * bootmem allocator with an invalid RAM area.
- */
- reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
- bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
- BOOTMEM_DEFAULT);
-
-#ifndef CONFIG_XEN
- /*
- * reserve physical page 0 - it's a special BIOS page on many boxes,
- * enabling clean reboots, SMP operation, laptop functions.
- */
- reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
-
- /* reserve EBDA region */
- reserve_ebda_region();
-
-#ifdef CONFIG_SMP
- /*
- * But first pinch a few for the stack/trampoline stuff
- * FIXME: Don't need the extra page at 4K, but need to fix
- * trampoline before removing it. (see the GDT stuff)
- */
- reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
-#endif
-#ifdef CONFIG_ACPI_SLEEP
- /*
- * Reserve low memory region for sleep support.
- */
- acpi_reserve_bootmem();
-#endif
-#endif /* !CONFIG_XEN */
-
-#ifdef CONFIG_BLK_DEV_INITRD
- reserve_initrd();
-#endif
- numa_kva_reserve();
- reserve_crashkernel();
-
- reserve_ibft_region();
-}
-
-/*
- * The node 0 pgdat is initialized before all of these because
- * it's needed for bootmem. node>0 pgdats have their virtual
- * space allocated before the pagetables are in place to access
- * them, so they can't be cleared then.
- *
- * This should all compile down to nothing when NUMA is off.
- */
-static void __init remapped_pgdat_init(void)
-{
- int nid;
-
- for_each_online_node(nid) {
- if (nid != 0)
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
- }
-}
-
-#ifdef CONFIG_MCA
-static void set_mca_bus(int x)
-{
- MCA_bus = x;
-}
-#else
-static void set_mca_bus(int x) { }
-#endif
-
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-char * __init __attribute__((weak)) memory_setup(void)
-{
- return machine_specific_memory_setup();
-}
-
-#ifdef CONFIG_NUMA
-/*
- * In the golden day, when everything among i386 and x86_64 will be
- * integrated, this will not live here
- */
-void *x86_cpu_to_node_map_early_ptr;
-int x86_cpu_to_node_map_init[NR_CPUS] = {
- [0 ... NR_CPUS-1] = NUMA_NO_NODE
-};
-DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
-#endif
-
-/*
- * Determine if we were loaded by an EFI loader. If so, then we have also been
- * passed the efi memmap, systab, etc., so we should use these data structures
- * for initialization. Note, the efi init code path is determined by the
- * global efi_enabled. This allows the same kernel image to be used on existing
- * systems (with a traditional BIOS) as well as on EFI systems.
- */
-void __init setup_arch(char **cmdline_p)
-{
- int i, j, k, fpp;
- struct physdev_set_iopl set_iopl;
- unsigned long max_low_pfn;
- unsigned long p2m_pages;
-
- /* Force a quick death if the kernel panics (not domain 0). */
- extern int panic_timeout;
- if (!panic_timeout && !is_initial_xendomain())
- panic_timeout = 1;
-
- /* Register a call for panic conditions. */
- atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
-
- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_4gb_segments));
- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_writable_pagetables));
-
- memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
- pre_setup_arch_hook();
- early_cpu_init();
- early_ioremap_init();
-#ifdef CONFIG_SMP
- prefill_possible_map();
-#endif
-
-#ifdef CONFIG_EFI
- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL32", 4))
- efi_enabled = 1;
-#endif
-
- /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
- properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
- */
- ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
- screen_info = boot_params.screen_info;
- copy_edid();
- apm_info.bios = boot_params.apm_bios_info;
- ist_info = boot_params.ist_info;
- saved_video_mode = boot_params.hdr.vid_mode;
- if( boot_params.sys_desc_table.length != 0 ) {
- set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
- machine_id = boot_params.sys_desc_table.table[0];
- machine_submodel_id = boot_params.sys_desc_table.table[1];
- BIOS_revision = boot_params.sys_desc_table.table[2];
- }
- bootloader_type = boot_params.hdr.type_of_loader;
-
- if (is_initial_xendomain()) {
- const struct dom0_vga_console_info *info =
- (void *)((char *)xen_start_info +
- xen_start_info->console.dom0.info_off);
-
- dom0_init_screen_info(info,
- xen_start_info->console.dom0.info_size);
- xen_start_info->console.domU.mfn = 0;
- xen_start_info->console.domU.evtchn = 0;
- } else
- screen_info.orig_video_isVGA = 0;
-
-#ifdef CONFIG_BLK_DEV_RAM
- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
- rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
- rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
-#endif
-
- ARCH_SETUP
-
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- print_memory_map(memory_setup());
-
- copy_edd();
-
- if (!boot_params.hdr.root_flags)
- root_mountflags &= ~MS_RDONLY;
- init_mm.start_code = (unsigned long) _text;
- init_mm.end_code = (unsigned long) _etext;
- init_mm.end_data = (unsigned long) _edata;
- init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
- xen_start_info->nr_pt_frames) << PAGE_SHIFT;
-
- code_resource.start = virt_to_phys(_text);
- code_resource.end = virt_to_phys(_etext)-1;
- data_resource.start = virt_to_phys(_etext);
- data_resource.end = virt_to_phys(_edata)-1;
- bss_resource.start = virt_to_phys(&__bss_start);
- bss_resource.end = virt_to_phys(&__bss_stop)-1;
-
- if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
- i = COMMAND_LINE_SIZE;
- memcpy(boot_command_line, xen_start_info->cmd_line, i);
- boot_command_line[i - 1] = '\0';
- parse_early_param();
-
- if (user_defined_memmap) {
- printk(KERN_INFO "user-defined physical RAM map:\n");
- print_memory_map("user");
- }
-
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
- *cmdline_p = command_line;
-
- if (efi_enabled)
- efi_init();
-
- /* update e820 for memory not covered by WB MTRRs */
- propagate_e820_map();
- mtrr_bp_init();
-#ifndef CONFIG_XEN
- if (mtrr_trim_uncached_memory(max_pfn))
- propagate_e820_map();
-#endif
-
- max_low_pfn = setup_memory();
-
-#ifdef CONFIG_KVM_CLOCK
- kvmclock_init();
-#endif
-
-#ifdef CONFIG_VMI
- /*
- * Must be after max_low_pfn is determined, and before kernel
- * pagetables are setup.
- */
- vmi_init();
-#endif
- kvm_guest_init();
-
- /*
- * NOTE: before this point _nobody_ is allowed to allocate
- * any memory using the bootmem allocator. Although the
- * allocator is now initialised only the first 8Mb of the kernel
- * virtual address space has been mapped. All allocations before
- * paging_init() has completed must use the alloc_bootmem_low_pages()
- * variant (which allocates DMA'able memory) and care must be taken
- * not to exceed the 8Mb limit.
- */
-
-#ifdef CONFIG_SMP
- smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
-#endif
- paging_init();
-
- /*
- * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
- */
-
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- if (init_ohci1394_dma_early)
- init_ohci1394_dma_on_all_controllers();
-#endif
-
- remapped_pgdat_init();
- sparse_init();
- zone_sizes_init();
-
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
- /*
- * Find and reserve possible boot-time SMP configuration:
- */
- find_smp_config();
-#endif
-
- p2m_pages = max_pfn;
- if (xen_start_info->nr_pages > max_pfn) {
- /*
- * the max_pfn was shrunk (probably by mem= or highmem=
- * kernel parameter); shrink reservation with the HV
- */
- struct xen_memory_reservation reservation = {
- .address_bits = 0,
- .extent_order = 0,
- .domid = DOMID_SELF
- };
- unsigned int difference;
- int ret;
-
- difference = xen_start_info->nr_pages - max_pfn;
-
- set_xen_guest_handle(reservation.extent_start,
- ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
- reservation.nr_extents = difference;
- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
- &reservation);
- BUG_ON (ret != difference);
- }
- else if (max_pfn > xen_start_info->nr_pages)
- p2m_pages = xen_start_info->nr_pages;
-
- /* Make sure we have a correctly sized P->M table. */
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- phys_to_machine_mapping = alloc_bootmem_low_pages(
- max_pfn * sizeof(unsigned long));
- memset(phys_to_machine_mapping, ~0,
- max_pfn * sizeof(unsigned long));
- memcpy(phys_to_machine_mapping,
- (unsigned long *)xen_start_info->mfn_list,
- p2m_pages * sizeof(unsigned long));
- free_bootmem(
- __pa(xen_start_info->mfn_list),
- PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
- sizeof(unsigned long))));
-
- /*
- * Initialise the list of the frames that specify the list of
- * frames that make up the p2m table. Used by save/restore
- */
- pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
-
- fpp = PAGE_SIZE/sizeof(unsigned long);
- for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
- if ((j % fpp) == 0) {
- k++;
- BUG_ON(k>=16);
- pfn_to_mfn_frame_list[k] =
- alloc_bootmem_low_pages(PAGE_SIZE);
- pfn_to_mfn_frame_list_list[k] =
- virt_to_mfn(pfn_to_mfn_frame_list[k]);
- j=0;
- }
- pfn_to_mfn_frame_list[k][j] =
- virt_to_mfn(&phys_to_machine_mapping[i]);
- }
- HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(pfn_to_mfn_frame_list_list);
- }
-
- /* Mark all ISA DMA channels in-use - using them wouldn't work. */
- for (i = 0; i < MAX_DMA_CHANNELS; ++i)
- if (i != 4 && request_dma(i, "xen") != 0)
- BUG();
-
- /*
- * NOTE: at this point the bootmem allocator is fully available.
- */
-
-#ifdef CONFIG_BLK_DEV_INITRD
- relocate_initrd();
-#endif
-
- paravirt_post_allocator_init();
-
- if (is_initial_xendomain())
- dmi_scan_machine();
-
- io_delay_init();
-
-#if defined(CONFIG_X86_SMP) && !defined(CONFIG_XEN)
- /*
- * setup to use the early static init tables during kernel startup
- * X86_SMP will exclude sub-arches that don't deal well with it.
- */
- x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
- x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
-#ifdef CONFIG_NUMA
- x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
-#endif
-#endif
-
-#ifdef CONFIG_X86_GENERICARCH
- generic_apic_probe();
-#endif
-
- set_iopl.iopl = 1;
- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
-
-#ifdef CONFIG_ACPI
- if (!is_initial_xendomain()) {
- printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
- acpi_disabled = 1;
- acpi_ht = 0;
- }
-
- /*
- * Parse the ACPI tables for possible boot-time SMP configuration.
- */
- acpi_boot_table_init();
-#endif
-
-#ifndef CONFIG_XEN
- early_quirks();
-#endif
-
-#ifdef CONFIG_ACPI
- acpi_boot_init();
-
-#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
- if (def_to_bigsmp)
- printk(KERN_WARNING "More than 8 CPUs detected and "
- "CONFIG_X86_PC cannot handle it.\nUse "
- "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
-#endif
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- if (smp_found_config)
- get_smp_config();
-#endif
-
- e820_register_memory();
- e820_mark_nosave_regions();
-
- if (is_initial_xendomain()) {
-#ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
- if (!efi_enabled ||
- (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
- conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
-#endif
-#endif
- } else {
-#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
-#endif
- }
-}
-
-static int
-xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
- HYPERVISOR_shutdown(SHUTDOWN_crash);
- /* we're never actually going to get here... */
- return NOTIFY_DONE;
-}
-
-/*
- * Request address space for all standard resources
- *
- * This is called just before pcibios_init(), which is also a
- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
- */
-static int __init request_standard_resources(void)
-{
- int i;
-
- /* Nothing to do if not running in dom0. */
- if (!is_initial_xendomain())
- return 0;
-
- printk(KERN_INFO "Setting up standard PCI resources\n");
- init_iomem_resources(&code_resource, &data_resource, &bss_resource);
-
- request_resource(&iomem_resource, &video_ram_resource);
-
- /* request I/O space for devices used on all i[345]86 PCs */
- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
- request_resource(&ioport_resource, &standard_io_resources[i]);
- return 0;
-}
-
-subsys_initcall(request_standard_resources);
--- head-2011-03-11.orig/arch/x86/kernel/setup_64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,1442 +0,0 @@
-/*
- * Copyright (C) 1995 Linus Torvalds
- */
-
-/*
- * This file handles the architecture-dependent parts of initialization
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/user.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/initrd.h>
-#include <linux/highmem.h>
-#include <linux/bootmem.h>
-#include <linux/module.h>
-#include <asm/processor.h>
-#include <linux/console.h>
-#include <linux/seq_file.h>
-#include <linux/crash_dump.h>
-#include <linux/root_dev.h>
-#include <linux/pci.h>
-#include <asm/pci-direct.h>
-#include <linux/efi.h>
-#include <linux/acpi.h>
-#include <linux/kallsyms.h>
-#include <linux/edd.h>
-#include <linux/iscsi_ibft.h>
-#include <linux/mmzone.h>
-#include <linux/kexec.h>
-#include <linux/cpufreq.h>
-#include <linux/dmi.h>
-#include <linux/dma-mapping.h>
-#include <linux/ctype.h>
-#include <linux/sort.h>
-#include <linux/uaccess.h>
-#include <linux/init_ohci1394_dma.h>
-#include <linux/kvm_para.h>
-
-#include <asm/mtrr.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/vsyscall.h>
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/msr.h>
-#include <asm/desc.h>
-#include <video/edid.h>
-#include <asm/e820.h>
-#include <asm/dma.h>
-#include <asm/gart.h>
-#include <asm/mpspec.h>
-#include <asm/mmu_context.h>
-#include <asm/proto.h>
-#include <asm/setup.h>
-#include <asm/numa.h>
-#include <asm/sections.h>
-#include <asm/dmi.h>
-#include <asm/cacheflush.h>
-#include <asm/mce.h>
-#include <asm/ds.h>
-#include <asm/topology.h>
-#include <asm/pat.h>
-
-#include <mach_apic.h>
-#ifdef CONFIG_XEN
-#include <linux/percpu.h>
-#include <xen/interface/physdev.h>
-#include "setup_arch_pre.h"
-#include <asm/hypervisor.h>
-#include <xen/interface/nmi.h>
-#include <xen/features.h>
-#include <xen/firmware.h>
-#include <xen/xencons.h>
-#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
-#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
-#include <asm/mach-xen/setup_arch_post.h>
-#include <xen/interface/memory.h>
-
-#ifdef CONFIG_XEN
-#include <xen/interface/kexec.h>
-#endif
-
-extern unsigned long start_pfn;
-extern struct edid_info edid_info;
-
-shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
-EXPORT_SYMBOL(HYPERVISOR_shared_info);
-
-static int xen_panic_event(struct notifier_block *, unsigned long, void *);
-static struct notifier_block xen_panic_block = {
- xen_panic_event, NULL, 0 /* try to go last */
-};
-
-unsigned long *phys_to_machine_mapping;
-unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
-
-EXPORT_SYMBOL(phys_to_machine_mapping);
-
-DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
-DEFINE_PER_CPU(int, nr_multicall_ents);
-
-/* Raw start-of-day parameters from the hypervisor. */
-start_info_t *xen_start_info;
-EXPORT_SYMBOL(xen_start_info);
-#endif
-
-/*
- * Machine setup..
- */
-
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
-EXPORT_SYMBOL(boot_cpu_data);
-
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
-
-unsigned long mmu_cr4_features;
-
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
-
-unsigned long saved_video_mode;
-
-int force_mwait __cpuinitdata;
-
-/*
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
-/*
- * Setup options
- */
-struct screen_info screen_info;
-EXPORT_SYMBOL(screen_info);
-struct sys_desc_table_struct {
- unsigned short length;
- unsigned char table[0];
-};
-
-struct edid_info edid_info;
-EXPORT_SYMBOL_GPL(edid_info);
-
-extern int root_mountflags;
-
-char __initdata command_line[COMMAND_LINE_SIZE];
-
-static struct resource standard_io_resources[] = {
- { .name = "dma1", .start = 0x00, .end = 0x1f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "pic1", .start = 0x20, .end = 0x21,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "timer0", .start = 0x40, .end = 0x43,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "timer1", .start = 0x50, .end = 0x53,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "keyboard", .start = 0x60, .end = 0x60,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "keyboard", .start = 0x64, .end = 0x64,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "dma page reg", .start = 0x80, .end = 0x8f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "pic2", .start = 0xa0, .end = 0xa1,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "dma2", .start = 0xc0, .end = 0xdf,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "fpu", .start = 0xf0, .end = 0xff,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO }
-};
-
-#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
-
-static struct resource data_resource = {
- .name = "Kernel data",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_RAM,
-};
-static struct resource code_resource = {
- .name = "Kernel code",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_RAM,
-};
-static struct resource bss_resource = {
- .name = "Kernel bss",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_RAM,
-};
-
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
-
-#ifdef CONFIG_PROC_VMCORE
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel. This option will be passed
- * by kexec loader to the capture kernel.
- */
-static int __init setup_elfcorehdr(char *arg)
-{
- char *end;
- if (!arg)
- return -EINVAL;
- elfcorehdr_addr = memparse(arg, &end);
- return end > arg ? 0 : -EINVAL;
-}
-early_param("elfcorehdr", setup_elfcorehdr);
-#endif
-
-#ifndef CONFIG_NUMA
-static void __init
-contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long bootmap_size, bootmap;
-
- bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
- bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
- PAGE_SIZE);
- if (bootmap == -1L)
- panic("Cannot find bootmem map of size %ld\n", bootmap_size);
- bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
- e820_register_active_regions(0, start_pfn, end_pfn);
-#ifdef CONFIG_XEN
- if (xen_start_info->nr_pages < end_pfn)
- end_pfn = xen_start_info->nr_pages;
-#endif
- free_bootmem_with_active_regions(0, end_pfn);
- early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
- reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
-}
-#endif
-
-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
-struct edd edd;
-#ifdef CONFIG_EDD_MODULE
-EXPORT_SYMBOL(edd);
-#endif
-#ifndef CONFIG_XEN
-/**
- * copy_edd() - Copy the BIOS EDD information
- * from boot_params into a safe place.
- *
- */
-static inline void copy_edd(void)
-{
- memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
- sizeof(edd.mbr_signature));
- memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
- edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
- edd.edd_info_nr = boot_params.eddbuf_entries;
-}
-#endif
-#else
-static inline void copy_edd(void)
-{
-}
-#endif
-
-#ifdef CONFIG_KEXEC
-#ifndef CONFIG_XEN
-static void __init reserve_crashkernel(void)
-{
- unsigned long long total_mem;
- unsigned long long crash_size, crash_base;
- int ret;
-
- total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
-
- ret = parse_crashkernel(boot_command_line, total_mem,
- &crash_size, &crash_base);
- if (ret == 0 && crash_size) {
- if (crash_base <= 0) {
- printk(KERN_INFO "crashkernel reservation failed - "
- "you have to specify a base address\n");
- return;
- }
-
- if (reserve_bootmem(crash_base, crash_size,
- BOOTMEM_EXCLUSIVE) < 0) {
- printk(KERN_INFO "crashkernel reservation failed - "
- "memory is in use\n");
- return;
- }
-
- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
- "for crashkernel (System RAM: %ldMB)\n",
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
- insert_resource(&iomem_resource, &crashk_res);
- }
-}
-#else
-#define reserve_crashkernel xen_machine_kexec_setup_resources
-#endif
-#else
-static inline void __init reserve_crashkernel(void)
-{}
-#endif
-
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-void __attribute__((weak)) __init memory_setup(void)
-{
- machine_specific_memory_setup();
-}
-
-static void __init parse_setup_data(void)
-{
- struct setup_data *data;
- unsigned long pa_data;
-
- if (boot_params.hdr.version < 0x0209)
- return;
- pa_data = boot_params.hdr.setup_data;
- while (pa_data) {
- data = early_ioremap(pa_data, PAGE_SIZE);
- switch (data->type) {
- default:
- break;
- }
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
- free_early(pa_data, pa_data+sizeof(*data)+data->len);
-#endif
- pa_data = data->next;
- early_iounmap(data, PAGE_SIZE);
- }
-}
-
-#ifdef CONFIG_PCI_MMCONFIG
-extern void __cpuinit fam10h_check_enable_mmcfg(void);
-extern void __init check_enable_amd_mmconf_dmi(void);
-#else
-void __cpuinit fam10h_check_enable_mmcfg(void)
-{
-}
-void __init check_enable_amd_mmconf_dmi(void)
-{
-}
-#endif
-
-/*
- * setup_arch - architecture-specific boot-time initializations
- *
- * Note: On x86_64, fixmaps are ready for use even before this is called.
- */
-void __init setup_arch(char **cmdline_p)
-{
- unsigned i;
-
-#ifdef CONFIG_XEN
- extern struct e820map machine_e820;
-
- printk(KERN_INFO "Command line: %s\n", boot_command_line);
-
- /* Register a call for panic conditions. */
- atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
-
- WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_writable_pagetables));
-
- early_ioremap_init();
-
- ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
- screen_info = boot_params.screen_info;
-
- if (is_initial_xendomain()) {
- const struct dom0_vga_console_info *info =
- (void *)((char *)xen_start_info +
- xen_start_info->console.dom0.info_off);
-
- dom0_init_screen_info(info,
- xen_start_info->console.dom0.info_size);
- xen_start_info->console.domU.mfn = 0;
- xen_start_info->console.domU.evtchn = 0;
- } else
- screen_info.orig_video_isVGA = 0;
-
- copy_edid();
-#else
- printk(KERN_INFO "Command line: %s\n", boot_command_line);
-
- ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
- screen_info = boot_params.screen_info;
- edid_info = boot_params.edid_info;
-#endif /* !CONFIG_XEN */
- saved_video_mode = boot_params.hdr.vid_mode;
- bootloader_type = boot_params.hdr.type_of_loader;
-
-#ifdef CONFIG_BLK_DEV_RAM
- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
- rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
- rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
-#endif
-#ifdef CONFIG_EFI
- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL64", 4))
- efi_enabled = 1;
-#endif
-
- ARCH_SETUP
-
- memory_setup();
- copy_edd();
-
- if (!boot_params.hdr.root_flags)
- root_mountflags &= ~MS_RDONLY;
- init_mm.start_code = (unsigned long) &_text;
- init_mm.end_code = (unsigned long) &_etext;
- init_mm.end_data = (unsigned long) &_edata;
- init_mm.brk = (unsigned long) &_end;
-
- code_resource.start = virt_to_phys(&_text);
- code_resource.end = virt_to_phys(&_etext)-1;
- data_resource.start = virt_to_phys(&_etext);
- data_resource.end = virt_to_phys(&_edata)-1;
- bss_resource.start = virt_to_phys(&__bss_start);
- bss_resource.end = virt_to_phys(&__bss_stop)-1;
-
- early_identify_cpu(&boot_cpu_data);
-
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
- *cmdline_p = command_line;
-
- parse_setup_data();
-
- parse_early_param();
-
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- if (init_ohci1394_dma_early)
- init_ohci1394_dma_on_all_controllers();
-#endif
-
- finish_e820_parsing();
-
-#ifndef CONFIG_XEN
- /* after parse_early_param, so could debug it */
- insert_resource(&iomem_resource, &code_resource);
- insert_resource(&iomem_resource, &data_resource);
- insert_resource(&iomem_resource, &bss_resource);
-#endif
-
- early_gart_iommu_check();
-
- e820_register_active_regions(0, 0, -1UL);
- /*
- * partially used pages are not usable - thus
- * we are rounding upwards:
- */
- end_pfn = e820_end_of_ram();
- /* update e820 for memory not covered by WB MTRRs */
- mtrr_bp_init();
-#ifndef CONFIG_XEN
- if (mtrr_trim_uncached_memory(end_pfn)) {
- e820_register_active_regions(0, 0, -1UL);
- end_pfn = e820_end_of_ram();
- }
-#endif
-
- num_physpages = end_pfn;
- max_mapnr = end_pfn;
-
- check_efer();
-
- max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
- if (efi_enabled)
- efi_init();
-
-#ifndef CONFIG_XEN
- vsmp_init();
-#endif
-
- if (is_initial_xendomain())
- dmi_scan_machine();
-
- io_delay_init();
-
-#ifdef CONFIG_KVM_CLOCK
- kvmclock_init();
-#endif
-
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- /* setup to use the early static init tables during kernel startup */
- x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
- x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
-#ifdef CONFIG_NUMA
- x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
-#endif
-#endif
-
- /* How many end-of-memory variables you have, grandma! */
- max_low_pfn = end_pfn;
- max_pfn = end_pfn;
- high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
-
- /* Remove active ranges so rediscovery with NUMA-awareness happens */
- remove_all_active_ranges();
-
-#ifdef CONFIG_ACPI_NUMA
- /*
- * Parse SRAT to discover nodes.
- */
- acpi_numa_init();
-#endif
-
-#ifdef CONFIG_NUMA
- numa_initmem_init(0, end_pfn);
-#else
- contig_initmem_init(0, end_pfn);
-#endif
-
-#ifndef CONFIG_XEN
- dma32_reserve_bootmem();
-
-#ifdef CONFIG_ACPI_SLEEP
- /*
- * Reserve low memory region for sleep support.
- */
- acpi_reserve_bootmem();
-#endif
-
- if (efi_enabled)
- efi_reserve_bootmem();
-#endif
-
-#ifdef CONFIG_BLK_DEV_INITRD
-#ifdef CONFIG_XEN
- if (xen_start_info->mod_start) {
- unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
- unsigned long ramdisk_size = xen_start_info->mod_len;
-#else
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
-#endif
- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
- unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
-
- if (ramdisk_end <= end_of_mem) {
- /*
- * don't need to reserve again, already reserved early
- * in x86_64_start_kernel, and early_res_to_bootmem
- * convert that to reserved in bootmem
- */
- initrd_start = ramdisk_image + PAGE_OFFSET;
- initrd_end = initrd_start+ramdisk_size;
-#ifdef CONFIG_XEN
- initrd_below_start_ok = 1;
-#endif
- } else {
- free_bootmem(ramdisk_image, ramdisk_size);
- printk(KERN_ERR "initrd extends beyond end of memory "
- "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
- ramdisk_end, end_of_mem);
- initrd_start = 0;
- }
- }
-#endif
- reserve_crashkernel();
-
- reserve_ibft_region();
-
- paging_init();
- map_vsyscall();
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Find and reserve possible boot-time SMP configuration:
- */
- find_smp_config();
-#endif
-#ifdef CONFIG_XEN
- {
- int i, j, k, fpp;
- unsigned long p2m_pages;
-
- p2m_pages = end_pfn;
- if (xen_start_info->nr_pages > end_pfn) {
- /*
- * the end_pfn was shrunk (probably by mem= or highmem=
- * kernel parameter); shrink reservation with the HV
- */
- struct xen_memory_reservation reservation = {
- .address_bits = 0,
- .extent_order = 0,
- .domid = DOMID_SELF
- };
- unsigned int difference;
- int ret;
-
- difference = xen_start_info->nr_pages - end_pfn;
-
- set_xen_guest_handle(reservation.extent_start,
- ((unsigned long *)xen_start_info->mfn_list) + end_pfn);
- reservation.nr_extents = difference;
- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
- &reservation);
- BUG_ON (ret != difference);
- }
- else if (end_pfn > xen_start_info->nr_pages)
- p2m_pages = xen_start_info->nr_pages;
-
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- /* Make sure we have a large enough P->M table. */
- phys_to_machine_mapping = alloc_bootmem_pages(
- end_pfn * sizeof(unsigned long));
- memset(phys_to_machine_mapping, ~0,
- end_pfn * sizeof(unsigned long));
- memcpy(phys_to_machine_mapping,
- (unsigned long *)xen_start_info->mfn_list,
- p2m_pages * sizeof(unsigned long));
- free_bootmem(
- __pa(xen_start_info->mfn_list),
- PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
- sizeof(unsigned long))));
-
- /*
- * Initialise the list of the frames that specify the
- * list of frames that make up the p2m table. Used by
- * save/restore.
- */
- pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE);
-
- fpp = PAGE_SIZE/sizeof(unsigned long);
- for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
- if ((j % fpp) == 0) {
- k++;
- BUG_ON(k>=fpp);
- pfn_to_mfn_frame_list[k] =
- alloc_bootmem_pages(PAGE_SIZE);
- pfn_to_mfn_frame_list_list[k] =
- virt_to_mfn(pfn_to_mfn_frame_list[k]);
- j=0;
- }
- pfn_to_mfn_frame_list[k][j] =
- virt_to_mfn(&phys_to_machine_mapping[i]);
- }
- HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(pfn_to_mfn_frame_list_list);
- }
-
- /* Mark all ISA DMA channels in-use - using them wouldn't work. */
- for (i = 0; i < MAX_DMA_CHANNELS; ++i)
- if (i != 4 && request_dma(i, "xen") != 0)
- BUG();
- }
-
-#ifdef CONFIG_ACPI
- if (!is_initial_xendomain()) {
- acpi_disabled = 1;
- acpi_ht = 0;
- }
-#endif
-#endif
-
-#ifndef CONFIG_XEN
- early_quirks();
-#endif
-
-#ifdef CONFIG_ACPI
- /*
- * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
- * Call this early for SRAT node setup.
- */
- acpi_boot_table_init();
-
- /*
- * Read APIC and some other early information from ACPI tables.
- */
- acpi_boot_init();
-#endif
-
- init_cpu_to_node();
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- get_smp_config();
-#ifndef CONFIG_XEN
- init_apic_mappings();
- ioapic_init_mappings();
-#endif
-#endif
-#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
- prefill_possible_map();
-#endif
-
- kvm_guest_init();
-
- /*
- * We trust e820 completely. No explicit ROM probing in memory.
- */
-#ifdef CONFIG_XEN
- if (is_initial_xendomain())
- e820_reserve_resources(machine_e820.map, machine_e820.nr_map);
-#else
- e820_reserve_resources(e820.map, e820.nr_map);
- e820_mark_nosave_regions();
-#endif
-
- /* request I/O space for devices used on all i[345]86 PCs */
- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
- request_resource(&ioport_resource, &standard_io_resources[i]);
-
-#ifdef CONFIG_XEN
- if (is_initial_xendomain())
- e820_setup_gap(machine_e820.map, machine_e820.nr_map);
-#else
- e820_setup_gap(e820.map, e820.nr_map);
-#endif
-
-#ifdef CONFIG_XEN
- {
- struct physdev_set_iopl set_iopl;
-
- set_iopl.iopl = 1;
- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
-
- if (is_initial_xendomain()) {
-#ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
- conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
-#endif
-#endif
- } else {
-#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
-#endif
- }
- }
-#else /* CONFIG_XEN */
-
-#ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
- if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
- conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
-#endif
-#endif
-
-#endif /* !CONFIG_XEN */
-
- /* do this before identify_cpu for boot cpu */
- check_enable_amd_mmconf_dmi();
-}
-
-#ifdef CONFIG_XEN
-static int
-xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
- HYPERVISOR_shutdown(SHUTDOWN_crash);
- /* we're never actually going to get here... */
- return NOTIFY_DONE;
-}
-#endif /* !CONFIG_XEN */
-
-
-static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
-{
- unsigned int *v;
-
- if (c->extended_cpuid_level < 0x80000004)
- return 0;
-
- v = (unsigned int *) c->x86_model_id;
- cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
- cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
- cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
- c->x86_model_id[48] = 0;
- return 1;
-}
-
-
-static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
-{
- unsigned int n, dummy, eax, ebx, ecx, edx;
-
- n = c->extended_cpuid_level;
-
- if (n >= 0x80000005) {
- cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
- printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
- "D cache %dK (%d bytes/line)\n",
- edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
- c->x86_cache_size = (ecx>>24) + (edx>>24);
- /* On K8 L1 TLB is inclusive, so don't count it */
- c->x86_tlbsize = 0;
- }
-
- if (n >= 0x80000006) {
- cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
- ecx = cpuid_ecx(0x80000006);
- c->x86_cache_size = ecx >> 16;
- c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
-
- printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
- c->x86_cache_size, ecx & 0xFF);
- }
- if (n >= 0x80000008) {
- cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- }
-}
-
-#ifdef CONFIG_NUMA
-static int __cpuinit nearby_node(int apicid)
-{
- int i, node;
-
- for (i = apicid - 1; i >= 0; i--) {
- node = apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
- node = apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- return first_node(node_online_map); /* Shouldn't happen */
-}
-#endif
-
-/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
- * Assumes number of cores is a power of two.
- */
-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
-{
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- unsigned bits;
-#ifdef CONFIG_NUMA
- int cpu = smp_processor_id();
- int node = 0;
- unsigned apicid = hard_smp_processor_id();
-#endif
- bits = c->x86_coreid_bits;
-
- /* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
- /* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
-
-#ifdef CONFIG_NUMA
- node = c->phys_proc_id;
- if (apicid_to_node[apicid] != NUMA_NO_NODE)
- node = apicid_to_node[apicid];
- if (!node_online(node)) {
- /* Two possibilities here:
- - The CPU is missing memory and no node was created.
- In that case try picking one from a nearby CPU
- - The APIC IDs differ from the HyperTransport node IDs
- which the K8 northbridge parsing fills in.
- Assume they are all increased by a constant offset,
- but in the same order as the HT nodeids.
- If that doesn't result in a usable node fall back to the
- path for the previous case. */
-
- int ht_nodeid = c->initial_apicid;
-
- if (ht_nodeid >= 0 &&
- apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
- node = apicid_to_node[ht_nodeid];
- /* Pick a nearby node */
- if (!node_online(node))
- node = nearby_node(apicid);
- }
- numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-#endif
-}
-
-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
-{
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- unsigned bits, ecx;
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level < 0x80000008)
- return;
-
- ecx = cpuid_ecx(0x80000008);
-
- c->x86_max_cores = (ecx & 0xff) + 1;
-
- /* CPU telling us the core id bits shift? */
- bits = (ecx >> 12) & 0xF;
-
- /* Otherwise recompute */
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
- }
-
- c->x86_coreid_bits = bits;
-
-#endif
-}
-
-#define ENABLE_C1E_MASK 0x18000000
-#define CPUID_PROCESSOR_SIGNATURE 1
-#define CPUID_XFAM 0x0ff00000
-#define CPUID_XFAM_K8 0x00000000
-#define CPUID_XFAM_10H 0x00100000
-#define CPUID_XFAM_11H 0x00200000
-#define CPUID_XMOD 0x000f0000
-#define CPUID_XMOD_REV_F 0x00040000
-
-#ifndef CONFIG_XEN
-/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
-static __cpuinit int amd_apic_timer_broken(void)
-{
- u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
-
- switch (eax & CPUID_XFAM) {
- case CPUID_XFAM_K8:
- if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
- break;
- case CPUID_XFAM_10H:
- case CPUID_XFAM_11H:
- rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
- if (lo & ENABLE_C1E_MASK)
- return 1;
- break;
- default:
- /* err on the side of caution */
- return 1;
- }
- return 0;
-}
-#endif
-
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
-{
- early_init_amd_mc(c);
-
- /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
- if (c->x86_power & (1<<8))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
-{
- unsigned level;
-
-#ifdef CONFIG_SMP
- unsigned long value;
-
- /*
- * Disable TLB flush filter by setting HWCR.FFDIS on K8
- * bit 6 of msr C001_0015
- *
- * Errata 63 for SH-B3 steppings
- * Errata 122 for all steppings (F+ have it disabled by default)
- */
- if (c->x86 == 15) {
- rdmsrl(MSR_K8_HWCR, value);
- value |= 1 << 6;
- wrmsrl(MSR_K8_HWCR, value);
- }
-#endif
-
- /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
- clear_cpu_cap(c, 0*32+31);
-
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- level = cpuid_eax(1);
- if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
- level >= 0x0f58))
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- if (c->x86 == 0x10 || c->x86 == 0x11)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
- /* Enable workaround for FXSAVE leak */
- if (c->x86 >= 6)
- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
- level = get_model_name(c);
- if (!level) {
- switch (c->x86) {
- case 15:
- /* Should distinguish Models here, but this is only
- a fallback anyways. */
- strcpy(c->x86_model_id, "Hammer");
- break;
- }
- }
- display_cacheinfo(c);
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level >= 0x80000008)
- amd_detect_cmp(c);
-
- if (c->extended_cpuid_level >= 0x80000006 &&
- (cpuid_edx(0x80000006) & 0xf000))
- num_cache_leaves = 4;
- else
- num_cache_leaves = 3;
-
- if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
- set_cpu_cap(c, X86_FEATURE_K8);
-
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
-
- if (c->x86 == 0x10)
- fam10h_check_enable_mmcfg();
-
-#ifndef CONFIG_XEN
- if (amd_apic_timer_broken())
- disable_apic_timer = 1;
-
- if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
- unsigned long long tseg;
-
- /*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
- */
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
- (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
- set_memory_4k((unsigned long)__va(tseg), 1);
- }
-#endif
-}
-
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-{
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
- u32 eax, ebx, ecx, edx;
- int index_msb, core_bits;
-
- cpuid(1, &eax, &ebx, &ecx, &edx);
-
-
- if (!cpu_has(c, X86_FEATURE_HT))
- return;
- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
- goto out;
-
- smp_num_siblings = (ebx & 0xff0000) >> 16;
-
- if (smp_num_siblings == 1) {
- printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1) {
-
- if (smp_num_siblings > NR_CPUS) {
- printk(KERN_WARNING "CPU: Unsupported number of "
- "siblings %d", smp_num_siblings);
- smp_num_siblings = 1;
- return;
- }
-
- index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = phys_pkg_id(index_msb);
-
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
- index_msb = get_count_order(smp_num_siblings);
-
- core_bits = get_count_order(c->x86_max_cores);
-
- c->cpu_core_id = phys_pkg_id(index_msb) &
- ((1 << core_bits) - 1);
- }
-out:
- if ((c->x86_max_cores * smp_num_siblings) > 1) {
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- }
-
-#endif
-}
-
-#ifndef CONFIG_XEN
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
-{
- unsigned int eax, t;
-
- if (c->cpuid_level < 4)
- return 1;
-
- cpuid_count(4, 0, &eax, &t, &t, &t);
-
- if (eax & 0x1f)
- return ((eax >> 26) + 1);
- else
- return 1;
-}
-#endif
-
-static void __cpuinit srat_detect_node(void)
-{
-#ifdef CONFIG_NUMA
- unsigned node;
- int cpu = smp_processor_id();
- int apicid = hard_smp_processor_id();
-
- /* Don't do the funky fallback heuristics the AMD version employs
- for now. */
- node = apicid_to_node[apicid];
- if (node == NUMA_NO_NODE || !node_online(node))
- node = first_node(node_online_map);
- numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-}
-
-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
-{
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
-{
- /* Cache sizes */
- unsigned n;
-
- init_intel_cacheinfo(c);
- if (c->cpuid_level > 9) {
- unsigned eax = cpuid_eax(10);
- /* Check for version and the number of counters */
- if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
- set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
- }
-
- if (cpu_has_ds) {
- unsigned int l1, l2;
- rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
- if (!(l1 & (1<<11)))
- set_cpu_cap(c, X86_FEATURE_BTS);
- if (!(l1 & (1<<12)))
- set_cpu_cap(c, X86_FEATURE_PEBS);
- }
-
-
- if (cpu_has_bts)
- ds_init_intel(c);
-
- n = c->extended_cpuid_level;
- if (n >= 0x80000008) {
- unsigned eax = cpuid_eax(0x80000008);
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- /* CPUID workaround for Intel 0F34 CPU */
- if (c->x86_vendor == X86_VENDOR_INTEL &&
- c->x86 == 0xF && c->x86_model == 0x3 &&
- c->x86_mask == 0x4)
- c->x86_phys_bits = 36;
- }
-
- if (c->x86 == 15)
- c->x86_cache_alignment = c->x86_clflush_size * 2;
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-#ifndef CONFIG_XEN
- c->x86_max_cores = intel_num_cpu_cores(c);
-#endif
-
- srat_detect_node();
-}
-
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
-{
- if (c->x86 == 0x6 && c->x86_model >= 0xf)
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
-{
- /* Cache sizes */
- unsigned n;
-
- n = c->extended_cpuid_level;
- if (n >= 0x80000008) {
- unsigned eax = cpuid_eax(0x80000008);
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- }
-
- if (c->x86 == 0x6 && c->x86_model >= 0xf) {
- c->x86_cache_alignment = c->x86_clflush_size * 2;
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- }
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-}
-
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
-{
- char *v = c->x86_vendor_id;
-
- if (!strcmp(v, "AuthenticAMD"))
- c->x86_vendor = X86_VENDOR_AMD;
- else if (!strcmp(v, "GenuineIntel"))
- c->x86_vendor = X86_VENDOR_INTEL;
- else if (!strcmp(v, "CentaurHauls"))
- c->x86_vendor = X86_VENDOR_CENTAUR;
- else
- c->x86_vendor = X86_VENDOR_UNKNOWN;
-}
-
-/* Do some early cpuid on the boot CPU to get some parameter that are
- needed before check_bugs. Everything advanced is in identify_cpu
- below. */
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
-{
- u32 tfms, xlvl;
-
- c->loops_per_jiffy = loops_per_jiffy;
- c->x86_cache_size = -1;
- c->x86_vendor = X86_VENDOR_UNKNOWN;
- c->x86_model = c->x86_mask = 0; /* So far unknown... */
- c->x86_vendor_id[0] = '\0'; /* Unset */
- c->x86_model_id[0] = '\0'; /* Unset */
- c->x86_clflush_size = 64;
- c->x86_cache_alignment = c->x86_clflush_size;
-#ifndef CONFIG_XEN
- c->x86_max_cores = 1;
- c->x86_coreid_bits = 0;
-#endif
- c->extended_cpuid_level = 0;
- memset(&c->x86_capability, 0, sizeof c->x86_capability);
-
- /* Get vendor name */
- cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
- (unsigned int *)&c->x86_vendor_id[0],
- (unsigned int *)&c->x86_vendor_id[8],
- (unsigned int *)&c->x86_vendor_id[4]);
-
- get_cpu_vendor(c);
-
- /* Initialize the standard set of capabilities */
- /* Note that the vendor-specific code below might override */
-
- /* Intel-defined flags: level 0x00000001 */
- if (c->cpuid_level >= 0x00000001) {
- __u32 misc;
- cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
- &c->x86_capability[0]);
- c->x86 = (tfms >> 8) & 0xf;
- c->x86_model = (tfms >> 4) & 0xf;
- c->x86_mask = tfms & 0xf;
- if (c->x86 == 0xf)
- c->x86 += (tfms >> 20) & 0xff;
- if (c->x86 >= 0x6)
- c->x86_model += ((tfms >> 16) & 0xF) << 4;
- if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
- c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
- } else {
- /* Have CPUID level 0 only - unheard of */
- c->x86 = 4;
- }
-
-#ifndef CONFIG_XEN
- c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
-#ifdef CONFIG_SMP
- c->phys_proc_id = c->initial_apicid;
-#endif
-#endif
- /* AMD-defined flags: level 0x80000001 */
- xlvl = cpuid_eax(0x80000000);
- c->extended_cpuid_level = xlvl;
- if ((xlvl & 0xffff0000) == 0x80000000) {
- if (xlvl >= 0x80000001) {
- c->x86_capability[1] = cpuid_edx(0x80000001);
- c->x86_capability[6] = cpuid_ecx(0x80000001);
- }
- if (xlvl >= 0x80000004)
- get_model_name(c); /* Default name */
- }
-
- /* Transmeta-defined flags: level 0x80860001 */
- xlvl = cpuid_eax(0x80860000);
- if ((xlvl & 0xffff0000) == 0x80860000) {
- /* Don't set x86_cpuid_level here for now to not confuse. */
- if (xlvl >= 0x80860001)
- c->x86_capability[2] = cpuid_edx(0x80860001);
- }
-
- c->extended_cpuid_level = cpuid_eax(0x80000000);
- if (c->extended_cpuid_level >= 0x80000007)
- c->x86_power = cpuid_edx(0x80000007);
-
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- early_init_amd(c);
- break;
- case X86_VENDOR_INTEL:
- early_init_intel(c);
- break;
- case X86_VENDOR_CENTAUR:
- early_init_centaur(c);
- break;
- }
-
- validate_pat_support(c);
-}
-
-/*
- * This does the hard work of actually picking apart the CPU stuff...
- */
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
-{
- int i;
-
- early_identify_cpu(c);
-
- init_scattered_cpuid_features(c);
-
-#ifndef CONFIG_XEN
- c->apicid = phys_pkg_id(0);
-#endif
-
- /*
- * Vendor-specific initialization. In this section we
- * canonicalize the feature flags, meaning if there are
- * features a certain CPU supports which CPUID doesn't
- * tell us, CPUID claiming incorrect flags, or other bugs,
- * we handle them here.
- *
- * At the end of this section, c->x86_capability better
- * indicate the features this CPU genuinely supports!
- */
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- init_amd(c);
- break;
-
- case X86_VENDOR_INTEL:
- init_intel(c);
- break;
-
- case X86_VENDOR_CENTAUR:
- init_centaur(c);
- break;
-
- case X86_VENDOR_UNKNOWN:
- default:
- display_cacheinfo(c);
- break;
- }
-
- detect_ht(c);
-
- /*
- * On SMP, boot_cpu_data holds the common feature set between
- * all CPUs; so make sure that we indicate which features are
- * common between the CPUs. The first time this routine gets
- * executed, c == &boot_cpu_data.
- */
- if (c != &boot_cpu_data) {
- /* AND the already accumulated flags with these */
- for (i = 0; i < NCAPINTS; i++)
- boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
- }
-
- /* Clear all flags overriden by options */
- for (i = 0; i < NCAPINTS; i++)
- c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
-#ifdef CONFIG_X86_MCE
- mcheck_init(c);
-#endif
- select_idle_routine(c);
-
-#ifdef CONFIG_NUMA
- numa_add_cpu(smp_processor_id());
-#endif
-
-}
-
-void __cpuinit identify_boot_cpu(void)
-{
- identify_cpu(&boot_cpu_data);
-}
-
-void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
-{
- BUG_ON(c == &boot_cpu_data);
- identify_cpu(c);
- mtrr_ap_init();
-}
-
-static __init int setup_noclflush(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
- return 1;
-}
-__setup("noclflush", setup_noclflush);
-
-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
-{
- if (c->x86_model_id[0])
- printk(KERN_CONT "%s", c->x86_model_id);
-
- if (c->x86_mask || c->cpuid_level >= 0)
- printk(KERN_CONT " stepping %02x\n", c->x86_mask);
- else
- printk(KERN_CONT "\n");
-}
-
-static __init int setup_disablecpuid(char *arg)
-{
- int bit;
- if (get_option(&arg, &bit) && bit < NCAPINTS*32)
- setup_clear_cpu_cap(bit);
- else
- return 0;
- return 1;
-}
-__setup("clearcpuid=", setup_disablecpuid);
--- head-2011-03-11.orig/arch/x86/kernel/smp-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/smp-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -121,132 +121,14 @@ void xen_smp_send_reschedule(int cpu)
send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);
-
-struct call_data_struct {
- void (*func) (void *info);
- void *info;
- atomic_t started;
- atomic_t finished;
- int wait;
-};
-
-void lock_ipi_call_lock(void)
+void xen_send_call_func_single_ipi(int cpu)
{
- spin_lock_irq(&call_lock);
+ send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNC_SINGLE_VECTOR);
}
-void unlock_ipi_call_lock(void)
+void xen_send_call_func_ipi(cpumask_t mask)
{
- spin_unlock_irq(&call_lock);
-}
-
-static struct call_data_struct *call_data;
-
-static void __smp_call_function(void (*func) (void *info), void *info,
- int nonatomic, int wait)
-{
- struct call_data_struct data;
- int cpus = num_online_cpus() - 1;
-
- if (!cpus)
- return;
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- mb();
-
- /* Send a message to all other CPUs and wait for them to respond */
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- cpu_relax();
-
- if (wait)
- while (atomic_read(&data.finished) != cpus)
- cpu_relax();
-}
-
-
-/**
- * smp_call_function_mask(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on. Must not include the current cpu.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-int
-xen_smp_call_function_mask(cpumask_t mask,
- void (*func)(void *), void *info,
- int wait)
-{
- struct call_data_struct data;
- cpumask_t allbutself;
- int cpus;
-
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
- /* Holding any lock stops cpus from going down. */
- spin_lock(&call_lock);
-
- allbutself = cpu_online_map;
- cpu_clear(smp_processor_id(), allbutself);
-
- cpus_and(mask, mask, allbutself);
- cpus = cpus_weight(mask);
-
- if (!cpus) {
- spin_unlock(&call_lock);
- return 0;
- }
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- wmb();
-
- /* Send a message to other CPUs */
- if (cpus_equal(mask, allbutself) &&
- cpus_equal(cpu_online_map, cpu_callout_map))
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
- else
- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- cpu_relax();
-
- if (wait)
- while (atomic_read(&data.finished) != cpus)
- cpu_relax();
- spin_unlock(&call_lock);
-
- return 0;
+ send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
}
static void stop_this_cpu(void *dummy)
@@ -268,15 +150,10 @@ static void stop_this_cpu(void *dummy)
void xen_smp_send_stop(void)
{
- int nolock;
unsigned long flags;
- /* Don't deadlock on the call lock in panic */
- nolock = !spin_trylock(&call_lock);
+ smp_call_function(stop_this_cpu, NULL, 0);
local_irq_save(flags);
- __smp_call_function(stop_this_cpu, NULL, 0, 0);
- if (!nolock)
- spin_unlock(&call_lock);
disable_all_local_evtchn();
local_irq_restore(flags);
}
@@ -298,30 +175,24 @@ irqreturn_t smp_reschedule_interrupt(int
irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
{
- void (*func) (void *info) = call_data->func;
- void *info = call_data->info;
- int wait = call_data->wait;
-
- /*
- * Notify initiating CPU that I've grabbed the data and am
- * about to execute the function
- */
- mb();
- atomic_inc(&call_data->started);
- /*
- * At this point the info structure may be out of scope unless wait==1
- */
- (*func)(info);
+ generic_smp_call_function_interrupt();
#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
#else
add_pda(irq_call_count, 1);
#endif
- if (wait) {
- mb();
- atomic_inc(&call_data->finished);
- }
+ return IRQ_HANDLED;
+}
+
+irqreturn_t smp_call_function_single_interrupt(int irq, void *dev_id)
+{
+ generic_smp_call_function_single_interrupt();
+#ifdef CONFIG_X86_32
+ __get_cpu_var(irq_stat).irq_call_count++;
+#else
+ add_pda(irq_call_count, 1);
+#endif
return IRQ_HANDLED;
}
--- head-2011-03-11.orig/arch/x86/kernel/time-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/time-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -460,7 +460,7 @@ irqreturn_t timer_interrupt(int irq, voi
/* Keep nmi watchdog up to date */
#ifdef __i386__
- per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
+ x86_add_percpu(irq_stat.irq0_irqs, 1);
#else
add_pda(irq0_irqs, 1);
#endif
@@ -750,9 +750,7 @@ void __init time_init(void)
update_wallclock();
-#ifndef CONFIG_X86_64
use_tsc_delay();
-#endif
/* Cannot request_irq() until kmem is initialised. */
late_time_init = setup_cpu0_timer_irq;
@@ -809,7 +807,8 @@ static void stop_hz_timer(void)
/* Leave ourselves in tick mode if rcu or softirq or timer pending. */
if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
- (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
+ (j = get_next_timer_interrupt(jiffies),
+ time_before_eq(j, jiffies))) {
cpu_clear(cpu, nohz_cpu_mask);
j = jiffies + 1;
}
--- head-2011-03-11.orig/arch/x86/kernel/traps_32-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/traps_32-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -1,5 +1,6 @@
/*
* Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
@@ -57,11 +58,10 @@
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/io.h>
+#include <asm/traps.h>
#include "mach_traps.h"
-int panic_on_unrecovered_nmi;
-
#ifndef CONFIG_XEN
DECLARE_BITMAP(used_vectors, NR_VECTORS);
EXPORT_SYMBOL_GPL(used_vectors);
@@ -82,43 +82,22 @@ gate_desc idt_table[256]
__attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
#endif
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void alignment_check(void);
-#ifndef CONFIG_XEN
-asmlinkage void spurious_interrupt_bug(void);
-#else
-asmlinkage void fixup_4gb_segment(void);
-#endif
-asmlinkage void machine_check(void);
-
+int panic_on_unrecovered_nmi;
int kstack_depth_to_print = 24;
static unsigned int code_bytes = 64;
+static int ignore_nmis;
+static int die_counter;
void printk_address(unsigned long address, int reliable)
{
#ifdef CONFIG_KALLSYMS
- char namebuf[KSYM_NAME_LEN];
unsigned long offset = 0;
unsigned long symsize;
const char *symname;
- char reliab[4] = "";
- char *delim = ":";
char *modname;
+ char *delim = ":";
+ char namebuf[KSYM_NAME_LEN];
+ char reliab[4] = "";
symname = kallsyms_lookup(address, &symsize, &offset,
&modname, namebuf);
@@ -138,22 +117,23 @@ void printk_address(unsigned long addres
#endif
}
-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+ void *p, unsigned int size)
{
- return p > (void *)tinfo &&
- p <= (void *)tinfo + THREAD_SIZE - size;
+ void *t = tinfo;
+ return p > t && p <= t + THREAD_SIZE - size;
}
/* The form of the top of the frame on the stack */
struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
+ struct stack_frame *next_frame;
+ unsigned long return_address;
};
static inline unsigned long
print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data)
{
struct stack_frame *frame = (struct stack_frame *)bp;
@@ -175,8 +155,6 @@ print_context_stack(struct thread_info *
return bp;
}
-#define MSG(msg) ops->warning(data, msg)
-
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
@@ -186,7 +164,6 @@ void dump_trace(struct task_struct *task
if (!stack) {
unsigned long dummy;
-
stack = &dummy;
if (task != current)
stack = (unsigned long *)task->thread.sp;
@@ -204,7 +181,7 @@ void dump_trace(struct task_struct *task
}
#endif
- while (1) {
+ for (;;) {
struct thread_info *context;
context = (struct thread_info *)
@@ -256,15 +233,15 @@ static void print_trace_address(void *da
}
static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
};
static void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
+ unsigned long *stack, unsigned long bp, char *log_lvl)
{
dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
printk("%s =======================\n", log_lvl);
@@ -359,15 +336,14 @@ void show_registers(struct pt_regs *regs
printk(KERN_EMERG "Code: ");
ip = (u8 *)regs->ip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
+ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
/* try starting at EIP */
ip = (u8 *)regs->ip;
code_len = code_len - code_prologue + 1;
}
for (i = 0; i < code_len; i++, ip++) {
if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
+ probe_kernel_address(ip, c)) {
printk(" Bad EIP value.");
break;
}
@@ -392,7 +368,53 @@ int is_valid_bugaddr(unsigned long ip)
return ud2 == 0x0b0f;
}
-static int die_counter;
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+ unsigned long flags;
+
+ oops_enter();
+
+ if (die_owner != raw_smp_processor_id()) {
+ console_verbose();
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&die_lock);
+ die_owner = smp_processor_id();
+ die_nest_count = 0;
+ bust_spinlocks(1);
+ } else {
+ raw_local_irq_save(flags);
+ }
+ die_nest_count++;
+ return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+ bust_spinlocks(0);
+ die_owner = -1;
+ add_taint(TAINT_DIE);
+ __raw_spin_unlock(&die_lock);
+ raw_local_irq_restore(flags);
+
+ if (!regs)
+ return;
+
+ if (kexec_should_crash(current))
+ crash_kexec(regs);
+
+ if (in_interrupt())
+ panic("Fatal exception in interrupt");
+
+ if (panic_on_oops)
+ panic("Fatal exception");
+
+ oops_exit();
+ do_exit(signr);
+}
int __kprobes __die(const char *str, struct pt_regs *regs, long err)
{
@@ -410,26 +432,22 @@ int __kprobes __die(const char *str, str
printk("DEBUG_PAGEALLOC");
#endif
printk("\n");
-
if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ return 1;
- show_registers(regs);
- /* Executive summary in case the oops scrolled away */
- sp = (unsigned long) (&regs->sp);
- savesegment(ss, ss);
- if (user_mode(regs)) {
- sp = regs->sp;
- ss = regs->ss & 0xffff;
- }
- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
- print_symbol("%s", regs->ip);
- printk(" SS:ESP %04x:%08lx\n", ss, sp);
-
- return 0;
- }
-
- return 1;
+ show_registers(regs);
+ /* Executive summary in case the oops scrolled away */
+ sp = (unsigned long) (&regs->sp);
+ savesegment(ss, ss);
+ if (user_mode(regs)) {
+ sp = regs->sp;
+ ss = regs->ss & 0xffff;
+ }
+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+ print_symbol("%s", regs->ip);
+ printk(" SS:ESP %04x:%08lx\n", ss, sp);
+ return 0;
}
/*
@@ -438,31 +456,9 @@ int __kprobes __die(const char *str, str
*/
void die(const char *str, struct pt_regs *regs, long err)
{
- static struct {
- raw_spinlock_t lock;
- u32 lock_owner;
- int lock_owner_depth;
- } die = {
- .lock = __RAW_SPIN_LOCK_UNLOCKED,
- .lock_owner = -1,
- .lock_owner_depth = 0
- };
- unsigned long flags;
-
- oops_enter();
+ unsigned long flags = oops_begin();
- if (die.lock_owner != raw_smp_processor_id()) {
- console_verbose();
- raw_local_irq_save(flags);
- __raw_spin_lock(&die.lock);
- die.lock_owner = smp_processor_id();
- die.lock_owner_depth = 0;
- bust_spinlocks(1);
- } else {
- raw_local_irq_save(flags);
- }
-
- if (++die.lock_owner_depth < 3) {
+ if (die_nest_count < 3) {
report_bug(regs->ip, regs);
if (__die(str, regs, err))
@@ -471,26 +467,7 @@ void die(const char *str, struct pt_regs
printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
}
- bust_spinlocks(0);
- die.lock_owner = -1;
- add_taint(TAINT_DIE);
- __raw_spin_unlock(&die.lock);
- raw_local_irq_restore(flags);
-
- if (!regs)
- return;
-
- if (kexec_should_crash(current))
- crash_kexec(regs);
-
- if (in_interrupt())
- panic("Fatal exception in interrupt");
-
- if (panic_on_oops)
- panic("Fatal exception");
-
- oops_exit();
- do_exit(SIGSEGV);
+ oops_end(flags, regs, SIGSEGV);
}
static inline void
@@ -554,7 +531,7 @@ void do_##name(struct pt_regs *regs, lon
{ \
trace_hardirqs_fixup(); \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ == NOTIFY_STOP) \
return; \
do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
}
@@ -570,7 +547,7 @@ void do_##name(struct pt_regs *regs, lon
info.si_code = sicode; \
info.si_addr = (void __user *)siaddr; \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ == NOTIFY_STOP) \
return; \
do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
}
@@ -579,7 +556,7 @@ void do_##name(struct pt_regs *regs, lon
void do_##name(struct pt_regs *regs, long error_code) \
{ \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ == NOTIFY_STOP) \
return; \
do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
}
@@ -594,28 +571,29 @@ void do_##name(struct pt_regs *regs, lon
info.si_addr = (void __user *)siaddr; \
trace_hardirqs_fixup(); \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ == NOTIFY_STOP) \
return; \
do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
}
-DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
#ifndef CONFIG_KPROBES
DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
#endif
DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
-DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
-DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
-void __kprobes do_general_protection(struct pt_regs * regs,
- long error_code)
+void __kprobes
+do_general_protection(struct pt_regs *regs, long error_code)
{
+ struct task_struct *tsk;
struct thread_struct *thread;
thread = &current->thread;
@@ -623,23 +601,24 @@ void __kprobes do_general_protection(str
if (regs->flags & X86_VM_MASK)
goto gp_in_vm86;
+ tsk = current;
if (!user_mode(regs))
goto gp_in_kernel;
- current->thread.error_code = error_code;
- current->thread.trap_no = 13;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 13;
- if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
- printk_ratelimit()) {
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
printk(KERN_INFO
- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
- current->comm, task_pid_nr(current),
- regs->ip, regs->sp, error_code);
+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ tsk->comm, task_pid_nr(tsk),
+ regs->ip, regs->sp, error_code);
print_vma_addr(" in ", regs->ip);
printk("\n");
}
- force_sig(SIGSEGV, current);
+ force_sig(SIGSEGV, tsk);
return;
gp_in_vm86:
@@ -648,14 +627,15 @@ gp_in_vm86:
return;
gp_in_kernel:
- if (!fixup_exception(regs)) {
- current->thread.error_code = error_code;
- current->thread.trap_no = 13;
- if (notify_die(DIE_GPF, "general protection fault", regs,
+ if (fixup_exception(regs))
+ return;
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 13;
+ if (notify_die(DIE_GPF, "general protection fault", regs,
error_code, 13, SIGSEGV) == NOTIFY_STOP)
- return;
- die("general protection fault", regs, error_code);
- }
+ return;
+ die("general protection fault", regs, error_code);
}
static notrace __kprobes void
@@ -722,9 +702,9 @@ unknown_nmi_error(unsigned char reason,
static DEFINE_SPINLOCK(nmi_print_lock);
-void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
{
- if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
return;
spin_lock(&nmi_print_lock);
@@ -733,10 +713,12 @@ void notrace __kprobes die_nmi(struct pt
* to get a message out:
*/
bust_spinlocks(1);
- printk(KERN_EMERG "%s", msg);
+ printk(KERN_EMERG "%s", str);
printk(" on CPU%d, ip %08lx, registers:\n",
smp_processor_id(), regs->ip);
show_registers(regs);
+ if (do_panic)
+ panic("Non maskable interrupt");
console_silent();
spin_unlock(&nmi_print_lock);
bust_spinlocks(0);
@@ -756,14 +738,17 @@ void notrace __kprobes die_nmi(struct pt
static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
+ int cpu;
- /* Only the BSP gets external NMIs from the system: */
- if (!smp_processor_id())
+ cpu = smp_processor_id();
+
+ /* Only the BSP gets external NMIs from the system. */
+ if (!cpu)
reason = get_nmi_reason();
if (!(reason & 0xc0)) {
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
+ == NOTIFY_STOP)
return;
#ifdef CONFIG_X86_LOCAL_APIC
/*
@@ -772,7 +757,7 @@ static notrace __kprobes void default_do
*/
if (nmi_watchdog_tick(regs, reason))
return;
- if (!do_nmi_callback(regs, smp_processor_id()))
+ if (!do_nmi_callback(regs, cpu))
unknown_nmi_error(reason, regs);
#else
unknown_nmi_error(reason, regs);
@@ -782,6 +767,8 @@ static notrace __kprobes void default_do
}
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
return;
+
+ /* AK: following checks seem to be broken on modern chipsets. FIXME */
if (reason & 0x80)
mem_parity_error(reason, regs);
if (reason & 0x40)
@@ -793,8 +780,6 @@ static notrace __kprobes void default_do
reassert_nmi();
}
-static int ignore_nmis;
-
notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
{
int cpu;
@@ -879,7 +864,7 @@ void __kprobes do_debug(struct pt_regs *
tsk->thread.debugctlmsr = 0;
if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
- SIGTRAP) == NOTIFY_STOP)
+ SIGTRAP) == NOTIFY_STOP)
return;
/* It's safe to allow irq's after DR6 has been saved */
if (regs->flags & X86_EFLAGS_IF)
@@ -940,9 +925,8 @@ clear_TF_reenable:
void math_error(void __user *ip)
{
struct task_struct *task;
- unsigned short cwd;
- unsigned short swd;
siginfo_t info;
+ unsigned short cwd, swd;
/*
* Save the info for the exception handler and clear the error.
@@ -961,7 +945,7 @@ void math_error(void __user *ip)
* C1 reg you need in case of a stack fault, 0x040 is the stack
* fault bit. We should only be taking one exception at a time,
* so if this combination doesn't produce any single exception,
- * then we have a bad program that isn't syncronizing its FPU usage
+ * then we have a bad program that isn't synchronizing its FPU usage
* and it will suffer the consequences since we won't be able to
* fully reproduce the context of the exception
*/
@@ -970,7 +954,7 @@ void math_error(void __user *ip)
switch (swd & ~cwd & 0x3f) {
case 0x000: /* No unmasked exception */
return;
- default: /* Multiple exceptions */
+ default: /* Multiple exceptions */
break;
case 0x001: /* Invalid Op */
/*
@@ -1006,8 +990,8 @@ void do_coprocessor_error(struct pt_regs
static void simd_math_error(void __user *ip)
{
struct task_struct *task;
- unsigned short mxcsr;
siginfo_t info;
+ unsigned short mxcsr;
/*
* Save the info for the exception handler and clear the error.
@@ -1084,7 +1068,7 @@ void do_spurious_interrupt_bug(struct pt
unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
{
- struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
+ struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
unsigned long base = (kesp - uesp) & -THREAD_SIZE;
unsigned long new_kesp = kesp - base;
unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
--- head-2011-03-11.orig/arch/x86/kernel/traps_64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/traps_64-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -10,73 +10,56 @@
* 'Traps.c' handles hardware traps and faults after we have saved some
* state in 'entry.S'.
*/
-#include <linux/sched.h>
+#include <linux/moduleparam.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/spinlock.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/kdebug.h>
#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
#include <linux/string.h>
+#include <linux/unwind.h>
+#include <linux/delay.h>
#include <linux/errno.h>
-#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
#include <linux/timer.h>
-#include <linux/mm.h>
#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/nmi.h>
-#include <linux/kprobes.h>
-#include <linux/kexec.h>
-#include <linux/unwind.h>
-#include <linux/uaccess.h>
#include <linux/bug.h>
-#include <linux/kdebug.h>
-#include <linux/utsname.h>
-
-#include <mach_traps.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/atomic.h>
+#include <asm/stacktrace.h>
+#include <asm/processor.h>
#include <asm/debugreg.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/unwind.h>
#include <asm/desc.h>
#include <asm/i387.h>
-#include <asm/processor.h>
-#include <asm/unwind.h>
+#include <asm/nmi.h>
#include <asm/smp.h>
+#include <asm/io.h>
#include <asm/pgalloc.h>
-#include <asm/pda.h>
#include <asm/proto.h>
-#include <asm/nmi.h>
-#include <asm/stacktrace.h>
+#include <asm/pda.h>
+#include <asm/traps.h>
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void double_fault(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void reserved(void);
-asmlinkage void alignment_check(void);
-asmlinkage void machine_check(void);
-asmlinkage void spurious_interrupt_bug(void);
+#include <mach_traps.h>
+int panic_on_unrecovered_nmi;
+int kstack_depth_to_print = 12;
static unsigned int code_bytes = 64;
+static int ignore_nmis;
+static int die_counter;
static inline void conditional_sti(struct pt_regs *regs)
{
@@ -100,34 +83,9 @@ static inline void preempt_conditional_c
dec_preempt_count();
}
-int kstack_depth_to_print = 12;
-
void printk_address(unsigned long address, int reliable)
{
-#ifdef CONFIG_KALLSYMS
- unsigned long offset = 0, symsize;
- const char *symname;
- char *modname;
- char *delim = ":";
- char namebuf[KSYM_NAME_LEN];
- char reliab[4] = "";
-
- symname = kallsyms_lookup(address, &symsize, &offset,
- &modname, namebuf);
- if (!symname) {
- printk(" [<%016lx>]\n", address);
- return;
- }
- if (!reliable)
- strcpy(reliab, "? ");
-
- if (!modname)
- modname = delim = "";
- printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
- address, reliab, delim, modname, delim, symname, offset, symsize);
-#else
- printk(" [<%016lx>]\n", address);
-#endif
+ printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
}
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -206,8 +164,6 @@ static unsigned long *in_exception_stack
return NULL;
}
-#define MSG(txt) ops->warning(data, txt)
-
/*
* x86-64 can have up to three kernel stacks:
* process stack
@@ -234,11 +190,11 @@ struct stack_frame {
unsigned long return_address;
};
-
-static inline unsigned long print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end)
+static inline unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end)
{
struct stack_frame *frame = (struct stack_frame *)bp;
@@ -260,7 +216,7 @@ static inline unsigned long print_contex
return bp;
}
-void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
{
@@ -269,36 +225,34 @@ void dump_trace(struct task_struct *tsk,
unsigned used = 0;
struct thread_info *tinfo;
- if (!tsk)
- tsk = current;
- tinfo = task_thread_info(tsk);
+ if (!task)
+ task = current;
if (!stack) {
unsigned long dummy;
stack = &dummy;
- if (tsk && tsk != current)
- stack = (unsigned long *)tsk->thread.sp;
+ if (task && task != current)
+ stack = (unsigned long *)task->thread.sp;
}
#ifdef CONFIG_FRAME_POINTER
if (!bp) {
- if (tsk == current) {
+ if (task == current) {
/* Grab bp right from our regs */
- asm("movq %%rbp, %0" : "=r" (bp):);
+ asm("movq %%rbp, %0" : "=r" (bp) :);
} else {
/* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) tsk->thread.sp;
+ bp = *(unsigned long *) task->thread.sp;
}
}
#endif
-
-
/*
* Print function call entries in all stacks, starting at the
* current stack address. If the stacks consist of nested
* exceptions
*/
+ tinfo = task_thread_info(task);
for (;;) {
char *id;
unsigned long *estack_end;
@@ -383,18 +337,24 @@ static const struct stacktrace_ops print
.address = print_trace_address,
};
-void
-show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
- unsigned long bp)
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl)
{
printk("\nCall Trace:\n");
- dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
printk("\n");
}
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp)
+{
+ show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
static void
-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
- unsigned long bp)
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp, char *log_lvl)
{
unsigned long *stack;
int i;
@@ -406,14 +366,14 @@ _show_stack(struct task_struct *tsk, str
// back trace for this cpu.
if (sp == NULL) {
- if (tsk)
- sp = (unsigned long *)tsk->thread.sp;
+ if (task)
+ sp = (unsigned long *)task->thread.sp;
else
sp = (unsigned long *)&sp;
}
stack = sp;
- for(i=0; i < kstack_depth_to_print; i++) {
+ for (i = 0; i < kstack_depth_to_print; i++) {
if (stack >= irqstack && stack <= irqstack_end) {
if (stack == irqstack_end) {
stack = (unsigned long *) (irqstack_end[-1]);
@@ -428,12 +388,12 @@ _show_stack(struct task_struct *tsk, str
printk(" %016lx", *stack++);
touch_nmi_watchdog();
}
- show_trace(tsk, regs, sp, bp);
+ show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
-void show_stack(struct task_struct *tsk, unsigned long * sp)
+void show_stack(struct task_struct *task, unsigned long *sp)
{
- _show_stack(tsk, NULL, sp, 0);
+ show_stack_log_lvl(task, NULL, sp, 0, "");
}
/*
@@ -441,8 +401,8 @@ void show_stack(struct task_struct *tsk,
*/
void dump_stack(void)
{
- unsigned long dummy;
unsigned long bp = 0;
+ unsigned long stack;
#ifdef CONFIG_FRAME_POINTER
if (!bp)
@@ -454,7 +414,7 @@ void dump_stack(void)
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- show_trace(NULL, NULL, &dummy, bp);
+ show_trace(NULL, NULL, &stack, bp);
}
EXPORT_SYMBOL(dump_stack);
@@ -465,12 +425,8 @@ void show_registers(struct pt_regs *regs
unsigned long sp;
const int cpu = smp_processor_id();
struct task_struct *cur = cpu_pda(cpu)->pcurrent;
- u8 *ip;
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
sp = regs->sp;
- ip = (u8 *) regs->ip - code_prologue;
printk("CPU %d ", cpu);
__show_regs(regs);
printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
@@ -481,15 +437,22 @@ void show_registers(struct pt_regs *regs
* time of the fault..
*/
if (!user_mode(regs)) {
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
unsigned char c;
+ u8 *ip;
+
printk("Stack: ");
- _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
+ show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
+ regs->bp, "");
printk("\n");
printk(KERN_EMERG "Code: ");
+
+ ip = (u8 *)regs->ip - code_prologue;
if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
/* try starting at RIP */
- ip = (u8 *) regs->ip;
+ ip = (u8 *)regs->ip;
code_len = code_len - code_prologue + 1;
}
for (i = 0; i < code_len; i++, ip++) {
@@ -505,7 +468,7 @@ void show_registers(struct pt_regs *regs
}
}
printk("\n");
-}
+}
int is_valid_bugaddr(unsigned long ip)
{
@@ -545,7 +508,7 @@ unsigned __kprobes long oops_begin(void)
}
void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
+{
die_owner = -1;
bust_spinlocks(0);
die_nest_count--;
@@ -563,10 +526,9 @@ void __kprobes oops_end(unsigned long fl
do_exit(signr);
}
-int __kprobes __die(const char * str, struct pt_regs * regs, long err)
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
{
- static int die_counter;
- printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
+ printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
#ifdef CONFIG_PREEMPT
printk("PREEMPT ");
#endif
@@ -577,8 +539,10 @@ int __kprobes __die(const char * str, st
printk("DEBUG_PAGEALLOC");
#endif
printk("\n");
- if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ if (notify_die(DIE_OOPS, str, regs, err,
+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
return 1;
+
show_registers(regs);
add_taint(TAINT_DIE);
/* Executive summary in case the oops scrolled away */
@@ -590,7 +554,7 @@ int __kprobes __die(const char * str, st
return 0;
}
-void die(const char * str, struct pt_regs * regs, long err)
+void die(const char *str, struct pt_regs *regs, long err)
{
unsigned long flags = oops_begin();
@@ -608,8 +572,7 @@ die_nmi(char *str, struct pt_regs *regs,
{
unsigned long flags;
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
- NOTIFY_STOP)
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
return;
flags = oops_begin();
@@ -617,7 +580,9 @@ die_nmi(char *str, struct pt_regs *regs,
* We are in trouble anyway, lets at least try
* to get a message out.
*/
- printk(str, smp_processor_id());
+ printk(KERN_EMERG "%s", str);
+ printk(" on CPU%d, ip %08lx, registers:\n",
+ smp_processor_id(), regs->ip);
show_registers(regs);
if (kexec_should_crash(current))
crash_kexec(regs);
@@ -630,44 +595,44 @@ die_nmi(char *str, struct pt_regs *regs,
}
#endif
-static void __kprobes do_trap(int trapnr, int signr, char *str,
- struct pt_regs * regs, long error_code,
- siginfo_t *info)
+static void __kprobes
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+ long error_code, siginfo_t *info)
{
struct task_struct *tsk = current;
- if (user_mode(regs)) {
- /*
- * We want error_code and trap_no set for userspace
- * faults and kernelspace faults which result in
- * die(), but not kernelspace faults which are fixed
- * up. die() gives the process no chance to handle
- * the signal and notice the kernel fault information,
- * so that won't result in polluting the information
- * about previously queued, but not yet delivered,
- * faults. See also do_general_protection below.
- */
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
+ if (!user_mode(regs))
+ goto kernel_trap;
- if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
- printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid, str,
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
- }
+ /*
+ * We want error_code and trap_no set for userspace faults and
+ * kernelspace faults which result in die(), but not
+ * kernelspace faults which are fixed up. die() gives the
+ * process no chance to handle the signal and notice the
+ * kernel fault information, so that won't result in polluting
+ * the information about previously queued, but not yet
+ * delivered, faults. See also do_general_protection below.
+ */
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
- if (info)
- force_sig_info(signr, info, tsk);
- else
- force_sig(signr, tsk);
- return;
+ if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+ printk_ratelimit()) {
+ printk(KERN_INFO
+ "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid, str,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
}
+ if (info)
+ force_sig_info(signr, info, tsk);
+ else
+ force_sig(signr, tsk);
+ return;
+kernel_trap:
if (!fixup_exception(regs)) {
tsk->thread.error_code = error_code;
tsk->thread.trap_no = trapnr;
@@ -677,41 +642,39 @@ static void __kprobes do_trap(int trapnr
}
#define DO_ERROR(trapnr, signr, str, name) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, NULL); \
+ do_trap(trapnr, signr, str, regs, error_code, NULL); \
}
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- siginfo_t info; \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- trace_hardirqs_fixup(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+ siginfo_t info; \
+ info.si_signo = signr; \
+ info.si_errno = 0; \
+ info.si_code = sicode; \
+ info.si_addr = (void __user *)siaddr; \
+ trace_hardirqs_fixup(); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, &info); \
+ do_trap(trapnr, signr, str, regs, error_code, &info); \
}
-DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-DO_ERROR( 4, SIGSEGV, "overflow", overflow)
-DO_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
-DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_ERROR(4, SIGSEGV, "overflow", overflow)
+DO_ERROR(5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
-DO_ERROR(18, SIGSEGV, "reserved", reserved)
/* Runs on IST stack */
asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
@@ -741,31 +704,34 @@ asmlinkage void do_double_fault(struct p
die(str, regs, error_code);
}
-asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
- long error_code)
+asmlinkage void __kprobes
+do_general_protection(struct pt_regs *regs, long error_code)
{
- struct task_struct *tsk = current;
+ struct task_struct *tsk;
conditional_sti(regs);
- if (user_mode(regs)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
+ tsk = current;
+ if (!user_mode(regs))
+ goto gp_in_kernel;
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid,
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
- }
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 13;
- force_sig(SIGSEGV, tsk);
- return;
- }
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ printk(KERN_INFO
+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
+ }
+ force_sig(SIGSEGV, tsk);
+ return;
+
+gp_in_kernel:
if (fixup_exception(regs))
return;
@@ -778,14 +744,14 @@ asmlinkage void __kprobes do_general_pro
}
static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs * regs)
+mem_parity_error(unsigned char reason, struct pt_regs *regs)
{
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
reason);
printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
#if defined(CONFIG_EDAC)
- if(edac_handler_set()) {
+ if (edac_handler_set()) {
edac_atomic_assert_error();
return;
}
@@ -801,7 +767,7 @@ mem_parity_error(unsigned char reason, s
}
static notrace __kprobes void
-io_check_error(unsigned char reason, struct pt_regs * regs)
+io_check_error(unsigned char reason, struct pt_regs *regs)
{
printk("NMI: IOCK error (debug interrupt?)\n");
show_registers(regs);
@@ -827,14 +793,14 @@ unknown_nmi_error(unsigned char reason,
/* Runs on IST stack. This code must keep interrupts off all the time.
Nested NMIs are prevented by the CPU. */
-asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int cpu;
cpu = smp_processor_id();
- /* Only the BSP gets external NMIs from the system. */
+ /* Only the BSP gets external NMIs from the system. */
if (!cpu)
reason = get_nmi_reason();
@@ -847,33 +813,58 @@ asmlinkage notrace __kprobes void defau
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
*/
- if (nmi_watchdog_tick(regs,reason))
+ if (nmi_watchdog_tick(regs, reason))
return;
#endif
- if (!do_nmi_callback(regs,cpu))
+ if (!do_nmi_callback(regs, cpu))
unknown_nmi_error(reason, regs);
return;
}
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
- return;
+ return;
/* AK: following checks seem to be broken on modern chipsets. FIXME */
-
if (reason & 0x80)
mem_parity_error(reason, regs);
if (reason & 0x40)
io_check_error(reason, regs);
}
+asmlinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
+{
+ nmi_enter();
+
+ add_pda(__nmi_count, 1);
+
+ if (!ignore_nmis)
+ default_do_nmi(regs);
+
+ nmi_exit();
+}
+
+void stop_nmi(void)
+{
+ acpi_nmi_disable();
+ ignore_nmis++;
+}
+
+void restart_nmi(void)
+{
+ ignore_nmis--;
+ acpi_nmi_enable();
+}
+
/* runs on IST stack. */
asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
{
trace_hardirqs_fixup();
- if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
+ == NOTIFY_STOP)
return;
- }
+
preempt_conditional_sti(regs);
do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
preempt_conditional_cli(regs);
@@ -904,8 +895,8 @@ asmlinkage __kprobes struct pt_regs *syn
asmlinkage void __kprobes do_debug(struct pt_regs * regs,
unsigned long error_code)
{
- unsigned long condition;
struct task_struct *tsk = current;
+ unsigned long condition;
siginfo_t info;
trace_hardirqs_fixup();
@@ -926,21 +917,19 @@ asmlinkage void __kprobes do_debug(struc
/* Mask out spurious debug traps due to lazy DR7 setting */
if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
- if (!tsk->thread.debugreg7) {
+ if (!tsk->thread.debugreg7)
goto clear_dr7;
- }
}
tsk->thread.debugreg6 = condition;
-
/*
* Single-stepping through TF: make sure we ignore any events in
* kernel space (but re-enable TF when returning to user mode).
*/
if (condition & DR_STEP) {
- if (!user_mode(regs))
- goto clear_TF_reenable;
+ if (!user_mode(regs))
+ goto clear_TF_reenable;
}
/* Ok, finally something we can handle */
@@ -953,7 +942,7 @@ asmlinkage void __kprobes do_debug(struc
force_sig_info(SIGTRAP, &info, tsk);
clear_dr7:
- set_debugreg(0UL, 7);
+ set_debugreg(0, 7);
preempt_conditional_cli(regs);
return;
@@ -961,6 +950,7 @@ clear_TF_reenable:
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF;
preempt_conditional_cli(regs);
+ return;
}
static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
@@ -983,7 +973,7 @@ static int kernel_math_error(struct pt_r
asmlinkage void do_coprocessor_error(struct pt_regs *regs)
{
void __user *ip = (void __user *)(regs->ip);
- struct task_struct * task;
+ struct task_struct *task;
siginfo_t info;
unsigned short cwd, swd;
@@ -1016,30 +1006,30 @@ asmlinkage void do_coprocessor_error(str
cwd = get_fpu_cwd(task);
swd = get_fpu_swd(task);
switch (swd & ~cwd & 0x3f) {
- case 0x000:
- default:
- break;
- case 0x001: /* Invalid Op */
- /*
- * swd & 0x240 == 0x040: Stack Underflow
- * swd & 0x240 == 0x240: Stack Overflow
- * User must clear the SF bit (0x40) if set
- */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
+ case 0x000: /* No unmasked exception */
+ default: /* Multiple exceptions */
+ break;
+ case 0x001: /* Invalid Op */
+ /*
+ * swd & 0x240 == 0x040: Stack Underflow
+ * swd & 0x240 == 0x240: Stack Overflow
+ * User must clear the SF bit (0x40) if set
+ */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
}
force_sig_info(SIGFPE, &info, task);
}
@@ -1052,7 +1042,7 @@ asmlinkage void bad_intr(void)
asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
{
void __user *ip = (void __user *)(regs->ip);
- struct task_struct * task;
+ struct task_struct *task;
siginfo_t info;
unsigned short mxcsr;
@@ -1080,25 +1070,25 @@ asmlinkage void do_simd_coprocessor_erro
*/
mxcsr = get_fpu_mxcsr(task);
switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
- case 0x000:
- default:
- break;
- case 0x001: /* Invalid Op */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
+ case 0x000:
+ default:
+ break;
+ case 0x001: /* Invalid Op */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
}
force_sig_info(SIGFPE, &info, task);
}
@@ -1118,7 +1108,7 @@ asmlinkage void __attribute__((weak)) mc
}
/*
- * 'math_state_restore()' saves the current math information in the
+ * 'math_state_restore()' saves the current math information in the
* old math state array, and gets the new ones from the current task
*
* Careful.. There are problems with IBM-designed IRQ13 behaviour.
@@ -1145,7 +1135,14 @@ asmlinkage void math_state_restore(void)
/* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
- restore_fpu_checking(&me->thread.xstate->fxsave);
+ /*
+ * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+ */
+ if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
+ stts();
+ force_sig(SIGSEGV, me);
+ return;
+ }
task_thread_info(me)->status |= TS_USEDFPU;
me->fpu_counter++;
}
@@ -1190,13 +1187,12 @@ void __init trap_init(void)
ret = HYPERVISOR_set_trap_table(trap_table);
if (ret)
printk("HYPERVISOR_set_trap_table failed: error %d\n", ret);
-
/*
* initialize the per thread extended state:
*/
- init_thread_xstate();
+ init_thread_xstate();
/*
- * Should be a barrier for any external CPU state.
+ * Should be a barrier for any external CPU state:
*/
cpu_init();
}
@@ -1212,27 +1208,25 @@ void __cpuinit smp_trap_init(trap_info_t
}
}
-
static int __init oops_setup(char *s)
-{
+{
if (!s)
return -EINVAL;
if (!strcmp(s, "panic"))
panic_on_oops = 1;
return 0;
-}
+}
early_param("oops", oops_setup);
static int __init kstack_setup(char *s)
{
if (!s)
return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s,NULL,0);
+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
return 0;
}
early_param("kstack", kstack_setup);
-
static int __init code_bytes_setup(char *s)
{
code_bytes = simple_strtoul(s, NULL, 0);
--- head-2011-03-11.orig/arch/x86/kernel/vsyscall_64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/kernel/vsyscall_64-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -42,7 +42,8 @@
#include <asm/topology.h>
#include <asm/vgtod.h>
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __vsyscall(nr) \
+ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
#define __syscall_clobber "r11","cx","memory"
/*
@@ -264,10 +265,7 @@ static void __cpuinit vsyscall_set_cpu(i
d |= cpu;
d |= (node & 0xf) << 12;
d |= (node >> 4) << 48;
- if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu)
- + GDT_ENTRY_PER_CPU),
- d))
- BUG();
+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
static void __cpuinit cpu_vsyscall_init(void *arg)
@@ -281,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_bl
{
long cpu = (long)arg;
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
- smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
+ smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
return NOTIFY_DONE;
}
@@ -311,7 +309,7 @@ static int __init vsyscall_init(void)
#ifdef CONFIG_SYSCTL
register_sysctl_table(kernel_root_table2);
#endif
- on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
+ on_each_cpu(cpu_vsyscall_init, NULL, 1);
hotcpu_notifier(cpu_vsyscall_notifier, 0);
return 0;
}
--- head-2011-03-11.orig/arch/x86/mach-xen/setup.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mach-xen/setup.c 2011-02-03 14:23:14.000000000 +0100
@@ -17,6 +17,8 @@
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
+#ifdef CONFIG_X86_32
+
#ifdef CONFIG_HOTPLUG_CPU
#define DEFAULT_SEND_IPI (1)
#else
@@ -44,47 +46,6 @@ static int __init print_ipi_mode(void)
late_initcall(print_ipi_mode);
-/**
- * machine_specific_memory_setup - Hook for machine specific memory setup.
- *
- * Description:
- * This is included late in kernel/setup.c so that it can make
- * use of all of the static functions.
- **/
-
-char * __init machine_specific_memory_setup(void)
-{
- int rc;
- struct xen_memory_map memmap;
- static struct e820entry __initdata map[E820MAX];
-
- memmap.nr_entries = E820MAX;
- set_xen_guest_handle(memmap.buffer, map);
-
- rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
- if ( rc == -ENOSYS ) {
- memmap.nr_entries = 1;
- map[0].addr = 0ULL;
- map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
- /* 8MB slack (to balance backend allocations). */
- map[0].size += 8ULL << 20;
- map[0].type = E820_RAM;
- rc = 0;
- }
- BUG_ON(rc);
-
- sanitize_e820_map(map, (char *)&memmap.nr_entries);
-
- BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
-
- return "Xen";
-}
-
-
-extern void hypervisor_callback(void);
-extern void failsafe_callback(void);
-extern void nmi(void);
-
unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
EXPORT_SYMBOL(machine_to_phys_mapping);
unsigned int machine_to_phys_order;
@@ -117,30 +78,60 @@ void __init pre_setup_arch_hook(void)
(unsigned long *)xen_start_info->mfn_list;
}
+#endif /* CONFIG_X86_32 */
+
+extern void hypervisor_callback(void);
+extern void failsafe_callback(void);
+extern void nmi(void);
+
+#ifdef CONFIG_X86_64
+#include <asm/proto.h>
+#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
+#else
+#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
+#endif
+
void __init machine_specific_arch_setup(void)
{
int ret;
static struct callback_register __initdata event = {
.type = CALLBACKTYPE_event,
- .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
+ .address = CALLBACK_ADDR(hypervisor_callback)
};
static struct callback_register __initdata failsafe = {
.type = CALLBACKTYPE_failsafe,
- .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
+ .address = CALLBACK_ADDR(failsafe_callback)
};
+#ifdef CONFIG_X86_64
+ static struct callback_register __initdata syscall = {
+ .type = CALLBACKTYPE_syscall,
+ .address = CALLBACK_ADDR(system_call)
+ };
+#endif
static struct callback_register __initdata nmi_cb = {
.type = CALLBACKTYPE_nmi,
- .address = { __KERNEL_CS, (unsigned long)nmi },
+ .address = CALLBACK_ADDR(nmi)
};
ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
if (ret == 0)
ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
+#ifdef CONFIG_X86_64
+ if (ret == 0)
+ ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
+#endif
#if CONFIG_XEN_COMPAT <= 0x030002
+#ifdef CONFIG_X86_32
if (ret == -ENOSYS)
ret = HYPERVISOR_set_callbacks(
event.address.cs, event.address.eip,
failsafe.address.cs, failsafe.address.eip);
+#else
+ ret = HYPERVISOR_set_callbacks(
+ event.address,
+ failsafe.address,
+ syscall.address);
+#endif
#endif
BUG_ON(ret);
@@ -155,14 +146,41 @@ void __init machine_specific_arch_setup(
}
#endif
+#ifdef CONFIG_X86_32
/* Do an early initialization of the fixmap area */
{
extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr);
pmd_t *pmd = pmd_offset(pud, addr);
+ unsigned int i;
make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
+
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
+ != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
+ FIX_BUG_ON(SHARED_INFO);
+ FIX_BUG_ON(ISAMAP_BEGIN);
+ FIX_BUG_ON(ISAMAP_END);
+#undef __FIXADDR_TOP
+ BUG_ON(pte_index(hypervisor_virt_start));
+
+ /* Switch to the real shared_info page, and clear the
+ * dummy page. */
+ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+ memset(empty_zero_page, 0, sizeof(empty_zero_page));
+
+ /* Setup mapping of lower 1st MB */
+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
+ if (is_initial_xendomain())
+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
+ else
+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
+ virt_to_machine(empty_zero_page),
+ PAGE_KERNEL_RO);
}
+#endif
}
--- head-2011-03-11.orig/arch/x86/mm/Makefile 2011-01-31 14:53:50.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/Makefile 2011-02-01 14:38:38.000000000 +0100
@@ -27,6 +27,7 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology_6
obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
obj-$(CONFIG_XEN) += hypervisor.o
+disabled-obj-$(CONFIG_XEN) := gup.o
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
--- head-2011-03-11.orig/arch/x86/mm/dump_pagetables-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/dump_pagetables-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -45,7 +45,7 @@ static struct addr_marker address_marker
{ 0, "User Space" },
#ifdef CONFIG_X86_64
{ HYPERVISOR_VIRT_START, "Hypervisor Space" },
- { HYPERVISOR_VIRT_END, "Low Kernel Mapping" },
+ { PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
{ __START_KERNEL_map, "High Kernel Mapping" },
@@ -160,8 +160,8 @@ static void note_page(struct seq_file *m
* we have now. "break" is either changing perms, levels or
* address space marker.
*/
- prot = pgprot_val(new_prot) & ~(PTE_MASK);
- cur = pgprot_val(st->current_prot) & ~(PTE_MASK);
+ prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
+ cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
if (!st->level) {
/* First entry */
@@ -234,7 +234,7 @@ static void walk_pmd_level(struct seq_fi
st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!hypervisor_space(st->current_address)
&& !pmd_none(*start)) {
- pgprotval_t prot = __pmd_val(*start) & ~PTE_MASK;
+ pgprotval_t prot = __pmd_val(*start) & PTE_FLAGS_MASK;
if (pmd_large(*start) || !pmd_present(*start))
note_page(m, st, __pgprot(prot), 3);
@@ -267,7 +267,7 @@ static void walk_pud_level(struct seq_fi
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
if (!hypervisor_space(st->current_address)
&& !pud_none(*start)) {
- pgprotval_t prot = __pud_val(*start) & ~PTE_MASK;
+ pgprotval_t prot = __pud_val(*start) & PTE_FLAGS_MASK;
if (pud_large(*start) || !pud_present(*start))
note_page(m, st, __pgprot(prot), 2);
@@ -303,7 +303,7 @@ static void walk_pgd_level(struct seq_fi
for (i = 0; i < PTRS_PER_PGD; i++) {
st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
if (!pgd_none(*start)) {
- pgprotval_t prot = __pgd_val(*start) & ~PTE_MASK;
+ pgprotval_t prot = __pgd_val(*start) & PTE_FLAGS_MASK;
if (pgd_large(*start) || !pgd_present(*start))
note_page(m, &st, __pgprot(prot), 1);
--- head-2011-03-11.orig/arch/x86/mm/fault-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/fault-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -10,6 +10,7 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
+#include <linux/mmiotrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
@@ -49,17 +50,23 @@
#define PF_RSVD (1<<3)
#define PF_INSTR (1<<4)
+static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+{
+#ifdef CONFIG_MMIOTRACE_HOOKS
+ if (unlikely(is_kmmio_active()))
+ if (kmmio_handler(regs, addr) == 1)
+ return -1;
+#endif
+ return 0;
+}
+
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
int ret = 0;
/* kprobe_running() needs smp_processor_id() */
-#ifdef CONFIG_X86_32
if (!user_mode_vm(regs)) {
-#else
- if (!user_mode(regs)) {
-#endif
preempt_disable();
if (kprobe_running() && kprobe_fault_handler(regs, 14))
ret = 1;
@@ -409,11 +416,7 @@ static void show_fault_oops(struct pt_re
printk(KERN_CONT "NULL pointer dereference");
else
printk(KERN_CONT "paging request");
-#ifdef CONFIG_X86_32
- printk(KERN_CONT " at %08lx\n", address);
-#else
- printk(KERN_CONT " at %016lx\n", address);
-#endif
+ printk(KERN_CONT " at %p\n", (void *) address);
printk(KERN_ALERT "IP:");
printk_address(regs->ip, 1);
dump_pagetable(address);
@@ -628,6 +631,8 @@ void __kprobes do_page_fault(struct pt_r
if (notify_page_fault(regs))
return;
+ if (unlikely(kmmio_fault(regs, address)))
+ return;
/*
* We fault-in kernel-space virtual memory on-demand. The
@@ -831,14 +836,10 @@ bad_area_nosemaphore:
if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
printk_ratelimit()) {
printk(
-#ifdef CONFIG_X86_32
- "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
-#else
- "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
-#endif
+ "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address, regs->ip,
- regs->sp, error_code);
+ tsk->comm, task_pid_nr(tsk), address,
+ (void *) regs->ip, (void *) regs->sp, error_code);
print_vma_addr(" in ", regs->ip);
printk("\n");
}
@@ -949,89 +950,52 @@ LIST_HEAD(pgd_list);
void vmalloc_sync_all(void)
{
#ifdef CONFIG_X86_32
- /*
- * Note that races in the updates of insync and start aren't
- * problematic: insync can only get set bits added, and updates to
- * start are only improving performance (without affecting correctness
- * if undone).
- * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs.
- * This change works just fine with 2-level paging too.
- */
-#define sync_index(a) ((a) >> PMD_SHIFT)
- static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD);
- static unsigned long start = TASK_SIZE;
- unsigned long address;
+ unsigned long address = VMALLOC_START & PGDIR_MASK;
if (SHARED_KERNEL_PMD)
return;
BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK);
- for (address = start;
- address < hypervisor_virt_start;
- address += PMD_SIZE) {
- if (!test_bit(sync_index(address), insync)) {
- unsigned long flags;
- struct page *page;
-
- spin_lock_irqsave(&pgd_lock, flags);
- /* XEN: failure path assumes non-empty pgd_list. */
- if (unlikely(list_empty(&pgd_list))) {
- spin_unlock_irqrestore(&pgd_lock, flags);
- return;
- }
- list_for_each_entry(page, &pgd_list, lru) {
- pmd_t *pmd;
-
- pgd_page_table(lock, page);
- pmd = vmalloc_sync_one(page_address(page),
- address);
- pgd_page_table(unlock, page);
-
- if (!pmd)
- break;
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
- if (!page)
- set_bit(sync_index(address), insync);
+ for (; address < hypervisor_virt_start; address += PMD_SIZE) {
+ unsigned long flags;
+ struct page *page;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pmd_t *pmd;
+
+ pgd_page_table(lock, page);
+ pmd = vmalloc_sync_one(page_address(page), address);
+ pgd_page_table(unlock, page);
+
+ if (!pmd)
+ break;
}
- if (address == start && test_bit(sync_index(address), insync))
- start = address + PMD_SIZE;
+ spin_unlock_irqrestore(&pgd_lock, flags);
}
#else /* CONFIG_X86_64 */
- /*
- * Note that races in the updates of insync and start aren't
- * problematic: insync can only get set bits added, and updates to
- * start are only improving performance (without affecting correctness
- * if undone).
- */
- static DECLARE_BITMAP(insync, PTRS_PER_PGD);
- static unsigned long start = VMALLOC_START & PGDIR_MASK;
+ unsigned long start = VMALLOC_START & PGDIR_MASK;
unsigned long address;
for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
- if (!test_bit(pgd_index(address), insync)) {
- const pgd_t *pgd_ref = pgd_offset_k(address);
- unsigned long flags;
- struct page *page;
-
- if (pgd_none(*pgd_ref))
- continue;
- spin_lock_irqsave(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
- pgd_t *pgd;
- pgd = (pgd_t *)page_address(page) + pgd_index(address);
- pgd_page_table(lock, page);
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- else
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
- pgd_page_table(unlock, page);
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
- set_bit(pgd_index(address), insync);
+ const pgd_t *pgd_ref = pgd_offset_k(address);
+ unsigned long flags;
+ struct page *page;
+
+ if (pgd_none(*pgd_ref))
+ continue;
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ pgd_page_table(lock, page);
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+ pgd_page_table(unlock, page);
}
- if (address == start)
- start = address + PGDIR_SIZE;
+ spin_unlock_irqrestore(&pgd_lock, flags);
}
#endif
}
--- head-2011-03-11.orig/arch/x86/mm/hypervisor.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/hypervisor.c 2011-02-01 14:38:38.000000000 +0100
@@ -711,6 +711,72 @@ void xen_destroy_contiguous_region(unsig
}
EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+int __init early_create_contiguous_region(unsigned long pfn,
+ unsigned int order,
+ unsigned int address_bits)
+{
+ unsigned long *in_frames = discontig_frames, out_frame = pfn;
+ unsigned int i;
+ int rc, success;
+ struct xen_memory_exchange exchange = {
+ .in = {
+ .nr_extents = 1UL << order,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ },
+ .out = {
+ .nr_extents = 1,
+ .extent_order = order,
+ .address_bits = address_bits,
+ .domid = DOMID_SELF
+ }
+ };
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return -ENOMEM;
+
+ for (i = 0; i < (1U << order); ++i) {
+ in_frames[i] = pfn_to_mfn(pfn + i);
+ set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY);
+ }
+
+ set_xen_guest_handle(exchange.in.extent_start, in_frames);
+ set_xen_guest_handle(exchange.out.extent_start, &out_frame);
+
+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+ success = (exchange.nr_exchanged == (1UL << order));
+ BUG_ON(!success && (exchange.nr_exchanged || !rc));
+ BUG_ON(success && rc);
+#if CONFIG_XEN_COMPAT <= 0x030002
+ if (unlikely(rc == -ENOSYS)) {
+ /* Compatibility when XENMEM_exchange is unavailable. */
+ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &exchange.in) != (1UL << order))
+ BUG();
+ success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+ &exchange.out) == 1);
+ if (!success) {
+ for (i = 0; i < (1U << order); ++i)
+ in_frames[i] = pfn + i;
+ if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+ &exchange.in) != (1UL << order))
+ BUG();
+ }
+ }
+#endif
+
+ for (i = 0; i < (1U << order); ++i, ++out_frame) {
+ if (!success)
+ out_frame = in_frames[i];
+ set_phys_to_machine(pfn + i, out_frame);
+ }
+
+ return success ? 0 : -ENOMEM;
+}
+
static void undo_limit_pages(struct page *pages, unsigned int order)
{
BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
@@ -877,42 +943,9 @@ int write_ldt_entry(struct desc_struct *
return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc);
}
-#define MAX_BATCHED_FULL_PTES 32
-
-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
+ int type)
{
- int rc = 0, i = 0;
- mmu_update_t u[MAX_BATCHED_FULL_PTES];
- pte_t *pte;
- spinlock_t *ptl;
-
- if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))
- return 0;
-
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
- do {
- if (pte_present(*pte)) {
- pte_t ptent = pte_modify(*pte, newprot);
-
- if (dirty_accountable && pte_dirty(ptent))
- ptent = pte_mkwrite(ptent);
- u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
- | ((unsigned long)pte & ~PAGE_MASK)
- | MMU_PT_UPDATE_PRESERVE_AD;
- u[i].val = __pte_val(ptent);
- if (++i == MAX_BATCHED_FULL_PTES) {
- if ((rc = HYPERVISOR_mmu_update(
- &u[0], i, NULL, DOMID_SELF)) != 0)
- break;
- i = 0;
- }
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
- if (i)
- rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF);
- pte_unmap_unlock(pte - 1, ptl);
- BUG_ON(rc && rc != -ENOSYS);
- return !rc;
+ maddr_t mach_gp = virt_to_machine(gdt + entry);
+ return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc);
}
--- head-2011-03-11.orig/arch/x86/mm/init_32-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/init_32-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -54,6 +54,7 @@
unsigned int __VMALLOC_RESERVE = 128 << 20;
+unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -61,6 +62,27 @@ unsigned long highstart_pfn, highend_pfn
static noinline int do_test_wp_bit(void);
+
+static unsigned long __initdata table_start;
+static unsigned long __initdata table_end;
+static unsigned long __initdata table_top;
+
+static int __initdata after_init_bootmem;
+
+static __init void *alloc_low_page(unsigned long *phys)
+{
+ unsigned long pfn = table_end++;
+ void *adr;
+
+ if (pfn >= table_top)
+ panic("alloc_low_page: ran out of memory");
+
+ adr = __va(pfn * PAGE_SIZE);
+ memset(adr, 0, PAGE_SIZE);
+ *phys = pfn * PAGE_SIZE;
+ return adr;
+}
+
/*
* Creates a middle page table and puts a pointer to it in the
* given global directory entry. This only returns the gd entry
@@ -72,9 +94,12 @@ static pmd_t * __init one_md_table_init(
pmd_t *pmd_table;
#ifdef CONFIG_X86_PAE
+ unsigned long phys;
if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
- pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-
+ if (after_init_bootmem)
+ pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ else
+ pmd_table = (pmd_t *)alloc_low_page(&phys);
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
@@ -101,12 +126,16 @@ static pte_t * __init one_page_table_ini
#endif
pte_t *page_table = NULL;
+ if (after_init_bootmem) {
#ifdef CONFIG_DEBUG_PAGEALLOC
- page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
+ page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
- if (!page_table) {
- page_table =
+ if (!page_table)
+ page_table =
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ } else {
+ unsigned long phys;
+ page_table = (pte_t *)alloc_low_page(&phys);
}
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
@@ -167,24 +196,24 @@ static inline int is_kernel_text(unsigne
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET:
*/
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
+static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ int use_pse)
{
int pgd_idx, pmd_idx, pte_ofs;
unsigned long pfn;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
+ unsigned pages_2m = 0, pages_4k = 0;
- unsigned long max_ram_pfn = xen_start_info->nr_pages;
- if (max_ram_pfn > max_low_pfn)
- max_ram_pfn = max_low_pfn;
+ if (!cpu_has_pse)
+ use_pse = 0;
- pgd_idx = pgd_index(PAGE_OFFSET);
+ pfn = start_pfn;
+ pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
pgd = pgd_base + pgd_idx;
- pfn = 0;
- pmd_idx = pmd_index(PAGE_OFFSET);
- pte_ofs = pte_index(PAGE_OFFSET);
-
for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
#ifdef CONFIG_XEN
/*
@@ -198,10 +227,16 @@ static void __init kernel_physical_mappi
#else
pmd = one_md_table_init(pgd);
#endif
- if (pfn >= max_low_pfn)
+
+ if (pfn >= end_pfn)
continue;
+#ifdef CONFIG_X86_PAE
+ pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
pmd += pmd_idx;
- for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+#else
+ pmd_idx = 0;
+#endif
+ for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
pmd++, pmd_idx++) {
unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
@@ -211,13 +246,8 @@ static void __init kernel_physical_mappi
/*
* Map with big pages if possible, otherwise
* create normal page tables:
- *
- * Don't use a large page for the first 2/4MB of memory
- * because there are often fixed size MTRRs in there
- * and overlapping MTRRs into large pages can cause
- * slowdowns.
*/
- if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
+ if (use_pse) {
unsigned int addr2;
pgprot_t prot = PAGE_KERNEL_LARGE;
@@ -228,49 +258,35 @@ static void __init kernel_physical_mappi
is_kernel_text(addr2))
prot = PAGE_KERNEL_LARGE_EXEC;
+ pages_2m++;
set_pmd(pmd, pfn_pmd(pfn, prot));
pfn += PTRS_PER_PTE;
- max_pfn_mapped = pfn;
continue;
}
pte = one_page_table_init(pmd);
- for (pte += pte_ofs;
- pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+ pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pte += pte_ofs;
+ for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
pgprot_t prot = PAGE_KERNEL;
/* XEN: Only map initial RAM allocation. */
- if ((pfn >= max_ram_pfn) || pte_present(*pte))
+ if (pfn >= xen_start_info->nr_pages || pte_present(*pte))
continue;
if (is_kernel_text(addr))
prot = PAGE_KERNEL_EXEC;
+ pages_4k++;
set_pte(pte, pfn_pte(pfn, prot));
}
- max_pfn_mapped = pfn;
- pte_ofs = 0;
}
- pmd_idx = 0;
}
+ update_page_count(PG_LEVEL_2M, pages_2m);
+ update_page_count(PG_LEVEL_4K, pages_4k);
}
-#ifndef CONFIG_XEN
-
-static inline int page_kills_ppro(unsigned long pagenr)
-{
- if (pagenr >= 0x70000 && pagenr <= 0x7003F)
- return 1;
- return 0;
-}
-
-#else
-
-#define page_kills_ppro(p) 0
-
-#endif
-
/*
* devmem_is_allowed() checks to see if /dev/mem access to a certain address
* is valid. The argument is a physical page number.
@@ -331,33 +347,66 @@ static void __init permanent_kmaps_init(
pkmap_page_table = pte;
}
-void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
+static void __init add_one_highpage_init(struct page *page, int pfn)
{
- if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
- } else
- SetPageReserved(page);
+ ClearPageReserved(page);
+ init_page_count(page);
+ __free_page(page);
+ totalhigh_pages++;
+}
+
+struct add_highpages_data {
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+};
+
+static int __init add_highpages_work_fn(unsigned long start_pfn,
+ unsigned long end_pfn, void *datax)
+{
+ int node_pfn;
+ struct page *page;
+ unsigned long final_start_pfn, final_end_pfn;
+ struct add_highpages_data *data;
+
+ data = (struct add_highpages_data *)datax;
+
+ final_start_pfn = max(start_pfn, data->start_pfn);
+ final_end_pfn = min(end_pfn, data->end_pfn);
+ if (final_start_pfn >= final_end_pfn)
+ return 0;
+
+ for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
+ node_pfn++) {
+ if (!pfn_valid(node_pfn))
+ continue;
+ page = pfn_to_page(node_pfn);
+ add_one_highpage_init(page, node_pfn);
+ }
+
+ return 0;
+
+}
+
+void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ struct add_highpages_data data;
+
+ data.start_pfn = start_pfn;
+ data.end_pfn = end_pfn;
+
+ work_with_active_regions(nid, add_highpages_work_fn, &data);
}
#ifndef CONFIG_NUMA
-static void __init set_highmem_pages_init(int bad_ppro)
+static void __init set_highmem_pages_init(void)
{
int pfn;
- for (pfn = highstart_pfn; pfn < highend_pfn
- && pfn < xen_start_info->nr_pages; pfn++) {
- /*
- * Holes under sparsemem might not have no mem_map[]:
- */
- if (pfn_valid(pfn))
- add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
- }
+ add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
/* XEN: init high-mem pages outside initial allocation. */
- for (; pfn < highend_pfn; pfn++) {
+ for (pfn = xen_start_info->nr_pages; pfn < highend_pfn; pfn++) {
ClearPageReserved(pfn_to_page(pfn));
init_page_count(pfn_to_page(pfn));
}
@@ -369,24 +418,11 @@ static void __init set_highmem_pages_ini
#else
# define kmap_init() do { } while (0)
# define permanent_kmaps_init(pgd_base) do { } while (0)
-# define set_highmem_pages_init(bad_ppro) do { } while (0)
+# define set_highmem_pages_init() do { } while (0)
#endif /* CONFIG_HIGHMEM */
-pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
-EXPORT_SYMBOL(__PAGE_KERNEL);
-
-pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
-
pgd_t *swapper_pg_dir;
-static void __init xen_pagetable_setup_start(pgd_t *base)
-{
-}
-
-static void __init xen_pagetable_setup_done(pgd_t *base)
-{
-}
-
/*
* Build a proper pagetable for the kernel mappings. Up until this
* point, we've been running on some set of pagetables constructed by
@@ -406,27 +442,10 @@ static void __init xen_pagetable_setup_d
* be partially populated, and so it avoids stomping on any existing
* mappings.
*/
-static void __init pagetable_init(void)
+static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
{
- pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
unsigned long vaddr, end;
- xen_pagetable_setup_start(pgd_base);
-
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
-
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __PAGE_KERNEL |= _PAGE_GLOBAL;
- __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
- }
-
- kernel_physical_mapping_init(pgd_base);
- remap_numa_kva();
-
/*
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
@@ -436,10 +455,13 @@ static void __init pagetable_init(void)
end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
page_table_range_init(vaddr, end, pgd_base);
early_ioremap_reset();
+}
- permanent_kmaps_init(pgd_base);
+static void __init pagetable_init(void)
+{
+ pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
- xen_pagetable_setup_done(pgd_base);
+ permanent_kmaps_init(pgd_base);
}
#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN)
@@ -482,7 +504,7 @@ void zap_low_mappings(void)
int nx_enabled;
-pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
EXPORT_SYMBOL_GPL(__supported_pte_mask);
#ifdef CONFIG_X86_PAE
@@ -535,42 +557,364 @@ static void __init set_nx(void)
}
#endif
+/* user-defined highmem size */
+static unsigned int highmem_pages = -1;
+
/*
- * paging_init() sets up the page tables - note that the first 8MB are
- * already mapped by head.S.
- *
- * This routines also unmaps the page at virtual kernel address 0, so
- * that we can trap those pesky NULL-reference errors in the kernel.
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
*/
-void __init paging_init(void)
+static int __init parse_highmem(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+ return 0;
+}
+early_param("highmem", parse_highmem);
+
+/*
+ * Determine low and high memory ranges:
+ */
+void __init find_low_pfn_range(void)
+{
+ /* it could update max_pfn */
+
+ /* max_low_pfn is 0, we already have early_res support */
+
+ max_low_pfn = max_pfn;
+ if (max_low_pfn > MAXMEM_PFN) {
+ if (highmem_pages == -1)
+ highmem_pages = max_pfn - MAXMEM_PFN;
+ if (highmem_pages + MAXMEM_PFN < max_pfn)
+ max_pfn = MAXMEM_PFN + highmem_pages;
+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
+ printk(KERN_WARNING "only %luMB highmem pages "
+ "available, ignoring highmem size of %uMB.\n",
+ pages_to_mb(max_pfn - MAXMEM_PFN),
+ pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+ max_low_pfn = MAXMEM_PFN;
+#ifndef CONFIG_HIGHMEM
+ /* Maximum memory usable is what is directly addressable */
+ printk(KERN_WARNING "Warning only %ldMB will be used.\n",
+ MAXMEM>>20);
+ if (max_pfn > MAX_NONPAE_PFN)
+ printk(KERN_WARNING
+ "Use a HIGHMEM64G enabled kernel.\n");
+ else
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ max_pfn = MAXMEM_PFN;
+#else /* !CONFIG_HIGHMEM */
+#ifndef CONFIG_HIGHMEM64G
+ if (max_pfn > MAX_NONPAE_PFN) {
+ max_pfn = MAX_NONPAE_PFN;
+ printk(KERN_WARNING "Warning only 4GB will be used."
+ "Use a HIGHMEM64G enabled kernel.\n");
+ }
+#endif /* !CONFIG_HIGHMEM64G */
+#endif /* !CONFIG_HIGHMEM */
+ } else {
+ if (highmem_pages == -1)
+ highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+ if (highmem_pages >= max_pfn) {
+ printk(KERN_ERR "highmem size specified (%uMB) is "
+ "bigger than pages available (%luMB)!.\n",
+ pages_to_mb(highmem_pages),
+ pages_to_mb(max_pfn));
+ highmem_pages = 0;
+ }
+ if (highmem_pages) {
+ if (max_low_pfn - highmem_pages <
+ 64*1024*1024/PAGE_SIZE){
+ printk(KERN_ERR "highmem size %uMB results in "
+ "smaller than 64MB lowmem, ignoring it.\n"
+ , pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+ max_low_pfn -= highmem_pages;
+ }
+#else
+ if (highmem_pages)
+ printk(KERN_ERR "ignoring highmem size on non-highmem"
+ " kernel!\n");
+#endif
+ }
+}
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+void __init initmem_init(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+#ifdef CONFIG_HIGHMEM
+ highstart_pfn = highend_pfn = max_pfn;
+ if (max_pfn > max_low_pfn)
+ highstart_pfn = max_low_pfn;
+ memory_present(0, 0, highend_pfn);
+ e820_register_active_regions(0, 0, highend_pfn);
+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+ pages_to_mb(highend_pfn - highstart_pfn));
+ num_physpages = highend_pfn;
+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+ memory_present(0, 0, max_low_pfn);
+ e820_register_active_regions(0, 0, max_low_pfn);
+ num_physpages = max_low_pfn;
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+#ifdef CONFIG_FLATMEM
+ max_mapnr = num_physpages;
+#endif
+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+ pages_to_mb(max_low_pfn));
+
+ setup_bootmem_allocator();
+}
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+static void __init zone_sizes_init(void)
+{
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+ max_zone_pfns[ZONE_DMA] =
+ virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+#endif
+
+ free_area_init_nodes(max_zone_pfns);
+}
+
+void __init setup_bootmem_allocator(void)
{
int i;
+ unsigned long bootmap_size, bootmap;
+ unsigned long end_pfn = min(max_low_pfn, xen_start_info->nr_pages);
+
+ /*
+ * Initialize the boot-time allocator (with low memory only):
+ */
+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
+ bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+ min(max_pfn_mapped, xen_start_info->nr_pages)<<PAGE_SHIFT,
+ bootmap_size, PAGE_SIZE);
+ if (bootmap == -1L)
+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
+
+ /* don't touch min_low_pfn */
+ bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
+ min_low_pfn, end_pfn);
+ printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
+ max_pfn_mapped<<PAGE_SHIFT);
+ printk(KERN_INFO " low ram: %08lx - %08lx\n",
+ min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
+ printk(KERN_INFO " bootmap %08lx - %08lx\n",
+ bootmap, bootmap + bootmap_size);
+ for_each_online_node(i)
+ free_bootmem_with_active_regions(i, end_pfn);
+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
+
+ after_init_bootmem = 1;
+}
+
+static unsigned long __init extend_init_mapping(unsigned long tables_space)
+{
+ unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
+ + xen_start_info->nr_pt_frames;
+ unsigned long start = start_pfn, va = (unsigned long)&_text;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ /* Ensure init mappings cover kernel text/data and initial tables. */
+ while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) {
+ pgd = pgd_offset_k(va);
+ pud = pud_offset(pgd, va);
+ pmd = pmd_offset(pud, va);
+ if (pmd_none(*pmd)) {
+ unsigned long pa = start_pfn++ << PAGE_SHIFT;
+
+ memset(__va(pa), 0, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pa),
+ XENFEAT_writable_page_tables);
+ xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE));
+ }
+ pte = pte_offset_kernel(pmd, va);
+ if (pte_none(*pte)) {
+ pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE);
+
+ if (HYPERVISOR_update_va_mapping(va, new_pte, 0))
+ BUG();
+ }
+ va += PAGE_SIZE;
+ }
+
+ /* Finally, blow away any spurious initial mappings. */
+ while (1) {
+ pgd = pgd_offset_k(va);
+ pud = pud_offset(pgd, va);
+ pmd = pmd_offset(pud, va);
+ if (pmd_none(*pmd))
+ break;
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
+ }
+
+ if (start_pfn > start)
+ reserve_early(start << PAGE_SHIFT,
+ start_pfn << PAGE_SHIFT, "INITMAP");
+
+ return start_pfn;
+}
+
+static void __init find_early_table_space(unsigned long end)
+{
+ unsigned long puds, pmds, ptes, tables;
+
+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+ tables = PAGE_ALIGN(puds * sizeof(pud_t));
+
+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+ tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
+
+ if (cpu_has_pse) {
+ unsigned long extra;
+
+ extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+ extra += PMD_SIZE;
+ ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else
+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ tables += PAGE_ALIGN(ptes * sizeof(pte_t));
+
+ /* for fixmap */
+ tables += PAGE_SIZE
+ * ((((FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK)
+ - (__fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK))
+ >> PMD_SHIFT);
+
+ table_start = extend_init_mapping(tables);
+
+ table_end = table_start;
+ table_top = table_start + (tables>>PAGE_SHIFT);
+
+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, table_start << PAGE_SHIFT,
+ (table_start << PAGE_SHIFT) + tables);
+}
+
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ pgd_t *pgd_base = swapper_pg_dir;
+ unsigned long start_pfn, end_pfn;
+ unsigned long big_page_start;
+
+ /*
+ * Find space for the kernel direct mapping tables.
+ */
+ if (!after_init_bootmem)
+ find_early_table_space(end);
#ifdef CONFIG_X86_PAE
set_nx();
if (nx_enabled)
printk(KERN_INFO "NX (Execute Disable) protection: active\n");
#endif
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+
+ /*
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
+ */
+ big_page_start = PMD_SIZE;
+
+ if (start < big_page_start) {
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
+ } else {
+ /* head is not big page alignment ? */
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ }
+ if (start_pfn < end_pfn)
+ kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
+
+ /* big page range */
+ start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < (big_page_start >> PAGE_SHIFT))
+ start_pfn = big_page_start >> PAGE_SHIFT;
+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn)
+ kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
+ cpu_has_pse);
+
+ /* tail is not big page alignment ? */
+ start_pfn = end_pfn;
+ if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
+ end_pfn = end >> PAGE_SHIFT;
+ if (start_pfn < end_pfn)
+ kernel_physical_mapping_init(pgd_base, start_pfn,
+ end_pfn, 0);
+ }
+
+ early_ioremap_page_table_range_init(pgd_base);
+
+ __flush_tlb_all();
+
+ if (!after_init_bootmem)
+ reserve_early(table_start << PAGE_SHIFT,
+ table_end << PAGE_SHIFT, "PGTABLE");
+
+ if (!after_init_bootmem)
+ early_memtest(start, end);
+
+ return end >> PAGE_SHIFT;
+}
+
+
+/*
+ * paging_init() sets up the page tables - note that the first 8MB are
+ * already mapped by head.S.
+ *
+ * This routines also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ */
+void __init paging_init(void)
+{
pagetable_init();
__flush_tlb_all();
kmap_init();
- /* Switch to the real shared_info page, and clear the
- * dummy page. */
- set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
- HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
- memset(empty_zero_page, 0, sizeof(empty_zero_page));
-
- /* Setup mapping of lower 1st MB */
- for (i = 0; i < NR_FIX_ISAMAPS; i++)
- if (is_initial_xendomain())
- set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
- else
- __set_fixmap(FIX_ISAMAP_BEGIN - i,
- virt_to_machine(empty_zero_page),
- PAGE_KERNEL_RO);
+ /*
+ * NOTE: at this point the bootmem allocator is fully available.
+ */
+ sparse_init();
+ zone_sizes_init();
}
/*
@@ -605,7 +949,7 @@ static struct kcore_list kcore_mem, kcor
void __init mem_init(void)
{
int codesize, reservedpages, datasize, initsize;
- int tmp, bad_ppro;
+ int tmp;
unsigned long pfn;
pci_iommu_alloc();
@@ -613,19 +957,6 @@ void __init mem_init(void)
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
- bad_ppro = ppro_with_ram_bug();
-
-#ifdef CONFIG_HIGHMEM
- /* check that fixmap and pkmap do not overlap */
- if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
- printk(KERN_ERR
- "fixmap and kmap areas overlap - this will crash\n");
- printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
- PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
- FIXADDR_START);
- BUG();
- }
-#endif
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
/* XEN: init low-mem pages outside initial allocation. */
@@ -642,7 +973,7 @@ void __init mem_init(void)
if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
reservedpages++;
- set_highmem_pages_init(bad_ppro);
+ set_highmem_pages_init();
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
@@ -663,7 +994,6 @@ void __init mem_init(void)
(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
);
-#if 1 /* double-sanity-check paranoia */
printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
@@ -704,7 +1034,6 @@ void __init mem_init(void)
#endif
BUG_ON(VMALLOC_START > VMALLOC_END);
BUG_ON((unsigned long)high_memory > VMALLOC_START);
-#endif /* double-sanity-check paranoia */
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
@@ -761,6 +1090,8 @@ void mark_rodata_ro(void)
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
+#ifndef CONFIG_DYNAMIC_FTRACE
+ /* Dynamic tracing modifies the kernel text section */
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel text: %luk\n",
size >> 10);
@@ -773,6 +1104,8 @@ void mark_rodata_ro(void)
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
start += size;
size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
@@ -835,3 +1168,9 @@ void free_initrd_mem(unsigned long start
free_init_pages("initrd memory", start, end);
}
#endif
+
+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+ int flags)
+{
+ return reserve_bootmem(phys, len, flags);
+}
--- head-2011-03-11.orig/arch/x86/mm/init_64-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/init_64-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -21,6 +21,7 @@
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
+#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
@@ -52,6 +53,14 @@
#include <xen/features.h>
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_low_pfn_mapped;
+unsigned long max_pfn_mapped;
+
#if CONFIG_XEN_COMPAT <= 0x030002
unsigned int __kernel_page_user;
EXPORT_SYMBOL(__kernel_page_user);
@@ -60,7 +69,6 @@ EXPORT_SYMBOL(__kernel_page_user);
int after_bootmem;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-extern unsigned long start_pfn;
extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
@@ -118,7 +126,7 @@ void __meminit early_make_page_readonly(
}
#ifndef CONFIG_XEN
-int direct_gbpages __meminitdata
+int direct_gbpages
#ifdef CONFIG_DIRECT_GBPAGES
= 1
#endif
@@ -145,55 +153,23 @@ early_param("gbpages", parse_direct_gbpa
* around without checking the pgd every time.
*/
-void show_mem(void)
-{
- long i, total = 0, reserved = 0;
- long shared = 0, cached = 0;
- struct page *page;
- pg_data_t *pgdat;
-
- printk(KERN_INFO "Mem-info:\n");
- show_free_areas();
- for_each_online_pgdat(pgdat) {
- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
- /*
- * This loop can take a while with 256 GB and
- * 4k pages so defer the NMI watchdog:
- */
- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
- touch_nmi_watchdog();
-
- if (!pfn_valid(pgdat->node_start_pfn + i))
- continue;
-
- page = pfn_to_page(pgdat->node_start_pfn + i);
- total++;
- if (PageReserved(page))
- reserved++;
- else if (PageSwapCache(page))
- cached++;
- else if (page_count(page))
- shared += page_count(page) - 1;
- }
- }
- printk(KERN_INFO "%lu pages of RAM\n", total);
- printk(KERN_INFO "%lu reserved pages\n", reserved);
- printk(KERN_INFO "%lu pages shared\n", shared);
- printk(KERN_INFO "%lu pages swap cached\n", cached);
-}
-
static unsigned long __meminitdata table_start;
-static unsigned long __meminitdata table_end;
+static unsigned long __meminitdata table_cur;
+static unsigned long __meminitdata table_top;
-static __init void *spp_getpage(void)
+/*
+ * NOTE: This function is marked __ref because it calls __init function
+ * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
+ */
+static __ref void *spp_getpage(void)
{
void *ptr;
if (after_bootmem)
ptr = (void *) get_zeroed_page(GFP_ATOMIC);
- else if (start_pfn < table_end) {
- ptr = __va(start_pfn << PAGE_SHIFT);
- start_pfn++;
+ else if (table_cur < table_top) {
+ ptr = __va(table_cur << PAGE_SHIFT);
+ table_cur++;
memset(ptr, 0, PAGE_SIZE);
} else
ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -208,30 +184,18 @@ static __init void *spp_getpage(void)
return ptr;
}
-#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
-#define pud_offset_u(address) (level3_user_pgt + pud_index(address))
-
-static __init void
-set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode)
+void
+set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
- pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte, new_pte;
-
- pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
+ pte_t *pte;
- pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
- if (pgd_none(*pgd)) {
- printk(KERN_ERR
- "PGD FIXMAP MISSING, it should be setup in head.S!\n");
- return;
- }
- pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
+ pud = pud_page + pud_index(vaddr);
if (pud_none(*pud)) {
pmd = (pmd_t *) spp_getpage();
make_page_readonly(pmd, XENFEAT_writable_page_tables);
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
+ pud_populate(&init_mm, pud, pmd);
if (pmd != pmd_offset(pud, 0)) {
printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
pmd, pmd_offset(pud, 0));
@@ -242,19 +206,20 @@ set_pte_phys(unsigned long vaddr, unsign
if (pmd_none(*pmd)) {
pte = (pte_t *) spp_getpage();
make_page_readonly(pte, XENFEAT_writable_page_tables);
- set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
+ pmd_populate_kernel(&init_mm, pmd, pte);
if (pte != pte_offset_kernel(pmd, 0)) {
printk(KERN_ERR "PAGETABLE BUG #02!\n");
return;
}
}
- if (pgprot_val(prot))
- new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
- else
- new_pte = __pte(0);
pte = pte_offset_kernel(pmd, vaddr);
if (!pte_none(*pte) && __pte_val(new_pte) &&
+#ifdef CONFIG_ACPI
+ /* __acpi_map_table() fails to properly call clear_fixmap() */
+ (vaddr < __fix_to_virt(FIX_ACPI_END) ||
+ vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
+#endif
__pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
pte_ERROR(*pte);
set_pte(pte, new_pte);
@@ -266,15 +231,13 @@ set_pte_phys(unsigned long vaddr, unsign
__flush_tlb_one(vaddr);
}
-static __init void
-set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot)
+void
+set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte, new_pte;
+ pud_t *pud_page;
- pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
+ pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval));
pgd = pgd_offset_k(vaddr);
if (pgd_none(*pgd)) {
@@ -282,47 +245,51 @@ set_pte_phys_ma(unsigned long vaddr, uns
"PGD FIXMAP MISSING, it should be setup in head.S!\n");
return;
}
- pud = pud_offset(pgd, vaddr);
- if (pud_none(*pud)) {
- pmd = (pmd_t *) spp_getpage();
- make_page_readonly(pmd, XENFEAT_writable_page_tables);
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
- if (pmd != pmd_offset(pud, 0)) {
- printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
- pmd, pmd_offset(pud, 0));
+ pud_page = (pud_t*)pgd_page_vaddr(*pgd);
+ set_pte_vaddr_pud(pud_page, vaddr, pteval);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Create large page table mappings for a range of physical addresses.
+ */
+static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
+ pgprot_t prot)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
+ for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
+ pgd = pgd_offset_k((unsigned long)__va(phys));
+ if (pgd_none(*pgd)) {
+ pud = (pud_t *) spp_getpage();
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
+ _PAGE_USER));
}
- }
- pmd = pmd_offset(pud, vaddr);
- if (pmd_none(*pmd)) {
- pte = (pte_t *) spp_getpage();
- make_page_readonly(pte, XENFEAT_writable_page_tables);
- set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
- if (pte != pte_offset_kernel(pmd, 0)) {
- printk(KERN_ERR "PAGETABLE BUG #02!\n");
- return;
+ pud = pud_offset(pgd, (unsigned long)__va(phys));
+ if (pud_none(*pud)) {
+ pmd = (pmd_t *) spp_getpage();
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
+ _PAGE_USER));
}
+ pmd = pmd_offset(pud, phys);
+ BUG_ON(!pmd_none(*pmd));
+ set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
}
- new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
+}
- pte = pte_offset_kernel(pmd, vaddr);
- if (!pte_none(*pte) && __pte_val(new_pte) &&
-#ifdef CONFIG_ACPI
- /* __acpi_map_table() fails to properly call clear_fixmap() */
- (vaddr < __fix_to_virt(FIX_ACPI_END) ||
- vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
-#endif
- __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
- pte_ERROR(*pte);
- set_pte(pte, new_pte);
+void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
+{
+ __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
+}
- /*
- * It's enough to flush this one mapping.
- * (PGE mappings get flushed as well)
- */
- __flush_tlb_one(vaddr);
+void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
+{
+ __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}
-#ifndef CONFIG_XEN
/*
* The head.S code sets up the kernel high mapping:
*
@@ -352,63 +319,52 @@ void __init cleanup_highmap(void)
}
#endif
-/* NOTE: this is meant to be run only at boot */
-void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+static __ref void *alloc_low_page(unsigned long *phys)
{
- unsigned long address = __fix_to_virt(idx);
-
- if (idx >= __end_of_fixed_addresses) {
- printk(KERN_ERR "Invalid __set_fixmap\n");
- return;
- }
- switch (idx) {
- case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
- set_pte_phys(address, phys, prot, 0);
- set_pte_phys(address, phys, prot, 1);
- break;
- case FIX_EARLYCON_MEM_BASE:
- xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
- pfn_pte_ma(phys >> PAGE_SHIFT, prot));
- break;
- default:
- set_pte_phys_ma(address, phys, prot);
- break;
- }
-}
-
-static __meminit void *alloc_static_page(unsigned long *phys)
-{
- unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
+ unsigned long pfn;
+ void *adr;
if (after_bootmem) {
- void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
+ adr = (void *)get_zeroed_page(GFP_ATOMIC);
*phys = __pa(adr);
return adr;
}
- *phys = start_pfn << PAGE_SHIFT;
- start_pfn++;
- memset((void *)va, 0, PAGE_SIZE);
- return (void *)va;
+ BUG_ON(!table_cur);
+ pfn = table_cur++;
+ if (pfn >= table_top)
+ panic("alloc_low_page: ran out of memory");
+
+ adr = early_ioremap(pfn_to_mfn(pfn) * PAGE_SIZE, PAGE_SIZE);
+ memset(adr, 0, PAGE_SIZE);
+ *phys = pfn * PAGE_SIZE;
+ return adr;
}
-#define PTE_SIZE PAGE_SIZE
+static __ref void unmap_low_page(void *adr)
+{
+ if (after_bootmem)
+ return;
+
+ early_iounmap(adr, PAGE_SIZE);
+}
static inline int __meminit make_readonly(unsigned long paddr)
{
extern char __vsyscall_0;
int readonly = 0;
- /* Make new page tables read-only. */
+ /* Make new page tables read-only on the first pass. */
if (!xen_feature(XENFEAT_writable_page_tables)
+ && !max_pfn_mapped
&& (paddr >= (table_start << PAGE_SHIFT))
- && (paddr < (table_end << PAGE_SHIFT)))
+ && (paddr < (table_top << PAGE_SHIFT)))
readonly = 1;
/* Make old page tables read-only. */
if (!xen_feature(XENFEAT_writable_page_tables)
&& (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
- && (paddr < (start_pfn << PAGE_SHIFT)))
+ && (paddr < (table_cur << PAGE_SHIFT)))
readonly = 1;
/*
@@ -425,118 +381,131 @@ static inline int __meminit make_readonl
return readonly;
}
-#ifndef CONFIG_XEN
-/* Must run before zap_low_mappings */
-__meminit void *early_ioremap(unsigned long addr, unsigned long size)
+static unsigned long __meminit
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
{
- pmd_t *pmd, *last_pmd;
- unsigned long vaddr;
- int i, pmds;
-
- pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
- vaddr = __START_KERNEL_map;
- pmd = level2_kernel_pgt;
- last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
-
- for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
- for (i = 0; i < pmds; i++) {
- if (pmd_present(pmd[i]))
- goto continue_outer_loop;
- }
- vaddr += addr & ~PMD_MASK;
- addr &= PMD_MASK;
+ unsigned pages = 0;
+ unsigned long last_map_addr = end;
+ int i;
+
+ pte_t *pte = pte_page + pte_index(addr);
+
+ for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
+ unsigned long pteval = addr | __PAGE_KERNEL;
+
+ if (addr >= end ||
+ (!after_bootmem &&
+ (addr >> PAGE_SHIFT) >= xen_start_info->nr_pages))
+ break;
+
+ if (__pte_val(*pte))
+ continue;
- for (i = 0; i < pmds; i++, addr += PMD_SIZE)
- set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- __flush_tlb_all();
-
- return (void *)vaddr;
-continue_outer_loop:
- ;
+ if (make_readonly(addr))
+ pteval &= ~_PAGE_RW;
+ if (0)
+ printk(" pte=%p addr=%lx pte=%016lx\n",
+ pte, addr, pteval);
+ if (!after_bootmem)
+ *pte = __pte(pteval & __supported_pte_mask);
+ else
+ set_pte(pte, __pte(pteval & __supported_pte_mask));
+ last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
+ pages++;
}
- printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
- return NULL;
+ update_page_count(PG_LEVEL_4K, pages);
+
+ return last_map_addr;
}
-/*
- * To avoid virtual aliases later:
- */
-__meminit void early_iounmap(void *addr, unsigned long size)
+static unsigned long __meminit
+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
{
- unsigned long vaddr;
- pmd_t *pmd;
- int i, pmds;
+ pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
- vaddr = (unsigned long)addr;
- pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
- pmd = level2_kernel_pgt + pmd_index(vaddr);
-
- for (i = 0; i < pmds; i++)
- pmd_clear(pmd + i);
-
- __flush_tlb_all();
+ BUG_ON(!max_pfn_mapped);
+ return phys_pte_init(pte, address, end);
}
-#endif
static unsigned long __meminit
-phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
+ unsigned long page_size_mask)
{
+ unsigned long pages = 0;
+ unsigned long last_map_addr = end;
+ unsigned long start = address;
+
int i = pmd_index(address);
- for (; i < PTRS_PER_PMD; i++) {
+ for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
unsigned long pte_phys;
- pmd_t *pmd = pmd_page + i;
- pte_t *pte, *pte_save;
- int k;
+ pmd_t *pmd = pmd_page + pmd_index(address);
+ pte_t *pte;
if (address >= end)
break;
if (__pmd_val(*pmd)) {
- address += PMD_SIZE;
+ if (!pmd_large(*pmd)) {
+ spin_lock(&init_mm.page_table_lock);
+ last_map_addr = phys_pte_update(pmd, address,
+ end);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ /* Count entries we're using from level2_ident_pgt */
+ if (start == 0)
+ pages++;
continue;
}
- pte = alloc_static_page(&pte_phys);
- pte_save = pte;
- for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
- unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
-
- if (address >= end ||
- (!after_bootmem &&
- (address >> PAGE_SHIFT) >= xen_start_info->nr_pages))
- pteval = 0;
- else if (make_readonly(address))
- pteval &= ~_PAGE_RW;
- set_pte(pte, __pte(pteval & __supported_pte_mask));
+ if (page_size_mask & (1<<PG_LEVEL_2M)) {
+ pages++;
+ spin_lock(&init_mm.page_table_lock);
+ set_pte((pte_t *)pmd,
+ pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ spin_unlock(&init_mm.page_table_lock);
+ last_map_addr = (address & PMD_MASK) + PMD_SIZE;
+ continue;
}
+
+ pte = alloc_low_page(&pte_phys);
+ last_map_addr = phys_pte_init(pte, address, end);
+ unmap_low_page(pte);
+
if (!after_bootmem) {
- early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
- *pmd = __pmd(pte_phys | _KERNPG_TABLE);
+ if (max_pfn_mapped)
+ make_page_readonly(__va(pte_phys),
+ XENFEAT_writable_page_tables);
+ *pmd = __pmd(pte_phys | _PAGE_TABLE);
} else {
- make_page_readonly(pte_save, XENFEAT_writable_page_tables);
- set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
+ make_page_readonly(pte, XENFEAT_writable_page_tables);
+ spin_lock(&init_mm.page_table_lock);
+ pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+ spin_unlock(&init_mm.page_table_lock);
}
}
- return address;
+ update_page_count(PG_LEVEL_2M, pages);
+ return last_map_addr;
}
static unsigned long __meminit
-phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
+ unsigned long page_size_mask)
{
pmd_t *pmd = pmd_offset(pud, 0);
unsigned long last_map_addr;
- spin_lock(&init_mm.page_table_lock);
- last_map_addr = phys_pmd_init(pmd, address, end);
- spin_unlock(&init_mm.page_table_lock);
+ BUG_ON(!max_pfn_mapped);
+ last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
__flush_tlb_all();
return last_map_addr;
}
static unsigned long __meminit
-phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
+ unsigned long page_size_mask)
{
+ unsigned long pages = 0;
unsigned long last_map_addr = end;
int i = pud_index(addr);
@@ -550,29 +519,55 @@ phys_pud_init(pud_t *pud_page, unsigned
if (__pud_val(*pud)) {
if (!pud_large(*pud))
- last_map_addr = phys_pmd_update(pud, addr, end);
+ last_map_addr = phys_pmd_update(pud, addr, end,
+ page_size_mask);
continue;
}
- if (direct_gbpages) {
+ if (page_size_mask & (1<<PG_LEVEL_1G)) {
+ pages++;
+ spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pud,
pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ spin_unlock(&init_mm.page_table_lock);
last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
continue;
}
- pmd = alloc_static_page(&pmd_phys);
+ pmd = alloc_low_page(&pmd_phys);
+ last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
+ unmap_low_page(pmd);
- spin_lock(&init_mm.page_table_lock);
- *pud = __pud(pmd_phys | _KERNPG_TABLE);
- last_map_addr = phys_pmd_init(pmd, addr, end);
- spin_unlock(&init_mm.page_table_lock);
-
- early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
+ if (!after_bootmem) {
+ if (max_pfn_mapped)
+ make_page_readonly(__va(pmd_phys),
+ XENFEAT_writable_page_tables);
+ if (page_size_mask & (1 << PG_LEVEL_NUM))
+ xen_l3_entry_update(pud, __pud(pmd_phys | _PAGE_TABLE));
+ else
+ *pud = __pud(pmd_phys | _PAGE_TABLE);
+ } else {
+ make_page_readonly(pmd, XENFEAT_writable_page_tables);
+ spin_lock(&init_mm.page_table_lock);
+ pud_populate(&init_mm, pud, __va(pmd_phys));
+ spin_unlock(&init_mm.page_table_lock);
+ }
}
__flush_tlb_all();
+ update_page_count(PG_LEVEL_1G, pages);
- return last_map_addr >> PAGE_SHIFT;
+ return last_map_addr;
+}
+
+static unsigned long __meminit
+phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
+ unsigned long page_size_mask)
+{
+ pud_t *pud;
+
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
+
+ return phys_pud_init(pud, addr, end, page_size_mask | (1 << PG_LEVEL_NUM));
}
void __init xen_init_pt(void)
@@ -656,112 +651,36 @@ void __init xen_init_pt(void)
}
}
-static void __init extend_init_mapping(unsigned long tables_space)
-{
- unsigned long va = __START_KERNEL_map;
- unsigned long start = start_pfn;
- unsigned long phys, addr, *pte_page;
- pmd_t *pmd;
- pte_t *pte, new_pte;
- unsigned long *page = (unsigned long *)init_level4_pgt;
-
- addr = page[pgd_index(va)];
- addr_to_page(addr, page);
- addr = page[pud_index(va)];
- addr_to_page(addr, page);
-
- /* Kill mapping of low 1MB. */
- while (va < (unsigned long)&_text) {
- if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
- BUG();
- va += PAGE_SIZE;
- }
-
- /* Ensure init mappings cover kernel text/data and initial tables. */
- while (va < (__START_KERNEL_map
- + (start_pfn << PAGE_SHIFT)
- + tables_space)) {
- if (!(pmd_index(va) | pte_index(va))) {
- pud_t *pud;
-
- page = (unsigned long *)init_level4_pgt;
- addr = page[pgd_index(va)];
- addr_to_page(addr, page);
- pud = (pud_t *)&page[pud_index(va)];
- if (pud_none(*pud)) {
- page = alloc_static_page(&phys);
- early_make_page_readonly(
- page, XENFEAT_writable_page_tables);
- set_pud(pud, __pud(phys | _KERNPG_TABLE));
- } else {
- addr = page[pud_index(va)];
- addr_to_page(addr, page);
- }
- }
- pmd = (pmd_t *)&page[pmd_index(va)];
- if (pmd_none(*pmd)) {
- pte_page = alloc_static_page(&phys);
- early_make_page_readonly(
- pte_page, XENFEAT_writable_page_tables);
- set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
- } else {
- addr = page[pmd_index(va)];
- addr_to_page(addr, pte_page);
- }
- pte = (pte_t *)&pte_page[pte_index(va)];
- if (pte_none(*pte)) {
- new_pte = pfn_pte(
- (va - __START_KERNEL_map) >> PAGE_SHIFT,
- __pgprot(_KERNPG_TABLE));
- xen_l1_entry_update(pte, new_pte);
- }
- va += PAGE_SIZE;
- }
-
- /* Finally, blow away any spurious initial mappings. */
- while (1) {
- if (!(pmd_index(va) | pte_index(va))) {
- page = (unsigned long *)init_level4_pgt;
- addr = page[pgd_index(va)];
- addr_to_page(addr, page);
- if (pud_none(((pud_t *)page)[pud_index(va)]))
- break;
- addr = page[pud_index(va)];
- addr_to_page(addr, page);
- }
- pmd = (pmd_t *)&page[pmd_index(va)];
- if (pmd_none(*pmd))
- break;
- if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
- BUG();
- va += PAGE_SIZE;
- }
-
- if (start_pfn > start)
- reserve_early(start << PAGE_SHIFT,
- start_pfn << PAGE_SHIFT, "INITMAP");
-}
-
static void __init find_early_table_space(unsigned long end)
{
unsigned long puds, pmds, ptes, tables;
puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+ tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
+ tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
- tables = round_up(puds * 8, PAGE_SIZE) +
- round_up(pmds * 8, PAGE_SIZE) +
- round_up(ptes * 8, PAGE_SIZE);
+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
- extend_init_mapping(tables);
+ if (!table_top) {
+ table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
+ xen_start_info->nr_pt_frames;
+ table_cur = table_start;
+ } else {
+ /*
+ * [table_start, table_top) gets passed to reserve_early(),
+ * so we must not use table_cur here, despite continuing
+ * to allocate from there. table_cur possibly being below
+ * table_start is otoh not a problem.
+ */
+ table_start = table_top;
+ }
- table_start = start_pfn;
- table_end = table_start + (tables>>PAGE_SHIFT);
+ table_top = table_cur + (tables >> PAGE_SHIFT);
- early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_start << PAGE_SHIFT,
- (table_start << PAGE_SHIFT) + tables);
+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, table_cur << PAGE_SHIFT, table_top << PAGE_SHIFT);
}
static void __init xen_finish_init_mapping(void)
@@ -783,18 +702,18 @@ static void __init xen_finish_init_mappi
xen_start_info->mod_start = (unsigned long)
__va(__pa(xen_start_info->mod_start));
- /* Destroy the Xen-created mappings beyond the kernel image as
- * well as the temporary mappings created above. Prevents
- * overlap with modules area (if init mapping is very big).
- */
+ /* Destroy the Xen-created mappings beyond the kernel image. */
start = PAGE_ALIGN((unsigned long)_end);
- end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
+ end = __START_KERNEL_map + (table_start << PAGE_SHIFT);
for (; start < end; start += PAGE_SIZE)
if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
BUG();
- /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
- table_end = ~0UL;
+ /* Allocate pte's for initial fixmaps from 'table_cur' allocator. */
+ start = table_top;
+ WARN(table_cur != start, "start=%lx cur=%lx top=%lx\n",
+ table_start, table_cur, start);
+ table_top = ~0UL;
/* Switch to the real shared_info page, and clear the dummy page. */
set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
@@ -811,8 +730,7 @@ static void __init xen_finish_init_mappi
<< PAGE_SHIFT,
PAGE_KERNEL_RO);
- /* Disable the 'start_pfn' allocator. */
- table_end = start_pfn;
+ table_top = max(table_cur, start);
}
static void __init init_gbpages(void)
@@ -825,126 +743,91 @@ static void __init init_gbpages(void)
#endif
}
-#ifdef CONFIG_MEMTEST_BOOTPARAM
-
-static void __init memtest(unsigned long start_phys, unsigned long size,
- unsigned pattern)
+static unsigned long __init kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
{
- unsigned long i;
- unsigned long *start;
- unsigned long start_bad;
- unsigned long last_bad;
- unsigned long val;
- unsigned long start_phys_aligned;
- unsigned long count;
- unsigned long incr;
-
- switch (pattern) {
- case 0:
- val = 0UL;
- break;
- case 1:
- val = -1UL;
- break;
- case 2:
- val = 0x5555555555555555UL;
- break;
- case 3:
- val = 0xaaaaaaaaaaaaaaaaUL;
- break;
- default:
- return;
- }
- incr = sizeof(unsigned long);
- start_phys_aligned = ALIGN(start_phys, incr);
- count = (size - (start_phys_aligned - start_phys))/incr;
- start = __va(start_phys_aligned);
- start_bad = 0;
- last_bad = 0;
-
- for (i = 0; i < count; i++)
- start[i] = val;
- for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
- if (*start != val) {
- if (start_phys_aligned == last_bad + incr) {
- last_bad += incr;
- } else {
- if (start_bad) {
- printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
- val, start_bad, last_bad + incr);
- reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
- }
- start_bad = last_bad = start_phys_aligned;
- }
- }
- }
- if (start_bad) {
- printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
- val, start_bad, last_bad + incr);
- reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
- }
+ unsigned long next, last_map_addr = end;
-}
+ start = (unsigned long)__va(start);
+ end = (unsigned long)__va(end);
-static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
+ for (; start < end; start = next) {
+ pgd_t *pgd = pgd_offset_k(start);
+ unsigned long pud_phys;
+ pud_t *pud;
-static int __init parse_memtest(char *arg)
-{
- if (arg)
- memtest_pattern = simple_strtoul(arg, NULL, 0);
- return 0;
-}
+ next = (start + PGDIR_SIZE) & PGDIR_MASK;
+ if (next > end)
+ next = end;
-early_param("memtest", parse_memtest);
+ if (__pgd_val(*pgd)) {
+ last_map_addr = phys_pud_update(pgd, __pa(start),
+ __pa(end), page_size_mask);
+ continue;
+ }
-static void __init early_memtest(unsigned long start, unsigned long end)
-{
- u64 t_start, t_size;
- unsigned pattern;
+ pud = alloc_low_page(&pud_phys);
+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
+ page_size_mask);
+ unmap_low_page(pud);
- if (!memtest_pattern)
- return;
+ if (!after_bootmem) {
+ if (max_pfn_mapped)
+ make_page_readonly(__va(pud_phys),
+ XENFEAT_writable_page_tables);
+ xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
+ } else {
+ make_page_readonly(pud, XENFEAT_writable_page_tables);
+ spin_lock(&init_mm.page_table_lock);
+ pgd_populate(&init_mm, pgd, __va(pud_phys));
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
- printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
- for (pattern = 0; pattern < memtest_pattern; pattern++) {
- t_start = start;
- t_size = 0;
- while (t_start < end) {
- t_start = find_e820_area_size(t_start, &t_size, 1);
+ return last_map_addr;
+}
- /* done ? */
- if (t_start >= end)
- break;
- if (t_start + t_size > end)
- t_size = end - t_start;
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
- printk(KERN_CONT "\n %016llx - %016llx pattern %d",
- (unsigned long long)t_start,
- (unsigned long long)t_start + t_size, pattern);
+#define NR_RANGE_MR 5
- memtest(t_start, t_size, pattern);
+static int save_mr(struct map_range *mr, int nr_range,
+ unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long page_size_mask)
+{
- t_start += t_size;
- }
+ if (start_pfn < end_pfn) {
+ if (nr_range >= NR_RANGE_MR)
+ panic("run out of range for init_memory_mapping\n");
+ mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+ mr[nr_range].end = end_pfn<<PAGE_SHIFT;
+ mr[nr_range].page_size_mask = page_size_mask;
+ nr_range++;
}
- printk(KERN_CONT "\n");
-}
-#else
-static void __init early_memtest(unsigned long start, unsigned long end)
-{
+
+ return nr_range;
}
-#endif
/*
* Setup the direct mapping of the physical memory at PAGE_OFFSET.
* This runs before bootmem is initialized and gets pages directly from
* the physical memory. To access them they are temporarily mapped.
*/
-unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
{
- unsigned long next, last_map_addr = end;
- unsigned long start_phys = start, end_phys = end;
+ unsigned long last_map_addr = 0;
+ unsigned long page_size_mask = 0;
+ unsigned long start_pfn, end_pfn;
+
+ struct map_range mr[NR_RANGE_MR];
+ int nr_range, i;
printk(KERN_INFO "init_memory_mapping\n");
@@ -955,51 +838,150 @@ unsigned long __init_refok init_memory_m
* memory mapped. Unfortunately this is done currently before the
* nodes are discovered.
*/
- if (!after_bootmem) {
+ if (!after_bootmem)
init_gbpages();
- find_early_table_space(end);
+
+ if (direct_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (cpu_has_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = 0;
+
+ /* head if not big page alignment ?*/
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+ /* big page (2M) range*/
+ start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
+ end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+
+ /* big page (1G) range */
+ start_pfn = end_pfn;
+ end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask &
+ ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+
+ /* tail is not big page (1G) alignment */
+ start_pfn = end_pfn;
+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+
+ /* tail is not big page (2M) alignment */
+ start_pfn = end_pfn;
+ end_pfn = end>>PAGE_SHIFT;
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+ /* try to merge same page size and continuous */
+ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+ unsigned long old_start;
+ if (mr[i].end != mr[i+1].start ||
+ mr[i].page_size_mask != mr[i+1].page_size_mask)
+ continue;
+ /* move it */
+ old_start = mr[i].start;
+ memmove(&mr[i], &mr[i+1],
+ (nr_range - 1 - i) * sizeof (struct map_range));
+ mr[i--].start = old_start;
+ nr_range--;
}
- start = (unsigned long)__va(start);
- end = (unsigned long)__va(end);
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+ mr[i].start, mr[i].end,
+ (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+ (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
- for (; start < end; start = next) {
- pgd_t *pgd = pgd_offset_k(start);
- unsigned long pud_phys;
- pud_t *pud;
+ if (!after_bootmem)
+ find_early_table_space(end);
- if (after_bootmem)
- pud = pud_offset(pgd, start & PGDIR_MASK);
- else
- pud = alloc_static_page(&pud_phys);
- next = start + PGDIR_SIZE;
- if (next > end)
- next = end;
- last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
- if (!after_bootmem) {
- early_make_page_readonly(pud, XENFEAT_writable_page_tables);
- set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+ if (!start) {
+ unsigned long addr, va = __START_KERNEL_map;
+ unsigned long *page = (unsigned long *)init_level4_pgt;
+
+ /* Kill mapping of memory below _text. */
+ while (va < (unsigned long)&_text) {
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
+ }
+
+ /* Blow away any spurious initial mappings. */
+ va = __START_KERNEL_map + (table_start << PAGE_SHIFT);
+ addr = page[pgd_index(va)];
+ addr_to_page(addr, page);
+ addr = page[pud_index(va)];
+ addr_to_page(addr, page);
+ while (pmd_index(va) | pte_index(va)) {
+ if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
+ break;
+ if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+ BUG();
+ va += PAGE_SIZE;
}
}
- if (!after_bootmem) {
- BUG_ON(start_pfn != table_end);
+ for (i = 0; i < nr_range; i++)
+ last_map_addr = kernel_physical_mapping_init(
+ mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+
+ BUG_ON(table_cur > table_top);
+ if (!start)
xen_finish_init_mapping();
- }
+ else if (table_cur < table_top)
+ /* Disable the 'table_cur' allocator. */
+ table_top = table_cur;
__flush_tlb_all();
- if (!after_bootmem)
+ if (!after_bootmem && table_top > table_start)
reserve_early(table_start << PAGE_SHIFT,
- table_end << PAGE_SHIFT, "PGTABLE");
+ table_top << PAGE_SHIFT, "PGTABLE");
+
+ printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
+ last_map_addr, end);
if (!after_bootmem)
- early_memtest(start_phys, end_phys);
+ early_memtest(start, end);
- return last_map_addr;
+ return last_map_addr >> PAGE_SHIFT;
}
#ifndef CONFIG_NUMA
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long bootmap_size, bootmap;
+
+ e820_register_active_regions(0, start_pfn, end_pfn);
+#ifdef CONFIG_XEN
+ if (end_pfn > xen_start_info->nr_pages)
+ end_pfn = xen_start_info->nr_pages;
+#endif
+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
+ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
+ PAGE_SIZE);
+ if (bootmap == -1L)
+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ /* don't touch min_low_pfn */
+ bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
+ 0, end_pfn);
+ free_bootmem_with_active_regions(0, end_pfn);
+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
+ reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
+}
+
void __init paging_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -1007,9 +989,9 @@ void __init paging_init(void)
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
- max_zone_pfns[ZONE_NORMAL] = end_pfn;
+ max_zone_pfns[ZONE_NORMAL] = max_pfn;
- memory_present(0, 0, end_pfn);
+ memory_present(0, 0, max_pfn);
sparse_init();
free_area_init_nodes(max_zone_pfns);
@@ -1099,8 +1081,8 @@ void __init mem_init(void)
ClearPageReserved(pfn_to_page(pfn));
init_page_count(pfn_to_page(pfn));
}
- reservedpages = end_pfn - totalram_pages -
- absent_pages_in_range(0, end_pfn);
+ reservedpages = max_pfn - totalram_pages -
+ absent_pages_in_range(0, max_pfn);
after_bootmem = 1;
codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -1119,7 +1101,7 @@ void __init mem_init(void)
printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
"%ldk reserved, %ldk data, %ldk init)\n",
(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
- end_pfn << (PAGE_SHIFT-10),
+ max_pfn << (PAGE_SHIFT-10),
codesize >> 10,
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
@@ -1182,6 +1164,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
+ unsigned long rodata_start =
+ ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /* Dynamic tracing modifies the kernel text section */
+ start = rodata_start;
+#endif
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -1191,8 +1180,7 @@ void mark_rodata_ro(void)
* The rodata section (but not the kernel text!) should also be
* not-executable.
*/
- start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
- set_memory_nx(start, (end - start) >> PAGE_SHIFT);
+ set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
rodata_test();
@@ -1214,24 +1202,26 @@ void free_initrd_mem(unsigned long start
}
#endif
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+ int flags)
{
#ifdef CONFIG_NUMA
int nid, next_nid;
+ int ret;
#endif
unsigned long pfn = phys >> PAGE_SHIFT;
- if (pfn >= end_pfn) {
+ if (pfn >= max_pfn) {
/*
* This can happen with kdump kernels when accessing
* firmware tables:
*/
if (pfn < max_pfn_mapped)
- return;
+ return -EFAULT;
- printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
+ printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
phys, len);
- return;
+ return -EFAULT;
}
/* Should check here against the e820 map to avoid double free */
@@ -1239,9 +1229,13 @@ void __init reserve_bootmem_generic(unsi
nid = phys_to_nid(phys);
next_nid = phys_to_nid(phys + len - 1);
if (nid == next_nid)
- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
else
- reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ ret = reserve_bootmem(phys, len, flags);
+
+ if (ret != 0)
+ return ret;
+
#else
reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif
@@ -1252,6 +1246,8 @@ void __init reserve_bootmem_generic(unsi
set_dma_reserve(dma_reserve);
}
#endif
+
+ return 0;
}
int kern_addr_valid(unsigned long addr)
@@ -1369,7 +1365,7 @@ vmemmap_populate(struct page *start_page
pmd_t *pmd;
for (; addr < end; addr = next) {
- next = pmd_addr_end(addr, end);
+ void *p = NULL;
pgd = vmemmap_pgd_populate(addr, node);
if (!pgd)
@@ -1379,33 +1375,51 @@ vmemmap_populate(struct page *start_page
if (!pud)
return -ENOMEM;
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd)) {
- pte_t entry;
- void *p;
+ if (!cpu_has_pse) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ pmd = vmemmap_pmd_populate(pud, addr, node);
+
+ if (!pmd)
+ return -ENOMEM;
+
+ p = vmemmap_pte_populate(pmd, addr, node);
- p = vmemmap_alloc_block(PMD_SIZE, node);
if (!p)
return -ENOMEM;
- entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
- PAGE_KERNEL_LARGE);
- set_pmd(pmd, __pmd_ma(__pte_val(entry)));
-
- /* check to see if we have contiguous blocks */
- if (p_end != p || node_start != node) {
- if (p_start)
- printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
- addr_start, addr_end-1, p_start, p_end-1, node_start);
- addr_start = addr;
- node_start = node;
- p_start = p;
- }
- addr_end = addr + PMD_SIZE;
- p_end = p + PMD_SIZE;
+ addr_end = addr + PAGE_SIZE;
+ p_end = p + PAGE_SIZE;
} else {
- vmemmap_verify((pte_t *)pmd, node, addr, next);
+ next = pmd_addr_end(addr, end);
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ pte_t entry;
+
+ p = vmemmap_alloc_block(PMD_SIZE, node);
+ if (!p)
+ return -ENOMEM;
+
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE);
+ set_pmd(pmd, __pmd_ma(__pte_val(entry)));
+
+ /* check to see if we have contiguous blocks */
+ if (p_end != p || node_start != node) {
+ if (p_start)
+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ addr_start = addr;
+ node_start = node;
+ p_start = p;
+ }
+
+ addr_end = addr + PMD_SIZE;
+ p_end = p + PMD_SIZE;
+ } else
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
}
+
}
return 0;
}
--- head-2011-03-11.orig/arch/x86/mm/ioremap-xen.c 2011-02-07 15:39:13.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/ioremap-xen.c 2011-02-07 15:40:39.000000000 +0100
@@ -13,6 +13,7 @@
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/mmiotrace.h>
#include <asm/cacheflush.h>
#include <asm/e820.h>
@@ -255,7 +256,8 @@ int ioremap_check_change_attr(unsigned l
for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) {
unsigned long pfn = mfn_to_local_pfn(mfn);
- if (pfn >= max_pfn_mapped)
+ if (pfn >= max_low_pfn_mapped &&
+ (pfn < (1UL<<(32 - PAGE_SHIFT)) || pfn >= max_pfn_mapped))
continue;
rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT),
PAGE_SIZE, prot_val);
@@ -278,11 +280,14 @@ static void __iomem *__ioremap_caller(re
{
unsigned long offset, vaddr;
phys_addr_t mfn, last_addr;
+ const resource_size_t unaligned_phys_addr = phys_addr;
+ const unsigned long unaligned_size = size;
struct vm_struct *area;
unsigned long new_prot_val;
pgprot_t prot;
int retval;
domid_t domid = DOMID_IO;
+ void __iomem *ret_addr;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
@@ -299,7 +304,7 @@ static void __iomem *__ioremap_caller(re
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
- if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS)
+ if (is_initial_xendomain() && is_ISA_range(phys_addr, last_addr))
return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
/*
@@ -323,7 +328,7 @@ static void __iomem *__ioremap_caller(re
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
- retval = reserve_memtype(phys_addr, phys_addr + size,
+ retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
prot_val, &new_prot_val);
if (retval) {
pr_debug("Warning: reserve_memtype returned %d\n", retval);
@@ -391,7 +396,10 @@ static void __iomem *__ioremap_caller(re
return NULL;
}
- return (void __iomem *) (vaddr + offset);
+ ret_addr = (void __iomem *) (vaddr + offset);
+ mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
+
+ return ret_addr;
}
/**
@@ -419,7 +427,7 @@ void __iomem *ioremap_nocache(resource_s
{
/*
* Ideally, this should be:
- * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
+ * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
*
* Till we fix all X drivers to use ioremap_wc(), we will use
* UC MINUS.
@@ -443,7 +451,7 @@ EXPORT_SYMBOL(ioremap_nocache);
*/
void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
{
- if (pat_wc_enabled)
+ if (pat_enabled)
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
__builtin_return_address(0));
else
@@ -483,6 +491,14 @@ static void __iomem *ioremap_default(res
}
#endif
+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long prot_val)
+{
+ return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_prot);
+
/**
* iounmap - Free a IO remapping
* @addr: virtual address from ioremap_*
@@ -507,6 +523,8 @@ void iounmap(volatile void __iomem *addr
addr = (volatile void __iomem *)
(PAGE_MASK & (unsigned long __force)addr);
+ mmiotrace_iounmap(addr);
+
/* Use the vm area unlocked, assuming the caller
ensures there isn't another iounmap for the same address
in parallel. Reuse of the virtual address is prevented by
@@ -514,7 +532,7 @@ void iounmap(volatile void __iomem *addr
cpa takes care of the direct mappings. */
read_lock(&vmlist_lock);
for (p = vmlist; p; p = p->next) {
- if (p->addr == addr)
+ if (p->addr == (void __force *)addr)
break;
}
read_unlock(&vmlist_lock);
@@ -528,7 +546,7 @@ void iounmap(volatile void __iomem *addr
free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
/* Finally remove it */
- o = remove_vm_area((void *)addr);
+ o = remove_vm_area((void __force *)addr);
BUG_ON(p != o || o == NULL);
kfree(p);
}
@@ -548,7 +566,7 @@ void *xlate_dev_mem_ptr(unsigned long ph
if (page_is_ram(start >> PAGE_SHIFT))
return __va(phys);
- addr = (void *)ioremap_default(start, PAGE_SIZE);
+ addr = (void __force *)ioremap_default(start, PAGE_SIZE);
if (addr)
addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
@@ -576,8 +594,7 @@ static int __init early_ioremap_debug_se
early_param("early_ioremap_debug", early_ioremap_debug_setup);
static __initdata int after_paging_init;
-static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
- __section(.bss.page_aligned);
+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
#ifdef CONFIG_X86_32
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
@@ -676,10 +693,11 @@ static void __init __early_set_fixmap(en
return;
}
pte = early_ioremap_pte(addr);
+
if (pgprot_val(flags))
set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
else
- pte_clear(NULL, addr, pte);
+ pte_clear(&init_mm, addr, pte);
__flush_tlb_one(addr);
}
@@ -707,13 +725,11 @@ static int __init check_early_ioremap_le
{
if (!early_ioremap_nested)
return 0;
-
- printk(KERN_WARNING
+ WARN(1, KERN_WARNING
"Debug warning: early ioremap leak of %d areas detected.\n",
- early_ioremap_nested);
+ early_ioremap_nested);
printk(KERN_WARNING
- "please boot with early_ioremap_debug and report the dmesg.\n");
- WARN_ON(1);
+ "please boot with early_ioremap_debug and report the dmesg.\n");
return 1;
}
--- head-2011-03-11.orig/arch/x86/mm/pageattr-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/pageattr-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -34,6 +34,47 @@ struct cpa_data {
unsigned force_split : 1;
};
+#ifdef CONFIG_PROC_FS
+static unsigned long direct_pages_count[PG_LEVEL_NUM];
+
+void update_page_count(int level, unsigned long pages)
+{
+ unsigned long flags;
+
+ /* Protect against CPA */
+ spin_lock_irqsave(&pgd_lock, flags);
+ direct_pages_count[level] += pages;
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+static void split_page_count(int level)
+{
+ direct_pages_count[level]--;
+ direct_pages_count[level - 1] += PTRS_PER_PTE;
+}
+
+int arch_report_meminfo(char *page)
+{
+ int n = sprintf(page, "DirectMap4k: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_4K] << 2);
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+ n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_2M] << 11);
+#else
+ n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_2M] << 12);
+#endif
+#ifdef CONFIG_X86_64
+ if (direct_gbpages)
+ n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_1G] << 20);
+#endif
+ return n;
+}
+#else
+static inline void split_page_count(int level) { }
+#endif
+
#ifdef CONFIG_X86_64
static inline unsigned long highmap_start_pfn(void)
@@ -106,7 +147,7 @@ static void cpa_flush_all(unsigned long
{
BUG_ON(irqs_disabled());
- on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
+ on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
static void __cpa_flush_range(void *arg)
@@ -127,7 +168,7 @@ static void cpa_flush_range(unsigned lon
BUG_ON(irqs_disabled());
WARN_ON(PAGE_ALIGN(start) != start);
- on_each_cpu(__cpa_flush_range, NULL, 1, 1);
+ on_each_cpu(__cpa_flush_range, NULL, 1);
if (!cache)
return;
@@ -229,6 +270,7 @@ pte_t *lookup_address(unsigned long addr
return pte_offset_kernel(pmd, address);
}
+EXPORT_SYMBOL_GPL(lookup_address);
/*
* Set the new pmd in all the pgds we know about:
@@ -509,6 +551,16 @@ static int split_large_page(pte_t *kpte,
}
#endif
+ if (address >= (unsigned long)__va(0) &&
+ address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+ split_page_count(level);
+
+#ifdef CONFIG_X86_64
+ if (address >= (unsigned long)__va(1UL<<32) &&
+ address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
+ split_page_count(level);
+#endif
+
/*
* Get the target mfn from the original entry:
*/
@@ -566,10 +618,9 @@ repeat:
if (!__pte_val(old_pte)) {
if (!primary)
return 0;
- printk(KERN_WARNING "CPA: called for zero pte. "
+ WARN(1, KERN_WARNING "CPA: called for zero pte. "
"vaddr = %lx cpa->vaddr = %lx\n", address,
cpa->vaddr);
- WARN_ON(1);
return -EINVAL;
}
@@ -634,15 +685,24 @@ static int cpa_process_alias(struct cpa_
struct cpa_data alias_cpa;
int ret = 0;
- if (cpa->pfn > max_pfn_mapped)
+ if (cpa->pfn >= max_pfn_mapped)
return 0;
+#ifdef CONFIG_X86_64
+ if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
+ return 0;
+#endif
/*
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (!within(cpa->vaddr, PAGE_OFFSET,
- PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+ if (!(within(cpa->vaddr, PAGE_OFFSET,
+ PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
+#ifdef CONFIG_X86_64
+ || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
+#endif
+ )) {
alias_cpa = *cpa;
alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
@@ -796,6 +856,51 @@ static inline int change_page_attr_clear
return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
}
+#ifdef CONFIG_XEN
+static void _free_memtype(u64 pstart, u64 pend)
+{
+ u64 pa = pstart &= __PHYSICAL_MASK;
+ u64 ma = phys_to_machine(pa);
+
+ while ((pa += PAGE_SIZE) < pend) {
+ if (phys_to_machine(pa) != ma + (pa - pstart)) {
+ free_memtype(ma, ma + (pa - pstart));
+ pstart = pa;
+ ma = phys_to_machine(pa);
+ }
+ }
+ free_memtype(ma, ma + (pend - pstart));
+}
+#define free_memtype _free_memtype
+
+static int _reserve_memtype(u64 pstart, u64 pend, unsigned long req_type)
+{
+ u64 pcur = pstart &= __PHYSICAL_MASK, pa = pcur;
+ u64 ma = phys_to_machine(pa);
+ int rc = 0;
+
+ while ((pa += PAGE_SIZE) < pend) {
+ if (phys_to_machine(pa) != ma + (pa - pcur)) {
+ rc = reserve_memtype(ma, ma + (pa - pcur),
+ req_type, NULL);
+ if (rc)
+ break;
+ pcur = pa;
+ ma = phys_to_machine(pa);
+ }
+ }
+ if (likely(!rc))
+ rc = reserve_memtype(ma, ma + (pend - pcur), req_type, NULL);
+
+ if (unlikely(!rc) && pstart < pcur)
+ _free_memtype(pstart, pcur);
+
+ return rc;
+}
+#define reserve_memtype(s, e, r, n) \
+ _reserve_memtype(s, e, BUILD_BUG_ON_ZERO(n) ?: (r))
+#endif
+
int _set_memory_uc(unsigned long addr, int numpages)
{
/*
@@ -810,7 +915,7 @@ int set_memory_uc(unsigned long addr, in
/*
* for now UC MINUS. see comments in ioremap_nocache()
*/
- if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+ if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
_PAGE_CACHE_UC_MINUS, NULL))
return -EINVAL;
@@ -826,10 +931,10 @@ int _set_memory_wc(unsigned long addr, i
int set_memory_wc(unsigned long addr, int numpages)
{
- if (!pat_wc_enabled)
+ if (!pat_enabled)
return set_memory_uc(addr, numpages);
- if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+ if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
_PAGE_CACHE_WC, NULL))
return -EINVAL;
@@ -845,7 +950,7 @@ int _set_memory_wb(unsigned long addr, i
int set_memory_wb(unsigned long addr, int numpages)
{
- free_memtype(addr, addr + numpages * PAGE_SIZE);
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
return _set_memory_wb(addr, numpages);
}
--- head-2011-03-11.orig/arch/x86/mm/pat-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/pat-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -12,6 +12,8 @@
#include <linux/gfp.h>
#include <linux/fs.h>
#include <linux/bootmem.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <asm/msr.h>
#include <asm/tlbflush.h>
@@ -26,11 +28,11 @@
#include <asm/io.h>
#ifdef CONFIG_X86_PAT
-int __read_mostly pat_wc_enabled = 1;
+int __read_mostly pat_enabled = 1;
void __cpuinit pat_disable(char *reason)
{
- pat_wc_enabled = 0;
+ pat_enabled = 0;
printk(KERN_INFO "%s\n", reason);
}
@@ -42,6 +44,19 @@ static int __init nopat(char *str)
early_param("nopat", nopat);
#endif
+
+static int debug_enable;
+static int __init pat_debug_setup(char *str)
+{
+ debug_enable = 1;
+ return 0;
+}
+__setup("debugpat", pat_debug_setup);
+
+#define dprintk(fmt, arg...) \
+ do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
+
+
static u64 __read_mostly boot_pat_state;
enum {
@@ -53,24 +68,25 @@ enum {
PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
};
-#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
+#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
void pat_init(void)
{
u64 pat;
- if (!pat_wc_enabled)
+ if (!pat_enabled)
return;
/* Paranoia check. */
- if (!cpu_has_pat) {
- printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
+ if (!cpu_has_pat && boot_pat_state) {
/*
- * Panic if this happens on the secondary CPU, and we
+ * If this happens we are on a secondary CPU, but
* switched to PAT on the boot CPU. We have no way to
* undo PAT.
- */
- BUG_ON(boot_pat_state);
+ */
+ printk(KERN_ERR "PAT enabled, "
+ "but not supported by secondary CPU\n");
+ BUG();
}
#ifndef CONFIG_XEN
@@ -87,8 +103,8 @@ void pat_init(void)
* 011 UC _PAGE_CACHE_UC
* PAT bit unused
*/
- pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
- PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
+ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
/* Boot CPU check */
if (!boot_pat_state)
@@ -113,13 +129,13 @@ void pat_init(void)
static char *cattr_name(unsigned long flags)
{
switch (flags & _PAGE_CACHE_MASK) {
- case _PAGE_CACHE_UC: return "uncached";
- case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
- case _PAGE_CACHE_WB: return "write-back";
- case _PAGE_CACHE_WC: return "write-combining";
- case _PAGE_CACHE_WP: return "write-protected";
- case _PAGE_CACHE_WT: return "write-through";
- default: return "broken";
+ case _PAGE_CACHE_UC: return "uncached";
+ case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
+ case _PAGE_CACHE_WB: return "write-back";
+ case _PAGE_CACHE_WC: return "write-combining";
+ case _PAGE_CACHE_WP: return "write-protected";
+ case _PAGE_CACHE_WT: return "write-through";
+ default: return "broken";
}
}
@@ -157,49 +173,55 @@ static DEFINE_SPINLOCK(memtype_lock); /
* The intersection is based on "Effective Memory Type" tables in IA-32
* SDM vol 3a
*/
-static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
- unsigned long *ret_prot)
+static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
{
- unsigned long pat_type;
- u8 mtrr_type;
-
- pat_type = prot & _PAGE_CACHE_MASK;
- prot &= (~_PAGE_CACHE_MASK);
-
- /*
- * We return the PAT request directly for types where PAT takes
- * precedence with respect to MTRR and for UC_MINUS.
- * Consistency checks with other PAT requests is done later
- * while going through memtype list.
- */
- if (pat_type == _PAGE_CACHE_WC) {
- *ret_prot = prot | _PAGE_CACHE_WC;
- return 0;
- } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
- *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
- return 0;
- } else if (pat_type == _PAGE_CACHE_UC) {
- *ret_prot = prot | _PAGE_CACHE_UC;
- return 0;
- }
-
/*
* Look for MTRR hint to get the effective type in case where PAT
* request is for WB.
*/
- mtrr_type = mtrr_type_lookup(start, end);
+ if (req_type == _PAGE_CACHE_WB) {
+ u8 mtrr_type;
- if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
- *ret_prot = prot | _PAGE_CACHE_UC;
- } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
- *ret_prot = prot | _PAGE_CACHE_WC;
- } else {
- *ret_prot = prot | _PAGE_CACHE_WB;
+ mtrr_type = mtrr_type_lookup(start, end);
+ if (mtrr_type == MTRR_TYPE_UNCACHABLE)
+ return _PAGE_CACHE_UC;
+ if (mtrr_type == MTRR_TYPE_WRCOMB)
+ return _PAGE_CACHE_WC;
+ }
+
+ return req_type;
+}
+
+static int chk_conflict(struct memtype *new, struct memtype *entry,
+ unsigned long *type)
+{
+ if (new->type != entry->type) {
+ if (type) {
+ new->type = entry->type;
+ *type = entry->type;
+ } else
+ goto conflict;
}
+ /* check overlaps with more than one entry in the list */
+ list_for_each_entry_continue(entry, &memtype_list, nd) {
+ if (new->end <= entry->start)
+ break;
+ else if (new->type != entry->type)
+ goto conflict;
+ }
return 0;
+
+ conflict:
+ printk(KERN_INFO "%s:%d conflicting memory types "
+ "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
+ new->end, cattr_name(new->type), cattr_name(entry->type));
+ return -EBUSY;
}
+static struct memtype *cached_entry;
+static u64 cached_start;
+
/*
* req_type typically has one of the:
* - _PAGE_CACHE_WB
@@ -210,37 +232,36 @@ static int pat_x_mtrr_type(u64 start, u6
* req_type will have a special case value '-1', when requester want to inherit
* the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
*
- * If ret_type is NULL, function will return an error if it cannot reserve the
- * region with req_type. If ret_type is non-null, function will return
- * available type in ret_type in case of no error. In case of any error
+ * If new_type is NULL, function will return an error if it cannot reserve the
+ * region with req_type. If new_type is non-NULL, function will return
+ * available type in new_type in case of no error. In case of any error
* it will return a negative return value.
*/
int reserve_memtype(u64 start, u64 end, unsigned long req_type,
- unsigned long *ret_type)
+ unsigned long *new_type)
{
- struct memtype *new_entry = NULL;
- struct memtype *parse;
+ struct memtype *new, *entry;
unsigned long actual_type;
+ struct list_head *where;
int err = 0;
- /* Only track when pat_wc_enabled */
- if (!pat_wc_enabled) {
+ BUG_ON(start >= end); /* end is exclusive */
+
+ if (!pat_enabled) {
/* This is identical to page table setting without PAT */
- if (ret_type) {
- if (req_type == -1) {
- *ret_type = _PAGE_CACHE_WB;
- } else {
- *ret_type = req_type;
- }
+ if (new_type) {
+ if (req_type == -1)
+ *new_type = _PAGE_CACHE_WB;
+ else
+ *new_type = req_type & _PAGE_CACHE_MASK;
}
return 0;
}
/* Low ISA region is always mapped WB in page table. No need to track */
- if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
- if (ret_type)
- *ret_type = _PAGE_CACHE_WB;
-
+ if (is_ISA_range(start, end - 1)) {
+ if (new_type)
+ *new_type = _PAGE_CACHE_WB;
return 0;
}
@@ -253,206 +274,118 @@ int reserve_memtype(u64 start, u64 end,
*/
u8 mtrr_type = mtrr_type_lookup(start, end);
- if (mtrr_type == MTRR_TYPE_WRBACK) {
- req_type = _PAGE_CACHE_WB;
+ if (mtrr_type == MTRR_TYPE_WRBACK)
actual_type = _PAGE_CACHE_WB;
- } else {
- req_type = _PAGE_CACHE_UC_MINUS;
+ else
actual_type = _PAGE_CACHE_UC_MINUS;
- }
- } else {
- req_type &= _PAGE_CACHE_MASK;
- err = pat_x_mtrr_type(start, end, req_type, &actual_type);
- }
-
- if (err) {
- if (ret_type)
- *ret_type = actual_type;
+ } else
+ actual_type = pat_x_mtrr_type(start, end,
+ req_type & _PAGE_CACHE_MASK);
- return -EINVAL;
- }
-
- new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
- if (!new_entry)
+ new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!new)
return -ENOMEM;
- new_entry->start = start;
- new_entry->end = end;
- new_entry->type = actual_type;
+ new->start = start;
+ new->end = end;
+ new->type = actual_type;
- if (ret_type)
- *ret_type = actual_type;
+ if (new_type)
+ *new_type = actual_type;
spin_lock(&memtype_lock);
- /* Search for existing mapping that overlaps the current range */
- list_for_each_entry(parse, &memtype_list, nd) {
- struct memtype *saved_ptr;
+ if (cached_entry && start >= cached_start)
+ entry = cached_entry;
+ else
+ entry = list_entry(&memtype_list, struct memtype, nd);
- if (parse->start >= end) {
- pr_debug("New Entry\n");
- list_add(&new_entry->nd, parse->nd.prev);
- new_entry = NULL;
+ /* Search for existing mapping that overlaps the current range */
+ where = NULL;
+ list_for_each_entry_continue(entry, &memtype_list, nd) {
+ if (end <= entry->start) {
+ where = entry->nd.prev;
+ cached_entry = list_entry(where, struct memtype, nd);
break;
- }
-
- if (start <= parse->start && end >= parse->start) {
- if (actual_type != parse->type && ret_type) {
- actual_type = parse->type;
- *ret_type = actual_type;
- new_entry->type = actual_type;
- }
-
- if (actual_type != parse->type) {
- printk(
- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid,
- start, end,
- cattr_name(actual_type),
- cattr_name(parse->type));
- err = -EBUSY;
- break;
- }
-
- saved_ptr = parse;
- /*
- * Check to see whether the request overlaps more
- * than one entry in the list
- */
- list_for_each_entry_continue(parse, &memtype_list, nd) {
- if (end <= parse->start) {
- break;
- }
-
- if (actual_type != parse->type) {
- printk(
- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid,
- start, end,
- cattr_name(actual_type),
- cattr_name(parse->type));
- err = -EBUSY;
- break;
- }
- }
-
- if (err) {
- break;
+ } else if (start <= entry->start) { /* end > entry->start */
+ err = chk_conflict(new, entry, new_type);
+ if (!err) {
+ dprintk("Overlap at 0x%Lx-0x%Lx\n",
+ entry->start, entry->end);
+ where = entry->nd.prev;
+ cached_entry = list_entry(where,
+ struct memtype, nd);
}
-
- pr_debug("Overlap at 0x%Lx-0x%Lx\n",
- saved_ptr->start, saved_ptr->end);
- /* No conflict. Go ahead and add this new entry */
- list_add(&new_entry->nd, saved_ptr->nd.prev);
- new_entry = NULL;
break;
- }
-
- if (start < parse->end) {
- if (actual_type != parse->type && ret_type) {
- actual_type = parse->type;
- *ret_type = actual_type;
- new_entry->type = actual_type;
- }
-
- if (actual_type != parse->type) {
- printk(
- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid,
- start, end,
- cattr_name(actual_type),
- cattr_name(parse->type));
- err = -EBUSY;
- break;
- }
-
- saved_ptr = parse;
- /*
- * Check to see whether the request overlaps more
- * than one entry in the list
- */
- list_for_each_entry_continue(parse, &memtype_list, nd) {
- if (end <= parse->start) {
- break;
- }
-
- if (actual_type != parse->type) {
- printk(
- KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid,
- start, end,
- cattr_name(actual_type),
- cattr_name(parse->type));
- err = -EBUSY;
- break;
+ } else if (start < entry->end) { /* start > entry->start */
+ err = chk_conflict(new, entry, new_type);
+ if (!err) {
+ dprintk("Overlap at 0x%Lx-0x%Lx\n",
+ entry->start, entry->end);
+ cached_entry = list_entry(entry->nd.prev,
+ struct memtype, nd);
+
+ /*
+ * Move to right position in the linked
+ * list to add this new entry
+ */
+ list_for_each_entry_continue(entry,
+ &memtype_list, nd) {
+ if (start <= entry->start) {
+ where = entry->nd.prev;
+ break;
+ }
}
}
-
- if (err) {
- break;
- }
-
- pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
- saved_ptr->start, saved_ptr->end);
- /* No conflict. Go ahead and add this new entry */
- list_add(&new_entry->nd, &saved_ptr->nd);
- new_entry = NULL;
break;
}
}
if (err) {
- printk(KERN_INFO
- "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
- start, end, cattr_name(new_entry->type),
- cattr_name(req_type));
- kfree(new_entry);
+ printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
+ "track %s, req %s\n",
+ start, end, cattr_name(new->type), cattr_name(req_type));
+ kfree(new);
spin_unlock(&memtype_lock);
return err;
}
- if (new_entry) {
- /* No conflict. Not yet added to the list. Add to the tail */
- list_add_tail(&new_entry->nd, &memtype_list);
- pr_debug("New Entry\n");
- }
+ cached_start = start;
- if (ret_type) {
- pr_debug(
- "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
- start, end, cattr_name(actual_type),
- cattr_name(req_type), cattr_name(*ret_type));
- } else {
- pr_debug(
- "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
- start, end, cattr_name(actual_type),
- cattr_name(req_type));
- }
+ if (where)
+ list_add(&new->nd, where);
+ else
+ list_add_tail(&new->nd, &memtype_list);
spin_unlock(&memtype_lock);
+
+ dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
+ start, end, cattr_name(new->type), cattr_name(req_type),
+ new_type ? cattr_name(*new_type) : "-");
+
return err;
}
int free_memtype(u64 start, u64 end)
{
- struct memtype *ml;
+ struct memtype *entry;
int err = -EINVAL;
- /* Only track when pat_wc_enabled */
- if (!pat_wc_enabled) {
+ if (!pat_enabled)
return 0;
- }
/* Low ISA region is always mapped WB. No need to track */
- if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
+ if (is_ISA_range(start, end - 1))
return 0;
- }
spin_lock(&memtype_lock);
- list_for_each_entry(ml, &memtype_list, nd) {
- if (ml->start == start && ml->end == end) {
- list_del(&ml->nd);
- kfree(ml);
+ list_for_each_entry(entry, &memtype_list, nd) {
+ if (entry->start == start && entry->end == end) {
+ if (cached_entry == entry || cached_start == start)
+ cached_entry = NULL;
+
+ list_del(&entry->nd);
+ kfree(entry);
err = 0;
break;
}
@@ -464,27 +397,19 @@ int free_memtype(u64 start, u64 end)
current->comm, current->pid, start, end);
}
- pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+ dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
return err;
}
-/*
- * /dev/mem mmap interface. The memtype used for mapping varies:
- * - Use UC for mappings with O_SYNC flag
- * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
- * inherit the memtype from existing mapping.
- * - Else use UC_MINUS memtype (for backward compatibility with existing
- * X drivers.
- */
pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
unsigned long size, pgprot_t vma_prot)
{
return vma_prot;
}
-#ifdef CONFIG_NONPROMISC_DEVMEM
-/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
+#ifdef CONFIG_STRICT_DEVMEM
+/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
static inline int range_is_allowed(unsigned long mfn, unsigned long size)
{
return 1;
@@ -508,20 +433,20 @@ static inline int range_is_allowed(unsig
}
return 1;
}
-#endif /* CONFIG_NONPROMISC_DEVMEM */
+#endif /* CONFIG_STRICT_DEVMEM */
int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
unsigned long size, pgprot_t *vma_prot)
{
u64 addr = (u64)mfn << PAGE_SHIFT;
- unsigned long flags = _PAGE_CACHE_UC_MINUS;
+ unsigned long flags = -1;
int retval;
if (!range_is_allowed(mfn, size))
return 0;
if (file->f_flags & O_SYNC) {
- flags = _PAGE_CACHE_UC;
+ flags = _PAGE_CACHE_UC_MINUS;
}
#ifndef CONFIG_X86_32
@@ -534,25 +459,26 @@ int phys_mem_access_prot_allowed(struct
* caching for the high addresses through the KEN pin, but
* we maintain the tradition of paranoia in this code.
*/
- if (!pat_wc_enabled &&
- ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
- test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
- test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
- test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
- (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
+ if (!pat_enabled &&
+ !(boot_cpu_has(X86_FEATURE_MTRR) ||
+ boot_cpu_has(X86_FEATURE_K6_MTRR) ||
+ boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
+ boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
+ (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
flags = _PAGE_CACHE_UC;
}
#endif
#endif
/*
- * With O_SYNC, we can only take UC mapping. Fail if we cannot.
+ * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
+ *
* Without O_SYNC, we want to get
* - WB for WB-able memory and no other conflicting mappings
* - UC_MINUS for non-WB-able memory with no other conflicting mappings
* - Inherit from confliting mappings otherwise
*/
- if (flags != _PAGE_CACHE_UC_MINUS) {
+ if (flags != -1) {
retval = reserve_memtype(addr, addr + size, flags, NULL);
} else {
retval = reserve_memtype(addr, addr + size, -1, &flags);
@@ -600,3 +526,88 @@ void unmap_devmem(unsigned long mfn, uns
free_memtype(addr, addr + size);
}
+#if defined(CONFIG_DEBUG_FS)
+
+/* get Nth element of the linked list */
+static struct memtype *memtype_get_idx(loff_t pos)
+{
+ struct memtype *list_node, *print_entry;
+ int i = 1;
+
+ print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!print_entry)
+ return NULL;
+
+ spin_lock(&memtype_lock);
+ list_for_each_entry(list_node, &memtype_list, nd) {
+ if (pos == i) {
+ *print_entry = *list_node;
+ spin_unlock(&memtype_lock);
+ return print_entry;
+ }
+ ++i;
+ }
+ spin_unlock(&memtype_lock);
+ kfree(print_entry);
+ return NULL;
+}
+
+static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos == 0) {
+ ++*pos;
+ seq_printf(seq, "PAT memtype list:\n");
+ }
+
+ return memtype_get_idx(*pos);
+}
+
+static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return memtype_get_idx(*pos);
+}
+
+static void memtype_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int memtype_seq_show(struct seq_file *seq, void *v)
+{
+ struct memtype *print_entry = (struct memtype *)v;
+
+ seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
+ print_entry->start, print_entry->end);
+ kfree(print_entry);
+ return 0;
+}
+
+static struct seq_operations memtype_seq_ops = {
+ .start = memtype_seq_start,
+ .next = memtype_seq_next,
+ .stop = memtype_seq_stop,
+ .show = memtype_seq_show,
+};
+
+static int memtype_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &memtype_seq_ops);
+}
+
+static const struct file_operations memtype_fops = {
+ .open = memtype_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init pat_memtype_list_init(void)
+{
+ debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
+ NULL, &memtype_fops);
+ return 0;
+}
+
+late_initcall(pat_memtype_list_init);
+
+#endif /* CONFIG_DEBUG_FS */
--- head-2011-03-11.orig/arch/x86/mm/pgtable-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/pgtable-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -4,6 +4,7 @@
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
+#include <asm/fixmap.h>
#include <asm/hypervisor.h>
#include <asm/mmu_context.h>
@@ -410,15 +411,9 @@ static inline void pgd_list_del(pgd_t *p
static void pgd_ctor(void *p)
{
pgd_t *pgd = p;
- unsigned long flags;
pgd_test_and_unpin(pgd);
- /* Clear usermode parts of PGD */
- memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
-
- spin_lock_irqsave(&pgd_lock, flags);
-
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
@@ -440,13 +435,9 @@ static void pgd_ctor(void *p)
__pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
#endif
-#ifndef CONFIG_X86_PAE
/* list required to sync kernel mapping updates */
if (!SHARED_KERNEL_PMD)
pgd_list_add(pgd);
-#endif
-
- spin_unlock_irqrestore(&pgd_lock, flags);
}
static void pgd_dtor(void *pgd)
@@ -475,33 +466,6 @@ static void pgd_dtor(void *pgd)
#ifdef CONFIG_X86_PAE
/*
- * Mop up any pmd pages which may still be attached to the pgd.
- * Normally they will be freed by munmap/exit_mmap, but any pmd we
- * preallocate which never got a corresponding vma will need to be
- * freed manually.
- */
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
- int i;
-
- for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
- pgd_t pgd = pgdp[i];
-
- if (__pgd_val(pgd) != 0) {
- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
-
- pgdp[i] = xen_make_pgd(0);
-
- paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
- pmd_free(mm, pmd);
- }
- }
-
- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
- xen_destroy_contiguous_region((unsigned long)pgdp, 0);
-}
-
-/*
* In PAE mode, we need to do a cr3 reload (=tlb flush) when
* updating the top-level pagetable entries to guarantee the
* processor notices the update. Since this is expensive, and
@@ -512,61 +476,7 @@ static void pgd_mop_up_pmds(struct mm_st
* not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
* and initialize the kernel pmds here.
*/
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
- pud_t *pud;
- pmd_t *pmds[UNSHARED_PTRS_PER_PGD];
- unsigned long addr, flags;
- int i;
-
- /*
- * We can race save/restore (if we sleep during a GFP_KERNEL memory
- * allocation). We therefore store virtual addresses of pmds as they
- * do not change across save/restore, and poke the machine addresses
- * into the pgdir under the pgd_lock.
- */
- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) {
- pmds[i] = pmd_alloc_one(mm, addr);
- if (!pmds[i])
- goto out_oom;
- }
-
- spin_lock_irqsave(&pgd_lock, flags);
-
- /* Protect against save/restore: move below 4GB under pgd_lock. */
- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
- && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
- spin_unlock_irqrestore(&pgd_lock, flags);
-out_oom:
- while (i--)
- pmd_free(mm, pmds[i]);
- return 0;
- }
-
- /* Copy kernel pmd contents and write-protect the new pmds. */
- pud = pud_offset(pgd, 0);
- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
- i++, pud++, addr += PUD_SIZE) {
- if (i >= KERNEL_PGD_BOUNDARY) {
- memcpy(pmds[i],
- (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
- sizeof(pmd_t) * PTRS_PER_PMD);
- make_lowmem_page_readonly(
- pmds[i], XENFEAT_writable_page_tables);
- }
-
- /* It is safe to poke machine addresses of pmds under the pgd_lock. */
- pud_populate(mm, pud, pmds[i]);
- }
-
- /* List required to sync kernel mapping updates and
- * to pin/unpin on save/restore. */
- pgd_list_add(pgd);
-
- spin_unlock_irqrestore(&pgd_lock, flags);
-
- return 1;
-}
+#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
@@ -596,16 +506,101 @@ void pud_populate(struct mm_struct *mm,
xen_tlb_flush();
}
#else /* !CONFIG_X86_PAE */
+
/* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+#define PREALLOCATED_PMDS 0
+
+#endif /* CONFIG_X86_PAE */
+
+static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
{
- return 1;
+ int i;
+
+#ifdef CONFIG_X86_PAE
+ if (contig)
+ xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
+#endif
+
+ for(i = 0; i < PREALLOCATED_PMDS; i++)
+ if (pmds[i])
+ pmd_free(mm, pmds[i]);
}
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
+static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
{
+ int i;
+ bool failed = false;
+
+ for(i = 0; i < PREALLOCATED_PMDS; i++) {
+ pmd_t *pmd = pmd_alloc_one(mm, i << PUD_SHIFT);
+ if (pmd == NULL)
+ failed = true;
+ pmds[i] = pmd;
+ }
+
+ if (failed) {
+ free_pmds(pmds, mm, false);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+ int i;
+
+ for(i = 0; i < PREALLOCATED_PMDS; i++) {
+ pgd_t pgd = pgdp[i];
+
+ if (__pgd_val(pgd) != 0) {
+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+ pgdp[i] = xen_make_pgd(0);
+
+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+ pmd_free(mm, pmd);
+ }
+ }
+
+#ifdef CONFIG_X86_PAE
+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
+ xen_destroy_contiguous_region((unsigned long)pgdp, 0);
+#endif
+}
+
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
+{
+ pud_t *pud;
+ unsigned long addr;
+ int i;
+
+ if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
+ return;
+
+ pud = pud_offset(pgd, 0);
+ for (addr = i = 0; i < PREALLOCATED_PMDS;
+ i++, pud++, addr += PUD_SIZE) {
+ pmd_t *pmd = pmds[i];
+
+ if (i >= KERNEL_PGD_BOUNDARY) {
+ memcpy(pmd,
+ (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+ make_lowmem_page_readonly(
+ pmd, XENFEAT_writable_page_tables);
+ }
+
+ /* It is safe to poke machine addresses of pmds under the pgd_lock. */
+ pud_populate(mm, pud, pmd);
+ }
}
-#endif /* CONFIG_X86_PAE */
#ifdef CONFIG_X86_64
/* We allocate two contiguous pages for kernel and user. */
@@ -616,22 +611,55 @@ static void pgd_mop_up_pmds(struct mm_st
pgd_t *pgd_alloc(struct mm_struct *mm)
{
- pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
+ pgd_t *pgd;
+ pmd_t *pmds[PREALLOCATED_PMDS];
+ unsigned long flags;
+
+ pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER);
+
+ if (pgd == NULL)
+ goto out;
- /* so that alloc_pd can use it */
mm->pgd = pgd;
- if (pgd) {
- /* Store a back link for vmalloc_sync_all(). */
- set_page_private(virt_to_page(pgd), (unsigned long)mm);
- pgd_ctor(pgd);
- }
- if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
- free_pages((unsigned long)pgd, PGD_ORDER);
- pgd = NULL;
+ if (preallocate_pmds(pmds, mm) != 0)
+ goto out_free_pgd;
+
+ if (paravirt_pgd_alloc(mm) != 0)
+ goto out_free_pmds;
+
+ /*
+ * Make sure that pre-populating the pmds is atomic with
+ * respect to anything walking the pgd_list, so that they
+ * never see a partially populated pgd.
+ */
+ spin_lock_irqsave(&pgd_lock, flags);
+
+#ifdef CONFIG_X86_PAE
+ /* Protect against save/restore: move below 4GB under pgd_lock. */
+ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
+ && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ goto out_free_pmds;
}
+#endif
+
+ pgd_ctor(pgd);
+ pgd_prepopulate_pmd(mm, pgd, pmds);
+
+ /* Store a back link for vmalloc_sync_all(). */
+ set_page_private(virt_to_page(pgd), (unsigned long)mm);
+
+ spin_unlock_irqrestore(&pgd_lock, flags);
return pgd;
+
+out_free_pmds:
+ free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
+out_free_pgd:
+ free_pages((unsigned long)pgd, PGD_ORDER);
+out:
+ return NULL;
}
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
@@ -647,6 +675,7 @@ void pgd_free(struct mm_struct *mm, pgd_
pgd_dtor(pgd);
pgd_mop_up_pmds(mm, pgd);
+ paravirt_pgd_free(mm, pgd);
free_pages((unsigned long)pgd, PGD_ORDER);
}
@@ -689,7 +718,7 @@ int ptep_test_and_clear_young(struct vm_
if (pte_young(*ptep))
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
- &ptep->pte);
+ (unsigned long *) &ptep->pte);
if (ret)
pte_update(vma->vm_mm, addr, ptep);
@@ -711,3 +740,42 @@ int ptep_clear_flush_young(struct vm_are
return young;
}
+
+int fixmaps_set;
+
+void xen_set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
+{
+ unsigned long address = __fix_to_virt(idx);
+ pte_t pte;
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+
+ switch (idx) {
+#ifdef CONFIG_X86_64
+ extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
+
+ case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+ pte = pfn_pte(phys >> PAGE_SHIFT, flags);
+ set_pte_vaddr_pud(level3_user_pgt, address, pte);
+ break;
+ case FIX_EARLYCON_MEM_BASE:
+ xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
+ pfn_pte_ma(phys >> PAGE_SHIFT, flags));
+ fixmaps_set++;
+ return;
+#else
+ case FIX_WP_TEST:
+ case FIX_VDSO:
+ pte = pfn_pte(phys >> PAGE_SHIFT, flags);
+ break;
+#endif
+ default:
+ pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
+ break;
+ }
+ set_pte_vaddr(address, pte);
+ fixmaps_set++;
+}
--- head-2011-03-11.orig/arch/x86/mm/pgtable_32-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/pgtable_32-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -25,51 +25,49 @@
#include <xen/features.h>
#include <asm/hypervisor.h>
-void show_mem(void)
+/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ */
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
- int total = 0, reserved = 0;
- int shared = 0, cached = 0;
- int highmem = 0;
- struct page *page;
- pg_data_t *pgdat;
- unsigned long i;
- unsigned long flags;
-
- printk(KERN_INFO "Mem-info:\n");
- show_free_areas();
- for_each_online_pgdat(pgdat) {
- pgdat_resize_lock(pgdat, &flags);
- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
- touch_nmi_watchdog();
- page = pgdat_page_nr(pgdat, i);
- total++;
- if (PageHighMem(page))
- highmem++;
- if (PageReserved(page))
- reserved++;
- else if (PageSwapCache(page))
- cached++;
- else if (page_count(page))
- shared += page_count(page) - 1;
- }
- pgdat_resize_unlock(pgdat, &flags);
- }
- printk(KERN_INFO "%d pages of RAM\n", total);
- printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
- printk(KERN_INFO "%d reserved pages\n", reserved);
- printk(KERN_INFO "%d pages shared\n", shared);
- printk(KERN_INFO "%d pages swap cached\n", cached);
-
- printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
- printk(KERN_INFO "%lu pages writeback\n",
- global_page_state(NR_WRITEBACK));
- printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
- printk(KERN_INFO "%lu pages slab\n",
- global_page_state(NR_SLAB_RECLAIMABLE) +
- global_page_state(NR_SLAB_UNRECLAIMABLE));
- printk(KERN_INFO "%lu pages pagetables\n",
- global_page_state(NR_PAGETABLE));
+#ifndef CONFIG_XEN
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ BUG();
+ return;
+ }
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ BUG();
+ return;
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ BUG();
+ return;
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+ if (pte_val(pteval))
+ set_pte_present(&init_mm, vaddr, pte, pteval);
+ else
+ pte_clear(&init_mm, vaddr, pte);
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+#else
+ if (HYPERVISOR_update_va_mapping(vaddr, pteval,
+ UVMF_INVLPG|UVMF_ALL))
+ BUG();
+#endif
}
/*
@@ -107,35 +105,10 @@ void set_pmd_pfn(unsigned long vaddr, un
__flush_tlb_one(vaddr);
}
-static int fixmaps;
unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
EXPORT_SYMBOL(__FIXADDR_TOP);
-void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
-{
- unsigned long address = __fix_to_virt(idx);
- pte_t pte;
-
- if (idx >= __end_of_fixed_addresses) {
- BUG();
- return;
- }
- switch (idx) {
- case FIX_WP_TEST:
- case FIX_VDSO:
- pte = pfn_pte(phys >> PAGE_SHIFT, flags);
- break;
- default:
- pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
- break;
- }
- if (HYPERVISOR_update_va_mapping(address, pte,
- UVMF_INVLPG|UVMF_ALL))
- BUG();
- fixmaps++;
-}
-
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
@@ -145,13 +118,48 @@ void __set_fixmap (enum fixed_addresses
*/
void __init reserve_top_address(unsigned long reserve)
{
- BUG_ON(fixmaps > 0);
+ BUG_ON(fixmaps_set > 0);
printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
(int)-reserve);
__FIXADDR_TOP = -reserve - PAGE_SIZE;
__VMALLOC_RESERVE += reserve;
}
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ __VMALLOC_RESERVE = memparse(arg, &arg);
+ return 0;
+}
+early_param("vmalloc", parse_vmalloc);
+
+#ifndef CONFIG_XEN
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+ unsigned long address;
+
+ if (!arg)
+ return -EINVAL;
+
+ address = memparse(arg, &arg);
+ reserve_top_address(address);
+ return 0;
+}
+early_param("reservetop", parse_reservetop);
+#endif
+
void make_lowmem_page_readonly(void *va, unsigned int feature)
{
pte_t *pte;
--- head-2011-03-11.orig/arch/x86/pci/amd_bus.c 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/pci/amd_bus.c 2011-02-01 14:38:38.000000000 +0100
@@ -350,6 +350,7 @@ static int __init early_fill_mp_bus_info
#define ENABLE_CF8_EXT_CFG (1ULL << 46)
+#ifndef CONFIG_XEN
static void enable_pci_io_ecs(void *unused)
{
u64 reg;
@@ -378,6 +379,7 @@ static int __cpuinit amd_cpu_notify(stru
static struct notifier_block __cpuinitdata amd_cpu_notifier = {
.notifier_call = amd_cpu_notify,
};
+#endif /* CONFIG_XEN */
static void __init pci_enable_pci_io_ecs(void)
{
@@ -419,10 +421,19 @@ static int __init pci_io_ecs_init(void)
if (early_pci_allowed())
pci_enable_pci_io_ecs();
+#ifndef CONFIG_XEN
register_cpu_notifier(&amd_cpu_notifier);
for_each_online_cpu(cpu)
amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
(void *)(long)cpu);
+#else
+ if (cpu = 1, cpu) {
+ u64 reg;
+ rdmsrl(MSR_AMD64_NB_CFG, reg);
+ if (!(reg & ENABLE_CF8_EXT_CFG))
+ return 0;
+ }
+#endif
pci_probe |= PCI_HAS_IO_ECS;
return 0;
@@ -430,6 +441,10 @@ static int __init pci_io_ecs_init(void)
static int __init amd_postcore_init(void)
{
+#ifdef CONFIG_XEN
+ if (!is_initial_xendomain())
+ return 0;
+#endif
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return 0;
--- head-2011-03-11.orig/arch/x86/pci/irq-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/pci/irq-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -11,8 +11,8 @@
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/dmi.h>
-#include <asm/io.h>
-#include <asm/smp.h>
+#include <linux/io.h>
+#include <linux/smp.h>
#include <asm/io_apic.h>
#include <linux/irq.h>
#include <linux/acpi.h>
@@ -45,7 +45,8 @@ struct irq_router {
char *name;
u16 vendor, device;
int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
- int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
+ int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
+ int new);
};
struct irq_router_handler {
@@ -61,7 +62,7 @@ void (*pcibios_disable_irq)(struct pci_d
* and perform checksum verification.
*/
-static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
+static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
{
struct irq_routing_table *rt;
int i;
@@ -74,10 +75,11 @@ static inline struct irq_routing_table *
rt->size < sizeof(struct irq_routing_table))
return NULL;
sum = 0;
- for (i=0; i < rt->size; i++)
+ for (i = 0; i < rt->size; i++)
sum += addr[i];
if (!sum) {
- DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
+ DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
+ rt);
return rt;
}
return NULL;
@@ -104,7 +106,9 @@ static struct irq_routing_table * __init
return rt;
printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
}
- for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
+ for (addr = (u8 *) isa_bus_to_virt(0xf0000);
+ addr < (u8 *) isa_bus_to_virt(0x100000);
+ addr += 16) {
rt = pirq_check_routing_table(addr);
if (rt)
return rt;
@@ -126,20 +130,20 @@ static void __init pirq_peer_trick(void)
struct irq_info *e;
memset(busmap, 0, sizeof(busmap));
- for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
+ for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
e = &rt->slots[i];
#ifdef DEBUG
{
int j;
DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
- for(j=0; j<4; j++)
+ for (j = 0; j < 4; j++)
DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
DBG("\n");
}
#endif
busmap[e->bus] = 1;
}
- for(i = 1; i < 256; i++) {
+ for (i = 1; i < 256; i++) {
int node;
if (!busmap[i] || pci_find_bus(0, i))
continue;
@@ -187,7 +191,8 @@ static unsigned int read_config_nybble(s
return (nr & 1) ? (x >> 4) : (x & 0xf);
}
-static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
+static void write_config_nybble(struct pci_dev *router, unsigned offset,
+ unsigned nr, unsigned int val)
{
u8 x;
unsigned reg = offset + (nr >> 1);
@@ -289,7 +294,7 @@ static int pirq_ite_get(struct pci_dev *
static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
WARN_ON_ONCE(pirq > 4);
- return read_config_nybble(router,0x43, pirqmap[pirq-1]);
+ return read_config_nybble(router, 0x43, pirqmap[pirq-1]);
}
static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
@@ -318,7 +323,7 @@ static int pirq_opti_set(struct pci_dev
/*
* Cyrix: nibble offset 0x5C
- * 0x5C bits 7:4 is INTB bits 3:0 is INTA
+ * 0x5C bits 7:4 is INTB bits 3:0 is INTA
* 0x5D bits 7:4 is INTD bits 3:0 is INTC
*/
static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
@@ -354,7 +359,7 @@ static int pirq_cyrix_set(struct pci_dev
* Apparently there are systems implementing PCI routing table using
* link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
* We try our best to handle both link mappings.
- *
+ *
* Currently (2003-05-21) it appears most SiS chipsets follow the
* definition of routing registers from the SiS-5595 southbridge.
* According to the SiS 5595 datasheets the revision id's of the
@@ -374,7 +379,7 @@ static int pirq_cyrix_set(struct pci_dev
*
* 0x62: USBIRQ:
* bit 6 OHCI function disabled (0), enabled (1)
- *
+ *
* 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
*
* 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
@@ -437,7 +442,7 @@ static int pirq_vlsi_get(struct pci_dev
{
WARN_ON_ONCE(pirq >= 9);
if (pirq > 8) {
- printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
+ dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
return 0;
}
return read_config_nybble(router, 0x74, pirq-1);
@@ -447,7 +452,7 @@ static int pirq_vlsi_set(struct pci_dev
{
WARN_ON_ONCE(pirq >= 9);
if (pirq > 8) {
- printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
+ dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
return 0;
}
write_config_nybble(router, 0x74, pirq-1, irq);
@@ -471,7 +476,8 @@ static int pirq_serverworks_get(struct p
return inb(0xc01) & 0xf;
}
-static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
+ int pirq, int irq)
{
outb(pirq, 0xc00);
outb(irq, 0xc01);
@@ -491,22 +497,20 @@ static int pirq_amd756_get(struct pci_de
u8 irq;
irq = 0;
if (pirq <= 4)
- {
irq = read_config_nybble(router, 0x56, pirq - 1);
- }
- printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
- dev->vendor, dev->device, pirq, irq);
+ dev_info(&dev->dev,
+ "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
+ dev->vendor, dev->device, pirq, irq);
return irq;
}
static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
{
- printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
- dev->vendor, dev->device, pirq, irq);
+ dev_info(&dev->dev,
+ "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
+ dev->vendor, dev->device, pirq, irq);
if (pirq <= 4)
- {
write_config_nybble(router, 0x56, pirq - 1, irq);
- }
return 1;
}
@@ -553,50 +557,51 @@ static __init int intel_router_probe(str
if (pci_dev_present(pirq_440gx))
return 0;
- switch(device)
- {
- case PCI_DEVICE_ID_INTEL_82371FB_0:
- case PCI_DEVICE_ID_INTEL_82371SB_0:
- case PCI_DEVICE_ID_INTEL_82371AB_0:
- case PCI_DEVICE_ID_INTEL_82371MX:
- case PCI_DEVICE_ID_INTEL_82443MX_0:
- case PCI_DEVICE_ID_INTEL_82801AA_0:
- case PCI_DEVICE_ID_INTEL_82801AB_0:
- case PCI_DEVICE_ID_INTEL_82801BA_0:
- case PCI_DEVICE_ID_INTEL_82801BA_10:
- case PCI_DEVICE_ID_INTEL_82801CA_0:
- case PCI_DEVICE_ID_INTEL_82801CA_12:
- case PCI_DEVICE_ID_INTEL_82801DB_0:
- case PCI_DEVICE_ID_INTEL_82801E_0:
- case PCI_DEVICE_ID_INTEL_82801EB_0:
- case PCI_DEVICE_ID_INTEL_ESB_1:
- case PCI_DEVICE_ID_INTEL_ICH6_0:
- case PCI_DEVICE_ID_INTEL_ICH6_1:
- case PCI_DEVICE_ID_INTEL_ICH7_0:
- case PCI_DEVICE_ID_INTEL_ICH7_1:
- case PCI_DEVICE_ID_INTEL_ICH7_30:
- case PCI_DEVICE_ID_INTEL_ICH7_31:
- case PCI_DEVICE_ID_INTEL_ESB2_0:
- case PCI_DEVICE_ID_INTEL_ICH8_0:
- case PCI_DEVICE_ID_INTEL_ICH8_1:
- case PCI_DEVICE_ID_INTEL_ICH8_2:
- case PCI_DEVICE_ID_INTEL_ICH8_3:
- case PCI_DEVICE_ID_INTEL_ICH8_4:
- case PCI_DEVICE_ID_INTEL_ICH9_0:
- case PCI_DEVICE_ID_INTEL_ICH9_1:
- case PCI_DEVICE_ID_INTEL_ICH9_2:
- case PCI_DEVICE_ID_INTEL_ICH9_3:
- case PCI_DEVICE_ID_INTEL_ICH9_4:
- case PCI_DEVICE_ID_INTEL_ICH9_5:
- case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
- case PCI_DEVICE_ID_INTEL_ICH10_0:
- case PCI_DEVICE_ID_INTEL_ICH10_1:
- case PCI_DEVICE_ID_INTEL_ICH10_2:
- case PCI_DEVICE_ID_INTEL_ICH10_3:
- r->name = "PIIX/ICH";
- r->get = pirq_piix_get;
- r->set = pirq_piix_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_INTEL_82371FB_0:
+ case PCI_DEVICE_ID_INTEL_82371SB_0:
+ case PCI_DEVICE_ID_INTEL_82371AB_0:
+ case PCI_DEVICE_ID_INTEL_82371MX:
+ case PCI_DEVICE_ID_INTEL_82443MX_0:
+ case PCI_DEVICE_ID_INTEL_82801AA_0:
+ case PCI_DEVICE_ID_INTEL_82801AB_0:
+ case PCI_DEVICE_ID_INTEL_82801BA_0:
+ case PCI_DEVICE_ID_INTEL_82801BA_10:
+ case PCI_DEVICE_ID_INTEL_82801CA_0:
+ case PCI_DEVICE_ID_INTEL_82801CA_12:
+ case PCI_DEVICE_ID_INTEL_82801DB_0:
+ case PCI_DEVICE_ID_INTEL_82801E_0:
+ case PCI_DEVICE_ID_INTEL_82801EB_0:
+ case PCI_DEVICE_ID_INTEL_ESB_1:
+ case PCI_DEVICE_ID_INTEL_ICH6_0:
+ case PCI_DEVICE_ID_INTEL_ICH6_1:
+ case PCI_DEVICE_ID_INTEL_ICH7_0:
+ case PCI_DEVICE_ID_INTEL_ICH7_1:
+ case PCI_DEVICE_ID_INTEL_ICH7_30:
+ case PCI_DEVICE_ID_INTEL_ICH7_31:
+ case PCI_DEVICE_ID_INTEL_ESB2_0:
+ case PCI_DEVICE_ID_INTEL_ICH8_0:
+ case PCI_DEVICE_ID_INTEL_ICH8_1:
+ case PCI_DEVICE_ID_INTEL_ICH8_2:
+ case PCI_DEVICE_ID_INTEL_ICH8_3:
+ case PCI_DEVICE_ID_INTEL_ICH8_4:
+ case PCI_DEVICE_ID_INTEL_ICH9_0:
+ case PCI_DEVICE_ID_INTEL_ICH9_1:
+ case PCI_DEVICE_ID_INTEL_ICH9_2:
+ case PCI_DEVICE_ID_INTEL_ICH9_3:
+ case PCI_DEVICE_ID_INTEL_ICH9_4:
+ case PCI_DEVICE_ID_INTEL_ICH9_5:
+ case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
+ case PCI_DEVICE_ID_INTEL_ICH10_0:
+ case PCI_DEVICE_ID_INTEL_ICH10_1:
+ case PCI_DEVICE_ID_INTEL_ICH10_2:
+ case PCI_DEVICE_ID_INTEL_ICH10_3:
+ case PCI_DEVICE_ID_INTEL_PCH_0:
+ case PCI_DEVICE_ID_INTEL_PCH_1:
+ r->name = "PIIX/ICH";
+ r->get = pirq_piix_get;
+ r->set = pirq_piix_set;
+ return 1;
}
return 0;
}
@@ -610,7 +615,7 @@ static __init int via_router_probe(struc
* workarounds for some buggy BIOSes
*/
if (device == PCI_DEVICE_ID_VIA_82C586_0) {
- switch(router->device) {
+ switch (router->device) {
case PCI_DEVICE_ID_VIA_82C686:
/*
* Asus k7m bios wrongly reports 82C686A
@@ -635,7 +640,7 @@ static __init int via_router_probe(struc
}
}
- switch(device) {
+ switch (device) {
case PCI_DEVICE_ID_VIA_82C586_0:
r->name = "VIA";
r->get = pirq_via586_get;
@@ -658,28 +663,27 @@ static __init int via_router_probe(struc
static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- switch(device)
- {
- case PCI_DEVICE_ID_VLSI_82C534:
- r->name = "VLSI 82C534";
- r->get = pirq_vlsi_get;
- r->set = pirq_vlsi_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_VLSI_82C534:
+ r->name = "VLSI 82C534";
+ r->get = pirq_vlsi_get;
+ r->set = pirq_vlsi_set;
+ return 1;
}
return 0;
}
-static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+static __init int serverworks_router_probe(struct irq_router *r,
+ struct pci_dev *router, u16 device)
{
- switch(device)
- {
- case PCI_DEVICE_ID_SERVERWORKS_OSB4:
- case PCI_DEVICE_ID_SERVERWORKS_CSB5:
- r->name = "ServerWorks";
- r->get = pirq_serverworks_get;
- r->set = pirq_serverworks_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_SERVERWORKS_OSB4:
+ case PCI_DEVICE_ID_SERVERWORKS_CSB5:
+ r->name = "ServerWorks";
+ r->get = pirq_serverworks_get;
+ r->set = pirq_serverworks_set;
+ return 1;
}
return 0;
}
@@ -688,7 +692,7 @@ static __init int sis_router_probe(struc
{
if (device != PCI_DEVICE_ID_SI_503)
return 0;
-
+
r->name = "SIS";
r->get = pirq_sis_get;
r->set = pirq_sis_set;
@@ -697,50 +701,45 @@ static __init int sis_router_probe(struc
static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- switch(device)
- {
- case PCI_DEVICE_ID_CYRIX_5520:
- r->name = "NatSemi";
- r->get = pirq_cyrix_get;
- r->set = pirq_cyrix_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_CYRIX_5520:
+ r->name = "NatSemi";
+ r->get = pirq_cyrix_get;
+ r->set = pirq_cyrix_set;
+ return 1;
}
return 0;
}
static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- switch(device)
- {
- case PCI_DEVICE_ID_OPTI_82C700:
- r->name = "OPTI";
- r->get = pirq_opti_get;
- r->set = pirq_opti_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_OPTI_82C700:
+ r->name = "OPTI";
+ r->get = pirq_opti_get;
+ r->set = pirq_opti_set;
+ return 1;
}
return 0;
}
static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- switch(device)
- {
- case PCI_DEVICE_ID_ITE_IT8330G_0:
- r->name = "ITE";
- r->get = pirq_ite_get;
- r->set = pirq_ite_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_ITE_IT8330G_0:
+ r->name = "ITE";
+ r->get = pirq_ite_get;
+ r->set = pirq_ite_set;
+ return 1;
}
return 0;
}
static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- switch(device)
- {
+ switch (device) {
case PCI_DEVICE_ID_AL_M1533:
case PCI_DEVICE_ID_AL_M1563:
- printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
r->name = "ALI";
r->get = pirq_ali_get;
r->set = pirq_ali_set;
@@ -751,25 +750,24 @@ static __init int ali_router_probe(struc
static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- switch(device)
- {
- case PCI_DEVICE_ID_AMD_VIPER_740B:
- r->name = "AMD756";
- break;
- case PCI_DEVICE_ID_AMD_VIPER_7413:
- r->name = "AMD766";
- break;
- case PCI_DEVICE_ID_AMD_VIPER_7443:
- r->name = "AMD768";
- break;
- default:
- return 0;
+ switch (device) {
+ case PCI_DEVICE_ID_AMD_VIPER_740B:
+ r->name = "AMD756";
+ break;
+ case PCI_DEVICE_ID_AMD_VIPER_7413:
+ r->name = "AMD766";
+ break;
+ case PCI_DEVICE_ID_AMD_VIPER_7443:
+ r->name = "AMD768";
+ break;
+ default:
+ return 0;
}
r->get = pirq_amd756_get;
r->set = pirq_amd756_set;
return 1;
}
-
+
static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
switch (device) {
@@ -811,7 +809,7 @@ static struct pci_dev *pirq_router_dev;
* FIXME: should we have an option to say "generic for
* chipset" ?
*/
-
+
static void __init pirq_find_router(struct irq_router *r)
{
struct irq_routing_table *rt = pirq_table;
@@ -830,7 +828,7 @@ static void __init pirq_find_router(stru
r->name = "default";
r->get = NULL;
r->set = NULL;
-
+
DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
rt->rtr_vendor, rt->rtr_device);
@@ -841,19 +839,19 @@ static void __init pirq_find_router(stru
return;
}
- for( h = pirq_routers; h->vendor; h++) {
+ for (h = pirq_routers; h->vendor; h++) {
/* First look for a router match */
- if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
+ if (rt->rtr_vendor == h->vendor &&
+ h->probe(r, pirq_router_dev, rt->rtr_device))
break;
/* Fall back to a device match */
- if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
+ if (pirq_router_dev->vendor == h->vendor &&
+ h->probe(r, pirq_router_dev, pirq_router_dev->device))
break;
}
- printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
- pirq_router.name,
- pirq_router_dev->vendor,
- pirq_router_dev->device,
- pci_name(pirq_router_dev));
+ dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
+ pirq_router.name,
+ pirq_router_dev->vendor, pirq_router_dev->device);
/* The device remains referenced for the kernel lifetime */
}
@@ -861,11 +859,13 @@ static void __init pirq_find_router(stru
static struct irq_info *pirq_get_info(struct pci_dev *dev)
{
struct irq_routing_table *rt = pirq_table;
- int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
+ int entries = (rt->size - sizeof(struct irq_routing_table)) /
+ sizeof(struct irq_info);
struct irq_info *info;
for (info = rt->slots; entries--; info++)
- if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
+ if (info->bus == dev->bus->number &&
+ PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
return info;
return NULL;
}
@@ -884,7 +884,7 @@ static int pcibios_lookup_irq(struct pci
/* Find IRQ pin */
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (!pin) {
- DBG(KERN_DEBUG " -> no interrupt pin\n");
+ dev_dbg(&dev->dev, "no interrupt pin\n");
return 0;
}
pin = pin - 1;
@@ -893,20 +893,21 @@ static int pcibios_lookup_irq(struct pci
if (!pirq_table)
return 0;
-
- DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
+
info = pirq_get_info(dev);
if (!info) {
- DBG(" -> not found in routing table\n" KERN_DEBUG);
+ dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
+ 'A' + pin);
return 0;
}
pirq = info->irq[pin].link;
mask = info->irq[pin].bitmap;
if (!pirq) {
- DBG(" -> not routed\n" KERN_DEBUG);
+ dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
return 0;
}
- DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
+ dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
+ 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
mask &= pcibios_irq_mask;
/* Work around broken HP Pavilion Notebooks which assign USB to
@@ -919,7 +920,8 @@ static int pcibios_lookup_irq(struct pci
}
/* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
- if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
+ if (acer_tm360_irqrouting && dev->irq == 11 &&
+ dev->vendor == PCI_VENDOR_ID_O2) {
pirq = 0x68;
mask = 0x400;
dev->irq = r->get(pirq_router_dev, dev, pirq);
@@ -932,51 +934,50 @@ static int pcibios_lookup_irq(struct pci
*/
newirq = dev->irq;
if (newirq && !((1 << newirq) & mask)) {
- if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
- else printk("\n" KERN_WARNING
- "PCI: IRQ %i for device %s doesn't match PIRQ mask "
- "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
- pci_name(dev));
+ if (pci_probe & PCI_USE_PIRQ_MASK)
+ newirq = 0;
+ else
+ dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
+ "%#x; try pci=usepirqmask\n", newirq, mask);
}
if (!newirq && assign) {
for (i = 0; i < 16; i++) {
if (!(mask & (1 << i)))
continue;
- if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
+ if (pirq_penalty[i] < pirq_penalty[newirq] &&
+ can_request_irq(i, IRQF_SHARED))
newirq = i;
}
}
- DBG(" -> newirq=%d", newirq);
+ dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
/* Check if it is hardcoded */
if ((pirq & 0xf0) == 0xf0) {
irq = pirq & 0xf;
- DBG(" -> hardcoded IRQ %d\n", irq);
- msg = "Hardcoded";
- } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
- ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
- DBG(" -> got IRQ %d\n", irq);
- msg = "Found";
+ msg = "hardcoded";
+ } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
+ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
+ msg = "found";
eisa_set_level_irq(irq);
- } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
- DBG(" -> assigning IRQ %d", newirq);
+ } else if (newirq && r->set &&
+ (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
if (r->set(pirq_router_dev, dev, pirq, newirq)) {
eisa_set_level_irq(newirq);
- DBG(" ... OK\n");
- msg = "Assigned";
+ msg = "assigned";
irq = newirq;
}
}
if (!irq) {
- DBG(" ... failed\n");
if (newirq && mask == (1 << newirq)) {
- msg = "Guessed";
+ msg = "guessed";
irq = newirq;
- } else
+ } else {
+ dev_dbg(&dev->dev, "can't route interrupt\n");
return 0;
+ }
}
- printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
+ dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
/* Update IRQ for all devices with the same pirq value */
while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
@@ -988,20 +989,25 @@ static int pcibios_lookup_irq(struct pci
if (!info)
continue;
if (info->irq[pin].link == pirq) {
- /* We refuse to override the dev->irq information. Give a warning! */
- if ( dev2->irq && dev2->irq != irq && \
+ /*
+ * We refuse to override the dev->irq
+ * information. Give a warning!
+ */
+ if (dev2->irq && dev2->irq != irq && \
(!(pci_probe & PCI_USE_PIRQ_MASK) || \
- ((1 << dev2->irq) & mask)) ) {
+ ((1 << dev2->irq) & mask))) {
#ifndef CONFIG_PCI_MSI
- printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
- pci_name(dev2), dev2->irq, irq);
+ dev_info(&dev2->dev, "IRQ routing conflict: "
+ "have IRQ %d, want IRQ %d\n",
+ dev2->irq, irq);
#endif
- continue;
- }
+ continue;
+ }
dev2->irq = irq;
pirq_penalty[irq]++;
if (dev != dev2)
- printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
+ dev_info(&dev->dev, "sharing IRQ %d with %s\n",
+ irq, pci_name(dev2));
}
}
return 1;
@@ -1015,15 +1021,20 @@ static void __init pcibios_fixup_irqs(vo
DBG(KERN_DEBUG "PCI: IRQ fixup\n");
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
/*
- * If the BIOS has set an out of range IRQ number, just ignore it.
- * Also keep track of which IRQ's are already in use.
+ * If the BIOS has set an out of range IRQ number, just
+ * ignore it. Also keep track of which IRQ's are
+ * already in use.
*/
if (dev->irq >= 16) {
- DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
+ dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
dev->irq = 0;
}
- /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
- if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
+ /*
+ * If the IRQ is already assigned to a PCI device,
+ * ignore its ISA use penalty
+ */
+ if (pirq_penalty[dev->irq] >= 100 &&
+ pirq_penalty[dev->irq] < 100000)
pirq_penalty[dev->irq] = 0;
pirq_penalty[dev->irq]++;
}
@@ -1035,13 +1046,17 @@ static void __init pcibios_fixup_irqs(vo
/*
* Recalculate IRQ numbers if we use the I/O APIC.
*/
- if (io_apic_assign_pci_irqs)
- {
+ if (io_apic_assign_pci_irqs) {
int irq;
if (pin) {
- pin--; /* interrupt pins are numbered starting from 1 */
- irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
+ /*
+ * interrupt pins are numbered starting
+ * from 1
+ */
+ pin--;
+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
+ PCI_SLOT(dev->devfn), pin);
/*
* Busses behind bridges are typically not listed in the MP-table.
* In this case we have to look up the IRQ based on the parent bus,
@@ -1049,18 +1064,18 @@ static void __init pcibios_fixup_irqs(vo
* busses itself so we should get into this branch reliably.
*/
if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
- struct pci_dev * bridge = dev->bus->self;
+ struct pci_dev *bridge = dev->bus->self;
pin = (pin + PCI_SLOT(dev->devfn)) % 4;
- irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
PCI_SLOT(bridge->devfn), pin);
if (irq >= 0)
- printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
- pci_name(bridge), 'A' + pin, irq);
+ dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
+ pci_name(bridge),
+ 'A' + pin, irq);
}
if (irq >= 0) {
- printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
- pci_name(dev), 'A' + pin, irq);
+ dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
dev->irq = irq;
}
}
@@ -1082,7 +1097,8 @@ static int __init fix_broken_hp_bios_irq
{
if (!broken_hp_bios_irq9) {
broken_hp_bios_irq9 = 1;
- printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
+ d->ident);
}
return 0;
}
@@ -1095,7 +1111,8 @@ static int __init fix_acer_tm360_irqrout
{
if (!acer_tm360_irqrouting) {
acer_tm360_irqrouting = 1;
- printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
+ d->ident);
}
return 0;
}
@@ -1107,7 +1124,8 @@ static struct dmi_system_id __initdata p
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
- DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
+ DMI_MATCH(DMI_PRODUCT_VERSION,
+ "HP Pavilion Notebook Model GE"),
DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
},
},
@@ -1122,7 +1140,7 @@ static struct dmi_system_id __initdata p
{ }
};
-static int __init pcibios_irq_init(void)
+int __init pcibios_irq_init(void)
{
DBG(KERN_DEBUG "PCI: IRQ init\n");
@@ -1142,11 +1160,14 @@ static int __init pcibios_irq_init(void)
pirq_find_router(&pirq_router);
if (pirq_table->exclusive_irqs) {
int i;
- for (i=0; i<16; i++)
+ for (i = 0; i < 16; i++)
if (!(pirq_table->exclusive_irqs & (1 << i)))
pirq_penalty[i] += 100;
}
- /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
+ /*
+ * If we're using the I/O APIC, avoid using the PCI IRQ
+ * routing table
+ */
if (io_apic_assign_pci_irqs)
pirq_table = NULL;
}
@@ -1157,9 +1178,6 @@ static int __init pcibios_irq_init(void)
return 0;
}
-subsys_initcall(pcibios_irq_init);
-
-
static void pirq_penalize_isa_irq(int irq, int active)
{
/*
@@ -1193,7 +1211,7 @@ static int pirq_enable_irq(struct pci_de
if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
char *msg = "";
- pin--; /* interrupt pins are numbered starting from 1 */
+ pin--; /* interrupt pins are numbered starting from 1 */
if (io_apic_assign_pci_irqs) {
int irq;
@@ -1207,35 +1225,41 @@ static int pirq_enable_irq(struct pci_de
*/
temp_dev = dev;
while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
- struct pci_dev * bridge = dev->bus->self;
+ struct pci_dev *bridge = dev->bus->self;
pin = (pin + PCI_SLOT(dev->devfn)) % 4;
- irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
PCI_SLOT(bridge->devfn), pin);
if (irq >= 0)
- printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
- pci_name(bridge), 'A' + pin, irq);
+ dev_warn(&dev->dev, "using bridge %s "
+ "INT %c to get IRQ %d\n",
+ pci_name(bridge), 'A' + pin,
+ irq);
dev = bridge;
}
dev = temp_dev;
if (irq >= 0) {
- printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
- pci_name(dev), 'A' + pin, irq);
+ dev_info(&dev->dev, "PCI->APIC IRQ transform: "
+ "INT %c -> IRQ %d\n", 'A' + pin, irq);
dev->irq = irq;
return 0;
} else
- msg = " Probably buggy MP table.";
+ msg = "; probably buggy MP table";
} else if (pci_probe & PCI_BIOS_IRQ_SCAN)
msg = "";
else
- msg = " Please try using pci=biosirq.";
+ msg = "; please try using pci=biosirq";
- /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
- if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
+ /*
+ * With IDE legacy devices the IRQ lookup failure is not
+ * a problem..
+ */
+ if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
+ !(dev->class & 0x5))
return 0;
- printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
- 'A' + pin, pci_name(dev), msg);
+ dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
+ 'A' + pin, msg);
}
return 0;
}
--- head-2011-03-11.orig/arch/x86/vdso/Makefile 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/vdso/Makefile 2011-02-01 14:38:38.000000000 +0100
@@ -65,9 +65,7 @@ obj-$(VDSO32-y) += vdso32-syms.lds
vdso32.so-$(VDSO32-y) += int80
vdso32.so-$(CONFIG_COMPAT) += syscall
vdso32.so-$(VDSO32-y) += sysenter
-xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80
-xen-vdso32-$(CONFIG_X86_32) += syscall
-vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y)
+vdso32.so-$(CONFIG_X86_XEN) += syscall
vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
--- head-2011-03-11.orig/arch/x86/vdso/vdso32.S 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/vdso/vdso32.S 2011-02-01 14:38:38.000000000 +0100
@@ -9,7 +9,7 @@ vdso32_int80_end:
.globl vdso32_syscall_start, vdso32_syscall_end
vdso32_syscall_start:
-#ifdef CONFIG_COMPAT
+#if defined(CONFIG_COMPAT) || defined(CONFIG_X86_XEN)
.incbin "arch/x86/vdso/vdso32-syscall.so"
#endif
vdso32_syscall_end:
@@ -19,16 +19,4 @@ vdso32_sysenter_start:
.incbin "arch/x86/vdso/vdso32-sysenter.so"
vdso32_sysenter_end:
-#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200
- .globl vdso32_int80_start, vdso32_int80_end
-vdso32_int80_start:
- .incbin "arch/x86/vdso/vdso32-int80.so"
-vdso32_int80_end:
-#elif defined(CONFIG_X86_XEN)
- .globl vdso32_syscall_start, vdso32_syscall_end
-vdso32_syscall_start:
- .incbin "arch/x86/vdso/vdso32-syscall.so"
-vdso32_syscall_end:
-#endif
-
__FINIT
--- head-2011-03-11.orig/arch/x86/vdso/vdso32-setup-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/vdso/vdso32-setup-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -195,50 +195,28 @@ static __init void relocate_vdso(Elf32_E
}
}
-/*
- * These symbols are defined by vdso32.S to mark the bounds
- * of the ELF DSO images included therein.
- */
-extern const char vdso32_default_start, vdso32_default_end;
-extern const char vdso32_sysenter_start, vdso32_sysenter_end;
static struct page *vdso32_pages[1];
#ifdef CONFIG_X86_64
-#if CONFIG_XEN_COMPAT < 0x030200
-static int use_int80 = 1;
-#endif
-static int use_sysenter __read_mostly = -1;
-
-#define vdso32_sysenter() (use_sysenter > 0)
+#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
+#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
-/* May not be __init: called during resume */
-void syscall32_cpu_init(void)
+void __cpuinit syscall32_cpu_init(void)
{
- static const struct callback_register cstar = {
+ static const struct callback_register __cpuinitconst cstar = {
.type = CALLBACKTYPE_syscall32,
.address = (unsigned long)ia32_cstar_target
};
- static const struct callback_register sysenter = {
+ static const struct callback_register __cpuinitconst sysenter = {
.type = CALLBACKTYPE_sysenter,
.address = (unsigned long)ia32_sysenter_target
};
- if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) ||
- (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0))
-#if CONFIG_XEN_COMPAT < 0x030200
- return;
- use_int80 = 0;
-#else
- BUG();
-#endif
-
- if (use_sysenter < 0) {
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- use_sysenter = 1;
- if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
- use_sysenter = 1;
- }
+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
+ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
}
#define compat_uses_vma 1
@@ -250,6 +228,7 @@ static inline void map_compat_vdso(int m
#else /* CONFIG_X86_32 */
#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
+#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
extern asmlinkage void ia32pv_cstar_target(void);
static const struct callback_register __cpuinitconst cstar = {
@@ -265,13 +244,13 @@ void __cpuinit enable_sep_cpu(void)
.address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
};
- if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
+ if (vdso32_syscall()) {
if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
BUG();
return;
}
- if (!boot_cpu_has(X86_FEATURE_SEP))
+ if (!vdso32_sysenter())
return;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
@@ -341,34 +320,26 @@ int __init sysenter_setup(void)
#ifdef CONFIG_X86_32
gate_vma_init();
-#endif
-#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200
- if (use_int80) {
- extern const char vdso32_int80_start, vdso32_int80_end;
-
- vsyscall = &vdso32_int80_start;
- vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
- } else
-#elif defined(CONFIG_X86_32)
- if (boot_cpu_has(X86_FEATURE_SYSCALL)
- && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
- || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0))
- setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
- barrier(); /* until clear_bit()'s constraints are correct ... */
if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
- extern const char vdso32_syscall_start, vdso32_syscall_end;
-
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
+ && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
+ setup_force_cpu_cap(X86_FEATURE_SYSCALL32);
+ else {
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+ }
+ }
+#endif
+ if (vdso32_syscall()) {
vsyscall = &vdso32_syscall_start;
vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
- } else
-#endif
- if (!vdso32_sysenter()) {
- vsyscall = &vdso32_default_start;
- vsyscall_len = &vdso32_default_end - &vdso32_default_start;
- } else {
+	} else if (vdso32_sysenter()) {
vsyscall = &vdso32_sysenter_start;
vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
+ } else {
+ vsyscall = &vdso32_int80_start;
+ vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
}
memcpy(syscall_page, vsyscall, vsyscall_len);
--- head-2011-03-11.orig/arch/x86/xen/Kconfig 2011-01-31 17:49:31.000000000 +0100
+++ head-2011-03-11/arch/x86/xen/Kconfig 2011-02-01 14:38:38.000000000 +0100
@@ -31,14 +31,14 @@ config XEN_PVHVM
config XEN_MAX_DOMAIN_MEMORY
int
default 128
- depends on XEN
+ depends on PARAVIRT_XEN
help
This only affects the sizing of some bss arrays, the unused
portions of which are freed.
config XEN_SAVE_RESTORE
bool
- depends on XEN && PM
+ depends on PARAVIRT_XEN && PM
default y
config XEN_DEBUG_FS
--- head-2011-03-11.orig/drivers/acpi/processor_driver.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/acpi/processor_driver.c 2011-02-01 14:38:38.000000000 +0100
@@ -512,10 +512,12 @@ static int __cpuinit acpi_processor_add(
per_cpu(processors, pr->id) = pr;
#endif
- sysdev = get_cpu_sysdev(pr->id);
- if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev")) {
- result = -EFAULT;
- goto err_free_cpumask;
+ if (pr->id != -1) {
+ sysdev = get_cpu_sysdev(pr->id);
+ if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev")) {
+ result = -EFAULT;
+ goto err_free_cpumask;
+ }
}
#if defined(CONFIG_CPU_FREQ) || defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
@@ -599,7 +601,8 @@ static int acpi_processor_remove(struct
acpi_processor_power_exit(pr, device);
- sysfs_remove_link(&device->dev.kobj, "sysdev");
+ if (pr->id != -1)
+ sysfs_remove_link(&device->dev.kobj, "sysdev");
if (pr->cdev) {
sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
--- head-2011-03-11.orig/drivers/acpi/processor_perflib.c 2011-01-31 17:02:29.000000000 +0100
+++ head-2011-03-11/drivers/acpi/processor_perflib.c 2011-02-01 14:38:38.000000000 +0100
@@ -187,6 +187,12 @@ int acpi_processor_ppc_has_changed(struc
{
int ret;
+#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
+ /* Xen hypervisor can handle cpufreq _PPC event */
+ if (ignore_ppc < 0 && processor_pmperf_external())
+ ignore_ppc = 0;
+#endif
+
if (ignore_ppc) {
/*
* Only when it is notification event, the _OST object
--- head-2011-03-11.orig/drivers/char/tpm/tpm_vtpm.c 2011-01-31 14:53:38.000000000 +0100
+++ head-2011-03-11/drivers/char/tpm/tpm_vtpm.c 2011-02-01 14:38:38.000000000 +0100
@@ -347,7 +347,7 @@ static int _vtpm_send_queued(struct tpm_
{
int rc;
int error = 0;
- long flags;
+ unsigned long flags;
unsigned char buffer[1];
struct vtpm_state *vtpms;
vtpms = (struct vtpm_state *)chip_get_private(chip);
--- head-2011-03-11.orig/drivers/dma/ioat/dma.h 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/dma/ioat/dma.h 2011-02-01 14:38:38.000000000 +0100
@@ -363,6 +363,7 @@ __ioat_dca_init(struct pci_dev *pdev, vo
}
#define ioat_dca_init __ioat_dca_init
#define ioat2_dca_init __ioat_dca_init
+#define ioat3_dca_init __ioat_dca_init
#endif
#endif /* IOATDMA_H */
--- head-2011-03-11.orig/drivers/hwmon/coretemp-xen.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/drivers/hwmon/coretemp-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -360,10 +360,11 @@ static int coretemp_device_add(unsigned
if (err)
goto exit_entry_free;
- /* check if family 6, models 0xe, 0xf, 0x16, 0x17 */
+ /* check if family 6, models 0xe, 0xf, 0x16, 0x17, 0x1A */
if (info.x86 != 0x6 ||
!((pdev_entry->x86_model == 0xe) || (pdev_entry->x86_model == 0xf) ||
- (pdev_entry->x86_model == 0x16) || (pdev_entry->x86_model == 0x17))) {
+ (pdev_entry->x86_model == 0x16) || (pdev_entry->x86_model == 0x17) ||
+ (pdev_entry->x86_model == 0x1A))) {
/* supported CPU not found, but report the unknown
family 6 CPU */
--- head-2011-03-11.orig/drivers/pci/msi-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/pci/msi-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -53,12 +53,10 @@ arch_msi_check_device(struct pci_dev *de
return 0;
}
-static void msi_set_enable(struct pci_dev *dev, int enable)
+static void __msi_set_enable(struct pci_dev *dev, int pos, int enable)
{
- int pos;
u16 control;
- pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
if (pos) {
pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
control &= ~PCI_MSI_FLAGS_ENABLE;
@@ -68,6 +66,11 @@ static void msi_set_enable(struct pci_de
}
}
+static void msi_set_enable(struct pci_dev *dev, int enable)
+{
+ __msi_set_enable(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), enable);
+}
+
static void msix_set_enable(struct pci_dev *dev, int enable)
{
int pos;
@@ -180,8 +183,7 @@ static int msi_get_dev_owner(struct pci_
BUG_ON(!is_initial_xendomain());
if (get_owner && (owner = get_owner(dev)) >= 0) {
- printk(KERN_INFO "get owner for dev %x get %x \n",
- dev->devfn, owner);
+ dev_info(&dev->dev, "get owner: %x \n", owner);
return owner;
}
@@ -201,7 +203,7 @@ static int msi_unmap_pirq(struct pci_dev
? pirq : evtchn_get_xen_pirq(pirq);
if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap)))
- printk(KERN_WARNING "unmap irq %x failed\n", pirq);
+ dev_warn(&dev->dev, "unmap irq %d failed\n", pirq);
if (rc < 0)
return rc;
@@ -249,7 +251,7 @@ static int msi_map_vector(struct pci_dev
map_irq.table_base = table_base;
if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq)))
- printk(KERN_WARNING "map irq failed\n");
+ dev_warn(&dev->dev, "map irq failed\n");
if (rc < 0)
return rc;
@@ -360,10 +362,9 @@ static int msix_capability_init(struct p
mapped = 0;
list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
if (pirq_entry->entry_nr == entries[i].entry) {
- printk(KERN_WARNING "msix entry %d for dev %02x:%02x:%01x are \
- not freed before acquire again.\n", entries[i].entry,
- dev->bus->number, PCI_SLOT(dev->devfn),
- PCI_FUNC(dev->devfn));
+ dev_warn(&dev->dev,
+ "msix entry %d was not freed\n",
+ entries[i].entry);
(entries + i)->vector = pirq_entry->pirq;
mapped = 1;
break;
@@ -489,9 +490,8 @@ int pci_enable_msi(struct pci_dev* dev)
/* Check whether driver already requested for MSI-X irqs */
if (dev->msix_enabled) {
- printk(KERN_INFO "PCI: %s: Can't enable MSI. "
- "Device already has MSI-X enabled\n",
- pci_name(dev));
+ dev_info(&dev->dev, "can't enable MSI "
+ "(MSI-X already enabled)\n");
return -EINVAL;
}
@@ -573,7 +573,8 @@ int pci_enable_msix(struct pci_dev* dev,
temp = dev->irq;
ret = pci_frontend_enable_msix(dev, entries, nvec);
if (ret) {
- printk("get %x from pci_frontend_enable_msix\n", ret);
+ dev_warn(&dev->dev,
+ "got %x from frontend_enable_msix\n", ret);
return ret;
}
dev->msix_enabled = 1;
@@ -624,9 +625,8 @@ int pci_enable_msix(struct pci_dev* dev,
temp = dev->irq;
/* Check whether driver already requested for MSI vector */
if (dev->msi_enabled) {
- printk(KERN_INFO "PCI: %s: Can't enable MSI-X. "
- "Device already has an MSI irq assigned\n",
- pci_name(dev));
+ dev_info(&dev->dev, "can't enable MSI-X "
+ "(MSI IRQ already assigned)\n");
return -EINVAL;
}
--- head-2011-03-11.orig/drivers/xen/Makefile 2011-02-28 15:13:33.000000000 +0100
+++ head-2011-03-11/drivers/xen/Makefile 2011-02-01 14:38:38.000000000 +0100
@@ -1,4 +1,4 @@
-obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o
+obj-$(CONFIG_PARAVIRT_XEN) += grant-table.o features.o events.o manage.o
xen-balloon-$(CONFIG_PARAVIRT_XEN) := balloon.o
xen-balloon-$(CONFIG_XEN) := balloon/
--- head-2011-03-11.orig/drivers/xen/balloon/balloon.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/xen/balloon/balloon.c 2011-02-01 14:38:38.000000000 +0100
@@ -82,7 +82,7 @@ struct balloon_stats balloon_stats;
/* We increase/decrease in batches which fit in a page */
static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
-#if !defined(MODULE) && defined(CONFIG_HIGHMEM)
+#ifdef CONFIG_HIGHMEM
#define inc_totalhigh_pages() (totalhigh_pages++)
#define dec_totalhigh_pages() (totalhigh_pages--)
#else
--- head-2011-03-11.orig/drivers/xen/balloon/sysfs.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/drivers/xen/balloon/sysfs.c 2011-02-01 14:38:38.000000000 +0100
@@ -45,6 +45,7 @@
#define BALLOON_SHOW(name, format, args...) \
static ssize_t show_##name(struct sys_device *dev, \
+ struct sysdev_attribute *attr, \
char *buf) \
{ \
return sprintf(buf, format, ##args); \
@@ -56,14 +57,15 @@ BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(b
BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high));
BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
-static ssize_t show_target_kb(struct sys_device *dev, char *buf)
+static ssize_t show_target_kb(struct sys_device *dev,
+ struct sysdev_attribute *attr, char *buf)
{
return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
}
static ssize_t store_target_kb(struct sys_device *dev,
- const char *buf,
- size_t count)
+ struct sysdev_attribute *attr,
+ const char *buf, size_t count)
{
char memstring[64], *endchar;
unsigned long long target_bytes;
--- head-2011-03-11.orig/drivers/xen/blktap/blktap.c 2011-02-17 10:11:08.000000000 +0100
+++ head-2011-03-11/drivers/xen/blktap/blktap.c 2011-02-17 10:11:18.000000000 +0100
@@ -54,6 +54,7 @@
#include <linux/gfp.h>
#include <linux/poll.h>
#include <linux/delay.h>
+#include <linux/nsproxy.h>
#include <asm/tlbflush.h>
#define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
@@ -502,7 +503,7 @@ found:
if ((class = get_xen_class()) != NULL)
device_create(class, NULL, MKDEV(blktap_major, minor),
- "blktap%d", minor);
+ NULL, "blktap%d", minor);
}
out:
@@ -1743,7 +1744,8 @@ static int __init blkif_init(void)
* We only create the device when a request of a new device is
* made.
*/
- device_create(class, NULL, MKDEV(blktap_major, 0), "blktap0");
+ device_create(class, NULL, MKDEV(blktap_major, 0), NULL,
+ "blktap0");
} else {
/* this is bad, but not fatal */
WPRINTK("blktap: sysfs xen_class not created\n");
--- head-2011-03-11.orig/drivers/xen/blktap2/device.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/blktap2/device.c 2011-02-01 14:38:38.000000000 +0100
@@ -3,6 +3,7 @@
#include <linux/cdrom.h>
#include <linux/hdreg.h>
#include <linux/module.h>
+#include <linux/version.h>
#include <asm/tlbflush.h>
#include <scsi/scsi.h>
--- head-2011-03-11.orig/drivers/xen/blktap2/sysfs.c 2011-03-11 10:58:58.000000000 +0100
+++ head-2011-03-11/drivers/xen/blktap2/sysfs.c 2011-02-01 14:38:38.000000000 +0100
@@ -307,8 +307,8 @@ blktap_sysfs_create(struct blktap *tap)
ring = &tap->ring;
- dev = device_create_drvdata(class, NULL, ring->devno, tap,
- "blktap%d", tap->minor);
+ dev = device_create(class, NULL, ring->devno, tap,
+ "blktap%d", tap->minor);
if (IS_ERR(dev))
return PTR_ERR(dev);
--- head-2011-03-11.orig/drivers/xen/char/mem.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/char/mem.c 2011-02-01 14:38:38.000000000 +0100
@@ -35,7 +35,7 @@ static inline int uncached_access(struct
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
-#ifdef CONFIG_NONPROMISC_DEVMEM
+#ifdef CONFIG_STRICT_DEVMEM
u64 from = ((u64)pfn) << PAGE_SHIFT;
u64 to = from + size;
u64 cursor = from;
@@ -172,7 +172,10 @@ static void mmap_mem_close(struct vm_are
static struct vm_operations_struct mmap_mem_ops = {
.open = mmap_mem_open,
- .close = mmap_mem_close
+ .close = mmap_mem_close,
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+ .access = generic_access_phys
+#endif
};
static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma)
--- head-2011-03-11.orig/drivers/xen/console/console.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/console/console.c 2011-02-01 14:38:38.000000000 +0100
@@ -431,9 +431,7 @@ static void __xencons_tx_flush(void)
if (work_done && (xencons_tty != NULL)) {
wake_up_interruptible(&xencons_tty->write_wait);
- if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
- (xencons_tty->ldisc.write_wakeup != NULL))
- (xencons_tty->ldisc.write_wakeup)(xencons_tty);
+ tty_wakeup(xencons_tty);
}
}
@@ -634,8 +632,8 @@ static void xencons_close(struct tty_str
tty->closing = 1;
tty_wait_until_sent(tty, 0);
tty_driver_flush_buffer(tty);
- if (tty->ldisc.flush_buffer != NULL)
- tty->ldisc.flush_buffer(tty);
+ if (tty->ldisc.ops->flush_buffer != NULL)
+ tty->ldisc.ops->flush_buffer(tty);
tty->closing = 0;
spin_lock_irqsave(&xencons_lock, flags);
xencons_tty = NULL;
--- head-2011-03-11.orig/drivers/xen/core/evtchn.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/evtchn.c 2011-02-01 14:38:38.000000000 +0100
@@ -126,7 +126,11 @@ static int irq_bindcount[NR_IRQS];
#ifdef CONFIG_SMP
+#if CONFIG_NR_CPUS <= 256
static u8 cpu_evtchn[NR_EVENT_CHANNELS];
+#else
+static u16 cpu_evtchn[NR_EVENT_CHANNELS];
+#endif
static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
@@ -767,8 +771,9 @@ static struct irq_chip dynirq_chip = {
};
/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
-static int pirq_eoi_does_unmask;
+static bool pirq_eoi_does_unmask;
static unsigned long *pirq_needs_eoi;
+static DECLARE_BITMAP(probing_pirq, NR_PIRQS);
static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq)
{
@@ -815,25 +820,31 @@ static inline void pirq_query_unmask(int
set_bit(irq - PIRQ_BASE, pirq_needs_eoi);
}
-/*
- * On startup, if there is no action associated with the IRQ then we are
- * probing. In this case we should not share with others as it will confuse us.
- */
-#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
+static int set_type_pirq(unsigned int irq, unsigned int type)
+{
+ if (type != IRQ_TYPE_PROBE)
+ return -EINVAL;
+ set_bit(irq - PIRQ_BASE, probing_pirq);
+ return 0;
+}
static void enable_pirq(unsigned int irq)
{
struct evtchn_bind_pirq bind_pirq;
int evtchn = evtchn_from_irq(irq);
- if (VALID_EVTCHN(evtchn))
+ if (VALID_EVTCHN(evtchn)) {
+ clear_bit(irq - PIRQ_BASE, probing_pirq);
goto out;
+ }
bind_pirq.pirq = evtchn_get_xen_pirq(irq);
/* NB. We are happy to share unless we are probing. */
- bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
+ bind_pirq.flags = test_and_clear_bit(irq - PIRQ_BASE, probing_pirq)
+ || (irq_desc[irq].status & IRQ_AUTODETECT)
+ ? 0 : BIND_PIRQ__WILL_SHARE;
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
- if (!probing_irq(irq))
+ if (bind_pirq.flags)
pr_info("Failed to obtain physical IRQ %d\n", irq);
return;
}
@@ -910,6 +921,7 @@ static struct irq_chip pirq_chip = {
.ack = ack_pirq,
.end = end_pirq,
.eoi = end_pirq,
+ .set_type = set_type_pirq,
#ifdef CONFIG_SMP
.set_affinity = set_affinity_irq,
#endif
@@ -985,6 +997,7 @@ void disable_all_local_evtchn(void)
synch_set_bit(i, &s->evtchn_mask[0]);
}
+#ifdef CONFIG_PM_SLEEP
static void restore_cpu_virqs(unsigned int cpu)
{
struct evtchn_bind_virq bind_virq;
@@ -1077,6 +1090,7 @@ void irq_resume(void)
}
}
+#endif
#if defined(CONFIG_X86_IO_APIC)
#define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE))
@@ -1159,7 +1173,7 @@ void __init xen_init_IRQ(void)
* BITS_TO_LONGS(ALIGN(NR_PIRQS, PAGE_SIZE * 8)));
eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT;
if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
- pirq_eoi_does_unmask = 1;
+ pirq_eoi_does_unmask = true;
/* No event channels are 'live' right now. */
for (i = 0; i < NR_EVENT_CHANNELS; i++)
--- head-2011-03-11.orig/drivers/xen/core/gnttab.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/gnttab.c 2011-02-01 14:38:38.000000000 +0100
@@ -448,6 +448,7 @@ static int map_pte_fn(pte_t *pte, struct
return 0;
}
+#ifdef CONFIG_PM_SLEEP
static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
unsigned long addr, void *data)
{
@@ -455,6 +456,7 @@ static int unmap_pte_fn(pte_t *pte, stru
set_pte_at(&init_mm, addr, pte, __pte(0));
return 0;
}
+#endif
void *arch_gnttab_alloc_shared(unsigned long *frames)
{
@@ -635,6 +637,75 @@ void __gnttab_dma_map_page(struct page *
} while (unlikely(read_seqretry(&gnttab_dma_lock, seq)));
}
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+
+static unsigned int GNTMAP_pte_special;
+
+bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *map,
+ unsigned int count)
+{
+ unsigned int i;
+
+ if (unlikely(cmd != GNTTABOP_map_grant_ref))
+ count = 0;
+
+ for (i = 0; i < count; ++i, ++map) {
+ if (!(map->flags & GNTMAP_host_map)
+ || !(map->flags & GNTMAP_application_map))
+ continue;
+ if (GNTMAP_pte_special)
+ map->flags |= GNTMAP_pte_special;
+ else {
+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
+ return true;
+ }
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(gnttab_pre_map_adjust);
+
+#if CONFIG_XEN_COMPAT < 0x030400
+int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *map, unsigned int count)
+{
+ unsigned int i;
+ int rc = 0;
+
+ for (i = 0; i < count && rc == 0; ++i, ++map) {
+ pte_t pte;
+
+ if (!(map->flags & GNTMAP_host_map)
+ || !(map->flags & GNTMAP_application_map))
+ continue;
+
+#ifdef CONFIG_X86
+ pte = __pte_ma((map->dev_bus_addr | _PAGE_PRESENT | _PAGE_USER
+ | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX
+ | _PAGE_SPECIAL)
+ & __supported_pte_mask);
+#else
+#error Architecture not yet supported.
+#endif
+ if (!(map->flags & GNTMAP_readonly))
+ pte = pte_mkwrite(pte);
+
+ if (map->flags & GNTMAP_contains_pte) {
+ mmu_update_t u;
+
+ u.ptr = map->host_addr;
+ u.val = __pte_val(pte);
+ rc = HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
+ } else
+ rc = HYPERVISOR_update_va_mapping(map->host_addr, pte, 0);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(gnttab_post_map_adjust);
+#endif
+
+#endif /* __HAVE_ARCH_PTE_SPECIAL */
+
int gnttab_resume(void)
{
if (max_nr_grant_frames() < nr_grant_frames)
@@ -642,6 +713,7 @@ int gnttab_resume(void)
return gnttab_map(0, nr_grant_frames - 1);
}
+#ifdef CONFIG_PM_SLEEP
int gnttab_suspend(void)
{
#ifdef CONFIG_X86
@@ -651,6 +723,7 @@ int gnttab_suspend(void)
#endif
return 0;
}
+#endif
#else /* !CONFIG_XEN */
@@ -761,6 +834,18 @@ int __devinit gnttab_init(void)
gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
gnttab_free_head = NR_RESERVED_ENTRIES;
+#if defined(CONFIG_XEN) && defined(__HAVE_ARCH_PTE_SPECIAL)
+ if (!xen_feature(XENFEAT_auto_translated_physmap)
+ && xen_feature(XENFEAT_gnttab_map_avail_bits)) {
+#ifdef CONFIG_X86
+ GNTMAP_pte_special = (__pte_val(pte_mkspecial(__pte_ma(0)))
+ >> _PAGE_BIT_UNUSED1) << _GNTMAP_guest_avail0;
+#else
+#error Architecture not yet supported.
+#endif
+ }
+#endif
+
return 0;
ini_nomem:
--- head-2011-03-11.orig/drivers/xen/core/machine_kexec.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/machine_kexec.c 2011-02-01 14:38:38.000000000 +0100
@@ -57,8 +57,7 @@ void __init xen_machine_kexec_setup_reso
/* allocate xen_phys_cpus */
- xen_phys_cpus = alloc_bootmem_low(k * sizeof(struct resource));
- BUG_ON(xen_phys_cpus == NULL);
+ xen_phys_cpus = alloc_bootmem(k * sizeof(struct resource));
/* fill in xen_phys_cpus with per-cpu crash note information */
@@ -91,7 +90,7 @@ void __init xen_machine_kexec_setup_reso
xen_hypervisor_res.start = range.start;
xen_hypervisor_res.end = range.start + range.size - 1;
xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86
insert_resource(&iomem_resource, &xen_hypervisor_res);
#endif
@@ -106,7 +105,7 @@ void __init xen_machine_kexec_setup_reso
if (range.size) {
crashk_res.start = range.start;
crashk_res.end = range.start + range.size - 1;
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86
insert_resource(&iomem_resource, &crashk_res);
#endif
}
@@ -141,15 +140,13 @@ void __init xen_machine_kexec_setup_reso
xen_max_nr_phys_cpus))
goto err;
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86
for (k = 0; k < xen_max_nr_phys_cpus; k++) {
res = xen_phys_cpus + k;
if (!res->parent) /* outside of xen_hypervisor_res range */
insert_resource(&iomem_resource, res);
}
-#endif
-#ifdef CONFIG_X86
if (xen_create_contiguous_region((unsigned long)&vmcoreinfo_note,
get_order(sizeof(vmcoreinfo_note)),
BITS_PER_LONG))
@@ -168,7 +165,7 @@ void __init xen_machine_kexec_setup_reso
return;
}
-#ifndef CONFIG_X86_64
+#ifndef CONFIG_X86
void __init xen_machine_kexec_register_resources(struct resource *res)
{
int k;
--- head-2011-03-11.orig/drivers/xen/core/machine_reboot.c 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/machine_reboot.c 2011-02-01 14:38:38.000000000 +0100
@@ -52,6 +52,7 @@ void machine_power_off(void)
HYPERVISOR_shutdown(SHUTDOWN_poweroff);
}
+#ifdef CONFIG_PM_SLEEP
static void pre_suspend(void)
{
HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
@@ -108,6 +109,7 @@ static void post_suspend(int suspend_can
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
virt_to_mfn(pfn_to_mfn_frame_list_list);
}
+#endif
#else /* !(defined(__i386__) || defined(__x86_64__)) */
@@ -126,6 +128,7 @@ static void post_suspend(int suspend_can
#endif
+#ifdef CONFIG_PM_SLEEP
struct suspend {
int fast_suspend;
void (*resume_notifier)(int);
@@ -221,7 +224,8 @@ int __xen_suspend(int fast_suspend, void
if (fast_suspend) {
xenbus_suspend();
- err = stop_machine_run(take_machine_down, &suspend, 0);
+ err = stop_machine(take_machine_down, &suspend,
+ &cpumask_of_cpu(0));
if (err < 0)
xenbus_suspend_cancel();
} else {
@@ -244,3 +248,4 @@ int __xen_suspend(int fast_suspend, void
return 0;
}
+#endif
--- head-2011-03-11.orig/drivers/xen/core/reboot.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/reboot.c 2011-02-01 14:38:38.000000000 +0100
@@ -28,17 +28,12 @@ MODULE_LICENSE("Dual BSD/GPL");
/* Ignore multiple shutdown requests. */
static int shutting_down = SHUTDOWN_INVALID;
-/* Was last suspend request cancelled? */
-static int suspend_cancelled;
-
/* Can we leave APs online when we suspend? */
static int fast_suspend;
static void __shutdown_handler(struct work_struct *unused);
static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
-static int setup_suspend_evtchn(void);
-
int __xen_suspend(int fast_suspend, void (*resume_notifier)(int));
static int shutdown_process(void *__unused)
@@ -68,6 +63,13 @@ static int shutdown_process(void *__unus
return 0;
}
+#ifdef CONFIG_PM_SLEEP
+
+static int setup_suspend_evtchn(void);
+
+/* Was last suspend request cancelled? */
+static int suspend_cancelled;
+
static void xen_resume_notifier(int _suspend_cancelled)
{
int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING);
@@ -117,6 +119,10 @@ static int xen_suspend(void *__unused)
return 0;
}
+#else
+# define xen_suspend NULL
+#endif
+
static void switch_shutdown_state(int new_state)
{
int prev_state, old_state = SHUTDOWN_INVALID;
@@ -193,8 +199,10 @@ static void shutdown_handler(struct xenb
new_state = SHUTDOWN_POWEROFF;
else if (strcmp(str, "reboot") == 0)
ctrl_alt_del();
+#ifdef CONFIG_PM_SLEEP
else if (strcmp(str, "suspend") == 0)
new_state = SHUTDOWN_SUSPEND;
+#endif
else if (strcmp(str, "halt") == 0)
new_state = SHUTDOWN_HALT;
else
@@ -245,6 +253,7 @@ static struct xenbus_watch sysrq_watch =
.callback = sysrq_handler
};
+#ifdef CONFIG_PM_SLEEP
static irqreturn_t suspend_int(int irq, void* dev_id)
{
switch_shutdown_state(SHUTDOWN_SUSPEND);
@@ -272,6 +281,9 @@ static int setup_suspend_evtchn(void)
return 0;
}
+#else
+#define setup_suspend_evtchn() 0
+#endif
static int setup_shutdown_watcher(void)
{
--- head-2011-03-11.orig/drivers/xen/core/smpboot.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/smpboot.c 2011-02-01 14:38:38.000000000 +0100
@@ -27,6 +27,7 @@
extern irqreturn_t smp_reschedule_interrupt(int, void *);
extern irqreturn_t smp_call_function_interrupt(int, void *);
+extern irqreturn_t smp_call_function_single_interrupt(int, void *);
extern int local_setup_timer(unsigned int cpu);
extern void local_teardown_timer(unsigned int cpu);
@@ -47,8 +48,10 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
static DEFINE_PER_CPU(int, resched_irq);
static DEFINE_PER_CPU(int, callfunc_irq);
+static DEFINE_PER_CPU(int, call1func_irq);
static char resched_name[NR_CPUS][15];
static char callfunc_name[NR_CPUS][15];
+static char call1func_name[NR_CPUS][15];
void __init prefill_possible_map(void)
{
@@ -64,20 +67,19 @@ void __init prefill_possible_map(void)
break;
#endif
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
- if (rc >= 0)
+ if (rc >= 0) {
cpu_set(i, cpu_possible_map);
+ nr_cpu_ids = i + 1;
+ }
}
}
-void __init smp_alloc_memory(void)
-{
-}
-
static int __cpuinit xen_smp_intr_init(unsigned int cpu)
{
int rc;
- per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
+ per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) =
+ per_cpu(call1func_irq, cpu) = -1;
sprintf(resched_name[cpu], "resched%u", cpu);
rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
@@ -101,6 +103,17 @@ static int __cpuinit xen_smp_intr_init(u
goto fail;
per_cpu(callfunc_irq, cpu) = rc;
+ sprintf(call1func_name[cpu], "call1func%u", cpu);
+ rc = bind_ipi_to_irqhandler(CALL_FUNC_SINGLE_VECTOR,
+ cpu,
+ smp_call_function_single_interrupt,
+ IRQF_DISABLED|IRQF_NOBALANCING,
+ call1func_name[cpu],
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(call1func_irq, cpu) = rc;
+
rc = xen_spinlock_init(cpu);
if (rc < 0)
goto fail;
@@ -115,6 +128,8 @@ static int __cpuinit xen_smp_intr_init(u
unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
if (per_cpu(callfunc_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+ if (per_cpu(call1func_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
xen_spinlock_cleanup(cpu);
return rc;
}
@@ -127,6 +142,7 @@ static void __cpuinit xen_smp_intr_exit(
unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(call1func_irq, cpu), NULL);
xen_spinlock_cleanup(cpu);
}
#endif
@@ -134,11 +150,7 @@ static void __cpuinit xen_smp_intr_exit(
void __cpuinit cpu_bringup(void)
{
cpu_init();
-#ifdef __i386__
identify_secondary_cpu(&current_cpu_data);
-#else
- identify_cpu(&current_cpu_data);
-#endif
touch_softlockup_watchdog();
preempt_disable();
local_irq_enable();
@@ -218,9 +230,6 @@ void __init smp_prepare_cpus(unsigned in
struct task_struct *idle;
int apicid;
struct vcpu_get_physid cpu_id;
-#ifdef __x86_64__
- struct desc_ptr *gdt_descr;
-#endif
void *gdt_addr;
apicid = 0;
@@ -249,20 +258,10 @@ void __init smp_prepare_cpus(unsigned in
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
-#ifdef __x86_64__
- gdt_descr = &cpu_gdt_descr[cpu];
- gdt_descr->address = get_zeroed_page(GFP_KERNEL);
- if (unlikely(!gdt_descr->address)) {
- pr_crit("CPU%d failed to allocate GDT\n", cpu);
- continue;
- }
- gdt_descr->size = GDT_SIZE;
- memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
- gdt_addr = (void *)gdt_descr->address;
-#else
+#ifdef __i386__
init_gdt(cpu);
- gdt_addr = get_cpu_gdt_table(cpu);
#endif
+ gdt_addr = get_cpu_gdt_table(cpu);
make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
apicid = cpu;
@@ -305,8 +304,8 @@ void __init smp_prepare_boot_cpu(void)
{
#ifdef __i386__
init_gdt(smp_processor_id());
- switch_to_new_gdt();
#endif
+ switch_to_new_gdt();
prefill_possible_map();
}
--- head-2011-03-11.orig/drivers/xen/core/spinlock.c 2011-03-15 16:17:10.000000000 +0100
+++ head-2011-03-11/drivers/xen/core/spinlock.c 2011-03-15 16:51:35.000000000 +0100
@@ -5,6 +5,10 @@
* portions of this file.
*/
#define XEN_SPINLOCK_SOURCE
+#include <linux/spinlock_types.h>
+
+#ifdef TICKET_SHIFT
+
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -55,6 +59,7 @@ void __cpuinit xen_spinlock_cleanup(unsi
WARN_ON(HYPERVISOR_event_channel_op(EVTCHNOP_close, &close));
}
+#ifdef CONFIG_PM_SLEEP
void __cpuinit spinlock_resume(void)
{
unsigned int cpu;
@@ -64,6 +69,7 @@ void __cpuinit spinlock_resume(void)
xen_spinlock_init(cpu);
}
}
+#endif
static unsigned int spin_adjust(struct spinning *spinning,
const raw_spinlock_t *lock,
@@ -86,7 +92,7 @@ static unsigned int spin_adjust(struct s
unsigned int xen_spin_adjust(const raw_spinlock_t *lock, unsigned int token)
{
- return spin_adjust(__get_cpu_var(spinning), lock, token);
+ return spin_adjust(x86_read_percpu(spinning), lock, token);
}
bool xen_spin_wait(raw_spinlock_t *lock, unsigned int *ptok,
@@ -99,21 +105,21 @@ bool xen_spin_wait(raw_spinlock_t *lock,
/* If kicker interrupt not initialized yet, just spin. */
if (unlikely(!cpu_online(raw_smp_processor_id()))
- || unlikely(!__get_cpu_var(poll_evtchn)))
+ || unlikely(!x86_read_percpu(poll_evtchn)))
return false;
/* announce we're spinning */
spinning.ticket = *ptok >> TICKET_SHIFT;
spinning.lock = lock;
- spinning.prev = __get_cpu_var(spinning);
+ spinning.prev = x86_read_percpu(spinning);
smp_wmb();
- __get_cpu_var(spinning) = &spinning;
+ x86_write_percpu(spinning, &spinning);
upcall_mask = current_vcpu_info()->evtchn_upcall_mask;
do {
bool nested = false;
- clear_evtchn(__get_cpu_var(poll_evtchn));
+ clear_evtchn(x86_read_percpu(poll_evtchn));
/*
* Check again to make sure it didn't become free while
@@ -126,7 +132,7 @@ bool xen_spin_wait(raw_spinlock_t *lock,
* without rechecking the lock.
*/
if (spinning.prev)
- set_evtchn(__get_cpu_var(poll_evtchn));
+ set_evtchn(x86_read_percpu(poll_evtchn));
rc = true;
break;
}
@@ -153,11 +159,11 @@ bool xen_spin_wait(raw_spinlock_t *lock,
bool kick, free;
other->ticket = -1;
- __raw_spin_unlock_body;
+ __ticket_spin_unlock_body;
if (!kick)
break;
xen_spin_kick(lock, token);
- __raw_spin_lock_preamble;
+ __ticket_spin_lock_preamble;
if (!free)
token = spin_adjust(
other->prev, lock,
@@ -181,7 +187,7 @@ bool xen_spin_wait(raw_spinlock_t *lock,
current_vcpu_info()->evtchn_upcall_mask = upcall_mask;
- rc = !test_evtchn(__get_cpu_var(poll_evtchn));
+ rc = !test_evtchn(x86_read_percpu(poll_evtchn));
if (!rc)
inc_irq_stat(irq_lock_count);
} while (spinning.prev || rc);
@@ -192,11 +198,12 @@ bool xen_spin_wait(raw_spinlock_t *lock,
*/
/* announce we're done */
- __get_cpu_var(spinning) = other = spinning.prev;
+ other = spinning.prev;
+ x86_write_percpu(spinning, other);
raw_local_irq_disable();
- rm_idx = __get_cpu_var(rm_seq.idx);
+ rm_idx = x86_read_percpu(rm_seq.idx);
smp_wmb();
- __get_cpu_var(rm_seq.idx) = rm_idx + 1;
+ x86_write_percpu(rm_seq.idx, rm_idx + 1);
mb();
/*
@@ -211,7 +218,7 @@ bool xen_spin_wait(raw_spinlock_t *lock,
if (other->ticket + 1)
continue;
lock = other->lock;
- __raw_spin_lock_preamble;
+ __ticket_spin_lock_preamble;
if (!free)
token = spin_adjust(other->prev, lock, token);
other->ticket = token >> TICKET_SHIFT;
@@ -220,7 +227,7 @@ bool xen_spin_wait(raw_spinlock_t *lock,
}
rm_idx &= 1;
- while (__get_cpu_var(rm_seq.ctr[rm_idx].counter))
+ while (x86_read_percpu(rm_seq.ctr[rm_idx].counter))
cpu_relax();
raw_local_irq_restore(upcall_mask);
*ptok = lock->cur | (spinning.ticket << TICKET_SHIFT);
@@ -283,3 +290,5 @@ void xen_spin_kick(raw_spinlock_t *lock,
}
}
EXPORT_SYMBOL(xen_spin_kick);
+
+#endif /* TICKET_SHIFT */
--- head-2011-03-11.orig/drivers/xen/fbfront/xenfb.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/fbfront/xenfb.c 2011-02-17 10:11:23.000000000 +0100
@@ -18,6 +18,7 @@
* frame buffer.
*/
+#include <linux/console.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fb.h>
@@ -545,6 +546,28 @@ static unsigned long vmalloc_to_mfn(void
return pfn_to_mfn(vmalloc_to_pfn(address));
}
+static __devinit void
+xenfb_make_preferred_console(void)
+{
+ struct console *c;
+
+ if (console_set_on_cmdline)
+ return;
+
+ acquire_console_sem();
+ for (c = console_drivers; c; c = c->next) {
+ if (!strcmp(c->name, "tty") && c->index == 0)
+ break;
+ }
+ release_console_sem();
+ if (c) {
+ unregister_console(c);
+ c->flags |= CON_CONSDEV;
+ c->flags &= ~CON_PRINTBUFFER; /* don't print again */
+ register_console(c);
+ }
+}
+
static int __devinit xenfb_probe(struct xenbus_device *dev,
const struct xenbus_device_id *id)
{
@@ -665,6 +688,7 @@ static int __devinit xenfb_probe(struct
if (ret < 0)
goto error;
+ xenfb_make_preferred_console();
return 0;
error_nomem:
@@ -884,4 +908,5 @@ static void __exit xenfb_cleanup(void)
module_init(xenfb_init);
module_exit(xenfb_cleanup);
+MODULE_DESCRIPTION("Xen virtual framebuffer device frontend");
MODULE_LICENSE("GPL");
--- head-2011-03-11.orig/drivers/xen/fbfront/xenkbd.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/xen/fbfront/xenkbd.c 2011-02-01 14:38:38.000000000 +0100
@@ -350,4 +350,5 @@ static void __exit xenkbd_cleanup(void)
module_init(xenkbd_init);
module_exit(xenkbd_cleanup);
+MODULE_DESCRIPTION("Xen virtual keyboard/pointer device frontend");
MODULE_LICENSE("GPL");
--- head-2011-03-11.orig/drivers/xen/gntdev/gntdev.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/gntdev/gntdev.c 2011-02-01 14:38:38.000000000 +0100
@@ -400,7 +400,7 @@ static int __init gntdev_init(void)
}
device = device_create(class, NULL, MKDEV(gntdev_major, 0),
- GNTDEV_NAME);
+ NULL, GNTDEV_NAME);
if (IS_ERR(device)) {
pr_err("Error creating gntdev device in xen_class\n");
pr_err("gntdev created, major number = %d\n", gntdev_major);
--- head-2011-03-11.orig/drivers/xen/netback/netback.c 2011-02-09 15:55:20.000000000 +0100
+++ head-2011-03-11/drivers/xen/netback/netback.c 2011-02-01 14:38:38.000000000 +0100
@@ -36,7 +36,7 @@
#include "common.h"
#include <linux/if_vlan.h>
-#include <linux/tcp.h>
+#include <net/tcp.h>
#include <xen/balloon.h>
#include <xen/interface/memory.h>
#include <xen/net-util.h>
@@ -115,7 +115,7 @@ static inline int netif_page_index(struc
*/
#define PKT_PROT_LEN (ETH_HLEN + VLAN_HLEN + \
sizeof(struct iphdr) + MAX_IPOPTLEN + \
- sizeof(struct tcphdr) + 40 /* MAX_TCP_OPTION_SPACE */)
+ sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE)
static struct pending_tx_info {
netif_tx_request_t req;
--- head-2011-03-11.orig/drivers/xen/netfront/accel.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/xen/netfront/accel.c 2011-02-01 14:38:38.000000000 +0100
@@ -28,6 +28,7 @@
* IN THE SOFTWARE.
*/
+#include <linux/version.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/list.h>
--- head-2011-03-11.orig/drivers/xen/netfront/netfront.c 2011-02-09 16:04:02.000000000 +0100
+++ head-2011-03-11/drivers/xen/netfront/netfront.c 2011-02-01 14:38:38.000000000 +0100
@@ -637,7 +637,7 @@ static int network_open(struct net_devic
}
spin_unlock_bh(&np->rx_lock);
- network_maybe_wake_tx(dev);
+ netif_start_queue(dev);
return 0;
}
--- head-2011-03-11.orig/drivers/xen/sfc_netback/accel.h 2010-01-18 15:23:12.000000000 +0100
+++ head-2011-03-11/drivers/xen/sfc_netback/accel.h 2011-02-01 14:38:38.000000000 +0100
@@ -25,6 +25,7 @@
#ifndef NETBACK_ACCEL_H
#define NETBACK_ACCEL_H
+#include <linux/version.h>
#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/tcp.h>
--- head-2011-03-11.orig/drivers/xen/sfc_netfront/accel.h 2011-01-31 17:29:16.000000000 +0100
+++ head-2011-03-11/drivers/xen/sfc_netfront/accel.h 2011-02-01 14:38:38.000000000 +0100
@@ -35,6 +35,7 @@
#include <xen/evtchn.h>
#include <linux/kernel.h>
+#include <linux/version.h>
#include <linux/list.h>
enum netfront_accel_post_status {
--- head-2011-03-11.orig/drivers/xen/xenbus/xenbus_client.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/xenbus/xenbus_client.c 2011-02-01 14:38:38.000000000 +0100
@@ -149,7 +149,7 @@ int xenbus_watch_pathfmt(struct xenbus_d
char *path;
va_start(ap, pathfmt);
- path = kvasprintf(GFP_KERNEL, pathfmt, ap);
+ path = kvasprintf(GFP_NOIO | __GFP_HIGH, pathfmt, ap);
va_end(ap);
if (!path) {
--- head-2011-03-11.orig/drivers/xen/xenbus/xenbus_comms.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/drivers/xen/xenbus/xenbus_comms.c 2011-02-01 14:38:38.000000000 +0100
@@ -248,14 +248,11 @@ int xb_init_comms(void)
intf->rsp_cons = intf->rsp_prod;
}
+#if defined(CONFIG_XEN) || defined(MODULE)
if (xenbus_irq)
unbind_from_irqhandler(xenbus_irq, &xb_waitq);
-#if defined(CONFIG_XEN) || defined(MODULE)
err = bind_caller_port_to_irqhandler(
-#else
- err = bind_evtchn_to_irqhandler(
-#endif
xen_store_evtchn, wake_waiting,
0, "xenbus", &xb_waitq);
if (err <= 0) {
@@ -264,6 +261,20 @@ int xb_init_comms(void)
}
xenbus_irq = err;
+#else
+ if (xenbus_irq) {
+ /* Already have an irq; assume we're resuming */
+ rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
+ } else {
+ err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
+ 0, "xenbus", &xb_waitq);
+ if (err <= 0) {
+ pr_err("XENBUS request irq failed %i\n", err);
+ return err;
+ }
+ xenbus_irq = err;
+ }
+#endif
return 0;
}
--- head-2011-03-11.orig/drivers/xen/xenbus/xenbus_probe.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/drivers/xen/xenbus/xenbus_probe.c 2011-02-01 14:38:38.000000000 +0100
@@ -36,6 +36,7 @@
__FUNCTION__, __LINE__, ##args)
#include <linux/kernel.h>
+#include <linux/version.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/ctype.h>
--- head-2011-03-11.orig/fs/aio.c 2011-03-11 10:58:46.000000000 +0100
+++ head-2011-03-11/fs/aio.c 2011-03-11 10:59:16.000000000 +0100
@@ -1307,7 +1307,7 @@ static int make_aio_fd(struct kioctx *io
int fd;
struct file *file;
- fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx);
+ fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, 0);
if (fd < 0)
return fd;
--- head-2011-03-11.orig/include/Kbuild 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/include/Kbuild 2011-02-01 14:38:38.000000000 +0100
@@ -8,5 +8,6 @@ header-y += mtd/
header-y += rdma/
header-y += video/
header-y += drm/
+header-y += xen/public/
header-y += xen/
header-y += scsi/
--- head-2011-03-11.orig/include/asm-generic/pgtable.h 2011-03-11 10:54:24.000000000 +0100
+++ head-2011-03-11/include/asm-generic/pgtable.h 2011-03-11 10:59:22.000000000 +0100
@@ -156,10 +156,6 @@ static inline void pmdp_set_wrprotect(st
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
-#ifndef arch_change_pte_range
-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
-#endif
-
#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
extern pmd_t pmdp_splitting_flush(struct vm_area_struct *vma,
unsigned long address,
--- head-2011-03-11.orig/arch/x86/include/asm/kexec.h 2011-01-31 14:53:50.000000000 +0100
+++ head-2011-03-11/arch/x86/include/asm/kexec.h 2011-02-01 14:38:38.000000000 +0100
@@ -5,8 +5,21 @@
# define PA_CONTROL_PAGE 0
# define VA_CONTROL_PAGE 1
# define PA_PGD 2
+# ifndef CONFIG_XEN
# define PA_SWAP_PAGE 3
# define PAGES_NR 4
+# else /* CONFIG_XEN */
+/*
+ * The hypervisor interface implicitly requires that all entries (except
+ * for possibly the final one) are arranged in matching PA_/VA_ pairs.
+ */
+# define PA_PMD_0 8
+# define VA_PMD_0 9
+# define PA_PMD_1 10
+# define VA_PMD_1 11
+# define PA_SWAP_PAGE 12
+# define PAGES_NR 13
+# endif /* CONFIG_XEN */
#else
# define PA_CONTROL_PAGE 0
# define VA_CONTROL_PAGE 1
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/desc.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/desc.h 2011-02-01 14:38:38.000000000 +0100
@@ -31,11 +31,17 @@ extern struct desc_ptr idt_descr;
extern gate_desc idt_table[];
#endif
+struct gdt_page {
+ struct desc_struct gdt[GDT_ENTRIES];
+} __attribute__((aligned(PAGE_SIZE)));
+DECLARE_PER_CPU(struct gdt_page, gdt_page);
+
+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+{
+ return per_cpu(gdt_page, cpu).gdt;
+}
+
#ifdef CONFIG_X86_64
-extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
-extern struct desc_ptr cpu_gdt_descr[];
-/* the cpu gdt accessor */
-#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
unsigned dpl, unsigned ist, unsigned seg)
@@ -53,16 +59,6 @@ static inline void pack_gate(gate_desc *
}
#else
-struct gdt_page {
- struct desc_struct gdt[GDT_ENTRIES];
-} __attribute__((aligned(PAGE_SIZE)));
-DECLARE_PER_CPU(struct gdt_page, gdt_page);
-
-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
-{
- return per_cpu(gdt_page, cpu).gdt;
-}
-
static inline void pack_gate(gate_desc *gate, unsigned char type,
unsigned long base, unsigned dpl, unsigned flags,
unsigned short seg)
@@ -333,6 +329,28 @@ static inline void set_intr_gate(unsigne
_set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
}
+#define SYS_VECTOR_FREE 0
+#define SYS_VECTOR_ALLOCED 1
+
+extern int first_system_vector;
+extern char system_vectors[];
+
+static inline void alloc_system_vector(int vector)
+{
+ if (system_vectors[vector] == SYS_VECTOR_FREE) {
+ system_vectors[vector] = SYS_VECTOR_ALLOCED;
+ if (first_system_vector > vector)
+ first_system_vector = vector;
+ } else
+ BUG();
+}
+
+static inline void alloc_intr_gate(unsigned int n, void *addr)
+{
+ alloc_system_vector(n);
+ set_intr_gate(n, addr);
+}
+
/*
* This routine sets up an interrupt gate at directory privilege level 3.
*/
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/fixmap.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/fixmap.h 2011-02-01 14:38:38.000000000 +0100
@@ -7,7 +7,58 @@
# include "fixmap_64.h"
#endif
+extern int fixmaps_set;
+
+void xen_set_fixmap(enum fixed_addresses, maddr_t, pgprot_t);
+
+static inline void __set_fixmap(enum fixed_addresses idx,
+ maddr_t phys, pgprot_t flags)
+{
+ xen_set_fixmap(idx, phys, flags);
+}
+
+#define set_fixmap(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL)
+
+/*
+ * Some hardware wants to get fixmapped without caching.
+ */
+#define set_fixmap_nocache(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+
#define clear_fixmap(idx) \
__set_fixmap(idx, 0, __pgprot(0))
+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+extern void __this_fixmap_does_not_exist(void);
+
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-dereference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+{
+ /*
+ * this branch gets completely eliminated after inlining,
+ * except when someone tries to use fixaddr indices in an
+ * illegal way. (such as mixing up address types or using
+ * out-of-range indices).
+ *
+ * If it doesn't get removed, the linker will complain
+ * loudly with a reasonably clear error message..
+ */
+ if (idx >= __end_of_fixed_addresses)
+ __this_fixmap_does_not_exist();
+
+ return __fix_to_virt(idx);
+}
+
+static inline unsigned long virt_to_fix(const unsigned long vaddr)
+{
+ BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+ return __virt_to_fix(vaddr);
+}
#endif
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/fixmap_32.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/fixmap_32.h 2011-02-01 14:38:38.000000000 +0100
@@ -58,10 +58,17 @@ enum fixed_addresses {
#ifdef CONFIG_X86_LOCAL_APIC
FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
-#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
+#ifndef CONFIG_XEN
+#ifdef CONFIG_X86_IO_APIC
FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
#endif
+#else
+ FIX_SHARED_INFO,
+#define NR_FIX_ISAMAPS 256
+ FIX_ISAMAP_END,
+ FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
+#endif
#ifdef CONFIG_X86_VISWS_APIC
FIX_CO_CPU, /* Cobalt timer */
FIX_CO_APIC, /* Cobalt APIC Redirection Table */
@@ -78,51 +85,38 @@ enum fixed_addresses {
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
#endif
-#ifdef CONFIG_ACPI
- FIX_ACPI_BEGIN,
- FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
-#endif
#ifdef CONFIG_PCI_MMCONFIG
FIX_PCIE_MCFG,
#endif
#ifdef CONFIG_PARAVIRT
FIX_PARAVIRT_BOOTMAP,
#endif
- FIX_SHARED_INFO,
-#define NR_FIX_ISAMAPS 256
- FIX_ISAMAP_END,
- FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
__end_of_permanent_fixed_addresses,
/*
* 256 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
*
- * We round it up to the next 512 pages boundary so that we
+ * We round it up to the next 256 pages boundary so that we
* can have a single pgd entry and a single pte table:
*/
#define NR_FIX_BTMAPS 64
#define FIX_BTMAPS_NESTING 4
- FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
- (__end_of_permanent_fixed_addresses & 511),
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
+ (__end_of_permanent_fixed_addresses & 255),
FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
FIX_WP_TEST,
+#ifdef CONFIG_ACPI
+ FIX_ACPI_BEGIN,
+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
+#endif
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
FIX_OHCI1394_BASE,
#endif
__end_of_fixed_addresses
};
-extern void __set_fixmap(enum fixed_addresses idx,
- maddr_t phys, pgprot_t flags);
extern void reserve_top_address(unsigned long reserve);
-#define set_fixmap(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL)
-/*
- * Some hardware wants to get fixmapped without caching.
- */
-#define set_fixmap_nocache(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
@@ -131,38 +125,5 @@ extern void reserve_top_address(unsigned
#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without tranlation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * this branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
-
- return __fix_to_virt(idx);
-}
-
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
-
#endif /* !__ASSEMBLY__ */
#endif
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/fixmap_64.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/fixmap_64.h 2011-02-01 14:38:38.000000000 +0100
@@ -12,6 +12,7 @@
#define _ASM_FIXMAP_64_H
#include <linux/kernel.h>
+#include <asm/acpi.h>
#include <asm/apicdef.h>
#include <asm/page.h>
#include <asm/vsyscall.h>
@@ -40,7 +41,6 @@ enum fixed_addresses {
VSYSCALL_HPET,
FIX_DBGP_BASE,
FIX_EARLYCON_MEM_BASE,
- FIX_HPET_BASE,
#ifdef CONFIG_X86_LOCAL_APIC
FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
@@ -53,14 +53,21 @@ enum fixed_addresses {
FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
+ MAX_EFI_IO_PAGES - 1,
#endif
+#ifdef CONFIG_PARAVIRT
+ FIX_PARAVIRT_BOOTMAP,
+#else
+ FIX_SHARED_INFO,
+#endif
#ifdef CONFIG_ACPI
FIX_ACPI_BEGIN,
FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
#endif
- FIX_SHARED_INFO,
#define NR_FIX_ISAMAPS 256
FIX_ISAMAP_END,
FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
__end_of_permanent_fixed_addresses,
/*
* 256 temporary boot-time mappings, used by early_ioremap(),
@@ -71,27 +78,12 @@ enum fixed_addresses {
*/
#define NR_FIX_BTMAPS 64
#define FIX_BTMAPS_NESTING 4
- FIX_BTMAP_END =
- __end_of_permanent_fixed_addresses + 512 -
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 -
(__end_of_permanent_fixed_addresses & 511),
FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- FIX_OHCI1394_BASE,
-#endif
__end_of_fixed_addresses
};
-extern void __set_fixmap(enum fixed_addresses idx,
- unsigned long phys, pgprot_t flags);
-
-#define set_fixmap(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL)
-/*
- * Some hardware wants to get fixmapped without caching.
- */
-#define set_fixmap_nocache(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
-
#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
@@ -100,30 +92,4 @@ extern void __set_fixmap(enum fixed_addr
#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without translation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * this branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
-
- return __fix_to_virt(idx);
-}
-
#endif
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/highmem.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/highmem.h 2011-02-01 14:38:38.000000000 +0100
@@ -73,6 +73,9 @@ struct page *kmap_atomic_to_page(void *p
#define flush_cache_kmaps() do { } while (0)
+extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn);
+
void clear_highpage(struct page *);
static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
{
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/hypercall.h 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/hypercall.h 2011-02-01 14:38:38.000000000 +0100
@@ -332,9 +332,19 @@ static inline int __must_check
HYPERVISOR_grant_table_op(
unsigned int cmd, void *uop, unsigned int count)
{
+ bool fixup = false;
+ int rc;
+
if (arch_use_lazy_mmu_mode())
xen_multicall_flush(false);
- return _hypercall3(int, grant_table_op, cmd, uop, count);
+#ifdef GNTTABOP_map_grant_ref
+ if (cmd == GNTTABOP_map_grant_ref)
+#endif
+ fixup = gnttab_pre_map_adjust(cmd, uop, count);
+ rc = _hypercall3(int, grant_table_op, cmd, uop, count);
+ if (rc == 0 && fixup)
+ rc = gnttab_post_map_adjust(uop, count);
+ return rc;
}
static inline int __must_check
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/hypervisor.h 2011-02-01 14:38:38.000000000 +0100
@@ -35,7 +35,6 @@
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/version.h>
#include <linux/errno.h>
#include <xen/interface/xen.h>
#include <xen/interface/platform.h>
@@ -119,6 +118,8 @@ int xen_create_contiguous_region(
unsigned long vstart, unsigned int order, unsigned int address_bits);
void xen_destroy_contiguous_region(
unsigned long vstart, unsigned int order);
+int early_create_contiguous_region(unsigned long pfn, unsigned int order,
+ unsigned int address_bits);
struct page;
@@ -188,6 +189,29 @@ static inline void xen_multicall_flush(b
#endif /* CONFIG_XEN && !MODULE */
+#ifdef CONFIG_XEN
+
+struct gnttab_map_grant_ref;
+bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
+ unsigned int count);
+#if CONFIG_XEN_COMPAT < 0x030400
+int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
+#else
+static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
+ unsigned int count)
+{
+ BUG();
+ return -ENOSYS;
+}
+#endif
+
+#else /* !CONFIG_XEN */
+
+#define gnttab_pre_map_adjust(...) false
+#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; })
+
+#endif /* CONFIG_XEN */
+
#if defined(CONFIG_X86_64)
#define MULTI_UVMFLAGS_INDEX 2
#define MULTI_UVMDOMID_INDEX 3
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/io.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/io.h 2011-02-01 14:38:38.000000000 +0100
@@ -3,20 +3,140 @@
#define ARCH_HAS_IOREMAP_WC
+#include <linux/compiler.h>
+
+/*
+ * early_ioremap() and early_iounmap() are for temporary early boot-time
+ * mappings, before the real ioremap() is functional.
+ * A boot-time mapping is currently limited to at most 16 pages.
+ */
+#ifndef __ASSEMBLY__
+extern void early_ioremap_init(void);
+extern void early_ioremap_clear(void);
+extern void early_ioremap_reset(void);
+extern void *early_ioremap(unsigned long offset, unsigned long size);
+extern void early_iounmap(void *addr, unsigned long size);
+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+#endif
+
+#define build_mmio_read(name, size, type, reg, barrier) \
+static inline type name(const volatile void __iomem *addr) \
+{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
+:"m" (*(volatile type __force *)addr) barrier); return ret; }
+
+#define build_mmio_write(name, size, type, reg, barrier) \
+static inline void name(type val, volatile void __iomem *addr) \
+{ asm volatile("mov" size " %0,%1": :reg (val), \
+"m" (*(volatile type __force *)addr) barrier); }
+
+build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
+build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
+build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
+
+build_mmio_read(__readb, "b", unsigned char, "=q", )
+build_mmio_read(__readw, "w", unsigned short, "=r", )
+build_mmio_read(__readl, "l", unsigned int, "=r", )
+
+build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
+build_mmio_write(writew, "w", unsigned short, "r", :"memory")
+build_mmio_write(writel, "l", unsigned int, "r", :"memory")
+
+build_mmio_write(__writeb, "b", unsigned char, "q", )
+build_mmio_write(__writew, "w", unsigned short, "r", )
+build_mmio_write(__writel, "l", unsigned int, "r", )
+
+#define readb_relaxed(a) __readb(a)
+#define readw_relaxed(a) __readw(a)
+#define readl_relaxed(a) __readl(a)
+#define __raw_readb __readb
+#define __raw_readw __readw
+#define __raw_readl __readl
+
+#define __raw_writeb __writeb
+#define __raw_writew __writew
+#define __raw_writel __writel
+
+#define mmiowb() barrier()
+
+#ifdef CONFIG_X86_64
+build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
+build_mmio_read(__readq, "q", unsigned long, "=r", )
+build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
+build_mmio_write(__writeq, "q", unsigned long, "r", )
+
+#define readq_relaxed(a) __readq(a)
+#define __raw_readq __readq
+#define __raw_writeq writeq
+
+/* Let people know we have them */
+#define readq readq
+#define writeq writeq
+#endif
+
+#define native_io_delay xen_io_delay
+
#ifdef CONFIG_X86_32
-# include "io_32.h"
+# include "../../io_32.h"
#else
-# include "io_64.h"
+# include "../../io_64.h"
+#endif
+
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+
+/* We will be supplying our own /dev/mem implementation */
+#define ARCH_HAS_DEV_MEM
+
+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+#undef page_to_phys
+#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page)))
+#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page)))
+
+#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \
+ (unsigned long)(bv)->bv_offset)
+
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+ (bvec_to_phys(vec1) + (vec1)->bv_len == bvec_to_phys(vec2) \
+ && bvec_to_pseudophys(vec1) + (vec1)->bv_len \
+ == bvec_to_pseudophys(vec2))
+
+#undef virt_to_bus
+#undef bus_to_virt
+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
+#define bus_to_virt(_x) __va(machine_to_phys(_x))
+
+#include <asm/fixmap.h>
+
+#undef __ISA_IO_base
+#undef isa_virt_to_bus
+#undef isa_page_to_bus
+#undef isa_bus_to_virt
+#define isa_virt_to_bus(_x) ({ \
+ unsigned long _va_ = (unsigned long)(_x); \
+ _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \
+ ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \
+ : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); })
+#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
+
#endif
extern void *xlate_dev_mem_ptr(unsigned long phys);
extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
-extern void map_devmem(unsigned long pfn, unsigned long len, pgprot_t);
-extern void unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t);
-
extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
unsigned long prot_val);
extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
+/*
+ * early_ioremap() and early_iounmap() are for temporary early boot-time
+ * mappings, before the real ioremap() is functional.
+ * A boot-time mapping is currently limited to at most 16 pages.
+ */
+extern void early_ioremap_init(void);
+extern void early_ioremap_clear(void);
+extern void early_ioremap_reset(void);
+extern void *early_ioremap(unsigned long offset, unsigned long size);
+extern void early_iounmap(void *addr, unsigned long size);
+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+
+
#endif /* _ASM_X86_IO_H */
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/irq_vectors.h 2011-02-15 17:27:18.000000000 +0100
@@ -0,0 +1,52 @@
+#ifndef _ASM_IRQ_VECTORS_H
+#define _ASM_IRQ_VECTORS_H
+
+#ifdef CONFIG_X86_32
+# define SYSCALL_VECTOR 0x80
+#else
+# define IA32_SYSCALL_VECTOR 0x80
+#endif
+
+#define RESCHEDULE_VECTOR 0
+#define CALL_FUNCTION_VECTOR 1
+#define NMI_VECTOR 0x02
+#define CALL_FUNC_SINGLE_VECTOR 3
+#define NR_IPIS 4
+
+/*
+ * The maximum number of vectors supported by i386 processors
+ * is limited to 256. For processors other than i386, NR_VECTORS
+ * should be changed accordingly.
+ */
+#define NR_VECTORS 256
+
+#define FIRST_VM86_IRQ 3
+#define LAST_VM86_IRQ 15
+#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
+
+/*
+ * The flat IRQ space is divided into two regions:
+ * 1. A one-to-one mapping of real physical IRQs. This space is only used
+ * if we have physical device-access privilege. This region is at the
+ * start of the IRQ space so that existing device drivers do not need
+ * to be modified to translate physical IRQ numbers into our IRQ space.
+ * 2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
+ * are bound using the provided bind/unbind functions.
+ */
+
+#define PIRQ_BASE 0
+#if defined(NR_CPUS) && defined(MAX_IO_APICS)
+# if NR_CPUS < MAX_IO_APICS
+# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
+# else
+# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
+# endif
+#endif
+
+#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
+#define NR_DYNIRQS 256
+
+#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
+#define NR_IRQ_VECTORS NR_IRQS
+
+#endif /* _ASM_IRQ_VECTORS_H */
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/irqflags.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/irqflags.h 2011-02-01 14:38:38.000000000 +0100
@@ -118,7 +118,7 @@ static inline void halt(void)
#ifndef CONFIG_X86_64
#define INTERRUPT_RETURN iret
-#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \
+#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
__TEST_PENDING ; \
jnz 14f /* process more events if necessary... */ ; \
@@ -177,18 +177,6 @@ static inline void trace_hardirqs_fixup_
#else
#ifdef CONFIG_X86_64
-/*
- * Currently paravirt can't handle swapgs nicely when we
- * don't have a stack we can rely on (such as a user space
- * stack). So we either find a way around these or just fault
- * and emulate if a guest tries to call swapgs directly.
- *
- * Either way, this is a good way to document that we don't
- * have a reliable stack. x86_64 only.
- */
-#define SWAPGS_UNSAFE_STACK swapgs
-#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk
-#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk
#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
TRACE_IRQS_ON; \
@@ -200,24 +188,6 @@ static inline void trace_hardirqs_fixup_
TRACE_IRQS_OFF;
#else
-#define ARCH_TRACE_IRQS_ON \
- pushl %eax; \
- pushl %ecx; \
- pushl %edx; \
- call trace_hardirqs_on; \
- popl %edx; \
- popl %ecx; \
- popl %eax;
-
-#define ARCH_TRACE_IRQS_OFF \
- pushl %eax; \
- pushl %ecx; \
- pushl %edx; \
- call trace_hardirqs_off; \
- popl %edx; \
- popl %ecx; \
- popl %eax;
-
#define ARCH_LOCKDEP_SYS_EXIT \
pushl %eax; \
pushl %ecx; \
@@ -231,8 +201,8 @@ static inline void trace_hardirqs_fixup_
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
-# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON
-# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF
+# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
+# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
#else
# define TRACE_IRQS_ON
# define TRACE_IRQS_OFF
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/mmu_context.h 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/mmu_context.h 2011-02-01 14:38:38.000000000 +0100
@@ -1,5 +1,42 @@
+#ifndef __ASM_X86_MMU_CONTEXT_H
+#define __ASM_X86_MMU_CONTEXT_H
+
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+void arch_exit_mmap(struct mm_struct *mm);
+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+
+void mm_pin(struct mm_struct *mm);
+void mm_unpin(struct mm_struct *mm);
+void mm_pin_all(void);
+
+static inline void xen_activate_mm(struct mm_struct *prev,
+ struct mm_struct *next)
+{
+ if (!PagePinned(virt_to_page(next->pgd)))
+ mm_pin(next);
+}
+
+/*
+ * Used for LDT copy/destruction.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+
#ifdef CONFIG_X86_32
# include "mmu_context_32.h"
#else
# include "mmu_context_64.h"
#endif
+
+#define activate_mm(prev, next) \
+do { \
+ xen_activate_mm(prev, next); \
+ switch_mm((prev), (next), NULL); \
+} while (0);
+
+
+#endif /* __ASM_X86_MMU_CONTEXT_H */
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/mmu_context_32.h 2011-02-01 14:38:38.000000000 +0100
@@ -1,32 +1,6 @@
#ifndef __I386_SCHED_H
#define __I386_SCHED_H
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-
-void arch_exit_mmap(struct mm_struct *mm);
-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
-
-void mm_pin(struct mm_struct *mm);
-void mm_unpin(struct mm_struct *mm);
-void mm_pin_all(void);
-
-static inline void xen_activate_mm(struct mm_struct *prev,
- struct mm_struct *next)
-{
- if (!PagePinned(virt_to_page(next->pgd)))
- mm_pin(next);
-}
-
-/*
- * Used for LDT copy/destruction.
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void destroy_context(struct mm_struct *mm);
-
-
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#if 0 /* XEN: no lazy tlb */
@@ -107,10 +81,4 @@ static inline void switch_mm(struct mm_s
#define deactivate_mm(tsk, mm) \
asm("movl %0,%%gs": :"r" (0));
-#define activate_mm(prev, next) \
-do { \
- xen_activate_mm(prev, next); \
- switch_mm((prev), (next), NULL); \
-} while (0)
-
#endif
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/mmu_context_64.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/mmu_context_64.h 2011-02-01 14:38:38.000000000 +0100
@@ -1,23 +1,6 @@
#ifndef __X86_64_MMU_CONTEXT_H
#define __X86_64_MMU_CONTEXT_H
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/pgalloc.h>
-#include <asm/page.h>
-#include <asm/pda.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-
-void arch_exit_mmap(struct mm_struct *mm);
-void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
-
-/*
- * possibly do the LDT unload here?
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void destroy_context(struct mm_struct *mm);
-
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
@@ -58,10 +41,6 @@ static inline void __prepare_arch_switch
}
}
-extern void mm_pin(struct mm_struct *mm);
-extern void mm_unpin(struct mm_struct *mm);
-void mm_pin_all(void);
-
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -124,11 +103,4 @@ do { \
asm volatile("movl %0,%%fs"::"r"(0)); \
} while (0)
-static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
-{
- if (!PagePinned(virt_to_page(next->pgd)))
- mm_pin(next);
- switch_mm(prev, next, NULL);
-}
-
#endif
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pci.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/pci.h 2011-02-01 14:38:38.000000000 +0100
@@ -21,6 +21,8 @@ struct pci_sysdata {
#endif
};
+extern int pci_routeirq;
+
/* scan a bus after allocating a pci_sysdata for it */
extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
int node);
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgalloc.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgalloc.h 2011-02-01 14:38:38.000000000 +0100
@@ -7,6 +7,9 @@
#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
+static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
+
static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable.h 2011-02-07 15:40:30.000000000 +0100
@@ -13,11 +13,12 @@
#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT 7 /* on 4KB pages */
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
-#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and
+#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
+#define _PAGE_BIT_UNUSED2 10
+#define _PAGE_BIT_IO 11 /* Mapped page is I/O or foreign and
* has no associated page struct. */
-#define _PAGE_BIT_UNUSED2 10 /* available for programmer */
-#define _PAGE_BIT_UNUSED3 11
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
/* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -28,34 +29,31 @@
/* if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
-/*
- * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
- * sign-extended value on 32-bit with all 1's in the upper word,
- * which preserves the upper pte values on 64-bit ptes:
- */
-#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT)
-#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW)
-#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER)
-#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT)
-#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD)
-#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
-#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY)
-#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */
-#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */
-#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO)
-#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2)
-#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3)
-#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT)
-#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE)
+#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
+#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
+#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
+#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
+#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
+#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+#define _PAGE_UNUSED2 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED2)
+#define _PAGE_IO (_AT(pteval_t, 1) << _PAGE_BIT_IO)
+#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
+#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
+#define __HAVE_ARCH_PTE_SPECIAL
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX)
+#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#else
-#define _PAGE_NX 0
+#define _PAGE_NX (_AT(pteval_t, 0))
#endif
-#define _PAGE_FILE (_AC(1, L)<<_PAGE_BIT_FILE)
-#define _PAGE_PROTNONE (_AC(1, L)<<_PAGE_BIT_PROTNONE)
+#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#ifndef __ASSEMBLY__
#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
@@ -71,8 +69,8 @@ extern unsigned int __kernel_page_user;
_PAGE_DIRTY | __kernel_page_user)
/* Set of bits not changed in pte_modify */
-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
- _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IO | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
/*
* PAT settings are part of the hypervisor interface, which sets the
@@ -102,19 +100,9 @@ extern unsigned int __kernel_page_user;
#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
_PAGE_ACCESSED)
-#ifdef CONFIG_X86_32
-#define _PAGE_KERNEL_EXEC \
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
-#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
-
-#ifndef __ASSEMBLY__
-extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
-#endif /* __ASSEMBLY__ */
-#else
#define __PAGE_KERNEL_EXEC \
(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
-#endif
#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
@@ -125,25 +113,22 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KE
#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
-/*
- * We don't support GLOBAL page in xenolinux64
- */
-#define MAKE_GLOBAL(x) __pgprot((x))
-
-#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX)
-#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC)
-#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS)
-#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
-#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
-#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
+#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
+#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
+#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
+#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
+#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
/* xwr */
#define __P000 PAGE_NONE
@@ -182,27 +167,27 @@ extern struct list_head pgd_list;
*/
static inline int pte_dirty(pte_t pte)
{
- return __pte_val(pte) & _PAGE_DIRTY;
+ return pte_flags(pte) & _PAGE_DIRTY;
}
static inline int pte_young(pte_t pte)
{
- return __pte_val(pte) & _PAGE_ACCESSED;
+ return pte_flags(pte) & _PAGE_ACCESSED;
}
static inline int pte_write(pte_t pte)
{
- return __pte_val(pte) & _PAGE_RW;
+ return pte_flags(pte) & _PAGE_RW;
}
static inline int pte_file(pte_t pte)
{
- return __pte_val(pte) & _PAGE_FILE;
+ return pte_flags(pte) & _PAGE_FILE;
}
static inline int pte_huge(pte_t pte)
{
- return __pte_val(pte) & _PAGE_PSE;
+ return pte_flags(pte) & _PAGE_PSE;
}
static inline int pte_global(pte_t pte)
@@ -212,12 +197,12 @@ static inline int pte_global(pte_t pte)
static inline int pte_exec(pte_t pte)
{
- return !(__pte_val(pte) & _PAGE_NX);
+ return !(pte_flags(pte) & _PAGE_NX);
}
static inline int pte_special(pte_t pte)
{
- return 0;
+ return pte_flags(pte) & _PAGE_SPECIAL;
}
static inline int pmd_large(pmd_t pte)
@@ -228,22 +213,22 @@ static inline int pmd_large(pmd_t pte)
static inline pte_t pte_mkclean(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY);
+ return __pte_ma(__pte_val(pte) & ~_PAGE_DIRTY);
}
static inline pte_t pte_mkold(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED);
+ return __pte_ma(__pte_val(pte) & ~_PAGE_ACCESSED);
}
static inline pte_t pte_wrprotect(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW);
+ return __pte_ma(__pte_val(pte) & ~_PAGE_RW);
}
static inline pte_t pte_mkexec(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX);
+ return __pte_ma(__pte_val(pte) & ~_PAGE_NX);
}
static inline pte_t pte_mkdirty(pte_t pte)
@@ -268,7 +253,7 @@ static inline pte_t pte_mkhuge(pte_t pte
static inline pte_t pte_clrhuge(pte_t pte)
{
- return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE);
+ return __pte_ma(__pte_val(pte) & ~_PAGE_PSE);
}
static inline pte_t pte_mkglobal(pte_t pte)
@@ -283,35 +268,46 @@ static inline pte_t pte_clrglobal(pte_t
static inline pte_t pte_mkspecial(pte_t pte)
{
- return pte;
+ return __pte_ma(__pte_val(pte) | _PAGE_SPECIAL);
}
extern pteval_t __supported_pte_mask;
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
- return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
- pgprot_val(pgprot)) & __supported_pte_mask);
+ pgprotval_t prot = pgprot_val(pgprot);
+
+ if (prot & _PAGE_PRESENT)
+ prot &= __supported_pte_mask;
+ return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
}
static inline pte_t pfn_pte_ma(phys_addr_t page_nr, pgprot_t pgprot)
{
- return __pte_ma(((page_nr << PAGE_SHIFT) |
- pgprot_val(pgprot)) & __supported_pte_mask);
+ pgprotval_t prot = pgprot_val(pgprot);
+
+ if (prot & _PAGE_PRESENT)
+ prot &= __supported_pte_mask;
+ return __pte_ma((page_nr << PAGE_SHIFT) | prot);
}
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
- return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
- pgprot_val(pgprot)) & __supported_pte_mask);
+ pgprotval_t prot = pgprot_val(pgprot);
+
+ if (prot & _PAGE_PRESENT)
+ prot &= __supported_pte_mask;
+ return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | prot);
}
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
- pteval_t val = pte_val(pte);
+ pgprotval_t prot = pgprot_val(newprot);
+ pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
- val &= _PAGE_CHG_MASK;
- val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
+ if (prot & _PAGE_PRESENT)
+ prot &= __supported_pte_mask;
+ val |= prot & ~_PAGE_CHG_MASK;
return __pte(val);
}
@@ -325,9 +321,11 @@ static inline pgprot_t pgprot_modify(pgp
return __pgprot(preservebits | addbits);
}
-#define pte_pgprot(x) __pgprot(__pte_val(x) & ~PTE_MASK)
+#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
-#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
+#define canon_pgprot(p) __pgprot(pgprot_val(p) & _PAGE_PRESENT \
+ ? pgprot_val(p) & __supported_pte_mask \
+ : pgprot_val(p))
#ifndef __ASSEMBLY__
#define __HAVE_PHYS_MEM_ACCESS_PROT
@@ -338,6 +336,17 @@ int phys_mem_access_prot_allowed(struct
unsigned long size, pgprot_t *vma_prot);
#endif
+/* Install a pte for a particular vaddr in kernel space. */
+void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+
+#ifndef CONFIG_XEN
+extern void native_pagetable_setup_start(pgd_t *base);
+extern void native_pagetable_setup_done(pgd_t *base);
+#else
+static inline void xen_pagetable_setup_start(pgd_t *base) {}
+static inline void xen_pagetable_setup_done(pgd_t *base) {}
+#endif
+
#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
@@ -373,6 +382,26 @@ int phys_mem_access_prot_allowed(struct
# include "pgtable_64.h"
#endif
+/*
+ * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
+ *
+ * this macro returns the index of the entry in the pgd page which would
+ * control the given virtual address
+ */
+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+
+/*
+ * pgd_offset() returns a (pgd_t *)
+ * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
+ */
+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+/*
+ * a shortcut which implies the use of the kernel's pgd, instead
+ * of a process's
+ */
+#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
+
+
#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
@@ -383,8 +412,15 @@ enum {
PG_LEVEL_4K,
PG_LEVEL_2M,
PG_LEVEL_1G,
+ PG_LEVEL_NUM
};
+#ifdef CONFIG_PROC_FS
+extern void update_page_count(int level, unsigned long pages);
+#else
+static inline void update_page_count(int level, unsigned long pages) { }
+#endif
+
/*
* Helper function that returns the kernel pagetable entry controlling
* the virtual address 'address'. NULL means no pagetable entry present.
@@ -441,6 +477,8 @@ static inline void xen_pte_clear(struct
* race with other CPU's that might be updating the dirty
* bit at the same time.
*/
+struct vm_area_struct;
+
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
@@ -523,9 +561,6 @@ static inline void clone_pgd_range(pgd_t
memcpy(dst, src, count * sizeof(pgd_t));
}
-#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
- xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
-
#define arbitrary_virt_to_machine(va) \
({ \
unsigned int __lvl; \
@@ -548,6 +583,34 @@ struct page *kmap_atomic_to_page(void *)
#define ptep_to_machine(ptep) virt_to_machine(ptep)
#endif
+#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
+static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+#if CONFIG_XEN_COMPAT < 0x030300
+ if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)))
+ return ptep_get_and_clear(mm, addr, ptep);
+#endif
+ return *ptep;
+}
+
+static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ mmu_update_t u;
+
+#if CONFIG_XEN_COMPAT < 0x030300
+ if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) {
+ set_pte_at(mm, addr, ptep, pte);
+ return;
+ }
+#endif
+ u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD;
+ u.val = __pte_val(pte);
+ if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
#include <asm-generic/pgtable.h>
#include <xen/features.h>
@@ -573,10 +636,6 @@ int create_lookup_pte_addr(struct mm_str
unsigned long address,
uint64_t *ptep);
-int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable);
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_H */
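The ptep_modify_prot_start()/ptep_modify_prot_commit() pair added above brackets a read-modify-write of a live PTE: on hypervisors advertising XENFEAT_mmu_pt_update_preserve_ad the old entry is left in place and the new value is handed to Xen with MMU_PT_UPDATE_PRESERVE_AD, so the accessed/dirty bits are not lost to a clear-and-rewrite; on older Xen it falls back to get-and-clear plus set_pte_at(). A minimal usage sketch, not part of the patch (change_one_pte() is a hypothetical helper; the caller is assumed to hold the page table lock):

	static void change_one_pte(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, pgprot_t newprot)
	{
		/* snapshot the old entry (cleared on Xen lacking preserve-AD) */
		pte_t old_pte = ptep_modify_prot_start(mm, addr, ptep);
		pte_t new_pte = pte_modify(old_pte, newprot);

		/* write back in one hypercall, preserving accessed/dirty */
		ptep_modify_prot_commit(mm, addr, ptep, new_pte);
	}
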
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable-3level.h 2011-02-01 14:38:38.000000000 +0100
@@ -14,11 +14,11 @@
#define pmd_ERROR(e) \
printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \
__FILE__, __LINE__, &(e), __pmd_val(e), \
- (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT)
+ (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
#define pgd_ERROR(e) \
printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \
__FILE__, __LINE__, &(e), __pgd_val(e), \
- (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT)
+ (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
static inline int pud_none(pud_t pud)
{
@@ -27,7 +27,7 @@ static inline int pud_none(pud_t pud)
}
static inline int pud_bad(pud_t pud)
{
- return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
+ return (__pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
}
static inline int pud_present(pud_t pud)
@@ -102,9 +102,9 @@ static inline void pud_clear(pud_t *pudp
xen_tlb_flush();
}
-#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_MASK))
+#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK))
-#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_MASK))
+#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
/* Find an entry in the second-level page table.. */
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-01 14:38:38.000000000 +0100
@@ -89,10 +89,10 @@ extern unsigned long pg0[];
/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
can temporarily clear it. */
#define pmd_present(x) (__pmd_val(x))
-#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
+#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
#else
#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
-#define pmd_bad(x) ((__pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+#define pmd_bad(x) ((__pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
#endif
@@ -119,26 +119,6 @@ extern unsigned long pg0[];
*/
#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-/*
- * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
- *
- * this macro returns the index of the entry in the pgd page which would
- * control the given virtual address
- */
-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
-#define pgd_index_k(addr) pgd_index((addr))
-
-/*
- * pgd_offset() returns a (pgd_t *)
- * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
- */
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
-
-/*
- * a shortcut which implies the use of the kernel's pgd, instead
- * of a process's
- */
-#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
static inline int pud_large(pud_t pud) { return 0; }
@@ -165,7 +145,7 @@ static inline int pud_large(pud_t pud) {
#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
#define pmd_page_vaddr(pmd) \
- ((unsigned long)__va(pmd_val((pmd)) & PTE_MASK))
+ ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
#if defined(CONFIG_HIGHPTE)
#define pte_offset_map(dir, address) \
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-01 14:38:38.000000000 +0100
@@ -23,6 +23,8 @@ extern void xen_init_pt(void);
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
+extern pmd_t level2_fixmap_pgt[512];
+extern pmd_t level2_ident_pgt[512];
extern pgd_t init_level4_pgt[];
#define swapper_pg_dir init_level4_pgt
@@ -79,6 +81,9 @@ extern void paging_init(void);
struct mm_struct;
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
+
+
#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
static inline void xen_set_pte(pte_t *ptep, pte_t pte)
@@ -145,7 +150,7 @@ static inline void xen_pgd_clear(pgd_t *
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-#define MAXMEM _AC(0x0000006fffffffff, UL)
+#define MAXMEM _AC(0x000004ffffffffff, UL)
#define VMALLOC_START _AC(0xffffc20000000000, UL)
#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
#define VMEMMAP_START _AC(0xffffe20000000000, UL)
@@ -157,17 +162,17 @@ static inline void xen_pgd_clear(pgd_t *
static inline int pgd_bad(pgd_t pgd)
{
- return (__pgd_val(pgd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+ return (__pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
}
static inline int pud_bad(pud_t pud)
{
- return (__pud_val(pud) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+ return (__pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
}
static inline int pmd_bad(pmd_t pmd)
{
- return (__pmd_val(pmd) & ~(PTE_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+ return (__pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
}
#define pte_none(x) (!(x).pte)
@@ -175,7 +180,7 @@ static inline int pmd_bad(pmd_t pmd)
#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
-#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
+#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
#define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \
__pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \
@@ -200,11 +205,8 @@ static inline int pmd_bad(pmd_t pmd)
* Level 4 access.
*/
#define pgd_page_vaddr(pgd) \
- ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK))
+ ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
-#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address)))
#define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT)
static inline int pgd_large(pgd_t pgd) { return 0; }
#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
@@ -226,7 +228,7 @@ static inline int pud_large(pud_t pte)
}
/* PMD - Level 2 access */
-#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_MASK))
+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:42:13.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/processor.h 2011-03-03 16:44:23.000000000 +0100
@@ -144,7 +144,7 @@ extern __u32 cleared_cpu_caps[NCAPINTS
#ifdef CONFIG_SMP
DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
#define cpu_data(cpu) per_cpu(cpu_info, cpu)
-#define current_cpu_data cpu_data(smp_processor_id())
+#define current_cpu_data __get_cpu_var(cpu_info)
#else
#define cpu_data(cpu) boot_cpu_data
#define current_cpu_data boot_cpu_data
@@ -163,7 +163,7 @@ static inline int hlt_works(int cpu)
extern void cpu_detect(struct cpuinfo_x86 *c);
-extern void identify_cpu(struct cpuinfo_x86 *);
+extern void early_cpu_init(void);
extern void identify_boot_cpu(void);
extern void identify_secondary_cpu(struct cpuinfo_x86 *);
extern void print_cpu_info(struct cpuinfo_x86 *);
@@ -277,15 +277,11 @@ struct tss_struct {
struct thread_struct *io_bitmap_owner;
/*
- * Pad the TSS to be cacheline-aligned (size is 0x100):
- */
- unsigned long __cacheline_filler[35];
- /*
* .. and then another 0x100 bytes for the emergency kernel stack:
*/
unsigned long stack[64];
-} __attribute__((packed));
+} ____cacheline_aligned;
DECLARE_PER_CPU(struct tss_struct, init_tss);
@@ -677,11 +673,36 @@ static inline void __sti_mwait(unsigned
extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
-extern int force_mwait;
-
extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern unsigned long boot_option_idle_override;
+extern unsigned long idle_halt;
+extern unsigned long idle_nomwait;
+
+#ifndef CONFIG_XEN
+/*
+ * on systems with caches, caches must be flushed as the absolute
+ * last instruction before going into a suspended halt. Otherwise,
+ * dirty data can linger in the cache and become stale on resume,
+ * leading to strange errors.
+ *
+ * perform a variety of operations to guarantee that the compiler
+ * will not reorder instructions. wbinvd itself is serializing
+ * so the processor will not reorder.
+ *
+ * Systems without cache can just go into halt.
+ */
+static inline void wbinvd_halt(void)
+{
+ mb();
+ /* check for clflush to determine if wbinvd is legal */
+ if (cpu_has_clflush)
+ asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
+ else
+ while (1)
+ halt();
+}
+#endif
extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/smp.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/smp.h 2011-02-01 14:38:38.000000000 +0100
@@ -25,25 +25,18 @@ extern cpumask_t cpu_initialized;
extern void (*mtrr_hook)(void);
extern void zap_low_mappings(void);
+extern int __cpuinit get_local_pda(int cpu);
+
extern unsigned int num_processors;
extern cpumask_t cpu_initialized;
#ifndef CONFIG_XEN
-#ifdef CONFIG_SMP
-extern u16 x86_cpu_to_apicid_init[];
-extern u16 x86_bios_cpu_apicid_init[];
-extern void *x86_cpu_to_apicid_early_ptr;
-extern void *x86_bios_cpu_apicid_early_ptr;
-#else
-#define x86_cpu_to_apicid_early_ptr NULL
-#define x86_bios_cpu_apicid_early_ptr NULL
-#endif
-
DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
DECLARE_PER_CPU(cpumask_t, cpu_core_map);
DECLARE_PER_CPU(u16, cpu_llc_id);
-DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
-DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
+
+DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
+DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
#endif
#ifdef CONFIG_SMP
@@ -64,9 +57,9 @@ struct smp_ops {
void (*smp_send_stop)(void);
void (*smp_send_reschedule)(int cpu);
- int (*smp_call_function_mask)(cpumask_t mask,
- void (*func)(void *info), void *info,
- int wait);
+
+ void (*send_call_func_ipi)(cpumask_t mask);
+ void (*send_call_func_single_ipi)(int cpu);
};
/* Globals due to paravirt */
@@ -104,11 +97,14 @@ static inline void smp_send_reschedule(i
smp_ops.smp_send_reschedule(cpu);
}
-static inline int smp_call_function_mask(cpumask_t mask,
- void (*func) (void *info), void *info,
- int wait)
+static inline void arch_send_call_function_single_ipi(int cpu)
+{
+ smp_ops.send_call_func_single_ipi(cpu);
+}
+
+static inline void arch_send_call_function_ipi(cpumask_t mask)
{
- return smp_ops.smp_call_function_mask(mask, func, info, wait);
+ smp_ops.send_call_func_ipi(mask);
}
void native_smp_prepare_boot_cpu(void);
@@ -120,23 +116,19 @@ int native_cpu_up(unsigned int cpunum);
void xen_smp_send_stop(void);
void xen_smp_send_reschedule(int cpu);
-int xen_smp_call_function_mask(cpumask_t mask,
- void (*func) (void *info), void *info,
- int wait);
+void xen_send_call_func_ipi(cpumask_t mask);
+void xen_send_call_func_single_ipi(int cpu);
#define smp_send_stop xen_smp_send_stop
#define smp_send_reschedule xen_smp_send_reschedule
-#define smp_call_function_mask xen_smp_call_function_mask
-
-extern void prefill_possible_map(void);
+#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi
+#define arch_send_call_function_ipi xen_send_call_func_ipi
#endif /* CONFIG_XEN */
extern int __cpu_disable(void);
extern void __cpu_die(unsigned int cpu);
-extern void prefill_possible_map(void);
-
void smp_store_cpu_info(int id);
#define cpu_physical_id(cpu) (cpu)
@@ -147,6 +139,14 @@ static inline int num_booting_cpus(void)
}
#endif /* CONFIG_SMP */
+#if defined(CONFIG_SMP) && (defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_XEN))
+extern void prefill_possible_map(void);
+#else
+static inline void prefill_possible_map(void)
+{
+}
+#endif
+
extern unsigned disabled_cpus __cpuinitdata;
#ifdef CONFIG_X86_32_SMP
@@ -214,12 +214,8 @@ static inline int hard_smp_processor_id(
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_HOTPLUG_CPU
-extern void cpu_exit_clear(void);
extern void cpu_uninit(void);
#endif
-extern void smp_alloc_memory(void);
-extern void lock_ipi_call_lock(void);
-extern void unlock_ipi_call_lock(void);
#endif /* __ASSEMBLY__ */
#endif
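The smp_ops change above follows the 2.6.27 generic IPI rework: the architecture no longer implements smp_call_function_mask() itself; kernel/smp.c queues the call data and only asks the architecture to raise the IPI through arch_send_call_function_ipi() / arch_send_call_function_single_ipi(). Callers keep using the generic entry points; a sketch of what ends up in these hooks (illustrative only, do_remote_work() and kick_cpu() are hypothetical):

	static void do_remote_work(void *info)
	{
		/* runs on the target CPU in IPI context */
	}

	static void kick_cpu(int cpu)
	{
		/* run on one remote CPU and wait for it to finish */
		smp_call_function_single(cpu, do_remote_work, NULL, 1);

		/* run on all other online CPUs */
		smp_call_function(do_remote_work, NULL, 1);
	}
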
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/spinlock.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/spinlock.h 2011-02-01 14:38:38.000000000 +0100
@@ -38,6 +38,8 @@
# define UNLOCK_LOCK_PREFIX
#endif
+#ifdef TICKET_SHIFT
+
#include <asm/irqflags.h>
int xen_spinlock_init(unsigned int cpu);
@@ -65,14 +67,14 @@ void xen_spin_kick(raw_spinlock_t *, uns
* much between them in performance though, especially as locks are out of line.
*/
#if TICKET_SHIFT == 8
-#define __raw_spin_lock_preamble \
+#define __ticket_spin_lock_preamble \
asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
"cmpb %h0, %b0\n\t" \
"sete %1" \
: "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
: "0" (0x0100) \
: "memory", "cc")
-#define __raw_spin_lock_body \
+#define __ticket_spin_lock_body \
asm("1:\t" \
"cmpb %h0, %b0\n\t" \
"je 2f\n\t" \
@@ -86,7 +88,7 @@ void xen_spin_kick(raw_spinlock_t *, uns
: "+Q" (token), "+g" (count) \
: "m" (lock->slock) \
: "memory", "cc")
-#define __raw_spin_unlock_body \
+#define __ticket_spin_unlock_body \
asm(UNLOCK_LOCK_PREFIX "incb %2\n\t" \
"movzwl %2, %0\n\t" \
"cmpb %h0, %b0\n\t" \
@@ -95,7 +97,7 @@ void xen_spin_kick(raw_spinlock_t *, uns
: \
: "memory", "cc")
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
{
int tmp, new;
@@ -114,7 +116,7 @@ static __always_inline int __raw_spin_tr
return tmp;
}
#elif TICKET_SHIFT == 16
-#define __raw_spin_lock_preamble \
+#define __ticket_spin_lock_preamble \
do { \
unsigned int tmp; \
asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
@@ -126,7 +128,7 @@ static __always_inline int __raw_spin_tr
: "0" (0x00010000) \
: "memory", "cc"); \
} while (0)
-#define __raw_spin_lock_body \
+#define __ticket_spin_lock_body \
do { \
unsigned int tmp; \
asm("shldl $16, %0, %2\n" \
@@ -144,7 +146,7 @@ static __always_inline int __raw_spin_tr
: "m" (lock->slock) \
: "memory", "cc"); \
} while (0)
-#define __raw_spin_unlock_body \
+#define __ticket_spin_unlock_body \
do { \
unsigned int tmp; \
asm(UNLOCK_LOCK_PREFIX "incw %2\n\t" \
@@ -158,7 +160,7 @@ static __always_inline int __raw_spin_tr
: "memory", "cc"); \
} while (0)
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
{
int tmp;
int new;
@@ -181,27 +183,27 @@ static __always_inline int __raw_spin_tr
}
#endif
-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
{
int tmp = ACCESS_ONCE(lock->slock);
return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
}
-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
{
int tmp = ACCESS_ONCE(lock->slock);
return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
}
-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
{
unsigned int token, count;
unsigned int flags = __raw_local_irq_save();
bool free;
- __raw_spin_lock_preamble;
+ __ticket_spin_lock_preamble;
if (likely(free)) {
raw_local_irq_restore(flags);
return;
@@ -210,41 +212,154 @@ static __always_inline void __raw_spin_l
raw_local_irq_restore(flags);
do {
count = 1 << 10;
- __raw_spin_lock_body;
+ __ticket_spin_lock_body;
} while (unlikely(!count) && !xen_spin_wait(lock, &token, flags));
}
-static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
- unsigned long flags)
+static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock,
+ unsigned long flags)
{
unsigned int token, count;
bool free;
- __raw_spin_lock_preamble;
+ __ticket_spin_lock_preamble;
if (likely(free))
return;
token = xen_spin_adjust(lock, token);
do {
count = 1 << 10;
- __raw_spin_lock_body;
+ __ticket_spin_lock_body;
} while (unlikely(!count) && !xen_spin_wait(lock, &token, flags));
}
-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
{
unsigned int token;
bool kick;
- __raw_spin_unlock_body;
+ __ticket_spin_unlock_body;
if (kick)
xen_spin_kick(lock, token);
}
#ifndef XEN_SPINLOCK_SOURCE
-#undef __raw_spin_lock_preamble
-#undef __raw_spin_lock_body
-#undef __raw_spin_unlock_body
+#undef __ticket_spin_lock_preamble
+#undef __ticket_spin_lock_body
+#undef __ticket_spin_unlock_body
+#endif
+
+#define __raw_spin(n) __ticket_spin_##n
+
+#else /* TICKET_SHIFT */
+
+static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
+static inline void xen_spinlock_cleanup(unsigned int cpu) {}
+
+/*
+ * Define virtualization-friendly old-style lock byte lock, for use in
+ * pv_lock_ops if desired.
+ *
+ * This differs from the pre-2.6.24 spinlock by always using xchgb
+ * rather than decb to take the lock; this allows it to use a
+ * zero-initialized lock structure. It also maintains a 1-byte
+ * contention counter, so that we can implement
+ * __byte_spin_is_contended.
+ */
+struct __byte_spinlock {
+ s8 lock;
+#if NR_CPUS < 256
+ s8 spinners;
+#else
+#error NR_CPUS >= 256 support not implemented
#endif
+};
+
+static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ return bl->lock != 0;
+}
+
+static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ return bl->spinners != 0;
+}
+
+static inline void __byte_spin_lock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ s8 val = 1;
+
+ asm("1: xchgb %1, %0\n"
+ " test %1,%1\n"
+ " jz 3f\n"
+ " " LOCK_PREFIX "incb %2\n"
+ "2: rep;nop\n"
+ " cmpb $1, %0\n"
+ " je 2b\n"
+ " " LOCK_PREFIX "decb %2\n"
+ " jmp 1b\n"
+ "3:"
+ : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
+}
+
+#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock)
+
+static inline int __byte_spin_trylock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ u8 old = 1;
+
+ asm("xchgb %1,%0"
+ : "+m" (bl->lock), "+q" (old) : : "memory");
+
+ return old == 0;
+}
+
+static inline void __byte_spin_unlock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ smp_wmb();
+ bl->lock = 0;
+}
+
+#define __raw_spin(n) __byte_spin_##n
+
+#endif /* TICKET_SHIFT */
+
+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+{
+ return __raw_spin(is_locked)(lock);
+}
+
+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+{
+ return __raw_spin(is_contended)(lock);
+}
+
+static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+ __raw_spin(lock)(lock);
+}
+
+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+ return __raw_spin(trylock)(lock);
+}
+
+static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+ __raw_spin(unlock)(lock);
+}
+
+static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
+ unsigned long flags)
+{
+ __raw_spin(lock_flags)(lock, flags);
+}
+
+#undef __raw_spin
static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
{
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/spinlock_types.h 2011-01-31 18:01:51.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/spinlock_types.h 2011-02-01 14:38:38.000000000 +0100
@@ -11,6 +11,10 @@ typedef union {
unsigned int slock;
struct {
/*
+ * Xen versions prior to 3.2.x have a race condition with HYPERVISOR_poll().
+ */
+#if CONFIG_XEN_COMPAT >= 0x030200
+/*
* On Xen we support a single level of interrupt re-enabling per lock. Hence
* we can have twice as many outstanding tickets. Thus the cut-off for using
* byte register pairs must be at half the number of CPUs.
@@ -22,6 +26,7 @@ typedef union {
# define TICKET_SHIFT 16
u16 cur, seq;
#endif
+#endif
};
} raw_spinlock_t;
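For reference, the __ticket_spin_*() helpers renamed in the spinlock.h hunk above all work on this encoding: the low TICKET_SHIFT bits of slock hold the ticket currently being served (cur) and the bits above them hold the next ticket to hand out (seq); the lock is free when both halves match. A small decoding example (illustrative only, assuming the TICKET_SHIFT == 8 layout):

	unsigned int tmp = 0x0502;	/* cur = 0x02, seq = 0x05 */
	int locked    = !!(((tmp >> 8) ^ tmp) & 0xff);		/* 1: lock is held */
	int contended = ((((tmp >> 8) - tmp) & 0xff) > 1);	/* 1: holder plus two waiters */
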
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/system.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/system.h 2011-03-03 15:58:55.000000000 +0100
@@ -68,10 +68,12 @@ do { \
[next] "d" (next)); \
} while (0)
+#ifndef CONFIG_XEN
/*
* disable hlt during certain critical i/o operations
*/
#define HAVE_DISABLE_HLT
+#endif
#else
#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
@@ -137,7 +139,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
-extern void load_gs_index(unsigned);
+extern void xen_load_gs_index(unsigned);
/*
* Load a segment. Fall back on loading the zero
@@ -154,14 +156,14 @@ extern void load_gs_index(unsigned);
"jmp 2b\n" \
".previous\n" \
_ASM_EXTABLE(1b,3b) \
- : :"r" (value), "r" (0))
+ : :"r" (value), "r" (0) : "memory")
/*
* Save a segment register away
*/
#define savesegment(seg, value) \
- asm volatile("mov %%" #seg ",%0":"=rm" (value))
+ asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
static inline unsigned long get_limit(unsigned long segment)
{
@@ -269,6 +271,7 @@ static inline void xen_wbinvd(void)
#ifdef CONFIG_X86_64
#define read_cr8() (xen_read_cr8())
#define write_cr8(x) (xen_write_cr8(x))
+#define load_gs_index xen_load_gs_index
#endif
/* Clear the 'TS' bit */
@@ -287,13 +290,12 @@ static inline void clflush(volatile void
void disable_hlt(void);
void enable_hlt(void);
-extern int es7000_plat;
void cpu_idle_wait(void);
extern unsigned long arch_align_stack(unsigned long sp);
extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
-void default_idle(void);
+void xen_idle(void);
/*
* Force strict CPU ordering.
--- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/xor_64.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/arch/x86/include/mach-xen/asm/xor_64.h 2011-02-01 14:38:38.000000000 +0100
@@ -1,3 +1,6 @@
+#ifndef ASM_X86__XOR_64_H
+#define ASM_X86__XOR_64_H
+
/*
* x86-64 changes / gcc fixes from Andi Kleen.
* Copyright 2002 Andi Kleen, SuSE Labs.
@@ -330,3 +333,5 @@ do { \
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+
+#endif /* ASM_X86__XOR_64_H */
--- head-2011-03-11.orig/arch/x86/include/mach-xen/irq_vectors.h 2008-09-25 13:55:32.000000000 +0200
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,125 +0,0 @@
-/*
- * This file should contain #defines for all of the interrupt vector
- * numbers used by this architecture.
- *
- * In addition, there are some standard defines:
- *
- * FIRST_EXTERNAL_VECTOR:
- * The first free place for external interrupts
- *
- * SYSCALL_VECTOR:
- * The IRQ vector a syscall makes the user to kernel transition
- * under.
- *
- * TIMER_IRQ:
- * The IRQ number the timer interrupt comes in at.
- *
- * NR_IRQS:
- * The total number of interrupt vectors (including all the
- * architecture specific interrupts) needed.
- *
- */
-#ifndef _ASM_IRQ_VECTORS_H
-#define _ASM_IRQ_VECTORS_H
-
-/*
- * IDT vectors usable for external interrupt sources start
- * at 0x20:
- */
-#define FIRST_EXTERNAL_VECTOR 0x20
-
-#define SYSCALL_VECTOR 0x80
-
-/*
- * Vectors 0x20-0x2f are used for ISA interrupts.
- */
-
-#if 0
-/*
- * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
- *
- * some of the following vectors are 'rare', they are merged
- * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
- * TLB, reschedule and local APIC vectors are performance-critical.
- *
- * Vectors 0xf0-0xfa are free (reserved for future Linux use).
- */
-#define SPURIOUS_APIC_VECTOR 0xff
-#define ERROR_APIC_VECTOR 0xfe
-#define INVALIDATE_TLB_VECTOR 0xfd
-#define RESCHEDULE_VECTOR 0xfc
-#define CALL_FUNCTION_VECTOR 0xfb
-
-#define THERMAL_APIC_VECTOR 0xf0
-/*
- * Local APIC timer IRQ vector is on a different priority level,
- * to work around the 'lost local interrupt if more than 2 IRQ
- * sources per level' errata.
- */
-#define LOCAL_TIMER_VECTOR 0xef
-#endif
-
-#define SPURIOUS_APIC_VECTOR 0xff
-#define ERROR_APIC_VECTOR 0xfe
-
-/*
- * First APIC vector available to drivers: (vectors 0x30-0xee)
- * we start at 0x31 to spread out vectors evenly between priority
- * levels. (0x80 is the syscall vector)
- */
-#define FIRST_DEVICE_VECTOR 0x31
-#define FIRST_SYSTEM_VECTOR 0xef
-
-/*
- * 16 8259A IRQ's, 208 potential APIC interrupt sources.
- * Right now the APIC is mostly only used for SMP.
- * 256 vectors is an architectural limit. (we can have
- * more than 256 devices theoretically, but they will
- * have to use shared interrupts)
- * Since vectors 0x00-0x1f are used/reserved for the CPU,
- * the usable vector space is 0x20-0xff (224 vectors)
- */
-
-#define RESCHEDULE_VECTOR 0
-#define CALL_FUNCTION_VECTOR 1
-#define NR_IPIS 2
-
-/*
- * The maximum number of vectors supported by i386 processors
- * is limited to 256. For processors other than i386, NR_VECTORS
- * should be changed accordingly.
- */
-#define NR_VECTORS 256
-
-#define FPU_IRQ 13
-
-#define FIRST_VM86_IRQ 3
-#define LAST_VM86_IRQ 15
-#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
-
-/*
- * The flat IRQ space is divided into two regions:
- * 1. A one-to-one mapping of real physical IRQs. This space is only used
- * if we have physical device-access privilege. This region is at the
- * start of the IRQ space so that existing device drivers do not need
- * to be modified to translate physical IRQ numbers into our IRQ space.
- * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
- * are bound using the provided bind/unbind functions.
- */
-
-#define PIRQ_BASE 0
-#if !defined(MAX_IO_APICS)
-# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
-#elif NR_CPUS < MAX_IO_APICS
-# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS)
-#else
-# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS)
-#endif
-
-#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS)
-#define NR_DYNIRQS 256
-
-#define NR_IRQS (NR_PIRQS + NR_DYNIRQS)
-#define NR_IRQ_VECTORS NR_IRQS
-
-#endif /* _ASM_IRQ_VECTORS_H */
--- head-2011-03-11.orig/arch/x86/include/mach-xen/setup_arch_post.h 2007-06-12 13:14:13.000000000 +0200
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,63 +0,0 @@
-/**
- * machine_specific_* - Hooks for machine specific setup.
- *
- * Description:
- * This is included late in kernel/setup.c so that it can make
- * use of all of the static functions.
- **/
-
-#include <xen/interface/callback.h>
-
-extern void hypervisor_callback(void);
-extern void failsafe_callback(void);
-extern void nmi(void);
-
-static void __init machine_specific_arch_setup(void)
-{
- int ret;
- static struct callback_register __initdata event = {
- .type = CALLBACKTYPE_event,
- .address = (unsigned long) hypervisor_callback,
- };
- static struct callback_register __initdata failsafe = {
- .type = CALLBACKTYPE_failsafe,
- .address = (unsigned long)failsafe_callback,
- };
- static struct callback_register __initdata syscall = {
- .type = CALLBACKTYPE_syscall,
- .address = (unsigned long)system_call,
- };
-#ifdef CONFIG_X86_LOCAL_APIC
- static struct callback_register __initdata nmi_cb = {
- .type = CALLBACKTYPE_nmi,
- .address = (unsigned long)nmi,
- };
-#endif
-
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
- if (ret == 0)
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
- if (ret == 0)
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
-#if CONFIG_XEN_COMPAT <= 0x030002
- if (ret == -ENOSYS)
- ret = HYPERVISOR_set_callbacks(
- event.address,
- failsafe.address,
- syscall.address);
-#endif
- BUG_ON(ret);
-
-#ifdef CONFIG_X86_LOCAL_APIC
- ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
-#if CONFIG_XEN_COMPAT <= 0x030002
- if (ret == -ENOSYS) {
- static struct xennmi_callback __initdata cb = {
- .handler_address = (unsigned long)nmi
- };
-
- HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
- }
-#endif
-#endif
-}
--- head-2011-03-11.orig/arch/x86/include/mach-xen/setup_arch_pre.h 2007-06-12 13:14:13.000000000 +0200
+++ /dev/null 1970-01-01 00:00:00.000000000 +0000
@@ -1,5 +0,0 @@
-/* Hook to call BIOS initialisation function */
-
-#define ARCH_SETUP machine_specific_arch_setup();
-
-static void __init machine_specific_arch_setup(void);
--- head-2011-03-11.orig/arch/x86/include/asm/traps.h 2011-03-15 16:45:55.000000000 +0100
+++ head-2011-03-11/arch/x86/include/asm/traps.h 2011-02-01 14:38:38.000000000 +0100
@@ -38,6 +38,9 @@ asmlinkage void alignment_check(void);
asmlinkage void machine_check(void);
#endif /* CONFIG_X86_MCE */
asmlinkage void simd_coprocessor_error(void);
+#ifdef CONFIG_X86_XEN
+asmlinkage void fixup_4gb_segment(void);
+#endif
dotraplinkage void do_divide_error(struct pt_regs *, long);
dotraplinkage void do_debug(struct pt_regs *, long);
@@ -66,6 +69,9 @@ dotraplinkage void do_machine_check(stru
dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *, long);
+#ifdef CONFIG_XEN
+void do_fixup_4gb_segment(struct pt_regs *, long);
+#endif
#endif
static inline int get_si_code(unsigned long condition)
--- head-2011-03-11.orig/include/linux/page-flags.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/include/linux/page-flags.h 2011-02-01 14:38:38.000000000 +0100
@@ -125,12 +125,12 @@ enum pageflags {
PG_fscache = PG_private_2, /* page backed by cache */
/* XEN */
-#ifdef CONFIG_XEN
+#if defined(CONFIG_XEN)
PG_pinned = PG_locked, /* Cannot alias with PG_owner_priv_1 since
* bad_page() checks should include this bit.
* Should not use PG_arch_1 as that may have
* a different purpose elsewhere. */
-#else
+#elif defined(CONFIG_PARAVIRT_XEN)
PG_pinned = PG_owner_priv_1,
PG_savepinned = PG_dirty,
#endif
@@ -225,8 +225,12 @@ PAGEFLAG(Active, active) __CLEARPAGEFLAG
TESTCLEARFLAG(Active, active)
__PAGEFLAG(Slab, slab)
PAGEFLAG(Checked, checked) /* Used by some filesystems */
+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
+#endif
+#ifdef CONFIG_PARAVIRT_XEN
PAGEFLAG(SavePinned, savepinned); /* Xen */
+#endif
PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
--- head-2011-03-11.orig/include/xen/interface/memory.h 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/include/xen/interface/memory.h 2011-02-01 14:38:38.000000000 +0100
@@ -88,6 +88,7 @@ struct xen_memory_reservation {
*/
domid_t domid;
};
+DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);
typedef struct xen_memory_reservation xen_memory_reservation_t;
DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
@@ -173,11 +174,7 @@ struct xen_machphys_mfn_list {
* any large discontiguities in the machine address space, 2MB gaps in
* the machphys table will be represented by an MFN base of zero.
*/
-#ifndef CONFIG_PARAVIRT_XEN
XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
-#else
- ulong extent_start;
-#endif
/*
* Number of extents written to the above array. This will be smaller
@@ -185,6 +182,7 @@ struct xen_machphys_mfn_list {
*/
unsigned int nr_extents;
};
+DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
@@ -226,6 +224,7 @@ struct xen_add_to_physmap {
/* GPFN where the source mapping page should appear. */
xen_pfn_t gpfn;
};
+DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
typedef struct xen_add_to_physmap xen_add_to_physmap_t;
DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
--- head-2011-03-11.orig/include/xen/public/Kbuild 2011-01-31 14:31:28.000000000 +0100
+++ head-2011-03-11/include/xen/public/Kbuild 2011-02-01 14:38:38.000000000 +0100
@@ -1 +1,5 @@
+header-y += evtchn.h
+header-y += gntdev.h
header-y += iomulti.h
+header-y += privcmd.h
+header-y += xenbus.h
--- head-2011-03-11.orig/include/xen/public/privcmd.h 2010-01-18 15:23:12.000000000 +0100
+++ head-2011-03-11/include/xen/public/privcmd.h 2011-02-01 14:38:38.000000000 +0100
@@ -35,10 +35,6 @@
#include <linux/types.h>
-#ifndef __user
-#define __user
-#endif
-
typedef struct privcmd_hypercall
{
__u64 op;
--- head-2011-03-11.orig/include/xen/public/xenbus.h 2009-05-29 10:25:53.000000000 +0200
+++ head-2011-03-11/include/xen/public/xenbus.h 2011-02-01 14:38:38.000000000 +0100
@@ -35,10 +35,6 @@
#include <linux/types.h>
-#ifndef __user
-#define __user
-#endif
-
typedef struct xenbus_alloc {
domid_t dom;
__u32 port;
--- head-2011-03-11.orig/kernel/kexec.c 2011-01-31 17:56:27.000000000 +0100
+++ head-2011-03-11/kernel/kexec.c 2011-02-01 14:38:38.000000000 +0100
@@ -49,7 +49,7 @@ note_buf_t __percpu *crash_notes;
static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
u32
#if defined(CONFIG_XEN) && defined(CONFIG_X86)
-__attribute__((__section__(".bss.page_aligned"), __aligned__(PAGE_SIZE)))
+__page_aligned_bss
#endif
vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
size_t vmcoreinfo_size;
--- head-2011-03-11.orig/lib/swiotlb-xen.c 2011-01-31 18:07:35.000000000 +0100
+++ head-2011-03-11/lib/swiotlb-xen.c 2011-02-01 14:38:38.000000000 +0100
@@ -788,7 +788,7 @@ swiotlb_sync_sg_for_device(struct device
}
int
-swiotlb_dma_mapping_error(dma_addr_t dma_addr)
+swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
{
return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
}
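The added struct device argument matches the 2.6.27 DMA API, where dma_mapping_error() is dispatched through the device's DMA ops and therefore needs to know which device performed the mapping. A caller-side sketch (illustrative only; dev, buf and len are assumed to come from the driver):

	dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, handle)) {
		/* e.g. the swiotlb overflow buffer came back; give up */
		return -ENOMEM;
	}
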
--- head-2011-03-11.orig/mm/mprotect.c 2011-01-31 17:29:16.000000000 +0100
+++ head-2011-03-11/mm/mprotect.c 2011-02-01 14:38:38.000000000 +0100
@@ -97,8 +97,6 @@ static inline void change_pmd_range(stru
}
if (pmd_none_or_clear_bad(pmd))
continue;
- if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
- continue;
change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
dirty_accountable);
} while (pmd++, addr = next, addr != end);