From: www.kernel.org Subject: Linux 2.6.19 Patch-mainline: 2.6.19 Automatically created from "patches.kernel.org/patch-2.6.19" by xen-port-patches.py Acked-by: jbeulich@novell.com --- head-2011-03-11.orig/arch/x86/Kconfig 2011-01-31 17:02:29.000000000 +0100 +++ head-2011-03-11/arch/x86/Kconfig 2011-01-31 17:29:16.000000000 +0100 @@ -537,6 +537,7 @@ config SCHED_OMIT_FRAME_POINTER menuconfig PARAVIRT_GUEST bool "Paravirtualized guest support" + depends on !XEN ---help--- Say Y here to get to see options related to running Linux under various hypervisors. This option alone does not add any kernel code. --- head-2011-03-11.orig/arch/x86/kernel/acpi/boot.c 2011-03-11 10:41:54.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/acpi/boot.c 2011-03-11 10:54:41.000000000 +0100 @@ -70,8 +70,12 @@ int acpi_strict; u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; +#ifndef CONFIG_XEN int acpi_skip_timer_override __initdata; int acpi_use_timer_override __initdata; +#else +#define acpi_skip_timer_override 0 +#endif int acpi_fix_pin2_polarity __initdata; #ifdef CONFIG_X86_LOCAL_APIC @@ -587,6 +591,7 @@ void __init acpi_set_irq_model_ioapic(vo #ifdef CONFIG_ACPI_HOTPLUG_CPU #include +#ifndef CONFIG_XEN static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA @@ -678,6 +683,9 @@ free_tmp_map: out: return retval; } +#else +#define _acpi_map_lsapic(h, p) (-EINVAL) +#endif /* wrapper to silence section mismatch warning */ int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu) @@ -688,9 +696,11 @@ EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { +#ifndef CONFIG_XEN per_cpu(x86_cpu_to_apicid, cpu) = -1; set_cpu_present(cpu, false); num_processors--; +#endif return (0); } @@ -1688,7 +1698,7 @@ int __init acpi_mps_check(void) return 0; } -#ifdef CONFIG_X86_IO_APIC +#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN) static int __init parse_acpi_skip_timer_override(char *arg) { acpi_skip_timer_override = 1; --- head-2011-03-11.orig/arch/x86/kernel/apic/apic-xen.c 2007-06-12 13:12:48.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/apic/apic-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -54,7 +54,6 @@ static cpumask_t timer_bcast_ipi; /* * Knob to control our willingness to enable the local APIC. */ -int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ /* * Debug level @@ -102,7 +101,7 @@ int get_physical_broadcast(void) #ifndef CONFIG_XEN #ifndef CONFIG_SMP -static void up_apic_timer_interrupt_call(struct pt_regs *regs) +static void up_apic_timer_interrupt_call(void) { int cpu = smp_processor_id(); @@ -111,11 +110,11 @@ static void up_apic_timer_interrupt_call */ per_cpu(irq_stat, cpu).apic_timer_irqs++; - smp_local_timer_interrupt(regs); + smp_local_timer_interrupt(); } #endif -void smp_send_timer_broadcast_ipi(struct pt_regs *regs) +void smp_send_timer_broadcast_ipi(void) { cpumask_t mask; @@ -128,7 +127,7 @@ void smp_send_timer_broadcast_ipi(struct * We can directly call the apic timer interrupt handler * in UP case. 
Minus all irq related functions */ - up_apic_timer_interrupt_call(regs); + up_apic_timer_interrupt_call(); #endif } } --- head-2011-03-11.orig/arch/x86/kernel/cpu/common-xen.c 2009-05-19 09:16:41.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/cpu/common-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -43,7 +43,7 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM extern int disable_pse; -static void default_init(struct cpuinfo_x86 * c) +static void __cpuinit default_init(struct cpuinfo_x86 * c) { /* Not much we can do here... */ /* Check if at least it has cpuid */ @@ -56,7 +56,7 @@ static void default_init(struct cpuinfo_ } } -static struct cpu_dev default_cpu = { +static struct cpu_dev __cpuinitdata default_cpu = { .c_init = default_init, .c_vendor = "Unknown", }; @@ -191,7 +191,16 @@ static void __cpuinit get_cpu_vendor(str static int __init x86_fxsr_setup(char * s) { + /* Tell all the other CPU's to not use it... */ disable_x86_fxsr = 1; + + /* + * ... and clear the bits early in the boot_cpu_data + * so that the bootup process doesn't try to do this + * either. + */ + clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability); + clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability); return 1; } __setup("nofxsr", x86_fxsr_setup); @@ -272,7 +281,7 @@ static void __init early_cpu_detect(void } } -void __cpuinit generic_identify(struct cpuinfo_x86 * c) +static void __cpuinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; int ebx; @@ -700,8 +709,7 @@ old_gdt: */ atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; - if (current->mm) - BUG(); + BUG_ON(current->mm); enter_lazy_tlb(&init_mm, current); load_esp0(t, thread); @@ -714,7 +722,7 @@ old_gdt: #endif /* Clear %fs and %gs. */ - asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); + asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0)); /* Clear all 6 debug registers: */ set_debugreg(0, 0); --- head-2011-03-11.orig/arch/x86/kernel/cpu/mcheck/Makefile 2011-01-31 14:53:50.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/cpu/mcheck/Makefile 2011-01-31 17:29:16.000000000 +0100 @@ -11,3 +11,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inje obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o obj-$(CONFIG_ACPI_APEI) += mce-apei.o + +disabled-obj-$(CONFIG_XEN) := therm_throt.o --- head-2011-03-11.orig/arch/x86/kernel/cpu/mcheck/mce_dom0.c 2009-10-01 11:00:47.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/cpu/mcheck/mce_dom0.c 2011-01-31 17:29:16.000000000 +0100 @@ -53,8 +53,7 @@ static struct mc_info *g_mi; /*dom0 mce virq handler, logging physical mce error info*/ -static irqreturn_t mce_dom0_interrupt(int irq, void *dev_id, - struct pt_regs *regs) +static irqreturn_t mce_dom0_interrupt(int irq, void *dev_id) { xen_mc_t mc_op; int result = 0; @@ -129,6 +128,6 @@ void bind_virq_for_mce(void) printk(KERN_ERR "MCE_DOM0_LOG: bind_virq for DOM0 failed\n"); /* Log the machine checks left over from the previous reset. 
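The mce_dom0_interrupt() hunk above reflects the 2.6.19 genirq change this patch has to follow: interrupt handlers lose their struct pt_regs * argument, and the direct call that follows drops the trailing NULL accordingly. A minimal sketch of the new convention (the handler and device names are hypothetical, not part of this patch); a handler that still needs the interrupted register frame fetches it with get_irq_regs():

#include <linux/interrupt.h>
#include <asm/irq_regs.h>

static irqreturn_t demo_interrupt(int irq, void *dev_id)
{
	/* Only if the frame is really needed; may be NULL when the
	 * handler is invoked outside a real interrupt, as below. */
	struct pt_regs *regs = get_irq_regs();

	(void)regs;
	return IRQ_HANDLED;
}

Registration is unchanged: request_irq(irq, demo_interrupt, IRQF_SHARED, "demo", dev).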
*/ - mce_dom0_interrupt(VIRQ_MCA, NULL, NULL); + mce_dom0_interrupt(VIRQ_MCA, NULL); } --- head-2011-03-11.orig/arch/x86/kernel/entry_32-xen.S 2009-05-19 09:16:41.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/entry_32-xen.S 2011-01-31 17:29:16.000000000 +0100 @@ -80,8 +80,12 @@ VM_MASK = 0x00020000 NMI_MASK = 0x80000000 #ifndef CONFIG_XEN -#define DISABLE_INTERRUPTS cli -#define ENABLE_INTERRUPTS sti +/* These are replacements for paravirtualization */ +#define DISABLE_INTERRUPTS cli +#define ENABLE_INTERRUPTS sti +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit +#define INTERRUPT_RETURN iret +#define GET_CR0_INTO_EAX movl %cr0, %eax #else /* Offsets into shared_info_t. */ #define evtchn_upcall_pending /* 0 */ @@ -99,15 +103,29 @@ NMI_MASK = 0x80000000 #define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi) #define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi) +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi) #define DISABLE_INTERRUPTS GET_VCPU_INFO ; \ __DISABLE_INTERRUPTS #define ENABLE_INTERRUPTS GET_VCPU_INFO ; \ __ENABLE_INTERRUPTS -#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi) +#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \ +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \ + __TEST_PENDING ; \ + jnz 14f # process more events if necessary... ; \ + movl ESI(%esp), %esi ; \ + sysexit ; \ +14: __DISABLE_INTERRUPTS ; \ + TRACE_IRQS_OFF ; \ +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \ + push %esp ; \ + call evtchn_do_upcall ; \ + add $4,%esp ; \ + jmp ret_from_intr +#define INTERRUPT_RETURN iret #endif #ifdef CONFIG_PREEMPT -#define preempt_stop cli; TRACE_IRQS_OFF +#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF #else #define preempt_stop #define resume_kernel restore_nocheck @@ -206,18 +224,21 @@ NMI_MASK = 0x80000000 #define RING0_INT_FRAME \ CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ CFI_DEF_CFA esp, 3*4;\ /*CFI_OFFSET cs, -2*4;*/\ CFI_OFFSET eip, -3*4 #define RING0_EC_FRAME \ CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ CFI_DEF_CFA esp, 4*4;\ /*CFI_OFFSET cs, -2*4;*/\ CFI_OFFSET eip, -3*4 #define RING0_PTREGS_FRAME \ CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ CFI_DEF_CFA esp, OLDESP-EBX;\ /*CFI_OFFSET cs, CS-OLDESP;*/\ CFI_OFFSET eip, EIP-OLDESP;\ @@ -263,8 +284,9 @@ ret_from_intr: check_userspace: movl EFLAGS(%esp), %eax # mix EFLAGS and CS movb CS(%esp), %al - testl $(VM_MASK | 2), %eax - jz resume_kernel + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending @@ -277,7 +299,7 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - cli + DISABLE_INTERRUPTS cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
jnz restore_nocheck need_resched: @@ -297,6 +319,7 @@ need_resched: # sysenter call handler stub ENTRY(sysenter_entry) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA esp, 0 CFI_REGISTER esp, ebp movl SYSENTER_stack_esp0(%esp),%esp @@ -305,7 +328,7 @@ sysenter_past_esp: * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: */ - sti + ENABLE_INTERRUPTS pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ss, 0*/ @@ -359,26 +382,8 @@ sysenter_past_esp: movl EIP(%esp), %edx movl OLDESP(%esp), %ecx xorl %ebp,%ebp -#ifdef CONFIG_XEN TRACE_IRQS_ON - __ENABLE_INTERRUPTS -sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ - __TEST_PENDING - jnz 14f # process more events if necessary... - movl ESI(%esp), %esi - sysexit -14: __DISABLE_INTERRUPTS - TRACE_IRQS_OFF -sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ - push %esp - call evtchn_do_upcall - add $4,%esp - jmp ret_from_intr -#else - TRACE_IRQS_ON - sti - sysexit -#endif /* !CONFIG_XEN */ + ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC # pv sysenter call handler stub @@ -444,8 +449,8 @@ restore_all: # See comments in process.c:copy_thread() for details. movb OLDSS(%esp), %ah movb CS(%esp), %al - andl $(VM_MASK | (4 << 8) | 3), %eax - cmpl $((4 << 8) | 3), %eax + andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS restore_nocheck: @@ -467,12 +472,11 @@ restore_nocheck_notrace: RESTORE_REGS addl $4, %esp CFI_ADJUST_CFA_OFFSET -4 -1: iret +1: INTERRUPT_RETURN .section .fixup,"ax" iret_exc: #ifndef CONFIG_XEN - TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS #endif pushl $0 # no error code pushl $do_iret_error @@ -498,7 +502,7 @@ ldt_ss: * dosemu and wine happy. */ subl $8, %esp # reserve space for switch16 pointer CFI_ADJUST_CFA_OFFSET 8 - cli + DISABLE_INTERRUPTS TRACE_IRQS_OFF movl %esp, %eax /* Set up the 16bit stack frame with switch32 pointer on top, @@ -508,7 +512,7 @@ ldt_ss: TRACE_IRQS_IRET RESTORE_REGS lss 20+4(%esp), %esp # switch to 16bit stack -1: iret +1: INTERRUPT_RETURN .section __ex_table,"a" .align 4 .long 1b,iret_exc @@ -524,7 +528,7 @@ scrit: /**** START OF CRITICAL REGION ** RESTORE_REGS addl $4, %esp CFI_ADJUST_CFA_OFFSET -4 -1: iret +1: INTERRUPT_RETURN .section __ex_table,"a" .align 4 .long 1b,iret_exc @@ -713,11 +717,9 @@ ENTRY(name) \ #define UNWIND_ESPFIX_STACK #endif -ENTRY(divide_error) - RING0_INT_FRAME - pushl $0 # no error code - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_divide_error +KPROBE_ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: @@ -767,6 +769,7 @@ error_code: call *%edi jmp ret_from_exception CFI_ENDPROC +KPROBE_END(page_fault) #ifdef CONFIG_XEN # A note on the "critical region" in our callback handler. 
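The Xen DISABLE_INTERRUPTS/ENABLE_INTERRUPTS macros used throughout this file never touch EFLAGS.IF: a paravirtualized guest masks event delivery by toggling a per-VCPU byte in the shared info page. Roughly, in C (an illustrative sketch of what the assembly does, assuming this tree's <asm/hypervisor.h>; force_evtchn_callback() is the existing helper that forces delivery of a pending upcall):

static inline void xen_sti_sketch(void)
{
	vcpu_info_t *vcpu =
		&HYPERVISOR_shared_info->vcpu_info[smp_processor_id()];

	vcpu->evtchn_upcall_mask = 0;		/* __ENABLE_INTERRUPTS */
	barrier();				/* unmask must be visible first */
	if (vcpu->evtchn_upcall_pending)	/* __TEST_PENDING */
		force_evtchn_callback();	/* the "jnz 14f" slow path */
}

The sysexit_scrit/sysexit_ecrit markers exist because an event can arrive between clearing the mask and executing sysexit; the upcall path checks whether the interrupted EIP lies inside that critical region and fixes the stack up before continuing, as the note below describes for the iret case.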
@@ -790,9 +793,11 @@ ENTRY(hypervisor_callback) pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL - testb $2,CS(%esp) + movl CS(%esp),%ecx movl EIP(%esp),%eax - jnz .Ldo_upcall + andl $SEGMENT_RPL_MASK,%ecx + cmpl $USER_RPL,%ecx + jae .Ldo_upcall cmpl $scrit,%eax jb 0f cmpl $ecrit,%eax @@ -928,7 +933,7 @@ ENTRY(device_not_available) CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL #ifndef CONFIG_XEN - movl %cr0, %eax + GET_CR0_INTO_EAX testl $0x4, %eax # EM (math emulation bit) je device_available_emulate pushl $0 # temporary storage for ORIG_EIP @@ -963,9 +968,15 @@ device_available_emulate: jne ok; \ label: \ movl SYSENTER_stack_esp0+offset(%esp),%esp; \ + CFI_DEF_CFA esp, 0; \ + CFI_UNDEFINED eip; \ pushfl; \ + CFI_ADJUST_CFA_OFFSET 4; \ pushl $__KERNEL_CS; \ - pushl $sysenter_past_esp + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $sysenter_past_esp; \ + CFI_ADJUST_CFA_OFFSET 4; \ + CFI_REL_OFFSET eip, 0 #endif /* CONFIG_XEN */ KPROBE_ENTRY(debug) @@ -984,7 +995,8 @@ debug_stack_correct: call do_debug jmp ret_from_exception CFI_ENDPROC - .previous .text +KPROBE_END(debug) + #ifndef CONFIG_XEN /* * NMI is doubly nasty. It can happen _while_ we're handling @@ -994,7 +1006,7 @@ debug_stack_correct: * check whether we got an NMI on the debug path where the debug * fault happened on the sysenter path. */ -ENTRY(nmi) +KPROBE_ENTRY(nmi) RING0_INT_FRAME pushl %eax CFI_ADJUST_CFA_OFFSET 4 @@ -1019,6 +1031,7 @@ ENTRY(nmi) cmpl $sysenter_entry,12(%esp) je nmi_debug_stack_check nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL @@ -1029,9 +1042,12 @@ nmi_stack_correct: CFI_ENDPROC nmi_stack_fixup: + RING0_INT_FRAME FIX_STACK(12,nmi_stack_correct, 1) jmp nmi_stack_correct + nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ cmpw $__KERNEL_CS,16(%esp) jne nmi_stack_correct cmpl $debug,(%esp) @@ -1042,8 +1058,10 @@ nmi_debug_stack_check: jmp nmi_stack_correct nmi_16bit_stack: - RING0_INT_FRAME - /* create the pointer to lss back */ + /* We have a RING0_INT_FRAME here. 
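The rewritten CS tests above (testb $2 becoming an RPL compare in hypervisor_callback, and the same pattern in check_userspace earlier) encode a step worth spelling out: under Xen the kernel itself does not run in ring 0, so "did we come from user space?" is answered by comparing the whole requested-privilege-level field rather than testing a single CS bit. In C terms, a sketch of the predicate only (the real code does this on the saved CS in assembly), assuming the usual USER_RPL == 3:

/* old style: test one bit of CS and assume the kernel runs at RPL 0 */
/* new style: classify by the full RPL field */
if ((regs->xcs & SEGMENT_RPL_MASK) >= USER_RPL) {
	/* returning to v8086 mode or user space */
} else {
	/* still in the kernel (ring 0 native, ring 1 under Xen) */
}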
+ * + * create the pointer to lss back + */ pushl %ss CFI_ADJUST_CFA_OFFSET 4 pushl %esp @@ -1064,14 +1082,14 @@ nmi_16bit_stack: call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to 16bit stack -1: iret +1: INTERRUPT_RETURN CFI_ENDPROC .section __ex_table,"a" .align 4 .long 1b,iret_exc .previous #else -ENTRY(nmi) +KPROBE_ENTRY(nmi) RING0_INT_FRAME pushl %eax CFI_ADJUST_CFA_OFFSET 4 @@ -1083,6 +1101,7 @@ ENTRY(nmi) jmp restore_all CFI_ENDPROC #endif +KPROBE_END(nmi) KPROBE_ENTRY(int3) RING0_INT_FRAME @@ -1094,7 +1113,7 @@ KPROBE_ENTRY(int3) call do_int3 jmp ret_from_exception CFI_ENDPROC - .previous .text +KPROBE_END(int3) ENTRY(overflow) RING0_INT_FRAME @@ -1159,7 +1178,7 @@ KPROBE_ENTRY(general_protection) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC - .previous .text +KPROBE_END(general_protection) ENTRY(alignment_check) RING0_EC_FRAME @@ -1168,13 +1187,14 @@ ENTRY(alignment_check) jmp error_code CFI_ENDPROC -KPROBE_ENTRY(page_fault) - RING0_EC_FRAME - pushl $do_page_fault +ENTRY(divide_error) + RING0_INT_FRAME + pushl $0 # no error code + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_divide_error CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC - .previous .text #ifdef CONFIG_X86_MCE ENTRY(machine_check) @@ -1236,6 +1256,19 @@ ENTRY(fixup_4gb_segment) jmp error_code CFI_ENDPROC +ENTRY(kernel_thread_helper) + pushl $0 # fake return address for unwinder + CFI_STARTPROC + movl %edx,%eax + push %edx + CFI_ADJUST_CFA_OFFSET 4 + call *%ebx + push %eax + CFI_ADJUST_CFA_OFFSET 4 + call do_exit + CFI_ENDPROC +ENDPROC(kernel_thread_helper) + .section .rodata,"a" #include "syscall_table.S" --- head-2011-03-11.orig/arch/x86/kernel/head_32-xen.S 2007-06-12 13:12:48.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/head_32-xen.S 2011-01-31 17:29:16.000000000 +0100 @@ -62,7 +62,7 @@ ENTRY(startup_32) movl %eax,%gs cld # gcc2 wants the direction flag cleared at all times - pushl %eax # fake return address + pushl $0 # fake return address for unwinder jmp start_kernel #define HYPERCALL_PAGE_OFFSET 0x1000 --- head-2011-03-11.orig/arch/x86/kernel/io_apic_32-xen.c 2009-03-18 10:39:31.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/io_apic_32-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -31,6 +31,9 @@ #include #include #include +#include +#include +#include #include #include @@ -38,13 +41,15 @@ #include #include #include +#include +#include #include +#include #include "io_ports.h" #ifdef CONFIG_XEN - #include #include #include @@ -56,32 +61,7 @@ unsigned long io_apic_irqs; -static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) -{ - struct physdev_apic apic_op; - int ret; - - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; - apic_op.reg = reg; - ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); - if (ret) - return ret; - return apic_op.value; -} - -static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct physdev_apic apic_op; - - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; - apic_op.reg = reg; - apic_op.value = value; - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); -} - -#define io_apic_read(a,r) xen_io_apic_read(a,r) -#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) - +#define clear_IO_APIC() ((void)0) #endif /* CONFIG_XEN */ int (*ioapic_renumber_irq)(int ioapic, int irq); @@ -108,7 +88,7 @@ int sis_apic_bug = -1; */ int nr_ioapic_registers[MAX_IO_APICS]; -int disable_timer_pin_1 __initdata; +static int disable_timer_pin_1 __initdata; /* * Rough estimation of how many 
shared IRQs there are, can @@ -128,12 +108,124 @@ static struct irq_pin_list { int apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; -#ifdef CONFIG_PCI_MSI -#define vector_to_irq(vector) \ - (platform_legacy_irq(vector) ? vector : vector_irq[vector]) +#ifndef CONFIG_XEN +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; +}; + +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); +} +#endif + +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ +#ifndef CONFIG_XEN + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +#else + struct physdev_apic apic_op; + int ret; + + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; + apic_op.reg = reg; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); + if (ret) + return ret; + return apic_op.value; +#endif +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ +#ifndef CONFIG_XEN + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +#else + struct physdev_apic apic_op; + + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; + apic_op.reg = reg; + apic_op.value = value; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); +#endif +} + +#ifndef CONFIG_XEN +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. + * + * Older SiS APIC requires we rewrite the index register + */ +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + volatile struct io_apic *io_apic = io_apic_base(apic); + if (sis_apic_bug) + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} #else -#define vector_to_irq(vector) (vector) +#define io_apic_modify io_apic_write +#endif + +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +#ifndef CONFIG_XEN +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} +#endif + +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. + */ +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + union entry_union eu; + eu.entry = e; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#ifndef CONFIG_XEN +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! 
+ */ +static void ioapic_mask_entry(int apic, int pin) +{ + unsigned long flags; + union entry_union eu = { .entry.mask = 1 }; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} #endif /* @@ -159,9 +251,7 @@ static void add_pin_to_irq(unsigned int entry->pin = pin; } -#ifdef CONFIG_XEN -#define clear_IO_APIC() ((void)0) -#else +#ifndef CONFIG_XEN /* * Reroute an IRQ to a different pin. */ @@ -246,25 +336,16 @@ static void unmask_IO_APIC_irq (unsigned static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; - unsigned long flags; /* Check delivery_mode to be sure we're not clearing an SMI pin */ - spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) return; /* * Disable it in the IO-APIC irq-routing table: */ - memset(&entry, 0, sizeof(entry)); - entry.mask = 1; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_mask_entry(apic, pin); } static void clear_IO_APIC (void) @@ -304,7 +385,7 @@ static void set_ioapic_affinity_irq(unsi break; entry = irq_2_pin + entry->next; } - set_irq_info(irq, cpumask); + set_native_irq_info(irq, cpumask); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1212,43 +1293,43 @@ static inline int IO_APIC_irq_trigger(in /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */ -int assign_irq_vector(int irq) +static int __assign_irq_vector(int irq) { - unsigned long flags; int vector; struct physdev_irq irq_op; - BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS) return -EINVAL; - spin_lock_irqsave(&vector_lock, flags); - - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { - spin_unlock_irqrestore(&vector_lock, flags); - return IO_APIC_VECTOR(irq); - } + if (irq_vector[irq] > 0) + return irq_vector[irq]; irq_op.irq = irq; - if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { - spin_unlock_irqrestore(&vector_lock, flags); + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) return -ENOSPC; - } vector = irq_op.vector; - vector_irq[vector] = irq; - if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = vector; + irq_vector[irq] = vector; + + return vector; +} + +static int assign_irq_vector(int irq) +{ + unsigned long flags; + int vector; + spin_lock_irqsave(&vector_lock, flags); + vector = __assign_irq_vector(irq); spin_unlock_irqrestore(&vector_lock, flags); return vector; } #ifndef CONFIG_XEN -static struct hw_interrupt_type ioapic_level_type; -static struct hw_interrupt_type ioapic_edge_type; +static struct irq_chip ioapic_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -1256,16 +1337,16 @@ static struct hw_interrupt_type ioapic_e static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { - unsigned idx; - - idx = use_pci_vector() && !platform_legacy_irq(irq) ? 
vector : irq; - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) - irq_desc[idx].chip = &ioapic_level_type; - else - irq_desc[idx].chip = &ioapic_edge_type; - set_intr_gate(vector, interrupt[idx]); + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_fasteoi_irq, "fasteoi"); + else { + irq_desc[irq].status |= IRQ_DELAYED_DISABLE; + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_edge_irq, "edge"); + } + set_intr_gate(vector, interrupt[irq]); } #else #define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq) @@ -1336,9 +1417,8 @@ static void __init setup_IO_APIC_irqs(vo if (!apic && (irq < 16)) disable_8259A_irq(irq); } + ioapic_write_entry(apic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1355,7 +1435,6 @@ static void __init setup_IO_APIC_irqs(vo static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) { struct IO_APIC_route_entry entry; - unsigned long flags; memset(&entry,0,sizeof(entry)); @@ -1380,15 +1459,13 @@ static void __init setup_ExtINT_IRQ0_pin * The timer IRQ doesn't have to know that behind the * scene we have a 8259A-master in AEOI mode ... */ - irq_desc[0].chip = &ioapic_edge_type; + irq_desc[0].chip = &ioapic_chip; + set_irq_handler(0, handle_edge_irq); /* * Add it to the IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry); enable_8259A_irq(0); } @@ -1498,10 +1575,7 @@ void __init print_IO_APIC(void) for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, i); printk(KERN_DEBUG " %02x %03X %02X ", i, @@ -1521,17 +1595,12 @@ void __init print_IO_APIC(void) ); } } - if (use_pci_vector()) - printk(KERN_INFO "Using vector-based indexing\n"); printk(KERN_DEBUG "IRQ to pin mappings:\n"); for (i = 0; i < NR_IRQS; i++) { struct irq_pin_list *entry = irq_2_pin + i; if (entry->pin < 0) continue; - if (use_pci_vector() && !platform_legacy_irq(i)) - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); - else - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", i); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -1720,10 +1789,7 @@ static void __init enable_IO_APIC(void) /* See if any of the pins is in ExtINT mode */ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, pin); /* If the interrupt line is enabled and in ExtInt mode @@ -1782,7 +1848,6 @@ void disable_IO_APIC(void) */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; - unsigned long flags; memset(&entry, 0, sizeof(entry)); entry.mask = 0; /* Enabled */ @@ -1799,12 +1864,7 @@ void disable_IO_APIC(void) /* * Add it to the
IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, - *(((int *)&entry)+1)); - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, - *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } disconnect_bsp_APIC(ioapic_i8259.pin != -1); #endif @@ -1971,6 +2031,8 @@ static int __init timer_irq_works(void) */ /* + * Startup quirk: + * * Starting up a edge-triggered IO-APIC interrupt is * nasty - we need to make sure that we get the edge. * If it is already asserted for some reason, we need @@ -1978,8 +2040,10 @@ static int __init timer_irq_works(void) * * This is not complete - we should be able to fake * an edge even if it isn't on the 8259A... + * + * (We do this for level-triggered IRQs too - it cannot hurt.) */ -static unsigned int startup_edge_ioapic_irq(unsigned int irq) +static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; unsigned long flags; @@ -1996,47 +2060,18 @@ static unsigned int startup_edge_ioapic_ return was_pending; } -/* - * Once we have recorded IRQ_PENDING already, we can mask the - * interrupt for real. This prevents IRQ storms from unhandled - * devices. - */ -static void ack_edge_ioapic_irq(unsigned int irq) -{ - move_irq(irq); - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) - == (IRQ_PENDING | IRQ_DISABLED)) - mask_IO_APIC_irq(irq); - ack_APIC_irq(); -} - -/* - * Level triggered interrupts can just be masked, - * and shutting down and starting up the interrupt - * is the same as enabling and disabling them -- except - * with a startup need to return a "was pending" value. - * - * Level triggered interrupts are special because we - * do not touch any IO-APIC register while handling - * them. We ack the APIC in the end-IRQ handler, not - * in the start-IRQ-handler. Protection against reentrance - * from the same interrupt is still provided, both by the - * generic IRQ layer and by the fact that an unacked local - * APIC does not accept IRQs. - */ -static unsigned int startup_level_ioapic_irq (unsigned int irq) +static void ack_ioapic_irq(unsigned int irq) { - unmask_IO_APIC_irq(irq); - - return 0; /* don't check for pending */ + move_native_irq(irq); + ack_APIC_irq(); } -static void end_level_ioapic_irq (unsigned int irq) +static void ack_ioapic_quirk_irq(unsigned int irq) { unsigned long v; int i; - move_irq(irq); + move_native_irq(irq); /* * It appears there is an erratum which affects at least version 0x11 * of I/O APIC (that's the 82093AA and cores integrated into various @@ -2056,7 +2091,7 @@ static void end_level_ioapic_irq (unsign * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. 
--macro */ - i = IO_APIC_VECTOR(irq); + i = irq_vector[irq]; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); @@ -2071,104 +2106,24 @@ static void end_level_ioapic_irq (unsign } } -#ifdef CONFIG_PCI_MSI -static unsigned int startup_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_edge_ioapic_irq(irq); -} - -static void ack_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - ack_edge_ioapic_irq(irq); -} - -static unsigned int startup_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_level_ioapic_irq (irq); -} - -static void end_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - end_level_ioapic_irq(irq); -} - -static void mask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - mask_IO_APIC_irq(irq); -} - -static void unmask_IO_APIC_vector (unsigned int vector) +static int ioapic_retrigger_irq(unsigned int irq) { - int irq = vector_to_irq(vector); - - unmask_IO_APIC_irq(irq); -} - -#ifdef CONFIG_SMP -static void set_ioapic_affinity_vector (unsigned int vector, - cpumask_t cpu_mask) -{ - int irq = vector_to_irq(vector); - - set_native_irq_info(vector, cpu_mask); - set_ioapic_affinity_irq(irq, cpu_mask); -} -#endif -#endif - -static int ioapic_retrigger(unsigned int irq) -{ - send_IPI_self(IO_APIC_VECTOR(irq)); + send_IPI_self(irq_vector[irq]); return 1; } -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ -static struct hw_interrupt_type ioapic_edge_type __read_mostly = { - .typename = "IO-APIC-edge", - .startup = startup_edge_ioapic, - .shutdown = shutdown_edge_ioapic, - .enable = enable_edge_ioapic, - .disable = disable_edge_ioapic, - .ack = ack_edge_ioapic, - .end = end_edge_ioapic, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, -#endif - .retrigger = ioapic_retrigger, -}; - -static struct hw_interrupt_type ioapic_level_type __read_mostly = { - .typename = "IO-APIC-level", - .startup = startup_level_ioapic, - .shutdown = shutdown_level_ioapic, - .enable = enable_level_ioapic, - .disable = disable_level_ioapic, - .ack = mask_and_ack_level_ioapic, - .end = end_level_ioapic, +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_ioapic_irq, + .eoi = ack_ioapic_quirk_irq, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, + .set_affinity = set_ioapic_affinity_irq, #endif - .retrigger = ioapic_retrigger, + .retrigger = ioapic_retrigger_irq, }; #endif /* !CONFIG_XEN */ @@ -2189,12 +2144,7 @@ static inline void init_IO_APIC_traps(vo */ for (irq = 0; irq < NR_IRQS ; irq++) { int tmp = irq; - if (use_pci_vector()) { - if (!platform_legacy_irq(tmp)) - if ((tmp = vector_to_irq(tmp)) == -1) - continue; - } - if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { + if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -2205,22 +2155,23 @@ static inline void init_IO_APIC_traps(vo #ifndef CONFIG_XEN else /* Strange. Oh, well.. 
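The large deletion above is this file's share of the 2.6.19 genirq migration: the separate edge and level struct hw_interrupt_type descriptors collapse into one struct irq_chip, and the trigger-specific behaviour moves into the flow handler selected per IRQ. The registration pattern, as a self-contained sketch with hypothetical demo_* callbacks:

#include <linux/irq.h>

static void demo_mask(unsigned int irq)   { /* mask the line in hardware */ }
static void demo_unmask(unsigned int irq) { /* unmask the line */ }
static void demo_ack(unsigned int irq)    { /* acknowledge an edge */ }
static void demo_eoi(unsigned int irq)    { /* signal end-of-interrupt */ }

static struct irq_chip demo_chip = {
	.name   = "DEMO",
	.mask   = demo_mask,
	.unmask = demo_unmask,
	.ack    = demo_ack,	/* used by handle_edge_irq */
	.eoi    = demo_eoi,	/* used by handle_fasteoi_irq */
};

static void demo_register(unsigned int irq, int level_triggered)
{
	if (level_triggered)
		set_irq_chip_and_handler_name(irq, &demo_chip,
					      handle_fasteoi_irq, "fasteoi");
	else
		set_irq_chip_and_handler_name(irq, &demo_chip,
					      handle_edge_irq, "edge");
}

Edge IRQs keep an ack-then-handle flow and level IRQs an eoi-at-the-end flow, which is the split ioapic_chip expresses above with .ack = ack_ioapic_irq and .eoi = ack_ioapic_quirk_irq.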
*/ - irq_desc[irq].chip = &no_irq_type; + irq_desc[irq].chip = &no_irq_chip; #endif } } } #ifndef CONFIG_XEN -static void enable_lapic_irq (unsigned int irq) -{ - unsigned long v; +/* + * The local APIC irq-chip implementation: + */ - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +static void ack_apic(unsigned int irq) +{ + ack_APIC_irq(); } -static void disable_lapic_irq (unsigned int irq) +static void mask_lapic_irq (unsigned int irq) { unsigned long v; @@ -2228,21 +2179,19 @@ static void disable_lapic_irq (unsigned apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); } -static void ack_lapic_irq (unsigned int irq) +static void unmask_lapic_irq (unsigned int irq) { - ack_APIC_irq(); -} + unsigned long v; -static void end_lapic_irq (unsigned int i) { /* nothing */ } + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +} -static struct hw_interrupt_type lapic_irq_type __read_mostly = { - .typename = "local-APIC-edge", - .startup = NULL, /* startup_irq() not used for IRQ0 */ - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ - .enable = enable_lapic_irq, - .disable = disable_lapic_irq, - .ack = ack_lapic_irq, - .end = end_lapic_irq +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC-edge", + .mask = mask_lapic_irq, + .unmask = unmask_lapic_irq, + .eoi = ack_apic, }; static void setup_nmi (void) @@ -2275,17 +2224,13 @@ static inline void unlock_ExtINT_logic(v int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; - unsigned long flags; pin = find_isa_irq_pin(8, mp_INT); apic = find_isa_irq_apic(8, mp_INT); if (pin == -1) return; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); memset(&entry1, 0, sizeof(entry1)); @@ -2298,10 +2243,7 @@ static inline void unlock_ExtINT_logic(v entry1.trigger = 0; entry1.vector = 0; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry1); save_control = CMOS_READ(RTC_CONTROL); save_freq_select = CMOS_READ(RTC_FREQ_SELECT); @@ -2320,10 +2262,7 @@ static inline void unlock_ExtINT_logic(v CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); clear_IO_APIC_pin(apic, pin); - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry0); } int timer_uses_ioapic_pin_0; @@ -2423,7 +2362,8 @@ static inline void check_timer(void) printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); disable_8259A_irq(0); - irq_desc[0].chip = &lapic_irq_type; + set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, + "fasteoi"); apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); @@ -2537,17 +2477,12 @@ static int ioapic_suspend(struct sys_dev { struct IO_APIC_route_entry *entry; struct sysfs_ioapic_data *data; - unsigned long flags; int i; data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); - for (i
= 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); - } - spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) + entry[i] = ioapic_read_entry(dev->id, i); return 0; } @@ -2569,11 +2504,9 @@ static int ioapic_resume(struct sys_devi reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; io_apic_write(dev->id, 0, reg_00.raw); } - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); - } spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) + ioapic_write_entry(dev->id, i, entry[i]); return 0; } @@ -2619,8 +2552,240 @@ static int __init ioapic_init_sysfs(void device_initcall(ioapic_init_sysfs); +/* + * Dynamic irq allocate and deallocation + */ +int create_irq(void) +{ + /* Allocate an unused irq */ + int irq, new, vector; + unsigned long flags; + + irq = -ENOSPC; + spin_lock_irqsave(&vector_lock, flags); + for (new = (NR_IRQS - 1); new >= 0; new--) { + if (platform_legacy_irq(new)) + continue; + if (irq_vector[new] != 0) + continue; + vector = __assign_irq_vector(new); + if (likely(vector > 0)) + irq = new; + break; + } + spin_unlock_irqrestore(&vector_lock, flags); + + if (irq >= 0) { + set_intr_gate(vector, interrupt[irq]); + dynamic_irq_init(irq); + } + return irq; +} + +void destroy_irq(unsigned int irq) +{ + unsigned long flags; + + dynamic_irq_cleanup(irq); + + spin_lock_irqsave(&vector_lock, flags); + irq_vector[irq] = 0; + spin_unlock_irqrestore(&vector_lock, flags); +} + #endif /* CONFIG_XEN */ +/* + * MSI message composition + */ +#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +{ + int vector; + unsigned dest; + + vector = assign_irq_vector(irq); + if (vector >= 0) { + dest = cpu_mask_to_apicid(TARGET_CPUS); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(vector); + } + return vector; +} + +#ifdef CONFIG_SMP +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + int vector; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + tmp = TARGET_CPUS; + + vector = assign_irq_vector(irq); + if (vector < 0) + return; + + dest = cpu_mask_to_apicid(mask); + + read_msi_msg(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + write_msi_msg(irq, &msg); + set_native_irq_info(irq, mask); +} +#endif /* CONFIG_SMP */ + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure.
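create_irq()/destroy_irq() above give MSI (and, further below, HyperTransport) interrupts a way to obtain an IRQ number with no fixed line behind it: scan irq_vector[] for a free slot, assign a CPU vector, and initialise the descriptor. Intended use, sketched with hypothetical caller code:

int irq = create_irq();		/* reserves an irq and a CPU vector */
if (irq < 0)
	return irq;		/* -ENOSPC when the table is full */

/* arch_setup_msi_irq() below composes the MSI message for this irq
 * and attaches msi_chip to it; on teardown the caller undoes it: */
destroy_irq(irq);		/* frees the vector, resets the descriptor */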
+ */ +static struct irq_chip msi_chip = { + .name = "PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_ioapic_irq, +#ifdef CONFIG_SMP + .set_affinity = set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev) +{ + struct msi_msg msg; + int ret; + ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) + return ret; + + write_msi_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, + "edge"); + + return 0; +} + +void arch_teardown_msi_irq(unsigned int irq) +{ + return; +} + +#endif /* CONFIG_PCI_MSI */ + +/* + * Hypertransport interrupt support + */ +#ifdef CONFIG_HT_IRQ + +#ifdef CONFIG_SMP + +static void target_ht_irq(unsigned int irq, unsigned int dest) +{ + struct ht_irq_msg msg; + fetch_ht_irq_msg(irq, &msg); + + msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + + msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); + + write_ht_irq_msg(irq, &msg); +} + +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) +{ + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + tmp = TARGET_CPUS; + + cpus_and(mask, tmp, CPU_MASK_ALL); + + dest = cpu_mask_to_apicid(mask); + + target_ht_irq(irq, dest); + set_native_irq_info(irq, mask); +} +#endif + +static struct irq_chip ht_irq_chip = { + .name = "PCI-HT", + .mask = mask_ht_irq, + .unmask = unmask_ht_irq, + .ack = ack_ioapic_irq, +#ifdef CONFIG_SMP + .set_affinity = set_ht_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +{ + int vector; + + vector = assign_irq_vector(irq); + if (vector >= 0) { + struct ht_irq_msg msg; + unsigned dest; + cpumask_t tmp; + + cpus_clear(tmp); + cpu_set(vector >> 8, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(vector) | + ((INT_DEST_MODE == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; + + write_ht_irq_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); + } + return vector; +} +#endif /* CONFIG_HT_IRQ */ + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ @@ -2774,13 +2939,34 @@ int io_apic_set_pci_routing (int ioapic, if (!ioapic && (irq < 16)) disable_8259A_irq(irq); + ioapic_write_entry(ioapic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); - set_native_irq_info(use_pci_vector() ? 
entry.vector : irq, TARGET_CPUS); + set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; } #endif /* CONFIG_ACPI */ + +static int __init parse_disable_timer_pin_1(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", parse_disable_timer_pin_1); + +static int __init parse_enable_timer_pin_1(char *arg) +{ + disable_timer_pin_1 = -1; + return 0; +} +early_param("enable_timer_pin_1", parse_enable_timer_pin_1); + +static int __init parse_noapic(char *arg) +{ + /* disable IO-APIC */ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); --- head-2011-03-11.orig/arch/x86/kernel/ldt_32-xen.c 2007-06-12 13:12:48.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/ldt_32-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -1,5 +1,5 @@ /* - * linux/kernel/ldt.c + * linux/arch/i386/kernel/ldt.c * * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds * Copyright (C) 1999 Ingo Molnar --- head-2011-03-11.orig/arch/x86/kernel/microcode-xen.c 2007-06-12 13:12:48.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/microcode-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -2,6 +2,7 @@ * Intel CPU Microcode Update Driver for Linux * * Copyright (C) 2000-2004 Tigran Aivazian + * 2006 Shaohua Li * * This driver allows to upgrade microcode on Intel processors * belonging to IA-32 family - PentiumPro, Pentium II, @@ -33,7 +34,9 @@ #include #include #include -#include +#include +#include +#include #include #include @@ -55,12 +58,7 @@ module_param(verbose, int, 0644); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); -static int microcode_open (struct inode *unused1, struct file *unused2) -{ - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -} - - +#ifdef CONFIG_MICROCODE_OLD_INTERFACE static int do_microcode_update (const void __user *ubuf, size_t len) { int err; @@ -85,6 +83,11 @@ static int do_microcode_update (const vo return err; } +static int microcode_open (struct inode *unused1, struct file *unused2) +{ + return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; +} + static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) { ssize_t ret; @@ -117,7 +120,7 @@ static struct miscdevice microcode_dev = .fops = µcode_fops, }; -static int __init microcode_init (void) +static int __init microcode_dev_init (void) { int error; @@ -129,6 +132,68 @@ static int __init microcode_init (void) return error; } + return 0; +} + +static void __exit microcode_dev_exit (void) +{ + misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while(0) +#endif + +/* fake device for request_firmware */ +static struct platform_device *microcode_pdev; + +static int request_microcode(void) +{ + char name[30]; + const struct cpuinfo_x86 *c = &boot_cpu_data; + const struct firmware *firmware; + int error; + struct xen_platform_op op; + + sprintf(name,"intel-ucode/%02x-%02x-%02x", + c->x86, c->x86_model, c->x86_mask); + error = request_firmware(&firmware, name, µcode_pdev->dev); + if (error) { + pr_debug("ucode data file %s load failed\n", name); + return error; + } + + op.cmd = XENPF_microcode_update; + set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data); + op.u.microcode.length = firmware->size; + error = HYPERVISOR_platform_op(&op); + + release_firmware(firmware); + + if (error) + pr_debug("ucode load failed\n"); + + return error; +} + +static int __init microcode_init (void) +{ + int error; + + error = microcode_dev_init(); + if (error) + return error; + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) { + microcode_dev_exit(); + return PTR_ERR(microcode_pdev); + } + + request_microcode(); + printk(KERN_INFO "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); return 0; @@ -136,9 +201,9 @@ static int __init microcode_init (void) static void __exit microcode_exit (void) { - misc_deregister(µcode_dev); + microcode_dev_exit(); + platform_device_unregister(microcode_pdev); } module_init(microcode_init) module_exit(microcode_exit) -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); --- head-2011-03-11.orig/arch/x86/kernel/mpparse_32-xen.c 2007-06-12 13:12:48.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/mpparse_32-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -68,7 +69,7 @@ unsigned int def_to_bigsmp = 0; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; /* Internal processor count */ -static unsigned int __devinitdata num_processors; +unsigned int __cpuinitdata num_processors; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; @@ -235,12 +236,14 @@ static void __init MP_bus_info (struct m mpc_oem_bus_info(m, str, translation_table[mpc_record]); +#if MAX_MP_BUSSES < 256 if (m->mpc_busid >= MAX_MP_BUSSES) { printk(KERN_WARNING "MP table busid value (%d) for bustype %s " " is too large, max. supported is %d\n", m->mpc_busid, str, MAX_MP_BUSSES - 1); return; } +#endif if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; @@ -300,19 +303,6 @@ static void __init MP_lintsrc_info (stru m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); - /* - * Well it seems all SMP boards in existence - * use ExtINT/LVT1 == LINT0 and - * NMI/LVT2 == LINT1 - the following check - * will show us if this assumptions is false. 
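The microcode driver rework above keeps the old /dev/cpu/microcode write interface behind CONFIG_MICROCODE_OLD_INTERFACE and adds a request_firmware() path fed from a dummy platform device. That loader pattern in isolation (a sketch: the device and blob names are invented, and apply_blob() stands in for the real update hook):

#include <linux/err.h>
#include <linux/firmware.h>
#include <linux/platform_device.h>

static void apply_blob(const void *data, size_t size) { /* hypothetical */ }

static int demo_load_firmware(void)
{
	struct platform_device *pdev;
	const struct firmware *fw;
	int err;

	/* request_firmware() needs a struct device to hang the sysfs
	 * loading interface off; a bare platform device suffices. */
	pdev = platform_device_register_simple("demo", -1, NULL, 0);
	if (IS_ERR(pdev))
		return PTR_ERR(pdev);

	err = request_firmware(&fw, "demo/blob.bin", &pdev->dev);
	if (!err) {
		apply_blob(fw->data, fw->size);	/* valid until release */
		release_firmware(fw);
	}

	platform_device_unregister(pdev);
	return err;
}

request_microcode() above follows the same shape, building the firmware name from the CPU's family/model/stepping and handing the payload to the hypervisor via XENPF_microcode_update.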
- * Until then we do not have to add baggage. - */ - if ((m->mpc_irqtype == mp_ExtINT) && - (m->mpc_destapiclint != 0)) - BUG(); - if ((m->mpc_irqtype == mp_NMI) && - (m->mpc_destapiclint != 1)) - BUG(); } #ifdef CONFIG_X86_NUMAQ @@ -838,8 +828,7 @@ int es7000_plat; #ifdef CONFIG_ACPI -void __init mp_register_lapic_address ( - u64 address) +void __init mp_register_lapic_address(u64 address) { #ifndef CONFIG_XEN mp_lapic_addr = (unsigned long) address; @@ -853,13 +842,10 @@ void __init mp_register_lapic_address ( #endif } - -void __devinit mp_register_lapic ( - u8 id, - u8 enabled) +void __devinit mp_register_lapic (u8 id, u8 enabled) { struct mpc_config_processor processor; - int boot_cpu = 0; + int boot_cpu = 0; if (MAX_APICS - id <= 0) { printk(KERN_WARNING "Processor #%d invalid (max %d)\n", @@ -898,11 +884,9 @@ static struct mp_ioapic_routing { u32 pin_programmed[4]; } mp_ioapic_routing[MAX_IO_APICS]; - -static int mp_find_ioapic ( - int gsi) +static int mp_find_ioapic (int gsi) { - int i = 0; + int i = 0; /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { @@ -915,15 +899,11 @@ static int mp_find_ioapic ( return -1; } - -void __init mp_register_ioapic ( - u8 id, - u32 address, - u32 gsi_base) +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) { - int idx = 0; - int tmpid; + int idx = 0; + int tmpid; if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " @@ -971,16 +951,10 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); - - return; } - -void __init mp_override_legacy_irq ( - u8 bus_irq, - u8 polarity, - u8 trigger, - u32 gsi) +void __init +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { struct mpc_config_intsrc intsrc; int ioapic = -1; @@ -1018,15 +992,13 @@ void __init mp_override_legacy_irq ( mp_irqs[mp_irq_entries] = intsrc; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); - - return; } void __init mp_config_acpi_legacy_irqs (void) { struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; + int i = 0; + int ioapic = -1; /* * Fabricate the legacy ISA bus (bus #31). 
@@ -1095,12 +1067,12 @@ void __init mp_config_acpi_legacy_irqs ( #define MAX_GSI_NUM 4096 -int mp_register_gsi (u32 gsi, int triggering, int polarity) +int mp_register_gsi(u32 gsi, int triggering, int polarity) { - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - static int pci_irq = 16; + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + static int pci_irq = 16; /* * Mapping between Global System Interrups, which * represent all possible interrupts, and IRQs --- head-2011-03-11.orig/arch/x86/kernel/pci-dma-xen.c 2009-11-06 10:23:23.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/pci-dma-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -110,8 +110,7 @@ dma_map_sg(struct device *hwdev, struct { int i, rc; - if (direction == DMA_NONE) - BUG(); + BUG_ON(!valid_dma_direction(direction)); WARN_ON(nents == 0 || sg[0].length == 0); if (swiotlb) { @@ -142,7 +141,7 @@ dma_unmap_sg(struct device *hwdev, struc { int i; - BUG_ON(direction == DMA_NONE); + BUG_ON(!valid_dma_direction(direction)); if (swiotlb) swiotlb_unmap_sg(hwdev, sg, nents, direction); else { @@ -159,8 +158,7 @@ dma_map_page(struct device *dev, struct { dma_addr_t dma_addr; - BUG_ON(direction == DMA_NONE); - + BUG_ON(!valid_dma_direction(direction)); if (swiotlb) { dma_addr = swiotlb_map_page( dev, page, offset, size, direction); @@ -177,7 +175,7 @@ void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, enum dma_data_direction direction) { - BUG_ON(direction == DMA_NONE); + BUG_ON(!valid_dma_direction(direction)); if (swiotlb) swiotlb_unmap_page(dev, dma_address, size, direction); else @@ -356,8 +354,7 @@ dma_map_single(struct device *dev, void { dma_addr_t dma; - if (direction == DMA_NONE) - BUG(); + BUG_ON(!valid_dma_direction(direction)); WARN_ON(size == 0); if (swiotlb) { @@ -378,8 +375,7 @@ void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction direction) { - if (direction == DMA_NONE) - BUG(); + BUG_ON(!valid_dma_direction(direction)); if (swiotlb) swiotlb_unmap_single(dev, dma_addr, size, direction); else --- head-2011-03-11.orig/arch/x86/kernel/process_32-xen.c 2008-07-21 11:00:32.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/process_32-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -186,7 +187,7 @@ void cpu_idle(void) void cpu_idle_wait(void) { unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map; + cpumask_t map, tmp = current->cpus_allowed; set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); put_cpu(); @@ -208,6 +209,8 @@ void cpu_idle_wait(void) } cpus_and(map, map, cpu_online_map); } while (!cpus_empty(map)); + + set_cpus_allowed(current, tmp); } EXPORT_SYMBOL_GPL(cpu_idle_wait); @@ -240,9 +243,9 @@ void show_regs(struct pt_regs * regs) if (user_mode_vm(regs)) printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); printk(" EFLAGS: %08lx %s (%s %.*s)\n", - regs->eflags, print_tainted(), system_utsname.release, - (int)strcspn(system_utsname.version, " "), - system_utsname.version); + regs->eflags, print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->eax,regs->ebx,regs->ecx,regs->edx); printk("ESI: %08lx EDI: %08lx EBP: %08lx", @@ -264,15 +267,6 @@ void show_regs(struct pt_regs * regs) * the "args". 
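The BUG_ON(!valid_dma_direction(direction)) conversions above tighten the old test: comparing against DMA_NONE rejected exactly one bad value, while the new helper accepts exactly the three legal ones. Its definition is, approximately (paraphrased from 2.6.19's <linux/dma-mapping.h>):

static inline int valid_dma_direction(int dma_direction)
{
	return ((dma_direction == DMA_BIDIRECTIONAL) ||
		(dma_direction == DMA_TO_DEVICE) ||
		(dma_direction == DMA_FROM_DEVICE));
}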
*/ extern void kernel_thread_helper(void); -__asm__(".section .text\n" - ".align 4\n" - "kernel_thread_helper:\n\t" - "movl %edx,%eax\n\t" - "pushl %edx\n\t" - "call *%ebx\n\t" - "pushl %eax\n\t" - "call do_exit\n" - ".previous"); /* * Create a kernel thread @@ -290,7 +284,7 @@ int kernel_thread(int (*fn)(void *), voi regs.xes = __USER_DS; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = GET_KERNEL_CS(); + regs.xcs = __KERNEL_CS | get_kernel_rpl(); regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. */ @@ -369,13 +363,12 @@ int copy_thread(int nr, unsigned long cl tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { - p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, + IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; return -ENOMEM; } - memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr, - IO_BITMAP_BYTES); set_tsk_thread_flag(p, TIF_IO_BITMAP); } @@ -871,7 +864,7 @@ asmlinkage int sys_get_thread_area(struc unsigned long arch_align_stack(unsigned long sp) { - if (randomize_va_space) + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } --- head-2011-03-11.orig/arch/x86/kernel/setup_32-xen.c 2008-04-22 15:41:51.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/setup_32-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -83,9 +84,6 @@ static struct notifier_block xen_panic_b xen_panic_event, NULL, 0 /* try to go last */ }; -extern char hypercall_page[PAGE_SIZE]; -EXPORT_SYMBOL(hypercall_page); - int disable_pse __devinitdata = 0; /* @@ -105,18 +103,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; -#ifdef CONFIG_ACPI - int acpi_disabled = 0; -#else - int acpi_disabled = 1; -#endif -EXPORT_SYMBOL(acpi_disabled); - -#ifdef CONFIG_ACPI -int __initdata acpi_force = 0; -extern acpi_interrupt_flags acpi_sci_flags; -#endif - /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; #ifdef CONFIG_MCA @@ -170,7 +156,6 @@ struct e820map machine_e820; #endif extern void early_cpu_init(void); -extern void generic_apic_probe(char *); extern int root_mountflags; unsigned long saved_videomode; @@ -243,9 +228,6 @@ static struct resource adapter_rom_resou .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM } }; -#define ADAPTER_ROM_RESOURCES \ - (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) - static struct resource video_rom_resource = { .name = "Video ROM", .start = 0xc0000, @@ -307,9 +289,6 @@ static struct resource standard_io_resou .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -#define STANDARD_IO_RESOURCES \ - (sizeof standard_io_resources / sizeof standard_io_resources[0]) - #define romsignature(x) (*(unsigned short *)(x) == 0xaa55) static int __init romchecksum(unsigned char *rom, unsigned long length) @@ -372,7 +351,7 @@ static void __init probe_roms(void) } /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { rom = isa_bus_to_virt(start); if (!romsignature(rom)) continue; @@ -779,246 +758,152 @@ static inline void copy_edd(void) } #endif -static void __init parse_cmdline_early (char ** cmdline_p) +static int 
__initdata user_defined_memmap = 0;
+
+/*
+ * "mem=nopentium" disables the 4MB page tables.
+ * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
+ * to <mem>, overriding the bios size.
+ * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
+ * <start> to <start>+<size>, overriding the bios size.
+ *
+ * HPA tells me bootloaders need to parse mem=, so no new
+ * option should be mem= [also see Documentation/i386/boot.txt]
+ */
+static int __init parse_mem(char *arg)
 {
-	char c = ' ', *to = command_line, *from = saved_command_line;
-	int len = 0, max_cmdline;
-	int userdef = 0;
-
-	if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
-		max_cmdline = COMMAND_LINE_SIZE;
-	memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
-	/* Save unparsed command line copy for /proc/cmdline */
-	saved_command_line[max_cmdline-1] = '\0';
-
-	for (;;) {
-		if (c != ' ')
-			goto next_char;
-		/*
-		 * "mem=nopentium" disables the 4MB page tables.
-		 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
-		 * to <mem>, overriding the bios size.
-		 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
-		 * <start> to <start>+<size>, overriding the bios size.
-		 *
-		 * HPA tells me bootloaders need to parse mem=, so no new
-		 * option should be mem= [also see Documentation/i386/boot.txt]
-		 */
-		if (!memcmp(from, "mem=", 4)) {
-			if (to != command_line)
-				to--;
-			if (!memcmp(from+4, "nopentium", 9)) {
-				from += 9+4;
-				clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
-				disable_pse = 1;
-			} else {
-				/* If the user specifies memory size, we
-				 * limit the BIOS-provided memory map to
-				 * that size. exactmap can be used to specify
-				 * the exact map. mem=number can be used to
-				 * trim the existing memory map.
-				 */
-				unsigned long long mem_size;
-
-				mem_size = memparse(from+4, &from);
-				limit_regions(mem_size);
-				userdef=1;
-			}
-		}
+	if (!arg)
+		return -EINVAL;

-		else if (!memcmp(from, "memmap=", 7)) {
-			if (to != command_line)
-				to--;
-			if (!memcmp(from+7, "exactmap", 8)) {
-#ifdef CONFIG_CRASH_DUMP
-				/* If we are doing a crash dump, we
-				 * still need to know the real mem
-				 * size before original memory map is
-				 * reset.
-				 */
-				find_max_pfn();
-				saved_max_pfn = max_pfn;
-#endif
-				from += 8+7;
-				e820.nr_map = 0;
-				userdef = 1;
-			} else {
-				/* If the user specifies memory size, we
-				 * limit the BIOS-provided memory map to
-				 * that size. exactmap can be used to specify
-				 * the exact map. mem=number can be used to
-				 * trim the existing memory map.
-				 */
-				unsigned long long start_at, mem_size;
+	if (strcmp(arg, "nopentium") == 0) {
+		clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+		disable_pse = 1;
+	} else {
+		/* If the user specifies memory size, we
+		 * limit the BIOS-provided memory map to
+		 * that size. exactmap can be used to specify
+		 * the exact map. mem=number can be used to
+		 * trim the existing memory map.
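+		 * For instance (illustrative example, not from the
+		 * original patch): booting with "mem=512M" makes the
+		 * limit_regions() call below clamp the BIOS-provided
+		 * map at 512 MiB.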
+ */ + unsigned long long mem_size; - mem_size = memparse(from+7, &from); - if (*from == '@') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_RAM); - } else if (*from == '#') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_ACPI); - } else if (*from == '$') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_RESERVED); - } else { - limit_regions(mem_size); - userdef=1; - } - } - } - - else if (!memcmp(from, "noexec=", 7)) - noexec_setup(from + 7); + mem_size = memparse(arg, &arg); + limit_regions(mem_size); + user_defined_memmap = 1; + } + return 0; +} +early_param("mem", parse_mem); +static int __init parse_memmap(char *arg) +{ + if (!arg) + return -EINVAL; -#ifdef CONFIG_X86_MPPARSE - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. + if (strcmp(arg, "exactmap") == 0) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(from + 8, NULL, 0); - } + find_max_pfn(); + saved_max_pfn = max_pfn; #endif + e820.nr_map = 0; + user_defined_memmap = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long start_at, mem_size; -#ifdef CONFIG_ACPI - /* "acpi=off" disables both ACPI table parsing and interpreter */ - else if (!memcmp(from, "acpi=off", 8)) { - disable_acpi(); - } - - /* acpi=force to over-ride black-list */ - else if (!memcmp(from, "acpi=force", 10)) { - acpi_force = 1; - acpi_ht = 1; - acpi_disabled = 0; - } - - /* acpi=strict disables out-of-spec workarounds */ - else if (!memcmp(from, "acpi=strict", 11)) { - acpi_strict = 1; - } - - /* Limit ACPI just to boot-time to enable HT */ - else if (!memcmp(from, "acpi=ht", 7)) { - if (!acpi_force) - disable_acpi(); - acpi_ht = 1; - } - - /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */ - else if (!memcmp(from, "pci=noacpi", 10)) { - acpi_disable_pci(); - } - /* "acpi=noirq" disables ACPI interrupt routing */ - else if (!memcmp(from, "acpi=noirq", 10)) { - acpi_noirq_set(); + mem_size = memparse(arg, &arg); + if (*arg == '@') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*arg == '#') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*arg == '$') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + limit_regions(mem_size); + user_defined_memmap = 1; } + } + return 0; +} +early_param("memmap", parse_memmap); - else if (!memcmp(from, "acpi_sci=edge", 13)) - acpi_sci_flags.trigger = 1; - - else if (!memcmp(from, "acpi_sci=level", 14)) - acpi_sci_flags.trigger = 3; +#ifdef CONFIG_PROC_VMCORE +/* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. 
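+ * For example (illustrative, not part of the original patch): a kdump
+ * capture kernel might be started with "elfcorehdr=16M", which the
+ * memparse() below turns into the physical address of the saved header.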
+ */ +static int __init parse_elfcorehdr(char *arg) +{ + if (!arg) + return -EINVAL; - else if (!memcmp(from, "acpi_sci=high", 13)) - acpi_sci_flags.polarity = 1; + elfcorehdr_addr = memparse(arg, &arg); + return 0; +} +early_param("elfcorehdr", parse_elfcorehdr); +#endif /* CONFIG_PROC_VMCORE */ - else if (!memcmp(from, "acpi_sci=low", 12)) - acpi_sci_flags.polarity = 3; +/* + * highmem=size forces highmem to be exactly 'size' bytes. + * This works even on boxes that have no highmem otherwise. + * This also works to reduce highmem size on bigger boxes. + */ +static int __init parse_highmem(char *arg) +{ + if (!arg) + return -EINVAL; -#ifdef CONFIG_X86_IO_APIC - else if (!memcmp(from, "acpi_skip_timer_override", 24)) - acpi_skip_timer_override = 1; + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; + return 0; +} +early_param("highmem", parse_highmem); - if (!memcmp(from, "disable_timer_pin_1", 19)) - disable_timer_pin_1 = 1; - if (!memcmp(from, "enable_timer_pin_1", 18)) - disable_timer_pin_1 = -1; - - /* disable IO-APIC */ - else if (!memcmp(from, "noapic", 6)) - disable_ioapic_setup(); -#endif /* CONFIG_X86_IO_APIC */ -#endif /* CONFIG_ACPI */ +/* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. This can be used to increase (or decrease) the + * vmalloc area - the default is 128m. + */ +static int __init parse_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; -#ifdef CONFIG_X86_LOCAL_APIC - /* enable local APIC */ - else if (!memcmp(from, "lapic", 5)) - lapic_enable(); - - /* disable local APIC */ - else if (!memcmp(from, "nolapic", 6)) - lapic_disable(); -#endif /* CONFIG_X86_LOCAL_APIC */ + __VMALLOC_RESERVE = memparse(arg, &arg); + return 0; +} +early_param("vmalloc", parse_vmalloc); -#ifdef CONFIG_KEXEC - /* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never set's it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ - else if (!memcmp(from, "crashkernel=", 12)) { #ifndef CONFIG_XEN - unsigned long size, base; - size = memparse(from+12, &from); - if (*from == '@') { - base = memparse(from+1, &from); - /* FIXME: Do I want a sanity check - * to validate the memory range? - */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } -#else - printk("Ignoring crashkernel command line, " - "parameter will be supplied by xen\n"); -#endif - } -#endif -#ifdef CONFIG_PROC_VMCORE - /* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. - */ - else if (!memcmp(from, "elfcorehdr=", 11)) - elfcorehdr_addr = memparse(from+11, &from); -#endif +/* + * reservetop=size reserves a hole at the top of the kernel address space which + * a hypervisor can load into later. Needed for dynamically loaded hypervisors, + * so relocating the fixmap can be done before paging initialization. + */ +static int __init parse_reservetop(char *arg) +{ + unsigned long address; - /* - * highmem=size forces highmem to be exactly 'size' bytes. - * This works even on boxes that have no highmem otherwise. - * This also works to reduce highmem size on bigger boxes. - */ - else if (!memcmp(from, "highmem=", 8)) - highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; - - /* - * vmalloc=size forces the vmalloc area to be exactly 'size' - * bytes. This can be used to increase (or decrease) the - * vmalloc area - the default is 128m. 
- */ - else if (!memcmp(from, "vmalloc=", 8)) - __VMALLOC_RESERVE = memparse(from+8, &from); + if (!arg) + return -EINVAL; - next_char: - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - *to = '\0'; - *cmdline_p = command_line; - if (userdef) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - print_memory_map("user"); - } + address = memparse(arg, &arg); + reserve_top_address(address); + return 0; } +early_param("reservetop", parse_reservetop); +#endif /* * Callback for efi_memory_walk. @@ -1039,7 +924,7 @@ efi_find_max_pfn(unsigned long start, un static int __init efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) { - memory_present(0, start, end); + memory_present(0, PFN_UP(start), PFN_DOWN(end)); return 0; } @@ -1306,6 +1191,14 @@ static unsigned long __init setup_memory } printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); + num_physpages = highend_pfn; + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif +#ifdef CONFIG_FLATMEM + max_mapnr = num_physpages; #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); @@ -1317,22 +1210,21 @@ static unsigned long __init setup_memory void __init zone_sizes_init(void) { - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - unsigned int max_dma, low; - - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - low = max_low_pfn; - - if (low < max_dma) - zones_size[ZONE_DMA] = low; - else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; + unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + max_zone_pfns[ZONE_DMA] = + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = highend_pfn - low; + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; + add_active_range(0, 0, min(xen_start_info->nr_pages, highend_pfn)); + add_active_range(0, highend_pfn, highend_pfn); +#else + add_active_range(0, 0, min(xen_start_info->nr_pages, max_low_pfn)); + add_active_range(0, max_low_pfn, max_low_pfn); #endif - } - free_area_init(zones_size); + + free_area_init_nodes(max_zone_pfns); } #else extern unsigned long __init setup_memory(void); @@ -1389,6 +1281,7 @@ void __init setup_bootmem_allocator(void */ acpi_reserve_bootmem(); #endif + numa_kva_reserve(); #endif /* !CONFIG_XEN */ #ifdef CONFIG_BLK_DEV_INITRD @@ -1574,7 +1467,7 @@ static int __init request_standard_resou request_resource(&iomem_resource, &video_ram_resource); /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < STANDARD_IO_RESOURCES; i++) + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) request_resource(&ioport_resource, &standard_io_resources[i]); return 0; } @@ -1705,17 +1598,19 @@ void __init setup_arch(char **cmdline_p) data_resource.start = virt_to_phys(_etext); data_resource.end = virt_to_phys(_edata)-1; - parse_cmdline_early(cmdline_p); + if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) + i = COMMAND_LINE_SIZE; + memcpy(saved_command_line, xen_start_info->cmd_line, i); + saved_command_line[i - 1] = '\0'; + parse_early_param(); -#ifdef CONFIG_EARLY_PRINTK - { - char *s = strstr(*cmdline_p, "earlyprintk="); - if (s) { - setup_early_printk(strchr(s, '=') + 1); - printk("early console enabled\n"); - } + if (user_defined_memmap) { + 
printk(KERN_INFO "user-defined physical RAM map:\n"); + print_memory_map("user"); } -#endif + + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; max_low_pfn = setup_memory(); @@ -1822,7 +1717,7 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); #ifdef CONFIG_X86_GENERICARCH - generic_apic_probe(*cmdline_p); + generic_apic_probe(); #endif if (efi_enabled) efi_map_memmap(); @@ -1843,9 +1738,11 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); #endif +#ifdef CONFIG_PCI #ifdef CONFIG_X86_IO_APIC check_acpi_pci(); /* Checks more than just ACPI actually */ #endif +#endif #ifdef CONFIG_ACPI acpi_boot_init(); --- head-2011-03-11.orig/arch/x86/kernel/smp_32-xen.c 2007-12-10 08:47:31.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/smp_32-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -279,8 +279,7 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. */ -irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id, - struct pt_regs *regs) +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id) { unsigned long cpu; @@ -567,16 +566,14 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. */ -irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id, - struct pt_regs *regs) +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) { return IRQ_HANDLED; } #include -irqreturn_t smp_call_function_interrupt(int irq, void *dev_id, - struct pt_regs *regs) +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id) { void (*func) (void *info) = call_data->func; void *info = call_data->info; @@ -603,3 +600,69 @@ irqreturn_t smp_call_function_interrupt( return IRQ_HANDLED; } +/* + * this function sends a 'generic call function' IPI to one other CPU + * in the system. + * + * cpu is a standard Linux logical CPU number. + */ +static void +__smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = 1; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + wmb(); + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (!wait) + return; + + while (atomic_read(&data.finished) != cpus) + cpu_relax(); +} + +/* + * smp_call_function_single - Run a function on another CPU + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Currently unused. + * @wait: If true, wait until function has completed on other CPUs. + * + * Retrurns 0 on success, else a negative status code. + * + * Does not return until the remote CPU is nearly ready to execute + * or is or has executed. 
+ */ + +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + /* prevent preemption and reschedule on another processor */ + int me = get_cpu(); + if (cpu == me) { + WARN_ON(1); + put_cpu(); + return -EBUSY; + } + spin_lock_bh(&call_lock); + __smp_call_function_single(cpu, func, info, nonatomic, wait); + spin_unlock_bh(&call_lock); + put_cpu(); + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); --- head-2011-03-11.orig/arch/x86/kernel/time-xen.c 2011-01-31 17:02:29.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/time-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -88,7 +88,6 @@ int pit_latch_buggy; /* ext unsigned long vxtime_hz = PIT_TICK_RATE; struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES; struct timespec __xtime __section_xtime; struct timezone __sys_tz __section_sys_tz; #endif @@ -96,8 +95,6 @@ struct timezone __sys_tz __section_sys_t unsigned int cpu_khz; /* Detected as we calibrate the TSC */ EXPORT_SYMBOL(cpu_khz); -extern unsigned long wall_jiffies; - DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); @@ -243,11 +240,10 @@ static void __update_wallclock(time_t se time_t wtm_sec, xtime_sec; u64 tmp, wc_nsec; - /* Adjust wall-clock time base based on wall_jiffies ticks. */ + /* Adjust wall-clock time base. */ wc_nsec = processed_system_time; wc_nsec += sec * (u64)NSEC_PER_SEC; wc_nsec += nsec; - wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK; /* Split wallclock base into seconds and nanoseconds. */ tmp = wc_nsec; @@ -376,16 +372,10 @@ void do_gettimeofday(struct timeval *tv) shadow = &per_cpu(shadow_time, cpu); do { - unsigned long lost; - local_time_version = shadow->version; seq = read_seqbegin(&xtime_lock); usec = get_usec_offset(shadow); - lost = jiffies - wall_jiffies; - - if (unlikely(lost)) - usec += lost * (USEC_PER_SEC / HZ); sec = xtime.tv_sec; usec += (xtime.tv_nsec / NSEC_PER_USEC); @@ -524,7 +514,7 @@ static void sync_xen_wallclock(unsigned write_seqlock_irq(&xtime_lock); sec = xtime.tv_sec; - nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK); + nsec = xtime.tv_nsec; __normalize_time(&sec, &nsec); op.cmd = XENPF_settime; @@ -598,42 +588,49 @@ unsigned long long sched_clock(void) } #endif -#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); -#ifdef __x86_64__ - /* Assume the lock function has either no stack frame or only a single word. - This checks if the address on the stack looks like a kernel text address. - There is a small window for false hits, but in that case the tick - is just accounted to the spinlock function. - Better would be to write these functions in assembler again - and check exactly. 
*/ +#if defined(CONFIG_SMP) || defined(__x86_64__) if (!user_mode_vm(regs) && in_lock_functions(pc)) { - char *v = *(char **)regs->rsp; - if ((v >= _stext && v <= _etext) || - (v >= _sinittext && v <= _einittext) || - (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END)) - return (unsigned long)v; - return ((unsigned long *)regs->rsp)[1]; +# ifdef CONFIG_FRAME_POINTER +# ifdef __i386__ + return ((unsigned long *)regs->ebp)[1]; +# else + return ((unsigned long *)regs->rbp)[1]; +# endif +# else +# ifdef __i386__ + unsigned long *sp; + if ((regs->xcs & 2) == 0) + sp = (unsigned long *)®s->esp; + else + sp = (unsigned long *)regs->esp; +# else + unsigned long *sp = (unsigned long *)regs->rsp; +# endif + /* Return address is either directly at stack pointer + or above a saved eflags. Eflags has bits 22-31 zero, + kernel addresses don't. */ + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; +# endif } -#else - if (!user_mode_vm(regs) && in_lock_functions(pc)) - return *(unsigned long *)(regs->ebp + 4); #endif return pc; } EXPORT_SYMBOL(profile_pc); -#endif /* * This is the same as the above, except we _also_ save the current * Time Stamp Counter value at the time of the timer interrupt, so that * we later on can estimate the time of day more exactly. */ -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t timer_interrupt(int irq, void *dev_id) { s64 delta, delta_cpu, stolen, blocked; u64 sched_time; @@ -692,10 +689,15 @@ irqreturn_t timer_interrupt(int irq, voi } /* System-wide jiffy work. */ - while (delta >= NS_PER_TICK) { - delta -= NS_PER_TICK; - processed_system_time += NS_PER_TICK; - do_timer(regs); + if (delta >= NS_PER_TICK) { + do_div(delta, NS_PER_TICK); + processed_system_time += delta * NS_PER_TICK; + while (delta > HZ) { + clobber_induction_variable(delta); + do_timer(HZ); + delta -= HZ; + } + do_timer(delta); } if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) { @@ -742,7 +744,7 @@ irqreturn_t timer_interrupt(int irq, voi if (delta_cpu > 0) { do_div(delta_cpu, NS_PER_TICK); per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK; - if (user_mode_vm(regs)) + if (user_mode_vm(get_irq_regs())) account_user_time(current, (cputime_t)delta_cpu); else account_system_time(current, HARDIRQ_OFFSET, @@ -756,10 +758,10 @@ irqreturn_t timer_interrupt(int irq, voi /* Local timer processing (see update_process_times()). 
*/ run_local_timers(); if (rcu_pending(cpu)) - rcu_check_callbacks(cpu, user_mode_vm(regs)); + rcu_check_callbacks(cpu, user_mode_vm(get_irq_regs())); scheduler_tick(); run_posix_cpu_timers(current); - profile_tick(CPU_PROFILING, regs); + profile_tick(CPU_PROFILING); return IRQ_HANDLED; } @@ -969,10 +971,11 @@ extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ static void __init hpet_time_init(void) { - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); + struct timespec ts; + ts.tv_sec = get_cmos_time(); + ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); + + do_settimeofday(&ts); if ((hpet_enable() >= 0) && hpet_use_timer) { printk("Using HPET for base-timer\n"); --- head-2011-03-11.orig/arch/x86/kernel/traps_32-xen.c 2008-04-02 12:34:02.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/traps_32-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -28,6 +28,7 @@ #include #include #include +#include #ifdef CONFIG_EISA #include @@ -40,7 +41,6 @@ #include #include -#include #include #include #include @@ -51,11 +51,14 @@ #include #include #include +#include #include #include "mach_traps.h" +int panic_on_unrecovered_nmi; + asmlinkage int system_call(void); struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, @@ -124,62 +127,63 @@ static inline int valid_stack_ptr(struct p < (void *)tinfo + THREAD_SIZE - 3; } -/* - * Print one address/symbol entries per line. - */ -static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl) -{ - printk(" [<%08lx>] ", addr); - - print_symbol("%s\n", addr); -} - static inline unsigned long print_context_stack(struct thread_info *tinfo, unsigned long *stack, unsigned long ebp, - char *log_lvl) + struct stacktrace_ops *ops, void *data) { unsigned long addr; #ifdef CONFIG_FRAME_POINTER while (valid_stack_ptr(tinfo, (void *)ebp)) { + unsigned long new_ebp; addr = *(unsigned long *)(ebp + 4); - print_addr_and_symbol(addr, log_lvl); + ops->address(data, addr); /* * break out of recursive entries (such as - * end_of_stack_stop_unwind_function): + * end_of_stack_stop_unwind_function). Also, + * we can never allow a frame pointer to + * move downwards! 
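+		 * (each new frame pointer must be strictly greater than
+		 * the previous one, which is what the new_ebp <= ebp
+		 * check below enforces).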
*/ - if (ebp == *(unsigned long *)ebp) + new_ebp = *(unsigned long *)ebp; + if (new_ebp <= ebp) break; - ebp = *(unsigned long *)ebp; + ebp = new_ebp; } #else while (valid_stack_ptr(tinfo, stack)) { addr = *stack++; if (__kernel_text_address(addr)) - print_addr_and_symbol(addr, log_lvl); + ops->address(data, addr); } #endif return ebp; } +struct ops_and_data { + struct stacktrace_ops *ops; + void *data; +}; + static asmlinkage int -show_trace_unwind(struct unwind_frame_info *info, void *log_lvl) +dump_trace_unwind(struct unwind_frame_info *info, void *data) { + struct ops_and_data *oad = (struct ops_and_data *)data; int n = 0; while (unwind(info) == 0 && UNW_PC(info)) { n++; - print_addr_and_symbol(UNW_PC(info), log_lvl); + oad->ops->address(oad->data, UNW_PC(info)); if (arch_unw_user_mode(info)) break; } return n; } -static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl) +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, + struct stacktrace_ops *ops, void *data) { - unsigned long ebp; + unsigned long ebp = 0; if (!task) task = current; @@ -187,54 +191,116 @@ static void show_trace_log_lvl(struct ta if (call_trace >= 0) { int unw_ret = 0; struct unwind_frame_info info; + struct ops_and_data oad = { .ops = ops, .data = data }; if (regs) { if (unwind_init_frame_info(&info, task, regs) == 0) - unw_ret = show_trace_unwind(&info, log_lvl); + unw_ret = dump_trace_unwind(&info, &oad); } else if (task == current) - unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl); + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); else { if (unwind_init_blocked(&info, task) == 0) - unw_ret = show_trace_unwind(&info, log_lvl); + unw_ret = dump_trace_unwind(&info, &oad); } if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { - print_symbol("DWARF2 unwinder stuck at %s\n", + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", UNW_PC(&info)); if (UNW_SP(&info) >= PAGE_OFFSET) { - printk("Leftover inexact backtrace:\n"); + ops->warning(data, "Leftover inexact backtrace:\n"); stack = (void *)UNW_SP(&info); + if (!stack) + return; + ebp = UNW_FP(&info); } else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) return; else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else - printk("Inexact backtrace:\n"); + ops->warning(data, "Inexact backtrace:\n"); } - - if (task == current) { - /* Grab ebp right from our regs */ - asm ("movl %%ebp, %0" : "=r" (ebp) : ); - } else { - /* ebp is the last reg pushed by switch_to */ - ebp = *(unsigned long *) task->thread.esp; + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (task && task != current) + stack = (unsigned long *)task->thread.esp; + } + +#ifdef CONFIG_FRAME_POINTER + if (!ebp) { + if (task == current) { + /* Grab ebp right from our regs */ + asm ("movl %%ebp, %0" : "=r" (ebp) : ); + } else { + /* ebp is the last reg pushed by switch_to */ + ebp = *(unsigned long *) task->thread.esp; + } } +#endif while (1) { struct thread_info *context; context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = print_context_stack(context, stack, ebp, log_lvl); + ebp = print_context_stack(context, stack, ebp, ops, data); + /* Should be after the line below, but somewhere + in early boot context comes out corrupted and we + can't reference it -AK */ + if 
(ops->stack(data, "IRQ") < 0)
+			break;
 		stack = (unsigned long*)context->previous_esp;
 		if (!stack)
 			break;
-		printk("%s =======================\n", log_lvl);
 	}
 }
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	printk("%s", (char *)data);
+	print_symbol(msg, symbol);
+	printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+	printk("%s%s\n", (char *)data, msg);
+}
 
-void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)
+static int print_trace_stack(void *data, char *name)
+{
+	return 0;
+}
+
+/*
+ * Print one address/symbol entry per line.
+ */
+static void print_trace_address(void *data, unsigned long addr)
+{
+	printk("%s [<%08lx>] ", (char *)data, addr);
+	print_symbol("%s\n", addr);
+}
+
+static struct stacktrace_ops print_trace_ops = {
+	.warning = print_trace_warning,
+	.warning_symbol = print_trace_warning_symbol,
+	.stack = print_trace_stack,
+	.address = print_trace_address,
+};
+
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		   unsigned long * stack, char *log_lvl)
+{
+	dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
+	printk("%s =======================\n", log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long * stack)
 {
 	show_trace_log_lvl(task, regs, stack, "");
 }
@@ -297,12 +363,13 @@ void show_registers(struct pt_regs *regs
 		ss = regs->xss & 0xffff;
 	}
 	print_modules();
-	printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
-		"EFLAGS: %08lx (%s %.*s) \n",
+	printk(KERN_EMERG "CPU: %d\n"
+		KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n"
+		KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n",
 		smp_processor_id(), 0xffff & regs->xcs, regs->eip,
-		print_tainted(), regs->eflags, system_utsname.release,
-		(int)strcspn(system_utsname.version, " "),
-		system_utsname.version);
+		print_tainted(), regs->eflags, init_utsname()->release,
+		(int)strcspn(init_utsname()->version, " "),
+		init_utsname()->version);
 	print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
 	printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
 		regs->eax, regs->ebx, regs->ecx, regs->edx);
@@ -319,6 +386,8 @@ void show_registers(struct pt_regs *regs
 	 */
 	if (in_kernel) {
 		u8 __user *eip;
+		int code_bytes = 64;
+		unsigned char c;
 
 		printk("\n" KERN_EMERG "Stack: ");
 		show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
@@ -326,9 +395,12 @@ void show_registers(struct pt_regs *regs
 		printk(KERN_EMERG "Code: ");
 
 		eip = (u8 __user *)regs->eip - 43;
-		for (i = 0; i < 64; i++, eip++) {
-			unsigned char c;
-
+		if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
+			/* try starting at EIP */
+			eip = (u8 __user *)regs->eip;
+			code_bytes = 32;
+		}
+		for (i = 0; i < code_bytes; i++, eip++) {
 			if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
 				printk(" Bad EIP value.");
 				break;
@@ -349,7 +421,7 @@ static void handle_BUG(struct pt_regs *r
 
 	if (eip < PAGE_OFFSET)
 		return;
-	if (__get_user(ud2, (unsigned short __user *)eip))
+	if (probe_kernel_address((unsigned short __user *)eip, ud2))
 		return;
 	if (ud2 != 0x0b0f)
 		return;
@@ -362,7 +434,8 @@ static void handle_BUG(struct pt_regs *r
 		char *file;
 		char c;
 
-		if (__get_user(line, (unsigned short __user *)(eip + 2)))
+		if (probe_kernel_address((unsigned short __user *)(eip + 2),
+					line))
 			break;
 		if (__get_user(file, (char * __user *)(eip + 4)) ||
 		    (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
@@ -604,18 +677,24 @@ gp_in_kernel:
 	}
 }
 
-static void mem_parity_error(unsigned char 
reason, struct pt_regs * regs) +static __kprobes void +mem_parity_error(unsigned char reason, struct pt_regs * regs) { - printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying " - "to continue\n"); + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " + "CPU %d.\n", reason, smp_processor_id()); printk(KERN_EMERG "You probably have a hardware problem with your RAM " "chips\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); /* Clear and disable the memory parity error line. */ clear_mem_error(reason); } -static void io_check_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +io_check_error(unsigned char reason, struct pt_regs * regs) { printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); @@ -624,7 +703,8 @@ static void io_check_error(unsigned char clear_io_check_error(reason); } -static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { #ifdef CONFIG_MCA /* Might actually be able to figure out what the guilty party @@ -634,15 +714,18 @@ static void unknown_nmi_error(unsigned c return; } #endif - printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - printk("Dazed and confused, but trying to continue\n"); - printk("Do you have a strange power saving mode enabled?\n"); + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " + "CPU %d.\n", reason, smp_processor_id()); + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } static DEFINE_SPINLOCK(nmi_print_lock); -void die_nmi (struct pt_regs *regs, const char *msg) +void __kprobes die_nmi(struct pt_regs *regs, const char *msg) { if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP) @@ -674,7 +757,7 @@ void die_nmi (struct pt_regs *regs, cons do_exit(SIGSEGV); } -static void default_do_nmi(struct pt_regs * regs) +static __kprobes void default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -691,12 +774,12 @@ static void default_do_nmi(struct pt_reg * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. 
*/ - if (nmi_watchdog) { - nmi_watchdog_tick(regs); + if (nmi_watchdog_tick(regs, reason)) return; - } + if (!do_nmi_callback(regs, smp_processor_id())) #endif - unknown_nmi_error(reason, regs); + unknown_nmi_error(reason, regs); + return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) @@ -712,14 +795,7 @@ static void default_do_nmi(struct pt_reg reassert_nmi(); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) -{ - return 0; -} - -static nmi_callback_t nmi_callback = dummy_nmi_callback; - -fastcall void do_nmi(struct pt_regs * regs, long error_code) +fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; @@ -729,25 +805,11 @@ fastcall void do_nmi(struct pt_regs * re ++nmi_count(cpu); - if (!rcu_dereference(nmi_callback)(regs, cpu)) - default_do_nmi(regs); + default_do_nmi(regs); nmi_exit(); } -void set_nmi_callback(nmi_callback_t callback) -{ - vmalloc_sync_all(); - rcu_assign_pointer(nmi_callback, callback); -} -EXPORT_SYMBOL_GPL(set_nmi_callback); - -void unset_nmi_callback(void) -{ - nmi_callback = dummy_nmi_callback; -} -EXPORT_SYMBOL_GPL(unset_nmi_callback); - #ifdef CONFIG_KPROBES fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) { --- head-2011-03-11.orig/arch/x86/mach-xen/setup.c 2008-04-02 12:34:02.000000000 +0200 +++ head-2011-03-11/arch/x86/mach-xen/setup.c 2011-01-31 17:29:16.000000000 +0100 @@ -103,8 +103,10 @@ void __init pre_setup_arch_hook(void) setup_xen_features(); - if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) - set_fixaddr_top(pp.virt_start); + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) { + hypervisor_virt_start = pp.virt_start; + reserve_top_address(0UL - pp.virt_start); + } if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { machine_to_phys_mapping = (unsigned long *)mapping.v_start; --- head-2011-03-11.orig/arch/x86/mm/fault_32-xen.c 2010-09-23 15:39:04.000000000 +0200 +++ head-2011-03-11/arch/x86/mm/fault_32-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -27,21 +27,24 @@ #include #include #include +#include extern void die(const char *,struct pt_regs *,long); -#ifdef CONFIG_KPROBES -ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); + int register_page_fault_notifier(struct notifier_block *nb) { vmalloc_sync_all(); return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); } +EXPORT_SYMBOL_GPL(register_page_fault_notifier); int unregister_page_fault_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); } +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); static inline int notify_page_fault(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) @@ -55,14 +58,6 @@ static inline int notify_page_fault(enum }; return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); } -#else -static inline int notify_page_fault(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) -{ - return NOTIFY_DONE; -} -#endif - /* * Unlock any spinlocks which will prevent us from getting the @@ -119,10 +114,10 @@ static inline unsigned long get_segment_ } /* The standard kernel/user address space limit. */ - *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg; + *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg; /* By far the most common cases. 
*/
-	if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
+	if (likely(SEGMENT_IS_FLAT_CODE(seg)))
 		return eip;
 
 	/* Check the segment exists, is within the current LDT/GDT size,
@@ -559,11 +554,7 @@ good_area:
 	write = 0;
 	switch (error_code & 3) {
 		default:	/* 3: write, present */
-#ifdef TEST_VERIFY_AREA
-			if (regs->cs == GET_KERNEL_CS())
-				printk("WP fault at %08lx\n", regs->eip);
-#endif
-			/* fall through */
+			/* fall through */
 		case 2:		/* write, not present */
 			if (!(vma->vm_flags & VM_WRITE))
 				goto bad_area;
@@ -572,7 +563,7 @@ good_area:
 		case 1:		/* read, present */
 			goto bad_area;
 		case 0:		/* read, not present */
-			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+			if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 				goto bad_area;
 	}
 
@@ -704,7 +695,7 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (tsk->pid == 1) {
+	if (is_init(tsk)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
--- head-2011-03-11.orig/arch/x86/mm/highmem_32-xen.c	2008-10-29 09:55:56.000000000 +0100
+++ head-2011-03-11/arch/x86/mm/highmem_32-xen.c	2011-01-31 17:29:16.000000000 +0100
@@ -38,11 +38,9 @@ static void *__kmap_atomic(struct page *
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-#ifdef CONFIG_DEBUG_HIGHMEM
 	if (!pte_none(*(kmap_pte-idx)))
 		BUG();
-#endif
-	set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
+	set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
 
 	return (void*) vaddr;
 }
@@ -62,36 +60,26 @@ void *kmap_atomic_pte(struct page *page,
 
 void kunmap_atomic(void *kvaddr, enum km_type type)
 {
-#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN)
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
 	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
 
-	if (vaddr < FIXADDR_START) { // FIXME
+#ifdef CONFIG_DEBUG_HIGHMEM
+	if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) {
 		dec_preempt_count();
 		preempt_check_resched();
 		return;
 	}
-#endif
 
-#if defined(CONFIG_DEBUG_HIGHMEM)
 	if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
 		BUG();
-
-	/*
-	 * force other mappings to Oops if they'll try to access
-	 * this pte without first remap it
-	 */
-	pte_clear(&init_mm, vaddr, kmap_pte-idx);
-	__flush_tlb_one(vaddr);
-#elif defined(CONFIG_XEN)
+#endif
 	/*
-	 * We must ensure there are no dangling pagetable references when
-	 * returning memory to Xen (decrease_reservation).
-	 * XXX TODO: We could make this faster by only zapping when
-	 * kmap_flush_unused is called but that is trickier and more invasive.
+	 * Force other mappings to Oops if they'll try to access this pte
+	 * without first remapping it. Keeping stale mappings around is also
+	 * a bad idea, in case the page changes cacheability attributes or
+	 * becomes a protected page in a hypervisor.
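+	 * (under Xen this also guarantees that no dangling pagetable
+	 * references remain when memory is handed back via
+	 * decrease_reservation).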
*/ - pte_clear(&init_mm, vaddr, kmap_pte-idx); -#endif + kpte_clear_flush(kmap_pte-idx, vaddr); dec_preempt_count(); preempt_check_resched(); @@ -110,7 +98,6 @@ void *kmap_atomic_pfn(unsigned long pfn, idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot)); - __flush_tlb_one(vaddr); return (void*) vaddr; } --- head-2011-03-11.orig/arch/x86/mm/hypervisor.c 2009-06-09 15:01:37.000000000 +0200 +++ head-2011-03-11/arch/x86/mm/hypervisor.c 2011-01-31 17:29:16.000000000 +0100 @@ -31,6 +31,7 @@ */ #include +#include #include #include #include @@ -44,6 +45,300 @@ #include #include +EXPORT_SYMBOL(hypercall_page); + +#define NR_MC BITS_PER_LONG +#define NR_MMU BITS_PER_LONG +#define NR_MMUEXT (BITS_PER_LONG / 4) + +DEFINE_PER_CPU(bool, xen_lazy_mmu); +struct lazy_mmu { + unsigned int nr_mc, nr_mmu, nr_mmuext; + multicall_entry_t mc[NR_MC]; + mmu_update_t mmu[NR_MMU]; + struct mmuext_op mmuext[NR_MMUEXT]; +}; +static DEFINE_PER_CPU(struct lazy_mmu, lazy_mmu); + +static inline bool use_lazy_mmu_mode(void) +{ +#ifdef CONFIG_PREEMPT + if (!preempt_count()) + return false; +#endif + return !irq_count(); +} + +static void multicall_failed(const multicall_entry_t *mc, int rc) +{ + printk(KERN_EMERG "hypercall#%lu(%lx, %lx, %lx, %lx)" + " failed: %d (caller %lx)\n", + mc->op, mc->args[0], mc->args[1], mc->args[2], mc->args[3], + rc, mc->args[5]); + BUG(); +} + +int xen_multicall_flush(bool ret_last) { + struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu); + multicall_entry_t *mc = lazy->mc; + unsigned int count = lazy->nr_mc; + + if (!count || !use_lazy_mmu_mode()) + return 0; + + lazy->nr_mc = 0; + lazy->nr_mmu = 0; + lazy->nr_mmuext = 0; + + if (count == 1) { + int rc = _hypercall(int, mc->op, mc->args[0], mc->args[1], + mc->args[2], mc->args[3], mc->args[4]); + + if (unlikely(rc)) { + if (ret_last) + return rc; + multicall_failed(mc, rc); + } + } else { + if (HYPERVISOR_multicall(mc, count)) + BUG(); + while (count-- > ret_last) + if (unlikely(mc++->result)) + multicall_failed(mc - 1, mc[-1].result); + if (ret_last) + return mc->result; + } + + return 0; +} + +int xen_multi_update_va_mapping(unsigned long va, pte_t pte, + unsigned long uvmf) +{ + struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu); + multicall_entry_t *mc; + + if (unlikely(!use_lazy_mmu_mode())) +#ifdef CONFIG_X86_PAE + return _hypercall4(int, update_va_mapping, va, + pte.pte_low, pte.pte_high, uvmf); +#else + return _hypercall3(int, update_va_mapping, va, + pte.pte, uvmf); +#endif + + if (unlikely(lazy->nr_mc == NR_MC)) + xen_multicall_flush(false); + + mc = lazy->mc + lazy->nr_mc++; + mc->op = __HYPERVISOR_update_va_mapping; + mc->args[0] = va; +#ifndef CONFIG_X86_PAE + mc->args[1] = pte.pte; +#else + mc->args[1] = pte.pte_low; + mc->args[2] = pte.pte_high; +#endif + mc->args[MULTI_UVMFLAGS_INDEX] = uvmf; + mc->args[5] = (long)__builtin_return_address(0); + + return 0; +} + +static inline bool mmu_may_merge(const multicall_entry_t *mc, + unsigned int op, domid_t domid) +{ + return mc->op == op && !mc->args[2] && mc->args[3] == domid; +} + +int xen_multi_mmu_update(mmu_update_t *src, unsigned int count, + unsigned int *success_count, domid_t domid) +{ + struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu); + multicall_entry_t *mc = lazy->mc + lazy->nr_mc; + mmu_update_t *dst; + bool commit, merge; + + if (unlikely(!use_lazy_mmu_mode())) + return _hypercall4(int, mmu_update, src, count, + success_count, domid); + + commit = (lazy->nr_mmu + count) > NR_MMU || 
success_count;
+	merge = lazy->nr_mc && !commit
+		&& mmu_may_merge(mc - 1, __HYPERVISOR_mmu_update, domid);
+	if (unlikely(lazy->nr_mc == NR_MC) && !merge) {
+		xen_multicall_flush(false);
+		mc = lazy->mc;
+		commit = count > NR_MMU || success_count;
+	}
+
+	if (!lazy->nr_mc && unlikely(commit))
+		return _hypercall4(int, mmu_update, src, count,
+				   success_count, domid);
+
+	dst = lazy->mmu + lazy->nr_mmu;
+	lazy->nr_mmu += count;
+	if (merge) {
+		mc[-1].args[1] += count;
+		memcpy(dst, src, count * sizeof(*src));
+	} else {
+		++lazy->nr_mc;
+		mc->op = __HYPERVISOR_mmu_update;
+		if (!commit) {
+			mc->args[0] = (unsigned long)dst;
+			memcpy(dst, src, count * sizeof(*src));
+		} else
+			mc->args[0] = (unsigned long)src;
+		mc->args[1] = count;
+		mc->args[2] = (unsigned long)success_count;
+		mc->args[3] = domid;
+		mc->args[5] = (long)__builtin_return_address(0);
+	}
+
+	while (!commit && count--)
+		switch (src++->ptr & (sizeof(pteval_t) - 1)) {
+		case MMU_NORMAL_PT_UPDATE:
+		case MMU_PT_UPDATE_PRESERVE_AD:
+			break;
+		default:
+			commit = true;
+			break;
+		}
+
+	return commit ? xen_multicall_flush(true) : 0;
+}
+
+int xen_multi_mmuext_op(struct mmuext_op *src, unsigned int count,
+			unsigned int *success_count, domid_t domid)
+{
+	struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
+	multicall_entry_t *mc;
+	struct mmuext_op *dst;
+	bool commit, merge;
+
+	if (unlikely(!use_lazy_mmu_mode()))
+		return _hypercall4(int, mmuext_op, src, count,
+				   success_count, domid);
+
+	/*
+	 * While it could be useful in theory, I've never seen the body of
+	 * this conditional reached, hence it seems more reasonable
+	 * to disable it for the time being.
+	 */
+	if (0 && likely(count)
+	    && likely(!success_count)
+	    && likely(domid == DOMID_SELF)
+	    && likely(lazy->nr_mc)
+	    && lazy->mc[lazy->nr_mc - 1].op == __HYPERVISOR_update_va_mapping) {
+		unsigned long oldf, newf = UVMF_NONE;
+
+		switch (src->cmd) {
+		case MMUEXT_TLB_FLUSH_ALL:
+			newf = UVMF_TLB_FLUSH | UVMF_ALL;
+			break;
+		case MMUEXT_INVLPG_ALL:
+			newf = UVMF_INVLPG | UVMF_ALL;
+			break;
+		case MMUEXT_TLB_FLUSH_MULTI:
+			newf = UVMF_TLB_FLUSH | UVMF_MULTI
+			       | (unsigned long)src->arg2.vcpumask.p;
+			break;
+		case MMUEXT_INVLPG_MULTI:
+			newf = UVMF_INVLPG | UVMF_MULTI
+			       | (unsigned long)src->arg2.vcpumask.p;
+			break;
+		case MMUEXT_TLB_FLUSH_LOCAL:
+			newf = UVMF_TLB_FLUSH | UVMF_LOCAL;
+			break;
+		case MMUEXT_INVLPG_LOCAL:
+			newf = UVMF_INVLPG | UVMF_LOCAL;
+			break;
+		}
+		mc = lazy->mc + lazy->nr_mc - 1;
+		oldf = mc->args[MULTI_UVMFLAGS_INDEX];
+		if (newf == UVMF_NONE || oldf == UVMF_NONE
+		    || newf == (UVMF_TLB_FLUSH | UVMF_ALL))
+			;
+		else if (oldf == (UVMF_TLB_FLUSH | UVMF_ALL))
+			newf = UVMF_TLB_FLUSH | UVMF_ALL;
+		else if ((newf & UVMF_FLUSHTYPE_MASK) == UVMF_INVLPG
+			 && (oldf & UVMF_FLUSHTYPE_MASK) == UVMF_INVLPG
+			 && ((src->arg1.linear_addr ^ mc->args[0])
+			     >> PAGE_SHIFT))
+			newf = UVMF_NONE;
+		else if (((oldf | newf) & UVMF_ALL)
+			 && !((oldf ^ newf) & UVMF_FLUSHTYPE_MASK))
+			newf |= UVMF_ALL;
+		else if ((oldf ^ newf) & ~UVMF_FLUSHTYPE_MASK)
+			newf = UVMF_NONE;
+		else if ((oldf & UVMF_FLUSHTYPE_MASK) == UVMF_TLB_FLUSH)
+			newf = (newf & ~UVMF_FLUSHTYPE_MASK) | UVMF_TLB_FLUSH;
+		else if ((newf & UVMF_FLUSHTYPE_MASK) != UVMF_TLB_FLUSH
+			 && ((newf ^ oldf) & UVMF_FLUSHTYPE_MASK))
+			newf = UVMF_NONE;
+		if (newf != UVMF_NONE) {
+			mc->args[MULTI_UVMFLAGS_INDEX] = newf;
+			++src;
+			if (!--count)
+				return 0;
+		}
+	}
+
+	mc = lazy->mc + lazy->nr_mc;
+	commit = (lazy->nr_mmuext + count) > NR_MMUEXT || success_count;
+	merge = lazy->nr_mc && !commit
+		&& mmu_may_merge(mc - 1, 
__HYPERVISOR_mmuext_op, domid); + if (unlikely(lazy->nr_mc == NR_MC) && !merge) { + xen_multicall_flush(false); + mc = lazy->mc; + commit = count > NR_MMUEXT || success_count; + } + + if (!lazy->nr_mc && unlikely(commit)) + return _hypercall4(int, mmuext_op, src, count, + success_count, domid); + + dst = lazy->mmuext + lazy->nr_mmuext; + lazy->nr_mmuext += count; + if (merge) { + mc[-1].args[1] += count; + memcpy(dst, src, count * sizeof(*src)); + } else { + ++lazy->nr_mc; + mc->op = __HYPERVISOR_mmuext_op; + if (!commit) { + mc->args[0] = (unsigned long)dst; + memcpy(dst, src, count * sizeof(*src)); + } else + mc->args[0] = (unsigned long)src; + mc->args[1] = count; + mc->args[2] = (unsigned long)success_count; + mc->args[3] = domid; + mc->args[5] = (long)__builtin_return_address(0); + } + + while (!commit && count--) + switch (src++->cmd) { + case MMUEXT_PIN_L1_TABLE: + case MMUEXT_PIN_L2_TABLE: + case MMUEXT_PIN_L3_TABLE: + case MMUEXT_PIN_L4_TABLE: + case MMUEXT_UNPIN_TABLE: + case MMUEXT_TLB_FLUSH_LOCAL: + case MMUEXT_INVLPG_LOCAL: + case MMUEXT_TLB_FLUSH_MULTI: + case MMUEXT_INVLPG_MULTI: + case MMUEXT_TLB_FLUSH_ALL: + case MMUEXT_INVLPG_ALL: + break; + default: + commit = true; + break; + } + + return commit ? xen_multicall_flush(true) : 0; +} + void xen_l1_entry_update(pte_t *ptr, pte_t val) { mmu_update_t u; @@ -546,7 +841,8 @@ int write_ldt_entry(void *ldt, int entry #define MAX_BATCHED_FULL_PTES 32 int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, pgprot_t newprot) + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable) { int rc = 0, i = 0; mmu_update_t u[MAX_BATCHED_FULL_PTES]; @@ -559,10 +855,14 @@ int xen_change_pte_range(struct mm_struc pte = pte_offset_map_lock(mm, pmd, addr, &ptl); do { if (pte_present(*pte)) { + pte_t ptent = pte_modify(*pte, newprot); + + if (dirty_accountable && pte_dirty(ptent)) + ptent = pte_mkwrite(ptent); u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK) | ((unsigned long)pte & ~PAGE_MASK) | MMU_PT_UPDATE_PRESERVE_AD; - u[i].val = __pte_val(pte_modify(*pte, newprot)); + u[i].val = __pte_val(ptent); if (++i == MAX_BATCHED_FULL_PTES) { if ((rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF)) != 0) --- head-2011-03-11.orig/arch/x86/mm/init_32-xen.c 2008-10-29 09:55:56.000000000 +0100 +++ head-2011-03-11/arch/x86/mm/init_32-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -313,8 +313,7 @@ static void __init permanent_kmaps_init( static void __meminit free_new_highpage(struct page *page, int pfn) { init_page_count(page); - if (pfn < xen_start_info->nr_pages) - __free_page(page); + __free_page(page); totalhigh_pages++; } @@ -357,8 +356,16 @@ extern void set_highmem_pages_init(int); static void __init set_highmem_pages_init(int bad_ppro) { int pfn; - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) + for (pfn = highstart_pfn; pfn < highend_pfn + && pfn < xen_start_info->nr_pages; pfn++) add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); + + /* XEN: init high-mem pages outside initial allocation. 
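+	 * (i.e. highmem pfns at or above xen_start_info->nr_pages,
+	 * which the loop above deliberately skips).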
*/ + for (; pfn < highend_pfn; pfn++) { + ClearPageReserved(pfn_to_page(pfn)); + init_page_count(pfn_to_page(pfn)); + } + totalram_pages += totalhigh_pages; } #endif /* CONFIG_FLATMEM */ @@ -462,16 +469,22 @@ EXPORT_SYMBOL(__supported_pte_mask); * on Enable * off Disable */ -void __init noexec_setup(const char *str) +static int __init noexec_setup(char *str) { - if (!strncmp(str, "on",2) && cpu_has_nx) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } else if (!strncmp(str,"off",3)) { + if (!str || !strcmp(str, "on")) { + if (cpu_has_nx) { + __supported_pte_mask |= _PAGE_NX; + disable_nx = 0; + } + } else if (!strcmp(str,"off")) { disable_nx = 1; __supported_pte_mask &= ~_PAGE_NX; - } + } else + return -EINVAL; + + return 0; } +early_param("noexec", noexec_setup); int nx_enabled = 0; #ifdef CONFIG_X86_PAE @@ -514,6 +527,7 @@ int __init set_kernel_exec(unsigned long pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); else pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); + pte_update_defer(&init_mm, vaddr, pte); __flush_tlb_all(); out: return ret; @@ -596,18 +610,6 @@ static void __init test_wp_bit(void) } } -static void __init set_max_mapnr_init(void) -{ -#ifdef CONFIG_HIGHMEM - num_physpages = highend_pfn; -#else - num_physpages = max_low_pfn; -#endif -#ifdef CONFIG_FLATMEM - max_mapnr = num_physpages; -#endif -} - static struct kcore_list kcore_mem, kcore_vmalloc; void __init mem_init(void) @@ -623,8 +625,7 @@ void __init mem_init(void) #endif #ifdef CONFIG_FLATMEM - if (!mem_map) - BUG(); + BUG_ON(!mem_map); #endif bad_ppro = ppro_with_ram_bug(); @@ -639,24 +640,12 @@ void __init mem_init(void) } #endif - set_max_mapnr_init(); - -#ifdef CONFIG_HIGHMEM - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; -#endif - printk("vmalloc area: %lx-%lx, maxmem %lx\n", - VMALLOC_START,VMALLOC_END,MAXMEM); - BUG_ON(VMALLOC_START > VMALLOC_END); - /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); - /* XEN: init and count low-mem pages outside initial allocation. */ + /* XEN: init low-mem pages outside initial allocation. 
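+	 * (pfns from xen_start_info->nr_pages up to max_low_pfn, which
+	 * were not part of the initial domain allocation).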
*/ for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) { ClearPageReserved(pfn_to_page(pfn)); init_page_count(pfn_to_page(pfn)); - totalram_pages++; } reservedpages = 0; @@ -687,6 +676,48 @@ void __init mem_init(void) (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) ); +#if 1 /* double-sanity-check paranoia */ + printk("virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#ifdef CONFIG_HIGHMEM + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#endif + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, + +#ifdef CONFIG_HIGHMEM + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, +#endif + + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, + + (unsigned long)__va(0), (unsigned long)high_memory, + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, + + (unsigned long)&__init_begin, (unsigned long)&__init_end, + ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10, + + (unsigned long)&_etext, (unsigned long)&_edata, + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, + + (unsigned long)&_text, (unsigned long)&_etext, + ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + +#ifdef CONFIG_HIGHMEM + BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif + BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON((unsigned long)high_memory > VMALLOC_START); +#endif /* double-sanity-check paranoia */ + #ifdef CONFIG_X86_PAE if (!cpu_has_pae) panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); @@ -717,7 +748,7 @@ void __init mem_init(void) int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdata = &contig_page_data; - struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; + struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; --- head-2011-03-11.orig/arch/x86/mm/ioremap-xen.c 2011-02-07 15:31:26.000000000 +0100 +++ head-2011-03-11/arch/x86/mm/ioremap-xen.c 2011-02-07 15:37:37.000000000 +0100 @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include @@ -114,7 +114,7 @@ int direct_remap_pfn_range(struct vm_are if (domid == DOMID_SELF) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED; + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; vma->vm_mm->context.has_foreign_mappings = 1; @@ -184,6 +184,7 @@ void __iomem * __ioremap(unsigned long p void __iomem * addr; struct vm_struct * area; unsigned long offset, last_addr; + pgprot_t prot; domid_t domid = DOMID_IO; /* Don't allow wraparound or zero size */ @@ -215,6 +216,8 @@ void __iomem * __ioremap(unsigned long p domid = DOMID_SELF; } + prot = __pgprot(_KERNPG_TABLE | flags); + /* * Mappings have to be page-aligned */ @@ -230,10 +233,9 @@ void __iomem * __ioremap(unsigned long p return NULL; area->phys_addr = phys_addr; addr = (void __iomem *) area->addr; - flags |= _KERNPG_TABLE; if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr, phys_addr>>PAGE_SHIFT, - size, __pgprot(flags), domid)) { + size, prot, domid)) { vunmap((void __force *) addr); return NULL; } --- head-2011-03-11.orig/arch/x86/mm/pgtable_32-xen.c 2010-09-23 15:39:04.000000000 +0200 +++ head-2011-03-11/arch/x86/mm/pgtable_32-xen.c 2011-01-31 
17:29:16.000000000 +0100 @@ -68,7 +68,9 @@ void show_mem(void) printk(KERN_INFO "%lu pages writeback\n", global_page_state(NR_WRITEBACK)); printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); - printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB)); + printk(KERN_INFO "%lu pages slab\n", + global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE)); printk(KERN_INFO "%lu pages pagetables\n", global_page_state(NR_PAGETABLE)); } @@ -108,18 +110,11 @@ void set_pmd_pfn(unsigned long vaddr, un __flush_tlb_one(vaddr); } -static int nr_fixmaps = 0; +static int fixmaps; unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START; -unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE); +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE); EXPORT_SYMBOL(__FIXADDR_TOP); -void __init set_fixaddr_top(unsigned long top) -{ - BUG_ON(nr_fixmaps > 0); - hypervisor_virt_start = top; - __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE; -} - void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags) { unsigned long address = __fix_to_virt(idx); @@ -141,7 +136,21 @@ void __set_fixmap (enum fixed_addresses if (HYPERVISOR_update_va_mapping(address, pte, UVMF_INVLPG|UVMF_ALL)) BUG(); - nr_fixmaps++; + fixmaps++; +} + +/** + * reserve_top_address - reserves a hole in the top of kernel address space + * @reserve - size of hole to reserve + * + * Can be used to relocate the fixmap area and poke a hole in the top + * of kernel address space to make room for a hypervisor. + */ +void __init reserve_top_address(unsigned long reserve) +{ + BUG_ON(fixmaps > 0); + __FIXADDR_TOP = -reserve - PAGE_SIZE; + __VMALLOC_RESERVE += reserve; } pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) --- head-2011-03-11.orig/arch/x86/pci/irq-xen.c 2008-03-06 08:54:32.000000000 +0100 +++ head-2011-03-11/arch/x86/pci/irq-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -991,10 +991,6 @@ static void __init pcibios_fixup_irqs(vo pci_name(bridge), 'A' + pin, irq); } if (irq >= 0) { - if (use_pci_vector() && - !platform_legacy_irq(irq)) - irq = IO_APIC_VECTOR(irq); - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", pci_name(dev), 'A' + pin, irq); dev->irq = irq; @@ -1155,10 +1151,6 @@ static int pirq_enable_irq(struct pci_de } dev = temp_dev; if (irq >= 0) { -#ifdef CONFIG_PCI_MSI - if (!platform_legacy_irq(irq)) - irq = IO_APIC_VECTOR(irq); -#endif printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", pci_name(dev), 'A' + pin, irq); dev->irq = irq; @@ -1179,33 +1171,3 @@ static int pirq_enable_irq(struct pci_de } return 0; } - -int pci_vector_resources(int last, int nr_released) -{ - int count = nr_released; - - int next = last; - int offset = (last % 8); - - while (next < FIRST_SYSTEM_VECTOR) { - next += 8; -#ifdef CONFIG_X86_64 - if (next == IA32_SYSCALL_VECTOR) - continue; -#else - if (next == SYSCALL_VECTOR) - continue; -#endif - count++; - if (next >= FIRST_SYSTEM_VECTOR) { - if (offset%8) { - next = FIRST_DEVICE_VECTOR + offset; - offset++; - continue; - } - count--; - } - } - - return count; -} --- head-2011-03-11.orig/arch/x86/ia32/ia32entry-xen.S 2008-04-02 12:34:02.000000000 +0200 +++ head-2011-03-11/arch/x86/ia32/ia32entry-xen.S 2011-01-31 17:29:16.000000000 +0100 @@ -83,6 +83,7 @@ */ ENTRY(ia32_sysenter_target) CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-RIP+16 /*CFI_REL_OFFSET ss,SS-RIP+16*/ CFI_REL_OFFSET rsp,RSP-RIP+16 @@ -164,6 +165,7 @@ 
ENDPROC(ia32_sysenter_target) */ ENTRY(ia32_cstar_target) CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-RIP+16 /*CFI_REL_OFFSET ss,SS-RIP+16*/ CFI_REL_OFFSET rsp,RSP-RIP+16 @@ -243,6 +245,7 @@ ia32_badarg: ENTRY(ia32_syscall) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-RIP+16 /*CFI_REL_OFFSET ss,SS-RIP+16*/ CFI_REL_OFFSET rsp,RSP-RIP+16 @@ -320,6 +323,7 @@ ENTRY(ia32_ptregs_common) popq %r11 CFI_ENDPROC CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-ARGOFFSET CFI_REL_OFFSET rax,RAX-ARGOFFSET CFI_REL_OFFSET rcx,RCX-ARGOFFSET @@ -653,8 +657,8 @@ ia32_sys_call_table: .quad sys_readlinkat /* 305 */ .quad sys_fchmodat .quad sys_faccessat - .quad quiet_ni_syscall /* pselect6 for now */ - .quad quiet_ni_syscall /* ppoll for now */ + .quad compat_sys_pselect6 + .quad compat_sys_ppoll .quad sys_unshare /* 310 */ .quad compat_sys_set_robust_list .quad compat_sys_get_robust_list @@ -663,4 +667,5 @@ ia32_sys_call_table: .quad sys_tee .quad compat_sys_vmsplice .quad compat_sys_move_pages + .quad sys_getcpu ia32_syscall_end: --- head-2011-03-11.orig/arch/x86/kernel/Makefile 2011-01-31 17:02:29.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/Makefile 2011-01-31 17:29:16.000000000 +0100 @@ -115,7 +115,7 @@ obj-$(CONFIG_X86_XEN) += fixup.o ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_X86_XEN_GENAPIC) += genapic_xen_64.o + obj-$(CONFIG_X86_XEN_GENAPIC) += genapic_64.o genapic_xen_64.o obj-$(CONFIG_AUDIT) += audit_64.o obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o @@ -129,5 +129,7 @@ ifeq ($(CONFIG_X86_64),y) pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o endif -disabled-obj-$(CONFIG_XEN) := i8253.o i8259_$(BITS).o reboot.o smpboot_$(BITS).o tsc_$(BITS).o +disabled-obj-$(CONFIG_XEN) := early-quirks.o i8253.o i8259_$(BITS).o reboot.o \ + smpboot_$(BITS).o tsc_$(BITS).o +disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o %/head_$(BITS).o %/head_$(BITS).s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) := --- head-2011-03-11.orig/arch/x86/kernel/e820_64-xen.c 2009-12-04 08:45:56.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/e820_64-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,11 @@ #include #include +struct e820map e820 __initdata; +#ifdef CONFIG_XEN +struct e820map machine_e820 __initdata; +#endif + /* * PFN of last memory page. */ @@ -43,14 +49,10 @@ unsigned long end_pfn_map; /* * Last pfn which the user wants to use. 
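
[Editorial sketch — not part of the patch.] The ia32 syscall-table hunk above swaps quiet_ni_syscall placeholders for real compat entries and appends sys_getcpu. The table is just an array of function pointers indexed by the syscall number, guarded by a range check at the entry point. A toy model in plain C — names and numbers are invented:

    #include <stdio.h>

    #define ENOSYS 38
    #define NR_MAX 4

    static long sys_getcpu(void) { return 0; }
    static long sys_ni(void)     { return -ENOSYS; }   /* "not implemented" stub */

    static long (*call_table[NR_MAX])(void) = {
        [0] = sys_ni,
        [1] = sys_getcpu,   /* a newly wired-up slot, like sys_getcpu above */
        [2] = sys_ni,
        [3] = sys_ni,
    };

    static long do_syscall(unsigned nr)
    {
        if (nr >= NR_MAX)   /* the "cmpq $__NR_syscall_max" bounds check */
            return -ENOSYS;
        return call_table[nr]();
    }

    int main(void)
    {
        printf("%ld %ld\n", do_syscall(1), do_syscall(99));
        return 0;
    }
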
*/ -unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; +static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; extern struct resource code_resource, data_resource; -#ifdef CONFIG_XEN -extern struct e820map machine_e820; -#endif - /* Check for some hardcoded bad areas that early boot is not allowed to touch */ static inline int bad_addr(unsigned long *addrp, unsigned long size) { @@ -59,13 +61,13 @@ static inline int bad_addr(unsigned long #ifndef CONFIG_XEN /* various gunk below that needed for SMP startup */ if (addr < 0x8000) { - *addrp = 0x8000; + *addrp = PAGE_ALIGN(0x8000); return 1; } /* direct mapping tables of the kernel */ if (last >= table_start<= INITRD_START && addr < INITRD_START+INITRD_SIZE) { - *addrp = INITRD_START + INITRD_SIZE; + *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE); return 1; } #endif - /* kernel code + 640k memory hole (later should not be needed, but - be paranoid for now) */ - if (last >= 640*1024 && addr < 1024*1024) { - *addrp = 1024*1024; - return 1; - } - if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) { - *addrp = __pa_symbol(&_end); + /* kernel code */ + if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { + *addrp = PAGE_ALIGN(__pa_symbol(&_end)); return 1; } if (last >= ebda_addr && addr < ebda_addr + ebda_size) { - *addrp = ebda_addr + ebda_size; + *addrp = PAGE_ALIGN(ebda_addr + ebda_size); return 1; } @@ -186,7 +183,7 @@ unsigned long __init find_e820_area(unsi continue; while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) ; - last = addr + size; + last = PAGE_ALIGN(addr) + size; if (last > ei->addr + ei->size) continue; if (last > end) @@ -196,59 +193,14 @@ unsigned long __init find_e820_area(unsi return -1UL; } -/* - * Free bootmem based on the e820 table for a node. - */ -void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long last, addr; - - if (ei->type != E820_RAM || - ei->addr+ei->size <= start || - ei->addr >= end) - continue; - - addr = round_up(ei->addr, PAGE_SIZE); - if (addr < start) - addr = start; - - last = round_down(ei->addr + ei->size, PAGE_SIZE); - if (last >= end) - last = end; - - if (last > addr && last-addr >= PAGE_SIZE) - free_bootmem_node(pgdat, addr, last-addr); - } -} - /* * Find the highest page frame number we have available */ unsigned long __init e820_end_of_ram(void) { - int i; unsigned long end_pfn = 0; + end_pfn = find_max_pfn_with_active_regions(); - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long start, end; - - start = round_up(ei->addr, PAGE_SIZE); - end = round_down(ei->addr + ei->size, PAGE_SIZE); - if (start >= end) - continue; - if (ei->type == E820_RAM) { - if (end > end_pfn<>PAGE_SHIFT; - } else { - if (end > end_pfn_map<>PAGE_SHIFT; - } - } - if (end_pfn > end_pfn_map) end_pfn_map = end_pfn; if (end_pfn_map > MAXMEM>>PAGE_SHIFT) @@ -258,43 +210,10 @@ unsigned long __init e820_end_of_ram(voi if (end_pfn > end_pfn_map) end_pfn = end_pfn_map; + printk("end_pfn_map = %lu\n", end_pfn_map); return end_pfn; } -/* - * Compute how much memory is missing in a range. - * Unlike the other functions in this file the arguments are in page numbers. 
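
[Editorial sketch — not part of the patch.] Several bad_addr() exits above now return PAGE_ALIGN()ed addresses, so find_e820_area() always resumes its search on a page boundary instead of at the raw end of a reserved object. The round-up idiom in isolation:

    #include <stdio.h>

    #define PAGE_SIZE     4096UL
    #define PAGE_MASK     (~(PAGE_SIZE - 1))
    #define PAGE_ALIGN(a) (((a) + PAGE_SIZE - 1) & PAGE_MASK)

    int main(void)
    {
        unsigned long initrd_end = 0x00ffe123UL;  /* hypothetical INITRD_START + INITRD_SIZE */

        /* Before the patch the next candidate started at the raw end address;
         * afterwards it starts at the next page boundary. */
        printf("raw: 0x%08lx  aligned: 0x%08lx\n", initrd_end, PAGE_ALIGN(initrd_end));
        return 0;
    }
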
- */ -unsigned long __init -e820_hole_size(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long ram = 0; - unsigned long start = start_pfn << PAGE_SHIFT; - unsigned long end = end_pfn << PAGE_SHIFT; - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long last, addr; - - if (ei->type != E820_RAM || - ei->addr+ei->size <= start || - ei->addr >= end) - continue; - - addr = round_up(ei->addr, PAGE_SIZE); - if (addr < start) - addr = start; - - last = round_down(ei->addr + ei->size, PAGE_SIZE); - if (last >= end) - last = end; - - if (last > addr) - ram += last - addr; - } - return ((end - start) - ram) >> PAGE_SHIFT; -} - /* * Mark e820 reserved areas as busy for the resource manager. */ @@ -335,6 +254,109 @@ void __init e820_reserve_resources(struc } } +#ifndef CONFIG_XEN +/* Mark pages corresponding to given address range as nosave */ +static void __init +e820_mark_nosave_range(unsigned long start, unsigned long end) +{ + unsigned long pfn, max_pfn; + + if (start >= end) + return; + + printk("Nosave address range: %016lx - %016lx\n", start, end); + max_pfn = end >> PAGE_SHIFT; + for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++) + if (pfn_valid(pfn)) + SetPageNosave(pfn_to_page(pfn)); +} + +/* + * Find the ranges of physical addresses that do not correspond to + * e820 RAM areas and mark the corresponding pages as nosave for software + * suspend and suspend to RAM. + * + * This function requires the e820 map to be sorted and without any + * overlapping entries and assumes the first e820 area to be RAM. + */ +void __init e820_mark_nosave_regions(void) +{ + int i; + unsigned long paddr; + + paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE); + for (i = 1; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (paddr < ei->addr) + e820_mark_nosave_range(paddr, + round_up(ei->addr, PAGE_SIZE)); + + paddr = round_down(ei->addr + ei->size, PAGE_SIZE); + if (ei->type != E820_RAM) + e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE), + paddr); + + if (paddr >= (end_pfn << PAGE_SHIFT)) + break; + } +} +#endif + +/* Walk the e820 map and register active regions within a node */ +void __init +e820_register_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + int i; + unsigned long ei_startpfn, ei_endpfn; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; + ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) + >> PAGE_SHIFT; + + /* Skip map entries smaller than a page */ + if (ei_startpfn >= ei_endpfn) + continue; + + /* Check if end_pfn_map should be updated */ + if (ei->type != E820_RAM && ei_endpfn > end_pfn_map) + end_pfn_map = ei_endpfn; + + /* Skip if map is outside the node */ + if (ei->type != E820_RAM || + ei_endpfn <= start_pfn || + ei_startpfn >= end_pfn) + continue; + + /* Check for overlaps */ + if (ei_startpfn < start_pfn) + ei_startpfn = start_pfn; + if (ei_endpfn > end_pfn) + ei_endpfn = end_pfn; + + /* Obey end_user_pfn to save on memmap */ + if (ei_startpfn >= end_user_pfn) + continue; + if (ei_endpfn > end_user_pfn) + ei_endpfn = end_user_pfn; + +#ifdef CONFIG_XEN + if (ei_startpfn >= xen_start_info->nr_pages) + continue; + if (ei_endpfn > xen_start_info->nr_pages) + ei_endpfn = xen_start_info->nr_pages; +#endif + + add_active_range(nid, ei_startpfn, ei_endpfn); + } +#ifdef CONFIG_XEN + BUG_ON(nid); + add_active_range(nid, end_pfn, end_pfn); +#endif +} + /* * Add a memory 
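
[Editorial sketch — not part of the patch.] e820_mark_nosave_regions() above walks a sorted, non-overlapping map and flags every hole between consecutive entries. The core gap-walk, stripped of the page rounding and the SetPageNosave() calls, looks roughly like this (map contents invented):

    #include <stdio.h>

    struct range { unsigned long addr, size; };

    int main(void)
    {
        /* Sorted, non-overlapping "RAM" entries; the first is assumed RAM,
         * as the function's comment requires. */
        struct range map[] = {
            { 0x0000000UL, 0x09f000UL },
            { 0x0100000UL, 0x700000UL },
        };
        unsigned long paddr = map[0].addr + map[0].size;

        for (unsigned i = 1; i < sizeof(map) / sizeof(map[0]); i++) {
            if (paddr < map[i].addr)
                printf("nosave hole: %08lx - %08lx\n", paddr, map[i].addr);
            paddr = map[i].addr + map[i].size;
        }
        return 0;
    }
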
region to the kernel e820 map. */ @@ -555,13 +577,6 @@ static int __init sanitize_e820_map(stru * If we're lucky and live on a modern system, the setup code * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) */ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) { @@ -583,27 +598,6 @@ static int __init copy_e820_map(struct e if (start > end) return -1; -#ifndef CONFIG_XEN - /* - * Some BIOSes claim RAM in the 640k - 1M region. - * Not right. Fix it up. - * - * This should be removed on Hammer which is supposed to not - * have non e820 covered ISA mappings there, but I had some strange - * problems so it stays for now. -AK - */ - if (type == E820_RAM) { - if (start < 0x100000ULL && end > 0xA0000ULL) { - if (start < 0xA0000ULL) - add_memory_region(start, 0xA0000ULL-start, type); - if (end <= 0x100000ULL) - continue; - start = 0x100000ULL; - size = end - start; - } - } -#endif - add_memory_region(start, size, type); } while (biosmap++,--nr_map); @@ -624,11 +618,15 @@ static int __init copy_e820_map(struct e return 0; } +void early_panic(char *msg) +{ + early_printk(msg); + panic(msg); +} + #ifndef CONFIG_XEN void __init setup_memory_region(void) { - char *who = "BIOS-e820"; - /* * Try to copy the BIOS-supplied E820-map. * @@ -636,24 +634,10 @@ void __init setup_memory_region(void) * the next section from 1mb->appropriate_mem_k */ sanitize_e820_map(E820_MAP, &E820_MAP_NR); - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { - unsigned long mem_size; - - /* compare results from other methods and take the greater */ - if (ALT_MEM_K < EXT_MEM_K) { - mem_size = EXT_MEM_K; - who = "BIOS-88"; - } else { - mem_size = ALT_MEM_K; - who = "BIOS-e801"; - } - - e820.nr_map = 0; - add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); - } + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) + early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(who); + e820_print_map("BIOS-e820"); } #else /* CONFIG_XEN */ @@ -685,20 +669,23 @@ void __init setup_memory_region(void) sanitize_e820_map(map, (char *)&memmap.nr_entries); - BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0); + if (copy_e820_map(map, (char)memmap.nr_entries) < 0) + early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map("Xen"); } #endif -void __init parse_memopt(char *p, char **from) -{ +static int __init parse_memopt(char *p) +{ int i; unsigned long current_end; unsigned long end; - end_user_pfn = memparse(p, from); + if (!p) + return -EINVAL; + end_user_pfn = memparse(p, &p); end_user_pfn >>= PAGE_SHIFT; end = end_user_pfn<> PAGE_SHIFT); } - p = *from; + return *p == '\0' ? 
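
[Editorial sketch — not part of the patch.] parse_memopt() above becomes an early_param handler: it memparse()s the argument and rejects trailing junk via `return *p == '\0' ? 0 : -EINVAL`. A userspace stand-in for that parse — this is not the kernel's memparse(), just the same K/M/G suffix convention:

    #include <stdio.h>
    #include <stdlib.h>

    static unsigned long long memparse_ish(const char *s, char **end)
    {
        unsigned long long v = strtoull(s, end, 0);
        switch (**end) {
        case 'G': case 'g': v <<= 10; /* fall through */
        case 'M': case 'm': v <<= 10; /* fall through */
        case 'K': case 'k': v <<= 10; (*end)++; break;
        }
        return v;
    }

    int main(void)
    {
        char *end;
        unsigned long long bytes = memparse_ish("512M", &end);
        int rc = (*end == '\0') ? 0 : -22;   /* -EINVAL on trailing junk */

        printf("%llu bytes, rc=%d, end_user_pfn=%llu\n", bytes, rc, bytes >> 12);
        return 0;
    }
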
0 : -EINVAL; +} +early_param("memmap", parse_memmap_opt); + +void finish_e820_parsing(void) +{ + if (userdef) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + e820_print_map("user"); + } } unsigned long pci_mem_start = 0xaeedbabe; --- head-2011-03-11.orig/arch/x86/kernel/early_printk-xen.c 2007-06-12 13:13:01.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/early_printk-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -244,20 +244,16 @@ void early_printk(const char *fmt, ...) static int __initdata keep_early; -int __init setup_early_printk(char *opt) +static int __init setup_early_printk(char *buf) { - char *space; - char buf[256]; + if (!buf) + return 0; if (early_console_initialized) - return 1; - - strlcpy(buf,opt,sizeof(buf)); - space = strchr(buf, ' '); - if (space) - *space = 0; + return 0; + early_console_initialized = 1; - if (strstr(buf,"keep")) + if (strstr(buf, "keep")) keep_early = 1; if (!strncmp(buf, "serial", 6)) { @@ -281,11 +277,12 @@ int __init setup_early_printk(char *opt) early_console = &simnow_console; keep_early = 1; } - early_console_initialized = 1; register_console(early_console); return 0; } +early_param("earlyprintk", setup_early_printk); + void __init disable_early_printk(void) { if (!early_console_initialized || !early_console) @@ -299,4 +296,3 @@ void __init disable_early_printk(void) } } -__setup("earlyprintk=", setup_early_printk); --- head-2011-03-11.orig/arch/x86/kernel/entry_64-xen.S 2009-06-23 09:28:21.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/entry_64-xen.S 2011-01-31 17:29:16.000000000 +0100 @@ -4,9 +4,6 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * Copyright (C) 2000 Pavel Machek - * - * $Id$ - * * Jun Nakajima * Asit Mallick * Modified for Xen @@ -26,15 +23,25 @@ * at the top of the kernel process stack. * - partial stack frame: partially saved registers upto R11. * - full stack frame: Like partial stack frame, but all register saved. - * - * TODO: - * - schedule it carefully for the final hardware. + * + * Some macro usage: + * - CFI macros are used to generate dwarf2 unwind information for better + * backtraces. They don't change any code. + * - SAVE_ALL/RESTORE_ALL - Save/restore all registers + * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. + * There are unfortunately lots of special cases where some registers + * not touched. The macro is a big mess that should be cleaned up. + * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. + * Gives a full stack frame. + * - ENTRY/END Define functions in the symbol table. + * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack + * frame that is otherwise undefined after a SYSCALL + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. + * - errorentry/paranoidentry/zeroentry - Define exception entry points. 
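
[Editorial sketch — not part of the patch.] The rewritten setup_early_printk() above no longer copies the option string or splits it at spaces; as an early_param handler it just tests prefixes and substrings in place. The matching logic, in isolation:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *buf = "serial,ttyS0,115200 keep";   /* example earlyprintk= value */
        int keep_early = strstr(buf, "keep") != NULL;
        int serial     = strncmp(buf, "serial", 6) == 0;

        printf("serial=%d keep=%d\n", serial, keep_early);
        return 0;
    }
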
*/ -#define ASSEMBLY 1 #include #include -#include #include #include #include @@ -117,6 +124,7 @@ NMI_MASK = 0x80000000 .macro CFI_DEFAULT_STACK start=1,adj=0 .if \start CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8 - \adj*ARGOFFSET .else CFI_DEF_CFA_OFFSET SS+8 - \adj*ARGOFFSET @@ -207,6 +215,7 @@ END(ret_from_fork) */ .macro _frame ref CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-\ref /*CFI_REL_OFFSET ss,SS-\ref*/ CFI_REL_OFFSET rsp,RSP-\ref @@ -334,6 +343,8 @@ tracesys: LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST cmpq $__NR_syscall_max,%rax + movq $-ENOSYS,%rcx + cmova %rcx,%rax ja 1f movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) @@ -349,6 +360,7 @@ END(system_call) */ ENTRY(int_ret_from_sys_call) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-ARGOFFSET /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/ CFI_REL_OFFSET rsp,RSP-ARGOFFSET @@ -583,8 +595,7 @@ retint_signal: #ifdef CONFIG_PREEMPT /* Returning to kernel space. Check if we need preemption */ /* rcx: threadinfo. interrupts off. */ - .p2align -retint_kernel: +ENTRY(retint_kernel) cmpl $0,threadinfo_preempt_count(%rcx) jnz retint_restore_args bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) @@ -644,7 +655,6 @@ ENTRY(call_function_interrupt) END(call_function_interrupt) #endif -#ifdef CONFIG_X86_LOCAL_APIC ENTRY(apic_timer_interrupt) apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt END(apic_timer_interrupt) @@ -656,7 +666,6 @@ END(error_interrupt) ENTRY(spurious_interrupt) apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt END(spurious_interrupt) -#endif #endif /* !CONFIG_XEN */ /* @@ -755,7 +764,9 @@ paranoid_exit\trace: testl $3,CS(%rsp) jnz paranoid_userspace\trace paranoid_swapgs\trace: + .if \trace TRACE_IRQS_IRETQ 0 + .endif swapgs paranoid_restore\trace: RESTORE_ALL 8 @@ -802,7 +813,7 @@ paranoid_schedule\trace: * Exception entry point. This expects an error code/orig_rax on the stack * and the exception handler in %rax. 
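
[Editorial sketch — not part of the patch.] In the tracesys hunk above, an out-of-range syscall number is replaced with -ENOSYS via `cmova` before the jump, so the failure path needs no separate register fixup. The same clamp modeled in C, with an `if` standing in for the conditional move; the max constant is invented:

    #include <stdio.h>

    #define ENOSYS 38
    #define NR_SYSCALL_MAX 313          /* illustrative, not the real constant */

    static long classify(unsigned long nr)
    {
        long rax = (long)nr;
        if (nr > NR_SYSCALL_MAX)        /* cmova: conditional move, no extra branch */
            rax = -ENOSYS;
        return rax;
    }

    int main(void)
    {
        printf("%ld %ld\n", classify(1), classify(5000));
        return 0;
    }
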
*/ -ENTRY(error_entry) +KPROBE_ENTRY(error_entry) _frame RDI CFI_REL_OFFSET rax,0 /* rdi slot contains rax, oldrax contains error code */ @@ -896,7 +907,7 @@ error_kernelspace: jmp error_sti #endif CFI_ENDPROC -END(error_entry) +KPROBE_END(error_entry) ENTRY(hypervisor_callback) zeroentry do_hypervisor_callback @@ -936,26 +947,6 @@ ENTRY(do_hypervisor_callback) # do_hyp CFI_ENDPROC END(do_hypervisor_callback) -#ifdef CONFIG_X86_LOCAL_APIC -KPROBE_ENTRY(nmi) - zeroentry do_nmi_callback -ENTRY(do_nmi_callback) - CFI_STARTPROC - addq $8, %rsp - CFI_ENDPROC - CFI_DEFAULT_STACK - call do_nmi - orl $NMI_MASK,EFLAGS(%rsp) - RESTORE_REST - XEN_BLOCK_EVENTS(%rsi) - TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - jmp retint_restore_args - CFI_ENDPROC - .previous .text -END(nmi) -#endif - ALIGN restore_all_enable_events: CFI_DEFAULT_STACK adj=1 @@ -1121,7 +1112,7 @@ ENDPROC(child_rip) * do_sys_execve asm fallback arguments: * rdi: name, rsi: argv, rdx: envp, fake frame on the stack */ -ENTRY(execve) +ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 SAVE_ALL @@ -1135,12 +1126,11 @@ ENTRY(execve) UNFAKE_STACK_FRAME ret CFI_ENDPROC -ENDPROC(execve) +ENDPROC(kernel_execve) KPROBE_ENTRY(page_fault) errorentry do_page_fault -END(page_fault) - .previous .text +KPROBE_END(page_fault) ENTRY(coprocessor_error) zeroentry do_coprocessor_error @@ -1162,25 +1152,25 @@ KPROBE_ENTRY(debug) zeroentry do_debug /* paranoidexit CFI_ENDPROC */ -END(debug) - .previous .text +KPROBE_END(debug) -#if 0 - /* runs on exception stack */ KPROBE_ENTRY(nmi) - INTR_FRAME - pushq $-1 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_nmi, 0, 0 -#ifdef CONFIG_TRACE_IRQFLAGS - paranoidexit 0 -#else - jmp paranoid_exit1 - CFI_ENDPROC -#endif -END(nmi) - .previous .text -#endif + zeroentry do_nmi_callback +KPROBE_END(nmi) +do_nmi_callback: + CFI_STARTPROC + addq $8, %rsp + CFI_ENDPROC + CFI_DEFAULT_STACK + call do_nmi + orl $NMI_MASK,EFLAGS(%rsp) + RESTORE_REST + XEN_BLOCK_EVENTS(%rsi) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_restore_args + CFI_ENDPROC +END(do_nmi_callback) KPROBE_ENTRY(int3) /* INTR_FRAME @@ -1189,8 +1179,7 @@ KPROBE_ENTRY(int3) zeroentry do_int3 /* jmp paranoid_exit1 CFI_ENDPROC */ -END(int3) - .previous .text +KPROBE_END(int3) ENTRY(overflow) zeroentry do_overflow @@ -1241,8 +1230,7 @@ END(stack_segment) KPROBE_ENTRY(general_protection) errorentry do_general_protection -END(general_protection) - .previous .text +KPROBE_END(general_protection) ENTRY(alignment_check) errorentry do_alignment_check --- head-2011-03-11.orig/arch/x86/kernel/head_64-xen.S 2010-11-08 17:27:03.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/head_64-xen.S 2011-01-31 17:29:16.000000000 +0100 @@ -5,9 +5,6 @@ * Copyright (C) 2000 Pavel Machek * Copyright (C) 2000 Karsten Keil * Copyright (C) 2001,2002 Andi Kleen - * - * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $ - * * Jun Nakajima * Modified for Xen */ @@ -146,7 +143,7 @@ ENTRY(cpu_gdt_table) .quad 0,0 /* TSS */ .quad 0,0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ - .quad 0 /* unused */ + .quad 0x0000f40000000000 /* node/CPU stored in limit */ gdt_end: /* asm/segment.h:GDT_ENTRIES must match this */ /* This should be a multiple of the cache line size */ --- head-2011-03-11.orig/arch/x86/kernel/head64-xen.c 2007-06-12 13:13:01.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/head64-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -7,6 +7,9 @@ * Modified for Xen. */ +/* PDA is not ready to be used until the end of x86_64_start_kernel(). 
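
[Editorial sketch — not part of the patch.] The head_64-xen.S hunk above turns an unused GDT slot into a "node/CPU stored in limit" descriptor, so userspace can recover both IDs with a single `lsl`. The packing below follows the historical cpu = p & 0xfff, node = p >> 12 convention — treat that field split as an assumption, not something this patch states:

    #include <stdio.h>

    int main(void)
    {
        unsigned long cpu = 3, node = 1;
        unsigned long limit = (node << 12) | cpu;  /* value stored in the descriptor limit */

        printf("cpu=%lu node=%lu\n", limit & 0xfff, limit >> 12);
        return 0;
    }
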
*/ +#define arch_use_lazy_mmu_mode() false + #include #include #include @@ -54,11 +57,9 @@ static void __init copy_bootdata(char *r new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); if (!new_data) { if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { - printk("so old bootloader that it does not support commandline?!\n"); return; } new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; - printk("old bootloader convention, maybe loadlin?\n"); } command_line = (char *) ((u64)(new_data)); memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); @@ -70,25 +71,6 @@ static void __init copy_bootdata(char *r memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline); saved_command_line[max_cmdline-1] = '\0'; #endif - printk("Bootdata ok (command line is %s)\n", saved_command_line); -} - -static void __init setup_boot_cpu_data(void) -{ - unsigned int dummy, eax; - - /* get vendor info */ - cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level, - (unsigned int *)&boot_cpu_data.x86_vendor_id[0], - (unsigned int *)&boot_cpu_data.x86_vendor_id[8], - (unsigned int *)&boot_cpu_data.x86_vendor_id[4]); - - /* get cpu type */ - cpuid(1, &eax, &dummy, &dummy, - (unsigned int *) &boot_cpu_data.x86_capability); - boot_cpu_data.x86 = (eax >> 8) & 0xf; - boot_cpu_data.x86_model = (eax >> 4) & 0xf; - boot_cpu_data.x86_mask = eax & 0xf; } #include @@ -101,7 +83,6 @@ void __init x86_64_start_kernel(char * r { struct xen_machphys_mapping mapping; unsigned long machine_to_phys_nr_ents; - char *s; int i; setup_xen_features(); @@ -128,10 +109,7 @@ void __init x86_64_start_kernel(char * r asm volatile("lidt %0" :: "m" (idt_descr)); #endif - /* - * This must be called really, really early: - */ - lockdep_init(); + early_printk("Kernel alive\n"); for (i = 0; i < NR_CPUS; i++) cpu_pda(i) = &boot_cpu_pda[i]; @@ -141,22 +119,5 @@ void __init x86_64_start_kernel(char * r #ifdef CONFIG_SMP cpu_set(0, cpu_online_map); #endif - s = strstr(saved_command_line, "earlyprintk="); - if (s != NULL) - setup_early_printk(strchr(s, '=') + 1); -#ifdef CONFIG_NUMA - s = strstr(saved_command_line, "numa="); - if (s != NULL) - numa_setup(s+5); -#endif -#ifdef CONFIG_X86_IO_APIC - if (strstr(saved_command_line, "disableapic")) - disable_apic = 1; -#endif - /* You need early console to see that */ - if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) - panic("Kernel too big for kernel mapping\n"); - - setup_boot_cpu_data(); start_kernel(); } --- head-2011-03-11.orig/arch/x86/kernel/io_apic_64-xen.c 2009-03-18 10:39:31.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/io_apic_64-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -26,9 +26,12 @@ #include #include #include +#include #include #include #include +#include +#include #ifdef CONFIG_ACPI #include #endif @@ -41,6 +44,10 @@ #include #include #include +#include +#include + +static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result); #define __apicdebuginit __init @@ -48,17 +55,30 @@ int sis_apic_bug; /* not actually suppor static int no_timer_check; -int disable_timer_pin_1 __initdata; +static int disable_timer_pin_1 __initdata; -#ifndef CONFIG_XEN -int timer_over_8254 __initdata = 0; +#ifdef CONFIG_XEN +#include +#include +#include + +/* Fake i8259 */ +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) +#define disable_8259A_irq(_irq) ((void)0) +#define i8259A_irq_pending(_irq) (0) + +unsigned long io_apic_irqs; + +#define clear_IO_APIC() ((void)0) +#else +int timer_over_8254 __initdata = 1; /* Where if anywhere is the i8259 connect in external int mode */ static 
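
[Editorial sketch — not part of the patch.] The deleted setup_boot_cpu_data() above decoded CPUID leaf 1: family, model, and stepping live in fixed nibbles of EAX. The same decode as a standalone sketch — the sample EAX value is invented:

    #include <stdio.h>

    int main(void)
    {
        unsigned int eax = 0x000006f6;  /* hypothetical CPUID.1 EAX */

        printf("family=%u model=%u stepping=%u\n",
               (eax >> 8) & 0xf,        /* boot_cpu_data.x86 */
               (eax >> 4) & 0xf,        /* boot_cpu_data.x86_model */
               eax & 0xf);              /* boot_cpu_data.x86_mask */
        return 0;
    }
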
struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; #endif static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +DEFINE_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -83,29 +103,27 @@ static struct irq_pin_list { short apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; -#ifdef CONFIG_PCI_MSI -#define vector_to_irq(vector) \ - (platform_legacy_irq(vector) ? vector : vector_irq[vector]) -#else -#define vector_to_irq(vector) (vector) -#endif - -#ifdef CONFIG_XEN - -#include -#include -#include - -/* Fake i8259 */ -#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) -#define disable_8259A_irq(_irq) ((void)0) -#define i8259A_irq_pending(_irq) (0) +#ifndef CONFIG_XEN +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; +}; -unsigned long io_apic_irqs; +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); +} +#endif -static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { +#ifndef CONFIG_XEN + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +#else struct physdev_apic apic_op; int ret; @@ -115,31 +133,133 @@ static inline unsigned int xen_io_apic_r if (ret) return ret; return apic_op.value; +#endif } -static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) { +#ifndef CONFIG_XEN + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +#else struct physdev_apic apic_op; apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; apic_op.reg = reg; apic_op.value = value; WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); +#endif +} + +#ifndef CONFIG_XEN +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. + */ +static inline void io_apic_modify(unsigned int apic, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(value, &io_apic->data); } +#else +#define io_apic_modify io_apic_write +#endif -#define io_apic_read(a,r) xen_io_apic_read(a,r) -#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) +/* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ +static inline void io_apic_sync(unsigned int apic) +{ +#ifndef CONFIG_XEN + struct io_apic __iomem *io_apic = io_apic_base(apic); + readl(&io_apic->data); +#endif +} -#define clear_IO_APIC() ((void)0) +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; -#else +#ifndef CONFIG_XEN +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} +#endif + +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! 
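
[Editorial sketch — not part of the patch.] The new native io_apic_read()/io_apic_write() above use the IOAPIC's index/data window: write a register number into one MMIO word, then access the value through another. Simulated here with a plain array standing in for the chip's register file:

    #include <stdio.h>
    #include <stdint.h>

    struct io_apic_window { uint32_t index, unused[3], data; };

    static uint32_t regs[32];                  /* fake IOAPIC register file */

    static uint32_t win_read(struct io_apic_window *w, uint32_t reg)
    {
        w->index = reg;                        /* writel(reg, &io_apic->index) */
        w->data = regs[w->index];              /* hardware would latch this */
        return w->data;                        /* readl(&io_apic->data) */
    }

    int main(void)
    {
        struct io_apic_window w = { 0 };
        regs[1] = 0x00170011;                  /* pretend version register */
        printf("reg 1 = 0x%08x\n", (unsigned)win_read(&w, 1));
        return 0;
    }
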
If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. + */ +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + union entry_union eu; + eu.entry = e; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#ifndef CONFIG_XEN +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! + */ +static void ioapic_mask_entry(int apic, int pin) +{ + unsigned long flags; + union entry_union eu = { .entry.mask = 1 }; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} #ifdef CONFIG_SMP +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + int apic, pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + BUG_ON(irq >= NR_IRQS); + for (;;) { + unsigned int reg; + apic = entry->apic; + pin = entry->pin; + if (pin == -1) + break; + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~0x000000ff; + reg |= vector; + io_apic_modify(apic, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { unsigned long flags; unsigned int dest; cpumask_t tmp; + int vector; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -147,7 +267,11 @@ static void set_ioapic_affinity_irq(unsi cpus_and(mask, tmp, CPU_MASK_ALL); - dest = cpu_mask_to_apicid(mask); + vector = assign_irq_vector(irq, mask, &tmp); + if (vector < 0) + return; + + dest = cpu_mask_to_apicid(tmp); /* * Only the high 8 bits are valid. @@ -155,13 +279,12 @@ static void set_ioapic_affinity_irq(unsi dest = SET_APIC_LOGICAL_ID(dest); spin_lock_irqsave(&ioapic_lock, flags); - __DO_ACTION(1, = dest, ) - set_irq_info(irq, mask); + __target_IO_APIC_irq(irq, dest, vector); + set_native_irq_info(irq, mask); spin_unlock_irqrestore(&ioapic_lock, flags); } #endif - -#endif /* !CONFIG_XEN */ +#endif /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are @@ -241,24 +364,15 @@ static void unmask_IO_APIC_irq (unsigned static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; - unsigned long flags; /* Check delivery_mode to be sure we're not clearing an SMI pin */ - spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) return; /* * Disable it in the IO-APIC irq-routing table: */ - memset(&entry, 0, sizeof(entry)); - entry.mask = 1; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_mask_entry(apic, pin); } static void clear_IO_APIC (void) @@ -272,16 +386,6 @@ static void clear_IO_APIC (void) #endif /* !CONFIG_XEN */ -static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... 
NR_IRQ_VECTORS-1] = 0xFF }; - -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; int skip_ioapic_setup; int ioapic_force; @@ -290,18 +394,17 @@ int ioapic_force; static int __init disable_ioapic_setup(char *str) { skip_ioapic_setup = 1; - return 1; + return 0; } +early_param("noapic", disable_ioapic_setup); -static int __init enable_ioapic_setup(char *str) +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) { - ioapic_force = 1; - skip_ioapic_setup = 0; + disable_timer_pin_1 = 1; return 1; } - -__setup("noapic", disable_ioapic_setup); -__setup("apic", enable_ioapic_setup); +__setup("disable_timer_pin_1", disable_timer_pin_setup); #ifndef CONFIG_XEN static int __init setup_disable_8254_timer(char *s) @@ -319,137 +422,6 @@ __setup("disable_8254_timer", setup_disa __setup("enable_8254_timer", setup_enable_8254_timer); #endif /* !CONFIG_XEN */ -#include -#include -#include - - -#ifdef CONFIG_ACPI - -static int nvidia_hpet_detected __initdata; - -static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) -{ - nvidia_hpet_detected = 1; - return 0; -} -#endif - -/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC - off. Check for an Nvidia or VIA PCI bridge and turn it off. - Use pci direct infrastructure because this runs before the PCI subsystem. - - Can be overwritten with "apic" - - And another hack to disable the IOMMU on VIA chipsets. - - ... and others. Really should move this somewhere else. - - Kludge-O-Rama. */ -void __init check_ioapic(void) -{ - int num,slot,func; - /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u32 vendor; - u8 type; - class = read_pci_config(num,slot,func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) - continue; - - vendor = read_pci_config(num, slot, func, - PCI_VENDOR_ID); - vendor &= 0xffff; - switch (vendor) { - case PCI_VENDOR_ID_VIA: -#ifdef CONFIG_IOMMU - if ((end_pfn > MAX_DMA32_PFN || - force_iommu) && - !iommu_aperture_allowed) { - printk(KERN_INFO - "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n"); - iommu_aperture_disabled = 1; - } -#endif - return; - case PCI_VENDOR_ID_NVIDIA: -#ifdef CONFIG_ACPI - /* - * All timer overrides on Nvidia are - * wrong unless HPET is enabled. - */ - nvidia_hpet_detected = 0; - acpi_table_parse(ACPI_HPET, - nvidia_hpet_check); - if (nvidia_hpet_detected == 0) { - acpi_skip_timer_override = 1; - printk(KERN_INFO "Nvidia board " - "detected. Ignoring ACPI " - "timer override.\n"); - } -#endif - /* RED-PEN skip them on mptables too? */ - return; - case PCI_VENDOR_ID_ATI: - - /* This should be actually default, but - for 2.6.16 let's do it for ATI only where - it's really needed. */ -#ifndef CONFIG_XEN - if (timer_over_8254 == 1) { - timer_over_8254 = 0; - printk(KERN_INFO - "ATI board detected. Disabling timer routing over 8254.\n"); - } -#endif - return; - } - - - /* No multi-function device? 
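
[Editorial sketch — not part of the patch.] The deleted check_ioapic() above did "poor man's PCI discovery" with read_pci_config() before the PCI subsystem is up (its quirk logic moves to early-quirks.c, which the Makefile hunk earlier disables under Xen). Assuming the conventional config-mechanism-#1 address layout (enable bit | bus | device | function | register), the address such a read uses is formed like this:

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t pci_conf1_addr(unsigned bus, unsigned dev,
                                   unsigned fn, unsigned reg)
    {
        return 0x80000000u | (bus << 16) | (dev << 11) | (fn << 8) | (reg & 0xfc);
    }

    int main(void)
    {
        /* Address a read_pci_config(0, 3, 0, PCI_VENDOR_ID) would put on 0xCF8. */
        printf("0x%08x\n", (unsigned)pci_conf1_addr(0, 3, 0, 0x00));
        return 0;
    }
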
*/ - type = read_pci_config_byte(num,slot,func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } -} - -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); /* * Find the IRQ entry number of a certain pin. @@ -479,9 +451,7 @@ static int __init find_isa_irq_pin(int i for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -497,9 +467,7 @@ static int __init find_isa_irq_apic(int for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) break; @@ -540,7 +508,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, mp_irqs[i].mpc_dstapic == MP_APIC_ALL) break; - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + if (!test_bit(lbus, mp_bus_not_pci) && !mp_irqs[i].mpc_irqtype && (bus == lbus) && (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { @@ -563,27 +531,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, return best_guess; } -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) -#define default_EISA_polarity(idx) (0) - /* ISA interrupts are always polarity zero edge triggered, * when listed as conforming in the MP table. */ @@ -596,12 +543,6 @@ static int EISA_ELCR(unsigned int irq) #define default_PCI_trigger(idx) (1) #define default_PCI_polarity(idx) (1) -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) (0) - static int __init MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mpc_srcbus; @@ -613,38 +554,11 @@ static int __init MPBIOS_polarity(int id switch (mp_irqs[idx].mpc_irqflag & 3) { case 0: /* conforms, ie. 
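
[Editorial sketch — not part of the patch.] MPBIOS_polarity() above collapses a four-way bus-type switch into one test_bit() on mp_bus_not_pci: anything flagged "not PCI" gets the ISA default. A plain-C stand-in for the bitmap test, with the same defaults the patch uses (ISA conforming polarity is active-high, i.e. 0; PCI is active-low, i.e. 1):

    #include <stdio.h>

    static unsigned long mp_bus_not_pci;       /* bit N set => bus N is not PCI */

    static int default_polarity(int bus)
    {
        return (mp_bus_not_pci >> bus) & 1 ? 0 : 1;
    }

    int main(void)
    {
        mp_bus_not_pci |= 1UL << 0;            /* bus 0: ISA */
        printf("bus0=%d bus1=%d\n", default_polarity(0), default_polarity(1));
        return 0;
    }
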
bus-type dependent polarity */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - polarity = default_ISA_polarity(idx); - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - polarity = default_EISA_polarity(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - polarity = default_PCI_polarity(idx); - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - polarity = default_MCA_polarity(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); break; - } case 1: /* high active */ { polarity = 0; @@ -682,38 +596,11 @@ static int MPBIOS_trigger(int idx) switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) { case 0: /* conforms, ie. bus-type dependent */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - trigger = default_ISA_trigger(idx); - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - trigger = default_PCI_trigger(idx); - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); break; - } case 1: /* edge */ { trigger = 0; @@ -750,64 +637,6 @@ static inline int irq_trigger(int idx) return MPBIOS_trigger(idx); } -static int next_irq = 16; - -/* - * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ - * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number - * from ACPI, which can reach 800 in large boxen. - * - * Compact the sparse GSI space into a sequential IRQ series and reuse - * vectors if possible. - */ -int gsi_irq_sharing(int gsi) -{ - int i, tries, vector; - - BUG_ON(gsi >= NR_IRQ_VECTORS); - - if (platform_legacy_irq(gsi)) - return gsi; - - if (gsi_2_irq[gsi] != 0xFF) - return (int)gsi_2_irq[gsi]; - - tries = NR_IRQS; - try_again: - vector = assign_irq_vector(gsi); - - /* - * Sharing vectors means sharing IRQs, so scan irq_vectors for previous - * use of vector and if found, return that IRQ. However, we never want - * to share legacy IRQs, which usually have a different trigger mode - * than PCI. 
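
[Editorial sketch — not part of the patch.] The simplified pin_2_irq() above keeps the in-order mapping for PCI: an IO-APIC pin's IRQ is its pin index plus the register counts of all earlier IO-APICs. In isolation, with invented register counts:

    #include <stdio.h>

    static int nr_ioapic_registers[] = { 24, 24, 16 };

    static int pin_2_irq_pci(int apic, int pin)
    {
        int irq = 0;
        for (int i = 0; i < apic; i++)
            irq += nr_ioapic_registers[i];     /* skip all pins of earlier APICs */
        return irq + pin;
    }

    int main(void)
    {
        printf("apic 1 pin 5 -> irq %d\n", pin_2_irq_pci(1, 5));  /* 24 + 5 = 29 */
        return 0;
    }
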
- */ - for (i = 0; i < NR_IRQS; i++) - if (IO_APIC_VECTOR(i) == vector) - break; - if (platform_legacy_irq(i)) { - if (--tries >= 0) { - IO_APIC_VECTOR(i) = 0; - goto try_again; - } - panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi); - } - if (i < NR_IRQS) { - gsi_2_irq[gsi] = i; - printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n", - gsi, vector, i); - return i; - } - - i = next_irq++; - BUG_ON(i >= NR_IRQS); - gsi_2_irq[gsi] = i; - IO_APIC_VECTOR(i) = vector; - printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n", - gsi, vector, i); - return i; -} - static int pin_2_irq(int idx, int apic, int pin) { int irq, i; @@ -819,49 +648,16 @@ static int pin_2_irq(int idx, int apic, if (mp_irqs[idx].mpc_dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - case MP_BUS_EISA: - case MP_BUS_MCA: - { - irq = mp_irqs[idx].mpc_srcbusirq; - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - irq = gsi_irq_sharing(irq); - break; - } - default: - { - printk(KERN_ERR "unknown bus type %d.\n",bus); - irq = 0; - break; - } - } - BUG_ON(irq >= NR_IRQS); - - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } + if (test_bit(bus, mp_bus_not_pci)) { + irq = mp_irqs[idx].mpc_srcbusirq; + } else { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; } BUG_ON(irq >= NR_IRQS); return irq; @@ -885,46 +681,71 @@ static inline int IO_APIC_irq_trigger(in } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. 
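
[Editorial sketch — not part of the patch.] The new __assign_irq_vector() above is a cached lookup: the Xen PHYSDEVOP_alloc_irq_vector hypercall is only issued the first time an IRQ needs a vector, and irq_vector[] remembers the answer. The control flow, with the hypercall faked by a counter:

    #include <stdio.h>

    #define NR_IRQS 64
    static int irq_vector[NR_IRQS];            /* 0 = not yet assigned */

    static int fake_alloc_irq_vector(int irq)  /* stands in for the hypercall */
    {
        static int next = 0x31;
        (void)irq;
        return next++;
    }

    static int assign_irq_vector(int irq)
    {
        if (irq_vector[irq] > 0)               /* already allocated: reuse */
            return irq_vector[irq];
        irq_vector[irq] = fake_alloc_irq_vector(irq);
        return irq_vector[irq];
    }

    int main(void)
    {
        int v5 = assign_irq_vector(5);
        int v7 = assign_irq_vector(7);
        int again = assign_irq_vector(5);      /* hits the cache, same vector */

        printf("%#x %#x %#x\n", v5, v7, again);
        return 0;
    }
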
*/ -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; +static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; -int assign_irq_vector(int irq) +static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result) { - unsigned long flags; int vector; struct physdev_irq irq_op; - BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS) return -EINVAL; - spin_lock_irqsave(&vector_lock, flags); + cpus_and(*result, mask, cpu_online_map); - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { - spin_unlock_irqrestore(&vector_lock, flags); - return IO_APIC_VECTOR(irq); - } + if (irq_vector[irq] > 0) + return irq_vector[irq]; irq_op.irq = irq; - if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { - spin_unlock_irqrestore(&vector_lock, flags); + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) return -ENOSPC; - } vector = irq_op.vector; - vector_irq[vector] = irq; - if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = vector; + irq_vector[irq] = vector; - spin_unlock_irqrestore(&vector_lock, flags); + return vector; +} + +static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result) +{ + int vector; + unsigned long flags; + spin_lock_irqsave(&vector_lock, flags); + vector = __assign_irq_vector(irq, mask, result); + spin_unlock_irqrestore(&vector_lock, flags); return vector; } -extern void (*interrupt[NR_IRQS])(void); #ifndef CONFIG_XEN -static struct hw_interrupt_type ioapic_level_type; -static struct hw_interrupt_type ioapic_edge_type; +void __setup_vector_irq(int cpu) +{ + /* Initialize vector_irq on a new cpu */ + /* This function must be called with vector_lock held */ + int irq, vector; + + /* Mark the inuse vectors */ + for (irq = 0; irq < NR_IRQ_VECTORS; ++irq) { + if (!cpu_isset(cpu, irq_domain[irq])) + continue; + vector = irq_vector[irq]; + per_cpu(vector_irq, cpu)[vector] = irq; + } + /* Mark the free vectors */ + for (vector = 0; vector < NR_VECTORS; ++vector) { + irq = per_cpu(vector_irq, cpu)[vector]; + if (irq < 0) + continue; + if (!cpu_isset(cpu, irq_domain[irq])) + per_cpu(vector_irq, cpu)[vector] = -1; + } +} + +extern void (*interrupt[NR_IRQS])(void); + +static struct irq_chip ioapic_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -932,16 +753,15 @@ static struct hw_interrupt_type ioapic_e static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { - unsigned idx; - - idx = use_pci_vector() && !platform_legacy_irq(irq) ? 
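
[Editorial sketch — not part of the patch.] __setup_vector_irq() above rebuilds a CPU's per-CPU vector_irq[] table from the global irq_vector[]/irq_domain[] state, marking only vectors whose IRQ domain contains that CPU. A small model with bitmask domains:

    #include <stdio.h>

    #define NR_IRQS    8
    #define NR_VECTORS 64

    static int irq_vector[NR_IRQS]      = { [3] = 0x31, [5] = 0x39 };
    static unsigned irq_domain[NR_IRQS] = { [3] = 0x1,  [5] = 0x3 };  /* CPU bitmasks */

    int main(void)
    {
        int cpu = 1;
        int vector_irq[NR_VECTORS];

        for (int v = 0; v < NR_VECTORS; v++)
            vector_irq[v] = -1;                /* free by default */
        for (int irq = 0; irq < NR_IRQS; irq++)
            if (irq_vector[irq] && (irq_domain[irq] >> cpu) & 1)
                vector_irq[irq_vector[irq]] = irq;  /* mark in-use vectors */

        printf("vector 0x31 -> irq %d, vector 0x39 -> irq %d\n",
               vector_irq[0x31], vector_irq[0x39]);
        return 0;
    }
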
vector : irq; - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) - irq_desc[idx].chip = &ioapic_level_type; - else - irq_desc[idx].chip = &ioapic_edge_type; - set_intr_gate(vector, interrupt[idx]); + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_fasteoi_irq, "fasteoi"); + else { + irq_desc[irq].status |= IRQ_DELAYED_DISABLE; + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_edge_irq, "edge"); + } } #else #define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq) @@ -994,16 +814,21 @@ static void __init setup_IO_APIC_irqs(vo continue; if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); + cpumask_t mask; + vector = assign_irq_vector(irq, TARGET_CPUS, &mask); + if (vector < 0) + continue; + + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); entry.vector = vector; ioapic_register_intr(irq, vector, IOAPIC_AUTO); if (!apic && (irq < 16)) disable_8259A_irq(irq); } + ioapic_write_entry(apic, pin, entry); + spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1046,7 +871,7 @@ static void __init setup_ExtINT_IRQ0_pin * The timer IRQ doesn't have to know that behind the * scene we have a 8259A-master in AEOI mode ... */ - irq_desc[0].chip = &ioapic_edge_type; + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); /* * Add it to the IO-APIC irq-routing table: @@ -1142,10 +967,7 @@ void __apicdebuginit print_IO_APIC(void) for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, i); printk(KERN_DEBUG " %02x %03X %02X ", i, @@ -1165,17 +987,12 @@ void __apicdebuginit print_IO_APIC(void) ); } } - if (use_pci_vector()) - printk(KERN_INFO "Using vector-based indexing\n"); printk(KERN_DEBUG "IRQ to pin mappings:\n"); for (i = 0; i < NR_IRQS; i++) { struct irq_pin_list *entry = irq_2_pin + i; if (entry->pin < 0) continue; - if (use_pci_vector() && !platform_legacy_irq(i)) - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); - else - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", i); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -1339,9 +1156,6 @@ static void __init enable_IO_APIC(void) irq_2_pin[i].pin = -1; irq_2_pin[i].next = 0; } - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; /* * The number of IO-APIC IRQ registers (== #pins): @@ -1358,11 +1172,7 @@ static void __init enable_IO_APIC(void) /* See if any of the pins is in ExtINT mode */ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - + entry = ioapic_read_entry(apic, pin); /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. 
@@ -1416,7 +1226,6 @@ void disable_IO_APIC(void) */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; - unsigned long flags; memset(&entry, 0, sizeof(entry)); entry.mask = 0; /* Enabled */ @@ -1433,12 +1242,7 @@ void disable_IO_APIC(void) /* * Add it to the IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, - *(((int *)&entry)+1)); - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, - *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } disconnect_bsp_APIC(ioapic_i8259.pin != -1); @@ -1446,76 +1250,6 @@ void disable_IO_APIC(void) } /* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 - */ - -#ifndef CONFIG_XEN -static void __init setup_ioapic_ids_from_mpc (void) -{ - union IO_APIC_reg_00 reg_00; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - - /* - * Set the IOAPIC ID to the value stored in the MPC table. - */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mpc_apicid; - - - printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid); - - - /* - * We need to adjust the IRQ routing table - * if the ID changed. - */ - if (old_id != mp_ioapics[apic].mpc_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_dstapic == old_id) - mp_irqs[i].mpc_dstapic - = mp_ioapics[apic].mpc_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mpc_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE," ok.\n"); - } -} -#else -static void __init setup_ioapic_ids_from_mpc(void) { } -#endif - -/* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: * @@ -1569,7 +1303,7 @@ static int __init timer_irq_works(void) * an edge even if it isn't on the 8259A... */ -static unsigned int startup_edge_ioapic_irq(unsigned int irq) +static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; unsigned long flags; @@ -1586,107 +1320,19 @@ static unsigned int startup_edge_ioapic_ return was_pending; } -/* - * Once we have recorded IRQ_PENDING already, we can mask the - * interrupt for real. This prevents IRQ storms from unhandled - * devices. 
- */ -static void ack_edge_ioapic_irq(unsigned int irq) -{ - move_irq(irq); - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) - == (IRQ_PENDING | IRQ_DISABLED)) - mask_IO_APIC_irq(irq); - ack_APIC_irq(); -} - -/* - * Level triggered interrupts can just be masked, - * and shutting down and starting up the interrupt - * is the same as enabling and disabling them -- except - * with a startup need to return a "was pending" value. - * - * Level triggered interrupts are special because we - * do not touch any IO-APIC register while handling - * them. We ack the APIC in the end-IRQ handler, not - * in the start-IRQ-handler. Protection against reentrance - * from the same interrupt is still provided, both by the - * generic IRQ layer and by the fact that an unacked local - * APIC does not accept IRQs. - */ -static unsigned int startup_level_ioapic_irq (unsigned int irq) -{ - unmask_IO_APIC_irq(irq); - - return 0; /* don't check for pending */ -} - -static void end_level_ioapic_irq (unsigned int irq) -{ - move_irq(irq); - ack_APIC_irq(); -} - -#ifdef CONFIG_PCI_MSI -static unsigned int startup_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_edge_ioapic_irq(irq); -} - -static void ack_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - ack_edge_ioapic_irq(irq); -} - -static unsigned int startup_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_level_ioapic_irq (irq); -} - -static void end_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - end_level_ioapic_irq(irq); -} - -static void mask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - mask_IO_APIC_irq(irq); -} - -static void unmask_IO_APIC_vector (unsigned int vector) +static int ioapic_retrigger_irq(unsigned int irq) { - int irq = vector_to_irq(vector); - - unmask_IO_APIC_irq(irq); -} - -#ifdef CONFIG_SMP -static void set_ioapic_affinity_vector (unsigned int vector, - cpumask_t cpu_mask) -{ - int irq = vector_to_irq(vector); + cpumask_t mask; + unsigned vector; + unsigned long flags; - set_native_irq_info(vector, cpu_mask); - set_ioapic_affinity_irq(irq, cpu_mask); -} -#endif // CONFIG_SMP -#endif // CONFIG_PCI_MSI + spin_lock_irqsave(&vector_lock, flags); + vector = irq_vector[irq]; + cpus_clear(mask); + cpu_set(first_cpu(irq_domain[irq]), mask); -static int ioapic_retrigger(unsigned int irq) -{ - send_IPI_self(IO_APIC_VECTOR(irq)); + send_IPI_mask(mask, vector); + spin_unlock_irqrestore(&vector_lock, flags); return 1; } @@ -1700,32 +1346,47 @@ static int ioapic_retrigger(unsigned int * races. 
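
[Editorial sketch — not part of the patch.] The new ioapic_retrigger_irq() above resends a lost interrupt by sending the IRQ's vector as an IPI to the first CPU in its domain (the removed variant used send_IPI_self()). The CPU selection, modeled as a find-first-set over a bitmask:

    #include <stdio.h>

    static int first_cpu(unsigned domain)
    {
        for (int cpu = 0; cpu < 32; cpu++)
            if ((domain >> cpu) & 1)
                return cpu;
        return -1;
    }

    int main(void)
    {
        unsigned irq_domain = 0x6;             /* CPUs 1 and 2 */
        int vector = 0x31;

        printf("send vector 0x%x to cpu %d\n", vector, first_cpu(irq_domain));
        return 0;
    }
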
*/ -static struct hw_interrupt_type ioapic_edge_type __read_mostly = { - .typename = "IO-APIC-edge", - .startup = startup_edge_ioapic, - .shutdown = shutdown_edge_ioapic, - .enable = enable_edge_ioapic, - .disable = disable_edge_ioapic, - .ack = ack_edge_ioapic, - .end = end_edge_ioapic, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, +static void ack_apic_edge(unsigned int irq) +{ + move_native_irq(irq); + ack_APIC_irq(); +} + +static void ack_apic_level(unsigned int irq) +{ + int do_unmask_irq = 0; + +#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) + /* If we are moving the irq we need to mask it */ + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + do_unmask_irq = 1; + mask_IO_APIC_irq(irq); + } #endif - .retrigger = ioapic_retrigger, -}; -static struct hw_interrupt_type ioapic_level_type __read_mostly = { - .typename = "IO-APIC-level", - .startup = startup_level_ioapic, - .shutdown = shutdown_level_ioapic, - .enable = enable_level_ioapic, - .disable = disable_level_ioapic, - .ack = mask_and_ack_level_ioapic, - .end = end_level_ioapic, + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propogate properly. + */ + ack_APIC_irq(); + + /* Now we can move and renable the irq */ + move_masked_irq(irq); + if (unlikely(do_unmask_irq)) + unmask_IO_APIC_irq(irq); +} + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, + .set_affinity = set_ioapic_affinity_irq, #endif - .retrigger = ioapic_retrigger, + .retrigger = ioapic_retrigger_irq, }; #endif /* !CONFIG_XEN */ @@ -1746,12 +1407,7 @@ static inline void init_IO_APIC_traps(vo */ for (irq = 0; irq < NR_IRQS ; irq++) { int tmp = irq; - if (use_pci_vector()) { - if (!platform_legacy_irq(tmp)) - if ((tmp = vector_to_irq(tmp)) == -1) - continue; - } - if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { + if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -1762,7 +1418,7 @@ static inline void init_IO_APIC_traps(vo #ifndef CONFIG_XEN else /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_type; + irq_desc[irq].chip = &no_irq_chip; #endif } } @@ -1883,8 +1539,6 @@ static inline void unlock_ExtINT_logic(v spin_unlock_irqrestore(&ioapic_lock, flags); } -int timer_uses_ioapic_pin_0; - /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -1897,13 +1551,13 @@ static inline void check_timer(void) { int apic1, pin1, apic2, pin2; int vector; + cpumask_t mask; /* * get/set the timer IRQ vector: */ disable_8259A_irq(0); - vector = assign_irq_vector(0); - set_intr_gate(vector, interrupt[0]); + vector = assign_irq_vector(0, TARGET_CPUS, &mask); /* * Subtle, code in do_timer_interrupt() expects an AEOI @@ -1922,9 +1576,6 @@ static inline void check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - if (pin1 == 0) - timer_uses_ioapic_pin_0 = 1; - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", vector, apic1, pin1, apic2, pin2); @@ -2039,11 +1690,6 @@ void __init setup_IO_APIC(void) apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - /* - * Set up the IO-APIC IRQ routing table. 
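
[Editorial sketch — not part of the patch.] ack_apic_level() above encodes an ordering rule: if the IRQ is being migrated, mask it, issue the APIC EOI first so the acknowledge propagates for the old vector, then reprogram and unmask. The sequence as stubs — the hardware steps are only printed:

    #include <stdio.h>

    static int move_pending = 1, masked = 0;

    static void mask_irq(void)        { masked = 1; puts("mask"); }
    static void unmask_irq(void)      { masked = 0; puts("unmask"); }
    static void ack_apic(void)        { puts("EOI"); }
    static void move_masked_irq(void) { move_pending = 0; puts("reprogram RTE"); }

    int main(void)
    {
        int do_unmask = 0;

        if (move_pending) {            /* IRQ_MOVE_PENDING in the patch */
            do_unmask = 1;
            mask_irq();
        }
        ack_apic();                    /* must precede the move to propagate */
        move_masked_irq();
        if (do_unmask)
            unmask_irq();
        return 0;
    }
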
@@ -1746,12 +1407,7 @@ static inline void init_IO_APIC_traps(vo */ for (irq = 0; irq < NR_IRQS ; irq++) { int tmp = irq; - if (use_pci_vector()) { - if (!platform_legacy_irq(tmp)) - if ((tmp = vector_to_irq(tmp)) == -1) - continue; - } - if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { + if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -1762,7 +1418,7 @@ static inline void init_IO_APIC_traps(vo #ifndef CONFIG_XEN else /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_type; + irq_desc[irq].chip = &no_irq_chip; #endif } } @@ -1883,8 +1539,6 @@ static inline void unlock_ExtINT_logic(v spin_unlock_irqrestore(&ioapic_lock, flags); } -int timer_uses_ioapic_pin_0; - /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -1897,13 +1551,13 @@ static inline void check_timer(void) { int apic1, pin1, apic2, pin2; int vector; + cpumask_t mask; /* * get/set the timer IRQ vector: */ disable_8259A_irq(0); - vector = assign_irq_vector(0); - set_intr_gate(vector, interrupt[0]); + vector = assign_irq_vector(0, TARGET_CPUS, &mask); /* * Subtle, code in do_timer_interrupt() expects an AEOI @@ -1922,9 +1576,6 @@ static inline void check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - if (pin1 == 0) - timer_uses_ioapic_pin_0 = 1; - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", vector, apic1, pin1, apic2, pin2); @@ -2039,11 +1690,6 @@ void __init setup_IO_APIC(void) apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - /* - * Set up the IO-APIC IRQ routing table.
- */ - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); #ifndef CONFIG_XEN sync_Arb_IDs(); #endif /* !CONFIG_XEN */ @@ -2066,17 +1712,12 @@ static int ioapic_suspend(struct sys_dev { struct IO_APIC_route_entry *entry; struct sysfs_ioapic_data *data; - unsigned long flags; int i; data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); - } - spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) + *entry = ioapic_read_entry(dev->id, i); return 0; } @@ -2098,11 +1739,9 @@ static int ioapic_resume(struct sys_devi reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; io_apic_write(dev->id, 0, reg_00.raw); } - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); - } spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i++) + ioapic_write_entry(dev->id, i, entry[i]); return 0; } @@ -2148,28 +1787,254 @@ static int __init ioapic_init_sysfs(void device_initcall(ioapic_init_sysfs); +/* + * Dynamic irq allocation and deallocation + */ +int create_irq(void) +{ + /* Allocate an unused irq */ + int irq; + int new; + int vector = 0; + unsigned long flags; + cpumask_t mask; + + irq = -ENOSPC; + spin_lock_irqsave(&vector_lock, flags); + for (new = (NR_IRQS - 1); new >= 0; new--) { + if (platform_legacy_irq(new)) + continue; + if (irq_vector[new] != 0) + continue; + vector = __assign_irq_vector(new, TARGET_CPUS, &mask); + if (likely(vector > 0)) + irq = new; + break; + } + spin_unlock_irqrestore(&vector_lock, flags); + + if (irq >= 0) { + dynamic_irq_init(irq); + } + return irq; +} + +void destroy_irq(unsigned int irq) +{ + unsigned long flags; + + dynamic_irq_cleanup(irq); + + spin_lock_irqsave(&vector_lock, flags); + irq_vector[irq] = 0; + spin_unlock_irqrestore(&vector_lock, flags); +} + #endif /* CONFIG_XEN */ -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ +/* + * MSI message composition + */ +#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +{ + int vector; + unsigned dest; + cpumask_t tmp; -#ifdef CONFIG_ACPI + vector = assign_irq_vector(irq, TARGET_CPUS, &tmp); + if (vector >= 0) { + dest = cpu_mask_to_apicid(tmp); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(vector); + } + return vector; +} -#define IO_APIC_MAX_ID 0xFE +#ifdef CONFIG_SMP +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + int vector; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + tmp = TARGET_CPUS; + + cpus_and(mask, tmp, CPU_MASK_ALL); + + vector = assign_irq_vector(irq, mask, &tmp); + if (vector < 0) + return; + + dest = cpu_mask_to_apicid(tmp); + + read_msi_msg(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); -int __init io_apic_get_version (int ioapic) + write_msi_msg(irq, &msg); + set_native_irq_info(irq, mask); +} +#endif /* CONFIG_SMP */ + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip msi_chip = { + .name = "PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev) { - union IO_APIC_reg_01 reg_01; - unsigned long flags; + struct msi_msg msg; + int ret; + ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) + return ret; - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + write_msi_msg(irq, &msg); - return reg_01.bits.version; + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + + return 0; +} + +void arch_teardown_msi_irq(unsigned int irq) +{ + return; } +#endif /* CONFIG_PCI_MSI */ + +/* + * Hypertransport interrupt support + */ +#ifdef CONFIG_HT_IRQ + +#ifdef CONFIG_SMP + +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + struct ht_irq_msg msg; + fetch_ht_irq_msg(irq, &msg); + + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + + msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); + + write_ht_irq_msg(irq, &msg); +} + +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) +{ + unsigned int dest; + cpumask_t tmp; + int vector; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + tmp = TARGET_CPUS; + + cpus_and(mask, tmp, CPU_MASK_ALL); + + vector = assign_irq_vector(irq, mask, &tmp); + if (vector < 0) + return; + + dest = cpu_mask_to_apicid(tmp); + + target_ht_irq(irq, dest, vector); + set_native_irq_info(irq, mask); +} +#endif + +static struct irq_chip ht_irq_chip = { + .name = "PCI-HT", + .mask = mask_ht_irq, + .unmask = unmask_ht_irq, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = set_ht_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +{ + int vector; + cpumask_t tmp; + + vector = assign_irq_vector(irq, TARGET_CPUS, &tmp); + if (vector >= 0) { + struct ht_irq_msg msg; + unsigned dest; + + dest = cpu_mask_to_apicid(tmp); + + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(vector) | + ((INT_DEST_MODE == 0) ? 
+ HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; + + write_ht_irq_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); + } + return vector; +} +#endif /* CONFIG_HT_IRQ */ + +/* -------------------------------------------------------------------------- + ACPI-based IOAPIC Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +#define IO_APIC_MAX_ID 0xFE int __init io_apic_get_redir_entries (int ioapic) { @@ -2188,6 +2053,8 @@ int io_apic_set_pci_routing (int ioapic, { struct IO_APIC_route_entry entry; unsigned long flags; + int vector; + cpumask_t mask; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", @@ -2196,6 +2063,17 @@ int io_apic_set_pci_routing (int ioapic, } /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + + vector = assign_irq_vector(irq, TARGET_CPUS, &mask); + if (vector < 0) + return vector; + + /* * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. * Note that we mask (disable) IRQs now -- these get enabled when the * corresponding device driver registers for this IRQ. @@ -2205,19 +2083,11 @@ int io_apic_set_pci_routing (int ioapic, entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); entry.trigger = edge_level; entry.polarity = active_high_low; entry.mask = 1; /* Disabled (masked) */ - - irq = gsi_irq_sharing(irq); - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - entry.vector = assign_irq_vector(irq); + entry.vector = vector & 0xff; apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " "IRQ %d Mode:%i Active:%i)\n", ioapic, @@ -2229,10 +2099,10 @@ int io_apic_set_pci_routing (int ioapic, if (!ioapic && (irq < 16)) disable_8259A_irq(irq); + ioapic_write_entry(ioapic, pin, entry); + spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); - set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); + set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; --- head-2011-03-11.orig/arch/x86/kernel/ioport_64-xen.c 2008-01-28 12:24:19.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/ioport_64-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -58,6 +58,7 @@ asmlinkage long sys_ioperm(unsigned long memset(bitmap, 0xff, IO_BITMAP_BYTES); t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); set_iobitmap.nr_ports = IO_BITMAP_BITS; --- head-2011-03-11.orig/arch/x86/kernel/mpparse_64-xen.c 2007-06-12 13:13:01.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/mpparse_64-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -41,8 +41,7 @@ int acpi_found_madt; * Various Linux-internal data structures created from the * MP-table. */ -unsigned char apic_version [MAX_APICS]; -unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... 
MAX_MP_BUSSES-1] = -1 }; +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; static int mp_current_pci_id = 0; @@ -56,7 +55,6 @@ struct mpc_config_intsrc mp_irqs[MAX_IRQ int mp_irq_entries; int nr_ioapics; -int pic_mode; unsigned long mp_lapic_addr = 0; @@ -71,19 +69,6 @@ unsigned disabled_cpus __initdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; -/* ACPI MADT entry parsing functions */ -#ifdef CONFIG_ACPI -extern struct acpi_boot_flags acpi_boot; -#ifdef CONFIG_X86_LOCAL_APIC -extern int acpi_parse_lapic (acpi_table_entry_header *header); -extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); -extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); -#endif /*CONFIG_X86_LOCAL_APIC*/ -#ifdef CONFIG_X86_IO_APIC -extern int acpi_parse_ioapic (acpi_table_entry_header *header); -#endif /*CONFIG_X86_IO_APIC*/ -#endif /*CONFIG_ACPI*/ - u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; @@ -109,24 +94,20 @@ static int __init mpf_checksum(unsigned static void __cpuinit MP_processor_info (struct mpc_config_processor *m) { int cpu; - unsigned char ver; cpumask_t tmp_map; + char *bootup_cpu = ""; if (!(m->mpc_cpuflag & CPU_ENABLED)) { disabled_cpus++; return; } - - printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8, - (m->mpc_cpufeature & CPU_MODEL_MASK)>>4, - m->mpc_apicver); - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - Dprintk(" Bootup CPU\n"); + bootup_cpu = " (Bootup-CPU)"; boot_cpu_id = m->mpc_apicid; } + + printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); + if (num_processors >= NR_CPUS) { printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." " Processor ignored.\n", NR_CPUS); @@ -137,24 +118,7 @@ static void __cpuinit MP_processor_info cpus_complement(tmp_map, cpu_present_map); cpu = first_cpu(tmp_map); -#if MAX_APICS < 255 - if ((int)m->mpc_apicid > MAX_APICS) { - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->mpc_apicid, MAX_APICS); - return; - } -#endif - ver = m->mpc_apicver; - physid_set(m->mpc_apicid, phys_cpu_present_map); - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. 
(tell your hw vendor)\n", m->mpc_apicid); - ver = 0x10; - } - apic_version[m->mpc_apicid] = ver; if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { /* * bios_cpu_apicid is required to have processors listed @@ -185,37 +149,42 @@ static void __init MP_bus_info (struct m Dprintk("Bus #%d is %s\n", m->mpc_busid, str); if (strncmp(str, "ISA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; - } else if (strncmp(str, "EISA", 4) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + set_bit(m->mpc_busid, mp_bus_not_pci); } else if (strncmp(str, "PCI", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + clear_bit(m->mpc_busid, mp_bus_not_pci); mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; mp_current_pci_id++; - } else if (strncmp(str, "MCA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; } else { printk(KERN_ERR "Unknown bustype %s\n", str); } } +static int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } + return 0; +} + static void __init MP_ioapic_info (struct mpc_config_ioapic *m) { if (!(m->mpc_flags & MPC_APIC_USABLE)) return; - printk("I/O APIC #%d Version %d at 0x%X.\n", - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", - MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); - } - if (!m->mpc_apicaddr) { - printk(KERN_ERR "WARNING: bogus zero I/O APIC address" - " found in MP table, skipping!\n"); + printk("I/O APIC #%d at 0x%X.\n", + m->mpc_apicid, m->mpc_apicaddr); + + if (bad_ioapic(m->mpc_apicaddr)) return; - } + mp_ioapics[nr_ioapics] = *m; nr_ioapics++; } @@ -239,19 +208,6 @@ static void __init MP_lintsrc_info (stru m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); - /* - * Well it seems all SMP boards in existence - * use ExtINT/LVT1 == LINT0 and - * NMI/LVT2 == LINT1 - the following check - * will show us if this assumptions is false. - * Until then we do not have to add baggage. 
- */ - if ((m->mpc_irqtype == mp_ExtINT) && - (m->mpc_destapiclint != 0)) - BUG(); - if ((m->mpc_irqtype == mp_NMI) && - (m->mpc_destapiclint != 1)) - BUG(); } /* @@ -265,7 +221,7 @@ static int __init smp_read_mpc(struct mp unsigned char *mpt=((unsigned char *)mpc)+count; if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { - printk("SMP mptable: bad signature [%c%c%c%c]!\n", + printk("MPTABLE: bad signature [%c%c%c%c]!\n", mpc->mpc_signature[0], mpc->mpc_signature[1], mpc->mpc_signature[2], @@ -273,31 +229,31 @@ static int __init smp_read_mpc(struct mp return 0; } if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { - printk("SMP mptable: checksum error!\n"); + printk("MPTABLE: checksum error!\n"); return 0; } if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", mpc->mpc_spec); return 0; } if (!mpc->mpc_lapic) { - printk(KERN_ERR "SMP mptable: null local APIC address!\n"); + printk(KERN_ERR "MPTABLE: null local APIC address!\n"); return 0; } memcpy(str,mpc->mpc_oem,8); - str[8]=0; - printk(KERN_INFO "OEM ID: %s ",str); + str[8] = 0; + printk(KERN_INFO "MPTABLE: OEM ID: %s ",str); memcpy(str,mpc->mpc_productid,12); - str[12]=0; - printk("Product ID: %s ",str); + str[12] = 0; + printk("MPTABLE: Product ID: %s ",str); - printk("APIC at: 0x%X\n",mpc->mpc_lapic); + printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic); /* save the local APIC address, it might be non-default */ if (!acpi_lapic) - mp_lapic_addr = mpc->mpc_lapic; + mp_lapic_addr = mpc->mpc_lapic; /* * Now process the configuration blocks. @@ -309,7 +265,7 @@ static int __init smp_read_mpc(struct mp struct mpc_config_processor *m= (struct mpc_config_processor *)mpt; if (!acpi_lapic) - MP_processor_info(m); + MP_processor_info(m); mpt += sizeof(*m); count += sizeof(*m); break; @@ -328,8 +284,8 @@ static int __init smp_read_mpc(struct mp struct mpc_config_ioapic *m= (struct mpc_config_ioapic *)mpt; MP_ioapic_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } case MP_INTSRC: @@ -338,8 +294,8 @@ static int __init smp_read_mpc(struct mp (struct mpc_config_intsrc *)mpt; MP_intsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } case MP_LINTSRC: @@ -347,15 +303,15 @@ static int __init smp_read_mpc(struct mp struct mpc_config_lintsrc *m= (struct mpc_config_lintsrc *)mpt; MP_lintsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } } } clustered_apic_check(); if (!num_processors) - printk(KERN_ERR "SMP mptable: no processors registered!\n"); + printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; } @@ -451,13 +407,10 @@ static inline void __init construct_defa * 2 CPUs, numbered 0 & 1. */ processor.mpc_type = MP_PROCESSOR; - /* Either an integrated APIC or a discrete 82489DX. */ - processor.mpc_apicver = mpc_default_type > 4 ? 
0x10 : 0x01; + processor.mpc_apicver = 0; processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | - boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_cpufeature = 0; + processor.mpc_featureflag = 0; processor.mpc_reserved[0] = 0; processor.mpc_reserved[1] = 0; for (i = 0; i < 2; i++) { @@ -476,14 +429,6 @@ static inline void __init construct_defa case 5: memcpy(bus.mpc_bustype, "ISA ", 6); break; - case 2: - case 6: - case 3: - memcpy(bus.mpc_bustype, "EISA ", 6); - break; - case 4: - case 7: - memcpy(bus.mpc_bustype, "MCA ", 6); } MP_bus_info(&bus); if (mpc_default_type > 4) { @@ -494,7 +439,7 @@ static inline void __init construct_defa ioapic.mpc_type = MP_IOAPIC; ioapic.mpc_apicid = 2; - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.mpc_apicver = 0; ioapic.mpc_flags = MPC_APIC_USABLE; ioapic.mpc_apicaddr = 0xFEC00000; MP_ioapic_info(&ioapic); @@ -537,13 +482,6 @@ void __init get_smp_config (void) printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); - if (mpf->mpf_feature2 & (1<<7)) { - printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); - pic_mode = 1; - } else { - printk(KERN_INFO " Virtual Wire compatibility mode.\n"); - pic_mode = 0; - } /* * Now see if we need to read further. @@ -620,7 +558,7 @@ static int __init smp_scan_config (unsig return 0; } -void __init find_intel_smp (void) +void __init find_smp_config(void) { unsigned int address; @@ -637,9 +575,7 @@ void __init find_intel_smp (void) smp_scan_config(0xF0000,0x10000)) return; /* - * If it is an SMP machine we should know now, unless the - * configuration is in an EISA/MCA bus machine with an - * extended bios data area. + * If it is an SMP machine we should know now. * * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E, calculate and scan it here. @@ -660,64 +596,38 @@ void __init find_intel_smp (void) printk(KERN_INFO "No mptable found.\n"); } -/* - * - Intel MP Configuration Table - */ -void __init find_smp_config (void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - find_intel_smp(); -#endif -} - - /* -------------------------------------------------------------------------- ACPI-based MP Configuration -------------------------------------------------------------------------- */ #ifdef CONFIG_ACPI -void __init mp_register_lapic_address ( - u64 address) +void __init mp_register_lapic_address(u64 address) { #ifndef CONFIG_XEN mp_lapic_addr = (unsigned long) address; - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - if (boot_cpu_id == -1U) boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); - - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); #endif } - -void __cpuinit mp_register_lapic ( - u8 id, - u8 enabled) +void __cpuinit mp_register_lapic (u8 id, u8 enabled) { struct mpc_config_processor processor; int boot_cpu = 0; - if (id >= MAX_APICS) { - printk(KERN_WARNING "Processor #%d invalid (max %d)\n", - id, MAX_APICS); - return; - } - - if (id == boot_cpu_physical_apicid) + if (id == boot_cpu_id) boot_cpu = 1; #ifndef CONFIG_XEN processor.mpc_type = MP_PROCESSOR; processor.mpc_apicid = id; - processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); + processor.mpc_apicver = 0; processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); processor.mpc_cpuflag |= (boot_cpu ? 
CPU_BOOTPROCESSOR : 0); - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_cpufeature = 0; + processor.mpc_featureflag = 0; processor.mpc_reserved[0] = 0; processor.mpc_reserved[1] = 0; #endif @@ -725,8 +635,6 @@ void __cpuinit mp_register_lapic ( MP_processor_info(&processor); } -#ifdef CONFIG_X86_IO_APIC - #define MP_ISA_BUS 0 #define MP_MAX_IOAPIC_PIN 127 @@ -737,11 +645,9 @@ static struct mp_ioapic_routing { u32 pin_programmed[4]; } mp_ioapic_routing[MAX_IO_APICS]; - -static int mp_find_ioapic ( - int gsi) +static int mp_find_ioapic(int gsi) { - int i = 0; + int i = 0; /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { @@ -751,28 +657,15 @@ static int mp_find_ioapic ( } printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - return -1; } - -void __init mp_register_ioapic ( - u8 id, - u32 address, - u32 gsi_base) +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) { - int idx = 0; + int idx = 0; - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " - "(found %d)\n", MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!\n"); - } - if (!address) { - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" - " found in MADT table, skipping!\n"); + if (bad_ioapic(address)) return; - } idx = nr_ioapics++; @@ -784,7 +677,7 @@ void __init mp_register_ioapic ( set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); #endif mp_ioapics[idx].mpc_apicid = id; - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + mp_ioapics[idx].mpc_apicver = 0; /* * Build basic IRQ lookup table to facilitate gsi->io_apic lookups @@ -795,21 +688,15 @@ void __init mp_register_ioapic ( mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, " "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_start, mp_ioapic_routing[idx].gsi_end); - - return; } - -void __init mp_override_legacy_irq ( - u8 bus_irq, - u8 polarity, - u8 trigger, - u32 gsi) +void __init +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { struct mpc_config_intsrc intsrc; int ioapic = -1; @@ -847,22 +734,18 @@ void __init mp_override_legacy_irq ( mp_irqs[mp_irq_entries] = intsrc; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); - - return; } - -void __init mp_config_acpi_legacy_irqs (void) +void __init mp_config_acpi_legacy_irqs(void) { struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; + int i = 0; + int ioapic = -1; /* * Fabricate the legacy ISA bus (bus #31). */ - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + set_bit(MP_ISA_BUS, mp_bus_not_pci); /* * Locate the IOAPIC that manages the ISA IRQs (0-15). 
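The bus-type conversion just above is the pattern this file keeps applying: the old per-bus mp_bus_id_to_type[] byte array carried an enum, but every remaining consumer only asks "is this bus PCI or not", so one bit per bus suffices. A self-contained userspace sketch of the bitmap shape (the macros mimic, but are not, the kernel's bitmap helpers):

#include <stdio.h>

#define MAX_MP_BUSSES	256
#define BITS_PER_LONG	(8 * (int)sizeof(unsigned long))
#define DECLARE_BITMAP(name, bits) \
	unsigned long name[((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG]

static DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);

static void set_bit_sketch(int nr, unsigned long *addr)
{
	addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static int test_bit_sketch(int nr, const unsigned long *addr)
{
	return (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

int main(void)
{
	set_bit_sketch(0, mp_bus_not_pci);	/* bus 0: ISA, i.e. not PCI */
						/* bus 1 left clear: PCI    */
	printf("bus 0 %s PCI\n", test_bit_sketch(0, mp_bus_not_pci) ? "is not" : "is");
	printf("bus 1 %s PCI\n", test_bit_sketch(1, mp_bus_not_pci) ? "is not" : "is");
	return 0;
}

MP_bus_info() then simply sets the bit for non-PCI buses and clears it for PCI ones, and lookups test a single bit instead of comparing enum values.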
@@ -915,24 +798,13 @@ void __init mp_config_acpi_legacy_irqs ( if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); } - - return; } -#define MAX_GSI_NUM 4096 - int mp_register_gsi(u32 gsi, int triggering, int polarity) { - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - static int pci_irq = 16; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, to the IRQs - * assigned to actual devices. - */ - static int gsi_to_irq[MAX_GSI_NUM]; + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -965,47 +837,14 @@ int mp_register_gsi(u32 gsi, int trigger if ((1< 15), but - * avoid a problem where the 8254 timer (IRQ0) is setup - * via an override (so it's not on pin 0 of the ioapic), - * and at the same time, the pin 0 interrupt is a PCI - * type. The gsi > 15 test could cause these two pins - * to be shared as IRQ0, and they are not shareable. - * So test for this condition, and if necessary, avoid - * the pin collision. - */ - if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) - gsi = pci_irq++; - /* - * Don't assign IRQ used by ACPI SCI - */ - if (gsi == acpi_fadt.sci_int) - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } else { - printk(KERN_ERR "GSI %u is too high\n", gsi); - return gsi; - } - } - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); return gsi; } - -#endif /*CONFIG_X86_IO_APIC*/ #endif /*CONFIG_ACPI*/ --- head-2011-03-11.orig/arch/x86/kernel/process_64-xen.c 2011-02-02 08:28:34.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/process_64-xen.c 2011-02-02 08:30:50.000000000 +0100 @@ -88,25 +88,24 @@ void idle_notifier_unregister(struct not } EXPORT_SYMBOL(idle_notifier_unregister); -enum idle_state { CPU_IDLE, CPU_NOT_IDLE }; -static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; - void enter_idle(void) { - __get_cpu_var(idle_state) = CPU_IDLE; + write_pda(isidle, 1); atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); } static void __exit_idle(void) { - __get_cpu_var(idle_state) = CPU_NOT_IDLE; + if (test_and_clear_bit_pda(0, isidle) == 0) + return; atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); } /* Called from interrupts to signify idle end */ void exit_idle(void) { - if (current->pid | read_pda(irqcount)) + /* idle loop has pid 0 */ + if (current->pid) return; __exit_idle(); } @@ -183,6 +182,9 @@ void cpu_idle (void) play_dead(); enter_idle(); idle(); + /* In many cases the interrupt that ended idle + has already called exit_idle. But some idle + loops can be woken up without interrupt. 
*/ __exit_idle(); } @@ -195,7 +197,7 @@ void cpu_idle (void) void cpu_idle_wait(void) { unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map; + cpumask_t map, tmp = current->cpus_allowed; set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); put_cpu(); @@ -218,6 +220,8 @@ void cpu_idle_wait(void) } cpus_and(map, map, cpu_online_map); } while (!cpus_empty(map)); + + set_cpus_allowed(current, tmp); } EXPORT_SYMBOL_GPL(cpu_idle_wait); @@ -249,9 +253,9 @@ void __show_regs(struct pt_regs * regs) print_modules(); printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), - system_utsname.release, - (int)strcspn(system_utsname.version, " "), - system_utsname.version); + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); printk_address(regs->rip); printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, @@ -309,6 +313,7 @@ void exit_thread(void) kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ @@ -339,6 +344,7 @@ void flush_thread(void) if (t->flags & _TIF_IA32) current_thread_info()->status |= TS_COMPAT; } + t->flags &= ~_TIF_DEBUG; tsk->thread.debugreg0 = 0; tsk->thread.debugreg1 = 0; @@ -431,7 +437,7 @@ int copy_thread(int nr, unsigned long cl asm("mov %%es,%0" : "=m" (p->thread.es)); asm("mov %%ds,%0" : "=m" (p->thread.ds)); - if (unlikely(me->thread.io_bitmap_ptr != NULL)) { + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; @@ -439,6 +445,7 @@ int copy_thread(int nr, unsigned long cl } memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); + set_tsk_thread_flag(p, TIF_IO_BITMAP); } /* @@ -473,6 +480,30 @@ static inline void __save_init_fpu( stru } /* + * This special macro can be used to load a debugging register + */ +#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) + +static inline void __switch_to_xtra(struct task_struct *prev_p, + struct task_struct *next_p) +{ + struct thread_struct *prev, *next; + + prev = &prev_p->thread, + next = &next_p->thread; + + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + loaddebug(next, 0); + loaddebug(next, 1); + loaddebug(next, 2); + loaddebug(next, 3); + /* no 4 and 5 */ + loaddebug(next, 6); + loaddebug(next, 7); + } +} + +/* * switch_to(x,y) should switch tasks from x to y. 
* * This could still be optimized: @@ -500,6 +531,10 @@ __switch_to(struct task_struct *prev_p, #endif multicall_entry_t _mcl[8], *mcl = _mcl; + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter>5) + prefetch(&next->i387.fxsave); + /* * This is basically '__unlazy_fpu', except that we queue a * multicall to indicate FPU task switch, rather than @@ -512,7 +547,8 @@ __switch_to(struct task_struct *prev_p, mcl->op = __HYPERVISOR_fpu_taskswitch; mcl->args[0] = 1; mcl++; - } + } else + prev_p->fpu_counter = 0; /* * Reload esp0, LDT and the page table pointer: @@ -607,21 +643,29 @@ __switch_to(struct task_struct *prev_p, write_pda(oldrsp, next->userrsp); write_pda(pcurrent, next_p); write_pda(kernelstack, - task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); + (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); +#ifdef CONFIG_CC_STACKPROTECTOR + write_pda(stack_canary, next_p->stack_canary); + + /* + * Build time only check to make sure the stack_canary is at + * offset 40 in the pda; this is a gcc ABI requirement + */ + BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); +#endif /* * Now maybe reload the debug registers */ - if (unlikely(next->debugreg7)) { - set_debugreg(next->debugreg0, 0); - set_debugreg(next->debugreg1, 1); - set_debugreg(next->debugreg2, 2); - set_debugreg(next->debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(next->debugreg6, 6); - set_debugreg(next->debugreg7, 7); - } + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) + __switch_to_xtra(prev_p, next_p); + /* If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + if (next_p->fpu_counter>5) + math_state_restore(); return prev_p; } @@ -841,7 +885,7 @@ int dump_task_regs(struct task_struct *t unsigned long arch_align_stack(unsigned long sp) { - if (randomize_va_space) + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } --- head-2011-03-11.orig/arch/x86/kernel/setup_64-xen.c 2010-10-05 09:58:12.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/setup_64-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -88,9 +88,6 @@ extern struct edid_info edid_info; shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; EXPORT_SYMBOL(HYPERVISOR_shared_info); -extern char hypercall_page[PAGE_SIZE]; -EXPORT_SYMBOL(hypercall_page); - static int xen_panic_event(struct notifier_block *, unsigned long, void *); static struct notifier_block xen_panic_block = { xen_panic_event, NULL, 0 /* try to go last */ @@ -118,16 +115,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; -int acpi_disabled; -EXPORT_SYMBOL(acpi_disabled); -#ifdef CONFIG_ACPI -extern int __initdata acpi_ht; -extern acpi_interrupt_flags acpi_sci_flags; -int __initdata acpi_force = 0; -#endif - -int acpi_numa __initdata; - /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; @@ -151,10 +138,6 @@ struct sys_desc_table_struct { struct edid_info edid_info; EXPORT_SYMBOL_GPL(edid_info); -struct e820map e820; -#ifdef CONFIG_XEN -struct e820map machine_e820; -#endif extern int root_mountflags; @@ -181,9 +164,6 @@ struct resource standard_io_resources[] .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -#define STANDARD_IO_RESOURCES \ - (sizeof standard_io_resources / sizeof standard_io_resources[0]) - #define IORESOURCE_RAM 
(IORESOURCE_BUSY | IORESOURCE_MEM) struct resource data_resource = { @@ -230,9 +210,6 @@ static struct resource adapter_rom_resou .flags = IORESOURCE_ROM } }; -#define ADAPTER_ROM_RESOURCES \ - (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) - static struct resource video_rom_resource = { .name = "Video ROM", .start = 0xc0000, @@ -309,7 +286,8 @@ static void __init probe_roms(void) } /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; + start += 2048) { rom = isa_bus_to_virt(start); if (!romsignature(rom)) continue; @@ -329,187 +307,22 @@ static void __init probe_roms(void) } } -/* Check for full argument with no trailing characters */ -static int fullarg(char *p, char *arg) +#ifdef CONFIG_PROC_VMCORE +/* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. This option will be passed + * by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) { - int l = strlen(arg); - return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l])); + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; } - -static __init void parse_cmdline_early (char ** cmdline_p) -{ - char c = ' ', *to = command_line, *from = COMMAND_LINE; - int len = 0; - int userdef = 0; - - for (;;) { - if (c != ' ') - goto next_char; - -#ifdef CONFIG_SMP - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. - */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(from + 8, NULL, 0); - } -#endif -#ifdef CONFIG_ACPI - /* "acpi=off" disables both ACPI table parsing and interpreter init */ - if (fullarg(from,"acpi=off")) - disable_acpi(); - - if (fullarg(from, "acpi=force")) { - /* add later when we do DMI horrors: */ - acpi_force = 1; - acpi_disabled = 0; - } - - /* acpi=ht just means: do ACPI MADT parsing - at bootup, but don't enable the full ACPI interpreter */ - if (fullarg(from, "acpi=ht")) { - if (!acpi_force) - disable_acpi(); - acpi_ht = 1; - } - else if (fullarg(from, "pci=noacpi")) - acpi_disable_pci(); - else if (fullarg(from, "acpi=noirq")) - acpi_noirq_set(); - - else if (fullarg(from, "acpi_sci=edge")) - acpi_sci_flags.trigger = 1; - else if (fullarg(from, "acpi_sci=level")) - acpi_sci_flags.trigger = 3; - else if (fullarg(from, "acpi_sci=high")) - acpi_sci_flags.polarity = 1; - else if (fullarg(from, "acpi_sci=low")) - acpi_sci_flags.polarity = 3; - - /* acpi=strict disables out-of-spec workarounds */ - else if (fullarg(from, "acpi=strict")) { - acpi_strict = 1; - } -#ifdef CONFIG_X86_IO_APIC - else if (fullarg(from, "acpi_skip_timer_override")) - acpi_skip_timer_override = 1; -#endif -#endif - -#ifndef CONFIG_XEN - if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) { - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - disable_apic = 1; - } - - if (fullarg(from, "noapic")) - skip_ioapic_setup = 1; - - if (fullarg(from,"apic")) { - skip_ioapic_setup = 0; - ioapic_force = 1; - } -#endif - - if (!memcmp(from, "mem=", 4)) - parse_memopt(from+4, &from); - - if (!memcmp(from, "memmap=", 7)) { - /* exactmap option is for used defined memory */ - if (!memcmp(from+7, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original 
memory map is - * reset. - */ - saved_max_pfn = e820_end_of_ram(); -#endif - from += 8+7; - end_pfn_map = 0; - e820.nr_map = 0; - userdef = 1; - } - else { - parse_memmapopt(from+7, &from); - userdef = 1; - } - } - -#ifdef CONFIG_NUMA - if (!memcmp(from, "numa=", 5)) - numa_setup(from+5); +early_param("elfcorehdr", setup_elfcorehdr); #endif - if (!memcmp(from,"iommu=",6)) { - iommu_setup(from+6); - } - - if (fullarg(from,"oops=panic")) - panic_on_oops = 1; - - if (!memcmp(from, "noexec=", 7)) - nonx_setup(from + 7); - -#ifdef CONFIG_KEXEC - /* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never set's it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ - else if (!memcmp(from, "crashkernel=", 12)) { -#ifndef CONFIG_XEN - unsigned long size, base; - size = memparse(from+12, &from); - if (*from == '@') { - base = memparse(from+1, &from); - /* FIXME: Do I want a sanity check - * to validate the memory range? - */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } -#else - printk("Ignoring crashkernel command line, " - "parameter will be supplied by xen\n"); -#endif - } -#endif - -#ifdef CONFIG_PROC_VMCORE - /* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ - else if(!memcmp(from, "elfcorehdr=", 11)) - elfcorehdr_addr = memparse(from+11, &from); -#endif - -#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN) - else if (!memcmp(from, "additional_cpus=", 16)) - setup_additional_cpus(from+16); -#endif - - next_char: - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - if (userdef) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - e820_print_map("user"); - } - *to = '\0'; - *cmdline_p = command_line; -} - #ifndef CONFIG_NUMA static void __init contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) @@ -521,13 +334,13 @@ contig_initmem_init(unsigned long start_ if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n",bootmap_size); bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); + e820_register_active_regions(0, start_pfn, end_pfn); #ifdef CONFIG_XEN if (xen_start_info->nr_pages < end_pfn) - e820_bootmem_free(NODE_DATA(0), 0, - xen_start_info->nr_pages<nr_pages); else #endif - e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); + free_bootmem_with_active_regions(0, end_pfn); reserve_bootmem(bootmap, bootmap_size); } #endif @@ -589,6 +402,10 @@ static void discover_ebda(void) void __init setup_arch(char **cmdline_p) { #ifdef CONFIG_XEN + extern struct e820map machine_e820; + + printk(KERN_INFO "Command line: %s\n", saved_command_line); + /* Register a call for panic conditions. 
*/ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); @@ -614,6 +431,8 @@ void __init setup_arch(char **cmdline_p) ARCH_SETUP #else + printk(KERN_INFO "Command line: %s\n", saved_command_line); + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); screen_info = SCREEN_INFO; edid_info = EDID_INFO; @@ -641,16 +460,22 @@ void __init setup_arch(char **cmdline_p) data_resource.start = virt_to_phys(&_etext); data_resource.end = virt_to_phys(&_edata)-1; - parse_cmdline_early(cmdline_p); - early_identify_cpu(&boot_cpu_data); + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + parse_early_param(); + + finish_e820_parsing(); + + e820_register_active_regions(0, 0, -1UL); /* * partially used pages are not usable - thus * we are rounding upwards: */ end_pfn = e820_end_of_ram(); - num_physpages = end_pfn; /* for pfn_valid */ + num_physpages = end_pfn; check_efer(); @@ -661,6 +486,14 @@ void __init setup_arch(char **cmdline_p) if (is_initial_xendomain()) dmi_scan_machine(); + /* How many end-of-memory variables you have, grandma! */ + max_low_pfn = end_pfn; + max_pfn = end_pfn; + high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1; + + /* Remove active ranges so rediscovery with NUMA-awareness happens */ + remove_all_active_ranges(); + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. @@ -850,16 +683,16 @@ void __init setup_arch(char **cmdline_p) BUG(); } +#ifdef CONFIG_ACPI if (!is_initial_xendomain()) { acpi_disabled = 1; -#ifdef CONFIG_ACPI acpi_ht = 0; -#endif } #endif +#endif -#ifndef CONFIG_XEN - check_ioapic(); +#if defined(CONFIG_PCI) && !defined(CONFIG_XEN) + early_quirks(); #endif zap_low_mappings(0); @@ -909,6 +742,7 @@ void __init setup_arch(char **cmdline_p) e820_reserve_resources(machine_e820.map, machine_e820.nr_map); #else e820_reserve_resources(e820.map, e820.nr_map); + e820_mark_nosave_regions(); #endif request_resource(&iomem_resource, &video_ram_resource); @@ -916,7 +750,7 @@ void __init setup_arch(char **cmdline_p) { unsigned i; /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < STANDARD_IO_RESOURCES; i++) + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) request_resource(&ioport_resource, &standard_io_resources[i]); } @@ -1100,7 +934,7 @@ static void __init amd_detect_cmp(struct #endif } -static void __init init_amd(struct cpuinfo_x86 *c) +static void __cpuinit init_amd(struct cpuinfo_x86 *c) { unsigned level; @@ -1156,6 +990,12 @@ static void __init init_amd(struct cpuin /* Fix cpuid4 emulation for more */ num_cache_leaves = 3; + + /* When there is only one core no need to synchronize RDTSC */ + if (num_possible_cpus() == 1) + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + else + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); } static void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -1237,8 +1077,7 @@ static void srat_detect_node(void) node = first_node(node_online_map); numa_set_node(cpu, node); - if (acpi_numa > 0) - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); #endif } @@ -1272,6 +1111,8 @@ static void __cpuinit init_intel(struct if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); + if (c->x86 == 6) + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); c->x86_max_cores = intel_num_cpu_cores(c); @@ -1492,8 +1333,8 @@ static int 
show_cpuinfo(struct seq_file /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ --- head-2011-03-11.orig/arch/x86/kernel/setup64-xen.c 2011-01-31 17:02:29.000000000 +0100 +++ head-2011-03-11/arch/x86/kernel/setup64-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -31,7 +31,7 @@ #include #endif -char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; +char x86_boot_params[BOOT_PARAM_SIZE] __initdata; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; @@ -55,8 +55,10 @@ Control non executable mappings for 64bi on Enable(default) off Disable */ -int __init nonx_setup(char *str) +static int __init nonx_setup(char *str) { + if (!str) + return -EINVAL; if (!strncmp(str, "on", 2)) { __supported_pte_mask |= _PAGE_NX; do_not_nx = 0; @@ -64,9 +66,9 @@ int __init nonx_setup(char *str) do_not_nx = 1; __supported_pte_mask &= ~_PAGE_NX; } - return 1; + return 0; } -__setup("noexec=", nonx_setup); /* parsed early actually */ +early_param("noexec", nonx_setup); int force_personality32 = 0; @@ -102,12 +104,9 @@ void __init setup_per_cpu_areas(void) #endif /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); -#ifdef CONFIG_MODULES - if (size < PERCPU_ENOUGH_ROOM) - size = PERCPU_ENOUGH_ROOM; -#endif + size = PERCPU_ENOUGH_ROOM; + printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); for_each_cpu_mask (i, cpu_possible_map) { char *ptr; @@ -169,7 +168,10 @@ void pda_init(int cpu) /* Setup up data that may be needed in __get_free_pages early */ asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); #ifndef CONFIG_XEN + /* Memory clobbers used to order PDA accessed */ + mb(); wrmsrl(MSR_GS_BASE, pda); + mb(); #else if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, (unsigned long)pda)) @@ -302,28 +304,17 @@ void __cpuinit cpu_init (void) * set up and load the per-CPU TSS */ for (v = 0; v < N_EXCEPTION_STACKS; v++) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; if (cpu) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); if (!estacks) panic("Cannot allocate exception stack %ld %d\n", v, cpu); } - switch (v + 1) { -#if DEBUG_STKSZ > EXCEPTION_STKSZ - case DEBUG_STACK: - cpu_pda(cpu)->debugstack = (unsigned long)estacks; - estacks += DEBUG_STKSZ; - break; -#endif - default: - estacks += EXCEPTION_STKSZ; - break; - } + estacks += PAGE_SIZE << order[v]; orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; } --- head-2011-03-11.orig/arch/x86/kernel/smp_64-xen.c 2008-04-02 12:34:02.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/smp_64-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -381,9 +381,8 @@ int smp_call_function_single (int cpu, v /* prevent preemption and reschedule on another processor */ int me = get_cpu(); if (cpu == me) { - WARN_ON(1); put_cpu(); - return -EBUSY; + return 0; } spin_lock_bh(&call_lock); __smp_call_function_single(cpu, func, info, nonatomic, wait); @@ -501,7 +500,7 @@ void smp_send_stop(void) #ifndef CONFIG_XEN asmlinkage void smp_reschedule_interrupt(void) #else -asmlinkage irqreturn_t smp_reschedule_interrupt(void) +asmlinkage irqreturn_t smp_reschedule_interrupt(int irq, void *ctx) #endif { #ifndef CONFIG_XEN @@ -514,7 +513,7 @@ asmlinkage irqreturn_t smp_reschedule_in #ifndef CONFIG_XEN asmlinkage void smp_call_function_interrupt(void) #else -asmlinkage irqreturn_t smp_call_function_interrupt(void) +asmlinkage irqreturn_t smp_call_function_interrupt(int irq, void *ctx) #endif { void (*func) (void *info) = call_data->func; @@ -545,31 +544,3 @@ asmlinkage irqreturn_t smp_call_function return IRQ_HANDLED; #endif } - -int safe_smp_processor_id(void) -{ -#ifdef CONFIG_XEN - return smp_processor_id(); -#else - unsigned apicid, i; - - if (disable_apic) - return 0; - - apicid = hard_smp_processor_id(); - if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid) - return apicid; - - for (i = 0; i < NR_CPUS; ++i) { - if (x86_cpu_to_apicid[i] == apicid) - return i; - } - - /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI, - * or called too early. Either way, we must be CPU 0. */ - if (x86_cpu_to_apicid[0] == BAD_APICID) - return 0; - - return 0; /* Should not happen */ -#endif -} --- head-2011-03-11.orig/arch/x86/kernel/traps_64-xen.c 2008-04-02 12:34:02.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/traps_64-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,7 @@ #include #include #include +#include asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -114,7 +116,6 @@ static int call_trace = 1; #endif #ifdef CONFIG_KALLSYMS -# include void printk_address(unsigned long address) { unsigned long offset = 0, symsize; @@ -142,7 +143,7 @@ void printk_address(unsigned long addres #endif static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, const char **idp) + unsigned *usedp, char **idp) { #ifndef CONFIG_X86_NO_TSS static char ids[][8] = { @@ -162,26 +163,7 @@ static unsigned long *in_exception_stack * 'stack' is in one of them: */ for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end; - - /* - * set 'end' to the end of the exception stack. - */ - switch (k + 1) { - /* - * TODO: this block is not needed i think, because - * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK] - * properly too. 
- */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - case DEBUG_STACK: - end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ; - break; -#endif - default: - end = per_cpu(orig_ist, cpu).ist[k]; - break; - } + unsigned long end = per_cpu(orig_ist, cpu).ist[k]; /* * Is 'stack' above this exception frame's end? * If yes then skip to the next frame. @@ -236,13 +218,19 @@ static unsigned long *in_exception_stack return NULL; } -static int show_trace_unwind(struct unwind_frame_info *info, void *context) +struct ops_and_data { + struct stacktrace_ops *ops; + void *data; +}; + +static int dump_trace_unwind(struct unwind_frame_info *info, void *context) { + struct ops_and_data *oad = (struct ops_and_data *)context; int n = 0; while (unwind(info) == 0 && UNW_PC(info)) { n++; - printk_address(UNW_PC(info)); + oad->ops->address(oad->data, UNW_PC(info)); if (arch_unw_user_mode(info)) break; } @@ -256,13 +244,19 @@ static int show_trace_unwind(struct unwi * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack) +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) { - const unsigned cpu = safe_smp_processor_id(); + void *t = (void *)tinfo; + return p > t && p < t + THREAD_SIZE - 3; +} + +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack, + struct stacktrace_ops *ops, void *data) +{ + const unsigned cpu = smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; - - printk("\nCall Trace:\n"); + struct thread_info *tinfo; if (!tsk) tsk = current; @@ -270,32 +264,47 @@ void show_trace(struct task_struct *tsk, if (call_trace >= 0) { int unw_ret = 0; struct unwind_frame_info info; + struct ops_and_data oad = { .ops = ops, .data = data }; if (regs) { if (unwind_init_frame_info(&info, tsk, regs) == 0) - unw_ret = show_trace_unwind(&info, NULL); + unw_ret = dump_trace_unwind(&info, &oad); } else if (tsk == current) - unw_ret = unwind_init_running(&info, show_trace_unwind, NULL); + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); else { if (unwind_init_blocked(&info, tsk) == 0) - unw_ret = show_trace_unwind(&info, NULL); + unw_ret = dump_trace_unwind(&info, &oad); } if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { - print_symbol("DWARF2 unwinder stuck at %s\n", + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", UNW_PC(&info)); if ((long)UNW_SP(&info) < 0) { - printk("Leftover inexact backtrace:\n"); + ops->warning(data, "Leftover inexact backtrace:\n"); stack = (unsigned long *)UNW_SP(&info); + if (!stack) + return; } else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) return; else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else - printk("Inexact backtrace:\n"); + ops->warning(data, "Inexact backtrace:\n"); } + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (tsk && tsk != current) + stack = (unsigned long *)tsk->thread.rsp; + } + /* + * Align the stack pointer on word boundary, later loops + * rely on that (and corruption / debug info bugs can cause + * unaligned values here): + */ + stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1)); /* * Print function call entries within a stack. 
'cond' is the @@ -305,7 +314,9 @@ void show_trace(struct task_struct *tsk, #define HANDLE_STACK(cond) \ do while (cond) { \ unsigned long addr = *stack++; \ - if (kernel_text_address(addr)) { \ + if (oops_in_progress ? \ + __kernel_text_address(addr) : \ + kernel_text_address(addr)) { \ /* \ * If the address is either in the text segment of the \ * kernel, or in the region which contains vmalloc'ed \ @@ -314,7 +325,7 @@ void show_trace(struct task_struct *tsk, * down the cause of the crash will be able to figure \ * out the call path that was taken. \ */ \ - printk_address(addr); \ + ops->address(data, addr); \ } \ } while (0) @@ -323,16 +334,17 @@ void show_trace(struct task_struct *tsk, * current stack address. If the stacks consist of nested * exceptions */ - for ( ; ; ) { - const char *id; + for (;;) { + char *id; unsigned long *estack_end; estack_end = in_exception_stack(cpu, (unsigned long)stack, &used, &id); if (estack_end) { - printk(" <%s>", id); + if (ops->stack(data, id) < 0) + break; HANDLE_STACK (stack < estack_end); - printk(" "); + ops->stack(data, ""); /* * We link to the next stack via the * second-to-last pointer (index -2 to end) in the @@ -347,7 +359,8 @@ void show_trace(struct task_struct *tsk, (IRQSTACKSIZE - 64) / sizeof(*irqstack); if (stack >= irqstack && stack < irqstack_end) { - printk(" "); + if (ops->stack(data, "IRQ") < 0) + break; HANDLE_STACK (stack < irqstack_end); /* * We link to the next stack (which would be @@ -356,7 +369,7 @@ void show_trace(struct task_struct *tsk, */ stack = (unsigned long *) (irqstack_end[-1]); irqstack_end = NULL; - printk(" "); + ops->stack(data, "EOI"); continue; } } @@ -364,19 +377,58 @@ void show_trace(struct task_struct *tsk, } /* - * This prints the process stack: + * This handles the process stack: */ - HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); + tinfo = current_thread_info(); + HANDLE_STACK (valid_stack_ptr(tinfo, stack)); #undef HANDLE_STACK +} +EXPORT_SYMBOL(dump_trace); +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + print_symbol(msg, symbol); printk("\n"); } -static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp) +static void print_trace_warning(void *data, char *msg) +{ + printk("%s\n", msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk(" <%s> ", name); + return 0; +} + +static void print_trace_address(void *data, unsigned long addr) +{ + printk_address(addr); +} + +static struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +void +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) +{ + printk("\nCall Trace:\n"); + dump_trace(tsk, regs, stack, &print_trace_ops, NULL); + printk("\n"); +} + +static void +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) { unsigned long *stack; int i; - const int cpu = safe_smp_processor_id(); + const int cpu = smp_processor_id(); unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); @@ -430,7 +482,7 @@ void show_registers(struct pt_regs *regs int i; int in_kernel = !user_mode(regs); unsigned long rsp; - const int cpu = safe_smp_processor_id(); + const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; rsp = regs->rsp; @@ -505,9 +557,11 @@ 
static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { - int cpu = safe_smp_processor_id(); + int cpu = smp_processor_id(); unsigned long flags; + oops_enter(); + /* racy, but better than risking deadlock. */ local_irq_save(flags); if (!spin_trylock(&die_lock)) { @@ -536,6 +590,7 @@ void __kprobes oops_end(unsigned long fl spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) panic("Fatal exception"); + oops_exit(); } void __kprobes __die(const char * str, struct pt_regs * regs, long err) @@ -572,8 +627,8 @@ void die(const char * str, struct pt_reg do_exit(SIGSEGV); } -#ifdef CONFIG_X86_LOCAL_APIC -void __kprobes die_nmi(char *str, struct pt_regs *regs) +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL) +void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { unsigned long flags = oops_begin(); @@ -581,13 +636,12 @@ void __kprobes die_nmi(char *str, struct * We are in trouble anyway, lets at least try * to get a message out. */ - printk(str, safe_smp_processor_id()); + printk(str, smp_processor_id()); show_registers(regs); if (kexec_should_crash(current)) crash_kexec(regs); - if (panic_on_timeout || panic_on_oops) - panic("nmi watchdog"); - printk("console shuts up ...\n"); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); oops_end(flags); nmi_exit(); local_irq_enable(); @@ -734,8 +788,15 @@ asmlinkage void __kprobes do_general_pro static __kprobes void mem_parity_error(unsigned char reason, struct pt_regs * regs) { - printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); - printk("You probably have a hardware problem with your RAM chips\n"); + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", + reason); + printk(KERN_EMERG "You probably have a hardware problem with your " + "RAM chips\n"); + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); #if 0 /* XEN */ /* Clear and disable the memory parity error line. */ @@ -762,9 +823,15 @@ io_check_error(unsigned char reason, str static __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) -{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); - printk("Dazed and confused, but trying to continue\n"); - printk("Do you have a strange power saving mode enabled?\n"); +{ + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", + reason); + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } /* Runs on IST stack. This code must keep interrupts off all the time. @@ -789,12 +856,12 @@ asmlinkage __kprobes void default_do_nmi * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. */ - if (nmi_watchdog > 0) { - nmi_watchdog_tick(regs,reason); + if (nmi_watchdog_tick(regs,reason)) return; - } #endif - unknown_nmi_error(reason, regs); + if (!do_nmi_callback(regs,cpu)) + unknown_nmi_error(reason, regs); + return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) @@ -1081,6 +1148,7 @@ asmlinkage void math_state_restore(void) init_fpu(me); restore_fpu_checking(&me->thread.i387.fxsave); task_thread_info(me)->status |= TS_USEDFPU; + me->fpu_counter++; }
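The conversions in the next hunk are all the same mechanical change: boot options move from __setup() handlers, which return 1 to mean "consumed", to early_param() handlers, which are parsed early in setup_arch(), must tolerate a missing argument, and return 0 on success or a negative errno. A standalone sketch of the new handler shape (the registration macro itself is mocked away here; in the kernel it is early_param() from the init headers):

#include <stdio.h>
#include <string.h>

#define EINVAL 22

static int panic_on_oops;

/* early_param-style handler: 0 = ok, -EINVAL = bad/missing argument */
static int oops_setup_sketch(char *s)
{
	if (!s)
		return -EINVAL;
	if (!strcmp(s, "panic"))
		panic_on_oops = 1;
	return 0;
}

int main(void)
{
	/* simulate "oops=panic" from the kernel command line */
	int ret = oops_setup_sketch("panic");

	printf("ret=%d panic_on_oops=%d\n", ret, panic_on_oops);
	/* simulate a bare "oops" with no value */
	printf("ret=%d\n", oops_setup_sketch(NULL));
	return 0;
}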
*/ -static int __init oops_dummy(char *s) +static int __init oops_setup(char *s) { - panic_on_oops = 1; - return 1; + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; } -__setup("oops=", oops_dummy); +early_param("oops", oops_setup); static int __init kstack_setup(char *s) { + if (!s) + return -EINVAL; kstack_depth_to_print = simple_strtoul(s,NULL,0); - return 1; + return 0; } -__setup("kstack=", kstack_setup); +early_param("kstack", kstack_setup); #ifdef CONFIG_STACK_UNWIND static int __init call_trace_setup(char *s) { + if (!s) + return -EINVAL; if (strcmp(s, "old") == 0) call_trace = -1; else if (strcmp(s, "both") == 0) @@ -1167,7 +1241,7 @@ static int __init call_trace_setup(char call_trace = 1; else if (strcmp(s, "new") == 0) call_trace = 2; - return 1; + return 0; } -__setup("call_trace=", call_trace_setup); +early_param("call_trace", call_trace_setup); #endif --- head-2011-03-11.orig/arch/x86/kernel/vsyscall_64-xen.c 2007-06-18 08:38:13.000000000 +0200 +++ head-2011-03-11/arch/x86/kernel/vsyscall_64-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -26,6 +26,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -33,11 +37,15 @@ #include #include #include +#include +#include +#include #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) int __sysctl_vsyscall __section_sysctl_vsyscall = 1; seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +int __vgetcpu_mode __section_vgetcpu_mode; #include @@ -61,8 +69,7 @@ static __always_inline void do_vgettimeo sequence = read_seqbegin(&__xtime_lock); sec = __xtime.tv_sec; - usec = (__xtime.tv_nsec / 1000) + - (__jiffies - __wall_jiffies) * (1000000 / HZ); + usec = __xtime.tv_nsec / 1000; if (__vxtime.mode != VXTIME_HPET) { t = get_cycles_sync(); @@ -72,7 +79,8 @@ static __always_inline void do_vgettimeo __vxtime.tsc_quot) >> 32; /* See comment in x86_64 do_gettimeofday. */ } else { - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - + usec += ((readl((void __iomem *) + fix_to_virt(VSYSCALL_HPET) + 0xf0) - __vxtime.last) * __vxtime.quot) >> 32; } } while (read_seqretry(&__xtime_lock, sequence)); @@ -127,9 +135,46 @@ time_t __vsyscall(1) vtime(time_t *t) return __xtime.tv_sec; } -long __vsyscall(2) venosys_0(void) -{ - return -ENOSYS; +/* Fast way to get current CPU and node. + This helps to do per node and per CPU caches in user space. + The result is not guaranteed without CPU affinity, but usually + works out because the scheduler tries to keep a thread on the same + CPU. + + tcache must point to a two element sized long array. + All arguments can be NULL. */ +long __vsyscall(2) +vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) +{ + unsigned int dummy, p; + unsigned long j = 0; + + /* Fast cache - only recompute value once per jiffies and avoid + relatively costly rdtscp/cpuid otherwise. + This works because the scheduler usually keeps the process + on the same CPU and this syscall doesn't guarantee its + results anyways. + We do this here because otherwise user space would do it on + its own in a likely inferior way (no access to jiffies). + If you don't like it pass NULL. 
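/*
 * Context: the three handlers above move from __setup() to
 * early_param(), so they run from parse_early_param() before the
 * console and most subsystems come up, and the return convention flips
 * from __setup()'s "1 = consumed" to "0 = ok / -EINVAL = bad value".
 * A minimal sketch with a made-up "exampleopt" parameter:
 */
static int example_flag __initdata;

static int __init exampleopt_setup(char *s)
{
        if (!s)
                return -EINVAL;         /* "exampleopt" with no value */
        if (!strcmp(s, "on"))
                example_flag = 1;
        return 0;
}
early_param("exampleopt", exampleopt_setup);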
*/ + if (tcache && tcache->blob[0] == (j = __jiffies)) { + p = tcache->blob[1]; + } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ + rdtscp(dummy, dummy, p); + } else { + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + } + if (tcache) { + tcache->blob[0] = j; + tcache->blob[1] = p; + } + if (cpu) + *cpu = p & 0xfff; + if (node) + *node = p >> 12; + return 0; } long __vsyscall(3) venosys_1(void) @@ -149,7 +194,8 @@ static int vsyscall_sysctl_change(ctl_ta void __user *buffer, size_t *lenp, loff_t *ppos) { extern u16 vsysc1, vsysc2; - u16 *map1, *map2; + u16 __iomem *map1; + u16 __iomem *map2; int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (!write) return ret; @@ -164,11 +210,11 @@ static int vsyscall_sysctl_change(ctl_ta goto out; } if (!sysctl_vsyscall) { - *map1 = SYSCALL; - *map2 = SYSCALL; + writew(SYSCALL, map1); + writew(SYSCALL, map2); } else { - *map1 = NOP2; - *map2 = NOP2; + writew(NOP2, map1); + writew(NOP2, map2); } iounmap(map2); out: @@ -200,6 +246,48 @@ static ctl_table kernel_root_table2[] = #endif +/* Assume __initcall executes before all user space. Hopefully kmod + doesn't violate that. We'll find out if it does. */ +static void __cpuinit vsyscall_set_cpu(int cpu) +{ + unsigned long d; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node[cpu]; +#endif + if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) + write_rdtscp_aux((node << 12) | cpu); + + /* Store cpu number in limit so that it can be loaded quickly + in user space in vgetcpu. + 12 bits for the CPU and 8 bits for the node. */ + d = 0x0f40000000000ULL; + d |= cpu; + d |= (node & 0xf) << 12; + d |= (node >> 4) << 48; + if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu) + + GDT_ENTRY_PER_CPU), + d)) + BUG(); +} + +static void __cpuinit cpu_vsyscall_init(void *arg) +{ + /* preemption should be already off */ + vsyscall_set_cpu(raw_smp_processor_id()); +} + +#ifdef CONFIG_HOTPLUG_CPU +static int __cpuinit +cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) +{ + long cpu = (long)arg; + if (action == CPU_ONLINE) + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); + return NOTIFY_DONE; +} +#endif + static void __init map_vsyscall(void) { extern char __vsyscall_0; @@ -214,13 +302,20 @@ static int __init vsyscall_init(void) VSYSCALL_ADDR(__NR_vgettimeofday))); BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); + BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); map_vsyscall(); #ifdef CONFIG_XEN sysctl_vsyscall = 0; /* disable vgettimeofay() */ + if (boot_cpu_has(X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; #endif #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); #endif + on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); + hotcpu_notifier(cpu_vsyscall_notifier, 0); return 0; } --- head-2011-03-11.orig/arch/x86/mm/fault_64-xen.c 2010-09-23 15:39:04.000000000 +0200 +++ head-2011-03-11/arch/x86/mm/fault_64-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -40,8 +40,7 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) -#ifdef CONFIG_KPROBES -ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); /* Hook to register for page fault notifications */ int register_page_fault_notifier(struct notifier_block *nb) @@ -49,11 +48,13 @@ int register_page_fault_notifier(struct vmalloc_sync_all(); 
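/*
 * Context: vsyscall_set_cpu() above hides "node << 12 | cpu" in the
 * segment limit of the GDT_ENTRY_PER_CPU descriptor so user space can
 * recover it with a single unprivileged LSL instruction when RDTSCP is
 * not available. The encode/decode implied by the constants in this
 * hunk, as a sketch (helper names are made up):
 */
static inline unsigned long example_encode_cpu_node(unsigned int cpu,
                                                    unsigned int node)
{
        unsigned long d = 0x0f40000000000ULL;   /* descriptor template */

        d |= cpu;                               /* limit[11:0]  = cpu */
        d |= (unsigned long)(node & 0xf) << 12; /* limit[15:12] = node low */
        d |= (unsigned long)(node >> 4) << 48;  /* limit[19:16] = node high */
        return d;
}

static inline void example_decode_lsl(unsigned int p, unsigned int *cpu,
                                      unsigned int *node)
{
        *cpu  = p & 0xfff;      /* mirrors "*cpu = p & 0xfff" in vgetcpu() */
        *node = p >> 12;
}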
return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); } +EXPORT_SYMBOL_GPL(register_page_fault_notifier); int unregister_page_fault_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); } +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); static inline int notify_page_fault(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) @@ -67,13 +68,6 @@ static inline int notify_page_fault(enum }; return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); } -#else -static inline int notify_page_fault(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) -{ - return NOTIFY_DONE; -} -#endif void bust_spinlocks(int yes) { @@ -102,7 +96,7 @@ void bust_spinlocks(int yes) static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned long error_code) { - unsigned char *instr; + unsigned char __user *instr; int scan_more = 1; int prefetch = 0; unsigned char *max_instr; @@ -111,7 +105,7 @@ static noinline int is_prefetch(struct p if (error_code & PF_INSTR) return 0; - instr = (unsigned char *)convert_rip_to_linear(current, regs); + instr = (unsigned char __user *)convert_rip_to_linear(current, regs); max_instr = instr + 15; if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) @@ -122,7 +116,7 @@ static noinline int is_prefetch(struct p unsigned char instr_hi; unsigned char instr_lo; - if (__get_user(opcode, instr)) + if (__get_user(opcode, (char __user *)instr)) break; instr_hi = opcode & 0xf0; @@ -160,7 +154,7 @@ static noinline int is_prefetch(struct p case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ scan_more = 0; - if (__get_user(opcode, instr)) + if (__get_user(opcode, (char __user *)instr)) break; prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); @@ -176,7 +170,7 @@ static noinline int is_prefetch(struct p static int bad_address(void *p) { unsigned long dummy; - return __get_user(dummy, (unsigned long *)p); + return __get_user(dummy, (unsigned long __user *)p); } void dump_pagetable(unsigned long address) @@ -248,7 +242,7 @@ static int is_errata93(struct pt_regs *r int unhandled_signal(struct task_struct *tsk, int sig) { - if (tsk->pid == 1) + if (is_init(tsk)) return 1; if (tsk->ptrace & PT_PTRACED) return 0; @@ -300,7 +294,7 @@ static int vmalloc_fault(unsigned long a if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else - BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref)); + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); /* Below here mismatches are bugs because these lower tables are shared */ @@ -309,7 +303,7 @@ static int vmalloc_fault(unsigned long a pud_ref = pud_offset(pgd_ref, address); if (pud_none(*pud_ref)) return -1; - if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref)) + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) BUG(); pmd = pmd_offset(pud, address); pmd_ref = pmd_offset(pud_ref, address); @@ -531,7 +525,7 @@ good_area: case PF_PROT: /* read, present */ goto bad_area; case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; } @@ -647,7 +641,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (current->pid == 1) { + if (is_init(current)) { yield(); goto again; } @@ -707,7 +701,7 @@ void vmalloc_sync_all(void) if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else - BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref)); + BUG_ON(pgd_page_vaddr(*pgd) != 
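/*
 * Context: with the CONFIG_KPROBES guard dropped and the register/
 * unregister helpers exported (GPL-only) above, a module can subscribe
 * to page-fault notifications. A hypothetical subscriber against the
 * 2.6.19-era die_args layout:
 */
static int example_pf_event(struct notifier_block *nb, unsigned long val,
                            void *data)
{
        struct die_args *args = data;

        if (val == DIE_PAGE_FAULT)
                printk(KERN_DEBUG "fault: rip %lx err %ld\n",
                       args->regs->rip, args->err);
        return NOTIFY_DONE;     /* never swallow the fault here */
}

static struct notifier_block example_pf_nb = {
        .notifier_call = example_pf_event,
};

/* register_page_fault_notifier(&example_pf_nb);
 * pair with unregister_page_fault_notifier() on unload */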
pgd_page_vaddr(*pgd_ref)); pgd_page_table(unlock, page); } spin_unlock(&pgd_lock); --- head-2011-03-11.orig/arch/x86/mm/init_64-xen.c 2010-04-29 09:34:47.000000000 +0200 +++ head-2011-03-11/arch/x86/mm/init_64-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -61,8 +61,6 @@ EXPORT_SYMBOL(__kernel_page_user); int after_bootmem; -static unsigned long dma_reserve __initdata; - DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); extern unsigned long start_pfn; @@ -420,7 +418,6 @@ __init void *early_ioremap(unsigned long /* actually usually some more */ if (size >= LARGE_PAGE_SIZE) { - printk("SMBIOS area too long %lu\n", size); return NULL; } set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); @@ -442,16 +439,24 @@ __init void early_iounmap(void *addr, un #endif static void __meminit -phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { - int i, k; + int i = pmd_index(address); - for (i = 0; i < PTRS_PER_PMD; pmd++, i++) { + for (; i < PTRS_PER_PMD; i++) { unsigned long pte_phys; + pmd_t *pmd = pmd_page + i; pte_t *pte, *pte_save; + int k; if (address >= end) break; + + if (__pmd_val(*pmd)) { + address += PMD_SIZE; + continue; + } + pte = alloc_static_page(&pte_phys); pte_save = pte; for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) { @@ -478,40 +483,35 @@ phys_pmd_init(pmd_t *pmd, unsigned long static void __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) { - pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address)); - - if (pmd_none(*pmd)) { - spin_lock(&init_mm.page_table_lock); - phys_pmd_init(pmd, address, end); - spin_unlock(&init_mm.page_table_lock); - __flush_tlb_all(); - } + pmd_t *pmd = pmd_offset(pud,0); + spin_lock(&init_mm.page_table_lock); + phys_pmd_init(pmd, address, end); + spin_unlock(&init_mm.page_table_lock); + __flush_tlb_all(); } -static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) { - long i = pud_index(address); - - pud = pud + i; - - if (after_bootmem && pud_val(*pud)) { - phys_pmd_update(pud, address, end); - return; - } + int i = pud_index(addr); - for (; i < PTRS_PER_PUD; pud++, i++) { - unsigned long paddr, pmd_phys; + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { + unsigned long pmd_phys; + pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; - paddr = (address & PGDIR_MASK) + i*PUD_SIZE; - if (paddr >= end) + if (addr >= end) break; + if (__pud_val(*pud)) { + phys_pmd_update(pud, addr, end); + continue; + } + pmd = alloc_static_page(&pmd_phys); spin_lock(&init_mm.page_table_lock); *pud = __pud(pmd_phys | _KERNPG_TABLE); - phys_pmd_init(pmd, paddr, end); + phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); early_make_page_readonly(pmd, XENFEAT_writable_page_tables); @@ -800,69 +800,18 @@ void __cpuinit zap_low_mappings(int cpu) #endif } -/* Compute zone sizes for the DMA and DMA32 zones in a node. 
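/*
 * Context: the rewritten phys_pmd_init()/phys_pud_init() above receive
 * the whole table page plus the start address, index into it, and skip
 * slots that are already populated, which is what makes re-invoking
 * them for memory hotplug safe. The skeleton of that walk (simplified;
 * allocation, locking and the Xen read-only handling elided):
 */
static void example_pmd_walk(pmd_t *pmd_page, unsigned long addr,
                             unsigned long end)
{
        int i = pmd_index(addr);

        for (; i < PTRS_PER_PMD; i++, addr = (addr & PMD_MASK) + PMD_SIZE) {
                pmd_t *pmd = pmd_page + i;

                if (addr >= end)
                        break;
                if (pmd_val(*pmd))
                        continue;       /* already mapped: leave intact */
                /* ... allocate and fill a PTE page covering addr ... */
        }
}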
*/ -__init void -size_zones(unsigned long *z, unsigned long *h, - unsigned long start_pfn, unsigned long end_pfn) -{ - int i; - unsigned long w; - - for (i = 0; i < MAX_NR_ZONES; i++) - z[i] = 0; - - if (start_pfn < MAX_DMA_PFN) - z[ZONE_DMA] = MAX_DMA_PFN - start_pfn; - if (start_pfn < MAX_DMA32_PFN) { - unsigned long dma32_pfn = MAX_DMA32_PFN; - if (dma32_pfn > end_pfn) - dma32_pfn = end_pfn; - z[ZONE_DMA32] = dma32_pfn - start_pfn; - } - z[ZONE_NORMAL] = end_pfn - start_pfn; - - /* Remove lower zones from higher ones. */ - w = 0; - for (i = 0; i < MAX_NR_ZONES; i++) { - if (z[i]) - z[i] -= w; - w += z[i]; - } - - /* Compute holes */ - w = start_pfn; - for (i = 0; i < MAX_NR_ZONES; i++) { - unsigned long s = w; - w += z[i]; - h[i] = e820_hole_size(s, w); - } - - /* Add the space pace needed for mem_map to the holes too. */ - for (i = 0; i < MAX_NR_ZONES; i++) - h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE; - - /* The 16MB DMA zone has the kernel and other misc mappings. - Account them too */ - if (h[ZONE_DMA]) { - h[ZONE_DMA] += dma_reserve; - if (h[ZONE_DMA] >= z[ZONE_DMA]) { - printk(KERN_WARNING - "Kernel too large and filling up ZONE_DMA?\n"); - h[ZONE_DMA] = z[ZONE_DMA]; - } - } -} - #ifndef CONFIG_NUMA void __init paging_init(void) { - unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES]; + unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; + max_zone_pfns[ZONE_NORMAL] = end_pfn; memory_present(0, 0, end_pfn); sparse_init(); - size_zones(zones, holes, 0, end_pfn); - free_area_init_node(0, NODE_DATA(0), zones, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); + free_area_init_nodes(max_zone_pfns); init_mm.context.pinned = 1; } @@ -916,36 +865,23 @@ void online_page(struct page *page) #ifdef CONFIG_MEMORY_HOTPLUG /* - * XXX: memory_add_physaddr_to_nid() is to find node id from physical address - * via probe interface of sysfs. If acpi notifies hot-add event, then it - * can tell node id by searching dsdt. But, probe interface doesn't have - * node id. So, return 0 as node id at this time. - */ -#ifdef CONFIG_NUMA -int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -#endif - -/* * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. */ int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2; + struct zone *zone = pgdat->node_zones + ZONE_NORMAL; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; + init_memory_mapping(start, (start + size -1)); + ret = __add_pages(zone, start_pfn, nr_pages); if (ret) goto error; - init_memory_mapping(start, (start + size -1)); - return ret; error: printk("%s: Problem encountered in __add_pages!\n", __func__); @@ -959,7 +895,17 @@ int remove_memory(u64 start, u64 size) } EXPORT_SYMBOL_GPL(remove_memory); -#else /* CONFIG_MEMORY_HOTPLUG */ +#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) +int memory_add_physaddr_to_nid(u64 start) +{ + return 0; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif + +#endif /* CONFIG_MEMORY_HOTPLUG */ + +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE /* * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, * just online the pages. 
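/*
 * Context: paging_init() above adopts the generic
 * free_area_init_nodes() scheme: the architecture only reports the top
 * PFN of each zone and the core computes zone sizes and holes from the
 * registered ranges, making the hand-rolled size_zones() redundant.
 * The entire arch-side contract shrinks to this pattern (sketch):
 */
static void __init example_paging_init(unsigned long end_pfn)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES];

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA]    = MAX_DMA_PFN;       /* below 16MB */
        max_zone_pfns[ZONE_DMA32]  = MAX_DMA32_PFN;     /* below 4GB  */
        max_zone_pfns[ZONE_NORMAL] = end_pfn;
        free_area_init_nodes(max_zone_pfns);
}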
@@ -985,7 +931,7 @@ int __add_pages(struct zone *z, unsigned } return err; } -#endif /* CONFIG_MEMORY_HOTPLUG */ +#endif static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; @@ -997,12 +943,6 @@ void __init mem_init(void) pci_iommu_alloc(); - /* How many end-of-memory variables you have, grandma! */ - max_low_pfn = end_pfn; - max_pfn = end_pfn; - num_physpages = end_pfn; - high_memory = (void *) __va(end_pfn * PAGE_SIZE); - /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); @@ -1014,13 +954,13 @@ void __init mem_init(void) #else totalram_pages = free_all_bootmem(); #endif - /* XEN: init and count pages outside initial allocation. */ + /* XEN: init pages outside initial allocation. */ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { ClearPageReserved(pfn_to_page(pfn)); init_page_count(pfn_to_page(pfn)); - totalram_pages++; } - reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn); + reservedpages = end_pfn - totalram_pages - + absent_pages_in_range(0, end_pfn); after_bootmem = 1; @@ -1127,15 +1067,32 @@ void free_initrd_mem(unsigned long start void __init reserve_bootmem_generic(unsigned long phys, unsigned len) { - /* Should check here against the e820 map to avoid double free */ #ifdef CONFIG_NUMA int nid = phys_to_nid(phys); +#endif + unsigned long pfn = phys >> PAGE_SHIFT; + if (pfn >= end_pfn) { + /* This can happen with kdump kernels when accessing firmware + tables. */ + if (pfn < end_pfn_map) + return; + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", + phys, len); + return; + } + + /* Should check here against the e820 map to avoid double free */ +#ifdef CONFIG_NUMA reserve_bootmem_node(NODE_DATA(nid), phys, len); #else reserve_bootmem(phys, len); #endif - if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) +#ifndef CONFIG_XEN + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { dma_reserve += len / PAGE_SIZE; + set_dma_reserve(dma_reserve); + } +#endif } int kern_addr_valid(unsigned long addr) --- head-2011-03-11.orig/arch/x86/mm/pageattr_64-xen.c 2009-03-18 10:39:31.000000000 +0100 +++ head-2011-03-11/arch/x86/mm/pageattr_64-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -377,8 +377,8 @@ static void revert_page(unsigned long ad BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); BUG_ON(__pmd_val(*pmd) & _PAGE_PSE); - pgprot_val(ref_prot) |= _PAGE_PSE; large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); + large_pte = pte_mkhuge(large_pte); set_pte((pte_t *)pmd, large_pte); } @@ -388,32 +388,28 @@ __change_page_attr(unsigned long address { pte_t *kpte; struct page *kpte_page; - unsigned kpte_flags; pgprot_t ref_prot2; kpte = lookup_address(address); if (!kpte) return 0; kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); - kpte_flags = pte_val(*kpte); if (pgprot_val(prot) != pgprot_val(ref_prot)) { - if ((kpte_flags & _PAGE_PSE) == 0) { + if (!pte_huge(*kpte)) { set_pte(kpte, pfn_pte(pfn, prot)); } else { /* * split_large_page will take the reference for this * change_page_attr on the split page. 
*/ - struct page *split; - ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE)); - + ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); split = split_large_page(address, prot, ref_prot2); if (!split) return -ENOMEM; - set_pte(kpte,mk_pte(split, ref_prot2)); + set_pte(kpte, mk_pte(split, ref_prot2)); kpte_page = split; - } + } page_private(kpte_page)++; - } else if ((kpte_flags & _PAGE_PSE) == 0) { + } else if (!pte_huge(*kpte)) { set_pte(kpte, pfn_pte(pfn, ref_prot)); BUG_ON(page_private(kpte_page) == 0); page_private(kpte_page)--; @@ -470,10 +466,12 @@ int change_page_attr_addr(unsigned long * lowmem */ if (__pa(address) < KERNEL_TEXT_SIZE) { unsigned long addr2; - pgprot_t prot2 = prot; + pgprot_t prot2; addr2 = __START_KERNEL_map + __pa(address); - pgprot_val(prot2) &= ~_PAGE_NX; - err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC); + /* Make sure the kernel mappings stay executable */ + prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); + err = __change_page_attr(addr2, pfn, prot2, + PAGE_KERNEL_EXEC); } } up_write(&init_mm.mmap_sem); --- head-2011-03-11.orig/drivers/char/tpm/tpm_xen.c 2011-01-31 14:53:38.000000000 +0100 +++ head-2011-03-11/drivers/char/tpm/tpm_xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -85,8 +85,7 @@ static struct tpm_private *my_priv; /* local function prototypes */ static irqreturn_t tpmif_int(int irq, - void *tpm_priv, - struct pt_regs *ptregs); + void *tpm_priv); static void tpmif_rx_action(unsigned long unused); static int tpmif_connect(struct xenbus_device *dev, struct tpm_private *tp, @@ -559,7 +558,7 @@ static void tpmif_rx_action(unsigned lon } -static irqreturn_t tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs) +static irqreturn_t tpmif_int(int irq, void *tpm_priv) { struct tpm_private *tp = tpm_priv; unsigned long flags; --- head-2011-03-11.orig/drivers/pci/Kconfig 2011-01-31 14:32:40.000000000 +0100 +++ head-2011-03-11/drivers/pci/Kconfig 2011-01-31 17:29:16.000000000 +0100 @@ -86,7 +86,7 @@ config XEN_PCIDEV_FE_DEBUG config HT_IRQ bool "Interrupts on hypertransport devices" default y - depends on PCI && X86_LOCAL_APIC && X86_IO_APIC + depends on PCI && X86_LOCAL_APIC && X86_IO_APIC && !XEN help This allows native hypertransport devices to use interrupts. --- head-2011-03-11.orig/drivers/pci/msi-xen.c 2009-12-04 08:45:56.000000000 +0100 +++ head-2011-03-11/drivers/pci/msi-xen.c 2011-01-31 17:29:16.000000000 +0100 @@ -6,6 +6,7 @@ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) */ +#include #include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include @@ -26,14 +28,6 @@ static int pci_msi_enable = 1; -static struct msi_ops *msi_ops; - -int msi_register(struct msi_ops *ops) -{ - msi_ops = ops; - return 0; -} - static LIST_HEAD(msi_dev_head); DEFINE_SPINLOCK(msi_dev_lock); @@ -481,9 +475,9 @@ void pci_restore_msix_state(struct pci_d * @dev: pointer to the pci_dev data structure of MSI device function * * Setup the MSI capability structure of device function with a single - * MSI vector, regardless of device function is capable of handling + * MSI irq, regardless of device function is capable of handling * multiple messages. A return of zero indicates the successful setup - * of an entry zero with the new MSI vector or non-zero for otherwise. + * of an entry zero with the new MSI irq or non-zero for otherwise. 
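/*
 * Context: the open-coded _PAGE_PSE tests in __change_page_attr() give
 * way to the pte_huge()/pte_clrhuge()/pte_mkhuge() helpers used above.
 * Roughly what they do, simplified from the 2.6.19 definitions (the
 * Xen variants go through the machine-address-aware macros instead):
 */
static inline int example_pte_huge(pte_t pte)
{
        return pte_val(pte) & _PAGE_PSE;
}

static inline pte_t example_pte_clrhuge(pte_t pte)
{
        return __pte(pte_val(pte) & ~(unsigned long)_PAGE_PSE);
}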
**/ static int msi_capability_init(struct pci_dev *dev) { @@ -497,11 +491,11 @@ static int msi_capability_init(struct pc if (pirq < 0) return -EBUSY; - dev->irq = pirq; /* Set MSI enabled bits */ enable_msi_mode(dev, pos, PCI_CAP_ID_MSI); dev->msi_enabled = 1; + dev->irq = pirq; return 0; } @@ -512,8 +506,8 @@ static int msi_capability_init(struct pc * @nvec: number of @entries * * Setup the MSI-X capability structure of device function with a - * single MSI-X vector. A return of zero indicates the successful setup of - * requested MSI-X entries with allocated vectors or non-zero for otherwise. + * single MSI-X irq. A return of zero indicates the successful setup of + * requested MSI-X entries with allocated irqs or non-zero for otherwise. **/ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, int nvec) @@ -562,12 +556,18 @@ static int msix_capability_init(struct p } if (i != nvec) { + int avail = i - 1; for (j = --i; j >= 0; j--) { msi_unmap_pirq(dev, entries[j].vector); detach_pirq_entry(entries[j].entry, msi_dev_entry); entries[j].vector = 0; } - return -EBUSY; + /* If we had some success report the number of irqs + * we succeeded in setting up. + */ + if (avail <= 0) + avail = -EBUSY; + return avail; } enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); @@ -577,11 +577,40 @@ static int msix_capability_init(struct p } /** + * pci_msi_supported - check whether MSI may be enabled on device + * @dev: pointer to the pci_dev data structure of MSI device function + * + * Look at global flags, the device itself, and its parent busses + * to return 0 if MSI are supported for the device. + **/ +static +int pci_msi_supported(struct pci_dev * dev) +{ + struct pci_bus *bus; + + /* MSI must be globally enabled and supported by the device */ + if (!pci_msi_enable || !dev || dev->no_msi) + return -EINVAL; + + /* Any bridge which does NOT route MSI transactions from it's + * secondary bus to it's primary bus must set NO_MSI flag on + * the secondary pci_bus. + * We expect only arch-specific PCI host bus controller driver + * or quirks for specific PCI bridges to be setting NO_MSI. + */ + for (bus = dev->bus; bus; bus = bus->parent) + if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI) + return -EINVAL; + + return 0; +} + +/** * pci_enable_msi - configure device's MSI capability structure * @dev: pointer to the pci_dev data structure of MSI device function * * Setup the MSI capability structure of device function with - * a single MSI vector upon its software driver call to request for + * a single MSI irq upon its software driver call to request for * MSI mode enabled on its hardware device function. A return of zero * indicates the successful setup of an entry zero with the new MSI * vector or non-zero for otherwise. 
@@ -589,19 +618,11 @@ static int msix_capability_init(struct p extern int pci_frontend_enable_msi(struct pci_dev *dev); int pci_enable_msi(struct pci_dev* dev) { - struct pci_bus *bus; - int pos, temp, status = -EINVAL; + int pos, temp, status; struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); - if (!pci_msi_enable || !dev) - return status; - - if (dev->no_msi) - return status; - - for (bus = dev->bus; bus; bus = bus->parent) - if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI) - return -EINVAL; + if (pci_msi_supported(dev) < 0) + return -EINVAL; status = msi_init(); if (status < 0) @@ -631,10 +652,10 @@ int pci_enable_msi(struct pci_dev* dev) if (!pos) return -EINVAL; - /* Check whether driver already requested for MSI-X vectors */ + /* Check whether driver already requested for MSI-X irqs */ if (dev->msix_enabled) { printk(KERN_INFO "PCI: %s: Can't enable MSI. " - "Device already has MSI-X vectors assigned\n", + "Device already has MSI-X irq assigned\n", pci_name(dev)); dev->irq = temp; return -EINVAL; @@ -699,37 +720,29 @@ void pci_disable_msi(struct pci_dev* dev * pci_enable_msix - configure device's MSI-X capability structure * @dev: pointer to the pci_dev data structure of MSI-X device function * @entries: pointer to an array of MSI-X entries - * @nvec: number of MSI-X vectors requested for allocation by device driver + * @nvec: number of MSI-X irqs requested for allocation by device driver * * Setup the MSI-X capability structure of device function with the number - * of requested vectors upon its software driver call to request for + * of requested irqs upon its software driver call to request for * MSI-X mode enabled on its hardware device function. A return of zero * indicates the successful configuration of MSI-X capability structure - * with new allocated MSI-X vectors. A return of < 0 indicates a failure. + * with new allocated MSI-X irqs. A return of < 0 indicates a failure. * Or a return of > 0 indicates that driver request is exceeding the number - * of vectors available. Driver should use the returned value to re-send + * of irqs available. Driver should use the returned value to re-send * its request. **/ extern int pci_frontend_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec); int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec) { - struct pci_bus *bus; int status, pos, nr_entries; int i, j, temp; u16 control; struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); - if (!pci_msi_enable || !dev || !entries) + if (!entries || pci_msi_supported(dev) < 0) return -EINVAL; - if (dev->no_msi) - return -EINVAL; - - for (bus = dev->bus; bus; bus = bus->parent) - if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI) - return -EINVAL; - #ifdef CONFIG_XEN_PCIDEV_FRONTEND if (!is_initial_xendomain()) { struct msi_pirq_entry *pirq_entry; @@ -793,7 +806,7 @@ int pci_enable_msix(struct pci_dev* dev, /* Check whether driver already requested for MSI vector */ if (dev->msi_enabled) { printk(KERN_INFO "PCI: %s: Can't enable MSI-X. 
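/*
 * Context: pci_enable_msix() can now return a positive count when
 * fewer MSI-X irqs are available than requested (see the "avail"
 * handling in msix_capability_init() above), and the comment says the
 * driver should re-send with that number. The canonical caller loop
 * this enables, as a sketch:
 */
static int example_enable_msix(struct pci_dev *pdev,
                               struct msix_entry *entries, int nvec)
{
        int rc;

        while ((rc = pci_enable_msix(pdev, entries, nvec)) > 0)
                nvec = rc;      /* > 0: retry with what is available */

        return rc;              /* 0 = success, < 0 = hard failure */
}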
" - "Device already has an MSI vector assigned\n", + "Device already has an MSI irq assigned\n", pci_name(dev)); dev->irq = temp; return -EINVAL; @@ -861,11 +874,11 @@ void pci_disable_msix(struct pci_dev* de } /** - * msi_remove_pci_irq_vectors - reclaim MSI(X) vectors to unused state + * msi_remove_pci_irq_vectors - reclaim MSI(X) irqs to unused state * @dev: pointer to the pci_dev data structure of MSI(X) device function * * Being called during hotplug remove, from which the device function - * is hot-removed. All previous assigned MSI/MSI-X vectors, if + * is hot-removed. All previous assigned MSI/MSI-X irqs, if * allocated for this device function, are reclaimed to unused state, * which may be used later on. **/ --- head-2011-03-11.orig/drivers/xen/Kconfig 2011-02-24 14:05:09.000000000 +0100 +++ head-2011-03-11/drivers/xen/Kconfig 2011-01-31 17:29:16.000000000 +0100 @@ -332,6 +332,10 @@ endmenu config HAVE_IRQ_IGNORE_UNHANDLED def_bool y +config GENERIC_HARDIRQS_NO__DO_IRQ + def_bool y + depends on X86 + config NO_IDLE_HZ def_bool y --- head-2011-03-11.orig/drivers/xen/balloon/balloon.c 2010-03-31 09:56:02.000000000 +0200 +++ head-2011-03-11/drivers/xen/balloon/balloon.c 2011-01-31 17:29:16.000000000 +0100 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -81,11 +82,7 @@ struct balloon_stats balloon_stats; /* We increase/decrease in batches which fit in a page */ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; -/* VM /proc information for memory */ -extern unsigned long totalram_pages; - -#ifndef MODULE -extern unsigned long totalhigh_pages; +#if !defined(MODULE) && defined(CONFIG_HIGHMEM) #define inc_totalhigh_pages() (totalhigh_pages++) #define dec_totalhigh_pages() (totalhigh_pages--) #else @@ -133,29 +130,44 @@ static struct timer_list balloon_timer; printk(KERN_WARNING "xen_mem: " fmt, ##args) /* balloon_append: add the given page to the balloon. */ -static void balloon_append(struct page *page) +static void balloon_append(struct page *page, int account) { + unsigned long pfn; + /* Lowmem is re-populated first, so highmem pages go at list tail. */ if (PageHighMem(page)) { list_add_tail(PAGE_TO_LIST(page), &ballooned_pages); bs.balloon_high++; - dec_totalhigh_pages(); + if (account) + dec_totalhigh_pages(); } else { list_add(PAGE_TO_LIST(page), &ballooned_pages); bs.balloon_low++; } + + pfn = page_to_pfn(page); + if (account) { + SetPageReserved(page); + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + page_zone(page)->present_pages--; + } else { + BUG_ON(!PageReserved(page)); + WARN_ON_ONCE(phys_to_machine_mapping_valid(pfn)); + } } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. 
*/ -static struct page *balloon_retrieve(void) +static struct page *balloon_retrieve(int *was_empty) { struct page *page; + struct zone *zone; if (list_empty(&ballooned_pages)) return NULL; page = LIST_TO_PAGE(ballooned_pages.next); UNLIST_PAGE(page); + BUG_ON(!PageReserved(page)); if (PageHighMem(page)) { bs.balloon_high--; @@ -163,6 +175,9 @@ static struct page *balloon_retrieve(voi } else bs.balloon_low--; + zone = page_zone(page); + *was_empty |= !populated_zone(zone); + zone->present_pages++; return page; } @@ -248,6 +263,7 @@ static int increase_reservation(unsigned unsigned long pfn, i, flags; struct page *page; long rc; + int need_zonelists_rebuild = 0; struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, @@ -273,7 +289,7 @@ static int increase_reservation(unsigned goto out; for (i = 0; i < rc; i++) { - page = balloon_retrieve(); + page = balloon_retrieve(&need_zonelists_rebuild); BUG_ON(page == NULL); pfn = page_to_pfn(page); @@ -306,6 +322,18 @@ static int increase_reservation(unsigned out: balloon_unlock(flags); +#ifndef MODULE + setup_per_zone_pages_min(); +# if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) \ + || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE) + /* build_all_zonelists() is __meminit */ + if (need_zonelists_rebuild) + build_all_zonelists(); + else +# endif + vm_total_pages = nr_free_pagecache_pages(); +#endif + return rc < 0 ? rc : rc != nr_pages; } @@ -364,8 +392,7 @@ static int decrease_reservation(unsigned /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { pfn = mfn_to_pfn(frame_list[i]); - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); - balloon_append(pfn_to_page(pfn)); + balloon_append(pfn_to_page(pfn), 1); } set_xen_guest_handle(reservation.extent_start, frame_list); @@ -582,8 +609,11 @@ static int __init balloon_init(void) /* Initialise the balloon with excess memory space. 
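/*
 * Context: balloon_append(page, 1) above now centralizes the
 * bookkeeping that used to be scattered across callers: reserve the
 * page, invalidate its P2M slot, and shrink the owning zone's
 * present_pages so watermarks stay truthful; balloon_retrieve() undoes
 * it and flags when a formerly empty zone forces a zonelist rebuild.
 * The "account" half in isolation (sketch):
 */
static void example_account_ballooned_page(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);

        SetPageReserved(page);                  /* out of VM circulation */
        set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
        page_zone(page)->present_pages--;       /* zone logically shrinks */
}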
*/ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { page = pfn_to_page(pfn); - if (!PageReserved(page)) - balloon_append(page); + if (!PageReserved(page)) { + SetPageReserved(page); + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + balloon_append(page, 0); + } } #endif @@ -618,7 +648,7 @@ void balloon_update_driver_allowance(lon static int dealloc_pte_fn( pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) { - unsigned long mfn = pte_mfn(*pte); + unsigned long pfn, mfn = pte_mfn(*pte); int ret; struct xen_memory_reservation reservation = { .nr_extents = 1, @@ -627,7 +657,9 @@ static int dealloc_pte_fn( }; set_xen_guest_handle(reservation.extent_start, &mfn); set_pte_at(&init_mm, addr, pte, __pte_ma(0)); - set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY); + pfn = __pa(addr) >> PAGE_SHIFT; + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + SetPageReserved(pfn_to_page(pfn)); ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); BUG_ON(ret != 1); return 0; @@ -696,6 +728,9 @@ struct page **alloc_empty_pages_and_page } totalram_pages = --bs.current_pages - totalram_bias; + if (PageHighMem(page)) + dec_totalhigh_pages(); + page_zone(page)->present_pages--; balloon_unlock(flags); } @@ -710,7 +745,7 @@ struct page **alloc_empty_pages_and_page err: balloon_lock(flags); while (--i >= 0) - balloon_append(pagevec[i]); + balloon_append(pagevec[i], 0); balloon_unlock(flags); kfree(pagevec); pagevec = NULL; @@ -728,7 +763,7 @@ void free_empty_pages_and_pagevec(struct balloon_lock(flags); for (i = 0; i < nr_pages; i++) { BUG_ON(page_count(pagevec[i]) != 1); - balloon_append(pagevec[i]); + balloon_append(pagevec[i], 0); } balloon_unlock(flags); @@ -742,7 +777,8 @@ void balloon_release_driver_page(struct unsigned long flags; balloon_lock(flags); - balloon_append(page); + balloon_append(page, 1); + totalram_pages = --bs.current_pages - totalram_bias; bs.driver_pages--; balloon_unlock(flags); --- head-2011-03-11.orig/drivers/xen/blkback/blkback.c 2010-09-23 15:39:04.000000000 +0200 +++ head-2011-03-11/drivers/xen/blkback/blkback.c 2011-01-31 17:29:16.000000000 +0100 @@ -294,7 +294,7 @@ static void blkif_notify_work(blkif_t *b wake_up(&blkif->wq); } -irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t blkif_be_int(int irq, void *dev_id) { blkif_notify_work(dev_id); return IRQ_HANDLED; --- head-2011-03-11.orig/drivers/xen/blkback/common.h 2010-09-23 15:39:04.000000000 +0200 +++ head-2011-03-11/drivers/xen/blkback/common.h 2011-01-31 17:29:16.000000000 +0100 @@ -144,7 +144,7 @@ void blkif_interface_init(void); void blkif_xenbus_init(void); -irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); +irqreturn_t blkif_be_int(int irq, void *dev_id); int blkif_schedule(void *arg); int blkback_barrier(struct xenbus_transaction xbt, --- head-2011-03-11.orig/drivers/xen/blkfront/blkfront.c 2010-09-23 15:39:04.000000000 +0200 +++ head-2011-03-11/drivers/xen/blkfront/blkfront.c 2011-01-31 17:29:16.000000000 +0100 @@ -70,9 +70,9 @@ static int setup_blkring(struct xenbus_d static void kick_pending_request_queues(struct blkfront_info *); -static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs); +static irqreturn_t blkif_int(int irq, void *dev_id); static void blkif_restart_queue(void *arg); -static void blkif_recover(struct blkfront_info *); +static int blkif_recover(struct blkfront_info *); static void blkif_completion(struct blk_shadow *); static void blkif_free(struct blkfront_info *, int); @@ -149,7 
+149,7 @@ static int blkfront_resume(struct xenbus err = talk_to_backend(dev, info); if (info->connected == BLKIF_STATE_SUSPENDED && !err) - blkif_recover(info); + err = blkif_recover(info); return err; } @@ -743,7 +743,7 @@ void do_blkif_request(request_queue_t *r } -static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) +static irqreturn_t blkif_int(int irq, void *dev_id) { struct request *req; blkif_response_t *bret; @@ -854,7 +854,7 @@ static void blkif_completion(struct blk_ gnttab_end_foreign_access(s->req.seg[i].gref, 0UL); } -static void blkif_recover(struct blkfront_info *info) +static int blkif_recover(struct blkfront_info *info) { int i; blkif_request_t *req; @@ -862,8 +862,10 @@ static void blkif_recover(struct blkfron int j; /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmalloc(sizeof(info->shadow), GFP_NOIO | __GFP_NOFAIL | __GFP_HIGH); - memcpy(copy, info->shadow, sizeof(info->shadow)); + copy = kmemdup(info->shadow, sizeof(info->shadow), + GFP_NOIO | __GFP_NOFAIL | __GFP_HIGH); + if (!copy) + return -ENOMEM; /* Stage 2: Set up free list. */ memset(&info->shadow, 0, sizeof(info->shadow)); @@ -917,6 +919,8 @@ static void blkif_recover(struct blkfron kick_pending_request_queues(info); spin_unlock_irq(&blkif_io_lock); + + return 0; } int blkfront_is_ready(struct xenbus_device *dev) --- head-2011-03-11.orig/drivers/xen/blktap/blktap.c 2011-02-17 09:58:10.000000000 +0100 +++ head-2011-03-11/drivers/xen/blktap/blktap.c 2011-02-17 10:07:17.000000000 +0100 @@ -1272,7 +1272,7 @@ static void blkif_notify_work(blkif_t *b wake_up(&blkif->wq); } -irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t tap_blkif_be_int(int irq, void *dev_id) { blkif_notify_work(dev_id); return IRQ_HANDLED; --- head-2011-03-11.orig/drivers/xen/blktap/common.h 2008-09-15 13:40:15.000000000 +0200 +++ head-2011-03-11/drivers/xen/blktap/common.h 2011-01-31 17:29:16.000000000 +0100 @@ -113,7 +113,7 @@ void tap_blkif_interface_init(void); void tap_blkif_xenbus_init(void); -irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); +irqreturn_t tap_blkif_be_int(int irq, void *dev_id); int tap_blkif_schedule(void *arg); int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif); --- head-2011-03-11.orig/drivers/xen/blktap2/sysfs.c 2011-03-02 12:00:16.000000000 +0100 +++ head-2011-03-11/drivers/xen/blktap2/sysfs.c 2011-01-31 17:29:16.000000000 +0100 @@ -150,7 +150,7 @@ blktap_sysfs_pause_device(struct class_d err = blktap_device_pause(tap); if (!err) { class_device_remove_file(dev, &class_device_attr_pause); - class_device_create_file(dev, &class_device_attr_resume); + err = class_device_create_file(dev, &class_device_attr_resume); } out: @@ -182,7 +182,7 @@ blktap_sysfs_resume_device(struct class_ err = blktap_device_resume(tap); if (!err) { class_device_remove_file(dev, &class_device_attr_resume); - class_device_create_file(dev, &class_device_attr_pause); + err = class_device_create_file(dev, &class_device_attr_pause); } out: @@ -292,6 +292,7 @@ blktap_sysfs_create(struct blktap *tap) { struct blktap_ring *ring; struct class_device *dev; + int err, state = 0; if (!class) return -ENODEV; @@ -310,12 +311,27 @@ blktap_sysfs_create(struct blktap *tap) atomic_set(&ring->sysfs_refcnt, 0); set_bit(BLKTAP_SYSFS, &tap->dev_inuse); - class_device_create_file(dev, &class_device_attr_name); - class_device_create_file(dev, &class_device_attr_remove); - class_device_create_file(dev, &class_device_attr_pause); - class_device_create_file(dev, 
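/*
 * Context: blkif_recover() above switches to kmemdup(), new in 2.6.19,
 * and finally checks the result even though __GFP_NOFAIL is still
 * passed. kmemdup() is just the familiar pair, shown here for
 * reference:
 */
static void *example_kmemdup(const void *src, size_t len, gfp_t gfp)
{
        void *p = kmalloc(len, gfp);

        if (p)
                memcpy(p, src, len);
        return p;
}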
&class_device_attr_debug); + err = class_device_create_file(dev, &class_device_attr_name); + if (!err) { + ++state; + err = class_device_create_file(dev, &class_device_attr_remove); + } + if (!err) { + ++state; + err = class_device_create_file(dev, &class_device_attr_pause); + } + if (!err) { + ++state; + err = class_device_create_file(dev, &class_device_attr_debug); + } - return 0; + switch (state * !!err) { + case 3: class_device_remove_file(dev, &class_device_attr_pause); + case 2: class_device_remove_file(dev, &class_device_attr_remove); + case 1: class_device_remove_file(dev, &class_device_attr_name); + } + + return err; } int @@ -409,6 +425,7 @@ int __init blktap_sysfs_init(void) { struct class *cls; + int err; if (class) return -EEXIST; @@ -417,9 +434,16 @@ blktap_sysfs_init(void) if (IS_ERR(cls)) return PTR_ERR(cls); - class_create_file(cls, &class_attr_verbosity); - class_create_file(cls, &class_attr_devices); + err = class_create_file(cls, &class_attr_verbosity); + if (!err) { + err = class_create_file(cls, &class_attr_devices); + if (err) + class_remove_file(cls, &class_attr_verbosity); + } + if (!err) + class = cls; + else + class_destroy(cls); - class = cls; - return 0; + return err; } --- head-2011-03-11.orig/drivers/xen/console/console.c 2011-01-31 17:02:29.000000000 +0100 +++ head-2011-03-11/drivers/xen/console/console.c 2011-01-31 17:29:16.000000000 +0100 @@ -360,7 +360,7 @@ static struct tty_struct *xencons_tty; static int xencons_priv_irq; static char x_char; -void xencons_rx(char *buf, unsigned len, struct pt_regs *regs) +void xencons_rx(char *buf, unsigned len) { int i; unsigned long flags; @@ -385,8 +385,7 @@ void xencons_rx(char *buf, unsigned len, if (time_before(jiffies, sysrq_timeout)) { spin_unlock_irqrestore( &xencons_lock, flags); - handle_sysrq( - buf[i], regs, xencons_tty); + handle_sysrq(buf[i], xencons_tty); spin_lock_irqsave( &xencons_lock, flags); continue; @@ -451,14 +450,13 @@ void xencons_tx(void) } /* Privileged receive callback and transmit kicker. 
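/*
 * Context: blktap_sysfs_create() above starts honoring the return
 * value of class_device_create_file(). In the "switch (state * !!err)"
 * idiom, state counts successful creations and multiplying by !!err
 * collapses the all-good case to 0 so no case fires; on error the
 * fall-through removes files in reverse order. The same idiom reduced
 * to two files (sketch reusing the attribute names from this hunk):
 */
static int example_create_files(struct class_device *dev)
{
        int err, state = 0;

        err = class_device_create_file(dev, &class_device_attr_name);
        if (!err) {
                ++state;
                err = class_device_create_file(dev,
                                               &class_device_attr_remove);
        }

        switch (state * !!err) {        /* 0 unless a later step failed */
        case 1: class_device_remove_file(dev, &class_device_attr_name);
        }
        return err;
}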
*/ -static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id, - struct pt_regs *regs) +static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id) { static char rbuf[16]; int l; while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0) - xencons_rx(rbuf, l, regs); + xencons_rx(rbuf, l); xencons_tx(); @@ -646,7 +644,7 @@ static void xencons_close(struct tty_str spin_unlock_irqrestore(&xencons_lock, flags); } -static struct tty_operations xencons_ops = { +static const struct tty_operations xencons_ops = { .open = xencons_open, .close = xencons_close, .write = xencons_write, --- head-2011-03-11.orig/drivers/xen/console/xencons_ring.c 2007-06-12 13:13:44.000000000 +0200 +++ head-2011-03-11/drivers/xen/console/xencons_ring.c 2011-01-31 17:29:16.000000000 +0100 @@ -83,7 +83,7 @@ int xencons_ring_send(const char *data, return sent; } -static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs) +static irqreturn_t handle_input(int irq, void *unused) { struct xencons_interface *intf = xencons_interface(); XENCONS_RING_IDX cons, prod; @@ -94,7 +94,7 @@ static irqreturn_t handle_input(int irq, BUG_ON((prod - cons) > sizeof(intf->in)); while (cons != prod) { - xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs); + xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1); cons++; } --- head-2011-03-11.orig/drivers/xen/core/evtchn.c 2010-11-25 09:36:37.000000000 +0100 +++ head-2011-03-11/drivers/xen/core/evtchn.c 2011-01-31 17:29:16.000000000 +0100 @@ -539,7 +539,7 @@ static void unbind_from_irq(unsigned int int bind_caller_port_to_irqhandler( unsigned int caller_port, - irqreturn_t (*handler)(int, void *, struct pt_regs *), + irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) @@ -562,7 +562,7 @@ EXPORT_SYMBOL_GPL(bind_caller_port_to_ir int bind_listening_port_to_irqhandler( unsigned int remote_domain, - irqreturn_t (*handler)(int, void *, struct pt_regs *), + irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) @@ -586,7 +586,7 @@ EXPORT_SYMBOL_GPL(bind_listening_port_to int bind_interdomain_evtchn_to_irqhandler( unsigned int remote_domain, unsigned int remote_port, - irqreturn_t (*handler)(int, void *, struct pt_regs *), + irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) @@ -610,7 +610,7 @@ EXPORT_SYMBOL_GPL(bind_interdomain_evtch int bind_virq_to_irqhandler( unsigned int virq, unsigned int cpu, - irqreturn_t (*handler)(int, void *, struct pt_regs *), + irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) @@ -634,7 +634,7 @@ EXPORT_SYMBOL_GPL(bind_virq_to_irqhandle int bind_ipi_to_irqhandler( unsigned int ipi, unsigned int cpu, - irqreturn_t (*handler)(int, void *, struct pt_regs *), + irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) @@ -710,16 +710,15 @@ int resend_irq_on_evtchn(unsigned int ir * Interface to generic handling in irq.c */ -static unsigned int startup_dynirq(unsigned int irq) +static void unmask_dynirq(unsigned int irq) { int evtchn = evtchn_from_irq(irq); if (VALID_EVTCHN(evtchn)) unmask_evtchn(evtchn); - return 0; } -static void shutdown_dynirq(unsigned int irq) +static void mask_dynirq(unsigned int irq) { int evtchn = evtchn_from_irq(irq); @@ -727,28 +726,18 @@ static void shutdown_dynirq(unsigned int mask_evtchn(evtchn); } -static void enable_dynirq(unsigned int irq) +static unsigned int startup_dynirq(unsigned int irq) { - int evtchn = evtchn_from_irq(irq); - - if 
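/*
 * Context: every handler in these hunks loses its pt_regs argument,
 * since 2.6.19's genirq passes only (irq, dev_id). A handler that
 * still needs the interrupted register state fetches it with
 * get_irq_regs(), as a sketch:
 */
static irqreturn_t example_evtchn_int(int irq, void *dev_id)
{
        struct pt_regs *regs = get_irq_regs();  /* only if truly needed */

        (void)regs;     /* e.g. hand to sysrq or profiling code */
        return IRQ_HANDLED;
}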
(VALID_EVTCHN(evtchn)) - unmask_evtchn(evtchn); + unmask_dynirq(irq); + return 0; } -static void disable_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - mask_evtchn(evtchn); -} +#define shutdown_dynirq mask_dynirq static void ack_dynirq(unsigned int irq) { int evtchn = evtchn_from_irq(irq); - move_native_irq(irq); - if (VALID_EVTCHN(evtchn)) { mask_evtchn(evtchn); clear_evtchn(evtchn); @@ -757,20 +746,23 @@ static void ack_dynirq(unsigned int irq) static void end_dynirq(unsigned int irq) { - int evtchn = evtchn_from_irq(irq); + move_masked_irq(irq); - if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED)) - unmask_evtchn(evtchn); + if (!(irq_desc[irq].status & IRQ_DISABLED)) + unmask_dynirq(irq); } -static struct hw_interrupt_type dynirq_type = { - .typename = "Dynamic-irq", +static struct irq_chip dynirq_chip = { + .name = "Dynamic", .startup = startup_dynirq, .shutdown = shutdown_dynirq, - .enable = enable_dynirq, - .disable = disable_dynirq, + .enable = unmask_dynirq, + .disable = mask_dynirq, + .mask = mask_dynirq, + .unmask = unmask_dynirq, .ack = ack_dynirq, .end = end_dynirq, + .eoi = end_dynirq, #ifdef CONFIG_SMP .set_affinity = set_affinity_irq, #endif @@ -832,7 +824,7 @@ static inline void pirq_query_unmask(int */ #define probing_irq(_irq) (irq_desc[(_irq)].action == NULL) -static unsigned int startup_pirq(unsigned int irq) +static void enable_pirq(unsigned int irq) { struct evtchn_bind_pirq bind_pirq; int evtchn = evtchn_from_irq(irq); @@ -847,7 +839,7 @@ static unsigned int startup_pirq(unsigne if (!probing_irq(irq)) printk(KERN_INFO "Failed to obtain physical IRQ %d\n", irq); - return 0; + return; } evtchn = bind_pirq.port; @@ -859,7 +851,13 @@ static unsigned int startup_pirq(unsigne out: pirq_unmask_and_notify(evtchn, irq); +} + +#define disable_pirq mask_pirq +static unsigned int startup_pirq(unsigned int irq) +{ + enable_pirq(irq); return 0; } @@ -882,46 +880,39 @@ static void shutdown_pirq(unsigned int i irq_info[irq] = mk_irq_info(IRQT_PIRQ, index_from_irq(irq), 0); } -static void enable_pirq(unsigned int irq) -{ - startup_pirq(irq); -} - -static void disable_pirq(unsigned int irq) -{ -} - -static void ack_pirq(unsigned int irq) +static void unmask_pirq(unsigned int irq) { int evtchn = evtchn_from_irq(irq); - move_native_irq(irq); - - if (VALID_EVTCHN(evtchn)) { - mask_evtchn(evtchn); - clear_evtchn(evtchn); - } + if (VALID_EVTCHN(evtchn)) + pirq_unmask_and_notify(evtchn, irq); } +#define mask_pirq mask_dynirq +#define ack_pirq ack_dynirq + static void end_pirq(unsigned int irq) { - int evtchn = evtchn_from_irq(irq); + move_masked_irq(irq); if ((irq_desc[irq].status & (IRQ_DISABLED|IRQ_PENDING)) == - (IRQ_DISABLED|IRQ_PENDING)) { + (IRQ_DISABLED|IRQ_PENDING)) shutdown_pirq(irq); - } else if (VALID_EVTCHN(evtchn)) - pirq_unmask_and_notify(evtchn, irq); + else + unmask_pirq(irq); } -static struct hw_interrupt_type pirq_type = { - .typename = "Phys-irq", +static struct irq_chip pirq_chip = { + .name = "Phys", .startup = startup_pirq, .shutdown = shutdown_pirq, .enable = enable_pirq, .disable = disable_pirq, + .mask = mask_pirq, + .unmask = unmask_pirq, .ack = ack_pirq, .end = end_pirq, + .eoi = end_pirq, #ifdef CONFIG_SMP .set_affinity = set_affinity_irq, #endif @@ -1104,7 +1095,8 @@ void evtchn_register_pirq(int irq) if (identity_mapped_irq(irq) || type_from_irq(irq) != IRQT_UNBOUND) return; irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, 0); - irq_desc[irq].chip = &pirq_type; + set_irq_chip_and_handler_name(irq, 
&pirq_chip, handle_fasteoi_irq, + "fasteoi"); } int evtchn_map_pirq(int irq, int xen_pirq) @@ -1127,11 +1119,18 @@ int evtchn_map_pirq(int irq, int xen_pir spin_unlock(&irq_alloc_lock); if (irq < PIRQ_BASE) return -ENOSPC; - irq_desc[irq].chip = &pirq_type; + set_irq_chip_and_handler_name(irq, &pirq_chip, + handle_fasteoi_irq, "fasteoi"); } else if (!xen_pirq) { if (unlikely(type_from_irq(irq) != IRQT_PIRQ)) return -EINVAL; - irq_desc[irq].chip = &no_irq_type; + /* + * dynamic_irq_cleanup(irq) would seem to be the correct thing + * here, but cannot be used as we get here also during shutdown + * when a driver didn't free_irq() its MSI(-X) IRQ(s), which + * then causes a warning in dynamic_irq_cleanup(). + */ + set_irq_chip_and_handler(irq, NULL, NULL); irq_info[irq] = IRQ_UNBOUND; return 0; } else if (type_from_irq(irq) != IRQT_PIRQ @@ -1177,10 +1176,9 @@ void __init xen_init_IRQ(void) for (i = DYNIRQ_BASE; i < (DYNIRQ_BASE + NR_DYNIRQS); i++) { irq_bindcount[i] = 0; - irq_desc[i].status = IRQ_DISABLED|IRQ_NOPROBE; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - irq_desc[i].chip = &dynirq_type; + irq_desc[i].status |= IRQ_NOPROBE; + set_irq_chip_and_handler_name(i, &dynirq_chip, + handle_fasteoi_irq, "fasteoi"); } /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */ @@ -1196,9 +1194,7 @@ void __init xen_init_IRQ(void) continue; #endif - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - irq_desc[i].chip = &pirq_type; + set_irq_chip_and_handler_name(i, &pirq_chip, + handle_fasteoi_irq, "fasteoi"); } } --- head-2011-03-11.orig/drivers/xen/core/gnttab.c 2010-09-23 15:39:04.000000000 +0200 +++ head-2011-03-11/drivers/xen/core/gnttab.c 2011-01-31 17:29:16.000000000 +0100 @@ -510,6 +510,7 @@ static void gnttab_page_free(struct page BUG_ON(order); ClearPageForeign(page); gnttab_reset_grant_page(page); + ClearPageReserved(page); put_page(page); } @@ -587,6 +588,8 @@ int gnttab_copy_grant_page(grant_ref_t r new_page->mapping = page->mapping; new_page->index = page->index; set_bit(PG_foreign, &new_page->flags); + if (PageReserved(page)) + SetPageReserved(new_page); *pagep = new_page; SetPageForeign(page, gnttab_page_free); --- head-2011-03-11.orig/drivers/xen/core/reboot.c 2010-11-25 09:36:37.000000000 +0100 +++ head-2011-03-11/drivers/xen/core/reboot.c 2011-01-31 17:29:16.000000000 +0100 @@ -1,4 +1,3 @@ -#define __KERNEL_SYSCALLS__ #include #include #include @@ -14,6 +13,7 @@ #ifdef HAVE_XEN_PLATFORM_COMPAT_H #include +#undef handle_sysrq #endif MODULE_LICENSE("Dual BSD/GPL"); @@ -231,7 +231,7 @@ static void sysrq_handler(struct xenbus_ #ifdef CONFIG_MAGIC_SYSRQ if (sysrq_key != '\0') - handle_sysrq(sysrq_key, NULL, NULL); + handle_sysrq(sysrq_key, NULL); #endif } @@ -245,7 +245,7 @@ static struct xenbus_watch sysrq_watch = .callback = sysrq_handler }; -static irqreturn_t suspend_int(int irq, void* dev_id, struct pt_regs *ptregs) +static irqreturn_t suspend_int(int irq, void* dev_id) { switch_shutdown_state(SHUTDOWN_SUSPEND); return IRQ_HANDLED; --- head-2011-03-11.orig/drivers/xen/core/smpboot.c 2011-01-31 17:01:49.000000000 +0100 +++ head-2011-03-11/drivers/xen/core/smpboot.c 2011-01-31 17:29:16.000000000 +0100 @@ -25,8 +25,8 @@ #include #include -extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *); -extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *); +extern irqreturn_t smp_reschedule_interrupt(int, void *); +extern irqreturn_t smp_call_function_interrupt(int, void *); extern int 
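/*
 * Context: the event-channel code above stops hand-rolling
 * hw_interrupt_type and instead registers struct irq_chip instances
 * with the generic fasteoi flow. Very roughly, handle_fasteoi_irq()
 * drives the chip like this (heavily simplified; the real handler also
 * deals with IRQ_INPROGRESS, per-desc locking, and masking of
 * unhandled interrupts):
 */
static void example_fasteoi_flow(unsigned int irq, struct irq_desc *desc)
{
        if (desc->action && !(desc->status & IRQ_DISABLED))
                handle_IRQ_event(irq, desc->action);

        desc->chip->eoi(irq);   /* end_dynirq()/end_pirq() above */
}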
local_setup_timer(unsigned int cpu); extern void local_teardown_timer(unsigned int cpu); @@ -59,8 +59,6 @@ cpumask_t cpu_core_map[NR_CPUS] __cachel #if defined(__i386__) u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff }; EXPORT_SYMBOL(x86_cpu_to_apicid); -#elif !defined(CONFIG_X86_IO_APIC) -unsigned int maxcpus = NR_CPUS; #endif void __init prefill_possible_map(void) --- head-2011-03-11.orig/drivers/xen/fbfront/xenfb.c 2011-03-02 12:00:16.000000000 +0100 +++ head-2011-03-11/drivers/xen/fbfront/xenfb.c 2011-01-31 17:29:16.000000000 +0100 @@ -524,8 +524,7 @@ static struct fb_ops xenfb_fb_ops = { .fb_set_par = xenfb_set_par, }; -static irqreturn_t xenfb_event_handler(int rq, void *dev_id, - struct pt_regs *regs) +static irqreturn_t xenfb_event_handler(int rq, void *dev_id) { /* * No in events recognized, simply ignore them all. --- head-2011-03-11.orig/drivers/xen/fbfront/xenkbd.c 2008-04-02 12:34:02.000000000 +0200 +++ head-2011-03-11/drivers/xen/fbfront/xenkbd.c 2011-01-31 17:29:16.000000000 +0100 @@ -46,7 +46,7 @@ static void xenkbd_disconnect_backend(st * to do that. */ -static irqreturn_t input_handler(int rq, void *dev_id, struct pt_regs *regs) +static irqreturn_t input_handler(int rq, void *dev_id) { struct xenkbd_info *info = dev_id; struct xenkbd_page *page = info->page; --- head-2011-03-11.orig/drivers/xen/gntdev/gntdev.c 2011-01-03 12:43:21.000000000 +0100 +++ head-2011-03-11/drivers/xen/gntdev/gntdev.c 2011-01-31 17:29:16.000000000 +0100 @@ -744,9 +744,6 @@ static pte_t gntdev_clear_pte(struct vm_ BUG(); } - /* Copy the existing value of the PTE for returning. */ - copy = *ptep; - /* Calculate the grant relating to this PTE. */ slot_index = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT); @@ -761,6 +758,10 @@ static pte_t gntdev_clear_pte(struct vm_ GNTDEV_INVALID_HANDLE && !xen_feature(XENFEAT_auto_translated_physmap)) { /* NOT USING SHADOW PAGE TABLES. */ + + /* Copy the existing value of the PTE for returning. */ + copy = *ptep; + gnttab_set_unmap_op(&op, ptep_to_machine(ptep), GNTMAP_contains_pte, private_data->grants[slot_index] @@ -773,7 +774,7 @@ static pte_t gntdev_clear_pte(struct vm_ op.status); } else { /* USING SHADOW PAGE TABLES. */ - pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm); + copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm); } /* Finally, we unmap the grant from kernel space. */ @@ -801,7 +802,7 @@ static pte_t gntdev_clear_pte(struct vm_ INVALID_P2M_ENTRY); } else { - pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm); + copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm); } return copy; --- head-2011-03-11.orig/drivers/xen/netback/accel.c 2008-01-07 13:19:18.000000000 +0100 +++ head-2011-03-11/drivers/xen/netback/accel.c 2011-01-31 17:29:16.000000000 +0100 @@ -65,7 +65,7 @@ static int match_accelerator(struct xenb if (IS_ERR(eth_name)) { /* Probably means not present */ - DPRINTK("%s: no match due to xenbus_read accel error %d\n", + DPRINTK("%s: no match due to xenbus_read accel error %ld\n", __FUNCTION__, PTR_ERR(eth_name)); return 0; } else { --- head-2011-03-11.orig/drivers/xen/netback/common.h 2011-03-01 11:33:08.000000000 +0100 +++ head-2011-03-11/drivers/xen/netback/common.h 2011-02-17 10:07:22.000000000 +0100 @@ -101,6 +101,7 @@ typedef struct netif_st { /* Statistics */ unsigned long nr_copied_skbs; + unsigned long rx_gso_csum_fixups; /* Miscellaneous private stuff. 
*/ struct list_head list; /* scheduling list */ @@ -209,7 +210,7 @@ void netif_deschedule_work(netif_t *neti int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev); struct net_device_stats *netif_be_get_stats(struct net_device *dev); -irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs); +irqreturn_t netif_be_int(int irq, void *dev_id); static inline int netbk_can_queue(struct net_device *dev) { --- head-2011-03-11.orig/drivers/xen/netback/interface.c 2011-02-17 09:58:10.000000000 +0100 +++ head-2011-03-11/drivers/xen/netback/interface.c 2011-02-17 10:07:27.000000000 +0100 @@ -170,6 +170,7 @@ static const struct netif_stat { u16 offset; } netbk_stats[] = { { "copied_skbs", offsetof(netif_t, nr_copied_skbs) / sizeof(long) }, + { "rx_gso_csum_fixups", offsetof(netif_t, rx_gso_csum_fixups) / sizeof(long) }, }; static int netbk_get_stats_count(struct net_device *dev) --- head-2011-03-11.orig/drivers/xen/netback/loopback.c 2011-01-03 12:43:21.000000000 +0100 +++ head-2011-03-11/drivers/xen/netback/loopback.c 2011-01-31 17:29:16.000000000 +0100 @@ -152,16 +152,6 @@ static int loopback_start_xmit(struct sk np->stats.rx_bytes += skb->len; np->stats.rx_packets++; - if (skb->ip_summed == CHECKSUM_HW) { - /* Defer checksum calculation. */ - skb->proto_csum_blank = 1; - /* Must be a local packet: assert its integrity. */ - skb->proto_data_valid = 1; - } - - skb->ip_summed = skb->proto_data_valid ? - CHECKSUM_UNNECESSARY : CHECKSUM_NONE; - skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */ skb->protocol = eth_type_trans(skb, dev); skb->dev = dev; --- head-2011-03-11.orig/drivers/xen/netback/netback.c 2011-02-17 09:58:10.000000000 +0100 +++ head-2011-03-11/drivers/xen/netback/netback.c 2011-02-09 15:35:10.000000000 +0100 @@ -39,6 +39,7 @@ #include #include #include +#include /*define NETBE_DEBUG_INTERRUPT*/ @@ -314,7 +315,6 @@ int netif_be_start_xmit(struct sk_buff * /* Copy only the header fields we use in this driver. */ nskb->dev = skb->dev; nskb->ip_summed = skb->ip_summed; - nskb->proto_data_valid = skb->proto_data_valid; dev_kfree_skb(skb); skb = nskb; } @@ -706,10 +706,14 @@ static void net_rx_action(unsigned long id = meta[npo.meta_cons].id; flags = nr_frags ? NETRXF_more_data : 0; - if (skb->ip_summed == CHECKSUM_HW) /* local packet? */ + switch (skb->ip_summed) { + case CHECKSUM_PARTIAL: /* local packet? */ flags |= NETRXF_csum_blank | NETRXF_data_validated; - else if (skb->proto_data_valid) /* remote but checksummed? */ + break; + case CHECKSUM_UNNECESSARY: /* remote but checksummed? */ flags |= NETRXF_data_validated; + break; + } if (meta[npo.meta_cons].copy) offset = 0; @@ -1451,18 +1455,12 @@ static void net_tx_action(unsigned long netif_idx_release(pending_idx); } - /* - * Old frontends do not assert data_validated but we - * can infer it from csum_blank so test both flags. 
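The same kernel release retired CHECKSUM_HW, splitting it into CHECKSUM_PARTIAL (transmit: checksum still to be filled in) and CHECKSUM_COMPLETE (receive: hardware already summed the data), and the Xen-private proto_csum_blank/proto_data_valid skb fields go away with it. The switch in net_rx_action() above derives the ring flags directly from ip_summed; restated as a stand-alone helper (the helper name is illustrative, not part of the patch):

#include <linux/skbuff.h>
#include <xen/interface/io/netif.h>

/* ip_summed -> netif RX response flags, as in net_rx_action() above. */
static inline u16 example_rx_flags(const struct sk_buff *skb)
{
	switch (skb->ip_summed) {
	case CHECKSUM_PARTIAL:		/* local packet, csum not yet written */
		return NETRXF_csum_blank | NETRXF_data_validated;
	case CHECKSUM_UNNECESSARY:	/* validated somewhere else */
		return NETRXF_data_validated;
	default:			/* CHECKSUM_NONE: frontend must check */
		return 0;
	}
}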
- */ - if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) { + if (txp->flags & NETTXF_csum_blank) + skb->ip_summed = CHECKSUM_PARTIAL; + else if (txp->flags & NETTXF_data_validated) skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->proto_data_valid = 1; - } else { + else skb->ip_summed = CHECKSUM_NONE; - skb->proto_data_valid = 0; - } - skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank); netbk_fill_frags(skb); @@ -1479,6 +1477,12 @@ static void net_tx_action(unsigned long skb->dev = netif->dev; skb->protocol = eth_type_trans(skb, skb->dev); + if (skb_checksum_setup(skb, &netif->rx_gso_csum_fixups)) { + DPRINTK("Can't setup checksum in net_tx_action\n"); + kfree_skb(skb); + continue; + } + netif->stats.rx_bytes += skb->len; netif->stats.rx_packets++; @@ -1527,7 +1531,7 @@ static void netif_page_release(struct pa netif_idx_release(idx); } -irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t netif_be_int(int irq, void *dev_id) { netif_t *netif = dev_id; @@ -1594,7 +1598,7 @@ static netif_rx_response_t *make_rx_resp } #ifdef NETBE_DEBUG_INTERRUPT -static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) +static irqreturn_t netif_be_dbg(int irq, void *dev_id) { struct list_head *ent; netif_t *netif; --- head-2011-03-11.orig/drivers/xen/netfront/netfront.c 2010-11-25 09:36:37.000000000 +0100 +++ head-2011-03-11/drivers/xen/netfront/netfront.c 2011-02-09 15:35:31.000000000 +0100 @@ -63,6 +63,7 @@ #include #include #include +#include struct netfront_cb { struct page *page; @@ -136,7 +137,7 @@ static inline int netif_needs_gso(struct { return skb_is_gso(skb) && (!skb_gso_ok(skb, dev->features) || - unlikely(skb->ip_summed != CHECKSUM_HW)); + unlikely(skb->ip_summed != CHECKSUM_PARTIAL)); } #else #define HAVE_GSO 0 @@ -222,7 +223,7 @@ static void network_tx_buf_gc(struct net static void network_alloc_rx_buffers(struct net_device *); static void send_fake_arp(struct net_device *); -static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs); +static irqreturn_t netif_int(int irq, void *dev_id); #ifdef CONFIG_SYSFS static int xennet_sysfs_addif(struct net_device *netdev); @@ -992,12 +993,10 @@ static int network_start_xmit(struct sk_ tx->flags = 0; extra = NULL; - if (skb->ip_summed == CHECKSUM_HW) /* local packet? */ + if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */ tx->flags |= NETTXF_csum_blank | NETTXF_data_validated; -#ifdef CONFIG_XEN - if (skb->proto_data_valid) /* remote but checksummed? */ + else if (skb->ip_summed == CHECKSUM_UNNECESSARY) tx->flags |= NETTXF_data_validated; -#endif #if HAVE_TSO if (skb_shinfo(skb)->gso_size) { @@ -1049,7 +1048,7 @@ static int network_start_xmit(struct sk_ return 0; } -static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs) +static irqreturn_t netif_int(int irq, void *dev_id) { struct net_device *dev = dev_id; struct netfront_info *np = netdev_priv(dev); @@ -1424,18 +1423,13 @@ err: skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len); skb->len += skb->data_len; - /* - * Old backends do not assert data_validated but we - * can infer it from csum_blank so test both flags. 
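On the reverse path (guest transmit, net_tx_action() above) the NETTXF flags map back onto ip_summed, and the new skb_checksum_setup() call repairs CHECKSUM_PARTIAL packets whose checksum field the frontend left blank, counting each repair in the rx_gso_csum_fixups statistic exported via ethtool. The helper's body is not part of this hunk; a simplified sketch of what such a fixup has to do on a 2.6.19 sk_buff, assuming an IPv4 packet with nh.raw already set (the real code covers more cases):

#include <linux/errno.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/tcp.h>
#include <linux/udp.h>

/* Locate the L4 checksum field so it can be completed later; on 2.6.19,
 * skb->csum holds the checksum offset relative to h.raw. */
static int example_checksum_setup(struct sk_buff *skb, unsigned long *fixups)
{
	struct iphdr *iph;

	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;			/* nothing to repair */

	iph = (struct iphdr *)skb->nh.raw;
	skb->h.raw = skb->nh.raw + 4 * iph->ihl;
	switch (iph->protocol) {
	case IPPROTO_TCP:
		skb->csum = offsetof(struct tcphdr, check);
		break;
	case IPPROTO_UDP:
		skb->csum = offsetof(struct udphdr, check);
		break;
	default:
		return -EPROTO;		/* cannot locate the checksum field */
	}
	++*fixups;
	return 0;
}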
- */ - if (rx->flags & (NETRXF_data_validated|NETRXF_csum_blank)) + if (rx->flags & NETRXF_csum_blank) + skb->ip_summed = CHECKSUM_PARTIAL; + else if (rx->flags & NETRXF_data_validated) skb->ip_summed = CHECKSUM_UNNECESSARY; else skb->ip_summed = CHECKSUM_NONE; -#ifdef CONFIG_XEN - skb->proto_data_valid = (skb->ip_summed != CHECKSUM_NONE); - skb->proto_csum_blank = !!(rx->flags & NETRXF_csum_blank); -#endif + np->stats.rx_packets++; np->stats.rx_bytes += skb->len; @@ -1480,6 +1474,11 @@ err: /* Ethernet work: Delayed to here as it peeks the header. */ skb->protocol = eth_type_trans(skb, dev); + if (skb_checksum_setup(skb, &np->rx_gso_csum_fixups)) { + kfree_skb(skb); + continue; + } + /* Pass it up. */ netif_receive_skb(skb); dev->last_rx = jiffies; @@ -1772,6 +1771,44 @@ static void xennet_set_features(struct n xennet_set_tso(dev, 1); } +static const struct xennet_stat { + char name[ETH_GSTRING_LEN]; + u16 offset; +} xennet_stats[] = { + { + "rx_gso_csum_fixups", + offsetof(struct netfront_info, rx_gso_csum_fixups) / sizeof(long) + }, +}; + +static int xennet_get_stats_count(struct net_device *dev) +{ + return ARRAY_SIZE(xennet_stats); +} + +static void xennet_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 *data) +{ + unsigned long *np = netdev_priv(dev); + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(xennet_stats); i++) + data[i] = np[xennet_stats[i].offset]; +} + +static void xennet_get_strings(struct net_device *dev, u32 stringset, u8 *data) +{ + unsigned int i; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < ARRAY_SIZE(xennet_stats); i++) + memcpy(data + i * ETH_GSTRING_LEN, + xennet_stats[i].name, ETH_GSTRING_LEN); + break; + } +} + static void netfront_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { @@ -1897,6 +1934,10 @@ static struct ethtool_ops network_ethtoo .set_tso = xennet_set_tso, #endif .get_link = ethtool_op_get_link, + + .get_stats_count = xennet_get_stats_count, + .get_ethtool_stats = xennet_get_ethtool_stats, + .get_strings = xennet_get_strings, }; #ifdef CONFIG_SYSFS --- head-2011-03-11.orig/drivers/xen/netfront/netfront.h 2010-02-24 13:13:46.000000000 +0100 +++ head-2011-03-11/drivers/xen/netfront/netfront.h 2011-02-09 15:35:17.000000000 +0100 @@ -150,6 +150,7 @@ struct netfront_info { struct net_device *netdev; struct net_device_stats stats; + unsigned long rx_gso_csum_fixups; struct netif_tx_front_ring tx; struct netif_rx_front_ring rx; --- head-2011-03-11.orig/drivers/xen/pciback/pciback.h 2009-03-18 10:39:32.000000000 +0100 +++ head-2011-03-11/drivers/xen/pciback/pciback.h 2011-01-31 17:29:16.000000000 +0100 @@ -99,7 +99,7 @@ int pciback_publish_pci_roots(struct pci void pciback_release_devices(struct pciback_device *pdev); /* Handles events from front-end */ -irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs); +irqreturn_t pciback_handle_event(int irq, void *dev_id); void pciback_do_op(void *data); int pciback_xenbus_register(void); --- head-2011-03-11.orig/drivers/xen/pciback/pciback_ops.c 2011-02-17 09:58:10.000000000 +0100 +++ head-2011-03-11/drivers/xen/pciback/pciback_ops.c 2011-02-17 10:07:33.000000000 +0100 @@ -132,7 +132,7 @@ void pciback_do_op(void *data) test_and_schedule_op(pdev); } -irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t pciback_handle_event(int irq, void *dev_id) { struct pciback_device *pdev = dev_id; --- head-2011-03-11.orig/drivers/xen/pcifront/pcifront.h 2010-10-05 09:58:12.000000000 +0200 +++ 
head-2011-03-11/drivers/xen/pcifront/pcifront.h 2011-01-31 17:29:16.000000000 +0100 @@ -51,6 +51,6 @@ void pcifront_free_roots(struct pcifront void pcifront_do_aer( void *data); -irqreturn_t pcifront_handler_aer(int irq, void *dev, struct pt_regs *regs); +irqreturn_t pcifront_handler_aer(int irq, void *dev); #endif /* __XEN_PCIFRONT_H__ */ --- head-2011-03-11.orig/drivers/xen/pcifront/pci_op.c 2010-11-25 09:36:37.000000000 +0100 +++ head-2011-03-11/drivers/xen/pcifront/pci_op.c 2011-01-31 17:29:16.000000000 +0100 @@ -662,7 +662,7 @@ void pcifront_do_aer(void *data) } -irqreturn_t pcifront_handler_aer(int irq, void *dev, struct pt_regs *regs) +irqreturn_t pcifront_handler_aer(int irq, void *dev) { struct pcifront_device *pdev = dev; schedule_pcifront_aer_op(pdev); --- head-2011-03-11.orig/drivers/xen/privcmd/compat_privcmd.c 2010-01-27 14:01:48.000000000 +0100 +++ head-2011-03-11/drivers/xen/privcmd/compat_privcmd.c 2011-01-31 17:29:16.000000000 +0100 @@ -18,7 +18,6 @@ * Authors: Jimi Xenidis */ -#include #include #include #include --- head-2011-03-11.orig/drivers/xen/privcmd/privcmd.c 2010-01-27 14:01:48.000000000 +0100 +++ head-2011-03-11/drivers/xen/privcmd/privcmd.c 2011-01-31 17:29:16.000000000 +0100 @@ -71,43 +71,16 @@ static long privcmd_ioctl(struct file *f if (copy_from_user(&hypercall, udata, sizeof(hypercall))) return -EFAULT; +#ifdef CONFIG_X86 ret = -ENOSYS; -#if defined(__i386__) if (hypercall.op >= (PAGE_SIZE >> 5)) break; - __asm__ __volatile__ ( - "pushl %%ebx; pushl %%ecx; pushl %%edx; " - "pushl %%esi; pushl %%edi; " - "movl 8(%%eax),%%ebx ;" - "movl 16(%%eax),%%ecx ;" - "movl 24(%%eax),%%edx ;" - "movl 32(%%eax),%%esi ;" - "movl 40(%%eax),%%edi ;" - "movl (%%eax),%%eax ;" - "shll $5,%%eax ;" - "addl $hypercall_page,%%eax ;" - "call *%%eax ;" - "popl %%edi; popl %%esi; popl %%edx; " - "popl %%ecx; popl %%ebx" - : "=a" (ret) : "0" (&hypercall) : "memory" ); -#elif defined (__x86_64__) - if (hypercall.op < (PAGE_SIZE >> 5)) { - long ign1, ign2, ign3; - __asm__ __volatile__ ( - "movq %8,%%r10; movq %9,%%r8;" - "shll $5,%%eax ;" - "addq $hypercall_page,%%rax ;" - "call *%%rax" - : "=a" (ret), "=D" (ign1), - "=S" (ign2), "=d" (ign3) - : "0" ((unsigned int)hypercall.op), - "1" (hypercall.arg[0]), - "2" (hypercall.arg[1]), - "3" (hypercall.arg[2]), - "g" (hypercall.arg[3]), - "g" (hypercall.arg[4]) - : "r8", "r10", "memory" ); - } + ret = _hypercall(long, (unsigned int)hypercall.op, + (unsigned long)hypercall.arg[0], + (unsigned long)hypercall.arg[1], + (unsigned long)hypercall.arg[2], + (unsigned long)hypercall.arg[3], + (unsigned long)hypercall.arg[4]); #else ret = privcmd_hypercall(&hypercall); #endif @@ -446,7 +419,7 @@ static int privcmd_mmap(struct file * fi return -ENOSYS; /* DONTCOPY is essential for Xen as copy_page_range is broken. 
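The open-coded pushl/popl (i386) and r10/r8 (x86-64) call sequences deleted from privcmd_ioctl() above are replaced by a single _hypercall() macro, defined in the hypercall_32.h and hypercall_64.h hunks later in this patch. The trick is GCC's named register variables: each argument is pinned to the register the hypervisor ABI expects and the compiler takes care of saving and restoring. Condensed to two arguments with i386 register names (a sketch of the idiom, not the macro itself):

/* Two-argument flavour of the _hypercall() idiom added below. */
static inline long example_hypercall2(unsigned int op,
				      unsigned long a1, unsigned long a2)
{
	long ret;
	register unsigned long __arg1 asm("ebx") = a1;
	register unsigned long __arg2 asm("ecx") = a2;

	asm volatile("call *%3"		/* %3 = eax = entry stub address */
		     : "=a" (ret), "+r" (__arg1), "+r" (__arg2)
		     : "0" (hypercall_page + op * 32)
		     : "memory");
	return ret;
}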
*/ - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; + vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTCOPY; vma->vm_ops = &privcmd_vm_ops; vma->vm_private_data = NULL; --- head-2011-03-11.orig/drivers/xen/scsiback/common.h 2009-03-18 10:39:32.000000000 +0100 +++ head-2011-03-11/drivers/xen/scsiback/common.h 2011-01-31 17:29:16.000000000 +0100 @@ -147,7 +147,7 @@ typedef struct { #define VSCSI_TYPE_HOST 1 -irqreturn_t scsiback_intr(int, void *, struct pt_regs *); +irqreturn_t scsiback_intr(int, void *); int scsiback_init_sring(struct vscsibk_info *info, unsigned long ring_ref, unsigned int evtchn); int scsiback_schedule(void *data); --- head-2011-03-11.orig/drivers/xen/scsiback/scsiback.c 2010-09-23 15:39:04.000000000 +0200 +++ head-2011-03-11/drivers/xen/scsiback/scsiback.c 2011-01-31 17:29:16.000000000 +0100 @@ -459,7 +459,7 @@ void scsiback_cmd_exec(pending_req_t *pe write = (data_dir == DMA_TO_DEVICE); rq = blk_get_request(pending_req->sdev->request_queue, write, GFP_KERNEL); - rq->flags |= REQ_BLOCK_PC; + rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->cmd_len = cmd_len; memcpy(rq->cmd, pending_req->cmnd, cmd_len); @@ -503,7 +503,7 @@ static void scsiback_device_reset_exec(p } -irqreturn_t scsiback_intr(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t scsiback_intr(int irq, void *dev_id) { scsiback_notify_work((struct vscsibk_info *)dev_id); return IRQ_HANDLED; --- head-2011-03-11.orig/drivers/xen/scsifront/common.h 2010-02-24 13:13:46.000000000 +0100 +++ head-2011-03-11/drivers/xen/scsifront/common.h 2011-01-31 17:29:16.000000000 +0100 @@ -128,7 +128,7 @@ struct vscsifrnt_info { int scsifront_xenbus_init(void); void scsifront_xenbus_unregister(void); int scsifront_schedule(void *data); -irqreturn_t scsifront_intr(int irq, void *dev_id, struct pt_regs *ptregs); +irqreturn_t scsifront_intr(int irq, void *dev_id); int scsifront_cmd_done(struct vscsifrnt_info *info); --- head-2011-03-11.orig/drivers/xen/scsifront/scsifront.c 2011-02-02 12:19:11.000000000 +0100 +++ head-2011-03-11/drivers/xen/scsifront/scsifront.c 2011-01-31 17:29:16.000000000 +0100 @@ -100,7 +100,7 @@ static void scsifront_do_request(struct notify_remote_via_irq(irq); } -irqreturn_t scsifront_intr(int irq, void *dev_id, struct pt_regs *ptregs) +irqreturn_t scsifront_intr(int irq, void *dev_id) { scsifront_notify_work((struct vscsifrnt_info *)dev_id); return IRQ_HANDLED; --- head-2011-03-11.orig/drivers/xen/sfc_netback/accel_xenbus.c 2010-01-04 11:56:34.000000000 +0100 +++ head-2011-03-11/drivers/xen/sfc_netback/accel_xenbus.c 2011-01-31 17:29:16.000000000 +0100 @@ -69,8 +69,7 @@ static void unlink_bend(struct netback_a /* Demultiplex a message IRQ from the frontend driver. 
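A small block-layer catch hides in the scsiback hunk above: 2.6.19 moved the SCSI passthrough marker from the request flag bits to the new cmd_type field, so the old flags-OR no longer exists. The updated idiom, as used by scsiback_cmd_exec():

#include <linux/blkdev.h>

struct request *rq = blk_get_request(sdev->request_queue, write, GFP_KERNEL);

rq->cmd_type = REQ_TYPE_BLOCK_PC;	/* was: rq->flags |= REQ_BLOCK_PC; */
rq->cmd_len = cmd_len;
memcpy(rq->cmd, cmnd, cmd_len);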
*/ -static irqreturn_t msgirq_from_frontend(int irq, void *context, - struct pt_regs *unused) +static irqreturn_t msgirq_from_frontend(int irq, void *context) { struct xenbus_device *dev = context; struct netback_accel *bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev); @@ -85,8 +84,7 @@ static irqreturn_t msgirq_from_frontend( * functionally, but we need it to pass to the bind function, and may * get called spuriously */ -static irqreturn_t netirq_from_frontend(int irq, void *context, - struct pt_regs *unused) +static irqreturn_t netirq_from_frontend(int irq, void *context) { VPRINTK("netirq %d from device %s\n", irq, ((struct xenbus_device *)context)->nodename); --- head-2011-03-11.orig/drivers/xen/sfc_netfront/accel.h 2009-04-07 13:58:48.000000000 +0200 +++ head-2011-03-11/drivers/xen/sfc_netfront/accel.h 2011-01-31 17:29:16.000000000 +0100 @@ -467,10 +467,8 @@ void netfront_accel_msg_tx_fastpath(netf u32 ip, u16 port, u8 protocol); /* Process an IRQ received from back end driver */ -irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context, - struct pt_regs *unused); -irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context, - struct pt_regs *unused); +irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context); +irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context); #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) extern void netfront_accel_msg_from_bend(struct work_struct *context); --- head-2011-03-11.orig/drivers/xen/sfc_netfront/accel_msg.c 2009-04-07 13:58:48.000000000 +0200 +++ head-2011-03-11/drivers/xen/sfc_netfront/accel_msg.c 2011-01-31 17:29:16.000000000 +0100 @@ -488,8 +488,7 @@ void netfront_accel_msg_from_bend(void * } -irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context, - struct pt_regs *unused) +irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context) { netfront_accel_vnic *vnic = (netfront_accel_vnic *)context; VPRINTK("irq %d from device %s\n", irq, vnic->dev->nodename); @@ -500,8 +499,7 @@ irqreturn_t netfront_accel_msg_channel_i } /* Process an interrupt received from the NIC via backend */ -irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context, - struct pt_regs *unused) +irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context) { netfront_accel_vnic *vnic = (netfront_accel_vnic *)context; struct net_device *net_dev = vnic->net_dev; --- head-2011-03-11.orig/drivers/xen/sfc_netfront/accel_tso.c 2008-02-26 10:54:12.000000000 +0100 +++ head-2011-03-11/drivers/xen/sfc_netfront/accel_tso.c 2011-01-31 17:29:16.000000000 +0100 @@ -363,7 +363,7 @@ int netfront_accel_enqueue_skb_tso(netfr tso_check_safe(skb); - if (skb->ip_summed != CHECKSUM_HW) + if (skb->ip_summed != CHECKSUM_PARTIAL) EPRINTK("Trying to TSO send a packet without HW checksum\n"); tso_start(&state, skb); --- head-2011-03-11.orig/drivers/xen/sfc_netfront/accel_vi.c 2010-01-18 15:23:12.000000000 +0100 +++ head-2011-03-11/drivers/xen/sfc_netfront/accel_vi.c 2011-01-31 17:29:16.000000000 +0100 @@ -463,7 +463,7 @@ netfront_accel_enqueue_skb_multi(netfron frag_i = -1; - if (skb->ip_summed == CHECKSUM_HW) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { /* Set to zero to encourage falcon to work it out for us */ *(u16*)(skb->h.raw + skb->csum) = 0; } @@ -582,7 +582,7 @@ netfront_accel_enqueue_skb_single(netfro kva = buf->pkt_kva; - if (skb->ip_summed == CHECKSUM_HW) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { /* Set to zero to encourage falcon to work it out 
for us */ *(u16*)(skb->h.raw + skb->csum) = 0; } --- head-2011-03-11.orig/drivers/xen/tpmback/common.h 2007-06-12 13:13:45.000000000 +0200 +++ head-2011-03-11/drivers/xen/tpmback/common.h 2011-01-31 17:29:16.000000000 +0100 @@ -61,7 +61,7 @@ void tpmif_deschedule_work(tpmif_t * tpm void tpmif_xenbus_init(void); void tpmif_xenbus_exit(void); int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn); -irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs); +irqreturn_t tpmif_be_int(int irq, void *dev_id); long int tpmback_get_instance(struct backend_info *bi); --- head-2011-03-11.orig/drivers/xen/tpmback/tpmback.c 2010-09-23 15:39:04.000000000 +0200 +++ head-2011-03-11/drivers/xen/tpmback/tpmback.c 2011-01-31 17:29:16.000000000 +0100 @@ -497,7 +497,7 @@ static ssize_t vtpm_op_read(struct file list_del(&pak->next); write_unlock_irqrestore(&dataex.pak_lock, flags); - DPRINTK("size given by app: %d, available: %d\n", size, left); + DPRINTK("size given by app: %zu, available: %u\n", size, left); ret_size = min_t(size_t, size, left); @@ -894,7 +894,7 @@ static void tpm_tx_action(unsigned long } } -irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t tpmif_be_int(int irq, void *dev_id) { tpmif_t *tpmif = (tpmif_t *) dev_id; --- head-2011-03-11.orig/drivers/xen/usbback/usbback.c 2010-09-23 15:39:04.000000000 +0200 +++ head-2011-03-11/drivers/xen/usbback/usbback.c 2011-01-31 17:29:16.000000000 +0100 @@ -288,7 +288,7 @@ static void usbbk_notify_work(usbif_t *u wake_up(&usbif->wq); } -irqreturn_t usbbk_be_int(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t usbbk_be_int(int irq, void *dev_id) { usbbk_notify_work(dev_id); return IRQ_HANDLED; @@ -318,7 +318,7 @@ static void usbbk_do_response(pending_re notify_remote_via_irq(usbif->irq); } -static void usbbk_urb_complete(struct urb *urb, struct pt_regs *regs) +static void usbbk_urb_complete(struct urb *urb) { pending_req_t *pending_req = (pending_req_t *)urb->context; --- head-2011-03-11.orig/drivers/xen/usbback/usbback.h 2009-11-06 10:23:23.000000000 +0100 +++ head-2011-03-11/drivers/xen/usbback/usbback.h 2011-01-31 17:29:16.000000000 +0100 @@ -158,7 +158,7 @@ int portid_add(const char *busid, int portid_remove(const domid_t domid, const unsigned int handle, const int portnum); -irqreturn_t usbbk_be_int(int irq, void *dev_id, struct pt_regs *regs); +irqreturn_t usbbk_be_int(int irq, void *dev_id); int usbbk_schedule(void *arg); struct usbstub *find_attached_device(usbif_t *usbif, int port); void usbbk_attach_device(usbif_t *usbif, struct usbstub *stub); --- head-2011-03-11.orig/drivers/xen/usbback/usbstub.c 2011-03-02 12:00:16.000000000 +0100 +++ head-2011-03-11/drivers/xen/usbback/usbstub.c 2011-03-11 10:54:35.000000000 +0100 @@ -283,7 +283,7 @@ static ssize_t usbstub_show_portids(stru static DRIVER_ATTR(port_ids, S_IRUSR, usbstub_show_portids, NULL); /* table of devices that matches any usbdevice */ -static struct usb_device_id usbstub_table[] = { +static const struct usb_device_id usbstub_table[] = { { .driver_info = 1 }, /* wildcard, see usb_match_id() */ { } /* Terminating entry */ }; @@ -307,7 +307,7 @@ int __init usbstub_init(void) goto out; } - err = driver_create_file(&usbback_usb_driver.driver, + err = driver_create_file(&usbback_usb_driver.drvwrap.driver, &driver_attr_port_ids); if (err) usb_deregister(&usbback_usb_driver); @@ -318,7 +318,7 @@ out: void usbstub_exit(void) { - driver_remove_file(&usbback_usb_driver.driver, + 
driver_remove_file(&usbback_usb_driver.drvwrap.driver, &driver_attr_port_ids); usb_deregister(&usbback_usb_driver); } --- head-2011-03-11.orig/drivers/xen/usbfront/usbfront.h 2009-10-15 11:45:41.000000000 +0200 +++ head-2011-03-11/drivers/xen/usbfront/usbfront.h 2011-01-31 17:29:16.000000000 +0100 @@ -195,7 +195,7 @@ timer_action(struct usbfront_info *info, extern struct kmem_cache *xenhcd_urbp_cachep; extern struct hc_driver xen_usb20_hc_driver; extern struct hc_driver xen_usb11_hc_driver; -irqreturn_t xenhcd_int(int irq, void *dev_id, struct pt_regs *ptregs); +irqreturn_t xenhcd_int(int irq, void *dev_id); void xenhcd_rhport_state_change(struct usbfront_info *info, int port, enum usb_device_speed speed); int xenhcd_schedule(void *arg); --- head-2011-03-11.orig/drivers/xen/usbfront/usbfront-dbg.c 2009-10-15 11:45:41.000000000 +0200 +++ head-2011-03-11/drivers/xen/usbfront/usbfront-dbg.c 2011-01-31 17:29:16.000000000 +0100 @@ -90,7 +90,9 @@ static CLASS_DEVICE_ATTR(statistics, S_I static inline void create_debug_file(struct usbfront_info *info) { struct class_device *cldev = info_to_hcd(info)->self.class_dev; - class_device_create_file(cldev, &class_device_attr_statistics); + if (class_device_create_file(cldev, &class_device_attr_statistics)) + printk(KERN_WARNING "statistics file not created for %s\n", + info_to_hcd(info)->self.bus_name); } static inline void remove_debug_file(struct usbfront_info *info) --- head-2011-03-11.orig/drivers/xen/usbfront/usbfront-q.c 2009-10-15 11:45:41.000000000 +0200 +++ head-2011-03-11/drivers/xen/usbfront/usbfront-q.c 2011-01-31 17:29:16.000000000 +0100 @@ -236,7 +236,7 @@ __acquires(info->lock) COUNT(info->stats.complete); } spin_unlock(&info->lock); - usb_hcd_giveback_urb(info_to_hcd(info), urb, NULL); + usb_hcd_giveback_urb(info_to_hcd(info), urb); spin_lock(&info->lock); } @@ -534,7 +534,7 @@ static void xenhcd_notify_work(struct us wake_up(&info->wq); } -irqreturn_t xenhcd_int(int irq, void *dev_id, struct pt_regs *ptregs) +irqreturn_t xenhcd_int(int irq, void *dev_id) { xenhcd_notify_work((struct usbfront_info *) dev_id); return IRQ_HANDLED; --- head-2011-03-11.orig/drivers/xen/xenbus/xenbus_comms.c 2011-01-31 15:14:12.000000000 +0100 +++ head-2011-03-11/drivers/xen/xenbus/xenbus_comms.c 2011-01-31 17:29:16.000000000 +0100 @@ -54,7 +54,7 @@ static DECLARE_WORK(probe_work, xenbus_p static DECLARE_WAIT_QUEUE_HEAD(xb_waitq); -static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs) +static irqreturn_t wake_waiting(int irq, void *unused) { int old, new; --- head-2011-03-11.orig/drivers/xen/xenoprof/xenoprofile.c 2010-01-07 09:38:29.000000000 +0100 +++ head-2011-03-11/drivers/xen/xenoprof/xenoprofile.c 2011-01-31 17:29:16.000000000 +0100 @@ -194,8 +194,7 @@ done: oprofile_add_domain_switch(COORDINATOR_DOMAIN); } -static irqreturn_t -xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs) +static irqreturn_t xenoprof_ovf_interrupt(int irq, void *dev_id) { struct xenoprof_buf * buf; static unsigned long flag; --- head-2011-03-11.orig/include/asm-generic/pgtable.h 2011-03-11 10:52:21.000000000 +0100 +++ head-2011-03-11/include/asm-generic/pgtable.h 2011-03-11 10:54:24.000000000 +0100 @@ -157,7 +157,7 @@ static inline void pmdp_set_wrprotect(st #endif #ifndef arch_change_pte_range -#define arch_change_pte_range(mm, pmd, addr, end, newprot) 0 +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0 #endif #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH --- 
head-2011-03-11.orig/arch/x86/include/mach-xen/asm/desc_32.h 2008-01-28 12:24:19.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/desc_32.h 2011-01-31 17:29:16.000000000 +0100 @@ -32,52 +32,110 @@ static inline struct desc_struct *get_cp return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address; } +/* + * This is the ldt that every process will get unless we need + * something other than this. + */ +extern struct desc_struct default_ldt[]; +extern struct desc_struct idt_table[]; +extern void set_intr_gate(unsigned int irq, void * addr); + +static inline void pack_descriptor(__u32 *a, __u32 *b, + unsigned long base, unsigned long limit, unsigned char type, unsigned char flags) +{ + *a = ((base & 0xffff) << 16) | (limit & 0xffff); + *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | + (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20); +} + +static inline void pack_gate(__u32 *a, __u32 *b, + unsigned long base, unsigned short seg, unsigned char type, unsigned char flags) +{ + *a = (seg << 16) | (base & 0xffff); + *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff); +} + +#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */ +#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */ +#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */ +#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */ +#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */ +#define DESCTYPE_DPL3 0x60 /* DPL-3 */ +#define DESCTYPE_S 0x10 /* !system */ + #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) #define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)) #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr)) -#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr)) -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt)) +#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr)) +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt)) #define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr)) #define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr)) -#define store_tr(tr) __asm__ ("str %0":"=mr" (tr)) -#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt)) +#define store_tr(tr) __asm__ ("str %0":"=m" (tr)) +#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt)) -/* - * This is the ldt that every process will get unless we need - * something other than this. - */ -extern struct desc_struct default_ldt[]; -extern void set_intr_gate(unsigned int irq, void * addr); +#if TLS_SIZE != 24 +# error update this code. 
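pack_descriptor() and pack_gate() above replace the _set_tssldt_desc() asm (deleted in the next hunk) with plain C that assembles the two 32-bit words of an IA-32 descriptor. A worked example for a TSS descriptor using the DESCTYPE_* values just defined (illustration only; tss stands in for a pointer to the CPU's TSS, and real callers go through __set_tss_desc() below):

__u32 a, b;

pack_descriptor(&a, &b, (unsigned long)tss,
		offsetof(struct tss_struct, __cacheline_filler) - 1,
		DESCTYPE_TSS, 0);
/* a = base[15:0] << 16 | limit[15:0]
 * b = base[31:24] | flags << 20 | limit[19:16] | type << 8 | base[23:16] */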
+#endif + +static inline void load_TLS(struct thread_struct *t, unsigned int cpu) +{ +#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \ + *(u64 *)&t->tls_array[i])) \ + BUG(); + C(0); C(1); C(2); +#undef C +} -#define _set_tssldt_desc(n,addr,limit,type) \ -__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ - "movw %w1,2(%2)\n\t" \ - "rorl $16,%1\n\t" \ - "movb %b1,4(%2)\n\t" \ - "movb %4,5(%2)\n\t" \ - "movb $0,6(%2)\n\t" \ - "movb %h1,7(%2)\n\t" \ - "rorl $16,%1" \ - : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type)) +#ifndef CONFIG_XEN +static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b) +{ + __u32 *lp = (__u32 *)((char *)dt + entry*8); + *lp = entry_a; + *(lp+1) = entry_b; +} -#ifndef CONFIG_X86_NO_TSS -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr) +#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) +#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) +#else +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b); +extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b); +#endif +#ifndef CONFIG_X86_NO_IDT +#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) + +static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg) { - _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr, - offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89); + __u32 a, b; + pack_gate(&a, &b, (unsigned long)addr, seg, type, 0); + write_idt_entry(idt_table, gate, a, b); } +#endif -#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) +#ifndef CONFIG_X86_NO_TSS +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr) +{ + __u32 a, b; + pack_descriptor(&a, &b, (unsigned long)addr, + offsetof(struct tss_struct, __cacheline_filler) - 1, + DESCTYPE_TSS, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b); +} #endif -static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size) +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries) { - _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82); + __u32 a, b; + pack_descriptor(&a, &b, (unsigned long)addr, + entries * sizeof(struct desc_struct) - 1, + DESCTYPE_LDT, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b); } +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) + #define LDT_entry_a(info) \ ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) @@ -103,21 +161,6 @@ static inline void set_ldt_desc(unsigned (info)->seg_not_present == 1 && \ (info)->useable == 0 ) -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b); - -#if TLS_SIZE != 24 -# error update this code. 

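Under CONFIG_XEN the declarations above turn write_ldt_entry()/write_gdt_entry() into extern functions rather than direct stores, because a pinned guest's descriptor pages are mapped read-only and every update must be vetted by the hypervisor. A plausible shape for the out-of-line Xen implementation, assuming the usual machine-address translation (the real body lives in the arch code, not in this header):

#include <asm/hypervisor.h>

int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b)
{
	u64 desc = ((u64)entry_b << 32) | entry_a;

	/* Hand the write to Xen for validation. */
	return HYPERVISOR_update_descriptor(
		virt_to_machine((char *)gdt + entry * 8), desc);
}

load_TLS() follows the same rule, pushing each of the three TLS descriptors through HYPERVISOR_update_descriptor instead of writing them in place.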
-#endif - -static inline void load_TLS(struct thread_struct *t, unsigned int cpu) -{ -#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \ - *(u64 *)&t->tls_array[i])) \ - BUG(); - C(0); C(1); C(2); -#undef C -} - static inline void clear_LDT(void) { int cpu = get_cpu(); --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/fixmap_32.h 2007-06-12 13:14:02.000000000 +0200 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/fixmap_32.h 2011-01-31 17:29:16.000000000 +0100 @@ -55,7 +55,7 @@ enum fixed_addresses { #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif -#ifdef CONFIG_X86_IO_APIC +#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN) FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, #endif @@ -95,10 +95,9 @@ enum fixed_addresses { __end_of_fixed_addresses }; -extern void set_fixaddr_top(unsigned long top); - extern void __set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags); +extern void reserve_top_address(unsigned long reserve); #define set_fixmap(idx, phys) \ __set_fixmap(idx, phys, PAGE_KERNEL) --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/hypercall_32.h 2009-06-23 09:28:21.000000000 +0200 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/hypercall_32.h 2011-01-31 17:29:16.000000000 +0100 @@ -128,6 +128,23 @@ __res; \ }) +#define _hypercall(type, op, a1, a2, a3, a4, a5) \ +({ \ + type __res; \ + register typeof((a1)+0) __arg1 asm("ebx") = (a1); \ + register typeof((a2)+0) __arg2 asm("ecx") = (a2); \ + register typeof((a3)+0) __arg3 asm("edx") = (a3); \ + register typeof((a4)+0) __arg4 asm("esi") = (a4); \ + register typeof((a5)+0) __arg5 asm("edi") = (a5); \ + asm volatile ( \ + "call *%6" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4), "+r" (__arg5) \ + : "0" (hypercall_page + (op) * 32) \ + : "memory" ); \ + __res; \ +}) + static inline int __must_check HYPERVISOR_set_trap_table( const trap_info_t *table) @@ -140,6 +157,8 @@ HYPERVISOR_mmu_update( mmu_update_t *req, unsigned int count, unsigned int *success_count, domid_t domid) { + if (arch_use_lazy_mmu_mode()) + return xen_multi_mmu_update(req, count, success_count, domid); return _hypercall4(int, mmu_update, req, count, success_count, domid); } @@ -148,6 +167,8 @@ HYPERVISOR_mmuext_op( struct mmuext_op *op, unsigned int count, unsigned int *success_count, domid_t domid) { + if (arch_use_lazy_mmu_mode()) + return xen_multi_mmuext_op(op, count, success_count, domid); return _hypercall4(int, mmuext_op, op, count, success_count, domid); } @@ -238,6 +259,8 @@ static inline int __must_check HYPERVISOR_memory_op( unsigned int cmd, void *arg) { + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(false); return _hypercall2(int, memory_op, cmd, arg); } @@ -253,6 +276,9 @@ HYPERVISOR_update_va_mapping( unsigned long va, pte_t new_val, unsigned long flags) { unsigned long pte_hi = 0; + + if (arch_use_lazy_mmu_mode()) + return xen_multi_update_va_mapping(va, new_val, flags); #ifdef CONFIG_X86_PAE pte_hi = new_val.pte_high; #endif @@ -316,6 +342,8 @@ static inline int __must_check HYPERVISOR_grant_table_op( unsigned int cmd, void *uop, unsigned int count) { + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(false); return _hypercall3(int, grant_table_op, cmd, uop, count); } --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/hypercall_64.h 2009-06-23 09:28:21.000000000 +0200 +++ 
head-2011-03-11/arch/x86/include/mach-xen/asm/hypercall_64.h 2011-01-31 17:29:16.000000000 +0100 @@ -135,6 +135,23 @@ __res; \ }) +#define _hypercall(type, op, a1, a2, a3, a4, a5) \ +({ \ + type __res; \ + register typeof((a1)+0) __arg1 asm("rdi") = (a1); \ + register typeof((a2)+0) __arg2 asm("rsi") = (a2); \ + register typeof((a3)+0) __arg3 asm("rdx") = (a3); \ + register typeof((a4)+0) __arg4 asm("r10") = (a4); \ + register typeof((a5)+0) __arg5 asm("r8") = (a5); \ + asm volatile ( \ + "call *%6" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4), "+r" (__arg5) \ + : "0" (hypercall_page + (op) * 32) \ + : "memory" ); \ + __res; \ +}) + static inline int __must_check HYPERVISOR_set_trap_table( const trap_info_t *table) @@ -147,6 +164,8 @@ HYPERVISOR_mmu_update( mmu_update_t *req, unsigned int count, unsigned int *success_count, domid_t domid) { + if (arch_use_lazy_mmu_mode()) + return xen_multi_mmu_update(req, count, success_count, domid); return _hypercall4(int, mmu_update, req, count, success_count, domid); } @@ -155,6 +174,8 @@ HYPERVISOR_mmuext_op( struct mmuext_op *op, unsigned int count, unsigned int *success_count, domid_t domid) { + if (arch_use_lazy_mmu_mode()) + return xen_multi_mmuext_op(op, count, success_count, domid); return _hypercall4(int, mmuext_op, op, count, success_count, domid); } @@ -248,6 +269,8 @@ static inline int __must_check HYPERVISOR_memory_op( unsigned int cmd, void *arg) { + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(false); return _hypercall2(int, memory_op, cmd, arg); } @@ -262,6 +285,8 @@ static inline int __must_check HYPERVISOR_update_va_mapping( unsigned long va, pte_t new_val, unsigned long flags) { + if (arch_use_lazy_mmu_mode()) + return xen_multi_update_va_mapping(va, new_val, flags); return _hypercall3(int, update_va_mapping, va, new_val.pte, flags); } @@ -321,6 +346,8 @@ static inline int __must_check HYPERVISOR_grant_table_op( unsigned int cmd, void *uop, unsigned int count) { + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(false); return _hypercall3(int, grant_table_op, cmd, uop, count); } --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2009-07-13 14:25:35.000000000 +0200 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/hypervisor.h 2011-01-31 17:29:16.000000000 +0100 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #if defined(__i386__) @@ -139,7 +140,44 @@ void scrub_pages(void *, unsigned int); #define scrub_pages(_p,_n) ((void)0) #endif -#include +#if defined(CONFIG_XEN) && !defined(MODULE) + +DECLARE_PER_CPU(bool, xen_lazy_mmu); + +int xen_multicall_flush(bool); + +int __must_check xen_multi_update_va_mapping(unsigned long va, pte_t, + unsigned long flags); +int __must_check xen_multi_mmu_update(mmu_update_t *, unsigned int count, + unsigned int *success_count, domid_t); +int __must_check xen_multi_mmuext_op(struct mmuext_op *, unsigned int count, + unsigned int *success_count, domid_t); + +#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE +static inline void arch_enter_lazy_mmu_mode(void) +{ + __get_cpu_var(xen_lazy_mmu) = true; +} + +static inline void arch_leave_lazy_mmu_mode(void) +{ + __get_cpu_var(xen_lazy_mmu) = false; + xen_multicall_flush(false); +} + +#ifndef arch_use_lazy_mmu_mode +#define arch_use_lazy_mmu_mode() unlikely(__get_cpu_var(xen_lazy_mmu)) +#endif + +#else /* !CONFIG_XEN || MODULE */ + +static inline void xen_multicall_flush(bool ignore) {} +#define arch_use_lazy_mmu_mode() false +#define xen_multi_update_va_mapping(...) 
({ BUG(); -ENOSYS; }) +#define xen_multi_mmu_update(...) ({ BUG(); -ENOSYS; }) +#define xen_multi_mmuext_op(...) ({ BUG(); -ENOSYS; }) + +#endif /* CONFIG_XEN && !MODULE */ #if defined(CONFIG_X86_64) #define MULTI_UVMFLAGS_INDEX 2 @@ -151,11 +189,15 @@ void scrub_pages(void *, unsigned int); #ifdef CONFIG_XEN #define is_running_on_xen() 1 +extern char hypercall_page[PAGE_SIZE]; #else extern char *hypercall_stubs; +#define hypercall_page hypercall_stubs #define is_running_on_xen() (!!hypercall_stubs) #endif +#include + static inline int HYPERVISOR_yield( void) --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2008-04-02 12:34:02.000000000 +0200 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable-3level.h 2011-01-31 17:29:16.000000000 +0100 @@ -53,7 +53,6 @@ static inline int pte_exec_kernel(pte_t * not possible, use pte_get_and_clear to obtain the old pte * value and then use set_pte to update it. -ben */ -#define __HAVE_ARCH_SET_PTE_ATOMIC static inline void set_pte(pte_t *ptep, pte_t pte) { @@ -70,14 +69,6 @@ static inline void set_pte(pte_t *ptep, set_pte((ptep), (pteval)); \ } while (0) -#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \ - if (((_mm) != current->mm && (_mm) != &init_mm) || \ - HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \ - set_pte((ptep), (pteval)); \ - xen_invlpg((addr)); \ - } \ -} while (0) - #define set_pmd(pmdptr,pmdval) \ xen_l2_entry_update((pmdptr), (pmdval)) #define set_pud(pudptr,pudval) \ @@ -94,7 +85,7 @@ static inline void pud_clear (pud_t * pu #define pud_page(pud) \ ((struct page *) __va(pud_val(pud) & PAGE_MASK)) -#define pud_page_kernel(pud) \ +#define pud_page_vaddr(pud) \ ((unsigned long) __va(pud_val(pud) & PAGE_MASK)) @@ -124,6 +115,7 @@ static inline void pte_clear(struct mm_s #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; @@ -142,6 +134,7 @@ static inline pte_t ptep_get_and_clear(s return pte; } +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH #define ptep_clear_flush(vma, addr, ptep) \ ({ \ pte_t *__ptep = (ptep); \ @@ -159,6 +152,7 @@ static inline pte_t ptep_get_and_clear(s __res; \ }) +#define __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t a, pte_t b) { return a.pte_low == b.pte_low && a.pte_high == b.pte_high; --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-07 15:33:33.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable_32.h 2011-02-07 15:37:16.000000000 +0100 @@ -260,31 +260,89 @@ static inline pte_t pte_mkhuge(pte_t pte # include #endif -#define ptep_test_and_clear_dirty(vma, addr, ptep) \ +/* + * Rules for using pte_update - it must be called after any PTE update which + * has not been done using the set_pte / clear_pte interfaces. It is used by + * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE + * updates should either be sets, clears, or set_pte_atomic for P->P + * transitions, which means this hook should only be called for user PTEs. + * This hook implies a P->P protection or access change has taken place, which + * requires a subsequent TLB flush. 
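The hypervisor.h hunk above is the heart of this patch's batching story: with __HAVE_ARCH_ENTER_LAZY_MMU_MODE defined, generic mm code brackets bulk PTE work with enter/leave calls, the per-CPU xen_lazy_mmu flag makes the mmu_update/mmuext_op/update_va_mapping wrappers queue their work as multicalls, and wrappers that need a coherent view (memory_op, grant_table_op) flush the queue first. The driving pattern, roughly as it appears in callers like change_protection():

/* Between enter and leave each set_pte_at() is only queued; the batch is
 * issued as one multicall on leave (illustrative loop, names assumed). */
arch_enter_lazy_mmu_mode();
for (addr = start; addr != end; addr += PAGE_SIZE, ptep++)
	set_pte_at(mm, addr, ptep, pte_modify(*ptep, newprot));
arch_leave_lazy_mmu_mode();	/* implies xen_multicall_flush(false) */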
The notification can optionally be delayed + * until the TLB flush event by using the pte_update_defer form of the + * interface, but care must be taken to assure that the flush happens while + * still holding the same page table lock so that the shadow and primary pages + * do not become out of sync on SMP. + */ +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) + + +/* + * We only update the dirty/accessed state if we set + * the dirty bit by hand in the kernel, since the hardware + * will do the accessed bit for us, and we don't want to + * race with other CPU's that might be updating the dirty + * bit at the same time. + */ +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ +do { \ + if (dirty) \ + ptep_establish(vma, address, ptep, entry); \ +} while (0) + +/* + * We don't actually have these, but we want to advertise them so that + * we can encompass the flush here. + */ +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG + +/* + * Rules for using ptep_establish: the pte MUST be a user pte, and + * must be a present->present transition. + */ +#define __HAVE_ARCH_PTEP_ESTABLISH +#define ptep_establish(vma, address, ptep, pteval) \ +do { \ + if ( likely((vma)->vm_mm == current->mm) ) { \ + BUG_ON(HYPERVISOR_update_va_mapping(address, \ + pteval, \ + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ + UVMF_INVLPG|UVMF_MULTI)); \ + } else { \ + xen_l1_entry_update(ptep, pteval); \ + flush_tlb_page(vma, address); \ + } \ +} while (0) + +#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH +#define ptep_clear_flush_dirty(vma, address, ptep) \ ({ \ pte_t __pte = *(ptep); \ - int __ret = pte_dirty(__pte); \ - if (__ret) { \ - __pte = pte_mkclean(__pte); \ - if ((vma)->vm_mm != current->mm || \ - HYPERVISOR_update_va_mapping(addr, __pte, 0)) \ - (ptep)->pte_low = __pte.pte_low; \ - } \ - __ret; \ + int __dirty = pte_dirty(__pte); \ + __pte = pte_mkclean(__pte); \ + if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \ + ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \ + else if (__dirty) \ + (ptep)->pte_low = __pte.pte_low; \ + __dirty; \ }) -#define ptep_test_and_clear_young(vma, addr, ptep) \ +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +#define ptep_clear_flush_young(vma, address, ptep) \ ({ \ pte_t __pte = *(ptep); \ - int __ret = pte_young(__pte); \ - if (__ret) \ - __pte = pte_mkold(__pte); \ - if ((vma)->vm_mm != current->mm || \ - HYPERVISOR_update_va_mapping(addr, __pte, 0)) \ - (ptep)->pte_low = __pte.pte_low; \ - __ret; \ + int __young = pte_young(__pte); \ + __pte = pte_mkold(__pte); \ + if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \ + ptep_set_access_flags(vma, address, ptep, __pte, __young); \ + else if (__young) \ + (ptep)->pte_low = __pte.pte_low; \ + __young; \ }) +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL #define ptep_get_and_clear_full(mm, addr, ptep, full) \ ((full) ? 
({ \ pte_t __res = *(ptep); \ @@ -296,6 +354,7 @@ static inline pte_t pte_mkhuge(pte_t pte }) : \ ptep_get_and_clear(mm, addr, ptep)) +#define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; @@ -391,11 +450,11 @@ static inline pte_t pte_modify(pte_t pte #define pte_index(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset_kernel(dir, address) \ - ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) + ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address)) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) -#define pmd_page_kernel(pmd) \ +#define pmd_page_vaddr(pmd) \ ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) /* @@ -418,8 +477,6 @@ extern pte_t *lookup_address(unsigned lo static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;} #endif -extern void noexec_setup(const char *str); - #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \ @@ -437,37 +494,17 @@ extern void noexec_setup(const char *str #define pte_unmap_nested(pte) do { } while (0) #endif -#define __HAVE_ARCH_PTEP_ESTABLISH -#define ptep_establish(vma, address, ptep, pteval) \ - do { \ - if ( likely((vma)->vm_mm == current->mm) ) { \ - BUG_ON(HYPERVISOR_update_va_mapping(address, \ - pteval, \ - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ - UVMF_INVLPG|UVMF_MULTI)); \ - } else { \ - xen_l1_entry_update(ptep, pteval); \ - flush_tlb_page(vma, address); \ - } \ - } while (0) +/* Clear a kernel PTE and flush it from the TLB */ +#define kpte_clear_flush(ptep, vaddr) do { \ + if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \ + BUG(); \ +} while (0) /* * The i386 doesn't have any external MMU info: the kernel page * tables contain all the necessary information. - * - * Also, we only update the dirty/accessed state if we set - * the dirty bit by hand in the kernel, since the hardware - * will do the accessed bit for us, and we don't want to - * race with other CPU's that might be updating the dirty - * bit at the same time. 
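Every one of the rewritten ptep_clear_flush_dirty()/ptep_clear_flush_young() macros above hinges on a single test: whether the mm's page tables are pinned (PG_pinned set on the pgd's page), meaning they are registered with and write-protected by Xen. The decision pattern, extracted for clarity (changed stands for the dirty/young bit that was found set):

/* Pinned tables must go through the hypervisor-aware update; unpinned
 * ones can be poked directly (on PAE both A/D bits live in the low word). */
if (test_bit(PG_pinned, &virt_to_page(vma->vm_mm->pgd)->flags))
	ptep_set_access_flags(vma, address, ptep, pte, changed);
else if (changed)
	ptep->pte_low = pte.pte_low;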
*/ #define update_mmu_cache(vma,address,pte) do { } while (0) -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ - do { \ - if (dirty) \ - ptep_establish(vma, address, ptep, entry); \ - } while (0) #include void make_lowmem_page_readonly(void *va, unsigned int feature); @@ -523,10 +560,11 @@ int create_lookup_pte_addr(struct mm_str uint64_t *ptep); int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, pgprot_t newprot); + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable); -#define arch_change_pte_range(mm, pmd, addr, end, newprot) \ - xen_change_pte_range(mm, pmd, addr, end, newprot) +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \ + xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) #define io_remap_pfn_range(vma,from,pfn,size,prot) \ direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO) @@ -535,13 +573,6 @@ direct_remap_pfn_range(vma,from,pfn,size #define GET_IOSPACE(pfn) 0 #define GET_PFN(pfn) (pfn) -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -#define __HAVE_ARCH_PTE_SAME #include #endif /* _I386_PGTABLE_H */ --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/processor_32.h 2011-01-31 17:02:29.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/processor_32.h 2011-01-31 17:29:16.000000000 +0100 @@ -146,6 +146,18 @@ static inline void detect_ht(struct cpui #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. 
*/ + __asm__(XEN_CPUID + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + /* * Generic CPUID function * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx @@ -153,24 +165,18 @@ static inline void detect_ht(struct cpui */ static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { - __asm__(XEN_CPUID - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (op), "c"(0)); + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); } /* Some CPUID calls want 'count' to be placed in ecx */ static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, - int *edx) + int *edx) { - __asm__(XEN_CPUID - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (op), "c" (count)); + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); } /* @@ -178,42 +184,30 @@ static inline void cpuid_count(int op, i */ static inline unsigned int cpuid_eax(unsigned int op) { - unsigned int eax; + unsigned int eax, ebx, ecx, edx; - __asm__(XEN_CPUID - : "=a" (eax) - : "0" (op) - : "bx", "cx", "dx"); + cpuid(op, &eax, &ebx, &ecx, &edx); return eax; } static inline unsigned int cpuid_ebx(unsigned int op) { - unsigned int eax, ebx; + unsigned int eax, ebx, ecx, edx; - __asm__(XEN_CPUID - : "=a" (eax), "=b" (ebx) - : "0" (op) - : "cx", "dx" ); + cpuid(op, &eax, &ebx, &ecx, &edx); return ebx; } static inline unsigned int cpuid_ecx(unsigned int op) { - unsigned int eax, ecx; + unsigned int eax, ebx, ecx, edx; - __asm__(XEN_CPUID - : "=a" (eax), "=c" (ecx) - : "0" (op) - : "bx", "dx" ); + cpuid(op, &eax, &ebx, &ecx, &edx); return ecx; } static inline unsigned int cpuid_edx(unsigned int op) { - unsigned int eax, edx; + unsigned int eax, ebx, ecx, edx; - __asm__(XEN_CPUID - : "=a" (eax), "=d" (edx) - : "0" (op) - : "bx", "cx"); + cpuid(op, &eax, &ebx, &ecx, &edx); return edx; } @@ -315,6 +309,8 @@ static inline void __mwait(unsigned long : :"a" (eax), "c" (ecx)); } +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); + /* from system description table in BIOS. Mostly for MCA use, but others may find it useful. 
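Folding cpuid(), cpuid_count() and the cpuid_eax()..cpuid_edx() helpers into one __cpuid() above puts the XEN_CPUID instruction sequence and its register constraints in exactly one place, so the per-register variants no longer carry hand-maintained clobber lists. Typical use:

/* Leaf 0: maximum standard leaf in eax, vendor string in ebx/edx/ecx. */
unsigned int eax = 0, ebx, ecx, edx;

__cpuid(&eax, &ebx, &ecx, &edx);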
*/ extern unsigned int machine_id; --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/smp_32.h 2007-06-12 13:14:02.000000000 +0200 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/smp_32.h 2011-01-31 17:29:16.000000000 +0100 @@ -79,25 +79,36 @@ static inline int hard_smp_processor_id( return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID)); } #endif - -static __inline int logical_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); -} - #endif +#define safe_smp_processor_id() smp_processor_id() extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); extern void prefill_possible_map(void); +extern unsigned int num_processors; + #endif /* !__ASSEMBLY__ */ #else /* CONFIG_SMP */ +#define safe_smp_processor_id() 0 #define cpu_physical_id(cpu) boot_cpu_physical_apicid #define NO_PROC_ID 0xFF /* No processor magic marker */ #endif + +#ifndef __ASSEMBLY__ + +extern u8 apicid_2_node[]; + +#ifdef CONFIG_X86_LOCAL_APIC +static __inline int logical_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); +} +#endif +#endif + #endif --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/system_32.h 2007-06-12 13:14:02.000000000 +0200 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/system_32.h 2011-01-31 17:29:16.000000000 +0100 @@ -267,6 +267,9 @@ static inline unsigned long __xchg(unsig #define cmpxchg(ptr,o,n)\ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ (unsigned long)(n),sizeof(*(ptr)))) +#define sync_cmpxchg(ptr,o,n)\ + ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\ + (unsigned long)(n),sizeof(*(ptr)))) #endif static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, @@ -296,6 +299,39 @@ static inline unsigned long __cmpxchg(vo return old; } +/* + * Always use locked operations when touching memory shared with a + * hypervisor, since the system may be SMP even if the guest kernel + * isn't. + */ +static inline unsigned long __sync_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + __asm__ __volatile__("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: + __asm__ __volatile__("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: + __asm__ __volatile__("lock; cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + } + return old; +} + #ifndef CONFIG_X86_CMPXCHG /* * Building a kernel capable running on 80386. 
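sync_cmpxchg()/__sync_cmpxchg() above exist because even a uniprocessor guest shares memory (grant tables, I/O rings) with code running on other physical CPUs, so the lock prefix must never be dropped the way plain cmpxchg() may drop it on !CONFIG_SMP builds. Usage against a shared word (shared->flags is a stand-in, not a structure from this patch):

/* Atomically set bit 0 of a field in a hypervisor-shared page. */
u16 old, prev;

do {
	old = shared->flags;
	prev = sync_cmpxchg(&shared->flags, old, (u16)(old | 1));
} while (prev != old);	/* lost a race with the remote side: retry */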
It may be necessary to --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/tlbflush_32.h 2007-11-26 16:59:25.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/tlbflush_32.h 2011-01-31 17:29:16.000000000 +0100 @@ -8,8 +8,6 @@ #define __flush_tlb_global() xen_tlb_flush() #define __flush_tlb_all() xen_tlb_flush() -extern unsigned long pgkern_mask; - #define cpu_has_invlpg (boot_cpu_data.x86 > 3) #define __flush_tlb_single(addr) xen_invlpg(addr) --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/fixmap_64.h 2007-06-12 13:14:13.000000000 +0200 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/fixmap_64.h 2011-01-31 17:29:16.000000000 +0100 @@ -41,7 +41,7 @@ enum fixed_addresses { #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif -#ifdef CONFIG_X86_IO_APIC +#ifndef CONFIG_XEN FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, #endif --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-07 15:33:42.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-07 15:37:14.000000000 +0100 @@ -44,12 +44,9 @@ extern unsigned long __supported_pte_mas #define swapper_pg_dir init_level4_pgt -extern int nonx_setup(char *str); extern void paging_init(void); extern void clear_kernel_mapping(unsigned long addr, unsigned long size); -extern unsigned long pgkern_mask; - /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. @@ -119,9 +116,6 @@ static inline void pgd_clear (pgd_t * pg set_pgd(__user_pgd(pgd), __pgd(0)); } -#define pud_page(pud) \ - ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK)) - #define pte_same(a, b) ((a).pte == (b).pte) #define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK)) @@ -333,7 +327,7 @@ static inline pte_t ptep_get_and_clear_f #define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT) static inline int pte_user(pte_t pte) { return __pte_val(pte) & _PAGE_USER; } static inline int pte_read(pte_t pte) { return __pte_val(pte) & _PAGE_USER; } -static inline int pte_exec(pte_t pte) { return __pte_val(pte) & _PAGE_USER; } +static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); } static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; } @@ -346,29 +340,12 @@ static inline pte_t pte_mkclean(pte_t pt static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; } static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; } static inline pte_t pte_mkread(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; } -static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; } +static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) &= ~_PAGE_NX; return pte; } static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; } static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; } - -#define ptep_test_and_clear_dirty(vma, addr, ptep) \ -({ \ - pte_t __pte = *(ptep); \ - int __ret = pte_dirty(__pte); \ - if (__ret) \ - set_pte_at((vma)->vm_mm, addr, ptep, pte_mkclean(__pte)); \ - 
__ret; \ -}) - -#define ptep_test_and_clear_young(vma, addr, ptep) \ -({ \ - pte_t __pte = *(ptep); \ - int __ret = pte_young(__pte); \ - if (__ret) \ - set_pte_at((vma)->vm_mm, addr, ptep, pte_mkold(__pte)); \ - __ret; \ -}) +static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; } static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -395,7 +372,8 @@ static inline int pmd_large(pmd_t pte) { /* * Level 4 access. */ -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK)) +#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK)) +#define pgd_page(pgd) (pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)) #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr)) #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address)) @@ -404,16 +382,18 @@ static inline int pmd_large(pmd_t pte) { /* PUD - Level3 access */ /* to find an entry in a page-table-directory. */ +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK)) +#define pud_page(pud) (pfn_to_page(pud_val(pud) >> PAGE_SHIFT)) #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address)) +#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address)) #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT) /* PMD - Level 2 access */ -#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK)) +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK)) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) -#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \ +#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \ pmd_index(address)) #define pmd_none(x) (!__pmd_val(x)) #if CONFIG_XEN_COMPAT <= 0x030002 @@ -444,6 +424,7 @@ static inline pte_t mk_pte_phys(unsigned { unsigned long pteval; pteval = physpage | pgprot_val(pgprot); + pteval &= __supported_pte_mask; return __pte(pteval); } @@ -465,7 +446,7 @@ static inline pte_t pte_modify(pte_t pte #define pte_index(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \ +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \ pte_index(address)) /* x86-64 always has all page tables mapped. */ @@ -506,6 +487,40 @@ static inline pte_t pte_modify(pte_t pte ptep_establish(vma, address, ptep, entry); \ } while (0) + +/* + * i386 says: We don't actually have these, but we want to advertise + * them so that we can encompass the flush here. 
+ */ +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG + +#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH +#define ptep_clear_flush_dirty(vma, address, ptep) \ +({ \ + pte_t __pte = *(ptep); \ + int __dirty = pte_dirty(__pte); \ + __pte = pte_mkclean(__pte); \ + if ((vma)->vm_mm->context.pinned) \ + ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \ + else if (__dirty) \ + set_pte(ptep, __pte); \ + __dirty; \ +}) + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +#define ptep_clear_flush_young(vma, address, ptep) \ +({ \ + pte_t __pte = *(ptep); \ + int __young = pte_young(__pte); \ + __pte = pte_mkold(__pte); \ + if ((vma)->vm_mm->context.pinned) \ + ptep_set_access_flags(vma, address, ptep, __pte, __young); \ + else if (__young) \ + set_pte(ptep, __pte); \ + __young; \ +}) + /* Encode and de-code a swap entry */ #define __swp_type(x) (((x).val >> 1) & 0x3f) #define __swp_offset(x) ((x).val >> 8) @@ -543,10 +558,11 @@ int create_lookup_pte_addr(struct mm_str uint64_t *ptep); int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, pgprot_t newprot); + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable); -#define arch_change_pte_range(mm, pmd, addr, end, newprot) \ - xen_change_pte_range(mm, pmd, addr, end, newprot) +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \ + xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO) @@ -568,8 +584,6 @@ int xen_change_pte_range(struct mm_struc #define kc_offset_to_vaddr(o) \ (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o)) -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY #define __HAVE_ARCH_PTEP_GET_AND_CLEAR #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL #define __HAVE_ARCH_PTEP_CLEAR_FLUSH --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/processor_64.h 2008-03-06 08:54:32.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/processor_64.h 2011-01-31 17:29:16.000000000 +0100 @@ -484,6 +484,8 @@ static inline void __mwait(unsigned long : :"a" (eax), "c" (ecx)); } +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); + #define stack_current() \ ({ \ struct thread_info *ti; \ --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/smp_64.h 2007-06-12 13:14:13.000000000 +0200 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/smp_64.h 2011-01-31 17:29:16.000000000 +0100 @@ -4,15 +4,12 @@ /* * We need the APIC definitions automatically as part of 'smp.h' */ -#ifndef __ASSEMBLY__ #include #include #include extern int disable_apic; -#endif #ifdef CONFIG_X86_LOCAL_APIC -#ifndef __ASSEMBLY__ #include #include #ifdef CONFIG_X86_IO_APIC @@ -21,10 +18,8 @@ extern int disable_apic; #include #include #endif -#endif #ifdef CONFIG_SMP -#ifndef ASSEMBLY #include @@ -41,14 +36,11 @@ extern cpumask_t cpu_initialized; extern void smp_alloc_memory(void); extern volatile unsigned long smp_invalidate_needed; -extern int pic_mode; extern void lock_ipi_call_lock(void); extern void unlock_ipi_call_lock(void); extern int smp_num_siblings; extern void smp_send_reschedule(int cpu); void smp_stop_cpu(void); -extern int smp_call_function_single(int cpuid, void (*func) (void *info), - void *info, int retry, int wait); extern cpumask_t cpu_sibling_map[NR_CPUS]; extern cpumask_t cpu_core_map[NR_CPUS]; @@ 
-77,20 +69,16 @@ static inline int hard_smp_processor_id( } #endif -extern int safe_smp_processor_id(void); extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); extern void prefill_possible_map(void); extern unsigned num_processors; extern unsigned disabled_cpus; -#endif /* !ASSEMBLY */ - #define NO_PROC_ID 0xFF /* No processor magic marker */ #endif -#ifndef ASSEMBLY /* * Some lowlevel functions might want to know about * the real APIC ID <-> CPU # mapping. @@ -114,11 +102,8 @@ static inline int cpu_present_to_apicid( } #endif -#endif /* !ASSEMBLY */ - #ifndef CONFIG_SMP #define stack_smp_processor_id() 0 -#define safe_smp_processor_id() 0 #define cpu_logical_map(x) (x) #else #include @@ -130,7 +115,6 @@ static inline int cpu_present_to_apicid( }) #endif -#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_LOCAL_APIC static __inline int logical_smp_processor_id(void) { @@ -138,13 +122,18 @@ static __inline int logical_smp_processo return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); } #endif -#endif #ifdef CONFIG_SMP #define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] #else #define cpu_physical_id(cpu) boot_cpu_id -#endif - +static inline int smp_call_function_single(int cpuid, void (*func) (void *info), + void *info, int retry, int wait) +{ + /* Disable interrupts here? */ + func(info); + return 0; +} +#endif /* !CONFIG_SMP */ #endif --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/system_64.h 2007-11-26 16:59:25.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/system_64.h 2011-01-31 17:29:16.000000000 +0100 @@ -24,6 +24,7 @@ #define __EXTRA_CLOBBER \ ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15" +/* Save restore flags to clear handle leaking NT */ #define switch_to(prev,next,last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ --- head-2011-03-11.orig/arch/x86/include/mach-xen/asm/tlbflush_64.h 2007-11-26 16:59:25.000000000 +0100 +++ head-2011-03-11/arch/x86/include/mach-xen/asm/tlbflush_64.h 2011-01-31 17:29:16.000000000 +0100 @@ -12,9 +12,6 @@ */ #define __flush_tlb_global() xen_tlb_flush() - -extern unsigned long pgkern_mask; - #define __flush_tlb_all() __flush_tlb_global() #define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr) --- head-2011-03-11.orig/include/xen/evtchn.h 2011-01-31 15:14:12.000000000 +0100 +++ head-2011-03-11/include/xen/evtchn.h 2011-01-31 17:29:16.000000000 +0100 @@ -57,34 +57,34 @@ */ int bind_caller_port_to_irqhandler( unsigned int caller_port, - irqreturn_t (*handler)(int, void *, struct pt_regs *), + irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); int bind_listening_port_to_irqhandler( unsigned int remote_domain, - irqreturn_t (*handler)(int, void *, struct pt_regs *), + irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); int bind_interdomain_evtchn_to_irqhandler( unsigned int remote_domain, unsigned int remote_port, - irqreturn_t (*handler)(int, void *, struct pt_regs *), + irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); int bind_virq_to_irqhandler( unsigned int virq, unsigned int cpu, - irqreturn_t (*handler)(int, void *, struct pt_regs *), + irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); int bind_ipi_to_irqhandler( unsigned int ipi, unsigned int cpu, - irqreturn_t (*handler)(int, void *, struct pt_regs *), + irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); --- /dev/null 
1970-01-01 00:00:00.000000000 +0000 +++ head-2011-03-11/include/xen/net-util.h 2011-02-09 15:49:42.000000000 +0100 @@ -0,0 +1,74 @@ +#ifndef __XEN_NETUTIL_H__ +#define __XEN_NETUTIL_H__ + +#include +#include +#include +#include +#include + +static inline int skb_checksum_setup(struct sk_buff *skb, + unsigned long *fixup_counter) +{ + struct iphdr *iph = (void *)skb->data; + __be16 *csum = NULL; + int err = -EPROTO; + + if (skb->ip_summed != CHECKSUM_PARTIAL) { + /* A non-CHECKSUM_PARTIAL SKB does not require setup. */ + if (!skb_is_gso(skb)) + return 0; + + /* + * A GSO SKB must be CHECKSUM_PARTIAL. However some buggy + * peers can fail to set NETRXF_csum_blank when sending a GSO + * frame. In this case force the SKB to CHECKSUM_PARTIAL and + * recalculate the partial checksum. + */ + ++*fixup_counter; + --csum; + } + + if (skb->protocol != htons(ETH_P_IP)) + goto out; + + skb->nh.iph = iph; + skb->h.raw = skb->nh.raw + 4 * iph->ihl; + if (skb->h.raw >= skb->tail) + goto out; + + switch (iph->protocol) { + case IPPROTO_TCP: + skb->csum = offsetof(struct tcphdr, check); + if (csum) + csum = &skb->h.th->check; + break; + case IPPROTO_UDP: + skb->csum = offsetof(struct udphdr, check); + if (csum) + csum = &skb->h.uh->check; + break; + default: + if (net_ratelimit()) + printk(KERN_ERR "Attempting to checksum a non-" + "TCP/UDP packet, dropping a protocol" + " %d packet\n", skb->nh.iph->protocol); + goto out; + } + + if ((skb->h.raw + skb->csum + sizeof(*csum)) > skb->tail) + goto out; + + if (csum) { + *csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + skb->len - iph->ihl*4, + IPPROTO_TCP, 0); + skb->ip_summed = CHECKSUM_PARTIAL; + } + + err = 0; +out: + return err; +} + +#endif /* __XEN_NETUTIL_H__ */ --- head-2011-03-11.orig/include/xen/xencons.h 2007-10-15 09:39:38.000000000 +0200 +++ head-2011-03-11/include/xen/xencons.h 2011-01-31 17:29:16.000000000 +0100 @@ -8,7 +8,7 @@ void xencons_force_flush(void); void xencons_resume(void); /* Interrupt work hooks. Receive data, or kick data out. */ -void xencons_rx(char *buf, unsigned len, struct pt_regs *regs); +void xencons_rx(char *buf, unsigned len); void xencons_tx(void); int xencons_ring_init(void); --- head-2011-03-11.orig/mm/mprotect.c 2011-01-31 14:53:38.000000000 +0100 +++ head-2011-03-11/mm/mprotect.c 2011-01-31 17:29:16.000000000 +0100 @@ -97,7 +97,7 @@ static inline void change_pmd_range(stru } if (pmd_none_or_clear_bad(pmd)) continue; - if (arch_change_pte_range(mm, pmd, addr, next, newprot)) + if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable)) continue; change_pte_range(vma->vm_mm, pmd, addr, next, newprot, dirty_accountable); --- head-2011-03-11.orig/mm/page_alloc.c 2011-01-31 14:53:38.000000000 +0100 +++ head-2011-03-11/mm/page_alloc.c 2011-02-08 10:03:14.000000000 +0100 @@ -5004,6 +5004,23 @@ static void __setup_per_zone_wmarks(void spin_unlock_irqrestore(&zone->lock, flags); } +#ifdef CONFIG_XEN + for_each_zone(zone) { + unsigned int cpu; + + if (!populated_zone(zone)) + continue; + for_each_online_cpu(cpu) { + unsigned long high; + + high = percpu_pagelist_fraction + ? zone->present_pages / percpu_pagelist_fraction + : 5 * zone_batchsize(zone); + setup_pagelist_highmark(zone_pcp(zone, cpu), high); + } + } +#endif + /* update totalreserve_pages */ calculate_totalreserve_pages(); }
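
The sync_cmpxchg() wrapper added in the system_32.h hunk above is needed
because a guest built without CONFIG_SMP may still share memory with the
hypervisor or with other domains, so the lock prefix can never be omitted
for such accesses. A minimal usage sketch follows; the helper name and its
flags layout are hypothetical, not part of this patch:

	/*
	 * Illustrative sketch only.  Atomically clear bits in a 16-bit
	 * flags word that lives in a page shared with the hypervisor.
	 * A plain cmpxchg() compiles without the lock prefix on a UP
	 * kernel, but another domain may touch the word concurrently,
	 * so the locked sync_cmpxchg() variant is required.
	 */
	static inline u16 clear_shared_flags(volatile u16 *flags, u16 mask)
	{
		u16 old, new;

		do {
			old = *flags;
			new = old & ~mask;
		} while (sync_cmpxchg(flags, old, new) != old);

		return old;	/* bits as the other side last saw them */
	}

This mirrors the grant-table flags handling for which the primitive exists:
both sides update the same word, so only the locked form is safe.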
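
The replacement ptep_clear_flush_dirty()/ptep_clear_flush_young() macros in
the pgtable_64.h hunk fold the flush into the bit clearing and branch on
mm->context.pinned: once the page tables have been handed to the hypervisor
read-only, the update must go through ptep_set_access_flags() so Xen can
validate the write, while an unpinned mm takes a direct set_pte() and skips
the work entirely when the bit was already clear. A sketch of the call
pattern (the caller below is hypothetical):

	/*
	 * Illustrative only: age a PTE during a reclaim-style scan.
	 * Which update path is taken (hypervisor-validated or direct)
	 * is hidden inside ptep_clear_flush_young(); the caller only
	 * sees whether the page had been referenced.
	 */
	static int age_one_pte(struct vm_area_struct *vma,
			       unsigned long address, pte_t *ptep)
	{
		return ptep_clear_flush_young(vma, address, ptep);
	}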
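
The !CONFIG_SMP smp_call_function_single() stub added to smp_64.h lets
callers use a single idiom on UP and SMP kernels alike: with only one CPU
the cross-call degenerates to a direct invocation. An illustrative caller
(all names here are hypothetical):

	/* Illustrative only: per-CPU work normally run on the target CPU. */
	static void drain_local_counters(void *unused)
	{
		/* ... */
	}

	static int drain_counters_on(int cpu)
	{
		/*
		 * On SMP this IPIs 'cpu' and, with wait == 1, returns only
		 * after the function has run there; with the UP stub above
		 * it simply calls drain_local_counters() inline.
		 */
		return smp_call_function_single(cpu, drain_local_counters,
						NULL, 0, 1);
	}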
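
The new include/xen/net-util.h centralizes the checksum fixup that the
network front- and backends both need: a non-CHECKSUM_PARTIAL skb passes
through untouched unless it is a GSO frame, in which case a buggy peer must
have failed to set NETRXF_csum_blank, so the partial checksum is recomputed
and the event counted through *fixup_counter. A hypothetical receive-path
caller might look like this:

	/* Illustrative only: counts GSO frames that needed the fixup. */
	static unsigned long rx_csum_fixups;

	static int example_rx_deliver(struct sk_buff *skb)
	{
		int err = skb_checksum_setup(skb, &rx_csum_fixups);

		if (err) {
			/* non-IP, non-TCP/UDP, or truncated headers */
			kfree_skb(skb);
			return err;
		}

		netif_rx(skb);
		return 0;
	}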