From: Linux Kernel Mailing List
Subject: Linux: 2.6.33
Patch-mainline: 2.6.33

This patch contains the differences between 2.6.32 and 2.6.33.

Acked-by: Jeff Mahoney
Automatically created from "patches.kernel.org/patch-2.6.33" by xen-port-patches.py

--- head-2010-05-12.orig/arch/ia64/include/asm/xen/hypervisor.h	2010-03-24 15:25:06.000000000 +0100
+++ head-2010-05-12/arch/ia64/include/asm/xen/hypervisor.h	2010-03-24 16:00:05.000000000 +0100
@@ -34,11 +34,11 @@
 #define _ASM_IA64_XEN_HYPERVISOR_H

 #include
+#include
 #ifdef CONFIG_PARAVIRT_XEN
 #include
 #include	/* to compile feature.c */
 #include	/* to comiple xen-netfront.c */
-#include
 #include

 extern struct shared_info *HYPERVISOR_shared_info;
--- head-2010-05-12.orig/arch/x86/Kconfig	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-12/arch/x86/Kconfig	2010-03-24 16:00:05.000000000 +0100
@@ -51,7 +51,7 @@ config X86
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2 if !XEN
 	select HAVE_KERNEL_LZMA if !XEN
-	select HAVE_KERNEL_LZO
+	select HAVE_KERNEL_LZO if !XEN
 	select HAVE_HW_BREAKPOINT
 	select PERF_EVENTS
 	select ANON_INODES
--- head-2010-05-12.orig/arch/x86/ia32/ia32entry-xen.S	2010-03-24 15:32:27.000000000 +0100
+++ head-2010-05-12/arch/x86/ia32/ia32entry-xen.S	2010-03-24 16:00:05.000000000 +0100
@@ -546,7 +546,7 @@ ia32_sys_call_table:
 	.quad compat_sys_writev
 	.quad sys_getsid
 	.quad sys_fdatasync
-	.quad sys32_sysctl	/* sysctl */
+	.quad compat_sys_sysctl	/* sysctl */
 	.quad sys_mlock		/* 150 */
 	.quad sys_munlock
 	.quad sys_mlockall
@@ -589,7 +589,7 @@ ia32_sys_call_table:
 	.quad quiet_ni_syscall	/* streams2 */
 	.quad stub32_vfork	/* 190 */
 	.quad compat_sys_getrlimit
-	.quad sys32_mmap2
+	.quad sys_mmap_pgoff
 	.quad sys32_truncate64
 	.quad sys32_ftruncate64
 	.quad sys32_stat64	/* 195 */
@@ -734,4 +734,5 @@ ia32_sys_call_table:
 	.quad compat_sys_pwritev
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
 	.quad sys_perf_event_open
+	.quad compat_sys_recvmmsg
 ia32_syscall_end:
--- head-2010-05-12.orig/arch/x86/include/asm/hw_irq.h	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-12/arch/x86/include/asm/hw_irq.h	2010-03-24 16:00:05.000000000 +0100
@@ -78,6 +78,7 @@ static inline void set_io_apic_irq_attr(
 	irq_attr->polarity	= polarity;
 }

+#ifndef CONFIG_XEN
 /*
  * This is performance-critical, we want to do it O(1)
  *
@@ -92,6 +93,9 @@ struct irq_cfg {
 };

 extern struct irq_cfg *irq_cfg(unsigned int);
+#else
+struct irq_cfg;
+#endif

 extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
 extern void send_cleanup_vector(struct irq_cfg *);
--- head-2010-05-12.orig/arch/x86/include/mach-xen/asm/pgtable.h	2010-03-24 15:32:27.000000000 +0100
+++ head-2010-05-12/arch/x86/include/mach-xen/asm/pgtable.h	2010-03-24 16:00:05.000000000 +0100
@@ -16,6 +16,8 @@
 #ifndef __ASSEMBLY__

+#include
+
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
@@ -268,9 +270,9 @@ static inline int is_new_memtype_allowed
 					 unsigned long new_flags)
 {
 	/*
-	 * PAT type is always WB for ISA. So no need to check.
+	 * PAT type is always WB for untracked ranges, so no need to check.
*/ - if (is_ISA_range(paddr, paddr + size - 1)) + if (x86_platform.is_untracked_pat_range(paddr, paddr + size)) return 1; /* --- head-2010-05-12.orig/arch/x86/include/mach-xen/asm/processor.h 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/include/mach-xen/asm/processor.h 2010-03-24 16:00:05.000000000 +0100 @@ -31,6 +31,7 @@ struct mm_struct; #include #include +#define HBP_NUM 4 /* * Default implementation of macro that returns current * instruction pointer ("program counter"). @@ -181,7 +182,7 @@ static inline void xen_cpuid(unsigned in unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ - asm(XEN_CPUID + asm volatile(XEN_CPUID : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), @@ -430,6 +431,8 @@ extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); extern struct kmem_cache *task_xstate_cachep; +struct perf_event; + struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; @@ -450,13 +453,12 @@ struct thread_struct { unsigned long fs; #endif unsigned long gs; - /* Hardware debugging registers: */ - unsigned long debugreg0; - unsigned long debugreg1; - unsigned long debugreg2; - unsigned long debugreg3; - unsigned long debugreg6; - unsigned long debugreg7; + /* Save middle states of ptrace breakpoints */ + struct perf_event *ptrace_bps[HBP_NUM]; + /* Debug status used for traps, single steps, etc... */ + unsigned long debugreg6; + /* Keep track of the exact dr7 value set by the user */ + unsigned long ptrace_dr7; /* Fault info: */ unsigned long cr2; unsigned long trap_no; --- head-2010-05-12.orig/arch/x86/include/mach-xen/asm/spinlock.h 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-12/arch/x86/include/mach-xen/asm/spinlock.h 2010-03-24 16:00:05.000000000 +0100 @@ -44,10 +44,10 @@ int xen_spinlock_init(unsigned int cpu); void xen_spinlock_cleanup(unsigned int cpu); -bool xen_spin_wait(raw_spinlock_t *, unsigned int *token, +bool xen_spin_wait(arch_spinlock_t *, unsigned int *token, unsigned int flags); -unsigned int xen_spin_adjust(const raw_spinlock_t *, unsigned int token); -void xen_spin_kick(raw_spinlock_t *, unsigned int token); +unsigned int xen_spin_adjust(const arch_spinlock_t *, unsigned int token); +void xen_spin_kick(arch_spinlock_t *, unsigned int token); /* * Ticket locks are conceptually two parts, one indicating the current head of @@ -97,7 +97,7 @@ void xen_spin_kick(raw_spinlock_t *, uns : \ : "memory", "cc") -static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) +static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { int tmp, new; @@ -160,7 +160,7 @@ static __always_inline int __ticket_spin : "memory", "cc"); \ } while (0) -static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) +static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { int tmp; int new; @@ -183,21 +183,21 @@ static __always_inline int __ticket_spin } #endif -static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) +static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { int tmp = ACCESS_ONCE(lock->slock); return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); } -static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) +static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) { int tmp = ACCESS_ONCE(lock->slock); return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1; } -static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) 
+static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { unsigned int token, count; unsigned int flags = __raw_local_irq_save(); @@ -216,7 +216,7 @@ static __always_inline void __ticket_spi } while (unlikely(!count) && !xen_spin_wait(lock, &token, flags)); } -static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock, +static __always_inline void __ticket_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { unsigned int token, count; @@ -232,7 +232,7 @@ static __always_inline void __ticket_spi } while (unlikely(!count) && !xen_spin_wait(lock, &token, flags)); } -static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { unsigned int token; bool kick; @@ -248,24 +248,24 @@ static __always_inline void __ticket_spi #undef __ticket_spin_unlock_body #endif -#define __raw_spin(n) __ticket_spin_##n +#define __arch_spin(n) __ticket_spin_##n #else /* TICKET_SHIFT */ static inline int xen_spinlock_init(unsigned int cpu) { return 0; } static inline void xen_spinlock_cleanup(unsigned int cpu) {} -static inline int __byte_spin_is_locked(raw_spinlock_t *lock) +static inline int __byte_spin_is_locked(arch_spinlock_t *lock) { return lock->lock != 0; } -static inline int __byte_spin_is_contended(raw_spinlock_t *lock) +static inline int __byte_spin_is_contended(arch_spinlock_t *lock) { return lock->spinners != 0; } -static inline void __byte_spin_lock(raw_spinlock_t *lock) +static inline void __byte_spin_lock(arch_spinlock_t *lock) { s8 val = 1; @@ -284,7 +284,7 @@ static inline void __byte_spin_lock(raw_ #define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock) -static inline int __byte_spin_trylock(raw_spinlock_t *lock) +static inline int __byte_spin_trylock(arch_spinlock_t *lock) { u8 old = 1; @@ -294,53 +294,53 @@ static inline int __byte_spin_trylock(ra return old == 0; } -static inline void __byte_spin_unlock(raw_spinlock_t *lock) +static inline void __byte_spin_unlock(arch_spinlock_t *lock) { smp_wmb(); lock->lock = 0; } -#define __raw_spin(n) __byte_spin_##n +#define __arch_spin(n) __byte_spin_##n #endif /* TICKET_SHIFT */ -static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - return __raw_spin(is_locked)(lock); + return __arch_spin(is_locked)(lock); } -static inline int __raw_spin_is_contended(raw_spinlock_t *lock) +static inline int arch_spin_is_contended(arch_spinlock_t *lock) { - return __raw_spin(is_contended)(lock); + return __arch_spin(is_contended)(lock); } -#define __raw_spin_is_contended __raw_spin_is_contended +#define arch_spin_is_contended arch_spin_is_contended -static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) +static __always_inline void arch_spin_lock(arch_spinlock_t *lock) { - __raw_spin(lock)(lock); + __arch_spin(lock)(lock); } -static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) +static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) { - return __raw_spin(trylock)(lock); + return __arch_spin(trylock)(lock); } -static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) +static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { - __raw_spin(unlock)(lock); + __arch_spin(unlock)(lock); } -static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock, +static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { - __raw_spin(lock_flags)(lock, flags); + 
__arch_spin(lock_flags)(lock, flags); } -#undef __raw_spin +#undef __arch_spin -static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { - while (__raw_spin_is_locked(lock)) + while (arch_spin_is_locked(lock)) cpu_relax(); } @@ -362,7 +362,7 @@ static inline void __raw_spin_unlock_wai * read_can_lock - would read_trylock() succeed? * @lock: the rwlock in question. */ -static inline int __raw_read_can_lock(raw_rwlock_t *lock) +static inline int arch_read_can_lock(arch_rwlock_t *lock) { return (int)(lock)->lock > 0; } @@ -371,12 +371,12 @@ static inline int __raw_read_can_lock(ra * write_can_lock - would write_trylock() succeed? * @lock: the rwlock in question. */ -static inline int __raw_write_can_lock(raw_rwlock_t *lock) +static inline int arch_write_can_lock(arch_rwlock_t *lock) { return (lock)->lock == RW_LOCK_BIAS; } -static inline void __raw_read_lock(raw_rwlock_t *rw) +static inline void arch_read_lock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" "jns 1f\n" @@ -385,7 +385,7 @@ static inline void __raw_read_lock(raw_r ::LOCK_PTR_REG (rw) : "memory"); } -static inline void __raw_write_lock(raw_rwlock_t *rw) +static inline void arch_write_lock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" "jz 1f\n" @@ -394,7 +394,7 @@ static inline void __raw_write_lock(raw_ ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); } -static inline int __raw_read_trylock(raw_rwlock_t *lock) +static inline int arch_read_trylock(arch_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; @@ -404,7 +404,7 @@ static inline int __raw_read_trylock(raw return 0; } -static inline int __raw_write_trylock(raw_rwlock_t *lock) +static inline int arch_write_trylock(arch_rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; @@ -414,23 +414,23 @@ static inline int __raw_write_trylock(ra return 0; } -static inline void __raw_read_unlock(raw_rwlock_t *rw) +static inline void arch_read_unlock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); } -static inline void __raw_write_unlock(raw_rwlock_t *rw) +static inline void arch_write_unlock(arch_rwlock_t *rw) { asm volatile(LOCK_PREFIX "addl %1, %0" : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); } -#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) -#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) +#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) +#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#define _raw_spin_relax(lock) cpu_relax() -#define _raw_read_relax(lock) cpu_relax() -#define _raw_write_relax(lock) cpu_relax() +#define arch_spin_relax(lock) cpu_relax() +#define arch_read_relax(lock) cpu_relax() +#define arch_write_relax(lock) cpu_relax() /* The {read|write|spin}_lock() on x86 are full memory barriers. 
*/ static inline void smp_mb__after_lock(void) { } --- head-2010-05-12.orig/arch/x86/include/mach-xen/asm/spinlock_types.h 2010-03-24 15:25:06.000000000 +0100 +++ head-2010-05-12/arch/x86/include/mach-xen/asm/spinlock_types.h 2010-03-24 16:00:05.000000000 +0100 @@ -42,14 +42,14 @@ typedef union { #endif #endif }; -} raw_spinlock_t; +} arch_spinlock_t; -#define __RAW_SPIN_LOCK_UNLOCKED { 0 } +#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } typedef struct { unsigned int lock; -} raw_rwlock_t; +} arch_rwlock_t; -#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } +#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } #endif /* _ASM_X86_SPINLOCK_TYPES_H */ --- head-2010-05-12.orig/arch/x86/include/mach-xen/asm/swiotlb.h 2010-03-24 15:12:36.000000000 +0100 +++ head-2010-05-12/arch/x86/include/mach-xen/asm/swiotlb.h 2010-03-24 16:00:05.000000000 +0100 @@ -1,4 +1,6 @@ #include_next +#define pci_swiotlb_detect() 1 + dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size, int dir); --- head-2010-05-12.orig/arch/x86/include/mach-xen/asm/system.h 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/include/mach-xen/asm/system.h 2010-03-24 16:00:05.000000000 +0100 @@ -12,9 +12,9 @@ #include /* entries in ARCH_DLINFO: */ -#ifdef CONFIG_IA32_EMULATION +#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) # define AT_VECTOR_SIZE_ARCH 2 -#else +#else /* else it's non-compat x86-64 */ # define AT_VECTOR_SIZE_ARCH 1 #endif @@ -22,6 +22,7 @@ struct task_struct; /* one of the strang struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); +extern void show_regs_common(void); #ifdef CONFIG_X86_32 @@ -127,8 +128,6 @@ do { \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ "call __switch_to\n\t" \ - ".globl thread_return\n" \ - "thread_return:\n\t" \ "movq "__percpu_arg([current_task])",%%rsi\n\t" \ __switch_canary \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ @@ -156,19 +155,22 @@ extern void xen_load_gs_index(unsigned); * Load a segment. Fall back on loading the zero * segment if something goes wrong.. */ -#define loadsegment(seg, value) \ - asm volatile("\n" \ - "1:\t" \ - "movl %k0,%%" #seg "\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3:\t" \ - "movl %k1, %%" #seg "\n\t" \ - "jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b,3b) \ - : :"r" (value), "r" (0) : "memory") - +#define loadsegment(seg, value) \ +do { \ + unsigned short __val = (value); \ + \ + asm volatile(" \n" \ + "1: movl %k0,%%" #seg " \n" \ + \ + ".section .fixup,\"ax\" \n" \ + "2: xorl %k0,%k0 \n" \ + " jmp 1b \n" \ + ".previous \n" \ + \ + _ASM_EXTABLE(1b, 2b) \ + \ + : "+r" (__val) : : "memory"); \ +} while (0) /* * Save a segment register away --- head-2010-05-12.orig/arch/x86/kernel/acpi/sleep-xen.c 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/acpi/sleep-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -81,12 +81,9 @@ int acpi_save_state_mem(void) #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); - header->pmode_efer_low = nx_enabled; - if (header->pmode_efer_low & 1) { - /* This is strange, why not save efer, always? 
*/ - rdmsr(MSR_EFER, header->pmode_efer_low, - header->pmode_efer_high); - } + if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low, + &header->pmode_efer_high)) + header->pmode_efer_low = header->pmode_efer_high = 0; #endif /* !CONFIG_64BIT */ header->pmode_cr0 = read_cr0(); @@ -123,30 +120,33 @@ void acpi_restore_state_mem(void) /** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation + * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation * * We allocate a page from the first 1MB of memory for the wakeup * routine for when we come back from a sleep state. The * runtime allocator allows specification of <16MB pages, but not * <1MB pages. */ -void __init acpi_reserve_bootmem(void) +void __init acpi_reserve_wakeup_memory(void) { #ifndef CONFIG_ACPI_PV_SLEEP + unsigned long mem; + if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { printk(KERN_ERR "ACPI: Wakeup code way too big, S3 disabled.\n"); return; } - acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); + mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); - if (!acpi_realmode) { + if (mem == -1L) { printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); return; } - - acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); + acpi_realmode = (unsigned long) phys_to_virt(mem); + acpi_wakeup_address = mem; + reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); #endif } @@ -169,6 +169,8 @@ static int __init acpi_sleep_setup(char #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); + if (strncmp(str, "sci_force_enable", 16) == 0) + acpi_set_sci_en_on_resume(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); --- head-2010-05-12.orig/arch/x86/kernel/apic/Makefile 2010-03-24 15:25:06.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/apic/Makefile 2010-03-24 16:00:05.000000000 +0100 @@ -22,5 +22,5 @@ obj-$(CONFIG_XEN) += nmi.o probe_64-$(CONFIG_XEN) := probe_32.o -disabled-obj-$(CONFIG_XEN) := apic_flat_$(BITS).o +disabled-obj-$(CONFIG_XEN) := apic_flat_$(BITS).o apic_noop.o disabled-obj-$(filter-out $(CONFIG_SMP),$(CONFIG_XEN)) += ipi.o --- head-2010-05-12.orig/arch/x86/kernel/apic/io_apic-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/apic/io_apic-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -60,8 +60,6 @@ #include #include #include -#include -#include #include @@ -160,20 +158,6 @@ static struct irq_pin_list *get_one_free return pin; } -/* - * This is performance-critical, we want to do it O(1) - * - * Most irqs are mapped 1:1 with pins. - */ -struct irq_cfg { - struct irq_pin_list *irq_2_pin; - cpumask_var_t domain; - cpumask_var_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ #ifdef CONFIG_SPARSE_IRQ static struct irq_cfg irq_cfgx[] = { @@ -229,7 +213,7 @@ int __init arch_early_irq_init(void) } #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { struct irq_cfg *cfg = NULL; struct irq_desc *desc; @@ -381,7 +365,7 @@ void arch_free_chip_data(struct irq_desc /* end for move_irq_desc */ #else -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { return irq < nr_irqs ? 
irq_cfgx + irq : NULL; } @@ -604,23 +588,41 @@ static void __init replace_pin_at_irq_no add_pin_to_irq_node(cfg, node, newapic, newpin); } +static void __io_apic_modify_irq(struct irq_pin_list *entry, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + unsigned int reg, pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); +} + static void io_apic_modify_irq(struct irq_cfg *cfg, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { - int pin; struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin * 2); - reg &= mask_and; - reg |= mask_or; - io_apic_modify(entry->apic, 0x10 + pin * 2, reg); - if (final) - final(entry); - } + for_each_irq_pin(entry, cfg->irq_2_pin) + __io_apic_modify_irq(entry, mask_and, mask_or, final); +} + +static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry) +{ + __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED, NULL); +} + +static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry) +{ + __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) @@ -644,18 +646,6 @@ static void __mask_IO_APIC_irq(struct ir io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED, NULL); -} - -static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER, NULL); -} - static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { struct irq_cfg *cfg = desc->chip_data; @@ -1235,7 +1225,7 @@ __assign_irq_vector(int irq, struct irq_ int cpu, err; cpumask_var_t tmp_mask; - if ((cfg->move_in_progress) || cfg->move_cleanup_count) + if (cfg->move_in_progress) return -EBUSY; if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) @@ -1299,8 +1289,7 @@ next: return err; } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { int err; unsigned long flags; @@ -1677,9 +1666,6 @@ __apicdebuginit(void) print_IO_APIC(void struct irq_desc *desc; unsigned int irq; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", @@ -1786,9 +1772,6 @@ __apicdebuginit(void) print_APIC_field(i { int i; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG); for (i = 0; i < 8; i++) @@ -1802,9 +1785,6 @@ __apicdebuginit(void) print_local_APIC(v unsigned int i, v, ver, maxlvt; u64 icr; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); @@ -1902,13 +1882,19 @@ __apicdebuginit(void) print_local_APIC(v printk("\n"); } -__apicdebuginit(void) print_all_local_APICs(void) +__apicdebuginit(void) print_local_APICs(int maxcpu) { int cpu; + if (!maxcpu) + return; + preempt_disable(); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + if (cpu >= maxcpu) + break; 
smp_call_function_single(cpu, print_local_APIC, NULL, 1); + } preempt_enable(); } @@ -1917,7 +1903,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) + if (!nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1944,21 +1930,41 @@ __apicdebuginit(void) print_PIC(void) printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); } -__apicdebuginit(int) print_all_ICs(void) +static int __initdata show_lapic = 1; +static __init int setup_show_lapic(char *arg) +{ + int num = -1; + + if (strcmp(arg, "all") == 0) { + show_lapic = CONFIG_NR_CPUS; + } else { + get_option(&arg, &num); + if (num >= 0) + show_lapic = num; + } + + return 1; +} +__setup("show_lapic=", setup_show_lapic); + +__apicdebuginit(int) print_ICs(void) { + if (apic_verbosity == APIC_QUIET) + return 0; + print_PIC(); /* don't print out if apic is not there */ if (!cpu_has_apic && !apic_from_smp_config()) return 0; - print_all_local_APICs(); + print_local_APICs(show_lapic); print_IO_APIC(); return 0; } -fs_initcall(print_all_ICs); +fs_initcall(print_ICs); /* Where if anywhere is the i8259 connect in external int mode */ @@ -2117,7 +2123,7 @@ void __init setup_ioapic_ids_from_mpc(vo * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. */ - phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); + apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); /* * Set the IOAPIC ID to the value stored in the MPC table. @@ -2144,7 +2150,7 @@ void __init setup_ioapic_ids_from_mpc(vo * system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ - if (apic->check_apicid_used(phys_id_present_map, + if (apic->check_apicid_used(&phys_id_present_map, mp_ioapics[apic_id].apicid)) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", apic_id, mp_ioapics[apic_id].apicid); @@ -2159,7 +2165,7 @@ void __init setup_ioapic_ids_from_mpc(vo mp_ioapics[apic_id].apicid = i; } else { physid_mask_t tmp; - tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); + apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", mp_ioapics[apic_id].apicid); @@ -2314,20 +2320,16 @@ static int ioapic_retrigger_irq(unsigned */ #ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) +void send_cleanup_vector(struct irq_cfg *cfg) { cpumask_var_t cleanup_mask; if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); } @@ -2358,31 +2360,30 @@ static void __target_IO_APIC_irq(unsigne } } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - /* * Either sets desc->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and * leaves desc->affinity untouched. 
*/ -static unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, + unsigned int *dest_id) { struct irq_cfg *cfg; unsigned int irq; if (!cpumask_intersects(mask, cpu_online_mask)) - return BAD_APICID; + return -1; irq = desc->irq; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return BAD_APICID; + return -1; cpumask_copy(desc->affinity, mask); - return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); + *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); + return 0; } static int @@ -2398,12 +2399,11 @@ set_ioapic_affinity_irq_desc(struct irq_ cfg = desc->chip_data; spin_lock_irqsave(&ioapic_lock, flags); - dest = set_desc_affinity(desc, mask); - if (dest != BAD_APICID) { + ret = set_desc_affinity(desc, mask, &dest); + if (!ret) { /* Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); - ret = 0; } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2518,8 +2518,13 @@ asmlinkage void smp_irq_move_cleanup_int continue; cfg = irq_cfg(irq); - spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) + raw_spin_lock(&desc->lock); + + /* + * Check if the irq migration is in progress. If so, we + * haven't received the cleanup request yet for this irq. + */ + if (cfg->move_in_progress) goto unlock; if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) @@ -2538,29 +2543,40 @@ asmlinkage void smp_irq_move_cleanup_int goto unlock; } __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; unlock: - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } irq_exit(); } -static void irq_complete_move(struct irq_desc **descp) +static void __irq_complete_move(struct irq_desc **descp, unsigned vector) { struct irq_desc *desc = *descp; struct irq_cfg *cfg = desc->chip_data; - unsigned vector, me; + unsigned me; if (likely(!cfg->move_in_progress)) return; - vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); } + +static void irq_complete_move(struct irq_desc **descp) +{ + __irq_complete_move(descp, ~get_irq_regs()->orig_ax); +} + +void irq_force_complete_move(int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + + __irq_complete_move(&desc, cfg->vector); +} #else static inline void irq_complete_move(struct irq_desc **descp) {} #endif @@ -2576,6 +2592,59 @@ static void ack_apic_edge(unsigned int i atomic_t irq_mis_count; +/* + * IO-APIC versions below 0x20 don't support EOI register. + * For the record, here is the information about various versions: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic + * version as 0x2. This is an error with documentation and these ICH chips + * use io-apic's of version 0x20. + * + * For IO-APIC's with EOI register, we use that to do an explicit EOI. + * Otherwise, we simulate the EOI message manually by changing the trigger + * mode to edge and then back to level, with RTE being masked during this. 
+*/ +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + if (mp_ioapics[entry->apic].apicver >= 0x20) { + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + if (irq_remapped(irq)) + io_apic_eoi(entry->apic, entry->pin); + else + io_apic_eoi(entry->apic, cfg->vector); + } else { + __mask_and_edge_IO_APIC_irq(entry); + __unmask_and_level_IO_APIC_irq(entry); + } + } +} + +static void eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2611,6 +2680,19 @@ static void ack_apic_level(unsigned int * level-triggered interrupt. We mask the source for the time of the * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro + * + * Also in the case when cpu goes offline, fixup_irqs() will forward + * any unhandled interrupt on the offlined cpu to the new cpu + * destination that is handling the corresponding interrupt. This + * interrupt forwarding is done via IPI's. Hence, in this case also + * level-triggered io-apic interrupt will be seen as an edge + * interrupt in the IRR. And we can't rely on the cpu's EOI + * to be broadcasted to the IO-APIC's which will clear the remoteIRR + * corresponding to the level-triggered interrupt. Hence on IO-APIC's + * supporting EOI register, we do an explicit EOI to clear the + * remote IRR and on IO-APIC's which don't have an EOI register, + * we use the above logic (mask+edge followed by unmask+level) from + * Manfred Spraul to clear the remote IRR. */ cfg = desc->chip_data; i = cfg->vector; @@ -2622,6 +2704,19 @@ static void ack_apic_level(unsigned int */ ack_APIC_irq(); + /* + * Tail end of clearing remote IRR bit (either by delivering the EOI + * message via io-apic EOI register write or simulating it using + * mask+edge followed by unnask+level logic) manually when the + * level triggered interrupt is seen as the edge triggered interrupt + * at the cpu. + */ + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + + eoi_ioapic_irq(desc); + } + /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. 
@@ -2655,41 +2750,9 @@ static void ack_apic_level(unsigned int move_masked_irq(irq); unmask_IO_APIC_irq_desc(desc); } - - /* Tail end of version 0x11 I/O APIC bug workaround */ - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(cfg); - __unmask_and_level_IO_APIC_irq(cfg); - spin_unlock(&ioapic_lock); - } } #ifdef CONFIG_INTR_REMAP -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - - for_each_irq_pin(entry, cfg->irq_2_pin) - io_apic_eoi(entry->apic, entry->pin); -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - static void ir_ack_apic_edge(unsigned int irq) { ack_APIC_irq(); @@ -3267,6 +3330,7 @@ unsigned int create_irq_nr(unsigned int continue; desc_new = move_irq_desc(desc_new, node); + cfg_new = desc_new->chip_data; if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; @@ -3322,7 +3386,8 @@ void destroy_irq(unsigned int irq) * MSI message composition */ #if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, + struct msi_msg *msg, u8 hpet_id) { struct irq_cfg *cfg; int err; @@ -3356,7 +3421,10 @@ static int msi_compose_msg(struct pci_de irte.dest_id = IRTE_DEST(dest); /* Set source-id of interrupt request */ - set_msi_sid(&irte, pdev); + if (pdev) + set_msi_sid(&irte, pdev); + else + set_hpet_sid(&irte, hpet_id); modify_irte(irq, &irte); @@ -3402,8 +3470,7 @@ static int set_msi_irq_affinity(unsigned struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3435,8 +3502,7 @@ ir_set_msi_irq_affinity(unsigned int irq if (get_irte(irq, &irte)) return -1; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; irte.vector = cfg->vector; @@ -3521,7 +3587,7 @@ static int setup_msi_irq(struct pci_dev int ret; struct msi_msg msg; - ret = msi_compose_msg(dev, irq, &msg); + ret = msi_compose_msg(dev, irq, &msg, -1); if (ret < 0) return ret; @@ -3618,8 +3684,7 @@ static int dmar_msi_set_affinity(unsigne struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3654,7 +3719,7 @@ int arch_setup_dmar_msi(unsigned int irq int ret; struct msi_msg msg; - ret = msi_compose_msg(NULL, irq, &msg); + ret = msi_compose_msg(NULL, irq, &msg, -1); if (ret < 0) return ret; dmar_msi_write(irq, &msg); @@ -3674,8 +3739,7 @@ static int hpet_msi_set_affinity(unsigne struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3694,6 +3758,19 @@ static int hpet_msi_set_affinity(unsigne #endif /* CONFIG_SMP */ +static struct irq_chip ir_hpet_msi_type = { + .name = "IR-HPET_MSI", + .unmask = hpet_msi_unmask, + .mask = hpet_msi_mask, +#ifdef CONFIG_INTR_REMAP + .ack = ir_ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, 
+#endif
+#endif
+	.retrigger = ioapic_retrigger_irq,
+};
+
 static struct irq_chip hpet_msi_type = {
 	.name = "HPET_MSI",
 	.unmask = hpet_msi_unmask,
@@ -3705,20 +3782,36 @@ static struct irq_chip hpet_msi_type = {
 	.retrigger = ioapic_retrigger_irq,
 };

-int arch_setup_hpet_msi(unsigned int irq)
+int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
 {
 	int ret;
 	struct msi_msg msg;
 	struct irq_desc *desc = irq_to_desc(irq);

-	ret = msi_compose_msg(NULL, irq, &msg);
+	if (intr_remapping_enabled) {
+		struct intel_iommu *iommu = map_hpet_to_ir(id);
+		int index;
+
+		if (!iommu)
+			return -1;
+
+		index = alloc_irte(iommu, irq, 1);
+		if (index < 0)
+			return -1;
+	}
+
+	ret = msi_compose_msg(NULL, irq, &msg, id);
 	if (ret < 0)
 		return ret;

 	hpet_msi_write(irq, &msg);
 	desc->status |= IRQ_MOVE_PCNTXT;
-	set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
-				      "edge");
+	if (irq_remapped(irq))
+		set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
+					      handle_edge_irq, "edge");
+	else
+		set_irq_chip_and_handler_name(irq, &hpet_msi_type,
+					      handle_edge_irq, "edge");

 	return 0;
 }
@@ -3752,8 +3845,7 @@ static int set_ht_irq_affinity(unsigned
 	struct irq_cfg *cfg;
 	unsigned int dest;

-	dest = set_desc_affinity(desc, mask);
-	if (dest == BAD_APICID)
+	if (set_desc_affinity(desc, mask, &dest))
 		return -1;

 	cfg = desc->chip_data;
@@ -3819,75 +3911,6 @@ int arch_setup_ht_irq(unsigned int irq,
 }
 #endif /* CONFIG_HT_IRQ */

-#ifdef CONFIG_X86_UV
-/*
- * Re-target the irq to the specified CPU and enable the specified MMR located
- * on the specified blade to allow the sending of MSIs to the specified CPU.
- */
-int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
-		       unsigned long mmr_offset)
-{
-	const struct cpumask *eligible_cpu = cpumask_of(cpu);
-	struct irq_cfg *cfg;
-	int mmr_pnode;
-	unsigned long mmr_value;
-	struct uv_IO_APIC_route_entry *entry;
-	unsigned long flags;
-	int err;
-
-	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
-	cfg = irq_cfg(irq);
-
-	err = assign_irq_vector(irq, cfg, eligible_cpu);
-	if (err != 0)
-		return err;
-
-	spin_lock_irqsave(&vector_lock, flags);
-	set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
-				      irq_name);
-	spin_unlock_irqrestore(&vector_lock, flags);
-
-	mmr_value = 0;
-	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-	entry->vector = cfg->vector;
-	entry->delivery_mode = apic->irq_delivery_mode;
-	entry->dest_mode = apic->irq_dest_mode;
-	entry->polarity = 0;
-	entry->trigger = 0;
-	entry->mask = 0;
-	entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
-
-	mmr_pnode = uv_blade_to_pnode(mmr_blade);
-	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
-
-	if (cfg->move_in_progress)
-		send_cleanup_vector(cfg);
-
-	return irq;
-}
-
-/*
- * Disable the specified MMR located on the specified blade so that MSIs are
- * no longer allowed to be sent.
- */ -void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) -{ - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->mask = 1; - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); -} -#endif /* CONFIG_X86_64 */ - int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; @@ -4065,7 +4088,7 @@ int __init io_apic_get_unique_id(int ioa */ if (physids_empty(apic_id_map)) - apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); + apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); @@ -4081,10 +4104,10 @@ int __init io_apic_get_unique_id(int ioa * Every APIC in a system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ - if (apic->check_apicid_used(apic_id_map, apic_id)) { + if (apic->check_apicid_used(&apic_id_map, apic_id)) { for (i = 0; i < get_physical_broadcast(); i++) { - if (!apic->check_apicid_used(apic_id_map, i)) + if (!apic->check_apicid_used(&apic_id_map, i)) break; } @@ -4097,7 +4120,7 @@ int __init io_apic_get_unique_id(int ioa apic_id = i; } - tmp = apic->apicid_to_cpu_present(apic_id); + apic->apicid_to_cpu_present(apic_id, &tmp); physids_or(apic_id_map, apic_id_map, tmp); if (reg_00.bits.ID != apic_id) { @@ -4229,7 +4252,7 @@ static struct resource * __init ioapic_s for (i = 0; i < nr_ioapics; i++) { res[i].name = mem; res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; } @@ -4263,18 +4286,17 @@ void __init ioapic_init_mappings(void) #ifdef CONFIG_X86_32 fake_ioapic_page: #endif - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); + apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), + ioapic_phys); idx++; ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; ioapic_res++; } } --- head-2010-05-12.orig/arch/x86/kernel/cpu/Makefile 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/cpu/Makefile 2010-03-24 16:00:05.000000000 +0100 @@ -34,7 +34,7 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o -disabled-obj-$(CONFIG_XEN) := hypervisor.o sched.o vmware.o +disabled-obj-$(CONFIG_XEN) := hypervisor.o perf_event.o sched.o vmware.o quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ --- head-2010-05-12.orig/arch/x86/kernel/cpu/common-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/cpu/common-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -69,7 +69,7 @@ void __init setup_cpu_local_masks(void) static void __cpuinit default_init(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 - display_cacheinfo(c); + cpu_detect_cache_sizes(c); #else /* Not much we can do here... 
*/ /* Check if at least it has cpuid */ @@ -411,7 +411,7 @@ static void __cpuinit get_model_name(str } } -void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) +void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -419,8 +419,6 @@ void __cpuinit display_cacheinfo(struct if (n >= 0x80000005) { cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); c->x86_cache_size = (ecx>>24) + (edx>>24); #ifdef CONFIG_X86_64 /* On K8 L1 TLB is inclusive, so don't count it */ @@ -450,9 +448,6 @@ void __cpuinit display_cacheinfo(struct #endif c->x86_cache_size = l2size; - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - l2size, ecx & 0xFF); } void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -460,6 +455,7 @@ void __cpuinit detect_ht(struct cpuinfo_ #ifdef CONFIG_X86_HT u32 eax, ebx, ecx, edx; int index_msb, core_bits; + static bool printed; if (!cpu_has(c, X86_FEATURE_HT)) return; @@ -475,7 +471,7 @@ void __cpuinit detect_ht(struct cpuinfo_ smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n"); goto out; } @@ -502,11 +498,12 @@ void __cpuinit detect_ht(struct cpuinfo_ ((1 << core_bits) - 1); out: - if ((c->x86_max_cores * smp_num_siblings) > 1) { + if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); + printed = 1; } #endif } @@ -687,24 +684,31 @@ void __init early_cpu_init(void) const struct cpu_dev *const *cdev; int count = 0; +#ifdef PROCESSOR_SELECT printk(KERN_INFO "KERNEL supported cpus:\n"); +#endif + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { const struct cpu_dev *cpudev = *cdev; - unsigned int j; if (count >= X86_VENDOR_NUM) break; cpu_devs[count] = cpudev; count++; - for (j = 0; j < 2; j++) { - if (!cpudev->c_ident[j]) - continue; - printk(KERN_INFO " %s %s\n", cpudev->c_vendor, - cpudev->c_ident[j]); +#ifdef PROCESSOR_SELECT + { + unsigned int j; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + printk(KERN_INFO " %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); + } } +#endif } - early_identify_cpu(&boot_cpu_data); } @@ -867,10 +871,8 @@ static void __cpuinit identify_cpu(struc boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } -#ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. 
*/ - mcheck_init(c); -#endif + mcheck_cpu_init(c); select_idle_routine(c); @@ -899,9 +901,15 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif +#ifndef CONFIG_XEN init_hw_perf_events(); +#endif } +#ifdef CONFIG_XEN +void set_perf_event_pending(void) {} +#endif + void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) { BUG_ON(c == &boot_cpu_data); @@ -1149,7 +1157,7 @@ static void clear_all_debug_regs(void) void __cpuinit cpu_init(void) { #ifndef CONFIG_X86_NO_TSS - struct orig_ist *orig_ist; + struct orig_ist *oist; struct tss_struct *t; unsigned long v; int i; @@ -1163,7 +1171,7 @@ void __cpuinit cpu_init(void) xen_switch_pt(); #ifndef CONFIG_X86_NO_TSS t = &per_cpu(init_tss, cpu); - orig_ist = &per_cpu(orig_ist, cpu); + oist = &per_cpu(orig_ist, cpu); #endif #ifdef CONFIG_NUMA @@ -1177,7 +1185,7 @@ void __cpuinit cpu_init(void) if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) panic("CPU#%d already initialized!\n", cpu); - printk(KERN_INFO "Initializing CPU#%d\n", cpu); + pr_debug("Initializing CPU#%d\n", cpu); clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); @@ -1200,7 +1208,7 @@ void __cpuinit cpu_init(void) wrmsrl(MSR_KERNEL_GS_BASE, 0); barrier(); - check_efer(); + x86_configure_nx(); #ifdef CONFIG_X86_LOCAL_APIC if (cpu != 0) enable_x2apic(); @@ -1210,12 +1218,12 @@ void __cpuinit cpu_init(void) /* * set up and load the per-CPU TSS */ - if (!orig_ist->ist[0]) { + if (!oist->ist[0]) { char *estacks = per_cpu(exception_stacks, cpu); for (v = 0; v < N_EXCEPTION_STACKS; v++) { estacks += exception_stack_sizes[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = + oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } } --- head-2010-05-12.orig/arch/x86/kernel/e820-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/e820-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -771,7 +771,7 @@ core_initcall(e820_mark_nvs_memory); /* * Early reserved memory areas. */ -#define MAX_EARLY_RES 20 +#define MAX_EARLY_RES 32 struct early_res { u64 start, end; @@ -780,7 +780,15 @@ struct early_res { }; static struct early_res early_res[MAX_EARLY_RES] __initdata = { #ifndef CONFIG_XEN - { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ + { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE) + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 }, +#endif #endif {} }; --- head-2010-05-12.orig/arch/x86/kernel/entry_32-xen.S 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/entry_32-xen.S 2010-03-24 16:00:05.000000000 +0100 @@ -338,6 +338,10 @@ ENTRY(ret_from_fork) END(ret_from_fork) /* + * Interrupt exit functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" +/* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to * go as quickly as possible which is why some of this is @@ -387,6 +391,10 @@ need_resched: END(resume_kernel) #endif CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. 
*/ @@ -539,10 +547,14 @@ ENTRY(ia32pv_sysenter_target) .align 4 .long 1b,syscall_fault .previous - /* fall through */ + jmp system_call CFI_ENDPROC ENDPROC(ia32pv_sysenter_target) +/* + * syscall stub including irq exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway @@ -774,26 +786,69 @@ syscall_badsys: jmp resume_userspace END(syscall_badsys) CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* * System calls that need a pt_regs pointer. */ -#define PTREGSCALL(name) \ +#define PTREGSCALL0(name) \ ALIGN; \ ptregs_##name: \ leal 4(%esp),%eax; \ jmp sys_##name; -PTREGSCALL(iopl) -PTREGSCALL(fork) -PTREGSCALL(clone) -PTREGSCALL(vfork) -PTREGSCALL(execve) -PTREGSCALL(sigaltstack) -PTREGSCALL(sigreturn) -PTREGSCALL(rt_sigreturn) -PTREGSCALL(vm86) -PTREGSCALL(vm86old) +#define PTREGSCALL1(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%edx; \ + movl (PT_EBX+4)(%esp),%eax; \ + jmp sys_##name; + +#define PTREGSCALL2(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%ecx; \ + movl (PT_ECX+4)(%esp),%edx; \ + movl (PT_EBX+4)(%esp),%eax; \ + jmp sys_##name; + +#define PTREGSCALL3(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%eax; \ + pushl %eax; \ + movl PT_EDX(%eax),%ecx; \ + movl PT_ECX(%eax),%edx; \ + movl PT_EBX(%eax),%eax; \ + call sys_##name; \ + addl $4,%esp; \ + ret + +PTREGSCALL1(iopl) +PTREGSCALL0(fork) +PTREGSCALL0(vfork) +PTREGSCALL3(execve) +PTREGSCALL2(sigaltstack) +PTREGSCALL0(sigreturn) +PTREGSCALL0(rt_sigreturn) +PTREGSCALL2(vm86) +PTREGSCALL1(vm86old) + +/* Clone is an oddball. The 4th arg is in %edi */ + ALIGN; +ptregs_clone: + leal 4(%esp),%eax + pushl %eax + pushl PT_EDI(%eax) + movl PT_EDX(%eax),%ecx + movl PT_ECX(%eax),%edx + movl PT_EBX(%eax),%eax + call sys_clone + addl $8,%esp + ret #ifndef CONFIG_XEN .macro FIXUP_ESPFIX_STACK @@ -884,6 +939,10 @@ common_interrupt: ENDPROC(common_interrupt) CFI_ENDPROC +/* + * Irq entries should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ @@ -905,6 +964,8 @@ ENDPROC(name) #else #define UNWIND_ESPFIX_STACK + .pushsection .kprobes.text, "ax" + # A note on the "critical region" in our callback handler. # We want to avoid stacking callback handlers due to events occurring # during handling of the last event. 
To do this, we keep events disabled @@ -1205,16 +1266,16 @@ ENTRY(fixup_4gb_segment) jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) +/* + * End of kprobes section + */ + .popsection ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder CFI_STARTPROC - movl %edx,%eax - push %edx - CFI_ADJUST_CFA_OFFSET 4 - call *%ebx - push %eax - CFI_ADJUST_CFA_OFFSET 4 + movl %edi,%eax + call *%esi call do_exit ud2 # padding for call trace CFI_ENDPROC @@ -1315,17 +1376,14 @@ END(ftrace_graph_caller) .globl return_to_handler return_to_handler: - pushl $0 pushl %eax - pushl %ecx pushl %edx movl %ebp, %eax call ftrace_return_to_handler - movl %eax, 0xc(%esp) + movl %eax, %ecx popl %edx - popl %ecx popl %eax - ret + jmp *%ecx #endif #include --- head-2010-05-12.orig/arch/x86/kernel/entry_64-xen.S 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/entry_64-xen.S 2010-03-24 16:00:05.000000000 +0100 @@ -160,11 +160,11 @@ GLOBAL(return_to_handler) call ftrace_return_to_handler - movq %rax, 16(%rsp) + movq %rax, %rdi movq 8(%rsp), %rdx movq (%rsp), %rax - addq $16, %rsp - retq + addq $24, %rsp + jmp *%rdi #endif @@ -863,8 +863,8 @@ apicinterrupt UV_BAU_MESSAGE \ #endif apicinterrupt LOCAL_TIMER_VECTOR \ apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt GENERIC_INTERRUPT_VECTOR \ - generic_interrupt smp_generic_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR \ + x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ @@ -1093,63 +1093,20 @@ zeroentry coprocessor_error do_coprocess errorentry alignment_check do_alignment_check zeroentry simd_coprocessor_error do_simd_coprocessor_error -/* - * Create a kernel thread. - * - * C extern interface: - * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) - * - * asm input arguments: - * rdi: fn, rsi: arg, rdx: flags - */ -ENTRY(kernel_thread) - CFI_STARTPROC - FAKE_STACK_FRAME $child_rip - SAVE_ALL - - # rdi: flags, rsi: usp, rdx: will be &pt_regs - movq %rdx,%rdi - orq kernel_thread_flags(%rip),%rdi - movq $-1, %rsi - movq %rsp, %rdx - - xorl %r8d,%r8d - xorl %r9d,%r9d - - # clone now - call do_fork - movq %rax,RAX(%rsp) - xorl %edi,%edi - - /* - * It isn't worth to check for reschedule here, - * so internally to the x86_64 port you can rely on kernel_thread() - * not to reschedule the child before returning, this avoids the need - * of hacks for example to fork off the per-CPU idle tasks. - * [Hopefully no generic code relies on the reschedule -AK] - */ - RESTORE_ALL - UNFAKE_STACK_FRAME - ret - CFI_ENDPROC -END(kernel_thread) - -ENTRY(child_rip) +ENTRY(kernel_thread_helper) pushq $0 # fake return address CFI_STARTPROC /* * Here we are in the child and the registers are set as they were * at kernel_thread() invocation in the parent. */ - movq %rdi, %rax - movq %rsi, %rdi - call *%rax + call *%rsi # exit mov %eax, %edi call do_exit ud2 # padding for call trace CFI_ENDPROC -END(child_rip) +END(kernel_thread_helper) /* * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 
@@ -1329,12 +1286,17 @@ error_kernelspace: leaq irq_return(%rip),%rcx cmpq %rcx,RIP+8(%rsp) je error_swapgs - movl %ecx,%ecx /* zero extend */ - cmpq %rcx,RIP+8(%rsp) - je error_swapgs + movl %ecx,%eax /* zero extend */ + cmpq %rax,RIP+8(%rsp) + je bstep_iret cmpq $gs_change,RIP+8(%rsp) je error_swapgs jmp error_sti + +bstep_iret: + /* Fix truncated RIP */ + movq %rcx,RIP+8(%rsp) + jmp error_swapgs #endif END(error_entry) --- head-2010-05-12.orig/arch/x86/kernel/head-xen.c 2010-04-15 10:10:51.000000000 +0200 +++ head-2010-05-12/arch/x86/kernel/head-xen.c 2010-04-15 10:13:18.000000000 +0200 @@ -1,5 +1,6 @@ #include #include +#include #include #ifndef CONFIG_XEN @@ -133,7 +134,7 @@ void __init xen_start_kernel(void) addr), __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE)); #else - check_efer(); + x86_configure_nx(); xen_init_pt(); #endif @@ -161,6 +162,8 @@ void __init xen_start_kernel(void) virt_to_machine(empty_zero_page), PAGE_KERNEL_RO); + if (is_initial_xendomain()) + pci_request_acs(); } void __init xen_arch_setup(void) --- head-2010-05-12.orig/arch/x86/kernel/head32-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/head32-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -32,8 +32,6 @@ static void __init i386_default_early_se void __init i386_start_kernel(void) { - reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifndef CONFIG_XEN --- head-2010-05-12.orig/arch/x86/kernel/head64-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/head64-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -119,8 +119,6 @@ void __init x86_64_start_reservations(ch { copy_bootdata(__va(real_mode_data)); - reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); /* --- head-2010-05-12.orig/arch/x86/kernel/head_64-xen.S 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/head_64-xen.S 2010-03-24 16:00:05.000000000 +0100 @@ -51,9 +51,9 @@ startup_64: #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ - phys_##name = . - .head.text; \ ENTRY(name) + __PAGE_ALIGNED_BSS NEXT_PAGE(init_level4_pgt) .fill 512,8,0 /* @@ -81,7 +81,9 @@ NEXT_PAGE(level2_fixmap_pgt) NEXT_PAGE(level1_fixmap_pgt) .fill 512,8,0 + .previous NEXT_PAGE(hypercall_page) + phys_hypercall_page = . - .head.text CFI_STARTPROC .rept 0x1000 / 0x20 .skip 1 /* push %rcx */ --- head-2010-05-12.orig/arch/x86/kernel/ioport-xen.c 2010-03-24 15:25:06.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/ioport-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -75,8 +75,9 @@ asmlinkage long sys_ioperm(unsigned long * beyond the 0x3ff range: to get the full 65536 ports bitmapped * you'd need 8kB of bitmaps/process, which is a bit excessive. 
*/ -static int do_iopl(unsigned int level, struct thread_struct *t) +long sys_iopl(unsigned int level, struct pt_regs *regs) { + struct thread_struct *t = &current->thread; unsigned int old = t->iopl >> 12; if (level > 3) @@ -86,27 +87,8 @@ static int do_iopl(unsigned int level, s if (!capable(CAP_SYS_RAWIO)) return -EPERM; } - - return 0; -} - -#ifdef CONFIG_X86_32 -long sys_iopl(struct pt_regs *regs) -{ - unsigned int level = regs->bx; -#else -asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) -{ -#endif - struct thread_struct *t = &current->thread; - int rc; - - rc = do_iopl(level, t); - if (rc < 0) - goto out; - t->iopl = level << 12; set_iopl_mask(t->iopl); -out: - return rc; + + return 0; } --- head-2010-05-12.orig/arch/x86/kernel/irq-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/irq-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -19,7 +19,7 @@ atomic_t irq_err_count; #ifndef CONFIG_XEN /* Function pointer for generic interrupt vector handling */ -void (*generic_interrupt_extension)(void) = NULL; +void (*x86_platform_ipi_callback)(void) = NULL; #endif /* @@ -77,10 +77,10 @@ static int show_other_interrupts(struct seq_printf(p, " Performance pending work\n"); #endif #ifndef CONFIG_XEN - if (generic_interrupt_extension) { + if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); + seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); seq_printf(p, " Platform interrupts\n"); } #endif @@ -157,7 +157,7 @@ int show_interrupts(struct seq_file *p, if (!desc) return 0; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); action = desc->action; @@ -178,7 +178,7 @@ int show_interrupts(struct seq_file *p, seq_putc(p, '\n'); out: - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -196,8 +196,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_pending_irqs; #endif #ifndef CONFIG_XEN - if (generic_interrupt_extension) - sum += irq_stats(cpu)->generic_irqs; + if (x86_platform_ipi_callback) + sum += irq_stats(cpu)->x86_platform_ipis; #endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; @@ -264,9 +264,9 @@ unsigned int __irq_entry do_IRQ(struct p } /* - * Handler for GENERIC_INTERRUPT_VECTOR. + * Handler for X86_PLATFORM_IPI_VECTOR. */ -void smp_generic_interrupt(struct pt_regs *regs) +void smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -276,13 +276,95 @@ void smp_generic_interrupt(struct pt_reg irq_enter(); - inc_irq_stat(generic_irqs); + inc_irq_stat(x86_platform_ipis); - if (generic_interrupt_extension) - generic_interrupt_extension(); + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); irq_exit(); set_irq_regs(old_regs); } #endif + +#ifdef CONFIG_HOTPLUG_CPU +#include +/* A cpu has been removed from cpu_online_mask. Reset irq affinities.
*/ +void fixup_irqs(void) +{ + unsigned int irq; + static int warned; + struct irq_desc *desc; + static DECLARE_BITMAP(irqs_used, NR_IRQS); + + for_each_irq_desc(irq, desc) { + int break_affinity = 0; + int set_affinity = 1; + const struct cpumask *affinity; + + if (!desc) + continue; + if (irq == 2) + continue; + + /* interrupts are disabled at this point */ + raw_spin_lock(&desc->lock); + + affinity = desc->affinity; + if (!irq_has_action(irq) || + cpumask_equal(affinity, cpu_online_mask)) { + raw_spin_unlock(&desc->lock); + continue; + } + + if (cpumask_test_cpu(smp_processor_id(), affinity)) + __set_bit(irq, irqs_used); + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + break_affinity = 1; + affinity = cpu_all_mask; + } + + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) + desc->chip->mask(irq); + + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, affinity); + else if (!(warned++)) + set_affinity = 0; + + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) + desc->chip->unmask(irq); + + raw_spin_unlock(&desc->lock); + + if (break_affinity && set_affinity) + /*printk("Broke affinity for irq %i\n", irq)*/; + else if (!set_affinity) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* + * We can remove mdelay() and then send spurious interrupts to + * new cpu targets for all the irqs that were handled previously by + * this cpu. While it works, I have seen spurious interrupt messages + * (nothing wrong but still...). + * + * So for now, retain mdelay(1) and check the IRR and then send those + * interrupts to new targets as this cpu is already offlined... + */ + mdelay(1); + + for_each_irq_desc(irq, desc) { + if (!__test_and_clear_bit(irq, irqs_used)) + continue; + + if (xen_test_irq_pending(irq)) { + raw_spin_lock(&desc->lock); + if (desc->chip->retrigger) + desc->chip->retrigger(irq); + raw_spin_unlock(&desc->lock); + } + } +} +#endif --- head-2010-05-12.orig/arch/x86/kernel/microcode_core-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/microcode_core-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -21,10 +21,12 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include -#include #include #include #include @@ -88,7 +90,6 @@ static int do_microcode_update(const voi static int microcode_open(struct inode *unused1, struct file *unused2) { - cycle_kernel_lock(); return capable(CAP_SYS_RAWIO) ?
0 : -EPERM; } @@ -98,7 +99,7 @@ static ssize_t microcode_write(struct fi ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > totalram_pages) { - pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); + pr_err("too much data (max %ld pages)\n", totalram_pages); return ret; } @@ -131,7 +132,7 @@ static int __init microcode_dev_init(voi error = misc_register(&microcode_dev); if (error) { - pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); + pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -190,7 +191,7 @@ static int __init microcode_init(void) else if (c->x86_vendor == X86_VENDOR_AMD) fw_name = "amd-ucode/microcode_amd.bin"; else { - pr_err("microcode: no support for this CPU vendor\n"); + pr_err("no support for this CPU vendor\n"); return -ENODEV; } @@ -207,8 +208,7 @@ static int __init microcode_init(void) request_microcode(fw_name); pr_info("Microcode Update Driver: v" MICROCODE_VERSION - " ," - " Peter Oruba\n"); + " , Peter Oruba\n"); return 0; } --- head-2010-05-12.orig/arch/x86/kernel/mpparse-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/mpparse-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -371,13 +371,6 @@ static int __init smp_read_mpc(struct mp x86_init.mpparse.mpc_record(1); } -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); - if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; @@ -680,37 +673,21 @@ void __init default_get_smp_config(unsig } #ifndef CONFIG_XEN -static void __init smp_reserve_bootmem(struct mpf_intel *mpf) +static void __init smp_reserve_memory(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); -#ifdef CONFIG_X86_32 - /* - * We cannot access to MPC table to compute table size yet, - * as only few megabytes from the bottom is mapped now. - * PC-9800's MPC table places on the very last of physical - * memory; so that simply reserving PAGE_SIZE from mpf->physptr - * yields BUG() in reserve_bootmem.
- * also need to make sure physptr is below than max_low_pfn - * we don't need reserve the area above max_low_pfn - */ - unsigned long end = max_low_pfn * PAGE_SIZE; - - if (mpf->physptr < end) { - if (mpf->physptr + size > end) - size = end - mpf->physptr; - reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); - } -#else - reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); -#endif + + reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc"); } #endif -static int __init smp_scan_config(unsigned long base, unsigned long length, - unsigned reserve) +static int __init smp_scan_config(unsigned long base, unsigned long length) { unsigned int *bp = _bus_to_virt(base); struct mpf_intel *mpf; +#ifndef CONFIG_XEN + unsigned long mem; +#endif apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", bp, length); @@ -732,12 +709,10 @@ static int __init smp_scan_config(unsign printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", mpf, (u64)virt_to_phys(mpf)); - if (!reserve) - return 1; - reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), - BOOTMEM_DEFAULT); + mem = virt_to_phys(mpf); + reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf"); if (mpf->physptr) - smp_reserve_bootmem(mpf); + smp_reserve_memory(mpf); #else printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", mpf, ((void *)bp - _bus_to_virt(base)) + base); @@ -750,7 +725,7 @@ static int __init smp_scan_config(unsign return 0; } -void __init default_find_smp_config(unsigned int reserve) +void __init default_find_smp_config(void) { #ifndef CONFIG_XEN unsigned int address; @@ -764,9 +739,9 @@ void __init default_find_smp_config(unsi * 2) Scan the top 1K of base RAM * 3) Scan the 64K of bios */ - if (smp_scan_config(0x0, 0x400, reserve) || - smp_scan_config(639 * 0x400, 0x400, reserve) || - smp_scan_config(0xF0000, 0x10000, reserve)) + if (smp_scan_config(0x0, 0x400) || + smp_scan_config(639 * 0x400, 0x400) || + smp_scan_config(0xF0000, 0x10000)) return; /* * If it is an SMP machine we should know now, unless the @@ -788,7 +763,7 @@ void __init default_find_smp_config(unsi #ifndef CONFIG_XEN address = get_bios_ebda(); if (address) - smp_scan_config(address, 0x400, reserve); + smp_scan_config(address, 0x400); #endif } @@ -987,9 +962,6 @@ void __init early_reserve_e820_mpc_new(v { if (enable_update_mptable && alloc_mptable) { u64 startt = 0; -#ifdef CONFIG_X86_TRAMPOLINE - startt = TRAMPOLINE_BASE; -#endif mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); } } --- head-2010-05-12.orig/arch/x86/kernel/pci-dma-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/pci-dma-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -11,10 +11,11 @@ #include #include #include +#include static int forbid_dac __read_mostly; -struct dma_map_ops *dma_ops; +struct dma_map_ops *dma_ops = &nommu_dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -42,9 +43,6 @@ int iommu_detected __read_mostly = 0; */ int iommu_pass_through __read_mostly; -dma_addr_t bad_dma_address __read_mostly = 0; -EXPORT_SYMBOL(bad_dma_address); - /* Dummy device used for NULL arguments (normally ISA). 
*/ struct device x86_dma_fallback_dev = { .init_name = "fallback device", @@ -143,20 +141,19 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif + if (pci_swiotlb_detect()) + goto out; - /* - * The order of these functions is important for - * fall-back/fail-over reasons - */ gart_iommu_hole_init(); detect_calgary(); detect_intel_iommu(); + /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); - - swiotlb_init(); +out: + swiotlb_init(1); if (swiotlb) { printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); dma_ops = &swiotlb_dma_ops; @@ -268,7 +265,7 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "allowdac", 8)) forbid_dac = 0; if (!strncmp(p, "nodac", 5)) - forbid_dac = -1; + forbid_dac = 1; if (!strncmp(p, "usedac", 6)) { forbid_dac = -1; return 1; @@ -370,25 +367,19 @@ static int __init pci_iommu_init(void) #ifdef CONFIG_PCI dma_debug_add_bus(&pci_bus_type); #endif + x86_init.iommu.iommu_init(); - calgary_iommu_init(); - - intel_iommu_init(); - - amd_iommu_init(); - - gart_iommu_init(); +#ifndef CONFIG_XEN + if (swiotlb) { + printk(KERN_INFO "PCI-DMA: " + "Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } else + swiotlb_free(); +#endif - no_iommu_init(); return 0; } - -void pci_iommu_shutdown(void) -{ - gart_iommu_shutdown(); - - amd_iommu_shutdown(); -} /* Must execute after PCI subsystem */ rootfs_initcall(pci_iommu_init); --- head-2010-05-12.orig/arch/x86/kernel/pci-nommu-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/pci-nommu-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -112,12 +112,3 @@ struct dma_map_ops nommu_dma_ops = { .sync_sg_for_device = nommu_sync_sg_for_device, .dma_supported = nommu_dma_supported, }; - -void __init no_iommu_init(void) -{ - if (dma_ops) - return; - - force_iommu = 0; /* no HW IOMMU */ - dma_ops = &nommu_dma_ops; -} --- head-2010-05-12.orig/arch/x86/kernel/process-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/process-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -9,7 +9,11 @@ #include #include #include +#include +#include +#include #include +#include #include #include #include @@ -17,6 +21,7 @@ #include #include #include +#include #include unsigned long idle_halt; @@ -89,30 +94,30 @@ void exit_thread(void) } } -void flush_thread(void) +void show_regs_common(void) { - struct task_struct *tsk = current; + const char *board, *product; -#ifdef CONFIG_X86_64 - if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { - clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); - if (test_tsk_thread_flag(tsk, TIF_IA32)) { - clear_tsk_thread_flag(tsk, TIF_IA32); - } else { - set_tsk_thread_flag(tsk, TIF_IA32); - current_thread_info()->status |= TS_COMPAT; - } - } -#endif + board = dmi_get_system_info(DMI_BOARD_NAME); + if (!board) + board = ""; + product = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!product) + product = ""; + + printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, board, product); +} - clear_tsk_thread_flag(tsk, TIF_DEBUG); +void flush_thread(void) +{ + struct task_struct *tsk = current; - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; + 
flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. @@ -193,16 +198,6 @@ void __switch_to_xtra(struct task_struct else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg0, 0); - set_debugreg(next->debugreg1, 1); - set_debugreg(next->debugreg2, 2); - set_debugreg(next->debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(next->debugreg6, 6); - set_debugreg(next->debugreg7, 7); - } - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { /* prev and next are different */ @@ -211,6 +206,7 @@ void __switch_to_xtra(struct task_struct else hard_enable_TSC(); } + propagate_user_return_notify(prev_p, next_p); } int sys_fork(struct pt_regs *regs) @@ -234,6 +230,78 @@ int sys_vfork(struct pt_regs *regs) NULL, NULL); } +long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +{ + if (!newsp) + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +} + +/* + * This gets run with %si containing the + * function to call, and %di containing + * the "args". + */ +extern void kernel_thread_helper(void); + +/* + * Create a kernel thread + */ +int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + struct pt_regs regs; + + memset(&regs, 0, sizeof(regs)); + + regs.si = (unsigned long) fn; + regs.di = (unsigned long) arg; + +#ifdef CONFIG_X86_32 + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.gs = __KERNEL_STACK_CANARY; +#else + regs.ss = __KERNEL_DS; +#endif + + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | 0x2; + + /* Ok, create the new process.. */ + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); + +/* + * sys_execve() executes a new program. + */ +long sys_execve(char __user *name, char __user * __user *argv, + char __user * __user *envp, struct pt_regs *regs) +{ + long error; + char *filename; + + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error; + error = do_execve(filename, argv, envp, regs); + +#ifdef CONFIG_X86_32 + if (error == 0) { + /* Make sure we don't return using sysenter..
*/ + set_thread_flag(TIF_IRET); + } +#endif + + putname(filename); + return error; +} /* * Idle related variables and functions --- head-2010-05-12.orig/arch/x86/kernel/process_32-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/process_32-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -35,7 +34,6 @@ #include #include #include -#include #include #include #include @@ -62,6 +60,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork"); @@ -132,39 +131,29 @@ void __show_regs(struct pt_regs *regs, i unsigned long d0, d1, d2, d3, d6, d7; unsigned long sp; unsigned short ss, gs; - const char *board; if (user_mode_vm(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; gs = get_user_gs(regs); } else { - sp = (unsigned long) (®s->sp); + sp = kernel_stack_pointer(regs); savesegment(ss, ss); savesegment(gs, gs); } - printk("\n"); + show_regs_common(); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk("Pid: %d, comm: %s %s (%s %.*s) %s\n", - task_pid_nr(current), current->comm, - print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); - - printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", + printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, smp_processor_id()); print_symbol("EIP is at %s\n", regs->ip); - printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); - printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->si, regs->di, regs->bp, sp); - printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); if (!all) @@ -174,61 +163,28 @@ void __show_regs(struct pt_regs *regs, i cr2 = read_cr2(); cr3 = read_cr3(); cr4 = read_cr4_safe(); - printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); get_debugreg(d3, 3); - printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", d0, d1, d2, d3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk("DR6: %08lx DR7: %08lx\n", + printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", d6, d7); } void show_regs(struct pt_regs *regs) { - __show_regs(regs, 1); + show_registers(regs); show_trace(NULL, regs, ®s->sp, regs->bp); } -/* - * This gets run with %bx containing the - * function to call, and %dx containing - * the "args". 
- */ -extern void kernel_thread_helper(void); - -/* - * Create a kernel thread - */ -int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) -{ - struct pt_regs regs; - - memset(&regs, 0, sizeof(regs)); - - regs.bx = (unsigned long) fn; - regs.dx = (unsigned long) arg; - - regs.ds = __USER_DS; - regs.es = __USER_DS; - regs.fs = __KERNEL_PERCPU; - regs.gs = __KERNEL_STACK_CANARY; - regs.orig_ax = -1; - regs.ip = (unsigned long) kernel_thread_helper; - regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; - - /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); -} -EXPORT_SYMBOL(kernel_thread); - void release_thread(struct task_struct *dead_task) { BUG_ON(dead_task->mm); @@ -264,7 +220,12 @@ int copy_thread(unsigned long clone_flag task_user_gs(p) = get_user_gs(regs); + p->thread.io_bitmap_ptr = NULL; tsk = current; + err = -ENOMEM; + + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + if (test_tsk_thread_flag(tsk, TIF_CSTAR)) p->thread.ip = (unsigned long) cstar_ret_from_fork; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { @@ -499,46 +460,6 @@ __switch_to(struct task_struct *prev_p, return prev_p; } -int sys_clone(struct pt_regs *regs) -{ - unsigned long clone_flags; - unsigned long newsp; - int __user *parent_tidptr, *child_tidptr; - - clone_flags = regs->bx; - newsp = regs->cx; - parent_tidptr = (int __user *)regs->dx; - child_tidptr = (int __user *)regs->di; - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); -} - -/* - * sys_execve() executes a new program. - */ -int sys_execve(struct pt_regs *regs) -{ - int error; - char *filename; - - filename = getname((char __user *) regs->bx); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - error = do_execve(filename, - (char __user * __user *) regs->cx, - (char __user * __user *) regs->dx, - regs); - if (error == 0) { - /* Make sure we don't return using sysenter..
*/ - set_thread_flag(TIF_IRET); - } - putname(filename); -out: - return error; -} - #define top_esp (THREAD_SIZE - sizeof(unsigned long)) #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) --- head-2010-05-12.orig/arch/x86/kernel/process_64-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/process_64-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -41,7 +40,6 @@ #include #include #include -#include #include #include @@ -59,6 +57,7 @@ #include #include #include +#include #include @@ -66,8 +65,6 @@ asmlinkage extern void ret_from_fork(voi static DEFINE_PER_CPU(unsigned char, is_idle); -unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; - static ATOMIC_NOTIFIER_HEAD(idle_notifier); void idle_notifier_register(struct notifier_block *n) @@ -170,31 +167,21 @@ void __show_regs(struct pt_regs *regs, i unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, cs, es; - const char *board; - printk("\n"); - print_modules(); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); - printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + show_regs_common(); + printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); - printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, regs->flags); - printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); - printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", + printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", regs->dx, regs->si, regs->di); - printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", + printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", regs->bp, regs->r8, regs->r9); - printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); - printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); asm("movl %%ds,%0" : "=r" (ds)); @@ -215,27 +202,26 @@ void __show_regs(struct pt_regs *regs, i cr3 = read_cr3(); cr4 = read_cr4(); - printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); - printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); - printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); - printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } 
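Both the 32-bit and 64-bit dumps now print their banner through the shared show_regs_common() added in process-xen.c above, and they log at KERN_DEFAULT rather than KERN_INFO so the output stays attached to the surrounding oops text at the default console loglevel. A condensed sketch of that shared-header pattern, using a hypothetical demo_dump_header(); dmi_get_system_info() may return NULL, hence the empty-string fallbacks:

#include <linux/dmi.h>
#include <linux/kernel.h>

/* Board/product identification in the style of the shared dump
 * header; empty strings stand in for missing DMI data. */
static void demo_dump_header(void)
{
	const char *board = dmi_get_system_info(DMI_BOARD_NAME);
	const char *product = dmi_get_system_info(DMI_PRODUCT_NAME);

	printk(KERN_DEFAULT "platform: %s %s\n",
	       board ? board : "", product ? product : "");
}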
void show_regs(struct pt_regs *regs) { - printk(KERN_INFO "CPU %d:", smp_processor_id()); - __show_regs(regs, 1); + show_registers(regs); show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } @@ -243,6 +229,7 @@ void xen_load_gs_index(unsigned gs) { WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs)); } +EXPORT_SYMBOL(xen_load_gs_index); void release_thread(struct task_struct *dead_task) { @@ -298,8 +285,9 @@ int copy_thread(unsigned long clone_flag *childregs = *regs; childregs->ax = 0; - childregs->sp = sp; - if (sp == ~0UL) + if (user_mode(regs)) + childregs->sp = sp; + else childregs->sp = (unsigned long)childregs; p->thread.sp = (unsigned long) childregs; @@ -309,12 +297,16 @@ int copy_thread(unsigned long clone_flag p->thread.fs = me->thread.fs; p->thread.gs = me->thread.gs; + p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); savesegment(fs, p->thread.fsindex); savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + err = -ENOMEM; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -354,28 +346,45 @@ out: kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + return err; } -void -start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +static void +start_thread_common(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp, + unsigned int _cs, unsigned int _ss, unsigned int _ds) { loadsegment(fs, 0); - loadsegment(es, 0); - loadsegment(ds, 0); + loadsegment(es, _ds); + loadsegment(ds, _ds); load_gs_index(0); regs->ip = new_ip; regs->sp = new_sp; - regs->cs = __USER_CS; - regs->ss = __USER_DS; - regs->flags = 0x200; + regs->cs = _cs; + regs->ss = _ss; + regs->flags = X86_EFLAGS_IF; set_fs(USER_DS); /* * Free the old FP and other extended state */ free_thread_xstate(current); } -EXPORT_SYMBOL_GPL(start_thread); + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER_CS, __USER_DS, 0); +} + +#ifdef CONFIG_IA32_EMULATION +void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER32_CS, __USER32_DS, __USER32_DS); +} +#endif /* * switch_to(x,y) should switch tasks from x to y. @@ -565,26 +574,8 @@ __switch_to(struct task_struct *prev_p, */ if (preload_fpu) __math_state_restore(); - return prev_p; -} -/* - * sys_execve() executes a new program. 
- */ -asmlinkage -long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs *regs) -{ - long error; - char *filename; - - filename = getname(name); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - return error; - error = do_execve(filename, argv, envp, regs); - putname(filename); - return error; + return prev_p; } void set_personality_64bit(void) @@ -601,13 +592,16 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -asmlinkage long -sys_clone(unsigned long clone_flags, unsigned long newsp, - void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) -{ - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +void set_personality_ia32(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_IA32); + current->personality |= force_personality32; + + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; } unsigned long get_wchan(struct task_struct *p) --- head-2010-05-12.orig/arch/x86/kernel/quirks-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/quirks-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -492,6 +492,19 @@ void force_hpet_resume(void) break; } } + +/* + * HPET MSI on some boards (ATI SB700/SB800) has side effect on + * floppy DMA. Disable HPET MSI on such platforms. + */ +static void force_disable_hpet_msi(struct pci_dev *unused) +{ + hpet_msi_disable = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + force_disable_hpet_msi); + #endif #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) @@ -500,6 +513,7 @@ static void __init quirk_amd_nb_node(str { struct pci_dev *nb_ht; unsigned int devfn; + u32 node; u32 val; devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); @@ -508,7 +522,13 @@ static void __init quirk_amd_nb_node(str return; pci_read_config_dword(nb_ht, 0x60, &val); - set_dev_node(&dev->dev, val & 7); + node = val & 7; + /* + * Some hardware may return an invalid node ID, + * so check it first: + */ + if (node_online(node)) + set_dev_node(&dev->dev, node); pci_dev_put(nb_ht); } --- head-2010-05-12.orig/arch/x86/kernel/setup-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/setup-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -73,6 +73,7 @@ #include #include +#include #include #include #include @@ -106,9 +107,11 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include #endif +#include #ifdef CONFIG_XEN #include @@ -281,7 +284,7 @@ EXPORT_SYMBOL(edd); * from boot_params into a safe place. * */ -static inline void copy_edd(void) +static inline void __init copy_edd(void) { memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, sizeof(edd.mbr_signature)); @@ -291,7 +294,7 @@ static inline void copy_edd(void) } #endif #else -static inline void copy_edd(void) +static inline void __init copy_edd(void) { } #endif @@ -541,49 +544,18 @@ static void __init reserve_early_setup_d #endif } +#ifndef CONFIG_XEN /* * --------- Crashkernel reservation ------------------------------ */ #ifdef CONFIG_KEXEC -#ifndef CONFIG_XEN -/** - * Reserve @size bytes of crashkernel memory at any suitable offset. - * - * @size: Size of the crashkernel memory to reserve. - * Returns the base address on success, and -1ULL on failure. 
- */ -static -unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) -{ - const unsigned long long alignment = 16<<20; /* 16M */ - unsigned long long start = 0LL; - - while (1) { - int ret; - - start = find_e820_area(start, ULONG_MAX, size, alignment); - if (start == -1ULL) - return start; - - /* try to reserve it */ - ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE); - if (ret >= 0) - return start; - - start += alignment; - } -} - static inline unsigned long long get_total_mem(void) { unsigned long long total; - total = max_low_pfn - min_low_pfn; -#ifdef CONFIG_HIGHMEM - total += highend_pfn - highstart_pfn; -#endif + total = max_pfn - min_low_pfn; return total << PAGE_SHIFT; } @@ -603,21 +575,25 @@ static void __init reserve_crashkernel(v /* 0 means: find the address automatically */ if (crash_base <= 0) { - crash_base = find_and_reserve_crashkernel(crash_size); + const unsigned long long alignment = 16<<20; /* 16M */ + + crash_base = find_e820_area(alignment, ULONG_MAX, crash_size, + alignment); if (crash_base == -1ULL) { - pr_info("crashkernel reservation failed. " - "No suitable area found.\n"); + pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } } else { - ret = reserve_bootmem_generic(crash_base, crash_size, - BOOTMEM_EXCLUSIVE); - if (ret < 0) { - pr_info("crashkernel reservation failed - " - "memory is in use\n"); + unsigned long long start; + + start = find_e820_area(crash_base, ULONG_MAX, crash_size, + 1<<20); + if (start != crash_base) { + pr_info("crashkernel reservation failed - memory is in use.\n"); return; } } + reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL"); printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " "for crashkernel (System RAM: %ldMB)\n", @@ -630,13 +606,11 @@ static void __init reserve_crashkernel(v insert_resource(&iomem_resource, &crashk_res); } #else -#define reserve_crashkernel xen_machine_kexec_setup_resources -#endif -#else static void __init reserve_crashkernel(void) { } #endif +#endif /* CONFIG_XEN */ static struct resource standard_io_resources[] = { { .name = "dma1", .start = 0x00, .end = 0x1f, @@ -735,19 +709,27 @@ static struct dmi_system_id __initdata b DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), }, }, - { /* - * AMI BIOS with low memory corruption was found on Intel DG45ID board. - * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will + * AMI BIOS with low memory corruption was found on Intel DG45ID and + * DG45FC boards. + * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will * match only DMI_BOARD_NAME and see if there is more bad products * with this vendor. 
*/ + { .callback = dmi_low_memory_corruption, .ident = "AMI BIOS", .matches = { DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), }, }, + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), + }, + }, #endif {} }; @@ -767,6 +749,8 @@ static struct dmi_system_id __initdata b void __init setup_arch(char **cmdline_p) { + int acpi = 0; + int k8 = 0; #ifdef CONFIG_XEN unsigned int i; unsigned long p2m_pages; @@ -900,21 +884,18 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; -#ifdef CONFIG_X86_64 /* - * Must call this twice: Once just to detect whether hardware doesn't - * support NX (so that the early EHCI debug console setup can safely - * call set_fixmap(), and then again after parsing early parameters to - * honor the respective command line option. + * x86_configure_nx() is called before parse_early_param() to detect + * whether hardware doesn't support NX (so that the early EHCI debug + * console setup can safely call set_fixmap()). It may then be called + * again from within noexec_setup() during parsing early parameters + * to honor the respective command line option. */ - check_efer(); -#endif + x86_configure_nx(); parse_early_param(); -#ifdef CONFIG_X86_64 - check_efer(); -#endif + x86_report_nx(); /* Must be before kernel pagetables are setup */ vmi_activate(); @@ -1021,6 +1002,20 @@ void __init setup_arch(char **cmdline_p) reserve_brk(); + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); + + reserve_trampoline_memory(); + +#ifdef CONFIG_ACPI_SLEEP + /* + * Reserve low memory region for sleep support. + * even before init_memory_mapping + */ + acpi_reserve_wakeup_memory(); +#endif init_gbpages(); /* max_pfn_mapped is updated here */ @@ -1048,6 +1043,8 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); #ifndef CONFIG_XEN + reserve_crashkernel(); + vsmp_init(); #endif @@ -1071,23 +1068,15 @@ void __init setup_arch(char **cmdline_p) /* * Parse SRAT to discover nodes. */ - acpi_numa_init(); + acpi = acpi_numa_init(); #endif - initmem_init(0, max_pfn); - -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. 
- */ - acpi_reserve_bootmem(); +#ifdef CONFIG_K8_NUMA + if (!acpi) + k8 = !k8_numa_init(0, max_pfn); #endif - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); - reserve_crashkernel(); + initmem_init(0, max_pfn, acpi, k8); #if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) /* @@ -1115,6 +1104,9 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_XEN +#ifdef CONFIG_KEXEC + xen_machine_kexec_setup_resources(); +#endif p2m_pages = max_pfn; if (xen_start_info->nr_pages > max_pfn) { /* @@ -1260,6 +1252,8 @@ void __init setup_arch(char **cmdline_p) #endif #endif /* CONFIG_XEN */ x86_init.oem.banner(); + + mcheck_init(); } #ifdef CONFIG_X86_32 --- head-2010-05-12.orig/arch/x86/kernel/time-xen.c 2010-05-12 09:02:39.000000000 +0200 +++ head-2010-05-12/arch/x86/kernel/time-xen.c 2010-05-12 09:02:50.000000000 +0200 @@ -953,28 +953,23 @@ core_initcall(cpufreq_time_setup); */ static ctl_table xen_subtable[] = { { - .ctl_name = CTL_XEN_INDEPENDENT_WALLCLOCK, .procname = "independent_wallclock", .data = &independent_wallclock, .maxlen = sizeof(independent_wallclock), .mode = 0644, - .strategy = sysctl_data, .proc_handler = proc_dointvec }, { - .ctl_name = CTL_XEN_PERMITTED_CLOCK_JITTER, .procname = "permitted_clock_jitter", .data = &permitted_clock_jitter, .maxlen = sizeof(permitted_clock_jitter), .mode = 0644, - .strategy = sysctl_data, .proc_handler = proc_doulongvec_minmax }, { } }; static ctl_table xen_table[] = { { - .ctl_name = CTL_XEN, .procname = "xen", .mode = 0555, .child = xen_subtable --- head-2010-05-12.orig/arch/x86/kernel/traps-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/traps-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -522,77 +522,56 @@ asmlinkage __kprobes struct pt_regs *syn dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - unsigned long condition; + unsigned long dr6; int si_code; - get_debugreg(condition, 6); + get_debugreg(dr6, 6); /* Catch kmemcheck conditions first of all! */ - if (condition & DR_STEP && kmemcheck_trap(regs)) + if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) return; + /* DR6 may or may not be cleared by the CPU */ + set_debugreg(0, 6); /* * The processor cleared BTF, so don't mark that we need it set. */ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); tsk->thread.debugctlmsr = 0; - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) + /* Store the virtualized DR6 value */ + tsk->thread.debugreg6 = dr6; + + if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, + SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; - } - -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto debug_vm86; -#endif - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). 
- */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; + if (regs->flags & X86_VM_MASK) { + handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, 1); + return; } - si_code = get_si_code(condition); - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code, si_code); - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. + * Single-stepping through system calls: ignore any exceptions in + * kernel space, but re-enable TF when returning to user mode. + * + * We already checked v86 mode above, so we can check for kernel mode + * by just checking the CPL of CS. */ -clear_dr7: - set_debugreg(0, 7); + if ((dr6 & DR_STEP) && !user_mode(regs)) { + tsk->thread.debugreg6 &= ~DR_STEP; + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; + } + si_code = get_si_code(tsk->thread.debugreg6); + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS)) + send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); - return; -#ifdef CONFIG_X86_32 -debug_vm86: - /* reenable preemption: handle_vm86_trap() might sleep */ - dec_preempt_count(); - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - conditional_cli(regs); - return; -#endif - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); return; } --- head-2010-05-12.orig/arch/x86/kernel/vmlinux.lds.S 2010-03-24 15:25:06.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/vmlinux.lds.S 2010-03-24 16:00:05.000000000 +0100 @@ -43,7 +43,7 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) && !defined(CONFIG_XEN) /* * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA * we retain large page mappings for boundaries spanning kernel text, rodata --- head-2010-05-12.orig/arch/x86/kernel/vsyscall_64-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/vsyscall_64-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -73,7 +73,8 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) +void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, + u32 mult) { unsigned long flags; @@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wa vsyscall_gtod_data.clock.vread = clock->vread; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; - vsyscall_gtod_data.clock.mult = clock->mult; + vsyscall_gtod_data.clock.mult = mult; vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; @@ -237,7 +238,7 @@ static ctl_table kernel_table2[] = { }; static ctl_table kernel_root_table2[] = { - { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, + { .procname = "kernel", .mode = 0555, .child = kernel_table2 }, {} }; --- head-2010-05-12.orig/arch/x86/kernel/x8664_ksyms_64.c 2010-05-12 08:55:23.000000000 +0200 +++ head-2010-05-12/arch/x86/kernel/x8664_ksyms_64.c 2010-03-24 16:00:05.000000000 +0100 @@ -55,6 +55,6 @@ EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(init_level4_pgt); -#ifndef CONFIG_PARAVIRT_CPU +#if !defined(CONFIG_PARAVIRT_CPU) && !defined(CONFIG_XEN) 
EXPORT_SYMBOL(native_load_gs_index); #endif --- head-2010-05-12.orig/arch/x86/kernel/x86_init-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/kernel/x86_init-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -13,10 +13,13 @@ #include #include #include +#include +#include void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } +int __init iommu_init_noop(void) { return 0; } /* * The platform setup functions are preset with the default functions @@ -61,10 +64,15 @@ struct x86_init_ops x86_init __initdata .tsc_pre_init = x86_init_noop, .timer_init = x86_init_noop, }, + + .iommu = { + .iommu_init = iommu_init_noop, + }, }; struct x86_platform_ops x86_platform = { .calibrate_tsc = NULL, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, + .is_untracked_pat_range = is_ISA_range, }; --- head-2010-05-12.orig/arch/x86/mm/fault-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/mm/fault-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -38,7 +38,8 @@ enum x86_pf_error_code { * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: */ -static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) +static inline int __kprobes +kmmio_fault(struct pt_regs *regs, unsigned long addr) { if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) @@ -46,7 +47,7 @@ static inline int kmmio_fault(struct pt_ return 0; } -static inline int notify_page_fault(struct pt_regs *regs) +static inline int __kprobes notify_page_fault(struct pt_regs *regs) { int ret = 0; @@ -248,7 +249,7 @@ void vmalloc_sync_all(void) * * Handle a fault on the vmalloc or module mapping area */ -static noinline int vmalloc_fault(unsigned long address) +static noinline __kprobes int vmalloc_fault(unsigned long address) { unsigned long pgd_paddr; pmd_t *pmd_k; @@ -365,7 +366,7 @@ void vmalloc_sync_all(void) * * This assumes no large pages in there. */ -static noinline int vmalloc_fault(unsigned long address) +static noinline __kprobes int vmalloc_fault(unsigned long address) { pgd_t *pgd, *pgd_ref; pud_t *pud, *pud_ref; @@ -666,7 +667,7 @@ no_context(struct pt_regs *regs, unsigne show_fault_oops(regs, error_code, address); stackend = end_of_stack(tsk); - if (*stackend != STACK_END_MAGIC) + if (tsk != &init_task && *stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; @@ -868,7 +869,7 @@ static int spurious_fault_check(unsigned * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. 
*/ -static noinline int +static noinline __kprobes int spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; --- head-2010-05-12.orig/arch/x86/mm/init-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/mm/init-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -163,10 +163,6 @@ unsigned long __init_refok init_memory_m use_gbpages = direct_gbpages; #endif - set_nx(); - if (nx_enabled) - printk(KERN_INFO "NX (Execute Disable) protection: active\n"); - /* Enable PSE if available */ if (cpu_has_pse) set_in_cr4(X86_CR4_PSE); --- head-2010-05-12.orig/arch/x86/mm/init_32-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/mm/init_32-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -448,7 +448,7 @@ static void __init permanent_kmaps_init( pkmap_page_table = pte; } -static void __init add_one_highpage_init(struct page *page, int pfn) +static void __init add_one_highpage_init(struct page *page) { ClearPageReserved(page); init_page_count(page); @@ -481,7 +481,7 @@ static int __init add_highpages_work_fn( if (!pfn_valid(node_pfn)) continue; page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn); + add_one_highpage_init(page); } return 0; @@ -705,8 +705,8 @@ void __init find_low_pfn_range(void) } #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init initmem_init(unsigned long start_pfn, - unsigned long end_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, + int acpi, int k8) { #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; @@ -955,8 +955,7 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10, - (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) - ); + totalhigh_pages << (PAGE_SHIFT-10)); printk(KERN_INFO "virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" @@ -1062,7 +1061,7 @@ static noinline int do_test_wp_bit(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -static int kernel_set_to_readonly; +int kernel_set_to_readonly __read_mostly; void set_kernel_text_rw(void) { --- head-2010-05-12.orig/arch/x86/mm/init_64-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/mm/init_64-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -53,6 +53,7 @@ #include #include #include +#include #include @@ -809,7 +810,8 @@ kernel_physical_mapping_init(unsigned lo } #ifndef CONFIG_NUMA -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, + int acpi, int k8) { unsigned long bootmap_size, bootmap; @@ -862,6 +864,21 @@ void __init paging_init(void) */ #ifdef CONFIG_MEMORY_HOTPLUG /* + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. + */ +static void update_end_of_memory_vars(u64 start, u64 size) +{ + unsigned long end_pfn = PFN_UP(start + size); + + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } +} + +/* * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. 
*/ @@ -880,6 +897,9 @@ int arch_add_memory(int nid, u64 start, ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start, size); + return ret; } EXPORT_SYMBOL_GPL(arch_add_memory); @@ -948,12 +968,12 @@ void __init mem_init(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -static int kernel_set_to_readonly; +int kernel_set_to_readonly; void set_kernel_text_rw(void) { - unsigned long start = PFN_ALIGN(_stext); - unsigned long end = PFN_ALIGN(__start_rodata); + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); if (!kernel_set_to_readonly) return; @@ -961,13 +981,18 @@ void set_kernel_text_rw(void) pr_debug("Set kernel text: %lx - %lx for read write\n", start, end); + /* + * Make the kernel identity mapping for text RW. Kernel text + * mapping will always be RO. Refer to the comment in + * static_protections() in pageattr.c + */ set_memory_rw(start, (end - start) >> PAGE_SHIFT); } void set_kernel_text_ro(void) { - unsigned long start = PFN_ALIGN(_stext); - unsigned long end = PFN_ALIGN(__start_rodata); + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); if (!kernel_set_to_readonly) return; @@ -975,14 +1000,21 @@ void set_kernel_text_ro(void) pr_debug("Set kernel text: %lx - %lx for read only\n", start, end); + /* + * Set the kernel identity mapping for text RO. + */ set_memory_ro(start, (end - start) >> PAGE_SHIFT); } void mark_rodata_ro(void) { - unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); + unsigned long start = PFN_ALIGN(_text); unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + unsigned long end = (unsigned long) &__end_rodata; + unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); + unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); + unsigned long data_start = (unsigned long) &_sdata; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -1005,6 +1037,14 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: again\n"); set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif + + free_init_pages("unused kernel memory", + (unsigned long) page_address(virt_to_page(text_end)), + (unsigned long) + page_address(virt_to_page(rodata_start))); + free_init_pages("unused kernel memory", + (unsigned long) page_address(virt_to_page(rodata_end)), + (unsigned long) page_address(virt_to_page(data_start))); } #endif --- head-2010-05-12.orig/arch/x86/mm/ioremap-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/mm/ioremap-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -457,32 +457,6 @@ void __iomem *ioremap_cache(resource_siz } EXPORT_SYMBOL(ioremap_cache); -#ifndef CONFIG_XEN -static void __iomem *ioremap_default(resource_size_t phys_addr, - unsigned long size) -{ - unsigned long flags; - void __iomem *ret; - int err; - - /* - * - WB for WB-able memory and no other conflicting mappings - * - UC_MINUS for non-WB-able memory with no other conflicting mappings - * - Inherit from confliting mappings otherwise - */ - err = reserve_memtype(phys_addr, phys_addr + size, - _PAGE_CACHE_WB, &flags); - if (err < 0) - return NULL; - - ret = __ioremap_caller(phys_addr, size, flags, - __builtin_return_address(0)); - - free_memtype(phys_addr, phys_addr + size); - return ret; -} -#endif - void __iomem *ioremap_prot(resource_size_t phys_addr, 
unsigned long size, unsigned long prot_val) { @@ -558,7 +532,7 @@ void *xlate_dev_mem_ptr(unsigned long ph if (page_is_ram(start >> PAGE_SHIFT)) return __va(phys); - addr = (void __force *)ioremap_default(start, PAGE_SIZE); + addr = (void __force *)ioremap_cache(start, PAGE_SIZE); if (addr) addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); --- head-2010-05-12.orig/arch/x86/mm/pageattr-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/mm/pageattr-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -281,6 +281,22 @@ static inline pgprot_t static_protection __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) && !defined(CONFIG_XEN) + /* + * Once the kernel maps the text as RO (kernel_set_to_readonly is set), + * kernel text mappings for the large page aligned text, rodata sections + * will be always read-only. For the kernel identity mappings covering + * the holes caused by this alignment can be anything that user asks. + * + * This will preserve the large page mappings for kernel text/data + * at no extra cost. + */ + if (kernel_set_to_readonly && + within(address, (unsigned long)_text, + (unsigned long)__end_rodata_hpage_align)) + pgprot_val(forbidden) |= _PAGE_RW; +#endif + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); return prot; @@ -1135,12 +1151,18 @@ EXPORT_SYMBOL(set_memory_array_wb); int set_memory_x(unsigned long addr, int numpages) { + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_x); int set_memory_nx(unsigned long addr, int numpages) { + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_nx); --- head-2010-05-12.orig/arch/x86/mm/pat-xen.c 2010-03-24 15:32:27.000000000 +0100 +++ head-2010-05-12/arch/x86/mm/pat-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -381,9 +382,6 @@ static int free_ram_pages_type(u64 start * - _PAGE_CACHE_UC_MINUS * - _PAGE_CACHE_UC * - * req_type will have a special case value '-1', when requester want to inherit - * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. - * * If new_type is NULL, function will return an error if it cannot reserve the * region with req_type. If new_type is non-NULL, function will return * available type in new_type in case of no error. In case of any error @@ -403,9 +401,7 @@ int reserve_memtype(u64 start, u64 end, if (!pat_enabled) { /* This is identical to page table setting without PAT */ if (new_type) { - if (req_type == -1) - *new_type = _PAGE_CACHE_WB; - else if (req_type == _PAGE_CACHE_WC) + if (req_type == _PAGE_CACHE_WC) *new_type = _PAGE_CACHE_UC_MINUS; else *new_type = req_type & _PAGE_CACHE_MASK; @@ -414,7 +410,7 @@ int reserve_memtype(u64 start, u64 end, } /* Low ISA region is always mapped WB in page table. No need to track */ - if (is_ISA_range(start, end - 1)) { + if (x86_platform.is_untracked_pat_range(start, end)) { if (new_type) *new_type = _PAGE_CACHE_WB; return 0; @@ -525,7 +521,7 @@ int free_memtype(u64 start, u64 end) return 0; /* Low ISA region is always mapped WB. 
No need to track */ - if (is_ISA_range(start, end - 1)) + if (x86_platform.is_untracked_pat_range(start, end)) return 0; is_range_ram = pat_pagerange_is_ram(start, end); @@ -609,7 +605,7 @@ static unsigned long lookup_memtype(u64 int rettype = _PAGE_CACHE_WB; struct memtype *entry; - if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) + if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) return rettype; if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { @@ -736,9 +732,8 @@ int phys_mem_access_prot_allowed(struct if (!range_is_allowed(mfn, size)) return 0; - if (file->f_flags & O_SYNC) { + if (file->f_flags & O_DSYNC) flags = _PAGE_CACHE_UC_MINUS; - } #ifndef CONFIG_X86_32 #ifndef CONFIG_XEN /* Xen sets correct MTRR type on non-RAM for us. */ @@ -1032,8 +1027,10 @@ static const struct file_operations memt static int __init pat_memtype_list_init(void) { - debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, - NULL, &memtype_fops); + if (pat_enabled) { + debugfs_create_file("pat_memtype_list", S_IRUSR, + arch_debugfs_dir, NULL, &memtype_fops); + } return 0; } --- head-2010-05-12.orig/arch/x86/vdso/vdso32-setup-xen.c 2010-03-24 15:25:21.000000000 +0100 +++ head-2010-05-12/arch/x86/vdso/vdso32-setup-xen.c 2010-03-24 16:00:05.000000000 +0100 @@ -436,7 +436,6 @@ static ctl_table abi_table2[] = { static ctl_table abi_root_table2[] = { { - .ctl_name = CTL_ABI, .procname = "abi", .mode = 0555, .child = abi_table2 --- head-2010-05-12.orig/drivers/gpu/drm/nouveau/nouveau_sgdma.c 2010-04-15 09:29:04.000000000 +0200 +++ head-2010-05-12/drivers/gpu/drm/nouveau/nouveau_sgdma.c 2010-05-05 15:19:54.000000000 +0200 @@ -267,6 +267,15 @@ nouveau_sgdma_init(struct drm_device *de dev_priv->gart_info.sg_dummy_page = alloc_page(GFP_KERNEL|__GFP_DMA32); +#ifdef CONFIG_XEN + if (!dev_priv->gart_info.sg_dummy_page) + ret = ENOMEM; + else + ret = xen_limit_pages_to_max_mfn( + dev_priv->gart_info.sg_dummy_page, 0, 32); + if (ret) + NV_WARN(dev, "Error restricting SG dummy page: %d\n", ret); +#endif set_bit(PG_locked, &dev_priv->gart_info.sg_dummy_page->flags); dev_priv->gart_info.sg_dummy_bus = pci_map_page(dev->pdev, dev_priv->gart_info.sg_dummy_page, 0, --- head-2010-05-12.orig/drivers/gpu/drm/vmwgfx/Kconfig 2010-05-12 08:55:23.000000000 +0200 +++ head-2010-05-12/drivers/gpu/drm/vmwgfx/Kconfig 2010-04-15 10:13:09.000000000 +0200 @@ -1,6 +1,6 @@ config DRM_VMWGFX tristate "DRM driver for VMware Virtual GPU" - depends on DRM && PCI && FB + depends on DRM && PCI && FB && !XEN select FB_DEFERRED_IO select FB_CFB_FILLRECT select FB_CFB_COPYAREA --- head-2010-05-12.orig/drivers/oprofile/cpu_buffer.c 2010-03-24 15:17:58.000000000 +0100 +++ head-2010-05-12/drivers/oprofile/cpu_buffer.c 2010-03-24 16:00:05.000000000 +0100 @@ -449,7 +449,7 @@ void oprofile_add_pc(unsigned long pc, i */ void oprofile_add_mode(int cpu_mode) { - struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); + struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer); if (op_add_code(cpu_buf, 1, cpu_mode, current)) cpu_buf->sample_lost_overflow++; --- head-2010-05-12.orig/drivers/pci/Kconfig 2010-03-24 15:06:12.000000000 +0100 +++ head-2010-05-12/drivers/pci/Kconfig 2010-03-24 16:00:05.000000000 +0100 @@ -82,7 +82,7 @@ config PCI_IOV config PCI_IOAPIC bool - depends on PCI + depends on PCI && !XEN depends on ACPI depends on HOTPLUG default y --- head-2010-05-12.orig/drivers/scsi/Kconfig 2010-05-12 08:55:23.000000000 +0200 +++ head-2010-05-12/drivers/scsi/Kconfig 2010-03-24 16:00:05.000000000 +0100 @@ -650,7 
--- head-2010-05-12.orig/arch/x86/vdso/vdso32-setup-xen.c	2010-03-24 15:25:21.000000000 +0100
+++ head-2010-05-12/arch/x86/vdso/vdso32-setup-xen.c	2010-03-24 16:00:05.000000000 +0100
@@ -436,7 +436,6 @@ static ctl_table abi_table2[] = {
 
 static ctl_table abi_root_table2[] = {
 	{
-		.ctl_name = CTL_ABI,
 		.procname = "abi",
 		.mode = 0555,
 		.child = abi_table2
--- head-2010-05-12.orig/drivers/gpu/drm/nouveau/nouveau_sgdma.c	2010-04-15 09:29:04.000000000 +0200
+++ head-2010-05-12/drivers/gpu/drm/nouveau/nouveau_sgdma.c	2010-05-05 15:19:54.000000000 +0200
@@ -267,6 +267,15 @@ nouveau_sgdma_init(struct drm_device *de
 	dev_priv->gart_info.sg_dummy_page =
 		alloc_page(GFP_KERNEL|__GFP_DMA32);
+#ifdef CONFIG_XEN
+	if (!dev_priv->gart_info.sg_dummy_page)
+		ret = ENOMEM;
+	else
+		ret = xen_limit_pages_to_max_mfn(
+			dev_priv->gart_info.sg_dummy_page, 0, 32);
+	if (ret)
+		NV_WARN(dev, "Error restricting SG dummy page: %d\n", ret);
+#endif
 	set_bit(PG_locked, &dev_priv->gart_info.sg_dummy_page->flags);
 	dev_priv->gart_info.sg_dummy_bus =
 		pci_map_page(dev->pdev, dev_priv->gart_info.sg_dummy_page, 0,
--- head-2010-05-12.orig/drivers/gpu/drm/vmwgfx/Kconfig	2010-05-12 08:55:23.000000000 +0200
+++ head-2010-05-12/drivers/gpu/drm/vmwgfx/Kconfig	2010-04-15 10:13:09.000000000 +0200
@@ -1,6 +1,6 @@
 config DRM_VMWGFX
 	tristate "DRM driver for VMware Virtual GPU"
-	depends on DRM && PCI && FB
+	depends on DRM && PCI && FB && !XEN
 	select FB_DEFERRED_IO
 	select FB_CFB_FILLRECT
 	select FB_CFB_COPYAREA
--- head-2010-05-12.orig/drivers/oprofile/cpu_buffer.c	2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-12/drivers/oprofile/cpu_buffer.c	2010-03-24 16:00:05.000000000 +0100
@@ -449,7 +449,7 @@ void oprofile_add_pc(unsigned long pc, i
  */
 void oprofile_add_mode(int cpu_mode)
 {
-	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
+	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
 
 	if (op_add_code(cpu_buf, 1, cpu_mode, current))
 		cpu_buf->sample_lost_overflow++;
--- head-2010-05-12.orig/drivers/pci/Kconfig	2010-03-24 15:06:12.000000000 +0100
+++ head-2010-05-12/drivers/pci/Kconfig	2010-03-24 16:00:05.000000000 +0100
@@ -82,7 +82,7 @@ config PCI_IOV
 
 config PCI_IOAPIC
 	bool
-	depends on PCI
+	depends on PCI && !XEN
 	depends on ACPI
 	depends on HOTPLUG
 	default y
--- head-2010-05-12.orig/drivers/scsi/Kconfig	2010-05-12 08:55:23.000000000 +0200
+++ head-2010-05-12/drivers/scsi/Kconfig	2010-03-24 16:00:05.000000000 +0100
@@ -650,7 +650,7 @@ config SCSI_FLASHPOINT
 
 config VMWARE_PVSCSI
 	tristate "VMware PVSCSI driver support"
-	depends on PCI && SCSI && X86
+	depends on PCI && SCSI && !XEN && X86
 	help
 	  This driver supports VMware's para virtualized SCSI HBA.
 	  To compile this driver as a module, choose M here: the
--- head-2010-05-12.orig/drivers/xen/blktap2/sysfs.c	2010-03-24 15:12:46.000000000 +0100
+++ head-2010-05-12/drivers/xen/blktap2/sysfs.c	2010-03-24 16:00:05.000000000 +0100
@@ -39,11 +39,11 @@ blktap_sysfs_exit(struct blktap *tap)
 
 static ssize_t blktap_sysfs_pause_device(struct device *,
 					 struct device_attribute *,
 					 const char *, size_t);
-DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device);
+static DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device);
 static ssize_t blktap_sysfs_resume_device(struct device *,
 					  struct device_attribute *,
 					  const char *, size_t);
-DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device);
+static DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device);
 
 static ssize_t
 blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr,
@@ -103,8 +103,8 @@ blktap_sysfs_get_name(struct device *dev
 	return size;
 }
-DEVICE_ATTR(name, S_IRUSR | S_IWUSR,
-	    blktap_sysfs_get_name, blktap_sysfs_set_name);
+static DEVICE_ATTR(name, S_IRUSR | S_IWUSR,
+		   blktap_sysfs_get_name, blktap_sysfs_set_name);
 
 static ssize_t
 blktap_sysfs_remove_device(struct device *dev,
 			   struct device_attribute *attr,
@@ -123,7 +123,7 @@ blktap_sysfs_remove_device(struct device
 	return (err ? : size);
 }
-DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
+static DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
 
 static ssize_t
 blktap_sysfs_pause_device(struct device *dev,
 			  struct device_attribute *attr,
@@ -293,7 +293,7 @@ out:
 	return ret;
 }
-DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL);
+static DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL);
 
 int
 blktap_sysfs_create(struct blktap *tap)
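[Note: the blktap changes only add "static" to DEVICE_ATTR() uses, but that matters: the macro defines a variable named dev_attr_<name>, derived solely from the attribute name, so a non-static DEVICE_ATTR(pause, ...) or DEVICE_ATTR(debug, ...) in two drivers collides at link time. A self-contained mock of the pattern -- simplified types, not the definitions from <linux/device.h>:

	#include <stdio.h>

	struct attribute {
		const char *name;
		unsigned short mode;
	};

	struct device_attribute {
		struct attribute attr;
		/* ->show()/->store() omitted for brevity */
	};

	/* Mock expansion: the variable name comes from the first argument. */
	#define DEVICE_ATTR(_name, _mode)				\
		struct device_attribute dev_attr_##_name = {		\
			.attr = { .name = #_name, .mode = (_mode) },	\
		}

	/* Without "static", dev_attr_pause is a global symbol and would
	 * clash with any other object file defining the same attribute. */
	static DEVICE_ATTR(pause, 0200);

	int main(void)
	{
		printf("%s %o\n", dev_attr_pause.attr.name,
		       dev_attr_pause.attr.mode);
		return 0;
	}
]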
--- head-2010-05-12.orig/drivers/xen/char/mem.c	2010-03-24 15:25:06.000000000 +0100
+++ head-2010-05-12/drivers/xen/char/mem.c	2010-03-24 16:00:05.000000000 +0100
@@ -5,7 +5,7 @@
  *
  *  Added devfs support.
  *    Jan-11-1998, C. Scott Ananian
- *  Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar
+ *  Shared /dev/zero mmapping support, Feb 2000, Kanoj Sarcar
  */
 
 #include
@@ -25,9 +25,19 @@
 #include
 #include
 
+static inline unsigned long size_inside_page(unsigned long start,
+					     unsigned long size)
+{
+	unsigned long sz;
+
+	sz = PAGE_SIZE - (start & (PAGE_SIZE - 1));
+
+	return min(sz, size);
+}
+
 static inline int uncached_access(struct file *file)
 {
-	if (file->f_flags & O_SYNC)
+	if (file->f_flags & O_DSYNC)
 		return 1;
 	/* Xen sets correct MTRR type on non-RAM for us. */
 	return 0;
@@ -61,20 +71,14 @@ static inline int range_is_allowed(unsig
 static ssize_t read_mem(struct file * file, char __user * buf,
 			size_t count, loff_t *ppos)
 {
-	unsigned long p = *ppos, ignored;
+	unsigned long p = *ppos;
 	ssize_t read = 0, sz;
 	void __iomem *v;
 
 	while (count > 0) {
-		/*
-		 * Handle first page in case it's not aligned
-		 */
-		if (-p & (PAGE_SIZE - 1))
-			sz = -p & (PAGE_SIZE - 1);
-		else
-			sz = PAGE_SIZE;
+		unsigned long remaining;
 
-		sz = min_t(unsigned long, sz, count);
+		sz = size_inside_page(p, count);
 
 		if (!range_is_allowed(p >> PAGE_SHIFT, count))
 			return -EPERM;
@@ -95,10 +99,11 @@ static ssize_t read_mem(struct file * fi
 			break;
 		}
 
-		ignored = copy_to_user(buf, v, sz);
+		remaining = copy_to_user(buf, v, sz);
 		iounmap(v);
-		if (ignored)
+		if (remaining)
 			return -EFAULT;
+
 		buf += sz;
 		p += sz;
 		count -= sz;
@@ -117,15 +122,7 @@ static ssize_t write_mem(struct file * f
 	void __iomem *v;
 
 	while (count > 0) {
-		/*
-		 * Handle first page in case it's not aligned
-		 */
-		if (-p & (PAGE_SIZE - 1))
-			sz = -p & (PAGE_SIZE - 1);
-		else
-			sz = PAGE_SIZE;
-
-		sz = min_t(unsigned long, sz, count);
+		sz = size_inside_page(p, count);
 
 		if (!range_is_allowed(p >> PAGE_SHIFT, sz))
 			return -EPERM;
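[Note: the new size_inside_page() helper replaces the open-coded "handle the first unaligned page" logic in both read_mem() and write_mem(): it yields the number of bytes left before the next page boundary, clamped to the remaining count, so the first partial chunk and all later whole-page chunks fall out of one expression. A standalone sketch of the loop it enables, with PAGE_SIZE fixed at 4096 purely for illustration:

	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	/* Bytes from start up to the next page boundary, clamped to size. */
	static unsigned long size_inside_page(unsigned long start,
					      unsigned long size)
	{
		unsigned long sz = PAGE_SIZE - (start & (PAGE_SIZE - 1));

		return sz < size ? sz : size;
	}

	int main(void)
	{
		unsigned long p = 4000, count = 9000;	/* unaligned start */

		/* Produces chunks of 96, 4096, 4096 and 712 bytes. */
		while (count > 0) {
			unsigned long sz = size_inside_page(p, count);

			printf("copy %lu bytes at offset %lu\n", sz, p);
			p += sz;
			count -= sz;
		}
		return 0;
	}

The rename from "ignored" to "remaining" is also more than cosmetic: copy_to_user() returns the number of bytes it could not copy, and that value now decides between -EFAULT and carrying on.]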
--- head-2010-05-12.orig/drivers/xen/core/spinlock.c	2010-03-24 15:25:06.000000000 +0100
+++ head-2010-05-12/drivers/xen/core/spinlock.c	2010-03-24 16:00:05.000000000 +0100
@@ -18,7 +18,7 @@ static DEFINE_PER_CPU(int, spinlock_irq)
 static char spinlock_name[NR_CPUS][15];
 
 struct spinning {
-	raw_spinlock_t *lock;
+	arch_spinlock_t *lock;
 	unsigned int ticket;
 	struct spinning *prev;
 };
@@ -28,7 +28,7 @@ static DEFINE_PER_CPU(struct spinning *,
  * removal itself doesn't need protection - what needs to be prevented is
  * removed objects going out of scope (as they're allocated on the stack.
  */
-static DEFINE_PER_CPU(raw_rwlock_t, spinning_rm_lock) = __RAW_RW_LOCK_UNLOCKED;
+static DEFINE_PER_CPU(arch_rwlock_t, spinning_rm_lock) = __ARCH_RW_LOCK_UNLOCKED;
 
 int __cpuinit xen_spinlock_init(unsigned int cpu)
 {
@@ -58,7 +58,7 @@ void __cpuinit xen_spinlock_cleanup(unsi
 }
 
 static unsigned int spin_adjust(struct spinning *spinning,
-				const raw_spinlock_t *lock,
+				const arch_spinlock_t *lock,
 				unsigned int token)
 {
 	for (; spinning; spinning = spinning->prev)
@@ -76,18 +76,18 @@ static unsigned int spin_adjust(struct s
 	return token;
 }
 
-unsigned int xen_spin_adjust(const raw_spinlock_t *lock, unsigned int token)
+unsigned int xen_spin_adjust(const arch_spinlock_t *lock, unsigned int token)
 {
 	return spin_adjust(percpu_read(spinning), lock, token);
 }
 
-bool xen_spin_wait(raw_spinlock_t *lock, unsigned int *ptok,
+bool xen_spin_wait(arch_spinlock_t *lock, unsigned int *ptok,
 		   unsigned int flags)
 {
 	int irq = percpu_read(spinlock_irq);
 	bool rc;
 	typeof(vcpu_info(0)->evtchn_upcall_mask) upcall_mask;
-	raw_rwlock_t *rm_lock;
+	arch_rwlock_t *rm_lock;
 	struct spinning spinning, *other;
 
 	/* If kicker interrupt not initialized yet, just spin. */
@@ -137,7 +137,7 @@ bool xen_spin_wait(raw_spinlock_t *lock,
 			 * reduce latency after the current lock was
 			 * released), but don't acquire the lock.
 			 */
-			raw_spinlock_t *lock = other->lock;
+			arch_spinlock_t *lock = other->lock;
 
 			raw_local_irq_disable();
 			while (lock->cur == other->ticket) {
@@ -187,8 +187,8 @@ bool xen_spin_wait(raw_spinlock_t *lock,
 	percpu_write(spinning, other);
 	rm_lock = &__get_cpu_var(spinning_rm_lock);
 	raw_local_irq_disable();
-	__raw_write_lock(rm_lock);
-	__raw_write_unlock(rm_lock);
+	arch_write_lock(rm_lock);
+	arch_write_unlock(rm_lock);
 	*ptok = lock->cur | (spinning.ticket << TICKET_SHIFT);
 
 	/*
@@ -211,13 +211,13 @@ bool xen_spin_wait(raw_spinlock_t *lock,
 	return rc;
 }
 
-void xen_spin_kick(raw_spinlock_t *lock, unsigned int token)
+void xen_spin_kick(arch_spinlock_t *lock, unsigned int token)
 {
 	unsigned int cpu;
 
 	token &= (1U << TICKET_SHIFT) - 1;
 	for_each_online_cpu(cpu) {
-		raw_rwlock_t *rm_lock;
+		arch_rwlock_t *rm_lock;
 		unsigned long flags;
 		struct spinning *spinning;
 
@@ -226,7 +226,7 @@ void xen_spin_kick(raw_spinlock_t *lock,
 		rm_lock = &per_cpu(spinning_rm_lock, cpu);
 		raw_local_irq_save(flags);
-		__raw_read_lock(rm_lock);
+		arch_read_lock(rm_lock);
 
 		spinning = per_cpu(spinning, cpu);
 		smp_rmb();
@@ -236,7 +236,7 @@ void xen_spin_kick(raw_spinlock_t *lock,
 			spinning = spinning->prev;
 		}
 
-		__raw_read_unlock(rm_lock);
+		arch_read_unlock(rm_lock);
 		raw_local_irq_restore(flags);
 
 		if (unlikely(spinning)) {
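[Note: besides the mechanical renames (2.6.33 took the raw_spinlock_t/raw_rwlock_t names for the core lock code, so arch-level types became arch_spinlock_t/arch_rwlock_t and __RAW_RW_LOCK_UNLOCKED became __ARCH_RW_LOCK_UNLOCKED), notice the arch_write_lock()/arch_write_unlock() pair with an empty critical section: the 'spinning' entries live on each waiter's stack, so before xen_spin_wait() lets its entry go out of scope it must wait until every xen_spin_kick() reader that might still be walking the list has dropped the read lock. A schematic userspace analogue of that quiescence trick, using pthreads -- illustrative only, with the patch's smp_rmb()/irq-disable details elided:

	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t rm_lock = PTHREAD_RWLOCK_INITIALIZER;
	static int *published;	/* stands in for the per-CPU waiter list */

	/* Reader side (xen_spin_kick in the patch): inspects entries under
	 * the read lock so they cannot vanish mid-walk. */
	static void *reader(void *arg)
	{
		pthread_rwlock_rdlock(&rm_lock);
		if (published)
			printf("reader saw %d\n", *published);
		pthread_rwlock_unlock(&rm_lock);
		return NULL;
	}

	int main(void)
	{
		int on_stack = 42;	/* like struct spinning on the stack */
		pthread_t t;

		published = &on_stack;
		pthread_create(&t, NULL, reader, NULL);

		published = NULL;	/* unlink the entry ... */
		/* ... then take and drop the write lock: it cannot be
		 * granted while any reader still holds the read lock, so
		 * afterwards nothing can reference on_stack any more. */
		pthread_rwlock_wrlock(&rm_lock);
		pthread_rwlock_unlock(&rm_lock);

		pthread_join(t, NULL);
		return 0;
	}
]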
--- head-2010-05-12.orig/drivers/xen/evtchn.c	2010-04-15 10:08:13.000000000 +0200
+++ head-2010-05-12/drivers/xen/evtchn.c	2010-04-15 10:13:26.000000000 +0200
@@ -48,15 +48,14 @@
 #include
 #include
 
-#ifdef CONFIG_PARAVIRT_XEN
 #include
+#ifdef CONFIG_PARAVIRT_XEN
 #include
 #include
 #include
 #else
 #include
 #include
-#define xen_domain() is_running_on_xen()
 #define bind_evtchn_to_irqhandler bind_caller_port_to_irqhandler
 #endif
--- head-2010-05-12.orig/drivers/xen/netback/interface.c	2010-03-24 15:32:27.000000000 +0100
+++ head-2010-05-12/drivers/xen/netback/interface.c	2010-03-24 16:00:05.000000000 +0100
@@ -131,9 +131,13 @@ static const struct netif_stat {
 	{ "copied_skbs", offsetof(netif_t, nr_copied_skbs) },
 };
 
-static int netbk_get_stats_count(struct net_device *dev)
+static int netbk_get_sset_count(struct net_device *dev, int sset)
 {
-	return ARRAY_SIZE(netbk_stats);
+	switch (sset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(netbk_stats);
+	}
+	return -EINVAL;
 }
 
 static void netbk_get_ethtool_stats(struct net_device *dev,
@@ -171,7 +175,7 @@ static const struct ethtool_ops network_
 	.set_tso = netbk_set_tso,
 	.get_link = ethtool_op_get_link,
 
-	.get_stats_count = netbk_get_stats_count,
+	.get_sset_count = netbk_get_sset_count,
 	.get_ethtool_stats = netbk_get_ethtool_stats,
 	.get_strings = netbk_get_strings,
 };
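[Note: the netback hunk follows an ethtool API removal in 2.6.33: ->get_stats_count() is gone, and drivers report sizes through ->get_sset_count(), which receives a string-set id (ETH_SS_STATS, ETH_SS_TEST, ...) and returns an error for sets it does not support. The callback shape, as a self-contained mock -- the constants and one-entry table are stand-ins, not kernel headers:

	#include <stdio.h>

	#define EINVAL		22
	#define ETH_SS_TEST	0
	#define ETH_SS_STATS	1
	#define ARRAY_SIZE(a)	(sizeof(a) / sizeof((a)[0]))

	struct net_device;	/* opaque here */

	static const char *netbk_stats[] = { "copied_skbs" };

	/* One callback now serves every string set. */
	static int netbk_get_sset_count(struct net_device *dev, int sset)
	{
		switch (sset) {
		case ETH_SS_STATS:
			return ARRAY_SIZE(netbk_stats);
		}
		return -EINVAL;
	}

	int main(void)
	{
		printf("stats=%d test=%d\n",
		       netbk_get_sset_count(NULL, ETH_SS_STATS),
		       netbk_get_sset_count(NULL, ETH_SS_TEST));
		return 0;
	}
]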
--- head-2010-05-12.orig/drivers/xen/privcmd/compat_privcmd.c	2010-03-24 15:06:12.000000000 +0100
+++ head-2010-05-12/drivers/xen/privcmd/compat_privcmd.c	2010-03-24 16:00:05.000000000 +0100
@@ -26,17 +26,16 @@
 #include
 #include
 
-int privcmd_ioctl_32(int fd, unsigned int cmd, unsigned long arg)
+int privcmd_ioctl_32(int fd, unsigned int cmd, void __user *arg)
 {
 	int ret;
 
 	switch (cmd) {
 	case IOCTL_PRIVCMD_MMAP_32: {
-		struct privcmd_mmap *p;
-		struct privcmd_mmap_32 *p32;
+		struct privcmd_mmap __user *p;
+		struct privcmd_mmap_32 __user *p32 = arg;
 		struct privcmd_mmap_32 n32;
 
-		p32 = compat_ptr(arg);
 		p = compat_alloc_user_space(sizeof(*p));
 		if (copy_from_user(&n32, p32, sizeof(n32)) ||
 		    put_user(n32.num, &p->num) ||
@@ -48,8 +47,8 @@ int privcmd_ioctl_32(int fd, unsigned in
 	}
 	break;
 	case IOCTL_PRIVCMD_MMAPBATCH_32: {
-		struct privcmd_mmapbatch *p;
-		struct privcmd_mmapbatch_32 *p32;
+		struct privcmd_mmapbatch __user *p;
+		struct privcmd_mmapbatch_32 __user *p32 = arg;
 		struct privcmd_mmapbatch_32 n32;
 #ifdef xen_pfn32_t
 		xen_pfn_t *__user arr;
@@ -57,7 +56,6 @@ int privcmd_ioctl_32(int fd, unsigned in
 		unsigned int i;
 #endif
 
-		p32 = compat_ptr(arg);
 		p = compat_alloc_user_space(sizeof(*p));
 		if (copy_from_user(&n32, p32, sizeof(n32)) ||
 		    put_user(n32.num, &p->num) ||
@@ -97,8 +95,8 @@ int privcmd_ioctl_32(int fd, unsigned in
 	}
 	break;
 	case IOCTL_PRIVCMD_MMAPBATCH_V2_32: {
-		struct privcmd_mmapbatch_v2 *p;
-		struct privcmd_mmapbatch_v2_32 *p32;
+		struct privcmd_mmapbatch_v2 __user *p;
+		struct privcmd_mmapbatch_v2_32 __user *p32 = arg;
 		struct privcmd_mmapbatch_v2_32 n32;
 #ifdef xen_pfn32_t
 		xen_pfn_t *__user arr;
@@ -106,7 +104,6 @@ int privcmd_ioctl_32(int fd, unsigned in
 		unsigned int i;
 #endif
 
-		p32 = compat_ptr(arg);
 		p = compat_alloc_user_space(sizeof(*p));
 		if (copy_from_user(&n32, p32, sizeof(n32)) ||
 		    put_user(n32.num, &p->num) ||
--- head-2010-05-12.orig/drivers/xen/xenbus/xenbus_probe.c	2010-03-24 15:32:27.000000000 +0100
+++ head-2010-05-12/drivers/xen/xenbus/xenbus_probe.c	2010-03-24 16:00:05.000000000 +0100
@@ -62,6 +62,8 @@
 #endif
 #else
 #include
+
+#include
 #include
 #include
 #include
@@ -562,7 +564,7 @@ static ssize_t xendev_show_modalias(stru
 {
 	return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
 }
-DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
+static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
 
 int xenbus_probe_node(struct xen_bus_type *bus,
 		      const char *type,
--- head-2010-05-12.orig/fs/compat_ioctl.c	2010-05-12 08:57:55.000000000 +0200
+++ head-2010-05-12/fs/compat_ioctl.c	2010-05-12 09:02:56.000000000 +0200
@@ -1527,9 +1527,6 @@ IGNORE_IOCTL(FBIOGCURSOR32)
 #endif
 
 #ifdef CONFIG_XEN
-HANDLE_IOCTL(IOCTL_PRIVCMD_MMAP_32, privcmd_ioctl_32)
-HANDLE_IOCTL(IOCTL_PRIVCMD_MMAPBATCH_32, privcmd_ioctl_32)
-HANDLE_IOCTL(IOCTL_PRIVCMD_MMAPBATCH_V2_32, privcmd_ioctl_32)
 COMPATIBLE_IOCTL(IOCTL_PRIVCMD_HYPERCALL)
 COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_VIRQ)
 COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_INTERDOMAIN)
@@ -1605,6 +1602,12 @@ static long do_ioctl_trans(int fd, unsig
 		return do_video_stillpicture(fd, cmd, argp);
 	case VIDEO_SET_SPU_PALETTE:
 		return do_video_set_spu_palette(fd, cmd, argp);
+#ifdef CONFIG_XEN
+	case IOCTL_PRIVCMD_MMAP_32:
+	case IOCTL_PRIVCMD_MMAPBATCH_32:
+	case IOCTL_PRIVCMD_MMAPBATCH_V2_32:
+		return privcmd_ioctl_32(fd, cmd, argp);
+#endif
 	}
 
 	/*
--- head-2010-05-12.orig/include/acpi/processor.h	2010-03-24 15:17:58.000000000 +0100
+++ head-2010-05-12/include/acpi/processor.h	2010-03-24 16:00:05.000000000 +0100
@@ -324,7 +324,7 @@ static inline void acpi_processor_ppc_ex
 	return;
 }
 #ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
-int acpi_processor_ppc_has_changed(struct acpi_processor *pr);
+int acpi_processor_ppc_has_changed(struct acpi_processor *, int event_flag);
 #else
 static inline int acpi_processor_ppc_has_changed(struct acpi_processor *pr,
 						 int event_flag)
@@ -339,11 +339,11 @@ static inline int acpi_processor_ppc_has
 	}
 	return 0;
 }
-#endif	/* CONFIG_PROCESSOR_EXTERNAL_CONTROL */
 static inline int acpi_processor_get_bios_limit(int cpu, unsigned int *limit)
 {
 	return -ENODEV;
 }
+#endif	/* CONFIG_PROCESSOR_EXTERNAL_CONTROL */
 
 #endif	/* CONFIG_CPU_FREQ */
--- head-2010-05-12.orig/include/xen/compat_ioctl.h	2010-01-18 15:23:12.000000000 +0100
+++ head-2010-05-12/include/xen/compat_ioctl.h	2010-03-24 16:00:05.000000000 +0100
@@ -29,7 +29,7 @@
 #define xen_pfn32_t __u32
 #endif
 
-extern int privcmd_ioctl_32(int fd, unsigned int cmd,
-			    unsigned long arg);
+extern int privcmd_ioctl_32(int fd, unsigned int cmd, void __user *arg);
 
 struct privcmd_mmap_32 {
 	int num;
 	domid_t dom;
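[Note: the privcmd changes are one cleanup seen from three sides: the handler now takes a void __user * (so sparse can flag direct dereferences of user pointers), the old HANDLE_IOCTL() table entries are gone, and fs/compat_ioctl.c dispatches the three requests from the do_ioctl_trans() switch instead, matching the 2.6.33 restructuring of that file. The translation idiom itself is untouched: read the 32-bit layout, rebuild the native layout field by field in scratch space the native handler can use, and hand it on. Schematically, with a hypothetical request struct -- not the privcmd definitions, and with a plain static standing in for compat_alloc_user_space():

	#include <stdio.h>
	#include <stdint.h>

	/* Hypothetical 32-bit and native layouts of one request. */
	struct req_32 {
		int32_t  num;
		uint32_t addr;		/* 32-bit user pointer */
	};
	struct req {
		int64_t  num;
		uint64_t addr;		/* native-width pointer */
	};

	static struct req scratch;	/* kernel: compat_alloc_user_space() */

	static struct req *translate(const struct req_32 *p32)
	{
		/* Widen field by field; a raw memcpy would mix up the
		 * member sizes and padding of the two layouts. */
		scratch.num  = p32->num;
		scratch.addr = p32->addr;
		return &scratch;
	}

	int main(void)
	{
		struct req_32 r32 = { .num = 3, .addr = 0x1000 };
		struct req *r = translate(&r32);

		printf("num=%lld addr=0x%llx\n",
		       (long long)r->num, (unsigned long long)r->addr);
		return 0;
	}
]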
--- head-2010-05-12.orig/include/xen/evtchn.h	2010-03-31 14:37:09.000000000 +0200
+++ head-2010-05-12/include/xen/evtchn.h	2010-03-31 14:02:34.000000000 +0200
@@ -48,6 +48,7 @@
  * LOW-LEVEL DEFINITIONS
  */
 
+#ifdef CONFIG_XEN
 struct irq_cfg {
 	u32 info;
 	union {
@@ -57,8 +58,7 @@ struct irq_cfg {
 #endif
 	};
 };
-
-int assign_irq_vector(int irq, struct irq_cfg *, const struct cpumask *);
+#endif
 
 /*
  * Dynamically bind an event source to an IRQ-like callback handler.
--- head-2010-05-12.orig/include/xen/xen.h	2010-05-12 08:55:23.000000000 +0200
+++ head-2010-05-12/include/xen/xen.h	2010-03-31 14:03:59.000000000 +0200
@@ -7,8 +7,10 @@ enum xen_domain_type {
 	XEN_HVM_DOMAIN,		/* running in a Xen hvm domain */
 };
 
-#ifdef CONFIG_XEN
+#if defined(CONFIG_PARAVIRT_XEN)
 extern enum xen_domain_type xen_domain_type;
+#elif defined(CONFIG_XEN)
+#define xen_domain_type		XEN_PV_DOMAIN
 #else
 #define xen_domain_type		XEN_NATIVE
 #endif
@@ -25,6 +27,8 @@ extern enum xen_domain_type xen_domain_t
 #define xen_initial_domain()	(xen_pv_domain() && \
 				 xen_start_info->flags & SIF_INITDOMAIN)
+#elif defined(CONFIG_XEN)
+#define xen_initial_domain()	is_initial_xendomain()
 #else  /* !CONFIG_XEN_DOM0 */
 #define xen_initial_domain()	(0)
 #endif	/* CONFIG_XEN_DOM0 */
--- head-2010-05-12.orig/kernel/sysctl_binary.c	2010-04-15 09:55:52.000000000 +0200
+++ head-2010-05-12/kernel/sysctl_binary.c	2010-04-15 10:13:33.000000000 +0200
@@ -875,9 +875,10 @@ static const struct bin_table bin_bus_ta
 
 #ifdef CONFIG_XEN
-static const struct trans_ctl_table trans_xen_table[] = {
-	{ CTL_XEN_INDEPENDENT_WALLCLOCK,	"independent_wallclock" },
-	{ CTL_XEN_PERMITTED_CLOCK_JITTER,	"permitted_clock_jitter" },
+#include
+static const struct bin_table bin_xen_table[] = {
+	{ CTL_INT,	CTL_XEN_INDEPENDENT_WALLCLOCK,	"independent_wallclock" },
+	{ CTL_ULONG,	CTL_XEN_PERMITTED_CLOCK_JITTER,	"permitted_clock_jitter" },
 	{}
 };
 #endif
@@ -922,7 +923,7 @@ static const struct bin_table bin_root_t
 	{ CTL_DIR,	CTL_ABI,	"abi" },
 	/* CTL_CPU not used */
 #ifdef CONFIG_XEN
-	{ CTL_XEN,	"xen",	trans_xen_table },
+	{ CTL_DIR,	CTL_XEN,	"xen",	bin_xen_table },
 #endif
 	/* CTL_ARLAN "arlan" no longer used */
 	{ CTL_DIR,	CTL_S390DBF,	"s390dbf",	bin_s390dbf_table },
--- head-2010-05-12.orig/kernel/sysctl_check.c	2010-03-24 15:10:29.000000000 +0100
+++ head-2010-05-12/kernel/sysctl_check.c	2010-03-24 16:00:05.000000000 +0100
@@ -4,7 +4,6 @@
 #include
 #include
 #include
-#include
 
 static int sysctl_depth(struct ctl_table *table)
--- head-2010-05-12.orig/lib/swiotlb-xen.c	2010-03-24 15:32:27.000000000 +0100
+++ head-2010-05-12/lib/swiotlb-xen.c	2010-03-24 16:00:05.000000000 +0100
@@ -106,6 +106,7 @@ setup_io_tlb_npages(char *str)
 		swiotlb_force = 1;
 	else if (!strcmp(str, "off"))
 		swiotlb_force = -1;
+	return 1;
 }
 __setup("swiotlb=", setup_io_tlb_npages);
@@ -118,8 +119,10 @@ static dma_addr_t swiotlb_virt_to_bus(st
 	return phys_to_dma(hwdev, virt_to_phys(address));
 }
 
-static void swiotlb_print_info(unsigned long bytes)
+void swiotlb_print_info(void)
 {
+	unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+
 	printk(KERN_INFO "Software IO TLB enabled: \n"
 	       " Aperture: %lu megabytes\n"
 	       " Address size: %u bits\n"
@@ -133,7 +136,7 @@ static void swiotlb_print_info(unsigned
  * structures for the software IO TLB used to implement the PCI DMA API.
 */
 void __init
-swiotlb_init_with_default_size(size_t default_size)
+swiotlb_init_with_default_size(size_t default_size, int verbose)
 {
 	unsigned long i, bytes;
 	int rc;
@@ -204,12 +207,12 @@ swiotlb_init_with_default_size(size_t de
 	} while (rc && dma_bits++ < max_dma_bits);
 	if (rc)
 		panic("No suitable physical memory available for SWIOTLB overflow buffer!\n");
-
-	swiotlb_print_info(bytes);
+	if (verbose)
+		swiotlb_print_info();
 }
 
 void __init
-swiotlb_init(void)
+swiotlb_init(int verbose)
 {
 	long ram_end;
 	size_t defsz = 64 * (1 << 20); /* 64MB default size */
@@ -227,7 +230,7 @@ swiotlb_init(void)
 	}
 
 	if (swiotlb)
-		swiotlb_init_with_default_size(defsz);
+		swiotlb_init_with_default_size(defsz, verbose);
 	else
 		printk(KERN_INFO "Software IO TLB disabled\n");
 }
@@ -416,7 +419,7 @@ do_unmap_single(struct device *hwdev, ch
 
 	/*
 	 * Return the buffer to the free list by setting the corresponding
-	 * entries to indicate the number of contigous entries available.
+	 * entries to indicate the number of contiguous entries available.
 	 * While returning the entries to the free list, we merge the entries
 	 * with slots below and above the pool being returned.
	 */
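[Note: two of the swiotlb hunks deserve a remark. The added "return 1;" in setup_io_tlb_npages() fixes a handler that previously fell off the end without a return value: a __setup() callback must return 1 to mark its option as consumed, otherwise "swiotlb=..." is treated as unhandled and passed along to init. The swiotlb_print_info()/swiotlb_init(verbose) reshuffle follows the 2.6.33 generic swiotlb interface, which lets callers defer or suppress the banner. The return-value contract, as a userspace mock:

	#include <stdio.h>
	#include <string.h>

	static int swiotlb_force;

	/* Mock __setup() handler: returning 1 tells the boot-option parser
	 * the argument was consumed; 0 would let it fall through. */
	static int setup_io_tlb_npages(char *str)
	{
		if (!strcmp(str, "force"))
			swiotlb_force = 1;
		else if (!strcmp(str, "off"))
			swiotlb_force = -1;
		return 1;	/* the line the hunk adds */
	}

	int main(void)
	{
		int consumed = setup_io_tlb_npages("force");

		printf("consumed=%d force=%d\n", consumed, swiotlb_force);
		return 0;
	}
]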