From: jbeulich@novell.com Subject: don't require order-1 allocations for pgd-s Patch-mainline: n/a At the same time, remove the useless user-mode counterpart of init_level4_pgt. --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2010-11-23 16:31:40.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/hypervisor.h 2011-02-03 14:42:36.000000000 +0100 @@ -106,8 +106,8 @@ void do_hypervisor_callback(struct pt_re * be MACHINE addresses. */ -void xen_pt_switch(unsigned long ptr); -void xen_new_user_pt(unsigned long ptr); /* x86_64 only */ +void xen_pt_switch(pgd_t *); +void xen_new_user_pt(pgd_t *); /* x86_64 only */ void xen_load_gs(unsigned int selector); /* x86_64 only */ void xen_tlb_flush(void); void xen_invlpg(unsigned long ptr); @@ -115,7 +115,7 @@ void xen_invlpg(unsigned long ptr); void xen_l1_entry_update(pte_t *ptr, pte_t val); void xen_l2_entry_update(pmd_t *ptr, pmd_t val); void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */ -void xen_l4_entry_update(pgd_t *ptr, int user, pgd_t val); /* x86_64 only */ +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */ void xen_pgd_pin(pgd_t *); void xen_pgd_unpin(pgd_t *); --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/mmu_context.h 2011-02-08 10:25:49.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/mmu_context.h 2011-02-08 10:46:27.000000000 +0100 @@ -82,6 +82,9 @@ static inline void switch_mm(struct mm_s { unsigned cpu = smp_processor_id(); struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op; +#ifdef CONFIG_X86_64 + pgd_t *upgd; +#endif if (likely(prev != next)) { BUG_ON(!xen_feature(XENFEAT_writable_page_tables) && @@ -98,10 +101,11 @@ static inline void switch_mm(struct mm_s op->arg1.mfn = virt_to_mfn(next->pgd); op++; - /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */ + /* xen_new_user_pt(next->pgd) */ #ifdef CONFIG_X86_64 op->cmd = MMUEXT_NEW_USER_BASEPTR; - op->arg1.mfn = virt_to_mfn(__user_pgd(next->pgd)); + upgd = 
__user_pgd(next->pgd); + op->arg1.mfn = likely(upgd) ? virt_to_mfn(upgd) : 0; op++; #endif @@ -132,7 +136,7 @@ static inline void switch_mm(struct mm_s * to make sure to use no freed page tables. */ load_cr3(next->pgd); - xen_new_user_pt(__pa(__user_pgd(next->pgd))); + xen_new_user_pt(next->pgd); load_LDT_nolock(&next->context); } } --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgalloc.h 2011-02-03 14:41:13.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgalloc.h 2011-02-03 14:42:36.000000000 +0100 @@ -123,15 +123,13 @@ static inline void pud_populate(struct m #endif /* CONFIG_X86_PAE */ #if PAGETABLE_LEVELS > 3 -#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) - static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) { pgd_t ent = __pgd(_PAGE_TABLE | __pa(pud)); paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); if (unlikely(PagePinned(virt_to_page(pgd)))) - xen_l4_entry_update(pgd, 1, ent); + xen_l4_entry_update(pgd, ent); else *__user_pgd(pgd) = *pgd = ent; } --- head-2011-03-17.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-03 14:42:15.000000000 +0100 +++ head-2011-03-17/arch/x86/include/mach-xen/asm/pgtable_64.h 2011-02-03 14:42:36.000000000 +0100 @@ -111,18 +111,25 @@ static inline void xen_set_pud(pud_t *pu : (void)(*__pudp = xen_make_pud(0)); \ }) -#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) +static inline pgd_t *__user_pgd(pgd_t *pgd) +{ + if (unlikely(((unsigned long)pgd & PAGE_MASK) + == (unsigned long)init_level4_pgt)) + return NULL; + return (pgd_t *)(virt_to_page(pgd)->private + + ((unsigned long)pgd & ~PAGE_MASK)); +} static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd) { - xen_l4_entry_update(pgdp, 0, pgd); + xen_l4_entry_update(pgdp, pgd); } #define xen_pgd_clear(pgd) \ ({ \ pgd_t *__pgdp = (pgd); \ PagePinned(virt_to_page(__pgdp)) \ - ? xen_l4_entry_update(__pgdp, 1, xen_make_pgd(0)) \ + ? 
xen_l4_entry_update(__pgdp, xen_make_pgd(0)) \ : (void)(*__user_pgd(__pgdp) = *__pgdp = xen_make_pgd(0)); \ }) --- head-2011-03-17.orig/arch/x86/kernel/cpu/common-xen.c 2011-03-17 14:44:07.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/cpu/common-xen.c 2011-03-17 14:44:15.000000000 +0100 @@ -1064,8 +1064,7 @@ DEFINE_PER_CPU_FIRST(union irq_stack_uni void xen_switch_pt(void) { #ifdef CONFIG_XEN - xen_pt_switch(__pa_symbol(init_level4_pgt)); - xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt))); + xen_pt_switch(init_level4_pgt); #endif } --- head-2011-03-17.orig/arch/x86/kernel/head_64-xen.S 2011-02-01 14:55:46.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/head_64-xen.S 2011-02-03 14:42:36.000000000 +0100 @@ -56,14 +56,6 @@ ENTRY(name) __PAGE_ALIGNED_BSS NEXT_PAGE(init_level4_pgt) .fill 512,8,0 - /* - * We update two pgd entries to make kernel and user pgd consistent - * at pgd_populate(). It can be used for kernel modules. So we place - * this page here for those cases to avoid memory corruption. - * We also use this page to establish the initial mapping for the - * vsyscall area. 
- */ - .fill 512,8,0 NEXT_PAGE(level3_kernel_pgt) .fill 512,8,0 --- head-2011-03-17.orig/arch/x86/mm/hypervisor.c 2010-12-08 10:45:40.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/hypervisor.c 2011-02-03 14:42:36.000000000 +0100 @@ -521,7 +521,7 @@ void xen_l3_entry_update(pud_t *ptr, pud #endif #ifdef CONFIG_X86_64 -void xen_l4_entry_update(pgd_t *ptr, int user, pgd_t val) +void xen_l4_entry_update(pgd_t *ptr, pgd_t val) { mmu_update_t u[2]; struct page *page = NULL; @@ -534,8 +534,11 @@ void xen_l4_entry_update(pgd_t *ptr, int } u[0].ptr = virt_to_machine(ptr); u[0].val = __pgd_val(val); - if (user) { - u[1].ptr = virt_to_machine(__user_pgd(ptr)); + if (((unsigned long)ptr & ~PAGE_MASK) + <= pgd_index(TASK_SIZE_MAX) * sizeof(*ptr)) { + ptr = __user_pgd(ptr); + BUG_ON(!ptr); + u[1].ptr = virt_to_machine(ptr); u[1].val = __pgd_val(val); do_lN_entry_update(u, 2, page); } else @@ -543,21 +546,25 @@ void xen_l4_entry_update(pgd_t *ptr, int } #endif /* CONFIG_X86_64 */ -void xen_pt_switch(unsigned long ptr) +#ifdef CONFIG_X86_64 +void xen_pt_switch(pgd_t *pgd) { struct mmuext_op op; op.cmd = MMUEXT_NEW_BASEPTR; - op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + op.arg1.mfn = virt_to_mfn(pgd); BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } -void xen_new_user_pt(unsigned long ptr) +void xen_new_user_pt(pgd_t *pgd) { struct mmuext_op op; + + pgd = __user_pgd(pgd); op.cmd = MMUEXT_NEW_USER_BASEPTR; - op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + op.arg1.mfn = pgd ? 
virt_to_mfn(pgd) : 0; BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } +#endif void xen_tlb_flush(void) { @@ -634,7 +641,14 @@ void xen_pgd_pin(pgd_t *pgd) op[0].arg1.mfn = virt_to_mfn(pgd); #ifdef CONFIG_X86_64 op[1].cmd = op[0].cmd = MMUEXT_PIN_L4_TABLE; - op[1].arg1.mfn = virt_to_mfn(__user_pgd(pgd)); + pgd = __user_pgd(pgd); + if (pgd) + op[1].arg1.mfn = virt_to_mfn(pgd); + else { + op[1].cmd = MMUEXT_PIN_L3_TABLE; + op[1].arg1.mfn = pfn_to_mfn(__pa_symbol(level3_user_pgt) + >> PAGE_SHIFT); + } #endif if (HYPERVISOR_mmuext_op(op, NR_PGD_PIN_OPS, NULL, DOMID_SELF) < 0) BUG(); @@ -647,8 +661,10 @@ void xen_pgd_unpin(pgd_t *pgd) op[0].cmd = MMUEXT_UNPIN_TABLE; op[0].arg1.mfn = virt_to_mfn(pgd); #ifdef CONFIG_X86_64 + pgd = __user_pgd(pgd); + BUG_ON(!pgd); op[1].cmd = MMUEXT_UNPIN_TABLE; - op[1].arg1.mfn = virt_to_mfn(__user_pgd(pgd)); + op[1].arg1.mfn = virt_to_mfn(pgd); #endif if (HYPERVISOR_mmuext_op(op, NR_PGD_PIN_OPS, NULL, DOMID_SELF) < 0) BUG(); --- head-2011-03-17.orig/arch/x86/mm/init_64-xen.c 2010-11-23 16:31:40.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/init_64-xen.c 2011-02-03 14:42:36.000000000 +0100 @@ -761,9 +761,6 @@ void __init xen_init_pt(void) (PTRS_PER_PUD - pud_index(__START_KERNEL_map)) * sizeof(*level3_kernel_pgt)); - __user_pgd(init_level4_pgt)[pgd_index(VSYSCALL_START)] = - __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE); - /* Do an early initialization of the fixmap area. 
*/ addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE); if (pud_present(level3_kernel_pgt[pud_index(addr)])) { @@ -779,8 +776,6 @@ void __init xen_init_pt(void) early_make_page_readonly(init_level4_pgt, XENFEAT_writable_page_tables); - early_make_page_readonly(__user_pgd(init_level4_pgt), - XENFEAT_writable_page_tables); early_make_page_readonly(level3_kernel_pgt, XENFEAT_writable_page_tables); early_make_page_readonly(level3_user_pgt, --- head-2011-03-17.orig/arch/x86/mm/pgtable-xen.c 2010-11-23 16:31:40.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/pgtable-xen.c 2011-03-17 14:35:10.000000000 +0100 @@ -291,9 +291,11 @@ static void pgd_walk(pgd_t *pgd_base, pg BUG(); seq = 0; } + pgd = __user_pgd(pgd_base); + BUG_ON(!pgd); MULTI_update_va_mapping(mcl + seq, - (unsigned long)__user_pgd(pgd_base), - pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags), + (unsigned long)pgd, + pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, flags), 0); MULTI_update_va_mapping(mcl + seq + 1, (unsigned long)pgd_base, @@ -689,19 +691,37 @@ static void pgd_prepopulate_pmd(struct m } } +static inline pgd_t *user_pgd_alloc(pgd_t *pgd) +{ #ifdef CONFIG_X86_64 -/* We allocate two contiguous pages for kernel and user. 
*/ -#define PGD_ORDER 1 -#else -#define PGD_ORDER 0 + if (pgd) { + pgd_t *upgd = (void *)__get_free_page(PGALLOC_GFP); + + if (upgd) + set_page_private(virt_to_page(pgd), + (unsigned long)upgd); + else { + free_page((unsigned long)pgd); + pgd = NULL; + } + } +#endif + return pgd; +} + +static inline void user_pgd_free(pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + free_page(page_private(virt_to_page(pgd))); #endif +} pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *pmds[PREALLOCATED_PMDS]; - pgd = (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ORDER); + pgd = user_pgd_alloc((void *)__get_free_page(PGALLOC_GFP)); if (pgd == NULL) goto out; @@ -740,7 +760,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) out_free_pmds: free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb)); out_free_pgd: - free_pages((unsigned long)pgd, PGD_ORDER); + user_pgd_free(pgd); + free_page((unsigned long)pgd); out: return NULL; } @@ -759,7 +780,8 @@ void pgd_free(struct mm_struct *mm, pgd_ pgd_mop_up_pmds(mm, pgd); paravirt_pgd_free(mm, pgd); - free_pages((unsigned long)pgd, PGD_ORDER); + user_pgd_free(pgd); + free_page((unsigned long)pgd); } /* blktap and gntdev need this, as otherwise they would implicitly (and --- head-2011-03-17.orig/drivers/xen/core/machine_reboot.c 2011-02-03 14:42:15.000000000 +0100 +++ head-2011-03-17/drivers/xen/core/machine_reboot.c 2011-02-03 14:42:36.000000000 +0100 @@ -186,8 +186,7 @@ static int take_machine_down(void *_susp * in fast-suspend mode as that implies a new enough Xen. */ if (!suspend->fast_suspend) - xen_new_user_pt(__pa(__user_pgd( - current->active_mm->pgd))); + xen_new_user_pt(current->active_mm->pgd); #endif }