From: jbeulich@novell.com Subject: eliminate scalability issues from initial mapping setup Patch-mainline: obsolete References: bnc#417417 Direct Xen to place the initial P->M table outside of the initial mapping, as otherwise the 1G (implementation) / 2G (theoretical) restriction on the size of the initial mapping limits the amount of memory a domain can be handed initially. Note that the flags passed to HYPERVISOR_update_va_mapping() from __make_page_writable() and make_lowmem_page_writable() are intentionally not including UVMF_ALL. This is intended to be an optimal choice between the overhead of a potential spurious page fault (as remote CPUs may still have read-only translations in their TLBs) and the overhead of cross processor flushes. Flushing on the local CPU shouldn't be as expensive (and hence can be viewed as an optimization avoiding the spurious page fault on the local CPU), but is required when the functions are used before the page fault handler gets set up. --- head-2011-03-17.orig/arch/x86/kernel/head64-xen.c 2011-02-01 15:09:47.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/head64-xen.c 2011-02-03 14:42:41.000000000 +0100 @@ -124,6 +124,14 @@ void __init x86_64_start_reservations(ch memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + if (xen_feature(XENFEAT_auto_translated_physmap)) + xen_start_info->mfn_list = ~0UL; + else if (xen_start_info->mfn_list < __START_KERNEL_map) + memblock_x86_reserve_range(xen_start_info->first_p2m_pfn << PAGE_SHIFT, + (xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames) << PAGE_SHIFT, + "INITP2M"); + /* * At this point everything still needed from the boot loader * or BIOS or kernel text should be early reserved or marked not --- head-2011-03-17.orig/arch/x86/kernel/head_64-xen.S 2011-02-03 14:42:36.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/head_64-xen.S 2011-02-03 14:42:41.000000000 +0100 @@ -17,6 +17,7 @@ #include #include #include +#include 
#include #include #include @@ -146,6 +147,7 @@ ENTRY(empty_zero_page) ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT) + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad VMEMMAP_START) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) --- head-2011-03-17.orig/arch/x86/kernel/setup-xen.c 2011-02-03 14:42:11.000000000 +0100 +++ head-2011-03-17/arch/x86/kernel/setup-xen.c 2011-02-03 14:42:41.000000000 +0100 @@ -1173,7 +1173,7 @@ void __init setup_arch(char **cmdline_p) difference = xen_start_info->nr_pages - max_pfn; set_xen_guest_handle(reservation.extent_start, - ((unsigned long *)xen_start_info->mfn_list) + max_pfn); + phys_to_machine_mapping + max_pfn); reservation.nr_extents = difference; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); @@ -1190,14 +1190,86 @@ void __init setup_arch(char **cmdline_p) phys_to_machine_mapping = alloc_bootmem_pages( max_pfn * sizeof(unsigned long)); memcpy(phys_to_machine_mapping, - (unsigned long *)xen_start_info->mfn_list, + __va(__pa(xen_start_info->mfn_list)), p2m_pages * sizeof(unsigned long)); memset(phys_to_machine_mapping + p2m_pages, ~0, (max_pfn - p2m_pages) * sizeof(unsigned long)); - free_bootmem( - __pa(xen_start_info->mfn_list), - PFN_PHYS(PFN_UP(xen_start_info->nr_pages * - sizeof(unsigned long)))); + +#ifdef CONFIG_X86_64 + if (xen_start_info->mfn_list == VMEMMAP_START) { + /* + * Since it is well isolated we can (and since it is + * perhaps large we should) also free the page tables + * mapping the initial P->M table. 
+ */ + unsigned long va = VMEMMAP_START, pa; + pgd_t *pgd = pgd_offset_k(va); + pud_t *pud_page = pud_offset(pgd, 0); + + BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK); + xen_l4_entry_update(pgd, __pgd(0)); + for(;;) { + pud_t *pud = pud_page + pud_index(va); + + if (pud_none(*pud)) + va += PUD_SIZE; + else if (pud_large(*pud)) { + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; + make_pages_writable(__va(pa), + PUD_SIZE >> PAGE_SHIFT, + XENFEAT_writable_page_tables); + free_bootmem(pa, PUD_SIZE); + va += PUD_SIZE; + } else { + pmd_t *pmd = pmd_offset(pud, va); + + if (pmd_large(*pmd)) { + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; + make_pages_writable(__va(pa), + PMD_SIZE >> PAGE_SHIFT, + XENFEAT_writable_page_tables); + free_bootmem(pa, PMD_SIZE); + } else if (!pmd_none(*pmd)) { + pte_t *pte = pte_offset_kernel(pmd, va); + + for (i = 0; i < PTRS_PER_PTE; ++i) { + if (pte_none(pte[i])) + break; + pa = pte_pfn(pte[i]) << PAGE_SHIFT; + make_page_writable(__va(pa), + XENFEAT_writable_page_tables); + free_bootmem(pa, PAGE_SIZE); + } + ClearPagePinned(virt_to_page(pte)); + make_page_writable(pte, + XENFEAT_writable_page_tables); + free_bootmem(__pa(pte), PAGE_SIZE); + } + va += PMD_SIZE; + if (pmd_index(va)) + continue; + ClearPagePinned(virt_to_page(pmd)); + make_page_writable(pmd, + XENFEAT_writable_page_tables); + free_bootmem(__pa((unsigned long)pmd + & PAGE_MASK), + PAGE_SIZE); + } + if (!pud_index(va)) + break; + } + ClearPagePinned(virt_to_page(pud_page)); + make_page_writable(pud_page, + XENFEAT_writable_page_tables); + free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK), + PAGE_SIZE); + } else if (!WARN_ON(xen_start_info->mfn_list + < __START_KERNEL_map)) +#endif + free_bootmem(__pa(xen_start_info->mfn_list), + PFN_PHYS(PFN_UP(xen_start_info->nr_pages * + sizeof(unsigned long)))); + /* * Initialise the list of the frames that specify the list of --- head-2011-03-17.orig/arch/x86/mm/init-xen.c 2011-02-01 15:41:35.000000000 +0100 +++ 
head-2011-03-17/arch/x86/mm/init-xen.c 2011-02-03 14:42:41.000000000 +0100 @@ -340,9 +340,22 @@ unsigned long __init_refok init_memory_m __flush_tlb_all(); - if (!after_bootmem && e820_table_top > e820_table_start) + if (!after_bootmem && e820_table_top > e820_table_start) { +#ifdef CONFIG_X86_64 + if (xen_start_info->mfn_list < __START_KERNEL_map + && e820_table_start <= xen_start_info->first_p2m_pfn + && e820_table_top > xen_start_info->first_p2m_pfn) { + memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, + xen_start_info->first_p2m_pfn + << PAGE_SHIFT, + "PGTABLE"); + e820_table_start = xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames; + } +#endif memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT, "PGTABLE"); + } if (!after_bootmem) early_memtest(start, end); --- head-2011-03-17.orig/arch/x86/mm/init_64-xen.c 2011-02-03 14:42:36.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/init_64-xen.c 2011-02-03 14:42:41.000000000 +0100 @@ -220,6 +220,17 @@ void sync_global_pgds(unsigned long star } } +static __init unsigned long get_table_end(void) +{ + BUG_ON(!e820_table_end); + if (xen_start_info->mfn_list < __START_KERNEL_map + && e820_table_end == xen_start_info->first_p2m_pfn) { + e820_table_end += xen_start_info->nr_p2m_frames; + e820_table_top += xen_start_info->nr_p2m_frames; + } + return e820_table_end++; +} + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 
@@ -231,8 +242,7 @@ static __ref void *spp_getpage(void) if (after_bootmem) ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); else if (e820_table_end < e820_table_top) { - ptr = __va(e820_table_end << PAGE_SHIFT); - e820_table_end++; + ptr = __va(get_table_end() << PAGE_SHIFT); clear_page(ptr); } else ptr = alloc_bootmem_pages(PAGE_SIZE); @@ -427,8 +437,7 @@ static __ref void *alloc_low_page(unsign return adr; } - BUG_ON(!e820_table_end); - pfn = e820_table_end++; + pfn = get_table_end(); if (pfn >= e820_table_top) panic("alloc_low_page: ran out of memory"); @@ -454,14 +463,29 @@ static inline int __meminit make_readonl /* Make new page tables read-only on the first pass. */ if (!xen_feature(XENFEAT_writable_page_tables) && !max_pfn_mapped - && (paddr >= (e820_table_start << PAGE_SHIFT)) - && (paddr < (e820_table_top << PAGE_SHIFT))) - readonly = 1; + && (paddr >= (e820_table_start << PAGE_SHIFT))) { + unsigned long top = e820_table_top; + + /* Account for the range get_table_end() skips. */ + if (xen_start_info->mfn_list < __START_KERNEL_map + && e820_table_end <= xen_start_info->first_p2m_pfn + && top > xen_start_info->first_p2m_pfn) + top += xen_start_info->nr_p2m_frames; + if (paddr < (top << PAGE_SHIFT)) + readonly = 1; + } /* Make old page tables read-only. */ if (!xen_feature(XENFEAT_writable_page_tables) && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map)) && (paddr < (e820_table_end << PAGE_SHIFT))) readonly = 1; + /* Make P->M table (and its page tables) read-only. */ + if (!xen_feature(XENFEAT_writable_page_tables) + && xen_start_info->mfn_list < __START_KERNEL_map + && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT) + && paddr < (xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames) << PAGE_SHIFT) + readonly = 1; /* * No need for writable mapping of kernel image. 
This also ensures that @@ -761,6 +785,12 @@ void __init xen_init_pt(void) (PTRS_PER_PUD - pud_index(__START_KERNEL_map)) * sizeof(*level3_kernel_pgt)); + /* Copy the initial P->M table mappings if necessary. */ + addr = pgd_index(xen_start_info->mfn_list); + if (addr < pgd_index(__START_KERNEL_map)) + init_level4_pgt[addr] = + ((pgd_t *)xen_start_info->pt_base)[addr]; + /* Do an early initialization of the fixmap area. */ addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE); if (pud_present(level3_kernel_pgt[pud_index(addr)])) { @@ -792,22 +822,27 @@ void __init xen_init_pt(void) void __init xen_finish_init_mapping(void) { unsigned long start, end; + struct mmuext_op mmuext; /* Re-vector virtual addresses pointing into the initial mapping to the just-established permanent ones. */ xen_start_info = __va(__pa(xen_start_info)); xen_start_info->pt_base = (unsigned long) __va(__pa(xen_start_info->pt_base)); - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (!xen_feature(XENFEAT_auto_translated_physmap) + && xen_start_info->mfn_list >= __START_KERNEL_map) phys_to_machine_mapping = __va(__pa(xen_start_info->mfn_list)); - xen_start_info->mfn_list = (unsigned long) - phys_to_machine_mapping; - } if (xen_start_info->mod_start) xen_start_info->mod_start = (unsigned long) __va(__pa(xen_start_info->mod_start)); + /* Unpin the no longer used Xen provided page tables. */ + mmuext.cmd = MMUEXT_UNPIN_TABLE; + mmuext.arg1.mfn = virt_to_mfn(xen_start_info->pt_base); + if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF)) + BUG(); + /* Destroy the Xen-created mappings beyond the kernel image. 
*/ start = PAGE_ALIGN(_brk_end); end = __START_KERNEL_map + (e820_table_start << PAGE_SHIFT); --- head-2011-03-17.orig/arch/x86/mm/pageattr-xen.c 2011-03-17 14:33:38.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/pageattr-xen.c 2011-03-17 14:35:24.000000000 +0100 @@ -1500,7 +1500,7 @@ static void __make_page_writable(unsigne pte = lookup_address(va, &level); BUG_ON(!pte || level != PG_LEVEL_4K); - if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0)) + if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG)) BUG(); if (in_secondary_range(va)) { unsigned long pfn = pte_pfn(*pte); --- head-2011-03-17.orig/arch/x86/mm/pgtable-xen.c 2011-03-17 14:35:10.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/pgtable-xen.c 2011-02-03 14:42:41.000000000 +0100 @@ -344,7 +344,7 @@ void __init xen_init_pgd_pin(void) if (PTRS_PER_PUD > 1) /* not folded */ SetPagePinned(virt_to_page(pud)); for (u = 0; u < PTRS_PER_PUD; u++, pud++) { - if (!pud_present(*pud)) + if (!pud_present(*pud) || pud_large(*pud)) continue; pmd = pmd_offset(pud, 0); if (PTRS_PER_PMD > 1) /* not folded */ @@ -355,7 +355,7 @@ void __init xen_init_pgd_pin(void) && m >= pmd_index(HYPERVISOR_VIRT_START)) continue; #endif - if (!pmd_present(*pmd)) + if (!pmd_present(*pmd) || pmd_large(*pmd)) continue; SetPagePinned(pmd_page(*pmd)); } --- head-2011-03-17.orig/arch/x86/mm/pgtable_32-xen.c 2011-02-01 15:03:10.000000000 +0100 +++ head-2011-03-17/arch/x86/mm/pgtable_32-xen.c 2011-02-03 14:42:41.000000000 +0100 @@ -174,6 +174,6 @@ void make_lowmem_page_writable(void *va, pte = lookup_address((unsigned long)va, &level); BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte)); rc = HYPERVISOR_update_va_mapping( - (unsigned long)va, pte_mkwrite(*pte), 0); + (unsigned long)va, pte_mkwrite(*pte), UVMF_INVLPG); BUG_ON(rc); }