From: jbeulich@novell.com
Subject: eliminate scalability issues from initial mapping setup
Patch-mainline: obsolete
References: bnc#417417

Direct Xen to place the initial P->M table outside of the initial
mapping, as otherwise the 1G (implementation) / 2G (theoretical)
restriction on the size of the initial mapping limits the amount
of memory a domain can be handed initially.
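
The mechanism is the XEN_ELFNOTE_INIT_P2M note added below: a domain
builder honoring that note places the initial P->M list at the virtual
address the note advertises (VMEMMAP_START) rather than inside the
initial mapping. As a minimal sketch (illustrative only, not part of
the patch; it merely restates the start_info test the hunks below
use), the guest can tell the two layouts apart:

    #include <linux/types.h>
    #include <xen/interface/xen.h>  /* struct start_info */

    /*
     * With the INIT_P2M note honored, mfn_list holds the advertised low
     * virtual address and first_p2m_pfn/nr_p2m_frames describe the frames
     * backing the list; otherwise mfn_list points into the initial
     * mapping, above __START_KERNEL_map.
     */
    static inline bool p2m_list_is_out_of_line(const struct start_info *si)
    {
        return si->mfn_list < __START_KERNEL_map;
    }
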
Note that the flags passed to HYPERVISOR_update_va_mapping() from
__make_page_writable() and make_lowmem_page_writable() intentionally
do not include UVMF_ALL. This is intended to be an optimal choice
between the overhead of a potential spurious page fault (as remote
CPUs may still have read-only translations in their TLBs) and the
overhead of cross processor flushes. Flushing on the local CPU
shouldn't be as expensive (and hence can be viewed as an optimization
avoiding the spurious page fault on the local CPU), but is required
when the functions are used before the page fault handler gets set up.
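
As a sketch of that choice (illustrative only; it mirrors the
pageattr-xen.c hunk at the end of this patch and shows the rejected
alternative for contrast):

    /* Local-only INVLPG: a remote CPU still holding the stale read-only
     * translation at worst takes one spurious, self-healing write fault. */
    if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
        BUG();

    /* Rejected alternative: UVMF_ALL | UVMF_INVLPG would flush the entry
     * on all CPUs via IPI, avoiding the spurious fault at the price of a
     * cross-processor flush on every call. */
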
--- head-2011-03-17.orig/arch/x86/kernel/head64-xen.c	2011-02-01 15:09:47.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/head64-xen.c	2011-02-03 14:42:41.000000000 +0100
@@ -124,6 +124,14 @@ void __init x86_64_start_reservations(ch
 
 	memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
 
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		xen_start_info->mfn_list = ~0UL;
+	else if (xen_start_info->mfn_list < __START_KERNEL_map)
+		memblock_x86_reserve_range(xen_start_info->first_p2m_pfn << PAGE_SHIFT,
+					   (xen_start_info->first_p2m_pfn
+					    + xen_start_info->nr_p2m_frames) << PAGE_SHIFT,
+					   "INITP2M");
+
 	/*
 	 * At this point everything still needed from the boot loader
 	 * or BIOS or kernel text should be early reserved or marked not
--- head-2011-03-17.orig/arch/x86/kernel/head_64-xen.S	2011-02-03 14:42:36.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/head_64-xen.S	2011-02-03 14:42:41.000000000 +0100
@@ -17,6 +17,7 @@
 #include <linux/elfnote.h>
 #include <asm/segment.h>
 #include <asm/page.h>
+#include <asm/pgtable.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
 #include <asm/dwarf2.h>
@@ -146,6 +147,7 @@ ENTRY(empty_zero_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64)
 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT)
+	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad VMEMMAP_START)
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel")
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
 	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
--- head-2011-03-17.orig/arch/x86/kernel/setup-xen.c	2011-02-03 14:42:11.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/setup-xen.c	2011-02-03 14:42:41.000000000 +0100
@@ -1173,7 +1173,7 @@ void __init setup_arch(char **cmdline_p)
 		difference = xen_start_info->nr_pages - max_pfn;
 
 		set_xen_guest_handle(reservation.extent_start,
-			((unsigned long *)xen_start_info->mfn_list) + max_pfn);
+			phys_to_machine_mapping + max_pfn);
 		reservation.nr_extents = difference;
 		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
 					   &reservation);
@@ -1190,14 +1190,86 @@ void __init setup_arch(char **cmdline_p)
 		phys_to_machine_mapping = alloc_bootmem_pages(
 			max_pfn * sizeof(unsigned long));
 		memcpy(phys_to_machine_mapping,
-		       (unsigned long *)xen_start_info->mfn_list,
+		       __va(__pa(xen_start_info->mfn_list)),
 		       p2m_pages * sizeof(unsigned long));
 		memset(phys_to_machine_mapping + p2m_pages, ~0,
 		       (max_pfn - p2m_pages) * sizeof(unsigned long));
-		free_bootmem(
-			__pa(xen_start_info->mfn_list),
-			PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
-					sizeof(unsigned long))));
+
+#ifdef CONFIG_X86_64
+		if (xen_start_info->mfn_list == VMEMMAP_START) {
+			/*
+			 * Since it is well isolated we can (and since it is
+			 * perhaps large we should) also free the page tables
+			 * mapping the initial P->M table.
+			 */
+			unsigned long va = VMEMMAP_START, pa;
+			pgd_t *pgd = pgd_offset_k(va);
+			pud_t *pud_page = pud_offset(pgd, 0);
+
+			BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK);
+			xen_l4_entry_update(pgd, __pgd(0));
+			for (;;) {
+				pud_t *pud = pud_page + pud_index(va);
+
+				if (pud_none(*pud))
+					va += PUD_SIZE;
+				else if (pud_large(*pud)) {
+					pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+					make_pages_writable(__va(pa),
+						PUD_SIZE >> PAGE_SHIFT,
+						XENFEAT_writable_page_tables);
+					free_bootmem(pa, PUD_SIZE);
+					va += PUD_SIZE;
+				} else {
+					pmd_t *pmd = pmd_offset(pud, va);
+
+					if (pmd_large(*pmd)) {
+						pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+						make_pages_writable(__va(pa),
+							PMD_SIZE >> PAGE_SHIFT,
+							XENFEAT_writable_page_tables);
+						free_bootmem(pa, PMD_SIZE);
+					} else if (!pmd_none(*pmd)) {
+						pte_t *pte = pte_offset_kernel(pmd, va);
+
+						for (i = 0; i < PTRS_PER_PTE; ++i) {
+							if (pte_none(pte[i]))
+								break;
+							pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+							make_page_writable(__va(pa),
+								XENFEAT_writable_page_tables);
+							free_bootmem(pa, PAGE_SIZE);
+						}
+						ClearPagePinned(virt_to_page(pte));
+						make_page_writable(pte,
+							XENFEAT_writable_page_tables);
+						free_bootmem(__pa(pte), PAGE_SIZE);
+					}
+					va += PMD_SIZE;
+					if (pmd_index(va))
+						continue;
+					ClearPagePinned(virt_to_page(pmd));
+					make_page_writable(pmd,
+						XENFEAT_writable_page_tables);
+					free_bootmem(__pa((unsigned long)pmd
+							  & PAGE_MASK),
+						     PAGE_SIZE);
+				}
+				if (!pud_index(va))
+					break;
+			}
+			ClearPagePinned(virt_to_page(pud_page));
+			make_page_writable(pud_page,
+					   XENFEAT_writable_page_tables);
+			free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK),
+				     PAGE_SIZE);
+		} else if (!WARN_ON(xen_start_info->mfn_list
+				    < __START_KERNEL_map))
+#endif
+			free_bootmem(__pa(xen_start_info->mfn_list),
+				     PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+						     sizeof(unsigned long))));
+
 
 	/*
 	 * Initialise the list of the frames that specify the list of
--- head-2011-03-17.orig/arch/x86/mm/init-xen.c	2011-02-01 15:41:35.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/init-xen.c	2011-02-03 14:42:41.000000000 +0100
@@ -340,9 +340,22 @@ unsigned long __init_refok init_memory_m
 
 	__flush_tlb_all();
 
-	if (!after_bootmem && e820_table_top > e820_table_start)
+	if (!after_bootmem && e820_table_top > e820_table_start) {
+#ifdef CONFIG_X86_64
+		if (xen_start_info->mfn_list < __START_KERNEL_map
+		    && e820_table_start <= xen_start_info->first_p2m_pfn
+		    && e820_table_top > xen_start_info->first_p2m_pfn) {
+			memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT,
+						   xen_start_info->first_p2m_pfn
+						   << PAGE_SHIFT,
+						   "PGTABLE");
+			e820_table_start = xen_start_info->first_p2m_pfn
+					   + xen_start_info->nr_p2m_frames;
+		}
+#endif
 		memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT,
 				 e820_table_top << PAGE_SHIFT, "PGTABLE");
+	}
 
 	if (!after_bootmem)
 		early_memtest(start, end);
--- head-2011-03-17.orig/arch/x86/mm/init_64-xen.c	2011-02-03 14:42:36.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/init_64-xen.c	2011-02-03 14:42:41.000000000 +0100
@@ -220,6 +220,17 @@ void sync_global_pgds(unsigned long star
 	}
 }
 
+static __init unsigned long get_table_end(void)
+{
+	BUG_ON(!e820_table_end);
+	if (xen_start_info->mfn_list < __START_KERNEL_map
+	    && e820_table_end == xen_start_info->first_p2m_pfn) {
+		e820_table_end += xen_start_info->nr_p2m_frames;
+		e820_table_top += xen_start_info->nr_p2m_frames;
+	}
+	return e820_table_end++;
+}
+
 /*
  * NOTE: This function is marked __ref because it calls __init function
  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -231,8 +242,7 @@ static __ref void *spp_getpage(void)
 	if (after_bootmem)
 		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
 	else if (e820_table_end < e820_table_top) {
-		ptr = __va(e820_table_end << PAGE_SHIFT);
-		e820_table_end++;
+		ptr = __va(get_table_end() << PAGE_SHIFT);
 		clear_page(ptr);
 	} else
 		ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -427,8 +437,7 @@ static __ref void *alloc_low_page(unsign
 		return adr;
 	}
 
-	BUG_ON(!e820_table_end);
-	pfn = e820_table_end++;
+	pfn = get_table_end();
 	if (pfn >= e820_table_top)
 		panic("alloc_low_page: ran out of memory");
 
@@ -454,14 +463,29 @@ static inline int __meminit make_readonl
 	/* Make new page tables read-only on the first pass. */
 	if (!xen_feature(XENFEAT_writable_page_tables)
 	    && !max_pfn_mapped
-	    && (paddr >= (e820_table_start << PAGE_SHIFT))
-	    && (paddr < (e820_table_top << PAGE_SHIFT)))
-		readonly = 1;
+	    && (paddr >= (e820_table_start << PAGE_SHIFT))) {
+		unsigned long top = e820_table_top;
+
+		/* Account for the range get_table_end() skips. */
+		if (xen_start_info->mfn_list < __START_KERNEL_map
+		    && e820_table_end <= xen_start_info->first_p2m_pfn
+		    && top > xen_start_info->first_p2m_pfn)
+			top += xen_start_info->nr_p2m_frames;
+		if (paddr < (top << PAGE_SHIFT))
+			readonly = 1;
+	}
 	/* Make old page tables read-only. */
 	if (!xen_feature(XENFEAT_writable_page_tables)
 	    && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
 	    && (paddr < (e820_table_end << PAGE_SHIFT)))
 		readonly = 1;
+	/* Make P->M table (and its page tables) read-only. */
+	if (!xen_feature(XENFEAT_writable_page_tables)
+	    && xen_start_info->mfn_list < __START_KERNEL_map
+	    && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT)
+	    && paddr < (xen_start_info->first_p2m_pfn
+			+ xen_start_info->nr_p2m_frames) << PAGE_SHIFT)
+		readonly = 1;
 
 	/*
 	 * No need for writable mapping of kernel image. This also ensures that
@@ -761,6 +785,12 @@ void __init xen_init_pt(void)
 	       (PTRS_PER_PUD - pud_index(__START_KERNEL_map))
 	       * sizeof(*level3_kernel_pgt));
 
+	/* Copy the initial P->M table mappings if necessary. */
+	addr = pgd_index(xen_start_info->mfn_list);
+	if (addr < pgd_index(__START_KERNEL_map))
+		init_level4_pgt[addr] =
+			((pgd_t *)xen_start_info->pt_base)[addr];
+
 	/* Do an early initialization of the fixmap area. */
 	addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
 	if (pud_present(level3_kernel_pgt[pud_index(addr)])) {
@@ -792,22 +822,27 @@
 void __init xen_finish_init_mapping(void)
 {
 	unsigned long start, end;
+	struct mmuext_op mmuext;
 
 	/* Re-vector virtual addresses pointing into the initial
 	   mapping to the just-established permanent ones. */
 	xen_start_info = __va(__pa(xen_start_info));
 	xen_start_info->pt_base = (unsigned long)
 		__va(__pa(xen_start_info->pt_base));
-	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (!xen_feature(XENFEAT_auto_translated_physmap)
+	    && xen_start_info->mfn_list >= __START_KERNEL_map)
 		phys_to_machine_mapping =
 			__va(__pa(xen_start_info->mfn_list));
-		xen_start_info->mfn_list = (unsigned long)
-			phys_to_machine_mapping;
-	}
 	if (xen_start_info->mod_start)
 		xen_start_info->mod_start = (unsigned long)
 			__va(__pa(xen_start_info->mod_start));
 
+	/* Unpin the no longer used Xen provided page tables. */
+	mmuext.cmd = MMUEXT_UNPIN_TABLE;
+	mmuext.arg1.mfn = virt_to_mfn(xen_start_info->pt_base);
+	if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF))
+		BUG();
+
 	/* Destroy the Xen-created mappings beyond the kernel image. */
 	start = PAGE_ALIGN(_brk_end);
 	end = __START_KERNEL_map + (e820_table_start << PAGE_SHIFT);
--- head-2011-03-17.orig/arch/x86/mm/pageattr-xen.c	2011-03-17 14:33:38.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/pageattr-xen.c	2011-03-17 14:35:24.000000000 +0100
@@ -1500,7 +1500,7 @@ static void __make_page_writable(unsigne
 
 	pte = lookup_address(va, &level);
 	BUG_ON(!pte || level != PG_LEVEL_4K);
-	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
+	if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
 		BUG();
 	if (in_secondary_range(va)) {
 		unsigned long pfn = pte_pfn(*pte);
--- head-2011-03-17.orig/arch/x86/mm/pgtable-xen.c	2011-03-17 14:35:10.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/pgtable-xen.c	2011-02-03 14:42:41.000000000 +0100
@@ -344,7 +344,7 @@ void __init xen_init_pgd_pin(void)
 		if (PTRS_PER_PUD > 1) /* not folded */
 			SetPagePinned(virt_to_page(pud));
 		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
-			if (!pud_present(*pud))
+			if (!pud_present(*pud) || pud_large(*pud))
 				continue;
 			pmd = pmd_offset(pud, 0);
 			if (PTRS_PER_PMD > 1) /* not folded */
@@ -355,7 +355,7 @@
 				    && m >= pmd_index(HYPERVISOR_VIRT_START))
 					continue;
 #endif
-				if (!pmd_present(*pmd))
+				if (!pmd_present(*pmd) || pmd_large(*pmd))
 					continue;
 				SetPagePinned(pmd_page(*pmd));
 			}
--- head-2011-03-17.orig/arch/x86/mm/pgtable_32-xen.c	2011-02-01 15:03:10.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/pgtable_32-xen.c	2011-02-03 14:42:41.000000000 +0100
@@ -174,6 +174,6 @@ void make_lowmem_page_writable(void *va,
 	pte = lookup_address((unsigned long)va, &level);
 	BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
 	rc = HYPERVISOR_update_va_mapping(
-		(unsigned long)va, pte_mkwrite(*pte), 0);
+		(unsigned long)va, pte_mkwrite(*pte), UVMF_INVLPG);
 	BUG_ON(rc);
 }