You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
qubes-linux-kernel/patches.xen/xen-x86_64-note-init-p2m

344 lines
13 KiB

From: jbeulich@novell.com
Subject: eliminate scalability issues from initial mapping setup
Patch-mainline: obsolete
References: bnc#417417
Direct Xen to place the initial P->M table outside of the initial
mapping, as otherwise the 1G (implementation) / 2G (theoretical)
restriction on the size of the initial mapping limits the amount
of memory a domain can be handed initially.
Note that the flags passed to HYPERVISOR_update_va_mapping() from
__make_page_writable() and make_lowmem_page_writable() are
intentionally not including UVMF_ALL. This is intended to be an optimal
choice between the overhead of a potential spurious page fault (as
remote CPUs may still have read-only translations in their TLBs) and
the overhead of cross processor flushes. Flushing on the local CPU
shouldn't be as expensive (and hence can be viewed as an optimization
avoiding the spurious page fault on the local CPU), but is required
when the functions are used before the page fault handler gets set up.
--- head-2011-03-17.orig/arch/x86/kernel/head64-xen.c 2011-02-01 15:09:47.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/head64-xen.c 2011-02-03 14:42:41.000000000 +0100
@@ -124,6 +124,14 @@ void __init x86_64_start_reservations(ch
memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ xen_start_info->mfn_list = ~0UL;
+ else if (xen_start_info->mfn_list < __START_KERNEL_map)
+ memblock_x86_reserve_range(xen_start_info->first_p2m_pfn << PAGE_SHIFT,
+ (xen_start_info->first_p2m_pfn
+ + xen_start_info->nr_p2m_frames) << PAGE_SHIFT,
+ "INITP2M");
+
/*
* At this point everything still needed from the boot loader
* or BIOS or kernel text should be early reserved or marked not
--- head-2011-03-17.orig/arch/x86/kernel/head_64-xen.S 2011-02-03 14:42:36.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/head_64-xen.S 2011-02-03 14:42:41.000000000 +0100
@@ -17,6 +17,7 @@
#include <linux/elfnote.h>
#include <asm/segment.h>
#include <asm/page.h>
+#include <asm/pgtable.h>
#include <asm/msr.h>
#include <asm/cache.h>
#include <asm/dwarf2.h>
@@ -146,6 +147,7 @@ ENTRY(empty_zero_page)
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT)
+ ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad VMEMMAP_START)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
--- head-2011-03-17.orig/arch/x86/kernel/setup-xen.c 2011-02-03 14:42:11.000000000 +0100
+++ head-2011-03-17/arch/x86/kernel/setup-xen.c 2011-02-03 14:42:41.000000000 +0100
@@ -1173,7 +1173,7 @@ void __init setup_arch(char **cmdline_p)
difference = xen_start_info->nr_pages - max_pfn;
set_xen_guest_handle(reservation.extent_start,
- ((unsigned long *)xen_start_info->mfn_list) + max_pfn);
+ phys_to_machine_mapping + max_pfn);
reservation.nr_extents = difference;
ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
&reservation);
@@ -1190,14 +1190,86 @@ void __init setup_arch(char **cmdline_p)
phys_to_machine_mapping = alloc_bootmem_pages(
max_pfn * sizeof(unsigned long));
memcpy(phys_to_machine_mapping,
- (unsigned long *)xen_start_info->mfn_list,
+ __va(__pa(xen_start_info->mfn_list)),
p2m_pages * sizeof(unsigned long));
memset(phys_to_machine_mapping + p2m_pages, ~0,
(max_pfn - p2m_pages) * sizeof(unsigned long));
- free_bootmem(
- __pa(xen_start_info->mfn_list),
- PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
- sizeof(unsigned long))));
+
+#ifdef CONFIG_X86_64
+ if (xen_start_info->mfn_list == VMEMMAP_START) {
+ /*
+ * Since it is well isolated we can (and since it is
+ * perhaps large we should) also free the page tables
+ * mapping the initial P->M table.
+ */
+ unsigned long va = VMEMMAP_START, pa;
+ pgd_t *pgd = pgd_offset_k(va);
+ pud_t *pud_page = pud_offset(pgd, 0);
+
+ BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK);
+ xen_l4_entry_update(pgd, __pgd(0));
+ for(;;) {
+ pud_t *pud = pud_page + pud_index(va);
+
+ if (pud_none(*pud))
+ va += PUD_SIZE;
+ else if (pud_large(*pud)) {
+ pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+ make_pages_writable(__va(pa),
+ PUD_SIZE >> PAGE_SHIFT,
+ XENFEAT_writable_page_tables);
+ free_bootmem(pa, PUD_SIZE);
+ va += PUD_SIZE;
+ } else {
+ pmd_t *pmd = pmd_offset(pud, va);
+
+ if (pmd_large(*pmd)) {
+ pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+ make_pages_writable(__va(pa),
+ PMD_SIZE >> PAGE_SHIFT,
+ XENFEAT_writable_page_tables);
+ free_bootmem(pa, PMD_SIZE);
+ } else if (!pmd_none(*pmd)) {
+ pte_t *pte = pte_offset_kernel(pmd, va);
+
+ for (i = 0; i < PTRS_PER_PTE; ++i) {
+ if (pte_none(pte[i]))
+ break;
+ pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+ make_page_writable(__va(pa),
+ XENFEAT_writable_page_tables);
+ free_bootmem(pa, PAGE_SIZE);
+ }
+ ClearPagePinned(virt_to_page(pte));
+ make_page_writable(pte,
+ XENFEAT_writable_page_tables);
+ free_bootmem(__pa(pte), PAGE_SIZE);
+ }
+ va += PMD_SIZE;
+ if (pmd_index(va))
+ continue;
+ ClearPagePinned(virt_to_page(pmd));
+ make_page_writable(pmd,
+ XENFEAT_writable_page_tables);
+ free_bootmem(__pa((unsigned long)pmd
+ & PAGE_MASK),
+ PAGE_SIZE);
+ }
+ if (!pud_index(va))
+ break;
+ }
+ ClearPagePinned(virt_to_page(pud_page));
+ make_page_writable(pud_page,
+ XENFEAT_writable_page_tables);
+ free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK),
+ PAGE_SIZE);
+ } else if (!WARN_ON(xen_start_info->mfn_list
+ < __START_KERNEL_map))
+#endif
+ free_bootmem(__pa(xen_start_info->mfn_list),
+ PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+ sizeof(unsigned long))));
+
/*
* Initialise the list of the frames that specify the list of
--- head-2011-03-17.orig/arch/x86/mm/init-xen.c 2011-02-01 15:41:35.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/init-xen.c 2011-02-03 14:42:41.000000000 +0100
@@ -340,9 +340,22 @@ unsigned long __init_refok init_memory_m
__flush_tlb_all();
- if (!after_bootmem && e820_table_top > e820_table_start)
+ if (!after_bootmem && e820_table_top > e820_table_start) {
+#ifdef CONFIG_X86_64
+ if (xen_start_info->mfn_list < __START_KERNEL_map
+ && e820_table_start <= xen_start_info->first_p2m_pfn
+ && e820_table_top > xen_start_info->first_p2m_pfn) {
+ memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT,
+ xen_start_info->first_p2m_pfn
+ << PAGE_SHIFT,
+ "PGTABLE");
+ e820_table_start = xen_start_info->first_p2m_pfn
+ + xen_start_info->nr_p2m_frames;
+ }
+#endif
memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT,
e820_table_top << PAGE_SHIFT, "PGTABLE");
+ }
if (!after_bootmem)
early_memtest(start, end);
--- head-2011-03-17.orig/arch/x86/mm/init_64-xen.c 2011-02-03 14:42:36.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/init_64-xen.c 2011-02-03 14:42:41.000000000 +0100
@@ -220,6 +220,17 @@ void sync_global_pgds(unsigned long star
}
}
+static __init unsigned long get_table_end(void)
+{
+ BUG_ON(!e820_table_end);
+ if (xen_start_info->mfn_list < __START_KERNEL_map
+ && e820_table_end == xen_start_info->first_p2m_pfn) {
+ e820_table_end += xen_start_info->nr_p2m_frames;
+ e820_table_top += xen_start_info->nr_p2m_frames;
+ }
+ return e820_table_end++;
+}
+
/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -231,8 +242,7 @@ static __ref void *spp_getpage(void)
if (after_bootmem)
ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
else if (e820_table_end < e820_table_top) {
- ptr = __va(e820_table_end << PAGE_SHIFT);
- e820_table_end++;
+ ptr = __va(get_table_end() << PAGE_SHIFT);
clear_page(ptr);
} else
ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -427,8 +437,7 @@ static __ref void *alloc_low_page(unsign
return adr;
}
- BUG_ON(!e820_table_end);
- pfn = e820_table_end++;
+ pfn = get_table_end();
if (pfn >= e820_table_top)
panic("alloc_low_page: ran out of memory");
@@ -454,14 +463,29 @@ static inline int __meminit make_readonl
/* Make new page tables read-only on the first pass. */
if (!xen_feature(XENFEAT_writable_page_tables)
&& !max_pfn_mapped
- && (paddr >= (e820_table_start << PAGE_SHIFT))
- && (paddr < (e820_table_top << PAGE_SHIFT)))
- readonly = 1;
+ && (paddr >= (e820_table_start << PAGE_SHIFT))) {
+ unsigned long top = e820_table_top;
+
+ /* Account for the range get_table_end() skips. */
+ if (xen_start_info->mfn_list < __START_KERNEL_map
+ && e820_table_end <= xen_start_info->first_p2m_pfn
+ && top > xen_start_info->first_p2m_pfn)
+ top += xen_start_info->nr_p2m_frames;
+ if (paddr < (top << PAGE_SHIFT))
+ readonly = 1;
+ }
/* Make old page tables read-only. */
if (!xen_feature(XENFEAT_writable_page_tables)
&& (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
&& (paddr < (e820_table_end << PAGE_SHIFT)))
readonly = 1;
+ /* Make P->M table (and its page tables) read-only. */
+ if (!xen_feature(XENFEAT_writable_page_tables)
+ && xen_start_info->mfn_list < __START_KERNEL_map
+ && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT)
+ && paddr < (xen_start_info->first_p2m_pfn
+ + xen_start_info->nr_p2m_frames) << PAGE_SHIFT)
+ readonly = 1;
/*
* No need for writable mapping of kernel image. This also ensures that
@@ -761,6 +785,12 @@ void __init xen_init_pt(void)
(PTRS_PER_PUD - pud_index(__START_KERNEL_map))
* sizeof(*level3_kernel_pgt));
+ /* Copy the initial P->M table mappings if necessary. */
+ addr = pgd_index(xen_start_info->mfn_list);
+ if (addr < pgd_index(__START_KERNEL_map))
+ init_level4_pgt[addr] =
+ ((pgd_t *)xen_start_info->pt_base)[addr];
+
/* Do an early initialization of the fixmap area. */
addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
if (pud_present(level3_kernel_pgt[pud_index(addr)])) {
@@ -792,22 +822,27 @@ void __init xen_init_pt(void)
void __init xen_finish_init_mapping(void)
{
unsigned long start, end;
+ struct mmuext_op mmuext;
/* Re-vector virtual addresses pointing into the initial
mapping to the just-established permanent ones. */
xen_start_info = __va(__pa(xen_start_info));
xen_start_info->pt_base = (unsigned long)
__va(__pa(xen_start_info->pt_base));
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ if (!xen_feature(XENFEAT_auto_translated_physmap)
+ && xen_start_info->mfn_list >= __START_KERNEL_map)
phys_to_machine_mapping =
__va(__pa(xen_start_info->mfn_list));
- xen_start_info->mfn_list = (unsigned long)
- phys_to_machine_mapping;
- }
if (xen_start_info->mod_start)
xen_start_info->mod_start = (unsigned long)
__va(__pa(xen_start_info->mod_start));
+ /* Unpin the no longer used Xen provided page tables. */
+ mmuext.cmd = MMUEXT_UNPIN_TABLE;
+ mmuext.arg1.mfn = virt_to_mfn(xen_start_info->pt_base);
+ if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF))
+ BUG();
+
/* Destroy the Xen-created mappings beyond the kernel image. */
start = PAGE_ALIGN(_brk_end);
end = __START_KERNEL_map + (e820_table_start << PAGE_SHIFT);
--- head-2011-03-17.orig/arch/x86/mm/pageattr-xen.c 2011-03-17 14:33:38.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/pageattr-xen.c 2011-03-17 14:35:24.000000000 +0100
@@ -1500,7 +1500,7 @@ static void __make_page_writable(unsigne
pte = lookup_address(va, &level);
BUG_ON(!pte || level != PG_LEVEL_4K);
- if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0))
+ if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
BUG();
if (in_secondary_range(va)) {
unsigned long pfn = pte_pfn(*pte);
--- head-2011-03-17.orig/arch/x86/mm/pgtable-xen.c 2011-03-17 14:35:10.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/pgtable-xen.c 2011-02-03 14:42:41.000000000 +0100
@@ -344,7 +344,7 @@ void __init xen_init_pgd_pin(void)
if (PTRS_PER_PUD > 1) /* not folded */
SetPagePinned(virt_to_page(pud));
for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
- if (!pud_present(*pud))
+ if (!pud_present(*pud) || pud_large(*pud))
continue;
pmd = pmd_offset(pud, 0);
if (PTRS_PER_PMD > 1) /* not folded */
@@ -355,7 +355,7 @@ void __init xen_init_pgd_pin(void)
&& m >= pmd_index(HYPERVISOR_VIRT_START))
continue;
#endif
- if (!pmd_present(*pmd))
+ if (!pmd_present(*pmd) || pmd_large(*pmd))
continue;
SetPagePinned(pmd_page(*pmd));
}
--- head-2011-03-17.orig/arch/x86/mm/pgtable_32-xen.c 2011-02-01 15:03:10.000000000 +0100
+++ head-2011-03-17/arch/x86/mm/pgtable_32-xen.c 2011-02-03 14:42:41.000000000 +0100
@@ -174,6 +174,6 @@ void make_lowmem_page_writable(void *va,
pte = lookup_address((unsigned long)va, &level);
BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
rc = HYPERVISOR_update_va_mapping(
- (unsigned long)va, pte_mkwrite(*pte), 0);
+ (unsigned long)va, pte_mkwrite(*pte), UVMF_INVLPG);
BUG_ON(rc);
}