From: Nick Piggin <npiggin@suse.de>
Subject: mm: /dev/zero optimisation
References: bnc#430738
Patch-mainline: no (could be submitted)

The patch that removed ZERO_PAGE from the main VM paths also removed the
/dev/zero optimisation of mapping directly from ZERO_PAGE when doing
mmap(), as well as the interesting read(2) "hack" where the MMU was used
to make zero-filling the target buffer zero-copy.
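
As a rough illustration (a userspace sketch, not part of the patch; the
size and the choice of a private /dev/zero mapping are arbitrary), the
kind of pattern that exercises both paths is a large, page-aligned
read(2) from /dev/zero into a private writable mapping:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#define LEN	(64UL << 20)	/* 64MB, arbitrary */

	int main(void)
	{
		int fd = open("/dev/zero", O_RDONLY);
		char *buf;

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* mmap_zero(): private writable mapping of /dev/zero */
		buf = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
		if (buf == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/* read_zero(): large page-aligned read into that mapping */
		if (read(fd, buf, LEN) != (ssize_t)LEN)
			perror("read");
		munmap(buf, LEN);
		close(fd);
		return 0;
	}

With the patch, the mmap() is backed by ZERO_PAGE and the read() is
satisfied by mapping zero pages over the buffer rather than clearing it
with clear_user().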

Some benchmarks have run into issues with this. Customers sometimes use
these benchmarks to qualify and test systems, so even if the benchmarks
themselves are "stupid", it saves some trouble to retain this
optimisation for them. Also, while I don't think it was established that
there is a "real" workload where this helps, it can't be proven that one
does not exist.

Signed-off-by: Nick Piggin <npiggin@suse.de>
---
 drivers/char/mem.c |  105 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/mm.h |    2 +
 mm/memory.c        |   87 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 193 insertions(+), 1 deletion(-)

--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -639,6 +639,100 @@ static ssize_t splice_write_null(struct
 	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_null);
 }
 
+#if 1 //ndef CONFIG_XEN
+/*
+ * For fun, we are using the MMU for this.
+ */
+static inline size_t read_zero_pagealigned(char __user * buf, size_t size)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct * vma;
+	unsigned long addr=(unsigned long)buf;
+
+	mm = current->mm;
+	/* Oops, this was forgotten before. -ben */
+	down_read(&mm->mmap_sem);
+
+	/* For private mappings, just map in zero pages. */
+	for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
+		unsigned long count;
+
+		if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
+			goto out_up;
+		if (vma->vm_flags & (VM_SHARED | VM_HUGETLB))
+			break;
+		count = vma->vm_end - addr;
+		if (count > size)
+			count = size;
+
+		zap_page_range(vma, addr, count, NULL);
+		if (zeromap_page_range(vma, addr, count, PAGE_COPY))
+			break;
+
+		size -= count;
+		buf += count;
+		addr += count;
+		if (size == 0)
+			goto out_up;
+	}
+
+	up_read(&mm->mmap_sem);
+
+	/* The shared case is hard. Let's do the conventional zeroing. */
+	do {
+		unsigned long unwritten = clear_user(buf, PAGE_SIZE);
+		if (unwritten)
+			return size + unwritten - PAGE_SIZE;
+		cond_resched();
+		buf += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	} while (size);
+
+	return size;
+out_up:
+	up_read(&mm->mmap_sem);
+	return size;
+}
+
+static ssize_t read_zero(struct file *file, char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	unsigned long left, unwritten, written = 0;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	left = count;
+
+	/* do we want to be clever? Arbitrary cut-off */
+	if (count >= PAGE_SIZE*4) {
+		unsigned long partial;
+
+		/* How much left of the page? */
+		partial = (PAGE_SIZE-1) & -(unsigned long) buf;
+		unwritten = clear_user(buf, partial);
+		written = partial - unwritten;
+		if (unwritten)
+			goto out;
+		left -= partial;
+		buf += partial;
+		unwritten = read_zero_pagealigned(buf, left & PAGE_MASK);
+		written += (left & PAGE_MASK) - unwritten;
+		if (unwritten)
+			goto out;
+		buf += left & PAGE_MASK;
+		left &= ~PAGE_MASK;
+	}
+	unwritten = clear_user(buf, left);
+	written += left - unwritten;
+out:
+	return written ? written : -EFAULT;
+}
+
+#else /* CONFIG_XEN */
 static ssize_t read_zero(struct file *file, char __user *buf,
 			 size_t count, loff_t *ppos)
 {
@@ -669,15 +763,24 @@ static ssize_t read_zero(struct file *fi
 	}
 	return written ? written : -EFAULT;
 }
+#endif /* CONFIG_XEN */
 
 static int mmap_zero(struct file *file, struct vm_area_struct *vma)
 {
+	int err = 0;
+
 #ifndef CONFIG_MMU
 	return -ENOSYS;
 #endif
+
 	if (vma->vm_flags & VM_SHARED)
 		return shmem_zero_setup(vma);
-	return 0;
+#if 1 //ndef CONFIG_XEN
+	err = zeromap_page_range(vma, vma->vm_start,
+			vma->vm_end - vma->vm_start, vma->vm_page_prot);
+	BUG_ON(err == -EEXIST);
+#endif
+	return err;
 }
 
 static ssize_t write_full(struct file *file, const char __user *buf,
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -799,6 +799,8 @@ void free_pgd_range(struct mmu_gather *t
 		unsigned long end, unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma);
+int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
+			unsigned long size, pgprot_t prot);
 void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1590,6 +1590,93 @@ struct page *get_dump_page(unsigned long
 }
 #endif /* CONFIG_ELF_CORE */
 
+static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+			unsigned long addr, unsigned long end, pgprot_t prot)
+{
+	pte_t *pte;
+	spinlock_t *ptl;
+	int err = 0;
+
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+	if (!pte)
+		return -EAGAIN;
+	arch_enter_lazy_mmu_mode();
+	do {
+		pte_t zero_pte;
+
+		if (unlikely(!pte_none(*pte))) {
+			err = -EEXIST;
+			pte++;
+			break;
+		}
+		zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(addr), prot));
+		zero_pte = pte_wrprotect(zero_pte);
+		set_pte_at(mm, addr, pte, zero_pte);
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(pte - 1, ptl);
+	return err;
+}
+
+static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
+			unsigned long addr, unsigned long end, pgprot_t prot)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	int err;
+
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		return -EAGAIN;
+	do {
+		next = pmd_addr_end(addr, end);
+		err = zeromap_pte_range(mm, pmd, addr, next, prot);
+		if (err)
+			break;
+	} while (pmd++, addr = next, addr != end);
+	return err;
+}
+
+static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
+			unsigned long addr, unsigned long end, pgprot_t prot)
+{
+	pud_t *pud;
+	unsigned long next;
+	int err;
+
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		return -EAGAIN;
+	do {
+		next = pud_addr_end(addr, end);
+		err = zeromap_pmd_range(mm, pud, addr, next, prot);
+		if (err)
+			break;
+	} while (pud++, addr = next, addr != end);
+	return err;
+}
+
+int zeromap_page_range(struct vm_area_struct *vma,
+			unsigned long addr, unsigned long size, pgprot_t prot)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long end = addr + size;
+	struct mm_struct *mm = vma->vm_mm;
+	int err;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset(mm, addr);
+	flush_cache_range(vma, addr, end);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = zeromap_pud_range(mm, pgd, addr, next, prot);
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+	return err;
+}
+
 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
 			spinlock_t **ptl)
 {