Merge branch 'devel-3.7'

Conflicts:
	config-pvops
	patches.xen/pvops-0005-xen-acpi-sleep-Enable-ACPI-sleep-via-the-__acpi_os_p.patch
	patches.xen/pvops-0009-xen-enlighten-Expose-MWAIT-and-MWAIT_LEAF-if-hypervi.patch
	patches.xen/pvops-blkfront-eject-support.patch
	patches.xen/pvops-netback-calculate-correctly-the-SKB-slots.patch
	rel-pvops
	series-pvops.conf
	version-pvops
devel-3.9
Marek Marczykowski 11 years ago
commit a2acb741b6

File diff suppressed because it is too large Load Diff

@ -133,6 +133,7 @@ fi
make prepare $MAKE_ARGS
make scripts $MAKE_ARGS
make scripts_basic $MAKE_ARGS
krel=$(make -s kernelrelease $MAKE_ARGS)
if [ "$krel" != "%kernelrelease" ]; then
@ -323,6 +324,7 @@ mkdir -p %buildroot/%vm_install_dir
/sbin/dracut --nomdadmconf --nolvmconf \
--kmoddir %buildroot/lib/modules/%kernelrelease \
--include %_sourcedir/vm-initramfs / \
--add "dm" --omit "plymouth" \
-d "xenblk xen-blkfront cdrom ext4 jbd2 crc16 dm_snapshot" \
%buildroot/%vm_install_dir/initramfs %kernelrelease

@ -38,8 +38,8 @@ index 610001d..68cf060 100644
-extern int acpi_suspend_lowlevel(void);
+extern int (*acpi_suspend_lowlevel)(void);
extern const unsigned char acpi_wakeup_code[];
#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code)))
/* Physical address to resume after wakeup */
#define acpi_wakeup_address ((unsigned long)(real_mode_header->wakeup_start))
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ce664f3..c3a5b95 100644
--- a/arch/x86/kernel/acpi/boot.c
@ -82,8 +82,8 @@ index 103b6ab..4d2d0b1 100644
-int acpi_suspend_lowlevel(void)
+int x86_acpi_suspend_lowlevel(void)
{
struct wakeup_header *header;
/* address in low memory of the wakeup routine. */
struct wakeup_header *header =
(struct wakeup_header *) __va(real_mode_header->wakeup_header);
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index 416d4be..4d3feb5 100644
--- a/arch/x86/kernel/acpi/sleep.h

@ -1,967 +0,0 @@
From d8414d3c157dc1f83e73c17447ba41fe5afa9d3d Mon Sep 17 00:00:00 2001
From: Bastian Blank <waldi@debian.org>
Date: Fri, 16 Dec 2011 11:34:33 -0500
Subject: xen: Add privcmd device driver
Access to arbitrary hypercalls is currently provided via xenfs. This
adds a standard character device to handle this. The support in xenfs
remains for backward compatibility and uses the device driver code.
Signed-off-by: Bastian Blank <waldi@debian.org>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
drivers/xen/Kconfig | 7 +
drivers/xen/Makefile | 2 +
drivers/xen/privcmd.c | 437 +++++++++++++++++++++++++++++++++++++++++++
drivers/xen/privcmd.h | 3 +
drivers/xen/xenfs/Makefile | 2 +-
drivers/xen/xenfs/privcmd.c | 400 ---------------------------------------
drivers/xen/xenfs/super.c | 3 +-
drivers/xen/xenfs/xenfs.h | 1 -
8 files changed, 452 insertions(+), 403 deletions(-)
create mode 100644 drivers/xen/privcmd.c
create mode 100644 drivers/xen/privcmd.h
delete mode 100644 drivers/xen/xenfs/privcmd.c
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 8795480..a1ced52 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -86,6 +86,7 @@ config XEN_BACKEND
config XENFS
tristate "Xen filesystem"
+ select XEN_PRIVCMD
default y
help
The xen filesystem provides a way for domains to share
@@ -171,4 +172,10 @@ config XEN_PCIDEV_BACKEND
xen-pciback.hide=(03:00.0)(04:00.0)
If in doubt, say m.
+
+config XEN_PRIVCMD
+ tristate
+ depends on XEN
+ default m
+
endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 974fffd..aa31337 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -19,7 +19,9 @@ obj-$(CONFIG_XEN_TMEM) += tmem.o
obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
obj-$(CONFIG_XEN_DOM0) += pci.o
obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/
+obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o
xen-evtchn-y := evtchn.o
xen-gntdev-y := gntdev.o
xen-gntalloc-y := gntalloc.o
+xen-privcmd-y := privcmd.o
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
new file mode 100644
index 0000000..4e8d3da
--- /dev/null
+++ b/drivers/xen/privcmd.c
@@ -0,0 +1,437 @@
+/******************************************************************************
+ * privcmd.c
+ *
+ * Interface to privileged domain-0 commands.
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/uaccess.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/miscdevice.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+
+#include "privcmd.h"
+
+MODULE_LICENSE("GPL");
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
+#endif
+
+static long privcmd_ioctl_hypercall(void __user *udata)
+{
+ struct privcmd_hypercall hypercall;
+ long ret;
+
+ if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
+ return -EFAULT;
+
+ ret = privcmd_call(hypercall.op,
+ hypercall.arg[0], hypercall.arg[1],
+ hypercall.arg[2], hypercall.arg[3],
+ hypercall.arg[4]);
+
+ return ret;
+}
+
+static void free_page_list(struct list_head *pages)
+{
+ struct page *p, *n;
+
+ list_for_each_entry_safe(p, n, pages, lru)
+ __free_page(p);
+
+ INIT_LIST_HEAD(pages);
+}
+
+/*
+ * Given an array of items in userspace, return a list of pages
+ * containing the data. If copying fails, either because of memory
+ * allocation failure or a problem reading user memory, return an
+ * error code; it's up to the caller to dispose of any partial list.
+ */
+static int gather_array(struct list_head *pagelist,
+ unsigned nelem, size_t size,
+ void __user *data)
+{
+ unsigned pageidx;
+ void *pagedata;
+ int ret;
+
+ if (size > PAGE_SIZE)
+ return 0;
+
+ pageidx = PAGE_SIZE;
+ pagedata = NULL; /* quiet, gcc */
+ while (nelem--) {
+ if (pageidx > PAGE_SIZE-size) {
+ struct page *page = alloc_page(GFP_KERNEL);
+
+ ret = -ENOMEM;
+ if (page == NULL)
+ goto fail;
+
+ pagedata = page_address(page);
+
+ list_add_tail(&page->lru, pagelist);
+ pageidx = 0;
+ }
+
+ ret = -EFAULT;
+ if (copy_from_user(pagedata + pageidx, data, size))
+ goto fail;
+
+ data += size;
+ pageidx += size;
+ }
+
+ ret = 0;
+
+fail:
+ return ret;
+}
+
+/*
+ * Call function "fn" on each element of the array fragmented
+ * over a list of pages.
+ */
+static int traverse_pages(unsigned nelem, size_t size,
+ struct list_head *pos,
+ int (*fn)(void *data, void *state),
+ void *state)
+{
+ void *pagedata;
+ unsigned pageidx;
+ int ret = 0;
+
+ BUG_ON(size > PAGE_SIZE);
+
+ pageidx = PAGE_SIZE;
+ pagedata = NULL; /* hush, gcc */
+
+ while (nelem--) {
+ if (pageidx > PAGE_SIZE-size) {
+ struct page *page;
+ pos = pos->next;
+ page = list_entry(pos, struct page, lru);
+ pagedata = page_address(page);
+ pageidx = 0;
+ }
+
+ ret = (*fn)(pagedata + pageidx, state);
+ if (ret)
+ break;
+ pageidx += size;
+ }
+
+ return ret;
+}
+
+struct mmap_mfn_state {
+ unsigned long va;
+ struct vm_area_struct *vma;
+ domid_t domain;
+};
+
+static int mmap_mfn_range(void *data, void *state)
+{
+ struct privcmd_mmap_entry *msg = data;
+ struct mmap_mfn_state *st = state;
+ struct vm_area_struct *vma = st->vma;
+ int rc;
+
+ /* Do not allow range to wrap the address space. */
+ if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
+ ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
+ return -EINVAL;
+
+ /* Range chunks must be contiguous in va space. */
+ if ((msg->va != st->va) ||
+ ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
+ return -EINVAL;
+
+ rc = xen_remap_domain_mfn_range(vma,
+ msg->va & PAGE_MASK,
+ msg->mfn, msg->npages,
+ vma->vm_page_prot,
+ st->domain);
+ if (rc < 0)
+ return rc;
+
+ st->va += msg->npages << PAGE_SHIFT;
+
+ return 0;
+}
+
+static long privcmd_ioctl_mmap(void __user *udata)
+{
+ struct privcmd_mmap mmapcmd;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int rc;
+ LIST_HEAD(pagelist);
+ struct mmap_mfn_state state;
+
+ if (!xen_initial_domain())
+ return -EPERM;
+
+ if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
+ return -EFAULT;
+
+ rc = gather_array(&pagelist,
+ mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+ mmapcmd.entry);
+
+ if (rc || list_empty(&pagelist))
+ goto out;
+
+ down_write(&mm->mmap_sem);
+
+ {
+ struct page *page = list_first_entry(&pagelist,
+ struct page, lru);
+ struct privcmd_mmap_entry *msg = page_address(page);
+
+ vma = find_vma(mm, msg->va);
+ rc = -EINVAL;
+
+ if (!vma || (msg->va != vma->vm_start) ||
+ !privcmd_enforce_singleshot_mapping(vma))
+ goto out_up;
+ }
+
+ state.va = vma->vm_start;
+ state.vma = vma;
+ state.domain = mmapcmd.dom;
+
+ rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+ &pagelist,
+ mmap_mfn_range, &state);
+
+
+out_up:
+ up_write(&mm->mmap_sem);
+
+out:
+ free_page_list(&pagelist);
+
+ return rc;
+}
+
+struct mmap_batch_state {
+ domid_t domain;
+ unsigned long va;
+ struct vm_area_struct *vma;
+ int err;
+
+ xen_pfn_t __user *user;
+};
+
+static int mmap_batch_fn(void *data, void *state)
+{
+ xen_pfn_t *mfnp = data;
+ struct mmap_batch_state *st = state;
+
+ if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
+ st->vma->vm_page_prot, st->domain) < 0) {
+ *mfnp |= 0xf0000000U;
+ st->err++;
+ }
+ st->va += PAGE_SIZE;
+
+ return 0;
+}
+
+static int mmap_return_errors(void *data, void *state)
+{
+ xen_pfn_t *mfnp = data;
+ struct mmap_batch_state *st = state;
+
+ return put_user(*mfnp, st->user++);
+}
+
+static struct vm_operations_struct privcmd_vm_ops;
+
+static long privcmd_ioctl_mmap_batch(void __user *udata)
+{
+ int ret;
+ struct privcmd_mmapbatch m;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long nr_pages;
+ LIST_HEAD(pagelist);
+ struct mmap_batch_state state;
+
+ if (!xen_initial_domain())
+ return -EPERM;
+
+ if (copy_from_user(&m, udata, sizeof(m)))
+ return -EFAULT;
+
+ nr_pages = m.num;
+ if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
+ return -EINVAL;
+
+ ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
+ m.arr);
+
+ if (ret || list_empty(&pagelist))
+ goto out;
+
+ down_write(&mm->mmap_sem);
+
+ vma = find_vma(mm, m.addr);
+ ret = -EINVAL;
+ if (!vma ||
+ vma->vm_ops != &privcmd_vm_ops ||
+ (m.addr != vma->vm_start) ||
+ ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
+ !privcmd_enforce_singleshot_mapping(vma)) {
+ up_write(&mm->mmap_sem);
+ goto out;
+ }
+
+ state.domain = m.dom;
+ state.vma = vma;
+ state.va = m.addr;
+ state.err = 0;
+
+ ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+ &pagelist, mmap_batch_fn, &state);
+
+ up_write(&mm->mmap_sem);
+
+ if (state.err > 0) {
+ state.user = m.arr;
+ ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+ &pagelist,
+ mmap_return_errors, &state);
+ }
+
+out:
+ free_page_list(&pagelist);
+
+ return ret;
+}
+
+static long privcmd_ioctl(struct file *file,
+ unsigned int cmd, unsigned long data)
+{
+ int ret = -ENOSYS;
+ void __user *udata = (void __user *) data;
+
+ switch (cmd) {
+ case IOCTL_PRIVCMD_HYPERCALL:
+ ret = privcmd_ioctl_hypercall(udata);
+ break;
+
+ case IOCTL_PRIVCMD_MMAP:
+ ret = privcmd_ioctl_mmap(udata);
+ break;
+
+ case IOCTL_PRIVCMD_MMAPBATCH:
+ ret = privcmd_ioctl_mmap_batch(udata);
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
+ vma, vma->vm_start, vma->vm_end,
+ vmf->pgoff, vmf->virtual_address);
+
+ return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct privcmd_vm_ops = {
+ .fault = privcmd_fault
+};
+
+static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ /* Unsupported for auto-translate guests. */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return -ENOSYS;
+
+ /* DONTCOPY is essential for Xen because copy_page_range doesn't know
+ * how to recreate these mappings */
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
+ vma->vm_ops = &privcmd_vm_ops;
+ vma->vm_private_data = NULL;
+
+ return 0;
+}
+
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
+{
+ return (xchg(&vma->vm_private_data, (void *)1) == NULL);
+}
+#endif
+
+const struct file_operations xen_privcmd_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = privcmd_ioctl,
+ .mmap = privcmd_mmap,
+};
+EXPORT_SYMBOL_GPL(xen_privcmd_fops);
+
+static struct miscdevice privcmd_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/privcmd",
+ .fops = &xen_privcmd_fops,
+};
+
+static int __init privcmd_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ err = misc_register(&privcmd_dev);
+ if (err != 0) {
+ printk(KERN_ERR "Could not register Xen privcmd device\n");
+ return err;
+ }
+ return 0;
+}
+
+static void __exit privcmd_exit(void)
+{
+ misc_deregister(&privcmd_dev);
+}
+
+module_init(privcmd_init);
+module_exit(privcmd_exit);
diff --git a/drivers/xen/privcmd.h b/drivers/xen/privcmd.h
new file mode 100644
index 0000000..14facae
--- /dev/null
+++ b/drivers/xen/privcmd.h
@@ -0,0 +1,3 @@
+#include <linux/fs.h>
+
+extern const struct file_operations xen_privcmd_fops;
diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
index 4fde944..5d45ff1 100644
--- a/drivers/xen/xenfs/Makefile
+++ b/drivers/xen/xenfs/Makefile
@@ -1,4 +1,4 @@
obj-$(CONFIG_XENFS) += xenfs.o
-xenfs-y = super.o xenbus.o privcmd.o
+xenfs-y = super.o xenbus.o
xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c
deleted file mode 100644
index dbd3b16..0000000
--- a/drivers/xen/xenfs/privcmd.c
+++ /dev/null
@@ -1,400 +0,0 @@
-/******************************************************************************
- * privcmd.c
- *
- * Interface to privileged domain-0 commands.
- *
- * Copyright (c) 2002-2004, K A Fraser, B Dragovic
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/uaccess.h>
-#include <linux/swap.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/seq_file.h>
-
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
-#include <asm/tlb.h>
-#include <asm/xen/hypervisor.h>
-#include <asm/xen/hypercall.h>
-
-#include <xen/xen.h>
-#include <xen/privcmd.h>
-#include <xen/interface/xen.h>
-#include <xen/features.h>
-#include <xen/page.h>
-#include <xen/xen-ops.h>
-
-#ifndef HAVE_ARCH_PRIVCMD_MMAP
-static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
-#endif
-
-static long privcmd_ioctl_hypercall(void __user *udata)
-{
- struct privcmd_hypercall hypercall;
- long ret;
-
- if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
- return -EFAULT;
-
- ret = privcmd_call(hypercall.op,
- hypercall.arg[0], hypercall.arg[1],
- hypercall.arg[2], hypercall.arg[3],
- hypercall.arg[4]);
-
- return ret;
-}
-
-static void free_page_list(struct list_head *pages)
-{
- struct page *p, *n;
-
- list_for_each_entry_safe(p, n, pages, lru)
- __free_page(p);
-
- INIT_LIST_HEAD(pages);
-}
-
-/*
- * Given an array of items in userspace, return a list of pages
- * containing the data. If copying fails, either because of memory
- * allocation failure or a problem reading user memory, return an
- * error code; its up to the caller to dispose of any partial list.
- */
-static int gather_array(struct list_head *pagelist,
- unsigned nelem, size_t size,
- void __user *data)
-{
- unsigned pageidx;
- void *pagedata;
- int ret;
-
- if (size > PAGE_SIZE)
- return 0;
-
- pageidx = PAGE_SIZE;
- pagedata = NULL; /* quiet, gcc */
- while (nelem--) {
- if (pageidx > PAGE_SIZE-size) {
- struct page *page = alloc_page(GFP_KERNEL);
-
- ret = -ENOMEM;
- if (page == NULL)
- goto fail;
-
- pagedata = page_address(page);
-
- list_add_tail(&page->lru, pagelist);
- pageidx = 0;
- }
-
- ret = -EFAULT;
- if (copy_from_user(pagedata + pageidx, data, size))
- goto fail;
-
- data += size;
- pageidx += size;
- }
-
- ret = 0;
-
-fail:
- return ret;
-}
-
-/*
- * Call function "fn" on each element of the array fragmented
- * over a list of pages.
- */
-static int traverse_pages(unsigned nelem, size_t size,
- struct list_head *pos,
- int (*fn)(void *data, void *state),
- void *state)
-{
- void *pagedata;
- unsigned pageidx;
- int ret = 0;
-
- BUG_ON(size > PAGE_SIZE);
-
- pageidx = PAGE_SIZE;
- pagedata = NULL; /* hush, gcc */
-
- while (nelem--) {
- if (pageidx > PAGE_SIZE-size) {
- struct page *page;
- pos = pos->next;
- page = list_entry(pos, struct page, lru);
- pagedata = page_address(page);
- pageidx = 0;
- }
-
- ret = (*fn)(pagedata + pageidx, state);
- if (ret)
- break;
- pageidx += size;
- }
-
- return ret;
-}
-
-struct mmap_mfn_state {
- unsigned long va;
- struct vm_area_struct *vma;
- domid_t domain;
-};
-
-static int mmap_mfn_range(void *data, void *state)
-{
- struct privcmd_mmap_entry *msg = data;
- struct mmap_mfn_state *st = state;
- struct vm_area_struct *vma = st->vma;
- int rc;
-
- /* Do not allow range to wrap the address space. */
- if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
- ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
- return -EINVAL;
-
- /* Range chunks must be contiguous in va space. */
- if ((msg->va != st->va) ||
- ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
- return -EINVAL;
-
- rc = xen_remap_domain_mfn_range(vma,
- msg->va & PAGE_MASK,
- msg->mfn, msg->npages,
- vma->vm_page_prot,
- st->domain);
- if (rc < 0)
- return rc;
-
- st->va += msg->npages << PAGE_SHIFT;
-
- return 0;
-}
-
-static long privcmd_ioctl_mmap(void __user *udata)
-{
- struct privcmd_mmap mmapcmd;
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- int rc;
- LIST_HEAD(pagelist);
- struct mmap_mfn_state state;
-
- if (!xen_initial_domain())
- return -EPERM;
-
- if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
- return -EFAULT;
-
- rc = gather_array(&pagelist,
- mmapcmd.num, sizeof(struct privcmd_mmap_entry),
- mmapcmd.entry);
-
- if (rc || list_empty(&pagelist))
- goto out;
-
- down_write(&mm->mmap_sem);
-
- {
- struct page *page = list_first_entry(&pagelist,
- struct page, lru);
- struct privcmd_mmap_entry *msg = page_address(page);
-
- vma = find_vma(mm, msg->va);
- rc = -EINVAL;
-
- if (!vma || (msg->va != vma->vm_start) ||
- !privcmd_enforce_singleshot_mapping(vma))
- goto out_up;
- }
-
- state.va = vma->vm_start;
- state.vma = vma;
- state.domain = mmapcmd.dom;
-
- rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
- &pagelist,
- mmap_mfn_range, &state);
-
-
-out_up:
- up_write(&mm->mmap_sem);
-
-out:
- free_page_list(&pagelist);
-
- return rc;
-}
-
-struct mmap_batch_state {
- domid_t domain;
- unsigned long va;
- struct vm_area_struct *vma;
- int err;
-
- xen_pfn_t __user *user;
-};
-
-static int mmap_batch_fn(void *data, void *state)
-{
- xen_pfn_t *mfnp = data;
- struct mmap_batch_state *st = state;
-
- if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
- st->vma->vm_page_prot, st->domain) < 0) {
- *mfnp |= 0xf0000000U;
- st->err++;
- }
- st->va += PAGE_SIZE;
-
- return 0;
-}
-
-static int mmap_return_errors(void *data, void *state)
-{
- xen_pfn_t *mfnp = data;
- struct mmap_batch_state *st = state;
-
- return put_user(*mfnp, st->user++);
-}
-
-static struct vm_operations_struct privcmd_vm_ops;
-
-static long privcmd_ioctl_mmap_batch(void __user *udata)
-{
- int ret;
- struct privcmd_mmapbatch m;
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long nr_pages;
- LIST_HEAD(pagelist);
- struct mmap_batch_state state;
-
- if (!xen_initial_domain())
- return -EPERM;
-
- if (copy_from_user(&m, udata, sizeof(m)))
- return -EFAULT;
-
- nr_pages = m.num;
- if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
- return -EINVAL;
-
- ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
- m.arr);
-
- if (ret || list_empty(&pagelist))
- goto out;
-
- down_write(&mm->mmap_sem);
-
- vma = find_vma(mm, m.addr);
- ret = -EINVAL;
- if (!vma ||
- vma->vm_ops != &privcmd_vm_ops ||
- (m.addr != vma->vm_start) ||
- ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
- !privcmd_enforce_singleshot_mapping(vma)) {
- up_write(&mm->mmap_sem);
- goto out;
- }
-
- state.domain = m.dom;
- state.vma = vma;
- state.va = m.addr;
- state.err = 0;
-
- ret = traverse_pages(m.num, sizeof(xen_pfn_t),
- &pagelist, mmap_batch_fn, &state);
-
- up_write(&mm->mmap_sem);
-
- if (state.err > 0) {
- state.user = m.arr;
- ret = traverse_pages(m.num, sizeof(xen_pfn_t),
- &pagelist,
- mmap_return_errors, &state);
- }
-
-out:
- free_page_list(&pagelist);
-
- return ret;
-}
-
-static long privcmd_ioctl(struct file *file,
- unsigned int cmd, unsigned long data)
-{
- int ret = -ENOSYS;
- void __user *udata = (void __user *) data;
-
- switch (cmd) {
- case IOCTL_PRIVCMD_HYPERCALL:
- ret = privcmd_ioctl_hypercall(udata);
- break;
-
- case IOCTL_PRIVCMD_MMAP:
- ret = privcmd_ioctl_mmap(udata);
- break;
-
- case IOCTL_PRIVCMD_MMAPBATCH:
- ret = privcmd_ioctl_mmap_batch(udata);
- break;
-
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-#ifndef HAVE_ARCH_PRIVCMD_MMAP
-static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
- vma, vma->vm_start, vma->vm_end,
- vmf->pgoff, vmf->virtual_address);
-
- return VM_FAULT_SIGBUS;
-}
-
-static struct vm_operations_struct privcmd_vm_ops = {
- .fault = privcmd_fault
-};
-
-static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
-{
- /* Unsupported for auto-translate guests. */
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return -ENOSYS;
-
- /* DONTCOPY is essential for Xen because copy_page_range doesn't know
- * how to recreate these mappings */
- vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
- vma->vm_ops = &privcmd_vm_ops;
- vma->vm_private_data = NULL;
-
- return 0;
-}
-
-static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
-{
- return (xchg(&vma->vm_private_data, (void *)1) == NULL);
-}
-#endif
-
-const struct file_operations privcmd_file_ops = {
- .unlocked_ioctl = privcmd_ioctl,
- .mmap = privcmd_mmap,
-};
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index 1aa3897..a55fbf9 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -16,6 +16,7 @@
#include <xen/xen.h>
#include "xenfs.h"
+#include "../privcmd.h"
#include <asm/xen/hypervisor.h>
@@ -84,7 +85,7 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
[1] = {},
{ "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
{ "capabilities", &capabilities_file_ops, S_IRUGO },
- { "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR },
+ { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
{""},
};
int rc;
diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
index b68aa62..5056306 100644
--- a/drivers/xen/xenfs/xenfs.h
+++ b/drivers/xen/xenfs/xenfs.h
@@ -2,7 +2,6 @@
#define _XENFS_XENBUS_H
extern const struct file_operations xenbus_file_ops;
-extern const struct file_operations privcmd_file_ops;
extern const struct file_operations xsd_kva_file_ops;
extern const struct file_operations xsd_port_file_ops;
--
1.7.6.4

@ -1,177 +0,0 @@
From 8fd04efb7e41da12d85ad382b7c7092fe832bebb Mon Sep 17 00:00:00 2001
From: Tang Liang <liang.tang@oracle.com>
Date: Fri, 9 Dec 2011 10:05:54 +0800
Subject: x86, acpi, tboot: Have a ACPI os prepare sleep instead of calling
tboot_sleep.
The ACPI suspend path makes a call to tboot_sleep right before
it writes the PM1A, PM1B values. We replace the direct call to
tboot via a registration callback similar to __acpi_register_gsi.
CC: Thomas Gleixner <tglx@linutronix.de>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: x86@kernel.org
CC: Len Brown <len.brown@intel.com>
Acked-by: Joseph Cihula <joseph.cihula@intel.com>
CC: Shane Wang <shane.wang@intel.com>
CC: xen-devel@lists.xensource.com
CC: linux-pm@lists.linux-foundation.org
CC: tboot-devel@lists.sourceforge.net
CC: linux-acpi@vger.kernel.org
[v1: Added __attribute__ ((unused))]
[v2: Introduced a wrapper instead of changing tboot_sleep return values]
[v3: Added return value AE_CTRL_SKIP for acpi_os_sleep_prepare]
Signed-off-by: Tang Liang <liang.tang@oracle.com>
[v1: Fix compile issues on IA64 and PPC64]
[v2: Fix where __acpi_os_prepare_sleep==NULL and did not go in sleep properly]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
arch/x86/kernel/tboot.c | 8 ++++++++
drivers/acpi/acpica/hwsleep.c | 10 +++++++---
drivers/acpi/osl.c | 24 ++++++++++++++++++++++++
include/acpi/acexcep.h | 1 +
include/linux/acpi.h | 10 ++++++++++
include/linux/tboot.h | 1 -
6 files changed, 50 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index e2410e2..1a4ab7d 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -297,6 +297,12 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
tboot_shutdown(acpi_shutdown_map[sleep_state]);
}
+static int tboot_sleep_wrapper(u8 sleep_state, u32 pm1a_control,
+ u32 pm1b_control)
+{
+ tboot_sleep(sleep_state, pm1a_control, pm1b_control);
+ return 0;
+}
static atomic_t ap_wfs_count;
@@ -345,6 +351,8 @@ static __init int tboot_late_init(void)
atomic_set(&ap_wfs_count, 0);
register_hotcpu_notifier(&tboot_cpu_notifier);
+
+ acpi_os_set_prepare_sleep(&tboot_sleep_wrapper);
return 0;
}
diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c
index d52da30..992359a 100644
--- a/drivers/acpi/acpica/hwsleep.c
+++ b/drivers/acpi/acpica/hwsleep.c
@@ -43,9 +43,9 @@
*/
#include <acpi/acpi.h>
+#include <linux/acpi.h>
#include "accommon.h"
#include "actables.h"
-#include <linux/tboot.h>
#include <linux/module.h>
#define _COMPONENT ACPI_HARDWARE
@@ -344,8 +344,12 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state)
ACPI_FLUSH_CPU_CACHE();
- tboot_sleep(sleep_state, pm1a_control, pm1b_control);
-
+ status = acpi_os_prepare_sleep(sleep_state, pm1a_control,
+ pm1b_control);
+ if (ACPI_SKIP(status))
+ return_ACPI_STATUS(AE_OK);
+ if (ACPI_FAILURE(status))
+ return_ACPI_STATUS(status);
/* Write #2: Write both SLP_TYP + SLP_EN */
status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control);
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index f31c5c5..f3aae4b 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -76,6 +76,9 @@ EXPORT_SYMBOL(acpi_in_debugger);
extern char line_buf[80];
#endif /*ENABLE_DEBUGGER */
+static int (*__acpi_os_prepare_sleep)(u8 sleep_state, u32 pm1a_ctrl,
+ u32 pm1b_ctrl);
+
static acpi_osd_handler acpi_irq_handler;
static void *acpi_irq_context;
static struct workqueue_struct *kacpid_wq;
@@ -1659,3 +1662,24 @@ acpi_status acpi_os_terminate(void)
return AE_OK;
}
+
+acpi_status acpi_os_prepare_sleep(u8 sleep_state, u32 pm1a_control,
+ u32 pm1b_control)
+{
+ int rc = 0;
+ if (__acpi_os_prepare_sleep)
+ rc = __acpi_os_prepare_sleep(sleep_state,
+ pm1a_control, pm1b_control);
+ if (rc < 0)
+ return AE_ERROR;
+ else if (rc > 0)
+ return AE_CTRL_SKIP;
+
+ return AE_OK;
+}
+
+void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state,
+ u32 pm1a_ctrl, u32 pm1b_ctrl))
+{
+ __acpi_os_prepare_sleep = func;
+}
diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h
index 5b6c391..fa0d22c 100644
--- a/include/acpi/acexcep.h
+++ b/include/acpi/acexcep.h
@@ -57,6 +57,7 @@
#define ACPI_SUCCESS(a) (!(a))
#define ACPI_FAILURE(a) (a)
+#define ACPI_SKIP(a) ((a) == AE_CTRL_SKIP) /* arg parenthesised so expressions compare as a whole */
#define AE_OK (acpi_status) 0x0000
/*
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 627a3a4..9393f73 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -363,4 +363,14 @@ static inline int suspend_nvs_register(unsigned long a, unsigned long b)
}
#endif
+#ifdef CONFIG_ACPI
+void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state,
+ u32 pm1a_ctrl, u32 pm1b_ctrl));
+
+acpi_status acpi_os_prepare_sleep(u8 sleep_state,
+ u32 pm1a_control, u32 pm1b_control);
+#else
+#define acpi_os_set_prepare_sleep(func) do { } while (0) /* no-op stub; same single-argument arity as the CONFIG_ACPI prototype */
+#endif
+
#endif /*_LINUX_ACPI_H*/
diff --git a/include/linux/tboot.h b/include/linux/tboot.h
index 1dba6ee..c75128b 100644
--- a/include/linux/tboot.h
+++ b/include/linux/tboot.h
@@ -143,7 +143,6 @@ static inline int tboot_enabled(void)
extern void tboot_probe(void);
extern void tboot_shutdown(u32 shutdown_type);
-extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control);
extern struct acpi_table_header *tboot_get_dmar_table(
struct acpi_table_header *dmar_tbl);
extern int tboot_force_iommu(void);
--
1.7.6.4

@ -1,67 +0,0 @@
From 6f327383cd7ebef1fcc092e2d759ceb9d90dfb36 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 8 Dec 2011 17:14:08 +0800
Subject: tboot: Add return values for tboot_sleep
.. as appropriate. As tboot_sleep now returns values,
remove tboot_sleep_wrapper.
Suggested-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Acked-by: Joseph Cihula <joseph.cihula@intel.com>
[v1: Return -1/0/+1 instead of ACPI_xx values]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
arch/x86/kernel/tboot.c | 13 ++++---------
1 files changed, 4 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 1a4ab7d..6410744 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -272,7 +272,7 @@ static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
offsetof(struct acpi_table_facs, firmware_waking_vector);
}
-void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
{
static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
/* S0,1,2: */ -1, -1, -1,
@@ -281,7 +281,7 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
/* S5: */ TB_SHUTDOWN_S5 };
if (!tboot_enabled())
- return;
+ return 0;
tboot_copy_fadt(&acpi_gbl_FADT);
tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
@@ -292,15 +292,10 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
if (sleep_state >= ACPI_S_STATE_COUNT ||
acpi_shutdown_map[sleep_state] == -1) {
pr_warning("unsupported sleep state 0x%x\n", sleep_state);
- return;
+ return -1;
}
tboot_shutdown(acpi_shutdown_map[sleep_state]);
-}
-static int tboot_sleep_wrapper(u8 sleep_state, u32 pm1a_control,
- u32 pm1b_control)
-{
- tboot_sleep(sleep_state, pm1a_control, pm1b_control);
return 0;
}
@@ -352,7 +347,7 @@ static __init int tboot_late_init(void)
atomic_set(&ap_wfs_count, 0);
register_hotcpu_notifier(&tboot_cpu_notifier);
- acpi_os_set_prepare_sleep(&tboot_sleep_wrapper);
+ acpi_os_set_prepare_sleep(&tboot_sleep);
return 0;
}
--
1.7.6.4

@ -1,197 +0,0 @@
From 9b10575276a220543b8791f2cb8268fbd4a0bc2e Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 8 Dec 2011 17:32:23 +0800
Subject: xen/acpi/sleep: Enable ACPI sleep via the __acpi_os_prepare_sleep
Provide the registration callback to call in the Xen's
ACPI sleep functionality. This means that during S3/S5
we make a hypercall XENPF_enter_acpi_sleep with the
proper PM1A/PM1B registers.
Based on Ke Yu's <ke.yu@intel.com> initial idea.
[ From http://xenbits.xensource.com/linux-2.6.18-xen.hg
change c68699484a65 ]
[v1: Added Copyright and license]
[v2: Added check if PM1A/B the 16-bits MSB contain something. The spec
only uses 16-bits but might have more in future]
Signed-off-by: Liang Tang <liang.tang@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
arch/x86/xen/enlighten.c | 3 ++
drivers/xen/Makefile | 2 +-
drivers/xen/acpi.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++
include/xen/acpi.h | 58 +++++++++++++++++++++++++++++++++++++++++++
4 files changed, 124 insertions(+), 1 deletions(-)
create mode 100644 drivers/xen/acpi.c
create mode 100644 include/xen/acpi.h
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 12eb07b..a5277c2 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -42,6 +42,7 @@
#include <xen/page.h>
#include <xen/hvm.h>
#include <xen/hvc-console.h>
+#include <xen/acpi.h>
#include <asm/paravirt.h>
#include <asm/apic.h>
@@ -1275,6 +1276,8 @@ asmlinkage void __init xen_start_kernel(void)
/* Make sure ACS will be enabled */
pci_request_acs();
+
+ xen_acpi_sleep_register();
}
#ifdef CONFIG_PCI
/* PCI BIOS service won't work from a PV guest. */
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index aa31337..77a845f 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
obj-$(CONFIG_XEN_PVHVM) += platform-pci.o
obj-$(CONFIG_XEN_TMEM) += tmem.o
obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
-obj-$(CONFIG_XEN_DOM0) += pci.o
+obj-$(CONFIG_XEN_DOM0) += pci.o acpi.o
obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/
obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o
diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c
new file mode 100644
index 0000000..119d42a
--- /dev/null
+++ b/drivers/xen/acpi.c
@@ -0,0 +1,62 @@
+/******************************************************************************
+ * acpi.c
+ * acpi file for domain 0 kernel
+ *
+ * Copyright (c) 2011 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ * Copyright (c) 2011 Yu Ke <ke.yu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <xen/acpi.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+/*
+ * Ask the Xen hypervisor to perform an ACPI sleep transition by issuing the
+ * XENPF_enter_acpi_sleep platform op.
+ *
+ * @sleep_state: ACPI sleep state, forwarded unchanged.
+ * @pm1a_cnt: PM1A control value; must fit in 16 bits.
+ * @pm1b_cnt: PM1B control value; must fit in 16 bits.
+ *
+ * Returns -1 (after a WARN) when either control value has bits set above
+ * bit 15, and 1 once the hypercall has been issued.
+ */
+int xen_acpi_notify_hypervisor_state(u8 sleep_state,
+				     u32 pm1a_cnt, u32 pm1b_cnt)
+{
+	struct xen_platform_op op = {
+		.cmd = XENPF_enter_acpi_sleep,
+		.interface_version = XENPF_INTERFACE_VERSION,
+		.u = {
+			.enter_acpi_sleep = {
+				.pm1a_cnt_val = (u16)pm1a_cnt,
+				.pm1b_cnt_val = (u16)pm1b_cnt,
+				.sleep_state = sleep_state,
+			},
+		},
+	};
+
+	/* Only 16 bits are passed on; refuse rather than silently truncate. */
+	if ((pm1a_cnt & 0xffff0000) || (pm1b_cnt & 0xffff0000)) {
+		WARN(1, "Using more than 16bits of PM1A/B 0x%x/0x%x! "
+		     "Email xen-devel@lists.xensource.com Thank you.\n",
+		     pm1a_cnt, pm1b_cnt);
+		return -1;
+	}
+
+	/* NOTE(review): the hypercall's return value is discarded here. */
+	HYPERVISOR_dom0_op(&op);
+	return 1;
+}
diff --git a/include/xen/acpi.h b/include/xen/acpi.h
new file mode 100644
index 0000000..48a9c01
--- /dev/null
+++ b/include/xen/acpi.h
@@ -0,0 +1,58 @@
+/******************************************************************************
+ * acpi.h
+ * acpi file for domain 0 kernel
+ *
+ * Copyright (c) 2011 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ * Copyright (c) 2011 Yu Ke <ke.yu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _XEN_ACPI_H
+#define _XEN_ACPI_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_XEN_DOM0
+#include <asm/xen/hypervisor.h>
+#include <xen/xen.h>
+#include <linux/acpi.h>
+
+/*
+ * Forward an ACPI sleep request (sleep state plus PM1A/PM1B control values)
+ * to the hypervisor.
+ */
+int xen_acpi_notify_hypervisor_state(u8 sleep_state,
+				     u32 pm1a_cnt, u32 pm1b_cnt);
+
+/*
+ * In the initial domain, install xen_acpi_notify_hypervisor_state() as the
+ * ACPI prepare-sleep callback; in any other domain do nothing.
+ */
+static inline void xen_acpi_sleep_register(void)
+{
+	if (xen_initial_domain())
+		acpi_os_set_prepare_sleep(
+			&xen_acpi_notify_hypervisor_state);
+}
+#else
+/* Without CONFIG_XEN_DOM0 there is nothing to register. */
+static inline void xen_acpi_sleep_register(void)
+{
+}
+#endif
+
+#endif /* _XEN_ACPI_H */
--
1.7.6.4

@ -1,81 +0,0 @@
From 86ceafdf50d67bcb2a5196122797a6972bedd279 Mon Sep 17 00:00:00 2001
From: Tang Liang <liang.tang@oracle.com>
Date: Thu, 8 Dec 2011 17:36:39 +0800
Subject: xen: Utilize the restore_msi_irqs hook.
to make a hypercall to restore the vectors in the MSI/MSI-X
configuration space.
Signed-off-by: Tang Liang <liang.tang@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
arch/x86/pci/xen.c | 27 +++++++++++++++++++++++++++
include/xen/interface/physdev.h | 7 +++++++
2 files changed, 34 insertions(+), 0 deletions(-)
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 492ade8..249a5ae 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -324,6 +324,32 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
out:
return ret;
}
+
+/*
+ * Ask the hypervisor to restore the MSI/MSI-X state of @dev.
+ *
+ * The segment-aware PHYSDEVOP_restore_msi_ext call is tried first while
+ * pci_seg_supported is set. If the hypervisor answers -ENOSYS the flag is
+ * cleared and the older PHYSDEVOP_restore_msi call (bus/devfn only) is used
+ * instead. Any other non-zero result is reported with a WARN. @irq is unused.
+ */
+static void xen_initdom_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+	struct physdev_restore_msi legacy_args;
+	int rc;
+
+	if (pci_seg_supported) {
+		struct physdev_pci_device ext_args;
+
+		ext_args.seg = pci_domain_nr(dev->bus);
+		ext_args.bus = dev->bus->number;
+		ext_args.devfn = dev->devfn;
+		rc = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi_ext,
+					   &ext_args);
+		if (rc == -ENOSYS)
+			pci_seg_supported = false;
+		WARN(rc && rc != -ENOSYS, "restore_msi_ext -> %d\n", rc);
+	}
+
+	/* Extended call is (still) supported: nothing more to do. */
+	if (pci_seg_supported)
+		return;
+
+	legacy_args.bus = dev->bus->number;
+	legacy_args.devfn = dev->devfn;
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &legacy_args);
+	WARN(rc && rc != -ENOSYS, "restore_msi -> %d\n", rc);
+}
#endif
static void xen_teardown_msi_irqs(struct pci_dev *dev)
@@ -446,6 +472,7 @@ int __init pci_xen_initial_domain(void)
#ifdef CONFIG_PCI_MSI
x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs;
x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
+ x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs;
#endif
xen_setup_acpi_sci();
__acpi_register_gsi = acpi_register_gsi_xen;
diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h
index c1080d9..0c28989 100644
--- a/include/xen/interface/physdev.h
+++ b/include/xen/interface/physdev.h
@@ -145,6 +145,13 @@ struct physdev_manage_pci {
uint8_t devfn;
};
+/* Physdev op that restores a device's MSI state; takes a struct physdev_restore_msi. */
+#define PHYSDEVOP_restore_msi 19
+struct physdev_restore_msi {
+ /* IN */
+ /* PCI bus number of the device */
+ uint8_t bus;
+ /* PCI devfn of the device */
+ uint8_t devfn;
+};
+
#define PHYSDEVOP_manage_pci_add_ext 20
struct physdev_manage_pci_ext {
/* IN */
--
1.7.6.4

@ -1,31 +0,0 @@
From cfb37553f53f993c22aad05c219581dfbc726bcc Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 23 Jan 2012 10:53:57 -0500
Subject: xen/setup/pm/acpi: Remove the call to boot_option_idle_override.
We needed that call in the past to force the kernel to use
default_idle (which called safe_halt, which called xen_safe_halt).
But set_pm_idle_to_default() does that now, so there is no need
to use this boot option operand.
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
arch/x86/xen/setup.c | 1 -
1 files changed, 0 insertions(+), 1 deletions(-)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index e03c636..1236623 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -420,7 +420,6 @@ void __init xen_arch_setup(void)
boot_cpu_data.hlt_works_ok = 1;
#endif
disable_cpuidle();
- boot_option_idle_override = IDLE_HALT;
WARN_ON(set_pm_idle_to_default());
fiddle_vdso();
}
--
1.7.6.4

@ -1,216 +0,0 @@
From d281ee8c6d58a7f5d1f4241238daa315fb959e31 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 13 Feb 2012 22:26:32 -0500
Subject: xen/enlighten: Expose MWAIT and MWAIT_LEAF if hypervisor OKs it.
For the hypervisor to take advantage of the MWAIT support it needs
to extract from the ACPI _CST the register address. But the
hypervisor does not have the support to parse DSDT so it relies on
the initial domain (dom0) to parse the ACPI Power Management information
and push it up to the hypervisor. The pushing of the data is done
by the processor_harvest_xen module which parses the information that
the ACPI parser has graciously exposed in 'struct acpi_processor'.
For the ACPI parser to also expose the Cx states for MWAIT, we need
to expose the MWAIT capability (leaf 1). Furthermore we also need to
expose the MWAIT_LEAF capability (leaf 5) for cstate.c to properly
function.
The hypervisor could expose these flags when it traps the XEN_EMULATE_PREFIX
operations, but it can't do it since it needs to be backwards compatible.
Instead we choose to use the native CPUID to figure out if the MWAIT
capability exists and use the XEN_SET_PDC query hypercall to figure out
if the hypervisor wants us to expose the MWAIT_LEAF capability or not.
Note: The XEN_SET_PDC query was implemented in c/s 23783:
"ACPI: add _PDC input override mechanism".
With this in place, instead of
C3 ACPI IOPORT 415
we get now
C3:ACPI FFH INTEL MWAIT 0x20
Note: The cpu_idle which would be calling the mwait variants for idling
never gets set b/c we set the default pm_idle to be the hypercall variant.
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
arch/x86/xen/enlighten.c | 92 +++++++++++++++++++++++++++++++++++++-
include/xen/interface/platform.h | 4 +-
2 files changed, 94 insertions(+), 2 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 12eb07b..4c82936 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -62,6 +62,14 @@
#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
#include <asm/pci_x86.h>
+#include <asm/mwait.h>
+
+#ifdef CONFIG_ACPI
+#include <asm/acpi.h>
+#include <acpi/pdc_intel.h>
+#include <acpi/processor.h>
+#include <xen/interface/platform.h>
+#endif
#include "xen-ops.h"
#include "mmu.h"
@@ -200,13 +208,17 @@ static void __init xen_banner(void)
static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
+static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask;
+static __read_mostly unsigned int cpuid_leaf5_ecx_val;
+static __read_mostly unsigned int cpuid_leaf5_edx_val;
+
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
{
unsigned maskebx = ~0;
unsigned maskecx = ~0;
unsigned maskedx = ~0;
-
+ unsigned setecx = 0;
/*
* Mask out inconvenient features, to try and disable as many
* unsupported kernel subsystems as possible.
@@ -214,9 +226,18 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
switch (*ax) {
case 1:
maskecx = cpuid_leaf1_ecx_mask;
+ setecx = cpuid_leaf1_ecx_set_mask;
maskedx = cpuid_leaf1_edx_mask;
break;
+ case CPUID_MWAIT_LEAF:
+ /* Synthesize the values.. */
+ *ax = 0;
+ *bx = 0;
+ *cx = cpuid_leaf5_ecx_val;
+ *dx = cpuid_leaf5_edx_val;
+ return;
+
case CPUID_THERM_POWER_LEAF:
/* Disabling APERFMPERF for kernel usage */
maskecx = ~(1 << APERFMPERF_PRESENT);
@@ -232,9 +253,75 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
*bx &= maskebx;
*cx &= maskecx;
+ *cx |= setecx;
*dx &= maskedx;
+
}
+/*
+ * Decide whether the MWAIT CPUID feature bit may be exposed to this kernel
+ * and, if the hypervisor agrees, capture the native MWAIT leaf values that
+ * xen_cpuid() hands out for CPUID_MWAIT_LEAF.
+ *
+ * Returns false when built without CONFIG_ACPI, when not running as the
+ * initial domain, or when the native CPUID leaf 1 ECX lacks either the EST
+ * or the MWAIT bit; returns true otherwise.
+ */
+static bool __init xen_check_mwait(void)
+{
+#ifdef CONFIG_ACPI
+	struct xen_platform_op op = {
+		.cmd = XENPF_set_processor_pminfo,
+		.u.set_pminfo.id = -1,
+		.u.set_pminfo.type = XEN_PM_PDC,
+	};
+	uint32_t buf[3];
+	unsigned int ax, bx, cx, dx;
+	unsigned int mwait_mask;
+
+	/* We need to determine whether it is OK to expose the MWAIT
+	 * capability to the kernel to harvest deeper than C3 states from ACPI
+	 * _CST using the processor_harvest_xen.c module. For this to work, we
+	 * need to gather the MWAIT_LEAF values (which the cstate.c code
+	 * checks against). The hypervisor won't expose the MWAIT flag because
+	 * it would break backwards compatibility; so we will find out directly
+	 * from the hardware and hypercall.
+	 */
+	if (!xen_initial_domain())
+		return false;
+
+	ax = 1;
+	cx = 0;
+
+	native_cpuid(&ax, &bx, &cx, &dx);
+
+	/* Both EST and MWAIT must be advertised by the real CPU. */
+	mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
+		     (1 << (X86_FEATURE_MWAIT % 32));
+
+	if ((cx & mwait_mask) != mwait_mask)
+		return false;
+
+	/* We need to emulate the MWAIT_LEAF and for that we need both
+	 * ecx and edx. The hypercall provides only partial information.
+	 */
+
+	ax = CPUID_MWAIT_LEAF;
+	bx = 0;
+	cx = 0;
+	dx = 0;
+
+	native_cpuid(&ax, &bx, &cx, &dx);
+
+	/* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
+	 * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
+	 */
+	buf[0] = ACPI_PDC_REVISION_ID;
+	buf[1] = 1;
+	buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
+
+	set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
+
+	/* Keep the native leaf values only if the hypervisor left an FFH bit set. */
+	if ((HYPERVISOR_dom0_op(&op) == 0) &&
+	    (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
+		cpuid_leaf5_ecx_val = cx;
+		cpuid_leaf5_edx_val = dx;
+	}
+	return true;
+#else
+	return false;
+#endif
+}
static void __init xen_init_cpuid_mask(void)
{
unsigned int ax, bx, cx, dx;
@@ -261,6 +348,9 @@ static void __init xen_init_cpuid_mask(void)
/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
if ((cx & xsave_mask) != xsave_mask)
cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
+
+ if (xen_check_mwait())
+ cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32));
}
static void xen_set_debugreg(int reg, unsigned long val)
diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h
index c168468..6220b98 100644
--- a/include/xen/interface/platform.h
+++ b/include/xen/interface/platform.h
@@ -200,7 +200,7 @@ DEFINE_GUEST_HANDLE_STRUCT(xenpf_getidletime_t);
#define XEN_PM_CX 0
#define XEN_PM_PX 1
#define XEN_PM_TX 2
-
+#define XEN_PM_PDC 3
/* Px sub info type */
#define XEN_PX_PCT 1
#define XEN_PX_PSS 2
@@ -286,6 +286,7 @@ struct xen_processor_performance {
};
DEFINE_GUEST_HANDLE_STRUCT(xen_processor_performance);
+DEFINE_GUEST_HANDLE(uint32_t);
struct xenpf_set_processor_pminfo {
/* IN variables */
uint32_t id; /* ACPI CPU ID */
@@ -293,6 +294,7 @@ struct xenpf_set_processor_pminfo {
union {
struct xen_processor_power power;/* Cx: _CST/_CSD */
struct xen_processor_performance perf; /* Px: _PPC/_PCT/_PSS/_PSD */
+ GUEST_HANDLE(uint32_t) pdc;
};
};
DEFINE_GUEST_HANDLE_STRUCT(xenpf_set_processor_pminfo);
--
1.7.6.4

@ -1,529 +0,0 @@
From 20e7a07fa0f8a0dbe30a0f732686d78849d29d96 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 3 Feb 2012 16:03:20 -0500
Subject: [CPUFREQ] xen: governor for Xen hypervisor frequency scaling.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This CPU freq governor leaves the frequency decision to the Xen hypervisor.
To do that the driver parses the Power Management data and uploads said
information to the Xen hypervisor. Then the Xen hypervisor can select the
proper Cx and Pxx states for the initial domain and all other domains.
To upload the information, this CPU frequency driver reads Power Management (PM)
(_Pxx and _Cx) which are populated in the 'struct acpi_processor' structure.
It simply reads the contents of that structure and passes it up to the Xen hypervisor.
For that to work we depend on the appropriate CPU frequency scaling driver
to do the heavy lifting - so that the contents are correct.
Once it has been loaded, the CPU frequency governor also sets up a timer
to check if the ACPI ID count is different from the APIC ID count - which
can happen if the user chooses to use the dom0_max_vcpus argument. In such a case
a backup of the PM structure is used and uploaded to the hypervisor.
[v1-v2: Initial RFC implementations that were posted]
[v3: Changed the name to passthru suggested by Pasi Kärkkäinen <pasik@iki.fi>]
[v4: Added vCPU != pCPU support - aka dom0_max_vcpus support]
[v5: Cleaned up the driver, fix bug under Athlon XP]
[v6: Changed the driver to a CPU frequency governor]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
drivers/xen/Kconfig | 15 ++
drivers/xen/Makefile | 2 +-
drivers/xen/cpufreq_xen.c | 445 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 461 insertions(+), 1 deletions(-)
create mode 100644 drivers/xen/cpufreq_xen.c
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index a1ced52..28ba371 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -178,4 +178,19 @@ config XEN_PRIVCMD
depends on XEN
default m
+config CPU_FREQ_GOV_XEN
+ tristate "'xen' governor for hypervisor scaling"
+ depends on XEN && X86 && ACPI_PROCESSOR && CPU_FREQ
+ default m
+ help
+ This cpufreq governor leaves the frequency decision to the Xen hypervisor.
+
+ To do that the driver parses the Power Management data and uploads said
+ information to the Xen hypervisor. Then the Xen hypervisor can select the
+ proper Cx and Pxx states.
+
+ To compile this driver as a module, choose M here: the
+ module will be called cpufreq_xen. If you do not know what to choose,
+ select M here.
+
endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index aa31337..5802220 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -20,7 +20,7 @@ obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
obj-$(CONFIG_XEN_DOM0) += pci.o acpi.o
obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/
obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o
-
+obj-$(CONFIG_CPU_FREQ_GOV_XEN) += cpufreq_xen.o
xen-evtchn-y := evtchn.o
xen-gntdev-y := gntdev.o
xen-gntalloc-y := gntalloc.o
diff --git a/drivers/xen/cpufreq_xen.c b/drivers/xen/cpufreq_xen.c
new file mode 100644
index 0000000..1b709bf
--- /dev/null
+++ b/drivers/xen/cpufreq_xen.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright 2012 by Oracle Inc
+ * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ *
+ * This code borrows ideas from https://lkml.org/lkml/2011/11/30/249
+ * so many thanks go to Kevin Tian <kevin.tian@intel.com>
+ * and Yu Ke <ke.yu@intel.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/freezer.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <acpi/acpi_bus.h>
+#include <acpi/acpi_drivers.h>
+#include <acpi/processor.h>
+
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+
+/* Prefix used in this file's log messages. */
+#define DRV_NAME "cpufreq-xen"
+
+/* Module parameter "off": when non-zero, the upload hypercalls are skipped. */
+static int no_hypercall;
+MODULE_PARM_DESC(off, "Inhibit the hypercall.");
+module_param_named(off, no_hypercall, int, 0400);
+
+/*
+ * Mutex protecting acpi_ids_done.
+ */
+static DEFINE_MUTEX(acpi_ids_mutex);
+/*
+ * Do not convert this to cpumask_var_t or use the cpumask helpers - those
+ * shrink to nr_cpu_bits (which depends on the possible CPUs), which can be
+ * less than what we want to put in.
+ *
+ * Note that MAX_ACPI_BITS is the number of unsigned longs needed to hold
+ * NR_ACPI_CPUS bits; despite its name it is not a bit count.
+ */
+#define NR_ACPI_CPUS NR_CPUS
+#define MAX_ACPI_BITS (BITS_TO_LONGS(NR_ACPI_CPUS))
+/* Bitmap of ACPI IDs whose PM data upload has already been attempted. */
+static unsigned long *acpi_ids_done;
+/*
+ * Again, do not convert to cpumask - we are reading the raw ACPI CPU IDs,
+ * which can go beyond what we presently see.
+ */
+static unsigned long *acpi_id_present;
+
+/*
+ * Pertinent data for the delayed work that checks whether the set of
+ * ACPI CPU IDs differs from the set we have already processed.
+ */
+#define DELAY_TIMER msecs_to_jiffies(5000 /* 5 sec */)
+/* Copy of the first struct acpi_processor seen by the governor. */
+static struct acpi_processor *pr_backup;
+static struct delayed_work work;
+
+/*
+ * Upload the valid C-states of @_pr to the hypervisor with a
+ * XENPF_set_processor_pminfo / XEN_PM_CX platform op.
+ *
+ * Returns -ENOMEM if the temporary state array cannot be allocated, -EINVAL
+ * if no state is marked valid, otherwise the hypercall's result (0 when the
+ * "off" module parameter suppresses the hypercall).
+ */
+static int push_cxx_to_hypervisor(struct acpi_processor *_pr)
+{
+ struct xen_platform_op op = {
+ .cmd = XENPF_set_processor_pminfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.set_pminfo.id = _pr->acpi_id,
+ .u.set_pminfo.type = XEN_PM_CX,
+ };
+ struct xen_processor_cx *dst_cx, *dst_cx_states = NULL;
+ struct acpi_processor_cx *cx;
+ int i, ok, ret = 0;
+
+ dst_cx_states = kcalloc(_pr->power.count,
+ sizeof(struct xen_processor_cx), GFP_KERNEL);
+ if (!dst_cx_states)
+ return -ENOMEM;
+
+ /* The walk covers states[1..count]; 'ok' counts the valid entries copied. */
+ for (ok = 0, i = 1; i <= _pr->power.count; i++) {
+ cx = &_pr->power.states[i];
+ if (!cx->valid)
+ continue;
+
+ dst_cx = &(dst_cx_states[ok++]);
+
+ /* Start from the system-I/O form; replaced below for other entry methods. */
+ dst_cx->reg.space_id = ACPI_ADR_SPACE_SYSTEM_IO;
+ if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) {
+ dst_cx->reg.bit_width = 8;
+ dst_cx->reg.bit_offset = 0;
+ dst_cx->reg.access_size = 1;
+ } else {
+ dst_cx->reg.space_id = ACPI_ADR_SPACE_FIXED_HARDWARE;
+ if (cx->entry_method == ACPI_CSTATE_FFH) {
+ /* NATIVE_CSTATE_BEYOND_HALT */
+ dst_cx->reg.bit_offset = 2;
+ dst_cx->reg.bit_width = 1; /* VENDOR_INTEL */
+ }
+ dst_cx->reg.access_size = 0;
+ }
+ dst_cx->reg.address = cx->address;
+
+ dst_cx->type = cx->type;
+ dst_cx->latency = cx->latency;
+ dst_cx->power = cx->power;
+
+ /* No dependency list is passed along. */
+ dst_cx->dpcnt = 0;
+ set_xen_guest_handle(dst_cx->dp, NULL);
+#ifdef DEBUG
+ pr_debug(DRV_NAME ": CX: ID:%d [C%d:%s] entry:%d\n",
+ _pr->acpi_id, cx->type, cx->desc, cx->entry_method);
+#endif
+ }
+ if (!ok) {
+ pr_err(DRV_NAME ": No _Cx for CPU %d\n", _pr->acpi_id);
+ kfree(dst_cx_states);
+ return -EINVAL;
+ }
+ op.u.set_pminfo.power.count = ok;
+ op.u.set_pminfo.power.flags.bm_control = _pr->flags.bm_control;
+ op.u.set_pminfo.power.flags.bm_check = _pr->flags.bm_check;
+ op.u.set_pminfo.power.flags.has_cst = _pr->flags.has_cst;
+ op.u.set_pminfo.power.flags.power_setup_done =
+ _pr->flags.power_setup_done;
+
+ set_xen_guest_handle(op.u.set_pminfo.power.states, dst_cx_states);
+
+ /* no_hypercall is the "off" module parameter. */
+ if (!no_hypercall)
+ ret = HYPERVISOR_dom0_op(&op);
+
+ if (ret)
+ pr_err(DRV_NAME "(CX): Hypervisor error (%d) for ACPI ID: %d\n",
+ ret, _pr->acpi_id);
+
+ kfree(dst_cx_states);
+
+ return ret;
+}
+/*
+ * Allocate a Xen-format copy of the performance states of @_pr and record
+ * the number of states in @dst_perf->state_count.
+ *
+ * Returns the newly allocated array (the caller frees it with kfree()) or
+ * ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+static struct xen_processor_px *
+xen_copy_pss_data(struct acpi_processor *_pr,
+		  struct xen_processor_performance *dst_perf)
+{
+	struct xen_processor_px *copy;
+	unsigned int idx;
+
+	/* The element-wise memcpy below relies on identical layouts. */
+	BUILD_BUG_ON(sizeof(struct xen_processor_px) !=
+		     sizeof(struct acpi_processor_px));
+
+	copy = kcalloc(_pr->performance->state_count,
+		       sizeof(struct xen_processor_px), GFP_KERNEL);
+	if (copy == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	dst_perf->state_count = _pr->performance->state_count;
+
+	for (idx = 0; idx < _pr->performance->state_count; idx++)
+		memcpy(&copy[idx], &_pr->performance->states[idx],
+		       sizeof(struct acpi_processor_px));
+
+	return copy;
+}
+/*
+ * Fill the shared_type and domain_info fields of @dst from @_pr.
+ *
+ * When the processor's shared_type is set, it is copied together with the
+ * domain info. When it is CPUFREQ_SHARED_TYPE_NONE, only AMD CPUs get a
+ * synthesized SW_ANY domain; any other vendor yields -EINVAL.
+ *
+ * Returns 0 on success, -EINVAL otherwise.
+ */
+static int xen_copy_psd_data(struct acpi_processor *_pr,
+			     struct xen_processor_performance *dst)
+{
+	BUILD_BUG_ON(sizeof(struct xen_psd_package) !=
+		     sizeof(struct acpi_psd_package));
+
+	if (_pr->performance->shared_type == CPUFREQ_SHARED_TYPE_NONE) {
+		if (cpu_data(0).x86_vendor != X86_VENDOR_AMD)
+			return -EINVAL;
+
+		/*
+		 * On AMD, powernow-k8 is loaded before acpi_cpufreq, so
+		 * acpi_processor_preregister_performance (which would parse
+		 * _PSD) never gets called. The only thing we need from _PSD
+		 * is whether it is HW_ALL or some other type. AMD K8 and
+		 * later are SW_ALL or SW_ANY, K7 and earlier HW_ANY. This
+		 * driver checks for K8 at start-up, so reaching this point
+		 * implies K8.
+		 */
+		dst->shared_type = CPUFREQ_SHARED_TYPE_ANY;
+		dst->domain_info.coord_type = DOMAIN_COORD_TYPE_SW_ANY;
+		dst->domain_info.num_processors = num_online_cpus();
+		return 0;
+	}
+
+	dst->shared_type = _pr->performance->shared_type;
+	memcpy(&dst->domain_info, &_pr->performance->domain_info,
+	       sizeof(struct acpi_psd_package));
+	return 0;
+}
+/*
+ * Copy an ACPI _PCT register description (@pct) into its Xen counterpart
+ * (@dst_pct), field by field. Always returns 0.
+ */
+static int xen_copy_pct_data(struct acpi_pct_register *pct,
+ struct xen_pct_register *dst_pct)
+{
+ /* It would be nice to simply memcpy() pct into dst_pct, but sadly
+ * the Xen structure does not have the proper padding, so the
+ * descriptor field takes two bytes (dst_pct) instead of one (pct).
+ */
+ dst_pct->descriptor = pct->descriptor;
+ dst_pct->length = pct->length;
+ dst_pct->space_id = pct->space_id;
+ dst_pct->bit_width = pct->bit_width;
+ dst_pct->bit_offset = pct->bit_offset;
+ dst_pct->reserved = pct->reserved;
+ dst_pct->address = pct->address;
+ return 0;
+}
+/*
+ * Upload the performance (P-state) data of @_pr to the hypervisor with a
+ * XENPF_set_processor_pminfo / XEN_PM_PX platform op.
+ *
+ * Each piece (platform limit, control/status registers, state table,
+ * domain info) sets its own XEN_PX_* flag; the state table and the domain
+ * info are left out when they cannot be produced.
+ *
+ * Returns the hypercall's result, or 0 when the "off" module parameter
+ * suppresses the hypercall.
+ */
+static int push_pxx_to_hypervisor(struct acpi_processor *_pr)
+{
+	struct xen_platform_op op = {
+		.cmd = XENPF_set_processor_pminfo,
+		.interface_version = XENPF_INTERFACE_VERSION,
+		.u.set_pminfo.id = _pr->acpi_id,
+		.u.set_pminfo.type = XEN_PM_PX,
+	};
+	struct xen_processor_performance *perf = &op.u.set_pminfo.perf;
+	struct xen_processor_px *pss;
+	int rc = 0;
+
+	/* Platform limit */
+	perf->platform_limit = _pr->performance_platform_limit;
+	perf->flags |= XEN_PX_PPC;
+
+	/* Control and status registers */
+	xen_copy_pct_data(&(_pr->performance->control_register),
+			  &perf->control_register);
+	xen_copy_pct_data(&(_pr->performance->status_register),
+			  &perf->status_register);
+	perf->flags |= XEN_PX_PCT;
+
+	/* State table; on failure carry on without it */
+	pss = xen_copy_pss_data(_pr, perf);
+	if (IS_ERR_OR_NULL(pss)) {
+		pss = NULL;
+	} else {
+		set_xen_guest_handle(perf->states, pss);
+		perf->flags |= XEN_PX_PSS;
+	}
+
+	/* Domain info */
+	if (xen_copy_psd_data(_pr, perf) == 0)
+		perf->flags |= XEN_PX_PSD;
+
+	if (!no_hypercall)
+		rc = HYPERVISOR_dom0_op(&op);
+
+	if (rc)
+		pr_err(DRV_NAME "(_PXX): Hypervisor error (%d) for ACPI ID %d\n",
+		       rc, _pr->acpi_id);
+
+	/* pss is NULL when no table was built; kfree(NULL) is a no-op. */
+	kfree(pss);
+
+	return rc;
+}
+/*
+ * Upload the C-state and P-state data of @_pr to the hypervisor, at most
+ * once per ACPI ID.
+ *
+ * The ACPI ID is marked in acpi_ids_done before anything is uploaded; both
+ * callers in this file hold acpi_ids_mutex around the call.
+ *
+ * Returns -EBUSY if the ACPI ID was already marked, otherwise 0 or the
+ * first error reported by the two upload steps. The P-state upload is
+ * still attempted when the C-state upload fails.
+ */
+static int upload_pm_data(struct acpi_processor *_pr)
+{
+	int err = 0;
+	int perf_err;
+
+	if (__test_and_set_bit(_pr->acpi_id, acpi_ids_done))
+		return -EBUSY;
+
+	if (_pr->flags.power)
+		err = push_cxx_to_hypervisor(_pr);
+
+	if (_pr->performance && _pr->performance->states) {
+		perf_err = push_pxx_to_hypervisor(_pr);
+		/* Report one real errno rather than OR-ing two together. */
+		if (!err)
+			err = perf_err;
+	}
+
+	return err;
+}
+/*
+ * ACPI namespace callback: work out the ACPI ID of @handle and set the
+ * matching bit in acpi_id_present.
+ *
+ * Processor objects supply the ID through their proc_id field, device
+ * objects through _UID; every other object type is ignored. IDs that do
+ * not fit in the NR_ACPI_CPUS-bit bitmap are reported once and skipped.
+ *
+ * Always returns AE_OK; @lvl, @context and @rv are unused.
+ */
+static acpi_status
+read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv)
+{
+	u32 acpi_id;
+	acpi_status status;
+	acpi_object_type acpi_type;
+	unsigned long long tmp;
+	union acpi_object object = { 0 };
+	struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
+
+	status = acpi_get_type(handle, &acpi_type);
+	if (ACPI_FAILURE(status))
+		return AE_OK;
+
+	switch (acpi_type) {
+	case ACPI_TYPE_PROCESSOR:
+		status = acpi_evaluate_object(handle, NULL, NULL, &buffer);
+		if (ACPI_FAILURE(status))
+			return AE_OK;
+		acpi_id = object.processor.proc_id;
+		break;
+	case ACPI_TYPE_DEVICE:
+		status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp);
+		if (ACPI_FAILURE(status))
+			return AE_OK;
+		acpi_id = tmp;
+		break;
+	default:
+		return AE_OK;
+	}
+	/* Valid bit indexes are 0 .. NR_ACPI_CPUS - 1. */
+	if (acpi_id >= NR_ACPI_CPUS) {
+		WARN_ONCE(1, "There are %u ACPI processors, but kernel can only do %d!\n",
+			  acpi_id, NR_ACPI_CPUS);
+		return AE_OK;
+	}
+	__set_bit(acpi_id, acpi_id_present);
+
+	return AE_OK;
+}
+/*
+ * Walk the ACPI namespace (processor objects and "ACPI0007" devices) to fill
+ * acpi_id_present, then compare it with acpi_ids_done.
+ *
+ * Returns 0 when the two bitmaps are equal, otherwise the number of ACPI
+ * IDs found present.
+ */
+static unsigned int more_acpi_ids(void)
+{
+	unsigned int n = 0;
+
+	acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
+			    ACPI_UINT32_MAX,
+			    read_acpi_id, NULL, NULL, NULL);
+	acpi_get_devices("ACPI0007", read_acpi_id, NULL, NULL);
+
+	/*
+	 * The bitmap helpers take a number of bits. MAX_ACPI_BITS is the
+	 * number of longs backing the bitmaps, so NR_ACPI_CPUS is the right
+	 * size to compare and count over.
+	 */
+	mutex_lock(&acpi_ids_mutex);
+	if (!bitmap_equal(acpi_id_present, acpi_ids_done, NR_ACPI_CPUS))
+		n = bitmap_weight(acpi_id_present, NR_ACPI_CPUS);
+	mutex_unlock(&acpi_ids_mutex);
+
+	return n;
+}
+/*
+ * Delayed-work handler: find ACPI IDs that were not covered through the
+ * governor callback and upload PM data for them using pr_backup.
+ *
+ * A fresh acpi_id_present bitmap is allocated for each run and released
+ * before returning, including when the work re-arms itself because
+ * pr_backup is not available yet.
+ */
+static void do_check_acpi_id_timer(struct work_struct *_work)
+{
+	/* All online CPUs have been processed at this stage. Now verify
+	 * whether in fact "online CPUs" == physical CPUs.
+	 */
+	/* kcalloc() returns zeroed memory, so no extra clearing is needed. */
+	acpi_id_present = kcalloc(MAX_ACPI_BITS, sizeof(unsigned long), GFP_KERNEL);
+	if (!acpi_id_present)
+		return;
+
+	if (more_acpi_ids()) {
+		int cpu;
+
+		if (!pr_backup) {
+			/* Try again later; the next run allocates a new bitmap. */
+			kfree(acpi_id_present);
+			acpi_id_present = NULL;
+			schedule_delayed_work(&work, DELAY_TIMER);
+			return;
+		}
+		/* Iterate over bits, not over the longs backing the bitmap. */
+		for_each_set_bit(cpu, acpi_id_present, NR_ACPI_CPUS) {
+			pr_backup->acpi_id = cpu;
+			mutex_lock(&acpi_ids_mutex);
+			(void)upload_pm_data(pr_backup);
+			mutex_unlock(&acpi_ids_mutex);
+		}
+	}
+	kfree(acpi_id_present);
+	acpi_id_present = NULL;
+}
+
+/*
+ * Governor callback. On CPUFREQ_GOV_START and CPUFREQ_GOV_LIMITS the policy
+ * is driven to its maximum frequency and the PM data of the policy's CPU is
+ * uploaded to the hypervisor; every other event is ignored.
+ *
+ * The first processor seen is also copied into pr_backup and the delayed
+ * ACPI ID check is armed. If that copy cannot be allocated the step is
+ * skipped and attempted again on the next START/LIMITS event.
+ *
+ * Always returns 0.
+ */
+static int cpufreq_governor_xen(struct cpufreq_policy *policy,
+				unsigned int event)
+{
+	struct acpi_processor *_pr;
+
+	switch (event) {
+	case CPUFREQ_GOV_START:
+	case CPUFREQ_GOV_LIMITS:
+		/* Set it to max and let the hypervisor take over */
+		__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
+
+		_pr = per_cpu(processors, policy->cpu /* APIC ID */);
+		if (!_pr)
+			break;
+
+		mutex_lock(&acpi_ids_mutex);
+		if (!pr_backup) {
+			pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
+			/* Only copy and arm the check when the allocation succeeded. */
+			if (pr_backup) {
+				memcpy(pr_backup, _pr, sizeof(struct acpi_processor));
+
+				INIT_DELAYED_WORK_DEFERRABLE(&work, do_check_acpi_id_timer);
+				schedule_delayed_work(&work, DELAY_TIMER);
+			}
+		}
+		(void)upload_pm_data(_pr);
+		mutex_unlock(&acpi_ids_mutex);
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+/* Descriptor of the "xen" cpufreq governor, registered at module init. */
+static struct cpufreq_governor cpufreq_gov_xen = {
+ .name = "xen",
+ .governor = cpufreq_governor_xen,
+ .owner = THIS_MODULE,
+};
+/*
+ * Check whether this governor can work on the current system.
+ *
+ * Requires the initial Xen domain and a non-zero FADT smi_command. Intel
+ * CPUs additionally need the EST feature; AMD CPUs need a non-zero
+ * MSR_PSTATE_CUR_LIMIT. Any other vendor is rejected.
+ *
+ * Returns 0 when usable, -ENODEV otherwise.
+ */
+static int __init check_prereq(void)
+{
+	struct cpuinfo_x86 *boot_cpu = &cpu_data(0);
+	u32 msr_hi = 0, msr_lo = 0;
+
+	if (!xen_initial_domain() || !acpi_gbl_FADT.smi_command)
+		return -ENODEV;
+
+	switch (boot_cpu->x86_vendor) {
+	case X86_VENDOR_INTEL:
+		return cpu_has(boot_cpu, X86_FEATURE_EST) ? 0 : -ENODEV;
+
+	case X86_VENDOR_AMD:
+		/* Copied from powernow-k8.h, can't include ../cpufreq/powernow
+		 * as we get compile warnings for the static functions.
+		 */
+#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */
+		rdmsr(MSR_PSTATE_CUR_LIMIT, msr_lo, msr_hi);
+
+		/* If the MSR cannot provide the data, the powernow-k8
+		 * won't process the data properly either.
+		 */
+		return (msr_hi || msr_lo) ? 0 : -ENODEV;
+
+	default:
+		return -ENODEV;
+	}
+}
+
+/*
+ * Module init: verify the prerequisites, allocate the acpi_ids_done bitmap
+ * and register the "xen" governor.
+ *
+ * Returns 0 on success, the check_prereq() error, -ENOMEM if the bitmap
+ * cannot be allocated, or the governor registration error (the bitmap is
+ * released again in that case).
+ */
+static int __init xen_processor_passthru_init(void)
+{
+	int rc = check_prereq();
+
+	if (rc)
+		return rc;
+
+	/* kcalloc() returns zeroed memory, so no extra clearing is needed. */
+	acpi_ids_done = kcalloc(MAX_ACPI_BITS, sizeof(unsigned long), GFP_KERNEL);
+	if (!acpi_ids_done)
+		return -ENOMEM;
+
+	rc = cpufreq_register_governor(&cpufreq_gov_xen);
+	if (rc) {
+		/* Do not leak the bitmap when registration fails. */
+		kfree(acpi_ids_done);
+		acpi_ids_done = NULL;
+	}
+	return rc;
+}
+/*
+ * Module exit: unregister the governor, stop the delayed ACPI ID check and
+ * free the module's allocations.
+ */
+static void __exit xen_processor_passthru_exit(void)
+{
+ cpufreq_unregister_governor(&cpufreq_gov_xen);
+ /* NOTE(review): 'work' is only initialised in cpufreq_governor_xen();
+ * confirm cancelling it is safe if the governor was never started.
+ */
+ cancel_delayed_work_sync(&work);
+ kfree(acpi_ids_done);
+ kfree(pr_backup);
+}
+
+MODULE_AUTHOR("Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>");
+MODULE_DESCRIPTION("CPUfreq policy governor 'xen' which uploads PM data to Xen hypervisor");
+MODULE_LICENSE("GPL");
+
+late_initcall(xen_processor_passthru_init);
+module_exit(xen_processor_passthru_exit);
--
1.7.6.4

@ -1,135 +0,0 @@
From 76ccc297018d25d55b789bbd508861ef1e2cdb0c Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 16 Dec 2011 17:38:18 -0500
Subject: x86/PCI: Expand the x86_msi_ops to have a restore MSIs.
The MSI restore function will become a function pointer in an
x86_msi_ops struct. It defaults to the implementation in the
io_apic.c and msi.c. We piggyback on the indirection mechanism
introduced by "x86: Introduce x86_msi_ops".
Cc: x86@kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-pci@vger.kernel.org
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
arch/x86/include/asm/pci.h | 9 +++++++++
arch/x86/include/asm/x86_init.h | 1 +
arch/x86/kernel/x86_init.c | 1 +
drivers/pci/msi.c | 29 +++++++++++++++++++++++++++--
4 files changed, 38 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d498943..df75d07 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -112,19 +112,28 @@ static inline void x86_teardown_msi_irq(unsigned int irq)
{
x86_msi.teardown_msi_irq(irq);
}
+static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+ x86_msi.restore_msi_irqs(dev, irq);
+}
#define arch_setup_msi_irqs x86_setup_msi_irqs
#define arch_teardown_msi_irqs x86_teardown_msi_irqs
#define arch_teardown_msi_irq x86_teardown_msi_irq
+#define arch_restore_msi_irqs x86_restore_msi_irqs
/* implemented in arch/x86/kernel/apic/io_apic. */
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
void native_teardown_msi_irq(unsigned int irq);
+void native_restore_msi_irqs(struct pci_dev *dev, int irq);
/* default to the implementation in drivers/lib/msi.c */
#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
+#define HAVE_DEFAULT_MSI_RESTORE_IRQS
void default_teardown_msi_irqs(struct pci_dev *dev);
+void default_restore_msi_irqs(struct pci_dev *dev, int irq);
#else
#define native_setup_msi_irqs NULL
#define native_teardown_msi_irq NULL
#define default_teardown_msi_irqs NULL
+#define default_restore_msi_irqs NULL
#endif
#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 1971e65..cd52084 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -177,6 +177,7 @@ struct x86_msi_ops {
int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
void (*teardown_msi_irq)(unsigned int irq);
void (*teardown_msi_irqs)(struct pci_dev *dev);
+ void (*restore_msi_irqs)(struct pci_dev *dev, int irq);
};
extern struct x86_init_ops x86_init;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c1d6cd5..83b05ad 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -114,4 +114,5 @@ struct x86_msi_ops x86_msi = {
.setup_msi_irqs = native_setup_msi_irqs,
.teardown_msi_irq = native_teardown_msi_irq,
.teardown_msi_irqs = default_teardown_msi_irqs,
+ .restore_msi_irqs = default_restore_msi_irqs,
};
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 82de95e..a825d78 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -86,6 +86,31 @@ void default_teardown_msi_irqs(struct pci_dev *dev)
}
#endif
+#ifndef arch_restore_msi_irqs
+# define arch_restore_msi_irqs default_restore_msi_irqs
+# define HAVE_DEFAULT_MSI_RESTORE_IRQS
+#endif
+
+#ifdef HAVE_DEFAULT_MSI_RESTORE_IRQS
+void default_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+ struct msi_desc *entry;
+
+ entry = NULL;
+ if (dev->msix_enabled) {
+ list_for_each_entry(entry, &dev->msi_list, list) {
+ if (irq == entry->irq)
+ break;
+ }
+ } else if (dev->msi_enabled) {
+ entry = irq_get_msi_desc(irq);
+ }
+
+ if (entry)
+ write_msi_msg(irq, &entry->msg);
+}
+#endif
+
static void msi_set_enable(struct pci_dev *dev, int pos, int enable)
{
u16 control;
@@ -372,7 +397,7 @@ static void __pci_restore_msi_state(struct pci_dev *dev)
pci_intx_for_msi(dev, 0);
msi_set_enable(dev, pos, 0);
- write_msi_msg(dev->irq, &entry->msg);
+ arch_restore_msi_irqs(dev, dev->irq);
pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
msi_mask_irq(entry, msi_capable_mask(control), entry->masked);
@@ -400,7 +425,7 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control);
list_for_each_entry(entry, &dev->msi_list, list) {
- write_msi_msg(entry->irq, &entry->msg);
+ arch_restore_msi_irqs(dev, entry->irq);
msix_mask_irq(entry, entry->masked);
}
--
1.7.6.4

File diff suppressed because it is too large Load Diff

@ -0,0 +1,72 @@
From 433928d3823f561919ead305194e46e5311b573d Mon Sep 17 00:00:00 2001
From: Marek Marczykowski <marmarek@invisiblethingslab.com>
Date: Sat, 23 Jun 2012 19:50:44 +0200
Subject: [PATCH 1/2] Revert "xen/pat: Disable PAT support for now."
Organization: Invisible Things Lab
This reverts commit 8eaffa67b43e99ae581622c5133e20b0f48bcef1.
We haven't observed the failure that the reverted commit works around, but that
commit caused horrible GPU performance. In any case, the "nopat" option is still available.
Signed-off-by: Marek Marczykowski <marmarek@invisiblethingslab.com>
---
arch/x86/xen/enlighten.c | 2 --
arch/x86/xen/mmu.c | 8 ++++----
2 files changed, 4 insertions(+), 6 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 6c7f1e8..bf3319c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1269,9 +1269,7 @@ asmlinkage void __init xen_start_kernel(void)
/* Prevent unwanted bits from being set in PTEs. */
__supported_pte_mask &= ~_PAGE_GLOBAL;
-#if 0
if (!xen_initial_domain())
-#endif
__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
__supported_pte_mask |= _PAGE_IOMAP;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 69f5857..a5d252a 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -420,13 +420,13 @@ static pteval_t iomap_pte(pteval_t val)
static pteval_t xen_pte_val(pte_t pte)
{
pteval_t pteval = pte.pte;
-#if 0
+
/* If this is a WC pte, convert back from Xen WC to Linux WC */
if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
WARN_ON(!pat_enabled);
pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
}
-#endif
+
if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
return pteval;
@@ -468,7 +468,7 @@ void xen_set_pat(u64 pat)
static pte_t xen_make_pte(pteval_t pte)
{
phys_addr_t addr = (pte & PTE_PFN_MASK);
-#if 0
+
/* If Linux is trying to set a WC pte, then map to the Xen WC.
* If _PAGE_PAT is set, then it probably means it is really
* _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
@@ -481,7 +481,7 @@ static pte_t xen_make_pte(pteval_t pte)
if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
}
-#endif
+
/*
* Unprivileged domains are allowed to do IOMAPpings for
* PCI passthrough, but not map ISA space. The ISA
--
1.7.4.4

@ -0,0 +1,196 @@
From f37a97dead89d07bce4d8fedc4c295c9bc700ab5 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 4 Nov 2011 11:59:34 -0400
Subject: [PATCH 2/2] x86/cpa: Use pte_attrs instead of pte_flags on
CPA/set_p.._wb/wc operations.
When using the paravirt interface, most of the page operations are wrapped
in the pvops interface. The one that is not is the pte_flags. The reason
being that for most cases, the "raw" PTE flag values for baremetal and whatever
pvops platform is running (Xen, in this case) share the same bit meaning.
Except for PAT. Under Linux, the PAT MSR is written to be:
PAT4 PAT0
+---+----+----+----+-----+----+----+
WC | WC | WB | UC | UC- | WC | WB | <= Linux
+---+----+----+----+-----+----+----+
WC | WT | WB | UC | UC- | WT | WB | <= BIOS
+---+----+----+----+-----+----+----+
WC | WP | WC | UC | UC- | WT | WB | <= Xen
+---+----+----+----+-----+----+----+
The lookup of this index table translates to looking up
Bit 7, Bit 4, and Bit 3 of PTE:
PAT/PSE (bit 7) ... PCD (bit 4) .. PWT (bit 3).
If all bits are off, then we are using PAT0. If bit 3 turned on,
then we are using PAT1, if bit 3 and bit 4, then PAT2..
Back to the PAT MSR table:
As you can see, the PAT1 translates to PAT4 under Xen. Under Linux
we only use PAT0, PAT1, and PAT2 for the caching as:
WB = none (so PAT0)
WC = PWT (bit 3 on)
UC = PWT | PCD (bit 3 and 4 are on).
But to make it work with Xen, we end up doing for WC a translation:
PWT (so bit 3 on) --> PAT (so bit 7 is on) and clear bit 3
And to translate back (when the paravirt pte_val is used) we would:
PAT (bit 7 on) --> PWT (bit 3 on) and clear bit 7.
This works quite well, except if code uses the pte_flags, as pte_flags
reads the raw value and does not go through the paravirt. Which means
that if (when running under Xen):
1) we allocate some pages.
2) call set_pages_array_wc, which ends up calling:
__page_change_att_set_clr(.., __pgprot(__PAGE_WC), /* set */
, __pgprot(__PAGE_MASK), /* clear */
which ends up reading the _raw_ PTE flags and _only_ look at the
_PTE_FLAG_MASK contents with __PAGE_MASK cleared (0x18) and
__PAGE_WC (0x8) set.
read raw *pte -> 0x67
*pte = 0x67 & ~0x18 | 0x8
*pte = 0x67 & 0xffffffe7 | 0x8
*pte = 0x6f
[now set_pte_atomic is called, and 0x6f is written in, but under
xen_make_pte, the bit 3 is translated to bit 7, so it ends up
writing 0xa7, which is correct]
3) do something to them.
4) call set_pages_array_wb
__page_change_att_set_clr(.., __pgprot(__PAGE_WB), /* set */
, __pgprot(__PAGE_MASK), /* clear */
which ends up reading the _raw_ PTE and _only_ look at the
_PTE_FLAG_MASK contents with __PAGE_MASK cleared (0x18) and
__PAGE_WB (0x0) set:
read raw *pte -> 0xa7
*pte = 0xa7 & ~0x18 | 0
*pte = 0xa7 & 0xffffffe7 | 0
*pte = 0xa7
[we check whether the old PTE is different from the new one
if (pte_val(old_pte) != pte_val(new_pte)) {
set_pte_atomic(kpte, new_pte);
...
and find out that 0xA7 == 0xA7 so we do not write the new PTE value in]
End result is that we failed at removing the WC caching bit!
5) free them.
[and have pages with PAT4 (bit 7) set, so other subsystems end up using
the pages that have the write combined bit set resulting in crashes. Yikes!].
The fix, which this patch proposes, is to wrap the pte_pgprot in the CPA
code with newly introduced pte_attrs which can go through the pvops interface
to get the "emulated" value instead of the raw. Naturally if CONFIG_PARAVIRT is
not set, it would end up calling native_pte_val.
The other way to fix this is by wrapping pte_flags and going through the pvops
interface, and it really is the Right Thing to do. The problem is that past
experience with mprotect stuff demonstrates that it can be really expensive in inner
loops, and pte_flags() is used in some very perf-critical areas.
Example code to run this and see the various mysterious subsystems/applications
crashing:
MODULE_AUTHOR("Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>");
MODULE_DESCRIPTION("wb_to_wc_and_back");
MODULE_LICENSE("GPL");
MODULE_VERSION(WB_TO_WC);
static int thread(void *arg)
{
struct page *a[MAX_PAGES];
unsigned int i, j;
do {
for (j = 0, i = 0;i < MAX_PAGES; i++, j++) {
a[i] = alloc_page(GFP_KERNEL);
if (!a[i])
break;
}
set_pages_array_wc(a, j);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout_interruptible(HZ);
for (i = 0; i < j; i++) {
unsigned long *addr = page_address(a[i]);
if (addr) {
memset(addr, 0xc2, PAGE_SIZE);
}
}
set_pages_array_wb(a, j);
for (i = 0; i< MAX_PAGES; i++) {
if (a[i])
__free_page(a[i]);
a[i] = NULL;
}
} while (!kthread_should_stop());
return 0;
}
static struct task_struct *t;
static int __init wb_to_wc_init(void)
{
t = kthread_run(thread, NULL, "wb_to_wc_and_back");
return 0;
}
static void __exit wb_to_wc_exit(void)
{
if (t)
kthread_stop(t);
}
module_init(wb_to_wc_init);
module_exit(wb_to_wc_exit);
This fixes RH BZ #742032, #787403, and #745574
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Tom Goetz <tom.goetz@virtualcomputer.com>
CC: stable@kernel.org
---
arch/x86/include/asm/pgtable.h | 5 +++++
arch/x86/mm/pageattr.c | 2 +-
2 files changed, 6 insertions(+), 1 deletions(-)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 49afb3f..fa7bd2c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -349,6 +349,11 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
return __pgprot(preservebits | addbits);
}
+static inline pgprot_t pte_attrs(pte_t pte)
+{
+ return __pgprot(pte_val(pte) & PTE_FLAGS_MASK);
+}
+
#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
#define canon_pgprot(p) __pgprot(massage_pgprot(p))
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e1ebde3..1ae1b4b 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -651,7 +651,7 @@ repeat:
if (level == PG_LEVEL_4K) {
pte_t new_pte;
- pgprot_t new_prot = pte_pgprot(old_pte);
+ pgprot_t new_prot = pte_attrs(old_pte);
unsigned long pfn = pte_pfn(old_pte);
pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
--
1.7.4.4

@ -1,9 +1,9 @@
--- linux-3.4.1.orig/drivers/block/xen-blkfront.c 2012-06-01 09:18:44.000000000 +0200
+++ linux-3.4.1/drivers/block/xen-blkfront.c 2012-07-15 15:54:31.350255623 +0200
@@ -44,6 +44,7 @@
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/scatterlist.h>
#include <linux/bitmap.h>
+#include <linux/fd.h>
#include <xen/xen.h>

@ -1,24 +0,0 @@
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 698b905..e31ebff 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1953,9 +1953,6 @@ static int __init netif_init(void)
if (!xen_domain())
return -ENODEV;
- if (xen_initial_domain())
- return 0;
-
printk(KERN_INFO "Initialising Xen virtual ethernet driver.\n");
return xenbus_register_frontend(&netfront_driver);
@@ -1965,9 +1962,6 @@ module_init(netif_init);
static void __exit netif_exit(void)
{
- if (xen_initial_domain())
- return;
-
xenbus_unregister_driver(&netfront_driver);
}
module_exit(netif_exit);

@ -1,42 +0,0 @@
From: Simon Graham <simon.graham@citrix.com>
To: Ian Campbell <Ian.Campbell@citrix.com>, "konrad.wilk@oracle.com"
<konrad.wilk@oracle.com>, "xen-devel@lists.xensource.com"
<xen-devel@lists.xensource.com>, "netdev@vger.kernel.org"
<netdev@vger.kernel.org>
Date: Thu, 24 May 2012 12:26:07 -0400
Cc: "bhutchings@solarflare.com" <bhutchings@solarflare.com>,
Simon Graham <simon.graham@citrix.com>,
"davem@davemloft.net" <davem@davemloft.net>,
"adnan.misherfi@oracle.com" <adnan.misherfi@oracle.com>
Subject: [Xen-devel] [PATCH] xen/netback: Calculate the number of SKB slots
required correctly
When calculating the number of slots required for a packet header, the code
was reserving too many slots if the header crossed a page boundary. Since
netbk_gop_skb copies the header to the start of the page, the count of
slots required for the header should be based solely on the header size.
This problem is easy to reproduce if a VIF is bridged to a USB 3G modem
device as the skb->data value always starts near the end of the first page.
Signed-off-by: Simon Graham <simon.graham@citrix.com>
---
drivers/net/xen-netback/netback.c | 3 +--
1 files changed, 1 insertions(+), 2 deletions(-)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 2596401..f4a6fca 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -325,8 +325,7 @@ unsigned int xen_netbk_count_skb_slots(struct xenvif *vif, struct sk_buff *skb)
unsigned int count;
int i, copy_off;
- count = DIV_ROUND_UP(
- offset_in_page(skb->data)+skb_headlen(skb), PAGE_SIZE);
+ count = DIV_ROUND_UP(skb_headlen(skb), PAGE_SIZE);
copy_off = skb_headlen(skb) % PAGE_SIZE;
--
1.7.9.1

@ -1,15 +1,12 @@
patches.xen/pvops-0001-xen-Add-privcmd-device-driver.patch
patches.xen/pvops-0002-x86-acpi-tboot-Have-a-ACPI-os-prepare-sleep-instead-.patch
patches.xen/pvops-0003-tboot-Add-return-values-for-tboot_sleep.patch
patches.xen/pvops-0004-x86-acpi-sleep-Provide-registration-for-acpi_suspend.patch
patches.xen/pvops-0005-xen-acpi-sleep-Enable-ACPI-sleep-via-the-__acpi_os_p.patch
patches.xen/pvops-0006-xen-acpi-sleep-Register-to-the-acpi_suspend_lowlevel.patch
patches.xen/pvops-0007-xen-Utilize-the-restore_msi_irqs-hook.patch
patches.xen/pvops-0008-xen-setup-pm-acpi-Remove-the-call-to-boot_option_idl.patch
patches.xen/pvops-0009-xen-enlighten-Expose-MWAIT-and-MWAIT_LEAF-if-hypervi.patch
patches.xen/pvops-0010-CPUFREQ-xen-governor-for-Xen-hypervisor-frequency-sc.patch
patches.xen/pvops-0011-x86-PCI-Expand-the-x86_msi_ops-to-have-a-restore-MSI.patch
patches.xen/pvops-enable-netfront-in-dom0.patch
patches.xen/pvops-netback-calculate-correctly-the-SKB-slots.patch
# ACPI S3
patches.xen/pvops-0001-x86-acpi-sleep-Provide-registration-for-acpi_suspend.patch
patches.xen/pvops-0003-xen-acpi-sleep-Register-to-the-acpi_suspend_lowlevel.patch
# fix for GPU performance (revert workaround and apply proper fix), should go in 3.5
patches.xen/pvops-3.4-Revert-xen-pat-Disable-PAT-support-for-now.patch
patches.xen/pvops-3.4-x86-cpa-Use-pte_attrs-instead-of-pte_flags-on-CPA-se.patch
# Additional features
patches.xen/pvops-0100-usb-xen-pvusb-driver.patch
patches.xen/pvops-blkfront-removable-flag.patch
patches.xen/pvops-blkfront-eject-support.patch

@ -1 +1 @@
3.2.30
3.7.6

Loading…
Cancel
Save