2010-07-07 11:12:45 +00:00
|
|
|
From: Russ Anderson <rja@sgi.com>
|
|
|
|
Subject: ia64: Call migration code on correctable errors v8
|
|
|
|
References: 415829
|
|
|
|
Acked-by: schwab@suse.de
|
|
|
|
Patch-mainline: not yet
|
|
|
|
|
|
|
|
Migrate data off pages with correctable memory errors. This patch is the
|
|
|
|
ia64 specific piece. It connects the CPE handler to the page migration
|
|
|
|
code. It is implemented as a kernel loadable module, similar to the mca
|
|
|
|
recovery code (mca_recovery.ko). This allows the feature to be turned off
|
|
|
|
by uninstalling the module.
|
|
|
|
|
|
|
|
Update Jan 19 2009 jeffm:
|
|
|
|
- isolate_lru_page doesn't put the page on a list anymore
|
|
|
|
|
|
|
|
|
|
|
|
Signed-off-by: Russ Anderson <rja@sgi.com>
|
|
|
|
|
|
|
|
---
|
|
|
|
arch/ia64/Kconfig | 9
|
|
|
|
arch/ia64/include/asm/mca.h | 6
|
|
|
|
arch/ia64/include/asm/page.h | 1
|
|
|
|
arch/ia64/kernel/Makefile | 1
|
|
|
|
arch/ia64/kernel/cpe_migrate.c | 434 +++++++++++++++++++++++++++++++++++++++++
|
|
|
|
arch/ia64/kernel/mca.c | 37 +++
|
|
|
|
6 files changed, 487 insertions(+), 1 deletion(-)
|
|
|
|
|
|
|
|
--- a/arch/ia64/Kconfig
|
|
|
|
+++ b/arch/ia64/Kconfig
|
2011-04-19 20:09:59 +00:00
|
|
|
@@ -511,6 +511,15 @@ config ARCH_PROC_KCORE_TEXT
|
2010-07-07 11:12:45 +00:00
|
|
|
config IA64_MCA_RECOVERY
|
|
|
|
tristate "MCA recovery from errors other than TLB."
|
|
|
|
|
|
|
|
+config IA64_CPE_MIGRATE
|
|
|
|
+ tristate "Migrate data off pages with correctable errors"
|
|
|
|
+ default m
|
|
|
|
+ help
|
|
|
|
+ Migrate data off pages with correctable memory errors. Selecting
|
|
|
|
+ Y will build this functionality into the kernel. Selecting M will
|
|
|
|
+ build this functionality as a kernel loadable module. Installing
|
|
|
|
+ the module will turn on the functionality.
|
|
|
|
+
|
|
|
|
config PERFMON
|
|
|
|
bool "Performance monitor support"
|
|
|
|
help
|
|
|
|
--- a/arch/ia64/include/asm/mca.h
|
|
|
|
+++ b/arch/ia64/include/asm/mca.h
|
|
|
|
@@ -142,6 +142,7 @@ extern unsigned long __per_cpu_mca[NR_CP
|
|
|
|
|
|
|
|
extern int cpe_vector;
|
|
|
|
extern int ia64_cpe_irq;
|
|
|
|
+extern int cpe_poll_enabled;
|
|
|
|
extern void ia64_mca_init(void);
|
|
|
|
extern void ia64_mca_cpu_init(void *);
|
|
|
|
extern void ia64_os_mca_dispatch(void);
|
|
|
|
@@ -156,11 +157,16 @@ extern void ia64_slave_init_handler(void
|
|
|
|
extern void ia64_mca_cmc_vector_setup(void);
|
|
|
|
extern int ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *));
|
|
|
|
extern void ia64_unreg_MCA_extension(void);
|
|
|
|
+extern int ia64_reg_CE_extension(int (*fn)(void *));
|
|
|
|
+extern void ia64_unreg_CE_extension(void);
|
|
|
|
extern unsigned long ia64_get_rnat(unsigned long *);
|
|
|
|
extern void ia64_set_psr_mc(void);
|
|
|
|
extern void ia64_mca_printk(const char * fmt, ...)
|
|
|
|
__attribute__ ((format (printf, 1, 2)));
|
|
|
|
|
|
|
|
+extern struct list_head badpagelist;
|
|
|
|
+extern unsigned int total_badpages;
|
|
|
|
+
|
|
|
|
struct ia64_mca_notify_die {
|
|
|
|
struct ia64_sal_os_state *sos;
|
|
|
|
int *monarch_cpu;
|
|
|
|
--- a/arch/ia64/include/asm/page.h
|
|
|
|
+++ b/arch/ia64/include/asm/page.h
|
|
|
|
@@ -121,6 +121,7 @@ extern unsigned long max_low_pfn;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
|
|
|
|
+#define phys_to_page(kaddr)	(pfn_to_page((kaddr) >> PAGE_SHIFT))	/* arg is a physical address */
|
|
|
|
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
|
|
|
|
#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
|
|
|
|
|
|
|
|
--- a/arch/ia64/kernel/Makefile
|
|
|
|
+++ b/arch/ia64/kernel/Makefile
|
|
|
|
@@ -25,6 +25,7 @@ obj-$(CONFIG_PERFMON) += perfmon_defaul
|
|
|
|
obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
|
|
|
|
obj-$(CONFIG_CPU_FREQ) += cpufreq/
|
|
|
|
obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o
|
|
|
|
+obj-$(CONFIG_IA64_CPE_MIGRATE) += cpe_migrate.o
|
|
|
|
obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o
|
|
|
|
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
|
|
|
|
obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
|
|
|
|
--- /dev/null
|
|
|
|
+++ b/arch/ia64/kernel/cpe_migrate.c
|
|
|
|
@@ -0,0 +1,434 @@
|
|
|
|
+/*
|
|
|
|
+ * File: cpe_migrate.c
|
|
|
|
+ * Purpose: Migrate data from physical pages with excessive correctable
|
|
|
|
+ * errors to new physical pages. Keep the old pages on a discard
|
|
|
|
+ * list.
|
|
|
|
+ *
|
|
|
|
+ * Copyright (C) 2008 SGI - Silicon Graphics Inc.
|
|
|
|
+ * Copyright (C) 2008 Russ Anderson <rja@sgi.com>
|
|
|
|
+ */
|
|
|
|
+
|
|
|
|
+#include <linux/sysdev.h>
|
|
|
|
+#include <linux/types.h>
|
|
|
|
+#include <linux/sched.h>
|
|
|
|
+#include <linux/module.h>
|
|
|
|
+#include <linux/kernel.h>
|
|
|
|
+#include <linux/smp.h>
|
|
|
|
+#include <linux/workqueue.h>
|
|
|
|
+#include <linux/mm.h>
|
|
|
|
+#include <linux/swap.h>
|
|
|
|
+#include <linux/vmalloc.h>
|
|
|
|
+#include <linux/migrate.h>
|
|
|
|
+#include <linux/page-isolation.h>
|
|
|
|
+#include <linux/memcontrol.h>
|
|
|
|
+#include <linux/kobject.h>
|
|
|
|
+
|
|
|
|
+#include <asm/page.h>
|
|
|
|
+#include <asm/system.h>
|
|
|
|
+#include <asm/sn/sn_cpuid.h>
|
|
|
|
+#include <asm/mca.h>
|
|
|
|
+
|
|
|
|
+#define BADRAM_BASENAME "badram"
|
|
|
|
+#define CE_HISTORY_LENGTH 30
|
|
|
|
+
|
|
|
|
+/* One queued correctable-error report. */
+struct cpe_info {
+	u64	paddr;	/* reported physical address; 0 marks a free slot */
+	u16	node;	/* node taken from the error record */
+};
+
+/* Ring of pending reports: filled at cpe_head, drained from cpe_tail. */
+static struct cpe_info cpe[CE_HISTORY_LENGTH];
+static int cpe_head;
+static int cpe_tail;
+
+/* Module parameter; copied into cpe_poll_enabled when the module loads. */
+static int cpe_polling_enabled = 1;
+
+/* Set before the migration work is queued, cleared when the worker ends. */
+static int work_scheduled;
+
+/* Event counters reported through the badram sysfs file. */
+static int mstat_cannot_isolate;
+static int mstat_failed_to_discard;
+static int mstat_already_marked;
+static int mstat_already_on_list;
+
+DEFINE_SPINLOCK(cpe_migrate_lock);
|
|
|
|
+
|
|
|
|
+/*
+ * get_physical_address
+ *	Pull the failing physical address and node out of a SAL error
+ *	record.  Both outputs are zeroed first and stay zero unless the
+ *	record is a corrected memory-device error carrying valid OEM data.
+ *
+ *  Inputs
+ *	buffer	The SAL error record
+ *  Outputs
+ *	paddr	Physical address from the record, or 0
+ *	node	Node from the record (converted from a nasid on sn2), or 0
+ */
+static void
+get_physical_address(void *buffer, u64 *paddr, u16 *node)
+{
+	ia64_err_rec_t *record = buffer;
+	sal_log_platform_err_info_t *platform;
+	sal_log_mem_dev_err_info_t *mem_err;
+	efi_guid_t sect_guid;
+
+	*paddr = 0;
+	*node = 0;
+
+	/* Only corrected errors are of interest. */
+	if (record->sal_elog_header.severity != sal_log_severity_corrected)
+		return;
+
+	platform = (sal_log_platform_err_info_t *)&record->proc_err;
+	mem_err = &platform->mem_dev_err;
+
+	/* Anything other than a memory device error section is ignored. */
+	sect_guid = mem_err->header.guid;
+	if (efi_guidcmp(sect_guid, SAL_PLAT_MEM_DEV_ERR_SECT_GUID) != 0)
+		return;
+
+	if (!mem_err->valid.oem_data)
+		return;
+
+	if (mem_err->valid.physical_addr)
+		*paddr = mem_err->physical_addr;
+
+	if (!mem_err->valid.node)
+		return;
+
+	*node = ia64_platform_is("sn2") ? nasid_to_cnodeid(mem_err->node)
+					: mem_err->node;
+}
|
|
|
|
+
|
|
|
|
+/*
+ * alloc_migrate_page
+ *	Allocation callback handed to migrate_pages(): returns one movable
+ *	user page from the requested node.  The first and last arguments
+ *	are not used.
+ */
+static struct page *
+alloc_migrate_page(struct page *ignored, unsigned long node, int **x)
+{
+	struct page *replacement;
+
+	replacement = alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
+	return replacement;
+}
|
|
|
|
+
|
|
|
|
+/*
+ * validate_paddr_page
+ *	Check that paddr is non-zero, is a valid ia64 physical address and
+ *	maps to a valid pfn.
+ *
+ *	Side effect: bumps mstat_already_marked when the page already has
+ *	its MemError flag set; that case still counts as valid.
+ *
+ *  Outputs
+ *	0 if the address is usable, -EINVAL otherwise
+ */
+static int
+validate_paddr_page(u64 paddr)
+{
+	if (!paddr || !ia64_phys_addr_valid(paddr) ||
+	    !pfn_valid(paddr >> PAGE_SHIFT))
+		return -EINVAL;
+
+	if (PageMemError(phys_to_page(paddr)))
+		mstat_already_marked++;
+
+	return 0;
+}
|
|
|
|
+
|
|
|
|
+extern int isolate_lru_page(struct page *);
+
+/*
+ * ia64_mca_cpe_move_page
+ *	Isolate the page containing paddr from the LRU, migrate its data
+ *	to a freshly allocated page on the given node and, on success, park
+ *	the old page on badpagelist.
+ *
+ *  Inputs
+ *	paddr	Physical address that reported the correctable error
+ *	node	Node to allocate the replacement page on
+ *  Outputs
+ *	0 once a migration has been attempted (whether or not it worked),
+ *	a negative errno if the address is invalid or the page could not
+ *	be isolated.
+ */
+static int
+ia64_mca_cpe_move_page(u64 paddr, u32 node)
+{
+	LIST_HEAD(pagelist);
+	struct page *page;
+	int ret;
+
+	ret = validate_paddr_page(paddr);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * convert physical address to page number
+	 */
+	page = phys_to_page(paddr);
+
+	migrate_prep();
+	ret = isolate_lru_page(page);
+	if (ret) {
+		mstat_cannot_isolate++;
+		return ret;
+	}
+
+	list_add(&page->lru, &pagelist);
+	ret = migrate_pages(&pagelist, alloc_migrate_page, node, 0, true);
+	if (ret == 0) {
+		total_badpages++;
+		list_add_tail(&page->lru, &badpagelist);
+	} else {
+		mstat_failed_to_discard++;
+		/*
+		 * The page failed to migrate and is not on the bad page list.
+		 * Clearing the error bit will allow another attempt to migrate
+		 * if it gets another correctable error.
+		 */
+		ClearPageMemError(page);
+		/*
+		 * pagelist lives on this stack frame.  With this
+		 * migrate_pages() signature the caller is expected to return
+		 * unmigrated pages to the LRU itself, otherwise the page
+		 * stays isolated with list pointers into a dead frame.
+		 * NOTE(review): confirm against the target tree's
+		 * mm/migrate.c.
+		 */
+		putback_lru_pages(&pagelist);
+	}
+
+	return 0;
+}
|
|
|
|
+
|
|
|
|
+/*
+ * ia64_mca_cpe_migrate
+ *	Work-queue handler that drains the ring of pending addresses.
+ *	Starting at cpe_tail it visits slots until it catches up with
+ *	cpe_head, handing every non-empty slot to the migration code, and
+ *	then clears work_scheduled.
+ */
+static void
+ia64_mca_cpe_migrate(struct work_struct *unused)
+{
+	struct cpe_info *slot;
+	int status;
+
+	for (;;) {
+		slot = &cpe[cpe_tail];
+
+		if (slot->paddr) {
+			status = ia64_mca_cpe_move_page(slot->paddr,
+							slot->node);
+			/*
+			 * Free the slot even when the move failed.  If the
+			 * same address takes another CPE it will simply be
+			 * queued again.
+			 */
+			if (status <= 0)
+				slot->paddr = 0;
+		}
+
+		cpe_tail++;
+		if (cpe_tail >= CE_HISTORY_LENGTH)
+			cpe_tail = 0;
+
+		if (cpe_tail == cpe_head)
+			break;
+	}
+
+	work_scheduled = 0;
+}
|
|
|
|
+
|
|
|
|
+static DECLARE_WORK(cpe_enable_work, ia64_mca_cpe_migrate);
+DEFINE_SPINLOCK(cpe_list_lock);
+
+/*
+ * cpe_setup_migrate
+ *	Get the physical address out of the CPE record, add it
+ *	to the list of addresses to migrate (if not already on),
+ *	and schedule the back end worker task.  This is called
+ *	in interrupt context so cannot directly call the migration
+ *	code.
+ *
+ *  Inputs
+ *	rec	The CPE record
+ *  Outputs
+ *	1 if the record was accepted: queued, already queued, or dropped
+ *	  because the list lock was busy or the next slot was still in use
+ *	-EINVAL if rec is NULL or does not name a valid memory page
+ */
+static int
+cpe_setup_migrate(void *rec)
+{
+	u64 paddr;
+	u16 node;
+	int i, ret;
+
+	if (!rec)
+		return -EINVAL;
+
+	get_physical_address(rec, &paddr, &node);
+	ret = validate_paddr_page(paddr);
+	if (ret < 0)
+		return -EINVAL;
+
+	if ((cpe_head != cpe_tail) || (cpe[cpe_head].paddr != 0)) {
+		/*
+		 * List not empty: look for an entry in the same page.
+		 * Compare page frame numbers; PAGE_ALIGN() rounds up, so it
+		 * would treat an address at the start of a page as belonging
+		 * to the page before it.
+		 */
+		for (i = 0; i < CE_HISTORY_LENGTH; i++) {
+			if (!cpe[i].paddr)
+				continue;	/* free slot */
+
+			if ((cpe[i].paddr >> PAGE_SHIFT) ==
+			    (paddr >> PAGE_SHIFT)) {
+				mstat_already_on_list++;
+				return 1;	/* already on the list */
+			}
+		}
+	}
+
+	if (!spin_trylock(&cpe_list_lock)) {
+		/*
+		 * Someone else has the lock.  To avoid spinning in interrupt
+		 * handler context, bail.
+		 */
+		return 1;
+	}
+
+	/* Only fill the head slot if the worker has already emptied it. */
+	if (cpe[cpe_head].paddr == 0) {
+		cpe[cpe_head].node = node;
+		cpe[cpe_head].paddr = paddr;
+
+		if (++cpe_head >= CE_HISTORY_LENGTH)
+			cpe_head = 0;
+	}
+	spin_unlock(&cpe_list_lock);
+
+	if (!work_scheduled) {
+		work_scheduled = 1;
+		schedule_work(&cpe_enable_work);
+	}
+
+	return 1;
+}
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+ * =============================================================================
|
|
|
|
+ */
|
|
|
|
+
|
|
|
|
+/*
+ * free_one_bad_page
+ *	Look for the page containing paddr on the bad page list.  If it is
+ *	there, clear its MemError flag, drop the bad page count and hand
+ *	the page back through putback_lru_pages().  An address that is not
+ *	on the list is silently ignored.
+ *
+ *  Outputs
+ *	Always 0
+ */
+static int
+free_one_bad_page(unsigned long paddr)
+{
+	LIST_HEAD(pagelist);
+	struct page *wanted = phys_to_page(paddr);
+	struct page *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &badpagelist, lru) {
+		if (cur == wanted) {
+			ClearPageMemError(cur);	/* Mark the page as good */
+			total_badpages--;
+			list_move_tail(&cur->lru, &pagelist);
+			putback_lru_pages(&pagelist);
+			break;
+		}
+	}
+
+	return 0;
+}
|
|
|
|
+
|
|
|
|
+/*
+ * free_all_bad_pages
+ *	Clear the MemError flag on every page on the bad page list, then
+ *	hand the whole list back through putback_lru_pages().
+ *
+ *  Outputs
+ *	Always 0
+ */
+static int
+free_all_bad_pages(void)
+{
+	struct page *cur;
+
+	/* Nothing is unlinked in this pass, so the plain iterator is enough. */
+	list_for_each_entry(cur, &badpagelist, lru) {
+		ClearPageMemError(cur);	/* Mark the page as good */
+		total_badpages--;
+	}
+
+	putback_lru_pages(&badpagelist);
+	return 0;
+}
|
|
|
|
+
|
|
|
|
+#define OPT_LEN 16
+
+/*
+ * badpage_store
+ *	sysfs write handler for the badram file.  The input is parsed as a
+ *	hexadecimal number: 0 releases every page on the bad page list, any
+ *	other value is taken as a physical address and releases that page.
+ *
+ *	Input longer than OPT_LEN - 1 bytes is truncated before parsing.
+ *
+ *  Outputs
+ *	count on success, a negative errno if the input is not a number
+ */
+static ssize_t
+badpage_store(struct kobject *kobj,
+	      struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	char optstr[OPT_LEN];
+	unsigned long opt;
+	size_t len;
+	int err;
+
+	/*
+	 * Copy every byte that fits and terminate explicitly.  strlcpy()
+	 * with a size of 'count' would drop the final input byte (losing a
+	 * digit when there is no trailing newline) and would leave optstr
+	 * uninitialised for an empty write.  strict_strtoul() itself
+	 * tolerates a single trailing newline.
+	 */
+	len = min(count, sizeof(optstr) - 1);
+	memcpy(optstr, buf, len);
+	optstr[len] = '\0';
+
+	err = strict_strtoul(optstr, 16, &opt);
+	if (err)
+		return err;
+
+	if (opt == 0)
+		free_all_bad_pages();
+	else
+		free_one_bad_page(opt);
+
+	return count;
+}
|
|
|
|
+
|
|
|
|
+/*
+ * badpage_show
+ *	Display the number, size, and addresses of all the pages on the
+ *	bad page list, together with the migration event counters.
+ *
+ *	Note that sysfs provides buf of PAGE_SIZE length.  bufend tracks
+ *	the end of buf, and scnprintf() is used so that cnt only ever
+ *	counts bytes actually stored and can never run past bufend.
+ *
+ *  Outputs
+ *	Number of bytes placed in buf
+ */
+static ssize_t
+badpage_show(struct kobject *kobj,
+	     struct kobj_attribute *attr, char *buf)
+
+{
+	struct page *page;
+	int i = 0, cnt = 0;
+	char *bufend = buf + PAGE_SIZE;
+
+	cnt = scnprintf(buf, bufend - (buf + cnt),
+			"Memory marked bad: %u kB\n"
+			"Pages marked bad: %u\n"
+			"Unable to isolate on LRU: %d\n"
+			"Unable to migrate: %d\n"
+			"Already marked bad: %d\n"
+			"Already on list: %d\n"
+			"List of bad physical pages\n",
+			total_badpages << (PAGE_SHIFT - 10), total_badpages,
+			mstat_cannot_isolate, mstat_failed_to_discard,
+			mstat_already_marked, mstat_already_on_list
+			);
+
+	list_for_each_entry(page, &badpagelist, lru) {
+		if (bufend - (buf + cnt) < 20)
+			break;		/* Avoid overflowing the buffer */
+		cnt += scnprintf(buf + cnt, bufend - (buf + cnt),
+				 " 0x%011lx", page_to_phys(page));
+		/* Five addresses per output line. */
+		if (!(++i % 5))
+			cnt += scnprintf(buf + cnt, bufend - (buf + cnt), "\n");
+	}
+	cnt += scnprintf(buf + cnt, bufend - (buf + cnt), "\n");
+
+	return cnt;
+}
|
|
|
|
+
|
|
|
|
+/* sysfs attribute "badram": world readable, writable by the owner. */
+static struct kobj_attribute badram_attr =
+	__ATTR(badram, S_IWUSR | S_IRUGO, badpage_show, badpage_store);
|
|
|
|
+
|
|
|
|
+/*
+ * cpe_migrate_external_handler_init
+ *	Module entry point: create the badram sysfs file, register
+ *	cpe_setup_migrate as the corrected-error hook and copy the
+ *	cpe_polling_enabled parameter into cpe_poll_enabled.
+ *
+ *  Outputs
+ *	0 on success, -EINVAL if the sysfs file could not be created,
+ *	-EFAULT if the hook could not be registered
+ */
+static int __init
+cpe_migrate_external_handler_init(void)
+{
+	int error;
+
+	error = sysfs_create_file(kernel_kobj, &badram_attr.attr);
+	if (error)
+		return -EINVAL;
+
+	/*
+	 * register external ce handler
+	 */
+	if (ia64_reg_CE_extension(cpe_setup_migrate)) {
+		printk(KERN_ERR "ia64_reg_CE_extension failed.\n");
+		/*
+		 * Do not leave the sysfs file behind: its handlers live in
+		 * this module, which is about to fail loading.
+		 */
+		sysfs_remove_file(kernel_kobj, &badram_attr.attr);
+		return -EFAULT;
+	}
+	cpe_poll_enabled = cpe_polling_enabled;
+
+	printk(KERN_INFO "Registered badram Driver\n");
+	return 0;
+}
|
|
|
|
+
|
|
|
|
+/*
+ * cpe_migrate_external_handler_exit
+ *	Module exit point: unhook from the CPE handler, make sure the
+ *	migration worker is no longer queued or running, then remove the
+ *	badram sysfs file.
+ */
+static void __exit
+cpe_migrate_external_handler_exit(void)
+{
+	/* unregister external mca handlers */
+	ia64_unreg_CE_extension();
+
+	/*
+	 * The hook is gone, so no new work will be queued from here on.
+	 * Wait for anything already queued or running; the worker's code
+	 * is part of this module.
+	 */
+	cancel_work_sync(&cpe_enable_work);
+
+	sysfs_remove_file(kernel_kobj, &badram_attr.attr);
+}
|
|
|
|
+
|
|
|
|
+module_init(cpe_migrate_external_handler_init);
|
|
|
|
+module_exit(cpe_migrate_external_handler_exit);
|
|
|
|
+
|
|
|
|
+module_param(cpe_polling_enabled, int, 0644);
|
|
|
|
+MODULE_PARM_DESC(cpe_polling_enabled,
|
|
|
|
+ "Enable polling with migration");
|
|
|
|
+
|
|
|
|
+MODULE_AUTHOR("Russ Anderson <rja@sgi.com>");
|
|
|
|
+MODULE_DESCRIPTION("ia64 Corrected Error page migration driver");
|
|
|
|
--- a/arch/ia64/kernel/mca.c
|
|
|
|
+++ b/arch/ia64/kernel/mca.c
|
|
|
|
@@ -68,6 +68,9 @@
|
|
|
|
*
|
|
|
|
* 2007-04-27 Russ Anderson <rja@sgi.com>
|
|
|
|
* Support multiple cpus going through OS_MCA in the same event.
|
|
|
|
+ *
|
|
|
|
+ * 2008-04-22 Russ Anderson <rja@sgi.com>
|
|
|
|
+ * Migrate data off pages with correctable memory errors.
|
|
|
|
*/
|
|
|
|
#include <linux/jiffies.h>
|
|
|
|
#include <linux/types.h>
|
2011-04-19 20:09:59 +00:00
|
|
|
@@ -164,7 +167,14 @@ static int cmc_polling_enabled = 1;
|
2010-07-07 11:12:45 +00:00
|
|
|
* but encounters problems retrieving CPE logs. This should only be
|
|
|
|
* necessary for debugging.
|
|
|
|
*/
|
|
|
|
-static int cpe_poll_enabled = 1;
|
|
|
|
+int cpe_poll_enabled = 1;
|
|
|
|
+EXPORT_SYMBOL(cpe_poll_enabled);
|
|
|
|
+
|
|
|
|
+unsigned int total_badpages;
|
|
|
|
+EXPORT_SYMBOL(total_badpages);
|
|
|
|
+
|
|
|
|
+LIST_HEAD(badpagelist);
|
|
|
|
+EXPORT_SYMBOL(badpagelist);
|
|
|
|
|
|
|
|
extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);
|
|
|
|
|
2011-04-19 20:09:59 +00:00
|
|
|
@@ -524,6 +534,28 @@ int mca_recover_range(unsigned long addr
|
2010-07-07 11:12:45 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(mca_recover_range);
|
|
|
|
|
|
|
|
+/* Function pointer to Corrected Error memory migration driver */
|
|
|
|
+int (*ia64_mca_ce_extension)(void *);
|
|
|
|
+
|
|
|
|
+/*
+ * ia64_reg_CE_extension
+ *	Install fn as the corrected-error extension hook.
+ *
+ *  Outputs
+ *	0 if fn was installed, 1 if a hook is already registered
+ */
+int
+ia64_reg_CE_extension(int (*fn)(void *))
+{
+	if (!ia64_mca_ce_extension) {
+		ia64_mca_ce_extension = fn;
+		return 0;
+	}
+
+	return 1;
+}
|
|
|
|
+EXPORT_SYMBOL(ia64_reg_CE_extension);
|
|
|
|
+
|
|
|
|
+/*
+ * ia64_unreg_CE_extension
+ *	Remove the corrected-error extension hook, if one is installed.
+ */
+void
+ia64_unreg_CE_extension(void)
+{
+	if (!ia64_mca_ce_extension)
+		return;
+
+	ia64_mca_ce_extension = NULL;
+}
|
|
|
|
+EXPORT_SYMBOL(ia64_unreg_CE_extension);
|
|
|
|
+
|
|
|
|
#ifdef CONFIG_ACPI
|
|
|
|
|
|
|
|
int cpe_vector = -1;
|
2011-04-19 20:09:59 +00:00
|
|
|
@@ -535,6 +567,7 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
|
2010-07-07 11:12:45 +00:00
|
|
|
static unsigned long cpe_history[CPE_HISTORY_LENGTH];
|
|
|
|
static int index;
|
|
|
|
static DEFINE_SPINLOCK(cpe_history_lock);
|
|
|
|
+ int recover;
|
|
|
|
|
|
|
|
IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
|
|
|
|
__func__, cpe_irq, smp_processor_id());
|
2011-04-19 20:09:59 +00:00
|
|
|
@@ -581,6 +614,8 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
|
2010-07-07 11:12:45 +00:00
|
|
|
out:
|
|
|
|
/* Get the CPE error record and log it */
|
|
|
|
ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
|
|
|
|
+ recover = (ia64_mca_ce_extension && ia64_mca_ce_extension(
|
|
|
|
+ IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_CPE)));
|
|
|
|
|
|
|
|
return IRQ_HANDLED;
|
|
|
|
}
|