From: Russ Anderson
Subject: ia64: Call migration code on correctable errors v8
References: 415829
Acked-by: schwab@suse.de
Patch-mainline: not yet

Migrate data off pages with correctable memory errors. This patch is the
ia64 specific piece. It connects the CPE handler to the page migration
code. It is implemented as a kernel loadable module, similar to the mca
recovery code (mca_recovery.ko). This allows the feature to be turned off
by uninstalling the module.

Update Jan 19 2009 jeffm:
- isolate_lru_page doesn't put the page on a list anymore

Signed-off-by: Russ Anderson

---
 arch/ia64/Kconfig              |    9 
 arch/ia64/include/asm/mca.h    |    6 
 arch/ia64/include/asm/page.h   |    1 
 arch/ia64/kernel/Makefile      |    1 
 arch/ia64/kernel/cpe_migrate.c |  434 +++++++++++++++++++++++++++++++++++++++++
 arch/ia64/kernel/mca.c         |   37 +++
 6 files changed, 487 insertions(+), 1 deletion(-)

--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -511,6 +511,15 @@ config ARCH_PROC_KCORE_TEXT
 config IA64_MCA_RECOVERY
 	tristate "MCA recovery from errors other than TLB."
 
+config IA64_CPE_MIGRATE
+	tristate "Migrate data off pages with correctable errors"
+	default m
+	help
+	  Migrate data off pages with correctable memory errors. Selecting
+	  Y will build this functionality into the kernel. Selecting M will
+	  build this functionality as a kernel loadable module. Installing
+	  the module will turn on the functionality.
+
 config PERFMON
 	bool "Performance monitor support"
 	help
--- a/arch/ia64/include/asm/mca.h
+++ b/arch/ia64/include/asm/mca.h
@@ -142,6 +142,7 @@ extern unsigned long __per_cpu_mca[NR_CP
 
 extern int cpe_vector;
 extern int ia64_cpe_irq;
+extern int cpe_poll_enabled;
 extern void ia64_mca_init(void);
 extern void ia64_mca_cpu_init(void *);
 extern void ia64_os_mca_dispatch(void);
@@ -156,11 +157,16 @@ extern void ia64_slave_init_handler(void
 extern void ia64_mca_cmc_vector_setup(void);
 extern int ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *));
 extern void ia64_unreg_MCA_extension(void);
+extern int ia64_reg_CE_extension(int (*fn)(void *));
+extern void ia64_unreg_CE_extension(void);
 extern unsigned long ia64_get_rnat(unsigned long *);
 extern void ia64_set_psr_mc(void);
 extern void ia64_mca_printk(const char * fmt, ...)
 	 __attribute__ ((format (printf, 1, 2)));
 
+extern struct list_head badpagelist;
+extern unsigned int total_badpages;
+
 struct ia64_mca_notify_die {
 	struct ia64_sal_os_state *sos;
 	int *monarch_cpu;
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -121,6 +121,7 @@ extern unsigned long max_low_pfn;
 #endif
 
 #define page_to_phys(page)	(page_to_pfn(page) << PAGE_SHIFT)
+#define phys_to_page(paddr)	(pfn_to_page((paddr) >> PAGE_SHIFT))
 #define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 #define pfn_to_kaddr(pfn)	__va((pfn) << PAGE_SHIFT)
 
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_PERFMON) += perfmon_defaul
 obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_IA64_MCA_RECOVERY)	+= mca_recovery.o
+obj-$(CONFIG_IA64_CPE_MIGRATE)	+= cpe_migrate.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o jprobes.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
--- /dev/null
+++ b/arch/ia64/kernel/cpe_migrate.c
@@ -0,0 +1,434 @@
+/*
+ * File:	cpe_migrate.c
+ * Purpose:	Migrate data from
+ *		physical pages with excessive correctable
+ *		errors to new physical pages. Keep the old pages on a discard
+ *		list.
+ *
+ * Copyright (C) 2008 SGI - Silicon Graphics Inc.
+ * Copyright (C) 2008 Russ Anderson
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define BADRAM_BASENAME		"badram"
+#define CE_HISTORY_LENGTH	30
+
+/* One queued correctable error: where it happened and on which node. */
+struct cpe_info {
+	u64	paddr;
+	u16	node;
+};
+
+/* Ring of pending errors; a slot with paddr == 0 is unused. */
+static struct cpe_info cpe[CE_HISTORY_LENGTH];
+
+static int cpe_polling_enabled = 1;
+static int cpe_head;		/* next slot the interrupt side fills */
+static int cpe_tail;		/* next slot the worker examines */
+static int work_scheduled;
+
+/* Counters reported through the badram sysfs file. */
+static int mstat_cannot_isolate;
+static int mstat_failed_to_discard;
+static int mstat_already_marked;
+static int mstat_already_on_list;
+
+DEFINE_SPINLOCK(cpe_migrate_lock);
+
+/*
+ * get_physical_address
+ *	Pull the physical address and node of a corrected memory error
+ *	out of an error record.
+ *
+ *  Inputs
+ *	buffer	The error record
+ *  Outputs
+ *	paddr	Physical address from the record, 0 if none was found
+ *	node	Node from the record, 0 if none was found
+ */
+static void
+get_physical_address(void *buffer, u64 *paddr, u16 *node)
+{
+	ia64_err_rec_t *record = buffer;
+	sal_log_platform_err_info_t *platform;
+	sal_log_mem_dev_err_info_t *mem_err;
+	efi_guid_t section_guid;
+
+	/* Start from "nothing found". */
+	*paddr = 0;
+	*node = 0;
+
+	/* Only corrected errors are of interest. */
+	if (record->sal_elog_header.severity != sal_log_severity_corrected)
+		return;
+
+	platform = (sal_log_platform_err_info_t *)&record->proc_err;
+	section_guid = platform->mem_dev_err.header.guid;
+
+	/* Anything that is not a memory device error section is ignored. */
+	if (efi_guidcmp(section_guid, SAL_PLAT_MEM_DEV_ERR_SECT_GUID) != 0)
+		return;
+
+	mem_err = &platform->mem_dev_err;
+	if (!mem_err->valid.oem_data)
+		return;
+
+	if (mem_err->valid.physical_addr)
+		*paddr = mem_err->physical_addr;
+
+	if (!mem_err->valid.node)
+		return;
+
+	/* On sn2 the node field is translated by nasid_to_cnodeid(). */
+	if (ia64_platform_is("sn2"))
+		*node = nasid_to_cnodeid(mem_err->node);
+	else
+		*node = mem_err->node;
+}
+
+/*
+ * alloc_migrate_page
+ *	Allocation callback passed to migrate_pages(): get one order-0
+ *	GFP_HIGHUSER_MOVABLE page on the given node. The other two
+ *	arguments are not used.
+ */
+static struct page *
+alloc_migrate_page(struct page *ignored, unsigned long node, int **x)
+{
+	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
+}
+
+/*
+ * validate_paddr_page
+ *	Check that a physical address is non-zero, valid for the platform
+ *	and backed by a valid page frame. As a side effect, count pages
+ *	that already have the memory error flag set.
+ *
+ *  Outputs
+ *	0 if the address is usable, -EINVAL otherwise
+ */
+static int
+validate_paddr_page(u64 paddr)
+{
+	if (!paddr || !ia64_phys_addr_valid(paddr) ||
+	    !pfn_valid(paddr >> PAGE_SHIFT))
+		return -EINVAL;
+
+	if (PageMemError(phys_to_page(paddr)))
+		mstat_already_marked++;
+
+	return 0;
+}
+
+extern int isolate_lru_page(struct page *);
+
+/*
+ * ia64_mca_cpe_move_page
+ *	Isolate the page containing paddr from the LRU and migrate its
+ *	contents to a new page allocated on the given node. A page that
+ *	migrated successfully is kept on the bad page list.
+ *
+ *  Outputs
+ *	0 once migration was attempted (whether or not it succeeded),
+ *	otherwise the error from validation or from isolate_lru_page()
+ */
+static int
+ia64_mca_cpe_move_page(u64 paddr, u32 node)
+{
+	LIST_HEAD(isolated);
+	struct page *bad_page;
+	int err;
+
+	err = validate_paddr_page(paddr);
+	if (err < 0)
+		return err;
+
+	bad_page = phys_to_page(paddr);
+
+	migrate_prep();
+	err = isolate_lru_page(bad_page);
+	if (err) {
+		mstat_cannot_isolate++;
+		return err;
+	}
+
+	/* Put the isolated page on a private list for migrate_pages(). */
+	list_add(&bad_page->lru, &isolated);
+	err = migrate_pages(&isolated, alloc_migrate_page, node, 0, true);
+	if (err != 0) {
+		mstat_failed_to_discard++;
+		/*
+		 * The page failed to migrate and is not on the bad page list.
+		 * Clearing the error bit will allow another attempt to migrate
+		 * if it gets another correctable error.
+		 */
+		ClearPageMemError(bad_page);
+	} else {
+		total_badpages++;
+		list_add_tail(&bad_page->lru, &badpagelist);
+	}
+
+	return 0;
+}
+
+/*
+ * ia64_mca_cpe_migrate
+ *	The worker that does the actual migration.
+ *	It pulls a physical address off the list and calls the migration code.
+ */
+static void
+ia64_mca_cpe_migrate(struct work_struct *unused)
+{
+	int ret;
+	u64 paddr;
+	u16 node;
+
+	/* Walk the ring from cpe_tail until it catches up with cpe_head. */
+	do {
+		paddr = cpe[cpe_tail].paddr;
+		if (paddr) {
+			/*
+			 * There is a valid entry that needs processing.
+			 */
+			node = cpe[cpe_tail].node;
+
+			ret = ia64_mca_cpe_move_page(paddr, node);
+			if (ret <= 0) {
+				/*
+				 * Even though the return status is negative,
+				 * clear the entry. If the same address has
+				 * another CPE it will be re-added to the list.
+				 */
+				cpe[cpe_tail].paddr = 0;
+			}
+		}
+		if (++cpe_tail >= CE_HISTORY_LENGTH)
+			cpe_tail = 0;
+
+	} while (cpe_tail != cpe_head);
+	work_scheduled = 0;
+}
+
+static DECLARE_WORK(cpe_enable_work, ia64_mca_cpe_migrate);
+DEFINE_SPINLOCK(cpe_list_lock);
+
+/*
+ * cpe_setup_migrate
+ *	Get the physical address out of the CPE record, add it
+ *	to the list of addresses to migrate (if not already on),
+ *	and schedule the back end worker task. This is called
+ *	in interrupt context so cannot directly call the migration
+ *	code.
+ *
+ *  Inputs
+ *	rec	The CPE record
+ *  Outputs
+ *	1 if the record was accepted (queued, already queued, or dropped
+ *	because the list lock was busy or the head slot was still in use),
+ *	-EINVAL if rec is NULL or holds no usable physical address
+ */
+static int
+cpe_setup_migrate(void *rec)
+{
+	u64 paddr;
+	u16 node;
+	int i, ret;
+
+	if (!rec)
+		return -EINVAL;
+
+	get_physical_address(rec, &paddr, &node);
+	ret = validate_paddr_page(paddr);
+	if (ret < 0)
+		return -EINVAL;
+
+	if ((cpe_head != cpe_tail) || (cpe[cpe_head].paddr != 0)) {
+		/*
+		 * List not empty: look for an entry in the same page.
+		 * Compare page frame numbers; PAGE_ALIGN() rounds up, so it
+		 * would separate a page's base address from the rest of the
+		 * page and merge it with the tail of the previous page.
+		 */
+		for (i = 0; i < CE_HISTORY_LENGTH; i++) {
+			if (!cpe[i].paddr)
+				continue;	/* unused slot */
+
+			if ((cpe[i].paddr >> PAGE_SHIFT) ==
+			    (paddr >> PAGE_SHIFT)) {
+				mstat_already_on_list++;
+				return 1;	/* already on the list */
+			}
+		}
+	}
+
+	if (!spin_trylock(&cpe_list_lock)) {
+		/*
+		 * Someone else has the lock. To avoid spinning in interrupt
+		 * handler context, bail.
+		 */
+		return 1;
+	}
+
+	/* Only fill the head slot if the worker has already emptied it. */
+	if (cpe[cpe_head].paddr == 0) {
+		cpe[cpe_head].node = node;
+		cpe[cpe_head].paddr = paddr;
+
+		if (++cpe_head >= CE_HISTORY_LENGTH)
+			cpe_head = 0;
+	}
+	spin_unlock(&cpe_list_lock);
+
+	if (!work_scheduled) {
+		work_scheduled = 1;
+		schedule_work(&cpe_enable_work);
+	}
+
+	return 1;
+}
+
+/*
+ * =============================================================================
+ */
+
+/*
+ * free_one_bad_page
+ *	Free one page from the list of bad pages.
+ *
+ *  Inputs
+ *	paddr	Physical address inside the page to release
+ *  Outputs
+ *	Always 0, whether or not the page was found on the list
+ */
+static int
+free_one_bad_page(unsigned long paddr)
+{
+	LIST_HEAD(pagelist);
+	struct page *page, *page2, *target;
+
+	/*
+	 * Verify page address
+	 */
+	target = phys_to_page(paddr);
+	list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+		if (page != target)
+			continue;
+
+		ClearPageMemError(page);	/* Mark the page as good */
+		total_badpages--;
+		/* Hand the page back via a private single-entry list. */
+		list_move_tail(&page->lru, &pagelist);
+		putback_lru_pages(&pagelist);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * free_all_bad_pages
+ *	Free all of the pages on the bad pages list.
+ *
+ *  Outputs
+ *	Always 0
+ */
+static int
+free_all_bad_pages(void)
+{
+	struct page *page, *page2;
+
+	list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+		ClearPageMemError(page);	/* Mark the page as good */
+		total_badpages--;
+	}
+	putback_lru_pages(&badpagelist);
+	return 0;
+}
+
+#define OPT_LEN	16
+
+/*
+ * badpage_store
+ *	sysfs write handler for the badram file. The input is parsed as a
+ *	hexadecimal number: 0 releases every page on the bad page list,
+ *	any other value is taken as the physical address of the one page
+ *	to release.
+ *
+ *  Outputs
+ *	count on success, or the error returned by strict_strtoul()
+ */
+static ssize_t
+badpage_store(struct kobject *kobj,
+	      struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	char optstr[OPT_LEN];
+	unsigned long opt;
+	size_t len;
+	int err;
+
+	/*
+	 * strlcpy() copies at most len - 1 characters and then adds the
+	 * terminating NUL, so ask for one more than the input length.
+	 * Passing count itself would drop the last input character, and
+	 * with count == 0 would leave optstr uninitialized.
+	 */
+	len = count + 1;
+	if (len > OPT_LEN)
+		len = OPT_LEN;
+
+	strlcpy(optstr, buf, len);
+
+	err = strict_strtoul(optstr, 16, &opt);
+	if (err)
+		return err;
+
+	if (opt == 0)
+		free_all_bad_pages();
+	else
+		free_one_bad_page(opt);
+
+	return count;
+}
+
+/*
+ * badpage_show
+ *	Display the number, size, and addresses of all the pages on the
+ *	bad page list.
+ *
+ *	Note that sysfs provides buf of PAGE_SIZE length. bufend tracks
+ *	the remaining space in buf to avoid overflowing.
+ */
+static ssize_t
+badpage_show(struct kobject *kobj,
+	     struct kobj_attribute *attr, char *buf)
+{
+	struct page *page, *page2;
+	int i = 0, cnt = 0;
+	char *bufend = buf + PAGE_SIZE;
+
+	/* Summary counters first. */
+	cnt = snprintf(buf, bufend - (buf + cnt),
+			"Memory marked bad: %d kB\n"
+			"Pages marked bad: %d\n"
+			"Unable to isolate on LRU: %d\n"
+			"Unable to migrate: %d\n"
+			"Already marked bad: %d\n"
+			"Already on list: %d\n"
+			"List of bad physical pages\n",
+			total_badpages << (PAGE_SHIFT - 10), total_badpages,
+			mstat_cannot_isolate, mstat_failed_to_discard,
+			mstat_already_marked, mstat_already_on_list
+			);
+
+	/* Then the physical address of each bad page, five per line. */
+	list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+		if (bufend - (buf + cnt) < 20)
+			break;		/* Avoid overflowing the buffer */
+		cnt += snprintf(buf + cnt, bufend - (buf + cnt),
+				" 0x%011lx", page_to_phys(page));
+		if (!(++i % 5))
+			cnt += snprintf(buf + cnt, bufend - (buf + cnt), "\n");
+	}
+	cnt += snprintf(buf + cnt, bufend - (buf + cnt), "\n");
+
+	return cnt;
+}
+
+/* The "badram" sysfs file: world readable, writable by its owner. */
+static struct kobj_attribute badram_attr = {
+	.attr = {
+		.name = "badram",
+		.mode = S_IWUSR | S_IRUGO,
+	},
+	.show = badpage_show,
+	.store = badpage_store,
+};
+
+/*
+ * cpe_migrate_external_handler_init
+ *	Create the badram sysfs file and register cpe_setup_migrate() as
+ *	the corrected error extension.
+ *
+ *  Outputs
+ *	0 on success, -EINVAL if the sysfs file cannot be created,
+ *	-EFAULT if the extension cannot be registered
+ */
+static int __init
+cpe_migrate_external_handler_init(void)
+{
+	int error;
+
+	error = sysfs_create_file(kernel_kobj, &badram_attr.attr);
+	if (error)
+		return -EINVAL;
+
+	/*
+	 * register external ce handler
+	 */
+	if (ia64_reg_CE_extension(cpe_setup_migrate)) {
+		printk(KERN_ERR "ia64_reg_CE_extension failed.\n");
+		/* Do not leave the sysfs file behind when the load fails. */
+		sysfs_remove_file(kernel_kobj, &badram_attr.attr);
+		return -EFAULT;
+	}
+	cpe_poll_enabled = cpe_polling_enabled;
+
+	printk(KERN_INFO "Registered badram Driver\n");
+	return 0;
+}
+
+/*
+ * cpe_migrate_external_handler_exit
+ *	Undo cpe_migrate_external_handler_init().
+ */
+static void __exit
+cpe_migrate_external_handler_exit(void)
+{
+	/* unregister external mca handlers */
+	ia64_unreg_CE_extension();
+
+	/*
+	 * The worker function lives in this module, so migration work that
+	 * is still queued or running must be finished before unloading.
+	 */
+	cancel_work_sync(&cpe_enable_work);
+
+	sysfs_remove_file(kernel_kobj, &badram_attr.attr);
+}
+
+module_init(cpe_migrate_external_handler_init);
+module_exit(cpe_migrate_external_handler_exit);
+
+module_param(cpe_polling_enabled, int, 0644);
+MODULE_PARM_DESC(cpe_polling_enabled,
+		"Enable polling with migration");
+
+MODULE_AUTHOR("Russ Anderson ");
+MODULE_DESCRIPTION("ia64 Corrected Error page migration driver");
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -68,6 +68,9 @@
  *
  * 2007-04-27 Russ Anderson
  *	      Support multiple cpus going through OS_MCA in the same event.
+ *
+ * 2008-04-22 Russ Anderson
+ *	      Migrate data off pages with correctable memory errors.
  */
 #include
 #include
@@ -164,7 +167,14 @@ static int cmc_polling_enabled = 1;
  * but encounters problems retrieving CPE logs. This should only be
  * necessary for debugging.
  */
-static int cpe_poll_enabled = 1;
+int cpe_poll_enabled = 1;
+EXPORT_SYMBOL(cpe_poll_enabled);
+
+unsigned int total_badpages;
+EXPORT_SYMBOL(total_badpages);
+
+LIST_HEAD(badpagelist);
+EXPORT_SYMBOL(badpagelist);
 
 extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);
 
@@ -524,6 +534,28 @@ int mca_recover_range(unsigned long addr
 }
 EXPORT_SYMBOL_GPL(mca_recover_range);
 
+/* Function pointer to Corrected Error memory migration driver */
+int (*ia64_mca_ce_extension)(void *);
+
+int
+ia64_reg_CE_extension(int (*fn)(void *))
+{
+	if (ia64_mca_ce_extension)
+		return 1;
+
+	ia64_mca_ce_extension = fn;
+	return 0;
+}
+EXPORT_SYMBOL(ia64_reg_CE_extension);
+
+void
+ia64_unreg_CE_extension(void)
+{
+	if (ia64_mca_ce_extension)
+		ia64_mca_ce_extension = NULL;
+}
+EXPORT_SYMBOL(ia64_unreg_CE_extension);
+
 #ifdef CONFIG_ACPI
 
 int cpe_vector = -1;
@@ -535,6 +567,7 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
 	static unsigned long	cpe_history[CPE_HISTORY_LENGTH];
 	static int		index;
 	static DEFINE_SPINLOCK(cpe_history_lock);
+	int			recover;
 
 	IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
 		       __func__, cpe_irq, smp_processor_id());
@@ -581,6 +614,8 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
 out:
 	/* Get the CPE error record and log it */
 	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
+	recover = (ia64_mca_ce_extension && ia64_mca_ce_extension(
+				IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_CPE)));
 
 	return IRQ_HANDLED;
 }