From b70ef01016850de87b9a28a6af19fed8801df076 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 25 Jun 2009 19:32:38 +0200 Subject: EDAC: move MCE error descriptions to EDAC core This is in preparation of adding AMD-specific MCE decoding functionality to the EDAC core. The error decoding macros originate from the AMD64 EDAC driver albeit in a simplified and cleaned up version here. While at it, add macros to generate the error description strings and use them in the error type decoders directly which removes a bunch of code and makes the decoding functions much more readable. Also, fix strings and shorten macro names. Remove superfluous htlink_msgs. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 140 ++++++++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 62 deletions(-) (limited to 'drivers/edac/amd64_edac.c') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index e2a10bcba7a1..b9e84bc91766 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -18,6 +18,63 @@ struct amd64_pvt; static struct mem_ctl_info *mci_lookup[MAX_NUMNODES]; static struct amd64_pvt *pvt_lookup[MAX_NUMNODES]; +/* + * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only + * for DDR2 DRAM mapping. + */ +u32 revf_quad_ddr2_shift[] = { + 0, /* 0000b NULL DIMM (128mb) */ + 28, /* 0001b 256mb */ + 29, /* 0010b 512mb */ + 29, /* 0011b 512mb */ + 29, /* 0100b 512mb */ + 30, /* 0101b 1gb */ + 30, /* 0110b 1gb */ + 31, /* 0111b 2gb */ + 31, /* 1000b 2gb */ + 32, /* 1001b 4gb */ + 32, /* 1010b 4gb */ + 33, /* 1011b 8gb */ + 0, /* 1100b future */ + 0, /* 1101b future */ + 0, /* 1110b future */ + 0 /* 1111b future */ +}; + +/* + * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing + * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching- + * or higher value'. + * + *FIXME: Produce a better mapping/linearisation. 
+ */ + +struct scrubrate scrubrates[] = { + { 0x01, 1600000000UL}, + { 0x02, 800000000UL}, + { 0x03, 400000000UL}, + { 0x04, 200000000UL}, + { 0x05, 100000000UL}, + { 0x06, 50000000UL}, + { 0x07, 25000000UL}, + { 0x08, 12284069UL}, + { 0x09, 6274509UL}, + { 0x0A, 3121951UL}, + { 0x0B, 1560975UL}, + { 0x0C, 781440UL}, + { 0x0D, 390720UL}, + { 0x0E, 195300UL}, + { 0x0F, 97650UL}, + { 0x10, 48854UL}, + { 0x11, 24427UL}, + { 0x12, 12213UL}, + { 0x13, 6101UL}, + { 0x14, 3051UL}, + { 0x15, 1523UL}, + { 0x16, 761UL}, + { 0x00, 0UL}, /* scrubbing off */ +}; + /* * Memory scrubber control interface. For K8, memory scrubbing is handled by * hardware and can involve L2 cache, dcache as well as the main memory. With @@ -1101,8 +1158,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u32 page, offset; /* Extract the syndrome parts and form a 16-bit syndrome */ - syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8; - syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh); + syndrome = HIGH_SYNDROME(info->nbsl) << 8; + syndrome |= LOW_SYNDROME(info->nbsh); /* CHIPKILL enabled */ if (info->nbcfg & K8_NBCFG_CHIPKILL) { @@ -1701,8 +1758,8 @@ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci, if (csrow >= 0) { error_address_to_page_and_offset(sys_addr, &page, &offset); - syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8; - syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh); + syndrome = HIGH_SYNDROME(info->nbsl) << 8; + syndrome |= LOW_SYNDROME(info->nbsh); /* * Is CHIPKILL on? 
If so, then we can attempt to use the @@ -2155,36 +2212,22 @@ static int amd64_get_error_info(struct mem_ctl_info *mci, static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, struct amd64_error_info_regs *info) { - u32 err_code; - u32 ec_tt; /* error code transaction type (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - err_code = EXTRACT_ERROR_CODE(info->nbsl); - ec_ll = EXTRACT_LL_CODE(err_code); - ec_tt = EXTRACT_TT_CODE(err_code); + u32 ec = ERROR_CODE(info->nbsl); amd64_mc_printk(mci, KERN_ERR, "GART TLB event: transaction type(%s), " - "cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]); + "cache level(%s)\n", TT_MSG(ec), LL_MSG(ec)); } static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, struct amd64_error_info_regs *info) { - u32 err_code; - u32 ec_rrrr; /* error code memory transaction (4b) */ - u32 ec_tt; /* error code transaction type (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - err_code = EXTRACT_ERROR_CODE(info->nbsl); - ec_ll = EXTRACT_LL_CODE(err_code); - ec_tt = EXTRACT_TT_CODE(err_code); - ec_rrrr = EXTRACT_RRRR_CODE(err_code); + u32 ec = ERROR_CODE(info->nbsl); amd64_mc_printk(mci, KERN_ERR, "cache hierarchy error: memory transaction type(%s), " "transaction type(%s), cache level(%s)\n", - rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]); + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } @@ -2264,21 +2307,8 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, static void amd64_decode_bus_error(struct mem_ctl_info *mci, struct amd64_error_info_regs *info) { - u32 err_code, ext_ec; - u32 ec_pp; /* error code participating processor (2p) */ - u32 ec_to; /* error code timed out (1b) */ - u32 ec_rrrr; /* error code memory transaction (4b) */ - u32 ec_ii; /* error code memory or I/O (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl); - err_code = EXTRACT_ERROR_CODE(info->nbsl); - - ec_ll = EXTRACT_LL_CODE(err_code); - ec_ii = 
EXTRACT_II_CODE(err_code); - ec_rrrr = EXTRACT_RRRR_CODE(err_code); - ec_to = EXTRACT_TO_CODE(err_code); - ec_pp = EXTRACT_PP_CODE(err_code); + u32 ec = ERROR_CODE(info->nbsl); + u32 xec = EXT_ERROR_CODE(info->nbsl); amd64_mc_printk(mci, KERN_ERR, "BUS ERROR:\n" @@ -2286,20 +2316,17 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, " participating processor(%s)\n" " memory transaction type(%s)\n" " cache level(%s) Error Found by: %s\n", - to_msgs[ec_to], - ii_msgs[ec_ii], - pp_msgs[ec_pp], - rrrr_msgs[ec_rrrr], - ll_msgs[ec_ll], + TO_MSG(ec), II_MSG(ec), PP_MSG(ec), RRRR_MSG(ec), LL_MSG(ec), (info->nbsh & K8_NBSH_ERR_SCRUBER) ? "Scrubber" : "Normal Operation"); - /* If this was an 'observed' error, early out */ - if (ec_pp == K8_NBSL_PP_OBS) - return; /* We aren't the node involved */ + + /* Bail early out if this was an 'observed' error */ + if (PP(ec) == K8_NBSL_PP_OBS) + return; /* Parse out the extended error code for ECC events */ - switch (ext_ec) { + switch (xec) { /* F10 changed to one Extended ECC error code */ case F10_NBSL_EXT_ERR_RES: /* Reserved field */ case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */ @@ -2379,7 +2406,7 @@ int amd64_process_error_info(struct mem_ctl_info *mci, (regs->nbsh & K8_NBSH_CORE3) ? "True" : "False"); - err_code = EXTRACT_ERROR_CODE(regs->nbsl); + err_code = ERROR_CODE(regs->nbsl); /* Determine which error type: * 1) GART errors - non-fatal, developmental events @@ -2387,7 +2414,7 @@ int amd64_process_error_info(struct mem_ctl_info *mci, * 3) BUS errors * 4) Unknown error */ - if (TEST_TLB_ERROR(err_code)) { + if (TLB_ERROR(err_code)) { /* * GART errors are intended to help graphics driver developers * to detect bad GART PTEs. 
It is recommended by AMD to disable @@ -2411,10 +2438,10 @@ int amd64_process_error_info(struct mem_ctl_info *mci, debugf1("GART TLB error\n"); amd64_decode_gart_tlb_error(mci, info); - } else if (TEST_MEM_ERROR(err_code)) { + } else if (MEM_ERROR(err_code)) { debugf1("Memory/Cache error\n"); amd64_decode_mem_cache_error(mci, info); - } else if (TEST_BUS_ERROR(err_code)) { + } else if (BUS_ERROR(err_code)) { debugf1("Bus (Link/DRAM) error\n"); amd64_decode_bus_error(mci, info); } else { @@ -2424,21 +2451,10 @@ int amd64_process_error_info(struct mem_ctl_info *mci, err_code); } - ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl); + ext_ec = EXT_ERROR_CODE(regs->nbsl); amd64_mc_printk(mci, KERN_ERR, "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]); - if (((ext_ec >= F10_NBSL_EXT_ERR_CRC && - ext_ec <= F10_NBSL_EXT_ERR_TGT) || - (ext_ec == F10_NBSL_EXT_ERR_RMW)) && - EXTRACT_LDT_LINK(info->nbsh)) { - - amd64_mc_printk(mci, KERN_ERR, - "Error on hypertransport link: %s\n", - htlink_msgs[ - EXTRACT_LDT_LINK(info->nbsh)]); - } - /* * Check the UE bit of the NB status high register, if set generate some * logs. If NOT a GART error, then process the event as a NO-INFO event. -- cgit v1.2.3 From ef44cc4c2245d3c43f3c11c7bff6239852eef498 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 23 Jul 2009 14:45:48 +0200 Subject: amd64_edac: cleanup amd64_process_error_info * mv amd64_error_info_regs -> err_regs * remove redundant info ptr Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 44 ++++++++++++++++++++------------------------ drivers/edac/amd64_edac.h | 10 +++++----- 2 files changed, 25 insertions(+), 29 deletions(-) (limited to 'drivers/edac/amd64_edac.c') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index b9e84bc91766..c9b88d829701 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -750,7 +750,7 @@ static void find_csrow_limits(struct mem_ctl_info *mci, int csrow, * specific. 
*/ static u64 extract_error_address(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { struct amd64_pvt *pvt = mci->pvt_info; @@ -1106,7 +1106,7 @@ static int k8_early_channel_count(struct amd64_pvt *pvt) /* extract the ERROR ADDRESS for the K8 CPUs */ static u64 k8_get_error_address(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { return (((u64) (info->nbeah & 0xff)) << 32) + (info->nbeal & ~0x03); @@ -1149,7 +1149,7 @@ static void k8_read_dram_base_limit(struct amd64_pvt *pvt, int dram) } static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, u64 SystemAddress) { struct mem_ctl_info *src_mci; @@ -1368,7 +1368,7 @@ static void amd64_teardown(struct amd64_pvt *pvt) } static u64 f10_get_error_address(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { return (((u64) (info->nbeah & 0xffff)) << 32) + (info->nbeal & ~0x01); @@ -1745,7 +1745,7 @@ static int f10_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr, * The @sys_addr is usually an error address received from the hardware. 
*/ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, u64 sys_addr) { struct amd64_pvt *pvt = mci->pvt_info; @@ -2102,7 +2102,7 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome) * - 0: if no valid error is indicated */ static int amd64_get_error_info_regs(struct mem_ctl_info *mci, - struct amd64_error_info_regs *regs) + struct err_regs *regs) { struct amd64_pvt *pvt; struct pci_dev *misc_f3_ctl; @@ -2151,10 +2151,10 @@ err_reg: * - 0: if no error is found */ static int amd64_get_error_info(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { struct amd64_pvt *pvt; - struct amd64_error_info_regs regs; + struct err_regs regs; pvt = mci->pvt_info; @@ -2210,7 +2210,7 @@ static int amd64_get_error_info(struct mem_ctl_info *mci, } static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { u32 ec = ERROR_CODE(info->nbsl); @@ -2220,7 +2220,7 @@ static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, } static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { u32 ec = ERROR_CODE(info->nbsl); @@ -2236,7 +2236,7 @@ static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, * ADDRESS and process. 
*/ static void amd64_handle_ce(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { struct amd64_pvt *pvt = mci->pvt_info; u64 SystemAddress; @@ -2259,7 +2259,7 @@ static void amd64_handle_ce(struct mem_ctl_info *mci, /* Handle any Un-correctable Errors (UEs) */ static void amd64_handle_ue(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { int csrow; u64 SystemAddress; @@ -2305,7 +2305,7 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, } static void amd64_decode_bus_error(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { u32 ec = ERROR_CODE(info->nbsl); u32 xec = EXT_ERROR_CODE(info->nbsl); @@ -2356,22 +2356,18 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, } int amd64_process_error_info(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *regs, int handle_errors) { struct amd64_pvt *pvt; - struct amd64_error_info_regs *regs; u32 err_code, ext_ec; int gart_tlb_error = 0; pvt = mci->pvt_info; - /* If caller doesn't want us to process the error, return */ if (!handle_errors) return 1; - regs = info; - debugf1("NorthBridge ERROR: mci(0x%p)\n", mci); debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n", pvt->mc_node_id, regs->nbeah, regs->nbeal); @@ -2437,13 +2433,13 @@ int amd64_process_error_info(struct mem_ctl_info *mci, gart_tlb_error = 1; debugf1("GART TLB error\n"); - amd64_decode_gart_tlb_error(mci, info); + amd64_decode_gart_tlb_error(mci, regs); } else if (MEM_ERROR(err_code)) { debugf1("Memory/Cache error\n"); - amd64_decode_mem_cache_error(mci, info); + amd64_decode_mem_cache_error(mci, regs); } else if (BUS_ERROR(err_code)) { debugf1("Bus (Link/DRAM) error\n"); - amd64_decode_bus_error(mci, info); + amd64_decode_bus_error(mci, regs); } else { /* shouldn't reach here! 
*/ amd64_mc_printk(mci, KERN_WARNING, @@ -2480,10 +2476,10 @@ EXPORT_SYMBOL_GPL(amd64_process_error_info); */ static void amd64_check(struct mem_ctl_info *mci) { - struct amd64_error_info_regs info; + struct err_regs regs; - if (amd64_get_error_info(mci, &info)) - amd64_process_error_info(mci, &info, 1); + if (amd64_get_error_info(mci, &regs)) + amd64_process_error_info(mci, &regs, 1); } /* diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 1ddef8d15d52..bde8f78551f9 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -449,7 +449,7 @@ enum amd64_chipset_families { * * Depends on entry into the modules */ -struct amd64_error_info_regs { +struct err_regs { u32 nbcfg; u32 nbsh; u32 nbsl; @@ -527,7 +527,7 @@ struct amd64_pvt { u32 online_spare; /* On-Line spare Reg */ /* temp storage for when input is received from sysfs */ - struct amd64_error_info_regs ctl_error_info; + struct err_regs ctl_error_info; /* place to store error injection parameters prior to issue */ struct error_injection injection; @@ -586,11 +586,11 @@ struct low_ops { int (*early_channel_count)(struct amd64_pvt *pvt); u64 (*get_error_address)(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info); + struct err_regs *info); void (*read_dram_base_limit)(struct amd64_pvt *pvt, int dram); void (*read_dram_ctl_register)(struct amd64_pvt *pvt); void (*map_sysaddr_to_csrow)(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, u64 SystemAddr); int (*dbam_map_to_pages)(struct amd64_pvt *pvt, int dram_map); }; @@ -623,7 +623,7 @@ static inline struct low_ops *family_ops(int index) #define F11_MIN_SCRUB_RATE_BITS 0x6 int amd64_process_error_info(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, int handle_errors); int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size); -- cgit v1.2.3 From 5110dbdeab546268dda2e4c6a83448639b2fc5ae Mon Sep 17
00:00:00 2001 From: Borislav Petkov Date: Thu, 25 Jun 2009 19:51:04 +0200 Subject: amd64_edac: cleanup/complete NB MCE decoding * don't dump info which mcheck already does * update to newest BKDG * mv amd64_process_error_info -> amd64_decode_nb_mce * shorten error struct names * remove redundant info ptr in amd64_process_error_info * remove unused ErrorCodeExt[19:16] (MCx_STATUS) defines Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 125 ++++++++++++++++-------------------------- drivers/edac/amd64_edac.h | 26 +++------ drivers/edac/amd64_edac_dbg.c | 2 +- drivers/edac/edac_mce_amd.h | 2 + 4 files changed, 56 insertions(+), 99 deletions(-) (limited to 'drivers/edac/amd64_edac.c') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index c9b88d829701..5af87d44c80c 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2355,62 +2355,47 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, "Error Overflow set"); } -int amd64_process_error_info(struct mem_ctl_info *mci, - struct err_regs *regs, - int handle_errors) +void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, + int handle_errors) { - struct amd64_pvt *pvt; - u32 err_code, ext_ec; - int gart_tlb_error = 0; - - pvt = mci->pvt_info; + struct amd64_pvt *pvt = mci->pvt_info; + int ecc; + u32 ec = ERROR_CODE(regs->nbsl); + u32 xec = EXT_ERROR_CODE(regs->nbsl); if (!handle_errors) - return 1; + return; - debugf1("NorthBridge ERROR: mci(0x%p)\n", mci); - debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n", - pvt->mc_node_id, regs->nbeah, regs->nbeal); - debugf1(" nbsh(0x%.8x) nbsl(0x%.8x)\n", - regs->nbsh, regs->nbsl); - debugf1(" Valid Error=%s Overflow=%s\n", - (regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False", - (regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False"); - debugf1(" Err Uncorrected=%s MCA Error Reporting=%s\n", - (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ? - "True" : "False", - (regs->nbsh & K8_NBSH_ERR_ENABLE) ? 
- "True" : "False"); - debugf1(" MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n", - (regs->nbsh & K8_NBSH_MISC_ERR_VALID) ? - "True" : "False", - (regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ? - "True" : "False", - (regs->nbsh & K8_NBSH_PCC) ? - "True" : "False"); - debugf1(" CECC=%s UECC=%s Found by Scruber=%s\n", - (regs->nbsh & K8_NBSH_CECC) ? - "True" : "False", - (regs->nbsh & K8_NBSH_UECC) ? - "True" : "False", - (regs->nbsh & K8_NBSH_ERR_SCRUBER) ? - "True" : "False"); - debugf1(" CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n", - (regs->nbsh & K8_NBSH_CORE0) ? "True" : "False", - (regs->nbsh & K8_NBSH_CORE1) ? "True" : "False", - (regs->nbsh & K8_NBSH_CORE2) ? "True" : "False", - (regs->nbsh & K8_NBSH_CORE3) ? "True" : "False"); + pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id); + /* + * F10h, revD can disable ErrCpu[3:0] so check that first and also the + * value encoding has changed so interpret those differently + */ + if ((boot_cpu_data.x86 == 0x10) && + (boot_cpu_data.x86_model > 8)) { + if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) + pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); + } else { + pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); + } - err_code = ERROR_CODE(regs->nbsl); + pr_emerg(" Error: %sorrected", + ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C")); + pr_cont(", Report Error: %s", + ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no")); + pr_cont(", MiscV: %svalid, CPU context corrupt: %s", + ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"), + ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no")); - /* Determine which error type: - * 1) GART errors - non-fatal, developmental events - * 2) MEMORY errors - * 3) BUS errors - * 4) Unknown error - */ - if (TLB_ERROR(err_code)) { + /* do the two bits[14:13] together */ + ecc = regs->nbsh & (0x3 << 13); + if (ecc) + pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); + + pr_cont("\n"); + + if (TLB_ERROR(ec)) { /* * GART errors are intended to help graphics driver developers * to detect bad GART PTEs. 
It is recommended by AMD to disable @@ -2423,52 +2408,34 @@ int amd64_process_error_info(struct mem_ctl_info *mci, * [1] section 13.10.1 on BIOS and Kernel Developers Guide for * AMD NPT family 0Fh processors */ - if (report_gart_errors == 0) - return 1; - - /* - * Only if GART error reporting is requested should we generate - * any logs. - */ - gart_tlb_error = 1; + if (!report_gart_errors) + return; - debugf1("GART TLB error\n"); + pr_emerg("GART TLB error\n"); amd64_decode_gart_tlb_error(mci, regs); - } else if (MEM_ERROR(err_code)) { - debugf1("Memory/Cache error\n"); + } else if (MEM_ERROR(ec)) { + pr_emerg("Memory/Cache error\n"); amd64_decode_mem_cache_error(mci, regs); - } else if (BUS_ERROR(err_code)) { - debugf1("Bus (Link/DRAM) error\n"); + } else if (BUS_ERROR(ec)) { + pr_emerg("Bus (Link/DRAM) error\n"); amd64_decode_bus_error(mci, regs); } else { /* shouldn't reach here! */ amd64_mc_printk(mci, KERN_WARNING, "%s(): unknown MCE error 0x%x\n", __func__, - err_code); + ec); } - ext_ec = EXT_ERROR_CODE(regs->nbsl); - amd64_mc_printk(mci, KERN_ERR, - "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]); + pr_emerg("%s.\n", EXT_ERR_MSG(xec)); /* * Check the UE bit of the NB status high register, if set generate some * logs. If NOT a GART error, then process the event as a NO-INFO event. * If it was a GART error, skip that process. 
*/ - if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) { - amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n"); - if (!gart_tlb_error) - edac_mc_handle_ue_no_info(mci, "UE bit is set\n"); - } - - if (regs->nbsh & K8_NBSH_PCC) - amd64_mc_printk(mci, KERN_CRIT, - "PCC (processor context corrupt) set\n"); - - return 1; + if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) + edac_mc_handle_ue_no_info(mci, "UE bit is set"); } -EXPORT_SYMBOL_GPL(amd64_process_error_info); /* * The main polling 'check' function, called FROM the edac core to perform the @@ -2479,7 +2446,7 @@ static void amd64_check(struct mem_ctl_info *mci) struct err_regs regs; if (amd64_get_error_info(mci, &regs)) - amd64_process_error_info(mci, &regs, 1); + amd64_decode_nb_mce(mci, &regs, 1); } /* diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index bde8f78551f9..ecab0c9fd14e 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -306,16 +306,7 @@ enum { /* Family F10h: Normalized Extended Error Codes */ #define F10_NBSL_EXT_ERR_RES 0x0 -#define F10_NBSL_EXT_ERR_CRC 0x1 -#define F10_NBSL_EXT_ERR_SYNC 0x2 -#define F10_NBSL_EXT_ERR_MST 0x3 -#define F10_NBSL_EXT_ERR_TGT 0x4 -#define F10_NBSL_EXT_ERR_GART 0x5 -#define F10_NBSL_EXT_ERR_RMW 0x6 -#define F10_NBSL_EXT_ERR_WDT 0x7 #define F10_NBSL_EXT_ERR_ECC 0x8 -#define F10_NBSL_EXT_ERR_DEV 0x9 -#define F10_NBSL_EXT_ERR_LINK_DATA 0xA /* Next two are overloaded values */ #define F10_NBSL_EXT_ERR_LINK_PROTO 0xB @@ -360,18 +351,15 @@ enum { #define K8_NBSH_VALID_BIT BIT(31) #define K8_NBSH_OVERFLOW BIT(30) -#define K8_NBSH_UNCORRECTED_ERR BIT(29) -#define K8_NBSH_ERR_ENABLE BIT(28) -#define K8_NBSH_MISC_ERR_VALID BIT(27) +#define K8_NBSH_UC_ERR BIT(29) +#define K8_NBSH_ERR_EN BIT(28) +#define K8_NBSH_MISCV BIT(27) #define K8_NBSH_VALID_ERROR_ADDR BIT(26) #define K8_NBSH_PCC BIT(25) +#define K8_NBSH_ERR_CPU_VAL BIT(24) #define K8_NBSH_CECC BIT(14) #define K8_NBSH_UECC BIT(13) #define K8_NBSH_ERR_SCRUBER BIT(8) -#define K8_NBSH_CORE3
BIT(3) -#define K8_NBSH_CORE2 BIT(2) -#define K8_NBSH_CORE1 BIT(1) -#define K8_NBSH_CORE0 BIT(0) #define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) @@ -622,8 +610,8 @@ static inline struct low_ops *family_ops(int index) #define F10_MIN_SCRUB_RATE_BITS 0x5 #define F11_MIN_SCRUB_RATE_BITS 0x6 -int amd64_process_error_info(struct mem_ctl_info *mci, - struct err_regs *info, - int handle_errors); +void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info, + int handle_errors); + int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size); diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c index 0a41b248a4ad..bcb4e2eba3dc 100644 --- a/drivers/edac/amd64_edac_dbg.c +++ b/drivers/edac/amd64_edac_dbg.c @@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data, /* Process the Mapping request */ /* TODO: Add race prevention */ - amd64_process_error_info(mci, &pvt->ctl_error_info, 1); + amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1); return count; } diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index 81f9dcf9990a..39971cdabb51 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -1,5 +1,7 @@ #define ERROR_CODE(x) ((x) & 0xffff) #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) +#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] + #define LOW_SYNDROME(x) (((x) >> 15) & 0xff) #define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) -- cgit v1.2.3 From b7225e4fc19ce27a594cb2b868ef151bf82f8f93 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 23 Jul 2009 16:05:53 +0200 Subject: amd64_edac: remove memory and GART TLB error decoders Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) (limited to 'drivers/edac/amd64_edac.c') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 5af87d44c80c..75842f08db83 100644 
--- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2209,28 +2209,6 @@ static int amd64_get_error_info(struct mem_ctl_info *mci, return 1; } -static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, - struct err_regs *info) -{ - u32 ec = ERROR_CODE(info->nbsl); - - amd64_mc_printk(mci, KERN_ERR, - "GART TLB event: transaction type(%s), " - "cache level(%s)\n", TT_MSG(ec), LL_MSG(ec)); -} - -static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, - struct err_regs *info) -{ - u32 ec = ERROR_CODE(info->nbsl); - - amd64_mc_printk(mci, KERN_ERR, - "cache hierarchy error: memory transaction type(%s), " - "transaction type(%s), cache level(%s)\n", - RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); -} - - /* * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR * ADDRESS and process. @@ -2411,19 +2389,19 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, if (!report_gart_errors) return; - pr_emerg("GART TLB error\n"); - amd64_decode_gart_tlb_error(mci, regs); + pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n", + TT_MSG(ec), LL_MSG(ec)); } else if (MEM_ERROR(ec)) { - pr_emerg("Memory/Cache error\n"); - amd64_decode_mem_cache_error(mci, regs); + pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s," + " Cache Level: %s", + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } else if (BUS_ERROR(ec)) { - pr_emerg("Bus (Link/DRAM) error\n"); + pr_emerg(" Bus (Link/DRAM) error\n"); amd64_decode_bus_error(mci, regs); } else { /* shouldn't reach here! 
*/ amd64_mc_printk(mci, KERN_WARNING, - "%s(): unknown MCE error 0x%x\n", __func__, - ec); + "%s(): unknown MCE error 0x%x\n", __func__, ec); } pr_emerg("%s.\n", EXT_ERR_MSG(xec)); -- cgit v1.2.3 From ecaf5606de65cdd04de5f526185fe28fb0df654e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 23 Jul 2009 16:32:01 +0200 Subject: amd64_edac: cleanup amd64_decode_bus_error Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) (limited to 'drivers/edac/amd64_edac.c') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 75842f08db83..82f48ee90f11 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2283,42 +2283,26 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, } static void amd64_decode_bus_error(struct mem_ctl_info *mci, - struct err_regs *info) + struct err_regs *info, int ecc_type) { u32 ec = ERROR_CODE(info->nbsl); u32 xec = EXT_ERROR_CODE(info->nbsl); - amd64_mc_printk(mci, KERN_ERR, - "BUS ERROR:\n" - " time-out(%s) mem or i/o(%s)\n" - " participating processor(%s)\n" - " memory transaction type(%s)\n" - " cache level(%s) Error Found by: %s\n", - TO_MSG(ec), II_MSG(ec), PP_MSG(ec), RRRR_MSG(ec), LL_MSG(ec), - (info->nbsh & K8_NBSH_ERR_SCRUBER) ? 
- "Scrubber" : "Normal Operation"); + pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, %s\n", + RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), PP_MSG(ec)); /* Bail early out if this was an 'observed' error */ if (PP(ec) == K8_NBSL_PP_OBS) return; - /* Parse out the extended error code for ECC events */ - switch (xec) { - /* F10 changed to one Extended ECC error code */ - case F10_NBSL_EXT_ERR_RES: /* Reserved field */ - case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */ - break; - - default: - amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error " - "handling for this error\n"); + /* Do only ECC errors */ + if (xec && xec != F10_NBSL_EXT_ERR_ECC) return; - } - if (info->nbsh & K8_NBSH_CECC) + if (ecc_type == 2) amd64_handle_ce(mci, info); - else if (info->nbsh & K8_NBSH_UECC) + else if (ecc_type == 1) amd64_handle_ue(mci, info); /* @@ -2329,8 +2313,7 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, * catastrophic. */ if (info->nbsh & K8_NBSH_OVERFLOW) - edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR - "Error Overflow set"); + edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow"); } void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, @@ -2397,7 +2380,7 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } else if (BUS_ERROR(ec)) { pr_emerg(" Bus (Link/DRAM) error\n"); - amd64_decode_bus_error(mci, regs); + amd64_decode_bus_error(mci, regs, ecc); } else { /* shouldn't reach here! */ amd64_mc_printk(mci, KERN_WARNING, -- cgit v1.2.3 From 549d042df240dfb4203bab40ad44f9336751b7d6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 24 Jul 2009 13:51:42 +0200 Subject: x86, mce: pass mce info to EDAC for decoding Move NB decoder along with required defines to EDAC MCE core. Add registration routines for further decoding of the MCE info in the AMD64 EDAC module. 
CC: Andi Kleen Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 7 +++ drivers/edac/amd64_edac.c | 98 ++++++++------------------------- drivers/edac/amd64_edac.h | 36 ------------ drivers/edac/amd64_edac_dbg.c | 2 +- drivers/edac/edac_mce_amd.c | 115 +++++++++++++++++++++++++++++++++++++++ drivers/edac/edac_mce_amd.h | 38 +++++++++++++ 6 files changed, 185 insertions(+), 111 deletions(-) (limited to 'drivers/edac/amd64_edac.c') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 01213048f62f..b82866f6adf5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -183,6 +183,11 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +void __weak decode_mce(struct mce *m) +{ + return; +} + static void print_mce(struct mce *m) { printk(KERN_EMERG @@ -205,6 +210,8 @@ static void print_mce(struct mce *m) printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + + decode_mce(m); } static void print_mce_head(void) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 82f48ee90f11..2080b1e2e8a2 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2282,8 +2282,8 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, } } -static void amd64_decode_bus_error(struct mem_ctl_info *mci, - struct err_regs *info, int ecc_type) +static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, + struct err_regs *info, int ecc_type) { u32 ec = ERROR_CODE(info->nbsl); u32 xec = EXT_ERROR_CODE(info->nbsl); @@ -2316,86 +2316,23 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow"); } -void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, - int handle_errors) +void amd64_decode_bus_error(int node_id, struct err_regs *regs, + int ecc_type) { - struct amd64_pvt *pvt = mci->pvt_info; - int ecc; 
- u32 ec = ERROR_CODE(regs->nbsl); - u32 xec = EXT_ERROR_CODE(regs->nbsl); - - if (!handle_errors) - return; - - pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id); - - /* - * F10h, revD can disable ErrCpu[3:0] so check that first and also the - * value encoding has changed so interpret those differently - */ - if ((boot_cpu_data.x86 == 0x10) && - (boot_cpu_data.x86_model > 8)) { - if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) - pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); - } else { - pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); - } - - pr_emerg(" Error: %sorrected", - ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C")); - pr_cont(", Report Error: %s", - ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no")); - pr_cont(", MiscV: %svalid, CPU context corrupt: %s", - ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"), - ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no")); - - /* do the two bits[14:13] together */ - ecc = regs->nbsh & (0x3 << 13); - if (ecc) - pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); - - pr_cont("\n"); - - if (TLB_ERROR(ec)) { - /* - * GART errors are intended to help graphics driver developers - * to detect bad GART PTEs. It is recommended by AMD to disable - * GART table walk error reporting by default[1] (currently - * being disabled in mce_cpu_quirks()) and according to the - * comment in mce_cpu_quirks(), such GART errors can be - * incorrectly triggered. We may see these errors anyway and - * unless requested by the user, they won't be reported. 
- * - * [1] section 13.10.1 on BIOS and Kernel Developers Guide for - * AMD NPT family 0Fh processors - */ - if (!report_gart_errors) - return; - - pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n", - TT_MSG(ec), LL_MSG(ec)); - } else if (MEM_ERROR(ec)) { - pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s," - " Cache Level: %s", - RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); - } else if (BUS_ERROR(ec)) { - pr_emerg(" Bus (Link/DRAM) error\n"); - amd64_decode_bus_error(mci, regs, ecc); - } else { - /* shouldn't reach here! */ - amd64_mc_printk(mci, KERN_WARNING, - "%s(): unknown MCE error 0x%x\n", __func__, ec); - } + struct mem_ctl_info *mci = mci_lookup[node_id]; - pr_emerg("%s.\n", EXT_ERR_MSG(xec)); + __amd64_decode_bus_error(mci, regs, ecc_type); /* * Check the UE bit of the NB status high register, if set generate some * logs. If NOT a GART error, then process the event as a NO-INFO event. * If it was a GART error, skip that process. + * + * FIXME: this should go somewhere else, if at all. 
*/ if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) edac_mc_handle_ue_no_info(mci, "UE bit is set"); + } /* @@ -2406,8 +2343,10 @@ static void amd64_check(struct mem_ctl_info *mci) { struct err_regs regs; - if (amd64_get_error_info(mci, &regs)) - amd64_decode_nb_mce(mci, &regs, 1); + if (amd64_get_error_info(mci, &regs)) { + struct amd64_pvt *pvt = mci->pvt_info; + amd_decode_nb_mce(pvt->mc_node_id, &regs, 1); + } } /* @@ -3103,6 +3042,13 @@ static int amd64_init_2nd_stage(struct amd64_pvt *pvt) mci_lookup[node_id] = mci; pvt_lookup[node_id] = NULL; + + /* register stuff with EDAC MCE */ + if (report_gart_errors) + amd_report_gart_errors(true); + + amd_register_ecc_decoder(amd64_decode_bus_error); + return 0; err_add_mc: @@ -3169,6 +3115,10 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev) mci_lookup[pvt->mc_node_id] = NULL; + /* unregister from EDAC MCE */ + amd_report_gart_errors(false); + amd_unregister_ecc_decoder(amd64_decode_bus_error); + /* Free the EDAC CORE resources */ edac_mc_free(mci); } diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index ecab0c9fd14e..8ea07e2715dc 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -346,24 +346,8 @@ enum { #define K8_NBSL_PP_OBS 0x2 #define K8_NBSL_PP_GENERIC 0x3 - -#define K8_NBSH 0x4C - -#define K8_NBSH_VALID_BIT BIT(31) -#define K8_NBSH_OVERFLOW BIT(30) -#define K8_NBSH_UC_ERR BIT(29) -#define K8_NBSH_ERR_EN BIT(28) -#define K8_NBSH_MISCV BIT(27) -#define K8_NBSH_VALID_ERROR_ADDR BIT(26) -#define K8_NBSH_PCC BIT(25) -#define K8_NBSH_ERR_CPU_VAL BIT(24) -#define K8_NBSH_CECC BIT(14) -#define K8_NBSH_UECC BIT(13) -#define K8_NBSH_ERR_SCRUBER BIT(8) - #define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) - #define K8_NBEAL 0x50 #define K8_NBEAH 0x54 #define K8_SCRCTRL 0x58 @@ -428,23 +412,6 @@ enum amd64_chipset_families { F11_CPUS, }; -/* - * Structure to hold: - * - * 1) dynamically read status and error address HW registers - * 2) sysfs entered values - * 3) MCE 
values - * - * Depends on entry into the modules - */ -struct err_regs { - u32 nbcfg; - u32 nbsh; - u32 nbsl; - u32 nbeah; - u32 nbeal; -}; - /* Error injection control structure */ struct error_injection { u32 section; @@ -610,8 +577,5 @@ static inline struct low_ops *family_ops(int index) #define F10_MIN_SCRUB_RATE_BITS 0x5 #define F11_MIN_SCRUB_RATE_BITS 0x6 -void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info, - int handle_errors); - int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size); diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c index bcb4e2eba3dc..59cf2cf6e11e 100644 --- a/drivers/edac/amd64_edac_dbg.c +++ b/drivers/edac/amd64_edac_dbg.c @@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data, /* Process the Mapping request */ /* TODO: Add race prevention */ - amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1); + amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1); return count; } diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 918567e8cfd5..444c2cc4472d 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -1,6 +1,31 @@ #include #include "edac_mce_amd.h" +static bool report_gart_errors; +static void (*nb_bus_decoder)(int node_id, struct err_regs *regs, int ecc_type); + +void amd_report_gart_errors(bool v) +{ + report_gart_errors = v; +} +EXPORT_SYMBOL_GPL(amd_report_gart_errors); + +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int)) +{ + nb_bus_decoder = f; +} +EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); + +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int)) +{ + if (nb_bus_decoder) { + WARN_ON(nb_bus_decoder != f); + + nb_bus_decoder = NULL; + } +} +EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); + /* * string representation for the different MCA reported error types, see F3x48 * or MSR0000_0411. 
@@ -102,3 +127,93 @@ const char *ext_msgs[] = { "Probe Filter error" /* 1_1111b */ }; EXPORT_SYMBOL_GPL(ext_msgs); + +void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) +{ + int ecc; + u32 ec = ERROR_CODE(regs->nbsl); + u32 xec = EXT_ERROR_CODE(regs->nbsl); + + if (!handle_errors) + return; + + pr_emerg(" Northbridge Error, node %d", node_id); + + /* + * F10h, revD can disable ErrCpu[3:0] so check that first and also the + * value encoding has changed so interpret those differently + */ + if ((boot_cpu_data.x86 == 0x10) && + (boot_cpu_data.x86_model > 8)) { + if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) + pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); + } else { + pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); + } + + pr_emerg(" Error: %sorrected", + ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C")); + pr_cont(", Report Error: %s", + ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no")); + pr_cont(", MiscV: %svalid, CPU context corrupt: %s", + ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"), + ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no")); + + /* do the two bits[14:13] together */ + ecc = regs->nbsh & (0x3 << 13); + if (ecc) + pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); + + pr_cont("\n"); + + if (TLB_ERROR(ec)) { + /* + * GART errors are intended to help graphics driver developers + * to detect bad GART PTEs. It is recommended by AMD to disable + * GART table walk error reporting by default[1] (currently + * being disabled in mce_cpu_quirks()) and according to the + * comment in mce_cpu_quirks(), such GART errors can be + * incorrectly triggered. We may see these errors anyway and + * unless requested by the user, they won't be reported. 
+ * + * [1] section 13.10.1 on BIOS and Kernel Developers Guide for + * AMD NPT family 0Fh processors + */ + if (!report_gart_errors) + return; + + pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n", + TT_MSG(ec), LL_MSG(ec)); + } else if (MEM_ERROR(ec)) { + pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s," + " Cache Level: %s", + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); + } else if (BUS_ERROR(ec)) { + pr_emerg(" Bus (Link/DRAM) error\n"); + if (nb_bus_decoder) + nb_bus_decoder(node_id, regs, ecc); + } else { + /* shouldn't reach here! */ + pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec); + } + + pr_emerg("%s.\n", EXT_ERR_MSG(xec)); +} +EXPORT_SYMBOL_GPL(amd_decode_nb_mce); + +void decode_mce(struct mce *m) +{ + struct err_regs regs; + int node; + + if (m->bank != 4) + return; + + regs.nbsl = (u32) m->status; + regs.nbsh = (u32)(m->status >> 32); + regs.nbeal = (u32) m->addr; + regs.nbeah = (u32)(m->addr >> 32); + node = topology_cpu_node_id(m->extcpu); + + amd_decode_nb_mce(node, &regs, 1); +} diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index 39971cdabb51..9114dc62782b 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -1,3 +1,8 @@ +#ifndef _EDAC_MCE_AMD_H +#define _EDAC_MCE_AMD_H + +#include + #define ERROR_CODE(x) ((x) & 0xffff) #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) #define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] @@ -22,6 +27,20 @@ #define PP(x) (((x) >> 9) & 0x3) #define PP_MSG(x) pp_msgs[PP(x)] +#define K8_NBSH 0x4C + +#define K8_NBSH_VALID_BIT BIT(31) +#define K8_NBSH_OVERFLOW BIT(30) +#define K8_NBSH_UC_ERR BIT(29) +#define K8_NBSH_ERR_EN BIT(28) +#define K8_NBSH_MISCV BIT(27) +#define K8_NBSH_VALID_ERROR_ADDR BIT(26) +#define K8_NBSH_PCC BIT(25) +#define K8_NBSH_ERR_CPU_VAL BIT(24) +#define K8_NBSH_CECC BIT(14) +#define K8_NBSH_UECC BIT(13) +#define K8_NBSH_ERR_SCRUBER BIT(8) + extern const char *tt_msgs[]; extern const char *ll_msgs[]; extern const char *rrrr_msgs[]; 
@@ -29,3 +48,22 @@ extern const char *pp_msgs[]; extern const char *to_msgs[]; extern const char *ii_msgs[]; extern const char *ext_msgs[]; + +/* + * relevant NB regs + */ +struct err_regs { + u32 nbcfg; + u32 nbsh; + u32 nbsl; + u32 nbeah; + u32 nbeal; +}; + + +void amd_report_gart_errors(bool); +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int)); +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int)); +void amd_decode_nb_mce(int, struct err_regs *, int); + +#endif /* _EDAC_MCE_AMD_H */ -- cgit v1.2.3 From b69b29de65fe4078b125acc9dea34be82f7c362c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 27 Jul 2009 16:21:14 +0200 Subject: EDAC, AMD: carve out MCi_STATUS decoding The MCi_STATUS registers have most field definitions in common so decode them in the general path. Do not pass ecc_type along and compute it in __amd64_decode_bus_error instead. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 8 +++---- drivers/edac/edac_mce_amd.c | 57 ++++++++++++++++++++++----------------------- drivers/edac/edac_mce_amd.h | 4 ++-- 3 files changed, 34 insertions(+), 35 deletions(-) (limited to 'drivers/edac/amd64_edac.c') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 2080b1e2e8a2..c81ca2cf8dc7 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2283,10 +2283,11 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, } static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, - struct err_regs *info, int ecc_type) + struct err_regs *info) { u32 ec = ERROR_CODE(info->nbsl); u32 xec = EXT_ERROR_CODE(info->nbsl); + int ecc_type = info->nbsh & (0x3 << 13); pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, %s\n", RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), PP_MSG(ec)); @@ -2316,12 +2317,11 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow"); } -void 
amd64_decode_bus_error(int node_id, struct err_regs *regs, - int ecc_type) +void amd64_decode_bus_error(int node_id, struct err_regs *regs) { struct mem_ctl_info *mci = mci_lookup[node_id]; - __amd64_decode_bus_error(mci, regs, ecc_type); + __amd64_decode_bus_error(mci, regs); /* * Check the UE bit of the NB status high register, if set generate some diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 444c2cc4472d..0ba92d65db43 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -2,7 +2,7 @@ #include "edac_mce_amd.h" static bool report_gart_errors; -static void (*nb_bus_decoder)(int node_id, struct err_regs *regs, int ecc_type); +static void (*nb_bus_decoder)(int node_id, struct err_regs *regs); void amd_report_gart_errors(bool v) { @@ -10,13 +10,13 @@ void amd_report_gart_errors(bool v) } EXPORT_SYMBOL_GPL(amd_report_gart_errors); -void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int)) +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)) { nb_bus_decoder = f; } EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); -void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int)) +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)) { if (nb_bus_decoder) { WARN_ON(nb_bus_decoder != f); @@ -130,7 +130,6 @@ EXPORT_SYMBOL_GPL(ext_msgs); void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) { - int ecc; u32 ec = ERROR_CODE(regs->nbsl); u32 xec = EXT_ERROR_CODE(regs->nbsl); @@ -151,21 +150,6 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); } - pr_emerg(" Error: %sorrected", - ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C")); - pr_cont(", Report Error: %s", - ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no")); - pr_cont(", MiscV: %svalid, CPU context corrupt: %s", - ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"), - ((regs->nbsh & K8_NBSH_PCC) ? 
"yes" : "no")); - - /* do the two bits[14:13] together */ - ecc = regs->nbsh & (0x3 << 13); - if (ecc) - pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); - - pr_cont("\n"); - if (TLB_ERROR(ec)) { /* * GART errors are intended to help graphics driver developers @@ -191,7 +175,7 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) } else if (BUS_ERROR(ec)) { pr_emerg(" Bus (Link/DRAM) error\n"); if (nb_bus_decoder) - nb_bus_decoder(node_id, regs, ecc); + nb_bus_decoder(node_id, regs); } else { /* shouldn't reach here! */ pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec); @@ -204,16 +188,31 @@ EXPORT_SYMBOL_GPL(amd_decode_nb_mce); void decode_mce(struct mce *m) { struct err_regs regs; - int node; + int node, ecc; - if (m->bank != 4) - return; + pr_emerg("MC%d_STATUS:\n", m->bank); - regs.nbsl = (u32) m->status; - regs.nbsh = (u32)(m->status >> 32); - regs.nbeal = (u32) m->addr; - regs.nbeah = (u32)(m->addr >> 32); - node = topology_cpu_node_id(m->extcpu); + pr_emerg(" Error: %sorrected, Report: %s, MiscV: %svalid, " + "CPU context corrupt: %s", + ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), + ((m->status & MCI_STATUS_EN) ? "yes" : "no"), + ((m->status & MCI_STATUS_MISCV) ? "" : "in"), + ((m->status & MCI_STATUS_PCC) ? "yes" : "no")); - amd_decode_nb_mce(node, &regs, 1); + /* do the two bits[14:13] together */ + ecc = m->status & (3ULL << 45); + if (ecc) + pr_cont(", %sECC Error", ((ecc == 2) ? 
"C" : "U")); + + pr_cont("\n"); + + if (m->bank == 4) { + regs.nbsl = (u32) m->status; + regs.nbsh = (u32)(m->status >> 32); + regs.nbeal = (u32) m->addr; + regs.nbeah = (u32)(m->addr >> 32); + node = per_cpu(cpu_llc_id, m->extcpu); + + amd_decode_nb_mce(node, &regs, 1); + } } diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index 9114dc62782b..df23ee065f79 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -62,8 +62,8 @@ struct err_regs { void amd_report_gart_errors(bool); -void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int)); -void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int)); +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)); +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)); void amd_decode_nb_mce(int, struct err_regs *, int); #endif /* _EDAC_MCE_AMD_H */ -- cgit v1.2.3 From d93cc222adf3532ddb442648f8db00c15d1dc4c1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 10:56:15 +0200 Subject: EDAC, AMD: carve out decoding of MCi_STATUS ErrorCode This is the MCE error code from the MCi_STATUS banks, bits [15:0] which describe what type of error was encountered: GART TLB, Memory or Bus error. The semantics of those bits are identical across all MCE banks so decode those separately, irrespectively of MCE type. 
Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 4 ---- drivers/edac/edac_mce_amd.c | 37 ++++++++++++++++++++++--------------- 2 files changed, 22 insertions(+), 19 deletions(-) (limited to 'drivers/edac/amd64_edac.c') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index c81ca2cf8dc7..173dc4a84166 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2289,10 +2289,6 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, u32 xec = EXT_ERROR_CODE(info->nbsl); int ecc_type = info->nbsh & (0x3 << 13); - pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, %s\n", - RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), PP_MSG(ec)); - - /* Bail early out if this was an 'observed' error */ if (PP(ec) == K8_NBSL_PP_OBS) return; diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 0ba92d65db43..81f812eb3aea 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -150,6 +150,16 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); } + + pr_emerg("%s.\n", EXT_ERR_MSG(xec)); + + if (BUS_ERROR(ec) && nb_bus_decoder) + nb_bus_decoder(node_id, regs); +} +EXPORT_SYMBOL_GPL(amd_decode_nb_mce); + +static inline void amd_decode_err_code(unsigned int ec) +{ if (TLB_ERROR(ec)) { /* * GART errors are intended to help graphics driver developers @@ -166,33 +176,28 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) if (!report_gart_errors) return; - pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n", + pr_emerg(" Transaction: %s, Cache Level %s\n", TT_MSG(ec), LL_MSG(ec)); } else if (MEM_ERROR(ec)) { - pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s," - " Cache Level: %s", + pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s", RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } else if (BUS_ERROR(ec)) { - pr_emerg(" Bus (Link/DRAM) error\n"); - if 
(nb_bus_decoder) - nb_bus_decoder(node_id, regs); - } else { - /* shouldn't reach here! */ - pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec); - } - - pr_emerg("%s.\n", EXT_ERR_MSG(xec)); + pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, " + "Participating Processor: %s\n", + RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), + PP_MSG(ec)); + } else + pr_warning("Huh? Unknown MCE error 0x%x\n", ec); } -EXPORT_SYMBOL_GPL(amd_decode_nb_mce); void decode_mce(struct mce *m) { struct err_regs regs; int node, ecc; - pr_emerg("MC%d_STATUS:\n", m->bank); + pr_emerg("MC%d_STATUS: ", m->bank); - pr_emerg(" Error: %sorrected, Report: %s, MiscV: %svalid, " + pr_cont("%sorrected error, report: %s, MiscV: %svalid, " "CPU context corrupt: %s", ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), ((m->status & MCI_STATUS_EN) ? "yes" : "no"), @@ -206,6 +211,8 @@ void decode_mce(struct mce *m) pr_cont("\n"); + amd_decode_err_code(m->status & 0xffff); + if (m->bank == 4) { regs.nbsl = (u32) m->status; regs.nbsh = (u32)(m->status >> 32); -- cgit v1.2.3