summaryrefslogtreecommitdiff
path: root/drivers/edac
diff options
context:
space:
mode:
authorShuai Xue <xueshuai@linux.alibaba.com>2022-03-08 22:40:52 +0800
committerBorislav Petkov <bp@suse.de>2022-04-08 11:31:18 +0200
commited27b5df3877458eb24615fd9c202178660db009 (patch)
treedfdae7d45985dd794f507cf39ba2dd0f65103c32 /drivers/edac
parentbdae79651453df0bca20963fc2ab970146ef2a37 (diff)
downloadlwn-ed27b5df3877458eb24615fd9c202178660db009.tar.gz
lwn-ed27b5df3877458eb24615fd9c202178660db009.zip
EDAC/ghes: Unify CPER memory error location reporting
Switch the GHES EDAC memory error reporting functions to use the common CPER ones and get rid of code duplication. [ bp: - rewrite commit message, remove useless text - rip out useless reformatting - align function params on the opening brace - rename function to a more descriptive name - drop useless function exports - handle buffer lengths properly when printing other detail - remove useless casting ] Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com> Signed-off-by: Borislav Petkov <bp@suse.de> Link: https://lore.kernel.org/r/20220308144053.49090-3-xueshuai@linux.alibaba.com
Diffstat (limited to 'drivers/edac')
-rw-r--r--drivers/edac/Kconfig1
-rw-r--r--drivers/edac/ghes_edac.c200
2 files changed, 38 insertions, 163 deletions
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 58ab63642e72..23f11554f400 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -55,6 +55,7 @@ config EDAC_DECODE_MCE
config EDAC_GHES
bool "Output ACPI APEI/GHES BIOS detected errors via EDAC"
depends on ACPI_APEI_GHES && (EDAC=y)
+ select UEFI_CPER
help
Not all machines support hardware-driven error report. Some of those
provide a BIOS-driven error report mechanism via ACPI, using the
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 6d1ddecbf0da..2805d5610300 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -15,11 +15,13 @@
#include "edac_module.h"
#include <ras/ras_event.h>
+#define OTHER_DETAIL_LEN 400
+
struct ghes_pvt {
struct mem_ctl_info *mci;
/* Buffers for the error handling routine */
- char other_detail[400];
+ char other_detail[OTHER_DETAIL_LEN];
char msg[80];
};
@@ -235,8 +237,34 @@ static void ghes_scan_system(void)
system_scanned = true;
}
+static int print_mem_error_other_detail(const struct cper_sec_mem_err *mem, char *msg,
+ const char *location, unsigned int len)
+{
+ u32 n;
+
+ if (!msg)
+ return 0;
+
+ n = 0;
+ len -= 1;
+
+ n += scnprintf(msg + n, len - n, "APEI location: %s ", location);
+
+ if (!(mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS))
+ goto out;
+
+ n += scnprintf(msg + n, len - n, "status(0x%016llx): ", mem->error_status);
+ n += scnprintf(msg + n, len - n, "%s ", cper_mem_err_status_str(mem->error_status));
+
+out:
+ msg[n] = '\0';
+
+ return n;
+}
+
void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
+ struct cper_mem_err_compact cmem;
struct edac_raw_error_desc *e;
struct mem_ctl_info *mci;
struct ghes_pvt *pvt;
@@ -292,60 +320,10 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
/* Error type, mapped on e->msg */
if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
+ u8 etype = mem_err->error_type;
+
p = pvt->msg;
- switch (mem_err->error_type) {
- case 0:
- p += sprintf(p, "Unknown");
- break;
- case 1:
- p += sprintf(p, "No error");
- break;
- case 2:
- p += sprintf(p, "Single-bit ECC");
- break;
- case 3:
- p += sprintf(p, "Multi-bit ECC");
- break;
- case 4:
- p += sprintf(p, "Single-symbol ChipKill ECC");
- break;
- case 5:
- p += sprintf(p, "Multi-symbol ChipKill ECC");
- break;
- case 6:
- p += sprintf(p, "Master abort");
- break;
- case 7:
- p += sprintf(p, "Target abort");
- break;
- case 8:
- p += sprintf(p, "Parity Error");
- break;
- case 9:
- p += sprintf(p, "Watchdog timeout");
- break;
- case 10:
- p += sprintf(p, "Invalid address");
- break;
- case 11:
- p += sprintf(p, "Mirror Broken");
- break;
- case 12:
- p += sprintf(p, "Memory Sparing");
- break;
- case 13:
- p += sprintf(p, "Scrub corrected error");
- break;
- case 14:
- p += sprintf(p, "Scrub uncorrected error");
- break;
- case 15:
- p += sprintf(p, "Physical Memory Map-out event");
- break;
- default:
- p += sprintf(p, "reserved error (%d)",
- mem_err->error_type);
- }
+ p += snprintf(p, sizeof(pvt->msg), "%s", cper_mem_err_type_str(etype));
} else {
strcpy(pvt->msg, "unknown error");
}
@@ -362,52 +340,19 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
/* Memory error location, mapped on e->location */
p = e->location;
- if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
- p += sprintf(p, "node:%d ", mem_err->node);
- if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
- p += sprintf(p, "card:%d ", mem_err->card);
- if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
- p += sprintf(p, "module:%d ", mem_err->module);
- if (mem_err->validation_bits & CPER_MEM_VALID_RANK_NUMBER)
- p += sprintf(p, "rank:%d ", mem_err->rank);
- if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
- p += sprintf(p, "bank:%d ", mem_err->bank);
- if (mem_err->validation_bits & CPER_MEM_VALID_BANK_GROUP)
- p += sprintf(p, "bank_group:%d ",
- mem_err->bank >> CPER_MEM_BANK_GROUP_SHIFT);
- if (mem_err->validation_bits & CPER_MEM_VALID_BANK_ADDRESS)
- p += sprintf(p, "bank_address:%d ",
- mem_err->bank & CPER_MEM_BANK_ADDRESS_MASK);
- if (mem_err->validation_bits & (CPER_MEM_VALID_ROW | CPER_MEM_VALID_ROW_EXT)) {
- u32 row = mem_err->row;
-
- row |= cper_get_mem_extension(mem_err->validation_bits, mem_err->extended);
- p += sprintf(p, "row:%d ", row);
- }
- if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
- p += sprintf(p, "col:%d ", mem_err->column);
- if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
- p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
+ cper_mem_err_pack(mem_err, &cmem);
+ p += cper_mem_err_location(&cmem, p);
+
if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
- const char *bank = NULL, *device = NULL;
struct dimm_info *dimm;
- dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
- if (bank != NULL && device != NULL)
- p += sprintf(p, "DIMM location:%s %s ", bank, device);
- else
- p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
- mem_err->mem_dev_handle);
-
+ p += cper_dimm_err_location(&cmem, p);
dimm = find_dimm_by_handle(mci, mem_err->mem_dev_handle);
if (dimm) {
e->top_layer = dimm->idx;
strcpy(e->label, dimm->label);
}
}
- if (mem_err->validation_bits & CPER_MEM_VALID_CHIP_ID)
- p += sprintf(p, "chipID: %d ",
- mem_err->extended >> CPER_MEM_CHIP_ID_SHIFT);
if (p > e->location)
*(p - 1) = '\0';
@@ -416,78 +361,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
/* All other fields are mapped on e->other_detail */
p = pvt->other_detail;
- p += snprintf(p, sizeof(pvt->other_detail),
- "APEI location: %s ", e->location);
- if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
- u64 status = mem_err->error_status;
-
- p += sprintf(p, "status(0x%016llx): ", (long long)status);
- switch ((status >> 8) & 0xff) {
- case 1:
- p += sprintf(p, "Error detected internal to the component ");
- break;
- case 16:
- p += sprintf(p, "Error detected in the bus ");
- break;
- case 4:
- p += sprintf(p, "Storage error in DRAM memory ");
- break;
- case 5:
- p += sprintf(p, "Storage error in TLB ");
- break;
- case 6:
- p += sprintf(p, "Storage error in cache ");
- break;
- case 7:
- p += sprintf(p, "Error in one or more functional units ");
- break;
- case 8:
- p += sprintf(p, "component failed self test ");
- break;
- case 9:
- p += sprintf(p, "Overflow or undervalue of internal queue ");
- break;
- case 17:
- p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR ");
- break;
- case 18:
- p += sprintf(p, "Improper access error ");
- break;
- case 19:
- p += sprintf(p, "Access to a memory address which is not mapped to any component ");
- break;
- case 20:
- p += sprintf(p, "Loss of Lockstep ");
- break;
- case 21:
- p += sprintf(p, "Response not associated with a request ");
- break;
- case 22:
- p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits ");
- break;
- case 23:
- p += sprintf(p, "Detection of a PATH_ERROR ");
- break;
- case 25:
- p += sprintf(p, "Bus operation timeout ");
- break;
- case 26:
- p += sprintf(p, "A read was issued to data that has been poisoned ");
- break;
- default:
- p += sprintf(p, "reserved ");
- break;
- }
- }
- if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
- p += sprintf(p, "requestorID: 0x%016llx ",
- (long long)mem_err->requestor_id);
- if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
- p += sprintf(p, "responderID: 0x%016llx ",
- (long long)mem_err->responder_id);
- if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID)
- p += sprintf(p, "targetID: 0x%016llx ",
- (long long)mem_err->responder_id);
+ p += print_mem_error_other_detail(mem_err, p, e->location, OTHER_DETAIL_LEN);
if (p > pvt->other_detail)
*(p - 1) = '\0';