From 77c5f5d2f212e1963063e427fc57c44bf6eae9fb Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Fri, 15 Feb 2013 06:11:57 -0300
Subject: ghes_edac: Register at EDAC core the BIOS report

Register GHES at EDAC MC core, in order to avoid other
drivers to also handle errors and mangle with error data.

The edac core will warrant that just one driver will be used,
so the first one to register (BIOS first) will be the one that
will be reporting the hardware errors.

For now, the EDAC driver does nothing but to register at the
EDAC core, preventing the hardware-driven mechanism to
interfere with GHES.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/edac/ghes_edac.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 drivers/edac/ghes_edac.c

(limited to 'drivers/edac/ghes_edac.c')

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
new file mode 100644
index 000000000000..d8e54b496e0f
--- /dev/null
+++ b/drivers/edac/ghes_edac.c
@@ -0,0 +1,114 @@
+/*
+ * GHES/EDAC Linux driver
+ *
+ * This file may be distributed under the terms of the GNU General Public
+ * License version 2.
+ *
+ * Copyright (c) 2013 by Mauro Carvalho Chehab <mchehab@redhat.com>
+ *
+ * Red Hat Inc. http://www.redhat.com
+ */
+
+#include <acpi/ghes.h>
+#include <linux/edac.h>
+#include "edac_core.h"
+
+#define GHES_PFX   "ghes_edac: "
+#define GHES_EDAC_REVISION " Ver: 1.0.0"
+
+struct ghes_edac_pvt {
+	struct list_head list;
+	struct ghes *ghes;
+	struct mem_ctl_info *mci;
+};
+
+static LIST_HEAD(ghes_reglist);
+static DEFINE_MUTEX(ghes_edac_lock);
+static int ghes_edac_mc_num;
+
+void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
+                               struct cper_sec_mem_err *mem_err)
+{
+}
+EXPORT_SYMBOL_GPL(ghes_edac_report_mem_error);
+
+int ghes_edac_register(struct ghes *ghes, struct device *dev)
+{
+	int rc;
+	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[1];
+	struct csrow_info *csrow;
+	struct dimm_info *dimm;
+	struct ghes_edac_pvt *pvt;
+
+	layers[0].type = EDAC_MC_LAYER_ALL_MEM;
+	layers[0].size = 1;
+	layers[0].is_virt_csrow = true;
+
+	/*
+	 * We need to serialize edac_mc_alloc() and edac_mc_add_mc(),
+	 * to avoid duplicated memory controller numbers
+	 */
+	mutex_lock(&ghes_edac_lock);
+	mci = edac_mc_alloc(ghes_edac_mc_num, ARRAY_SIZE(layers), layers,
+			    sizeof(*pvt));
+	if (!mci) {
+		pr_info(GHES_PFX "Can't allocate memory for EDAC data\n");
+		mutex_unlock(&ghes_edac_lock);
+		return -ENOMEM;
+	}
+
+	pvt = mci->pvt_info;
+	memset(pvt, 0, sizeof(*pvt));
+        list_add_tail(&pvt->list, &ghes_reglist);
+	pvt->ghes = ghes;
+	pvt->mci  = mci;
+	mci->pdev = dev;
+
+	mci->mtype_cap = MEM_FLAG_EMPTY;
+	mci->edac_ctl_cap = EDAC_FLAG_NONE;
+	mci->edac_cap = EDAC_FLAG_NONE;
+	mci->mod_name = "ghes_edac.c";
+	mci->mod_ver = GHES_EDAC_REVISION;
+	mci->ctl_name = "ghes_edac";
+	mci->dev_name = "ghes";
+
+	csrow = mci->csrows[0];
+	dimm = csrow->channels[0]->dimm;
+
+	/* FIXME: FAKE DATA */
+	dimm->nr_pages = 1000;
+	dimm->grain = 128;
+	dimm->mtype = MEM_UNKNOWN;
+	dimm->dtype = DEV_UNKNOWN;
+	dimm->edac_mode = EDAC_SECDED;
+
+	rc = edac_mc_add_mc(mci);
+	if (rc < 0) {
+		pr_info(GHES_PFX "Can't register at EDAC core\n");
+		edac_mc_free(mci);
+		mutex_unlock(&ghes_edac_lock);
+		return -ENODEV;
+	}
+
+	ghes_edac_mc_num++;
+	mutex_unlock(&ghes_edac_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ghes_edac_register);
+
+void ghes_edac_unregister(struct ghes *ghes)
+{
+	struct mem_ctl_info *mci;
+	struct ghes_edac_pvt *pvt;
+
+	list_for_each_entry(pvt, &ghes_reglist, list) {
+		if (ghes == pvt->ghes) {
+			mci = pvt->mci;
+			edac_mc_del_mc(mci->pdev);
+			edac_mc_free(mci);
+			list_del(&pvt->list);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(ghes_edac_unregister);
-- 
cgit v1.2.3


From f04c62a7036a4b8490b224a9ad1be4378a3acf4d Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Fri, 15 Feb 2013 06:36:27 -0300
Subject: ghes_edac: add support for reporting errors via EDAC

Now that the EDAC core is capable of just forward the errors via
the userspace API, add a report mechanism for the GHES errors.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/edac/ghes_edac.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 2 deletions(-)

(limited to 'drivers/edac/ghes_edac.c')

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index d8e54b496e0f..0853f450d2c1 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -27,8 +27,60 @@ static DEFINE_MUTEX(ghes_edac_lock);
 static int ghes_edac_mc_num;
 
 void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
-                               struct cper_sec_mem_err *mem_err)
+				struct cper_sec_mem_err *mem_err)
 {
+	enum hw_event_mc_err_type type;
+	struct edac_raw_error_desc *e;
+	struct mem_ctl_info *mci;
+	struct ghes_edac_pvt *pvt = NULL;
+
+	list_for_each_entry(pvt, &ghes_reglist, list) {
+		if (ghes == pvt->ghes)
+			break;
+	}
+	if (!pvt) {
+		pr_err("Internal error: Can't find EDAC structure\n");
+		return;
+	}
+	mci = pvt->mci;
+	e = &mci->error_desc;
+
+	/* Cleans the error report buffer */
+	memset(e, 0, sizeof (*e));
+	e->error_count = 1;
+	e->msg = "APEI";
+	strcpy(e->label, "unknown");
+	e->other_detail = "";
+
+	if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
+		e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
+		e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
+		e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
+	}
+
+	switch (sev) {
+	case GHES_SEV_CORRECTED:
+		type = HW_EVENT_ERR_CORRECTED;
+		break;
+	case GHES_SEV_RECOVERABLE:
+		type = HW_EVENT_ERR_UNCORRECTED;
+		break;
+	case GHES_SEV_PANIC:
+		type = HW_EVENT_ERR_FATAL;
+		break;
+	default:
+	case GHES_SEV_NO:
+		type = HW_EVENT_ERR_INFO;
+	}
+
+	sprintf(e->location,
+		"node:%d card:%d module:%d bank:%d device:%d row: %d column:%d bit_pos:%d",
+		mem_err->node, mem_err->card, mem_err->module,
+		mem_err->bank, mem_err->device, mem_err->row, mem_err->column,
+		mem_err->bit_pos);
+	edac_dbg(3, "error at location %s\n", e->location);
+
+	edac_raw_mc_handle_error(type, mci, e);
 }
 EXPORT_SYMBOL_GPL(ghes_edac_report_mem_error);
 
@@ -60,7 +112,7 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
 
 	pvt = mci->pvt_info;
 	memset(pvt, 0, sizeof(*pvt));
-        list_add_tail(&pvt->list, &ghes_reglist);
+	list_add_tail(&pvt->list, &ghes_reglist);
 	pvt->ghes = ghes;
 	pvt->mci  = mci;
 	mci->pdev = dev;
-- 
cgit v1.2.3


From 32fa1f53c2daf9f55f17ff883b4297f86095b09c Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Thu, 14 Feb 2013 09:11:08 -0300
Subject: ghes_edac: do a better job of filling EDAC DIMM info

Instead of just faking a random value for the DIMM data, get
the information that it is available via DMI table.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/edac/ghes_edac.c | 192 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 180 insertions(+), 12 deletions(-)

(limited to 'drivers/edac/ghes_edac.c')

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 0853f450d2c1..22ac29e4733f 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -11,6 +11,7 @@
 
 #include <acpi/ghes.h>
 #include <linux/edac.h>
+#include <linux/dmi.h>
 #include "edac_core.h"
 
 #define GHES_PFX   "ghes_edac: "
@@ -26,6 +27,155 @@ static LIST_HEAD(ghes_reglist);
 static DEFINE_MUTEX(ghes_edac_lock);
 static int ghes_edac_mc_num;
 
+/* Memory Device - Type 17 of SMBIOS spec */
+struct memdev_dmi_entry {
+	u8 type;
+	u8 length;
+	u16 handle;
+	u16 phys_mem_array_handle;
+	u16 mem_err_info_handle;
+	u16 total_width;
+	u16 data_width;
+	u16 size;
+	u8 form_factor;
+	u8 device_set;
+	u8 device_locator;
+	u8 bank_locator;
+	u8 memory_type;
+	u16 type_detail;
+	u16 speed;
+	u8 manufacturer;
+	u8 serial_number;
+	u8 asset_tag;
+	u8 part_number;
+	u8 attributes;
+	u32 extended_size;
+	u16 conf_mem_clk_speed;
+} __attribute__((__packed__));
+
+struct ghes_edac_dimm_fill {
+	struct mem_ctl_info *mci;
+	unsigned count;
+};
+
+char *memory_type[] = {
+	[MEM_EMPTY] = "EMPTY",
+	[MEM_RESERVED] = "RESERVED",
+	[MEM_UNKNOWN] = "UNKNOWN",
+	[MEM_FPM] = "FPM",
+	[MEM_EDO] = "EDO",
+	[MEM_BEDO] = "BEDO",
+	[MEM_SDR] = "SDR",
+	[MEM_RDR] = "RDR",
+	[MEM_DDR] = "DDR",
+	[MEM_RDDR] = "RDDR",
+	[MEM_RMBS] = "RMBS",
+	[MEM_DDR2] = "DDR2",
+	[MEM_FB_DDR2] = "FB_DDR2",
+	[MEM_RDDR2] = "RDDR2",
+	[MEM_XDR] = "XDR",
+	[MEM_DDR3] = "DDR3",
+	[MEM_RDDR3] = "RDDR3",
+};
+
+static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
+{
+	int *num_dimm = arg;
+
+	if (dh->type == DMI_ENTRY_MEM_DEVICE)
+		(*num_dimm)++;
+}
+
+static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
+{
+	struct ghes_edac_dimm_fill *dimm_fill = arg;
+	struct mem_ctl_info *mci = dimm_fill->mci;
+
+	if (dh->type == DMI_ENTRY_MEM_DEVICE) {
+		struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
+		struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
+						       mci->n_layers,
+						       dimm_fill->count, 0, 0);
+
+		if (entry->size == 0xffff) {
+			pr_info(GHES_PFX "Can't get dimm size\n");
+			dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */
+		} else if (entry->size == 0x7fff) {
+			dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
+		} else {
+			if (entry->size & 1 << 15)
+				dimm->nr_pages = MiB_TO_PAGES((entry->size &
+							       0x7fff) << 10);
+			else
+				dimm->nr_pages = MiB_TO_PAGES(entry->size);
+		}
+
+		switch (entry->memory_type) {
+		case 0x12:
+			if (entry->type_detail & 1 << 13)
+				dimm->mtype = MEM_RDDR;
+			else
+				dimm->mtype = MEM_DDR;
+			break;
+		case 0x13:
+			if (entry->type_detail & 1 << 13)
+				dimm->mtype = MEM_RDDR2;
+			else
+				dimm->mtype = MEM_DDR2;
+			break;
+		case 0x14:
+			dimm->mtype = MEM_FB_DDR2;
+			break;
+		case 0x18:
+			if (entry->type_detail & 1 << 13)
+				dimm->mtype = MEM_RDDR3;
+			else
+				dimm->mtype = MEM_DDR3;
+			break;
+		default:
+			if (entry->type_detail & 1 << 6)
+				dimm->mtype = MEM_RMBS;
+			else if ((entry->type_detail & ((1 << 7) | (1 << 13)))
+				 == ((1 << 7) | (1 << 13)))
+				dimm->mtype = MEM_RDR;
+			else if (entry->type_detail & 1 << 7)
+				dimm->mtype = MEM_SDR;
+			else if (entry->type_detail & 1 << 9)
+				dimm->mtype = MEM_EDO;
+			else
+				dimm->mtype = MEM_UNKNOWN;
+		}
+
+		/*
+		 * Actually, we can only detect if the memory has bits for
+		 * checksum or not
+		 */
+		if (entry->total_width == entry->data_width)
+			dimm->edac_mode = EDAC_NONE;
+		else
+			dimm->edac_mode = EDAC_SECDED;
+
+		dimm->dtype = DEV_UNKNOWN;
+		dimm->grain = 128;		/* Likely, worse case */
+
+		/*
+		 * FIXME: It shouldn't be hard to also fill the DIMM labels
+		 */
+
+		if (dimm->nr_pages) {
+			pr_info(GHES_PFX "DIMM%i: %s size = %d MB%s\n",
+				dimm_fill->count, memory_type[dimm->mtype],
+				PAGES_TO_MiB(dimm->nr_pages),
+				(dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
+			pr_info(GHES_PFX "\ttype %d, detail 0x%02x, width %d(total %d)\n",
+				entry->memory_type, entry->type_detail,
+				entry->total_width, entry->data_width);
+		}
+
+		dimm_fill->count++;
+	}
+}
+
 void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
 				struct cper_sec_mem_err *mem_err)
 {
@@ -86,15 +236,24 @@ EXPORT_SYMBOL_GPL(ghes_edac_report_mem_error);
 
 int ghes_edac_register(struct ghes *ghes, struct device *dev)
 {
-	int rc;
+	bool fake = false;
+	int rc, num_dimm = 0;
 	struct mem_ctl_info *mci;
 	struct edac_mc_layer layers[1];
-	struct csrow_info *csrow;
-	struct dimm_info *dimm;
 	struct ghes_edac_pvt *pvt;
+	struct ghes_edac_dimm_fill dimm_fill;
+
+	/* Get the number of DIMMs */
+	dmi_walk(ghes_edac_count_dimms, &num_dimm);
+
+	/* Check if we've got a bogus BIOS */
+	if (num_dimm == 0) {
+		fake = true;
+		num_dimm = 1;
+	}
 
 	layers[0].type = EDAC_MC_LAYER_ALL_MEM;
-	layers[0].size = 1;
+	layers[0].size = num_dimm;
 	layers[0].is_virt_csrow = true;
 
 	/*
@@ -102,6 +261,8 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
 	 * to avoid duplicated memory controller numbers
 	 */
 	mutex_lock(&ghes_edac_lock);
+	pr_info("ghes_edac#%d: allocating space for %d dimms\n",
+		ghes_edac_mc_num, num_dimm);
 	mci = edac_mc_alloc(ghes_edac_mc_num, ARRAY_SIZE(layers), layers,
 			    sizeof(*pvt));
 	if (!mci) {
@@ -125,15 +286,22 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
 	mci->ctl_name = "ghes_edac";
 	mci->dev_name = "ghes";
 
-	csrow = mci->csrows[0];
-	dimm = csrow->channels[0]->dimm;
+	if (!fake) {
+		/* Fill DIMM info from DMI */
+		dimm_fill.count = 0;
+		dimm_fill.mci = mci;
+		dmi_walk(ghes_edac_dmidecode, &dimm_fill);
+	} else {
+		struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
+						       mci->n_layers, 0, 0, 0);
 
-	/* FIXME: FAKE DATA */
-	dimm->nr_pages = 1000;
-	dimm->grain = 128;
-	dimm->mtype = MEM_UNKNOWN;
-	dimm->dtype = DEV_UNKNOWN;
-	dimm->edac_mode = EDAC_SECDED;
+		pr_info(GHES_PFX "Crappy BIOS detected. Faking DIMM EDAC data\n");
+		dimm->nr_pages = 1000;
+		dimm->grain = 128;
+		dimm->mtype = MEM_UNKNOWN;
+		dimm->dtype = DEV_UNKNOWN;
+		dimm->edac_mode = EDAC_SECDED;
+	}
 
 	rc = edac_mc_add_mc(mci);
 	if (rc < 0) {
-- 
cgit v1.2.3


From 5ee726db521fddf991f261e5c45e04a7d2bf1bc1 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Fri, 15 Feb 2013 08:45:00 -0300
Subject: ghes_edac: Don't credit the same memory dimm twice

On my tests on a 4xE5-4650 CPU's system, the GHES
EDAC driver is called twice. As the SMBIOS DMI enumeration
call will seek for the entire DIMM sockets in the system, on
this machine, equipped with 128 GB of RAM, the memory is
displayed twice:

          +-----------------------+
          |    mc0    |    mc1    |
----------+-----------------------+
memory45: |  8192 MB  |  8192 MB  |
memory44: |     0 MB  |     0 MB  |
----------+-----------------------+
memory43: |     0 MB  |     0 MB  |
memory42: |  8192 MB  |  8192 MB  |
----------+-----------------------+
memory41: |     0 MB  |     0 MB  |
memory40: |     0 MB  |     0 MB  |
----------+-----------------------+
memory39: |  8192 MB  |  8192 MB  |
memory38: |     0 MB  |     0 MB  |
----------+-----------------------+
memory37: |     0 MB  |     0 MB  |
memory36: |  8192 MB  |  8192 MB  |
----------+-----------------------+
memory35: |     0 MB  |     0 MB  |
memory34: |     0 MB  |     0 MB  |
----------+-----------------------+
memory33: |  8192 MB  |  8192 MB  |
memory32: |     0 MB  |     0 MB  |
----------+-----------------------+
memory31: |     0 MB  |     0 MB  |
memory30: |  8192 MB  |  8192 MB  |
----------+-----------------------+
memory29: |     0 MB  |     0 MB  |
memory28: |     0 MB  |     0 MB  |
----------+-----------------------+
memory27: |  8192 MB  |  8192 MB  |
memory26: |     0 MB  |     0 MB  |
----------+-----------------------+
memory25: |     0 MB  |     0 MB  |
memory24: |  8192 MB  |  8192 MB  |
----------+-----------------------+
memory23: |     0 MB  |     0 MB  |
memory22: |     0 MB  |     0 MB  |
----------+-----------------------+
memory21: |  8192 MB  |  8192 MB  |
memory20: |     0 MB  |     0 MB  |
----------+-----------------------+
memory19: |     0 MB  |     0 MB  |
memory18: |  8192 MB  |  8192 MB  |
----------+-----------------------+
memory17: |     0 MB  |     0 MB  |
memory16: |     0 MB  |     0 MB  |
----------+-----------------------+
memory15: |  8192 MB  |  8192 MB  |
memory14: |     0 MB  |     0 MB  |
----------+-----------------------+
memory13: |     0 MB  |     0 MB  |
memory12: |  8192 MB  |  8192 MB  |
----------+-----------------------+
memory11: |     0 MB  |     0 MB  |
memory10: |     0 MB  |     0 MB  |
----------+-----------------------+
memory9:  |  8192 MB  |  8192 MB  |
memory8:  |     0 MB  |     0 MB  |
----------+-----------------------+
memory7:  |     0 MB  |     0 MB  |
memory6:  |  8192 MB  |  8192 MB  |
----------+-----------------------+
memory5:  |     0 MB  |     0 MB  |
memory4:  |     0 MB  |     0 MB  |
----------+-----------------------+
memory3:  |  8192 MB  |  8192 MB  |
memory2:  |     0 MB  |     0 MB  |
----------+-----------------------+
memory1:  |     0 MB  |     0 MB  |
memory0:  |  8192 MB  |  8192 MB  |
----------+-----------------------+

Total sum of 256 GB.

As there's no reliable way to credit DIMMS to the right memory
controller, just put everything on memory controller 0 (with should
always exist).

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/edac/ghes_edac.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'drivers/edac/ghes_edac.c')

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 22ac29e4733f..fb866804820c 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -287,10 +287,19 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
 	mci->dev_name = "ghes";
 
 	if (!fake) {
-		/* Fill DIMM info from DMI */
-		dimm_fill.count = 0;
-		dimm_fill.mci = mci;
-		dmi_walk(ghes_edac_dmidecode, &dimm_fill);
+		/*
+		 * Fill DIMM info from DMI for the memory controller #0
+		 *
+		 * Keep it in blank for the other memory controllers, as
+		 * there's no reliable way to properly credit each DIMM to
+		 * the memory controller, as different BIOSes fill the
+		 * DMI bank location fields on different ways
+		 */
+		if (!ghes_edac_mc_num) {
+			dimm_fill.count = 0;
+			dimm_fill.mci = mci;
+			dmi_walk(ghes_edac_dmidecode, &dimm_fill);
+		}
 	} else {
 		struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
 						       mci->n_layers, 0, 0, 0);
-- 
cgit v1.2.3


From d2a6856614fd34e36352146307a5655efbdbc14d Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Fri, 15 Feb 2013 09:06:38 -0300
Subject: ghes_edac: Improve driver's printk messages

Provide a better infrastructure for printk's inside the driver:
	- use edac_dbg() for debug messages;
	- standardize the usage of pr_info();
	- provide warning about the risk of relying on this
	  driver.

While here, changes the size of a fake memory to 1 page. This is
as good or as bad as 1000 pages, but it is easier for userspace to
detect, as I don't expect that any machine implementing GHES would
provide just 1 page available ;)

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>

Conflicts:
	drivers/edac/ghes_edac.c
---
 drivers/edac/ghes_edac.c | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

(limited to 'drivers/edac/ghes_edac.c')

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index fb866804820c..b4acc4f2074d 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -9,12 +9,13 @@
  * Red Hat Inc. http://www.redhat.com
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <acpi/ghes.h>
 #include <linux/edac.h>
 #include <linux/dmi.h>
 #include "edac_core.h"
 
-#define GHES_PFX   "ghes_edac: "
 #define GHES_EDAC_REVISION " Ver: 1.0.0"
 
 struct ghes_edac_pvt {
@@ -27,6 +28,7 @@ static LIST_HEAD(ghes_reglist);
 static DEFINE_MUTEX(ghes_edac_lock);
 static int ghes_edac_mc_num;
 
+
 /* Memory Device - Type 17 of SMBIOS spec */
 struct memdev_dmi_entry {
 	u8 type;
@@ -98,7 +100,8 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
 						       dimm_fill->count, 0, 0);
 
 		if (entry->size == 0xffff) {
-			pr_info(GHES_PFX "Can't get dimm size\n");
+			pr_info("Can't get DIMM%i size\n",
+				dimm_fill->count);
 			dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */
 		} else if (entry->size == 0x7fff) {
 			dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
@@ -163,11 +166,11 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
 		 */
 
 		if (dimm->nr_pages) {
-			pr_info(GHES_PFX "DIMM%i: %s size = %d MB%s\n",
+			edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
 				dimm_fill->count, memory_type[dimm->mtype],
 				PAGES_TO_MiB(dimm->nr_pages),
 				(dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
-			pr_info(GHES_PFX "\ttype %d, detail 0x%02x, width %d(total %d)\n",
+			edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
 				entry->memory_type, entry->type_detail,
 				entry->total_width, entry->data_width);
 		}
@@ -261,12 +264,10 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
 	 * to avoid duplicated memory controller numbers
 	 */
 	mutex_lock(&ghes_edac_lock);
-	pr_info("ghes_edac#%d: allocating space for %d dimms\n",
-		ghes_edac_mc_num, num_dimm);
 	mci = edac_mc_alloc(ghes_edac_mc_num, ARRAY_SIZE(layers), layers,
 			    sizeof(*pvt));
 	if (!mci) {
-		pr_info(GHES_PFX "Can't allocate memory for EDAC data\n");
+		pr_info("Can't allocate memory for EDAC data\n");
 		mutex_unlock(&ghes_edac_lock);
 		return -ENOMEM;
 	}
@@ -286,6 +287,22 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
 	mci->ctl_name = "ghes_edac";
 	mci->dev_name = "ghes";
 
+	if (!ghes_edac_mc_num) {
+		if (!fake) {
+			pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n");
+			pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
+			pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
+			pr_info("If you find incorrect reports, please contact your hardware vendor\n");
+			pr_info("to correct its BIOS.\n");
+			pr_info("This system has %d DIMM sockets.\n",
+				num_dimm);
+		} else {
+			pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
+			pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
+			pr_info("work on such system. Use this driver with caution\n");
+		}
+	}
+
 	if (!fake) {
 		/*
 		 * Fill DIMM info from DMI for the memory controller #0
@@ -304,8 +321,7 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
 		struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
 						       mci->n_layers, 0, 0, 0);
 
-		pr_info(GHES_PFX "Crappy BIOS detected. Faking DIMM EDAC data\n");
-		dimm->nr_pages = 1000;
+		dimm->nr_pages = 1;
 		dimm->grain = 128;
 		dimm->mtype = MEM_UNKNOWN;
 		dimm->dtype = DEV_UNKNOWN;
@@ -314,7 +330,7 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev)
 
 	rc = edac_mc_add_mc(mci);
 	if (rc < 0) {
-		pr_info(GHES_PFX "Can't register at EDAC core\n");
+		pr_info("Can't register at EDAC core\n");
 		edac_mc_free(mci);
 		mutex_unlock(&ghes_edac_lock);
 		return -ENODEV;
-- 
cgit v1.2.3


From 689c9cd8128f13bf9843a3e133423f5e3e0ce4aa Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Tue, 19 Feb 2013 19:24:12 -0300
Subject: ghes_edac: Make it compliant with UEFI spec 2.3.1

The UEFI spec defines the memory error types ans the bits that
validate each field on the memory error record, at
Appendix N om items N.2.5 (Memory Error Section) and
N.2.11 (Error Status). Make the error description compliant with
it, only showing the valid fields.

The EDAC error log is now properly reporting the error:

[  281.556854] mce: [Hardware Error]: Machine check events logged
[  281.557042] {2}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 0
[  281.557044] {2}[Hardware Error]: APEI generic hardware error status
[  281.557046] {2}[Hardware Error]: severity: 2, corrected
[  281.557048] {2}[Hardware Error]: section: 0, severity: 2, corrected
[  281.557050] {2}[Hardware Error]: flags: 0x01
[  281.557052] {2}[Hardware Error]: primary
[  281.557053] {2}[Hardware Error]: section_type: memory error
[  281.557055] {2}[Hardware Error]: error_status: 0x0000000000000400
[  281.557056] {2}[Hardware Error]: node: 3
[  281.557057] {2}[Hardware Error]: card: 0
[  281.557058] {2}[Hardware Error]: module: 1
[  281.557059] {2}[Hardware Error]: device: 0
[  281.557061] {2}[Hardware Error]: error_type: 18, unknown
[  281.557067] EDAC DEBUG: ghes_edac_report_mem_error: error validation_bits: 0x000040b9
[  281.557084] EDAC MC0: 1 CE reserved error (18) on unknown label (node:3 card:0 module:1 page:0x0 offset:0x0 grain:0 syndrome:0x0 - status(0x0000000000000400): Storage error in DRAM memory)

Tested on a 4 CPUs E5-4650 Sandy Bridge machine.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/edac/ghes_edac.c | 195 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 180 insertions(+), 15 deletions(-)

(limited to 'drivers/edac/ghes_edac.c')

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index b4acc4f2074d..1bde45141073 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -22,6 +22,10 @@ struct ghes_edac_pvt {
 	struct list_head list;
 	struct ghes *ghes;
 	struct mem_ctl_info *mci;
+
+	/* Buffers for the error handling routine */
+	char other_detail[160];
+	char msg[80];
 };
 
 static LIST_HEAD(ghes_reglist);
@@ -186,6 +190,7 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
 	struct edac_raw_error_desc *e;
 	struct mem_ctl_info *mci;
 	struct ghes_edac_pvt *pvt = NULL;
+	char *p;
 
 	list_for_each_entry(pvt, &ghes_reglist, list) {
 		if (ghes == pvt->ghes)
@@ -201,15 +206,14 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
 	/* Cleans the error report buffer */
 	memset(e, 0, sizeof (*e));
 	e->error_count = 1;
-	e->msg = "APEI";
-	strcpy(e->label, "unknown");
-	e->other_detail = "";
-
-	if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
-		e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
-		e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
-		e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
-	}
+	strcpy(e->label, "unknown label");
+	e->msg = pvt->msg;
+	e->other_detail = pvt->other_detail;
+	e->top_layer = -1;
+	e->mid_layer = -1;
+	e->low_layer = -1;
+	*pvt->other_detail = '\0';
+	*pvt->msg = '\0';
 
 	switch (sev) {
 	case GHES_SEV_CORRECTED:
@@ -226,12 +230,173 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
 		type = HW_EVENT_ERR_INFO;
 	}
 
-	sprintf(e->location,
-		"node:%d card:%d module:%d bank:%d device:%d row: %d column:%d bit_pos:%d",
-		mem_err->node, mem_err->card, mem_err->module,
-		mem_err->bank, mem_err->device, mem_err->row, mem_err->column,
-		mem_err->bit_pos);
-	edac_dbg(3, "error at location %s\n", e->location);
+	edac_dbg(1, "error validation_bits: 0x%08llx\n",
+		 (long long)mem_err->validation_bits);
+
+	/* Error type, mapped on e->msg */
+	if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
+		p = pvt->msg;
+		switch (mem_err->error_type) {
+		case 0:
+			p += sprintf(p, "Unknown");
+			break;
+		case 1:
+			p += sprintf(p, "No error");
+			break;
+		case 2:
+			p += sprintf(p, "Single-bit ECC");
+			break;
+		case 3:
+			p += sprintf(p, "Multi-bit ECC");
+			break;
+		case 4:
+			p += sprintf(p, "Single-symbol ChipKill ECC");
+			break;
+		case 5:
+			p += sprintf(p, "Multi-symbol ChipKill ECC");
+			break;
+		case 6:
+			p += sprintf(p, "Master abort");
+			break;
+		case 7:
+			p += sprintf(p, "Target abort");
+			break;
+		case 8:
+			p += sprintf(p, "Parity Error");
+			break;
+		case 9:
+			p += sprintf(p, "Watchdog timeout");
+			break;
+		case 10:
+			p += sprintf(p, "Invalid address");
+			break;
+		case 11:
+			p += sprintf(p, "Mirror Broken");
+			break;
+		case 12:
+			p += sprintf(p, "Memory Sparing");
+			break;
+		case 13:
+			p += sprintf(p, "Scrub corrected error");
+			break;
+		case 14:
+			p += sprintf(p, "Scrub uncorrected error");
+			break;
+		case 15:
+			p += sprintf(p, "Physical Memory Map-out event");
+			break;
+		default:
+			p += sprintf(p, "reserved error (%d)",
+				     mem_err->error_type);
+		}
+	} else {
+		strcpy(pvt->msg, "unknown error");
+	}
+
+	/* Error address */
+	if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
+		e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
+		e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
+	}
+
+	/* Error grain */
+	if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS_MASK) {
+		e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
+	}
+
+	/* Memory error location, mapped on e->location */
+	p = e->location;
+	if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
+		p += sprintf(p, "node:%d ", mem_err->node);
+	if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
+		p += sprintf(p, "card:%d ", mem_err->card);
+	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
+		p += sprintf(p, "module:%d ", mem_err->module);
+	if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
+		p += sprintf(p, "bank:%d ", mem_err->bank);
+	if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
+		p += sprintf(p, "row:%d ", mem_err->row);
+	if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
+		p += sprintf(p, "col:%d ", mem_err->column);
+	if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
+		p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
+	if (p > e->location)
+		*(p - 1) = '\0';
+
+	/* All other fields are mapped on e->other_detail */
+	p = pvt->other_detail;
+	if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
+		u64 status = mem_err->error_status;
+
+		p += sprintf(p, "status(0x%016llx): ", (long long)status);
+		switch ((status >> 8) & 0xff) {
+		case 1:
+			p += sprintf(p, "Error detected internal to the component ");
+			break;
+		case 16:
+			p += sprintf(p, "Error detected in the bus ");
+			break;
+		case 4:
+			p += sprintf(p, "Storage error in DRAM memory ");
+			break;
+		case 5:
+			p += sprintf(p, "Storage error in TLB ");
+			break;
+		case 6:
+			p += sprintf(p, "Storage error in cache ");
+			break;
+		case 7:
+			p += sprintf(p, "Error in one or more functional units ");
+			break;
+		case 8:
+			p += sprintf(p, "component failed self test ");
+			break;
+		case 9:
+			p += sprintf(p, "Overflow or undervalue of internal queue ");
+			break;
+		case 17:
+			p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR ");
+			break;
+		case 18:
+			p += sprintf(p, "Improper access error ");
+			break;
+		case 19:
+			p += sprintf(p, "Access to a memory address which is not mapped to any component ");
+			break;
+		case 20:
+			p += sprintf(p, "Loss of Lockstep ");
+			break;
+		case 21:
+			p += sprintf(p, "Response not associated with a request ");
+			break;
+		case 22:
+			p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits ");
+			break;
+		case 23:
+			p += sprintf(p, "Detection of a PATH_ERROR ");
+			break;
+		case 25:
+			p += sprintf(p, "Bus operation timeout ");
+			break;
+		case 26:
+			p += sprintf(p, "A read was issued to data that has been poisoned ");
+			break;
+		default:
+			p += sprintf(p, "reserved ");
+			break;
+		}
+	}
+	if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
+		p += sprintf(p, "requestorID: 0x%016llx ",
+			     (long long)mem_err->requestor_id);
+	if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
+		p += sprintf(p, "responderID: 0x%016llx ",
+			     (long long)mem_err->responder_id);
+	if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID)
+		p += sprintf(p, "targetID: 0x%016llx ",
+			     (long long)mem_err->responder_id);
+	if (p > pvt->other_detail)
+		*(p - 1) = '\0';
 
 	edac_raw_mc_handle_error(type, mci, e);
 }
-- 
cgit v1.2.3


From 8ae8f50ad8979bb670598ff92eebea611b799f10 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Tue, 19 Feb 2013 21:35:41 -0300
Subject: ghes_edac: Fix RAS tracing

With the current version of CPER, there's no way to associate an
error with the memory error. So, the error location in EDAC
layers is unused.

As CPER has its own idea about memory architectural layers, just
output whatever is there inside the driver's detail at the RAS
tracepoint.

The EDAC location keeps untouched, in the case that, in some future,
we could actually map the error into the dimm labels.

Now, the error message:

[   72.396625] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 0
[   72.396627] {1}[Hardware Error]: APEI generic hardware error status
[   72.396628] {1}[Hardware Error]: severity: 2, corrected
[   72.396630] {1}[Hardware Error]: section: 0, severity: 2, corrected
[   72.396632] {1}[Hardware Error]: flags: 0x01
[   72.396634] {1}[Hardware Error]: primary
[   72.396635] {1}[Hardware Error]: section_type: memory error
[   72.396637] {1}[Hardware Error]: error_status: 0x0000000000000400
[   72.396638] {1}[Hardware Error]: node: 3
[   72.396639] {1}[Hardware Error]: card: 0
[   72.396640] {1}[Hardware Error]: module: 0
[   72.396641] {1}[Hardware Error]: device: 0
[   72.396643] {1}[Hardware Error]: error_type: 18, unknown
[   72.396666] EDAC MC0: 1 CE reserved error (18) on unknown label (node:3 card:0 module:0 page:0x0 offset:0x0 grain:0 syndrome:0x0 - status(0x0000000000000400): Storage error in DRAM memory)

Is properly represented on the trace event:

     kworker/0:2-584   [000] ....    72.396657: mc_event: 1 Corrected error: reserved error (18) on unknown label (mc:0 location:-1:-1:-1 address:0x00000000 grain:1 syndrome:0x00000000 APEI location: node:3 card:0 module:0 status(0x0000000000000400): Storage error in DRAM memory)

Tested on a 4 sockets E5-4650 Sandy Bridge machine.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/edac/ghes_edac.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'drivers/edac/ghes_edac.c')

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 1bde45141073..636dcf18d5b6 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -15,6 +15,7 @@
 #include <linux/edac.h>
 #include <linux/dmi.h>
 #include "edac_core.h"
+#include <ras/ras_event.h>
 
 #define GHES_EDAC_REVISION " Ver: 1.0.0"
 
@@ -24,6 +25,7 @@ struct ghes_edac_pvt {
 	struct mem_ctl_info *mci;
 
 	/* Buffers for the error handling routine */
+	char detail_location[240];
 	char other_detail[160];
 	char msg[80];
 };
@@ -191,6 +193,7 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
 	struct mem_ctl_info *mci;
 	struct ghes_edac_pvt *pvt = NULL;
 	char *p;
+	u8 grain_bits;
 
 	list_for_each_entry(pvt, &ghes_reglist, list) {
 		if (ghes == pvt->ghes)
@@ -398,6 +401,16 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
 	if (p > pvt->other_detail)
 		*(p - 1) = '\0';
 
+	/* Generate the trace event */
+	grain_bits = fls_long(e->grain);
+	sprintf(pvt->detail_location, "APEI location: %s %s",
+		e->location, e->other_detail);
+	trace_mc_event(type, e->msg, e->label, e->error_count,
+		       mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
+		       PAGES_TO_MiB(e->page_frame_number) | e->offset_in_page,
+		       grain_bits, e->syndrome, pvt->detail_location);
+
+	/* Report the error via EDAC API */
 	edac_raw_mc_handle_error(type, mci, e);
 }
 EXPORT_SYMBOL_GPL(ghes_edac_report_mem_error);
-- 
cgit v1.2.3


From 5dae92a718570e6a942e0b882e53d25cab03b40f Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Tue, 26 Feb 2013 16:39:14 +0800
Subject: ghes_edac: fix to use list_for_each_entry_safe() when delete list
 items

Since we will remove items off the list using list_del() we need
to use a safe version of the list_for_each_entry() macro aptly named
list_for_each_entry_safe().

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/edac/ghes_edac.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/edac/ghes_edac.c')

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 636dcf18d5b6..bb534670ec02 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -523,9 +523,9 @@ EXPORT_SYMBOL_GPL(ghes_edac_register);
 void ghes_edac_unregister(struct ghes *ghes)
 {
 	struct mem_ctl_info *mci;
-	struct ghes_edac_pvt *pvt;
+	struct ghes_edac_pvt *pvt, *tmp;
 
-	list_for_each_entry(pvt, &ghes_reglist, list) {
+	list_for_each_entry_safe(pvt, tmp, &ghes_reglist, list) {
 		if (ghes == pvt->ghes) {
 			mci = pvt->mci;
 			edac_mc_del_mc(mci->pdev);
-- 
cgit v1.2.3