diff options
Diffstat (limited to 'drivers/edac/igen6_edac.c')
-rw-r--r-- | drivers/edac/igen6_edac.c | 374 |
1 files changed, 349 insertions, 25 deletions
diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index 6be9986fc6bd..a07bbfd075d0 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -22,11 +22,12 @@ #include <linux/io.h> #include <asm/mach_traps.h> #include <asm/nmi.h> +#include <asm/mce.h> #include "edac_mc.h" #include "edac_module.h" -#define IGEN6_REVISION "v2.4" +#define IGEN6_REVISION "v2.5" #define EDAC_MOD_STR "igen6_edac" #define IGEN6_NMI_NAME "igen6_ibecc" @@ -40,7 +41,7 @@ #define GET_BITFIELD(v, lo, hi) (((v) & GENMASK_ULL(hi, lo)) >> (lo)) -#define NUM_IMC 1 /* Max memory controllers */ +#define NUM_IMC 2 /* Max memory controllers */ #define NUM_CHANNELS 2 /* Max channels */ #define NUM_DIMMS 2 /* Max DIMMs per channel */ @@ -54,6 +55,10 @@ #define CAPID_C_OFFSET 0xec #define CAPID_C_IBECC BIT(15) +/* Capability register E */ +#define CAPID_E_OFFSET 0xf0 +#define CAPID_E_IBECC BIT(12) + /* Error Status */ #define ERRSTS_OFFSET 0xc8 #define ERRSTS_CE BIT_ULL(6) @@ -70,7 +75,7 @@ #define IBECC_ACTIVATE_EN BIT(0) /* IBECC error log */ -#define ECC_ERROR_LOG_OFFSET (IBECC_BASE + 0x170) +#define ECC_ERROR_LOG_OFFSET (IBECC_BASE + res_cfg->ibecc_error_log_offset) #define ECC_ERROR_LOG_CE BIT_ULL(62) #define ECC_ERROR_LOG_UE BIT_ULL(63) #define ECC_ERROR_LOG_ADDR_SHIFT 5 @@ -84,39 +89,54 @@ #define MCHBAR_SIZE 0x10000 /* Parameters for the channel decode stage */ -#define MAD_INTER_CHANNEL_OFFSET 0x5000 +#define IMC_BASE (res_cfg->imc_base) +#define MAD_INTER_CHANNEL_OFFSET IMC_BASE #define MAD_INTER_CHANNEL_DDR_TYPE(v) GET_BITFIELD(v, 0, 2) #define MAD_INTER_CHANNEL_ECHM(v) GET_BITFIELD(v, 3, 3) #define MAD_INTER_CHANNEL_CH_L_MAP(v) GET_BITFIELD(v, 4, 4) #define MAD_INTER_CHANNEL_CH_S_SIZE(v) ((u64)GET_BITFIELD(v, 12, 19) << 29) /* Parameters for DRAM decode stage */ -#define MAD_INTRA_CH0_OFFSET 0x5004 +#define MAD_INTRA_CH0_OFFSET (IMC_BASE + 4) #define MAD_INTRA_CH_DIMM_L_MAP(v) GET_BITFIELD(v, 0, 0) /* DIMM characteristics */ -#define MAD_DIMM_CH0_OFFSET 0x500c +#define MAD_DIMM_CH0_OFFSET (IMC_BASE + 0xc) #define MAD_DIMM_CH_DIMM_L_SIZE(v) ((u64)GET_BITFIELD(v, 0, 6) << 29) #define MAD_DIMM_CH_DLW(v) GET_BITFIELD(v, 7, 8) #define MAD_DIMM_CH_DIMM_S_SIZE(v) ((u64)GET_BITFIELD(v, 16, 22) << 29) #define MAD_DIMM_CH_DSW(v) GET_BITFIELD(v, 24, 25) +/* Hash for memory controller selection */ +#define MAD_MC_HASH_OFFSET (IMC_BASE + 0x1b8) +#define MAC_MC_HASH_LSB(v) GET_BITFIELD(v, 1, 3) + /* Hash for channel selection */ -#define CHANNEL_HASH_OFFSET 0X5024 +#define CHANNEL_HASH_OFFSET (IMC_BASE + 0x24) /* Hash for enhanced channel selection */ -#define CHANNEL_EHASH_OFFSET 0X5028 +#define CHANNEL_EHASH_OFFSET (IMC_BASE + 0x28) #define CHANNEL_HASH_MASK(v) (GET_BITFIELD(v, 6, 19) << 6) #define CHANNEL_HASH_LSB_MASK_BIT(v) GET_BITFIELD(v, 24, 26) #define CHANNEL_HASH_MODE(v) GET_BITFIELD(v, 28, 28) +/* Parameters for memory slice decode stage */ +#define MEM_SLICE_HASH_MASK(v) (GET_BITFIELD(v, 6, 19) << 6) +#define MEM_SLICE_HASH_LSB_MASK_BIT(v) GET_BITFIELD(v, 24, 26) + static struct res_config { + bool machine_check; int num_imc; + u32 imc_base; + u32 cmf_base; + u32 cmf_size; + u32 ms_hash_offset; u32 ibecc_base; + u32 ibecc_error_log_offset; bool (*ibecc_available)(struct pci_dev *pdev); /* Convert error address logged in IBECC to system physical address */ - u64 (*err_addr_to_sys_addr)(u64 eaddr); + u64 (*err_addr_to_sys_addr)(u64 eaddr, int mc); /* Convert error address logged in IBECC to integrated memory controller address */ - u64 (*err_addr_to_imc_addr)(u64 eaddr); + u64 (*err_addr_to_imc_addr)(u64 eaddr, int mc); } *res_cfg; struct igen6_imc { @@ -125,6 +145,7 @@ struct igen6_imc { struct pci_dev *pdev; struct device dev; void __iomem *window; + u64 size; u64 ch_s_size; int ch_l_map; u64 dimm_s_size[NUM_CHANNELS]; @@ -134,6 +155,9 @@ struct igen6_imc { static struct igen6_pvt { struct igen6_imc imc[NUM_IMC]; + u64 ms_hash; + u64 ms_s_size; + int ms_l_map; } *igen6_pvt; /* The top of low usable DRAM */ @@ -183,6 +207,21 @@ static struct work_struct ecclog_work; #define DID_EHL_SKU14 0x4534 #define DID_EHL_SKU15 0x4536 +/* Compute die IDs for ICL-NNPI with IBECC */ +#define DID_ICL_SKU8 0x4581 +#define DID_ICL_SKU10 0x4585 +#define DID_ICL_SKU11 0x4589 +#define DID_ICL_SKU12 0x458d + +/* Compute die IDs for Tiger Lake with IBECC */ +#define DID_TGL_SKU 0x9a14 + +/* Compute die IDs for Alder Lake with IBECC */ +#define DID_ADL_SKU1 0x4601 +#define DID_ADL_SKU2 0x4602 +#define DID_ADL_SKU3 0x4621 +#define DID_ADL_SKU4 0x4641 + static bool ehl_ibecc_available(struct pci_dev *pdev) { u32 v; @@ -193,12 +232,12 @@ static bool ehl_ibecc_available(struct pci_dev *pdev) return !!(CAPID_C_IBECC & v); } -static u64 ehl_err_addr_to_sys_addr(u64 eaddr) +static u64 ehl_err_addr_to_sys_addr(u64 eaddr, int mc) { return eaddr; } -static u64 ehl_err_addr_to_imc_addr(u64 eaddr) +static u64 ehl_err_addr_to_imc_addr(u64 eaddr, int mc) { if (eaddr < igen6_tolud) return eaddr; @@ -212,12 +251,156 @@ static u64 ehl_err_addr_to_imc_addr(u64 eaddr) return eaddr; } +static bool icl_ibecc_available(struct pci_dev *pdev) +{ + u32 v; + + if (pci_read_config_dword(pdev, CAPID_C_OFFSET, &v)) + return false; + + return !(CAPID_C_IBECC & v) && + (boot_cpu_data.x86_stepping >= 1); +} + +static bool tgl_ibecc_available(struct pci_dev *pdev) +{ + u32 v; + + if (pci_read_config_dword(pdev, CAPID_E_OFFSET, &v)) + return false; + + return !(CAPID_E_IBECC & v); +} + +static u64 mem_addr_to_sys_addr(u64 maddr) +{ + if (maddr < igen6_tolud) + return maddr; + + if (igen6_tom <= _4GB) + return maddr - igen6_tolud + _4GB; + + if (maddr < _4GB) + return maddr - igen6_tolud + igen6_tom; + + return maddr; +} + +static u64 mem_slice_hash(u64 addr, u64 mask, u64 hash_init, int intlv_bit) +{ + u64 hash_addr = addr & mask, hash = hash_init; + u64 intlv = (addr >> intlv_bit) & 1; + int i; + + for (i = 6; i < 20; i++) + hash ^= (hash_addr >> i) & 1; + + return hash ^ intlv; +} + +static u64 tgl_err_addr_to_mem_addr(u64 eaddr, int mc) +{ + u64 maddr, hash, mask, ms_s_size; + int intlv_bit; + u32 ms_hash; + + ms_s_size = igen6_pvt->ms_s_size; + if (eaddr >= ms_s_size) + return eaddr + ms_s_size; + + ms_hash = igen6_pvt->ms_hash; + + mask = MEM_SLICE_HASH_MASK(ms_hash); + intlv_bit = MEM_SLICE_HASH_LSB_MASK_BIT(ms_hash) + 6; + + maddr = GET_BITFIELD(eaddr, intlv_bit, 63) << (intlv_bit + 1) | + GET_BITFIELD(eaddr, 0, intlv_bit - 1); + + hash = mem_slice_hash(maddr, mask, mc, intlv_bit); + + return maddr | (hash << intlv_bit); +} + +static u64 tgl_err_addr_to_sys_addr(u64 eaddr, int mc) +{ + u64 maddr = tgl_err_addr_to_mem_addr(eaddr, mc); + + return mem_addr_to_sys_addr(maddr); +} + +static u64 tgl_err_addr_to_imc_addr(u64 eaddr, int mc) +{ + return eaddr; +} + +static u64 adl_err_addr_to_sys_addr(u64 eaddr, int mc) +{ + return mem_addr_to_sys_addr(eaddr); +} + +static u64 adl_err_addr_to_imc_addr(u64 eaddr, int mc) +{ + u64 imc_addr, ms_s_size = igen6_pvt->ms_s_size; + struct igen6_imc *imc = &igen6_pvt->imc[mc]; + int intlv_bit; + u32 mc_hash; + + if (eaddr >= 2 * ms_s_size) + return eaddr - ms_s_size; + + mc_hash = readl(imc->window + MAD_MC_HASH_OFFSET); + + intlv_bit = MAC_MC_HASH_LSB(mc_hash) + 6; + + imc_addr = GET_BITFIELD(eaddr, intlv_bit + 1, 63) << intlv_bit | + GET_BITFIELD(eaddr, 0, intlv_bit - 1); + + return imc_addr; +} + static struct res_config ehl_cfg = { - .num_imc = 1, - .ibecc_base = 0xdc00, - .ibecc_available = ehl_ibecc_available, - .err_addr_to_sys_addr = ehl_err_addr_to_sys_addr, - .err_addr_to_imc_addr = ehl_err_addr_to_imc_addr, + .num_imc = 1, + .imc_base = 0x5000, + .ibecc_base = 0xdc00, + .ibecc_available = ehl_ibecc_available, + .ibecc_error_log_offset = 0x170, + .err_addr_to_sys_addr = ehl_err_addr_to_sys_addr, + .err_addr_to_imc_addr = ehl_err_addr_to_imc_addr, +}; + +static struct res_config icl_cfg = { + .num_imc = 1, + .imc_base = 0x5000, + .ibecc_base = 0xd800, + .ibecc_error_log_offset = 0x170, + .ibecc_available = icl_ibecc_available, + .err_addr_to_sys_addr = ehl_err_addr_to_sys_addr, + .err_addr_to_imc_addr = ehl_err_addr_to_imc_addr, +}; + +static struct res_config tgl_cfg = { + .machine_check = true, + .num_imc = 2, + .imc_base = 0x5000, + .cmf_base = 0x11000, + .cmf_size = 0x800, + .ms_hash_offset = 0xac, + .ibecc_base = 0xd400, + .ibecc_error_log_offset = 0x170, + .ibecc_available = tgl_ibecc_available, + .err_addr_to_sys_addr = tgl_err_addr_to_sys_addr, + .err_addr_to_imc_addr = tgl_err_addr_to_imc_addr, +}; + +static struct res_config adl_cfg = { + .machine_check = true, + .num_imc = 2, + .imc_base = 0xd800, + .ibecc_base = 0xd400, + .ibecc_error_log_offset = 0x68, + .ibecc_available = tgl_ibecc_available, + .err_addr_to_sys_addr = adl_err_addr_to_sys_addr, + .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; static const struct pci_device_id igen6_pci_tbl[] = { @@ -232,6 +415,15 @@ static const struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_EHL_SKU13), (kernel_ulong_t)&ehl_cfg }, { PCI_VDEVICE(INTEL, DID_EHL_SKU14), (kernel_ulong_t)&ehl_cfg }, { PCI_VDEVICE(INTEL, DID_EHL_SKU15), (kernel_ulong_t)&ehl_cfg }, + { PCI_VDEVICE(INTEL, DID_ICL_SKU8), (kernel_ulong_t)&icl_cfg }, + { PCI_VDEVICE(INTEL, DID_ICL_SKU10), (kernel_ulong_t)&icl_cfg }, + { PCI_VDEVICE(INTEL, DID_ICL_SKU11), (kernel_ulong_t)&icl_cfg }, + { PCI_VDEVICE(INTEL, DID_ICL_SKU12), (kernel_ulong_t)&icl_cfg }, + { PCI_VDEVICE(INTEL, DID_TGL_SKU), (kernel_ulong_t)&tgl_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_SKU1), (kernel_ulong_t)&adl_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_SKU2), (kernel_ulong_t)&adl_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_SKU3), (kernel_ulong_t)&adl_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_SKU4), (kernel_ulong_t)&adl_cfg }, { }, }; MODULE_DEVICE_TABLE(pci, igen6_pci_tbl); @@ -490,8 +682,8 @@ static void ecclog_work_cb(struct work_struct *work) eaddr = ECC_ERROR_LOG_ADDR(node->ecclog) << ECC_ERROR_LOG_ADDR_SHIFT; res.mc = node->mc; - res.sys_addr = res_cfg->err_addr_to_sys_addr(eaddr); - res.imc_addr = res_cfg->err_addr_to_imc_addr(eaddr); + res.sys_addr = res_cfg->err_addr_to_sys_addr(eaddr, res.mc); + res.imc_addr = res_cfg->err_addr_to_imc_addr(eaddr, res.mc); mci = igen6_pvt->imc[res.mc].mci; @@ -540,6 +732,57 @@ static int ecclog_nmi_handler(unsigned int cmd, struct pt_regs *regs) return NMI_HANDLED; } +static int ecclog_mce_handler(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + char *type; + + if (mce->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; + + /* + * Ignore unless this is a memory related error. + * We don't check the bit MCI_STATUS_ADDRV of MCi_STATUS here, + * since this bit isn't set on some CPU (e.g., Tiger Lake UP3). + */ + if ((mce->status & 0xefff) >> 7 != 1) + return NOTIFY_DONE; + + if (mce->mcgstatus & MCG_STATUS_MCIP) + type = "Exception"; + else + type = "Event"; + + edac_dbg(0, "CPU %d: Machine Check %s: 0x%llx Bank %d: 0x%llx\n", + mce->extcpu, type, mce->mcgstatus, + mce->bank, mce->status); + edac_dbg(0, "TSC 0x%llx\n", mce->tsc); + edac_dbg(0, "ADDR 0x%llx\n", mce->addr); + edac_dbg(0, "MISC 0x%llx\n", mce->misc); + edac_dbg(0, "PROCESSOR %u:0x%x TIME %llu SOCKET %u APIC 0x%x\n", + mce->cpuvendor, mce->cpuid, mce->time, + mce->socketid, mce->apicid); + /* + * We just use the Machine Check for the memory error notification. + * Each memory controller is associated with an IBECC instance. + * Directly read and clear the error information(error address and + * error type) on all the IBECC instances so that we know on which + * memory controller the memory error(s) occurred. + */ + if (!ecclog_handler()) + return NOTIFY_DONE; + + mce->kflags |= MCE_HANDLED_EDAC; + + return NOTIFY_DONE; +} + +static struct notifier_block ecclog_mce_dec = { + .notifier_call = ecclog_mce_handler, + .priority = MCE_PRIO_EDAC, +}; + static bool igen6_check_ecc(struct igen6_imc *imc) { u32 activate = readl(imc->window + IBECC_ACTIVATE_OFFSET); @@ -573,6 +816,8 @@ static int igen6_get_dimm_config(struct mem_ctl_info *mci) imc->dimm_l_size[i] = MAD_DIMM_CH_DIMM_L_SIZE(mad_dimm); imc->dimm_s_size[i] = MAD_DIMM_CH_DIMM_S_SIZE(mad_dimm); imc->dimm_l_map[i] = MAD_INTRA_CH_DIMM_L_MAP(mad_intra); + imc->size += imc->dimm_s_size[i]; + imc->size += imc->dimm_l_size[i]; ndimms = 0; for (j = 0; j < NUM_DIMMS; j++) { @@ -608,6 +853,8 @@ static int igen6_get_dimm_config(struct mem_ctl_info *mci) } } + edac_dbg(0, "MC %d, total size %llu MiB\n", mc, imc->size >> 20); + return 0; } @@ -857,6 +1104,80 @@ static void igen6_unregister_mcis(void) } } +static int igen6_mem_slice_setup(u64 mchbar) +{ + struct igen6_imc *imc = &igen6_pvt->imc[0]; + u64 base = mchbar + res_cfg->cmf_base; + u32 offset = res_cfg->ms_hash_offset; + u32 size = res_cfg->cmf_size; + u64 ms_s_size, ms_hash; + void __iomem *cmf; + int ms_l_map; + + edac_dbg(2, "\n"); + + if (imc[0].size < imc[1].size) { + ms_s_size = imc[0].size; + ms_l_map = 1; + } else { + ms_s_size = imc[1].size; + ms_l_map = 0; + } + + igen6_pvt->ms_s_size = ms_s_size; + igen6_pvt->ms_l_map = ms_l_map; + + edac_dbg(0, "ms_s_size: %llu MiB, ms_l_map %d\n", + ms_s_size >> 20, ms_l_map); + + if (!size) + return 0; + + cmf = ioremap(base, size); + if (!cmf) { + igen6_printk(KERN_ERR, "Failed to ioremap cmf 0x%llx\n", base); + return -ENODEV; + } + + ms_hash = readq(cmf + offset); + igen6_pvt->ms_hash = ms_hash; + + edac_dbg(0, "MEM_SLICE_HASH: 0x%llx\n", ms_hash); + + iounmap(cmf); + + return 0; +} + +static int register_err_handler(void) +{ + int rc; + + if (res_cfg->machine_check) { + mce_register_decode_chain(&ecclog_mce_dec); + return 0; + } + + rc = register_nmi_handler(NMI_SERR, ecclog_nmi_handler, + 0, IGEN6_NMI_NAME); + if (rc) { + igen6_printk(KERN_ERR, "Failed to register NMI handler\n"); + return rc; + } + + return 0; +} + +static void unregister_err_handler(void) +{ + if (res_cfg->machine_check) { + mce_unregister_decode_chain(&ecclog_mce_dec); + return; + } + + unregister_nmi_handler(NMI_SERR, IGEN6_NMI_NAME); +} + static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { u64 mchbar; @@ -880,6 +1201,12 @@ static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto fail2; } + if (res_cfg->num_imc > 1) { + rc = igen6_mem_slice_setup(mchbar); + if (rc) + goto fail2; + } + ecclog_pool = ecclog_gen_pool_create(); if (!ecclog_pool) { rc = -ENOMEM; @@ -892,12 +1219,9 @@ static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Check if any pending errors before registering the NMI handler */ ecclog_handler(); - rc = register_nmi_handler(NMI_SERR, ecclog_nmi_handler, - 0, IGEN6_NMI_NAME); - if (rc) { - igen6_printk(KERN_ERR, "Failed to register NMI handler\n"); + rc = register_err_handler(); + if (rc) goto fail3; - } /* Enable error reporting */ rc = errcmd_enable_error_reporting(true); @@ -925,7 +1249,7 @@ static void igen6_remove(struct pci_dev *pdev) igen6_debug_teardown(); errcmd_enable_error_reporting(false); - unregister_nmi_handler(NMI_SERR, IGEN6_NMI_NAME); + unregister_err_handler(); irq_work_sync(&ecclog_irq_work); flush_work(&ecclog_work); gen_pool_destroy(ecclog_pool); |