From b57c1a1e7effab067a65bab54c5d83a67cffd043 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 4 Jul 2023 18:18:08 +0800 Subject: EDAC/synopsys: Convert to devm_platform_ioremap_resource() Use devm_platform_ioremap_resource() to simplify code. Signed-off-by: Yangtao Li Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Michal Simek Link: https://lore.kernel.org/r/20230704101811.49637-3-frank.li@vivo.com --- drivers/edac/synopsys_edac.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 709babce43ba..5527055b0964 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -1324,11 +1324,9 @@ static int mc_probe(struct platform_device *pdev) struct synps_edac_priv *priv; struct mem_ctl_info *mci; void __iomem *baseaddr; - struct resource *res; int rc; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - baseaddr = devm_ioremap_resource(&pdev->dev, res); + baseaddr = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(baseaddr)) return PTR_ERR(baseaddr); -- cgit v1.2.3 From 3f3174996be6b4312c38f54d5969f5d5b75fec9e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 22 Jan 2024 22:13:59 -0600 Subject: RAS: Introduce AMD Address Translation Library AMD Zen-based systems report memory errors through Machine Check banks representing Unified Memory Controllers (UMCs). The address value reported for DRAM ECC errors is a "normalized address" that is relative to the UMC. This normalized address must be converted to a system physical address to be usable by the OS. Support for this address translation was introduced to the MCA subsystem with Zen1 systems. The code was later moved to the AMD64 EDAC module, since this was the only user of the code at the time. However, there are uses for this translation outside of EDAC. The system physical address can be used in MCA for preemptive page offlining as done in some MCA notifier functions. Also, this translation is needed as the basis of similar functionality needed for some CXL configurations on AMD systems. Introduce a common address translation library that can be used for multiple subsystems including MCA, EDAC, and CXL. Include support for UMC normalized to system physical address translation for current CPU systems. The Data Fabric Indirect register access offsets and one of the register fields were changed. Default to the current offsets and register field definition. And fallback to the older values if running on a "legacy" system. Provide built-in code to facilitate the loading and unloading of the library module without affecting other modules or built-in code. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240123041401.79812-2-yazen.ghannam@amd.com --- MAINTAINERS | 6 + drivers/ras/Kconfig | 1 + drivers/ras/Makefile | 2 + drivers/ras/amd/atl/Kconfig | 20 ++ drivers/ras/amd/atl/Makefile | 18 ++ drivers/ras/amd/atl/access.c | 106 ++++++ drivers/ras/amd/atl/core.c | 225 +++++++++++++ drivers/ras/amd/atl/dehash.c | 407 +++++++++++++++++++++++ drivers/ras/amd/atl/denormalize.c | 617 +++++++++++++++++++++++++++++++++++ drivers/ras/amd/atl/internal.h | 297 +++++++++++++++++ drivers/ras/amd/atl/map.c | 665 ++++++++++++++++++++++++++++++++++++++ drivers/ras/amd/atl/reg_fields.h | 603 ++++++++++++++++++++++++++++++++++ drivers/ras/amd/atl/system.c | 281 ++++++++++++++++ drivers/ras/amd/atl/umc.c | 41 +++ drivers/ras/ras.c | 31 ++ include/linux/ras.h | 16 + 16 files changed, 3336 insertions(+) create mode 100644 drivers/ras/amd/atl/Kconfig create mode 100644 drivers/ras/amd/atl/Makefile create mode 100644 drivers/ras/amd/atl/access.c create mode 100644 drivers/ras/amd/atl/core.c create mode 100644 drivers/ras/amd/atl/dehash.c create mode 100644 drivers/ras/amd/atl/denormalize.c create mode 100644 drivers/ras/amd/atl/internal.h create mode 100644 drivers/ras/amd/atl/map.c create mode 100644 drivers/ras/amd/atl/reg_fields.h create mode 100644 drivers/ras/amd/atl/system.c create mode 100644 drivers/ras/amd/atl/umc.c diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a69..25537a37338e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -897,6 +897,12 @@ Q: https://patchwork.kernel.org/project/linux-rdma/list/ F: drivers/infiniband/hw/efa/ F: include/uapi/rdma/efa-abi.h +AMD ADDRESS TRANSLATION LIBRARY (ATL) +M: Yazen Ghannam +L: linux-edac@vger.kernel.org +S: Supported +F: drivers/ras/amd/atl/* + AMD AXI W1 DRIVER M: Kris Chaplin R: Thomas Delev diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig index c2a236f2e846..2e969f59c0ca 100644 --- a/drivers/ras/Kconfig +++ b/drivers/ras/Kconfig @@ -32,5 +32,6 @@ menuconfig RAS if RAS source "arch/x86/ras/Kconfig" +source "drivers/ras/amd/atl/Kconfig" endif diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile index 6f0404f50107..3fac80f58005 100644 --- a/drivers/ras/Makefile +++ b/drivers/ras/Makefile @@ -2,3 +2,5 @@ obj-$(CONFIG_RAS) += ras.o obj-$(CONFIG_DEBUG_FS) += debugfs.o obj-$(CONFIG_RAS_CEC) += cec.o + +obj-y += amd/atl/ diff --git a/drivers/ras/amd/atl/Kconfig b/drivers/ras/amd/atl/Kconfig new file mode 100644 index 000000000000..a43513a700f1 --- /dev/null +++ b/drivers/ras/amd/atl/Kconfig @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# AMD Address Translation Library Kconfig +# +# Copyright (c) 2023, Advanced Micro Devices, Inc. +# All Rights Reserved. +# +# Author: Yazen Ghannam + +config AMD_ATL + tristate "AMD Address Translation Library" + depends on AMD_NB && X86_64 && RAS + default N + help + This library includes support for implementation-specific + address translation procedures needed for various error + handling cases. + + Enable this option if using DRAM ECC on Zen-based systems + and OS-based error handling. diff --git a/drivers/ras/amd/atl/Makefile b/drivers/ras/amd/atl/Makefile new file mode 100644 index 000000000000..4acd5f05bd9c --- /dev/null +++ b/drivers/ras/amd/atl/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# AMD Address Translation Library Makefile +# +# Copyright (c) 2023, Advanced Micro Devices, Inc. +# All Rights Reserved. +# +# Author: Yazen Ghannam + +amd_atl-y := access.o +amd_atl-y += core.o +amd_atl-y += dehash.o +amd_atl-y += denormalize.o +amd_atl-y += map.o +amd_atl-y += system.o +amd_atl-y += umc.o + +obj-$(CONFIG_AMD_ATL) += amd_atl.o diff --git a/drivers/ras/amd/atl/access.c b/drivers/ras/amd/atl/access.c new file mode 100644 index 000000000000..f6dd87bb2c35 --- /dev/null +++ b/drivers/ras/amd/atl/access.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * access.c : DF Indirect Access functions + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include "internal.h" + +/* Protect the PCI config register pairs used for DF indirect access. */ +static DEFINE_MUTEX(df_indirect_mutex); + +/* + * Data Fabric Indirect Access uses FICAA/FICAD. + * + * Fabric Indirect Configuration Access Address (FICAA): constructed based + * on the device's Instance Id and the PCI function and register offset of + * the desired register. + * + * Fabric Indirect Configuration Access Data (FICAD): there are FICAD + * low and high registers but so far only the low register is needed. + * + * Use Instance Id 0xFF to indicate a broadcast read. + */ +#define DF_BROADCAST 0xFF + +#define DF_FICAA_INST_EN BIT(0) +#define DF_FICAA_REG_NUM GENMASK(10, 1) +#define DF_FICAA_FUNC_NUM GENMASK(13, 11) +#define DF_FICAA_INST_ID GENMASK(23, 16) + +#define DF_FICAA_REG_NUM_LEGACY GENMASK(10, 2) + +static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) +{ + u32 ficaa_addr = 0x8C, ficad_addr = 0xB8; + struct pci_dev *F4; + int err = -ENODEV; + u32 ficaa = 0; + + if (node >= amd_nb_num()) + goto out; + + F4 = node_to_amd_nb(node)->link; + if (!F4) + goto out; + + /* Enable instance-specific access. */ + if (instance_id != DF_BROADCAST) { + ficaa |= FIELD_PREP(DF_FICAA_INST_EN, 1); + ficaa |= FIELD_PREP(DF_FICAA_INST_ID, instance_id); + } + + /* + * The two least-significant bits are masked when inputing the + * register offset to FICAA. + */ + reg >>= 2; + + if (df_cfg.flags.legacy_ficaa) { + ficaa_addr = 0x5C; + ficad_addr = 0x98; + + ficaa |= FIELD_PREP(DF_FICAA_REG_NUM_LEGACY, reg); + } else { + ficaa |= FIELD_PREP(DF_FICAA_REG_NUM, reg); + } + + ficaa |= FIELD_PREP(DF_FICAA_FUNC_NUM, func); + + mutex_lock(&df_indirect_mutex); + + err = pci_write_config_dword(F4, ficaa_addr, ficaa); + if (err) { + pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); + goto out_unlock; + } + + err = pci_read_config_dword(F4, ficad_addr, lo); + if (err) + pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); + + pr_debug("node=%u inst=0x%x func=0x%x reg=0x%x val=0x%x", + node, instance_id, func, reg << 2, *lo); + +out_unlock: + mutex_unlock(&df_indirect_mutex); + +out: + return err; +} + +int df_indirect_read_instance(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) +{ + return __df_indirect_read(node, func, reg, instance_id, lo); +} + +int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo) +{ + return __df_indirect_read(node, func, reg, DF_BROADCAST, lo); +} diff --git a/drivers/ras/amd/atl/core.c b/drivers/ras/amd/atl/core.c new file mode 100644 index 000000000000..6dc4e06305f7 --- /dev/null +++ b/drivers/ras/amd/atl/core.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * core.c : Module init and base translation functions + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include +#include + +#include "internal.h" + +struct df_config df_cfg __read_mostly; + +static int addr_over_limit(struct addr_ctx *ctx) +{ + u64 dram_limit_addr; + + if (df_cfg.rev >= DF4) + dram_limit_addr = FIELD_GET(DF4_DRAM_LIMIT_ADDR, ctx->map.limit); + else + dram_limit_addr = FIELD_GET(DF2_DRAM_LIMIT_ADDR, ctx->map.limit); + + dram_limit_addr <<= DF_DRAM_BASE_LIMIT_LSB; + dram_limit_addr |= GENMASK(DF_DRAM_BASE_LIMIT_LSB - 1, 0); + + /* Is calculated system address above DRAM limit address? */ + if (ctx->ret_addr > dram_limit_addr) { + atl_debug(ctx, "Calculated address (0x%016llx) > DRAM limit (0x%016llx)", + ctx->ret_addr, dram_limit_addr); + return -EINVAL; + } + + return 0; +} + +static bool legacy_hole_en(struct addr_ctx *ctx) +{ + u32 reg = ctx->map.base; + + if (df_cfg.rev >= DF4) + reg = ctx->map.ctl; + + return FIELD_GET(DF_LEGACY_MMIO_HOLE_EN, reg); +} + +static int add_legacy_hole(struct addr_ctx *ctx) +{ + u32 dram_hole_base; + u8 func = 0; + + if (!legacy_hole_en(ctx)) + return 0; + + if (df_cfg.rev >= DF4) + func = 7; + + if (df_indirect_read_broadcast(ctx->node_id, func, 0x104, &dram_hole_base)) + return -EINVAL; + + dram_hole_base &= DF_DRAM_HOLE_BASE_MASK; + + if (ctx->ret_addr >= dram_hole_base) + ctx->ret_addr += (BIT_ULL(32) - dram_hole_base); + + return 0; +} + +static u64 get_base_addr(struct addr_ctx *ctx) +{ + u64 base_addr; + + if (df_cfg.rev >= DF4) + base_addr = FIELD_GET(DF4_BASE_ADDR, ctx->map.base); + else + base_addr = FIELD_GET(DF2_BASE_ADDR, ctx->map.base); + + return base_addr << DF_DRAM_BASE_LIMIT_LSB; +} + +static int add_base_and_hole(struct addr_ctx *ctx) +{ + ctx->ret_addr += get_base_addr(ctx); + + if (add_legacy_hole(ctx)) + return -EINVAL; + + return 0; +} + +static bool late_hole_remove(struct addr_ctx *ctx) +{ + if (df_cfg.rev == DF3p5) + return true; + + if (df_cfg.rev == DF4) + return true; + + if (ctx->map.intlv_mode == DF3_6CHAN) + return true; + + return false; +} + +unsigned long norm_to_sys_addr(u8 socket_id, u8 die_id, u8 coh_st_inst_id, unsigned long addr) +{ + struct addr_ctx ctx; + + if (df_cfg.rev == UNKNOWN) + return -EINVAL; + + memset(&ctx, 0, sizeof(ctx)); + + /* Start from the normalized address */ + ctx.ret_addr = addr; + ctx.inst_id = coh_st_inst_id; + + ctx.inputs.norm_addr = addr; + ctx.inputs.socket_id = socket_id; + ctx.inputs.die_id = die_id; + ctx.inputs.coh_st_inst_id = coh_st_inst_id; + + if (determine_node_id(&ctx, socket_id, die_id)) + return -EINVAL; + + if (get_address_map(&ctx)) + return -EINVAL; + + if (denormalize_address(&ctx)) + return -EINVAL; + + if (!late_hole_remove(&ctx) && add_base_and_hole(&ctx)) + return -EINVAL; + + if (dehash_address(&ctx)) + return -EINVAL; + + if (late_hole_remove(&ctx) && add_base_and_hole(&ctx)) + return -EINVAL; + + if (addr_over_limit(&ctx)) + return -EINVAL; + + return ctx.ret_addr; +} + +static void check_for_legacy_df_access(void) +{ + /* + * All Zen-based systems before Family 19h use the legacy + * DF Indirect Access (FICAA/FICAD) offsets. + */ + if (boot_cpu_data.x86 < 0x19) { + df_cfg.flags.legacy_ficaa = true; + return; + } + + /* All systems after Family 19h use the current offsets. */ + if (boot_cpu_data.x86 > 0x19) + return; + + /* Some Family 19h systems use the legacy offsets. */ + switch (boot_cpu_data.x86_model) { + case 0x00 ... 0x0f: + case 0x20 ... 0x5f: + df_cfg.flags.legacy_ficaa = true; + } +} + +/* + * This library provides functionality for AMD-based systems with a Data Fabric. + * The set of systems with a Data Fabric is equivalent to the set of Zen-based systems + * and the set of systems with the Scalable MCA feature at this time. However, these + * are technically independent things. + * + * It's possible to match on the PCI IDs of the Data Fabric devices, but this will be + * an ever expanding list. Instead, match on the SMCA and Zen features to cover all + * relevant systems. + */ +static const struct x86_cpu_id amd_atl_cpuids[] = { + X86_MATCH_FEATURE(X86_FEATURE_SMCA, NULL), + X86_MATCH_FEATURE(X86_FEATURE_ZEN, NULL), + { } +}; +MODULE_DEVICE_TABLE(x86cpu, amd_atl_cpuids); + +static int __init amd_atl_init(void) +{ + if (!x86_match_cpu(amd_atl_cpuids)) + return -ENODEV; + + if (!amd_nb_num()) + return -ENODEV; + + check_for_legacy_df_access(); + + if (get_df_system_info()) + return -ENODEV; + + /* Increment this module's recount so that it can't be easily unloaded. */ + __module_get(THIS_MODULE); + amd_atl_register_decoder(convert_umc_mca_addr_to_sys_addr); + + pr_info("AMD Address Translation Library initialized"); + return 0; +} + +/* + * Exit function is only needed for testing and debug. Module unload must be + * forced to override refcount check. + */ +static void __exit amd_atl_exit(void) +{ + amd_atl_unregister_decoder(); +} + +module_init(amd_atl_init); +module_exit(amd_atl_exit); + +MODULE_LICENSE("GPL"); diff --git a/drivers/ras/amd/atl/dehash.c b/drivers/ras/amd/atl/dehash.c new file mode 100644 index 000000000000..6f414926e6fe --- /dev/null +++ b/drivers/ras/amd/atl/dehash.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * dehash.c : Functions to account for hashing bits + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include "internal.h" + +/* + * Verify the interleave bits are correct in the different interleaving + * settings. + * + * If @num_intlv_dies and/or @num_intlv_sockets are 1, it means the + * respective interleaving is disabled. + */ +static inline bool map_bits_valid(struct addr_ctx *ctx, u8 bit1, u8 bit2, + u8 num_intlv_dies, u8 num_intlv_sockets) +{ + if (!(ctx->map.intlv_bit_pos == bit1 || ctx->map.intlv_bit_pos == bit2)) { + pr_debug("Invalid interleave bit: %u", ctx->map.intlv_bit_pos); + return false; + } + + if (ctx->map.num_intlv_dies > num_intlv_dies) { + pr_debug("Invalid number of interleave dies: %u", ctx->map.num_intlv_dies); + return false; + } + + if (ctx->map.num_intlv_sockets > num_intlv_sockets) { + pr_debug("Invalid number of interleave sockets: %u", ctx->map.num_intlv_sockets); + return false; + } + + return true; +} + +static int df2_dehash_addr(struct addr_ctx *ctx) +{ + u8 hashed_bit, intlv_bit, intlv_bit_pos; + + if (!map_bits_valid(ctx, 8, 9, 1, 1)) + return -EINVAL; + + intlv_bit_pos = ctx->map.intlv_bit_pos; + intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(12), ctx->ret_addr); + hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr); + hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr); + hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr); + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(intlv_bit_pos); + + return 0; +} + +static int df3_dehash_addr(struct addr_ctx *ctx) +{ + bool hash_ctl_64k, hash_ctl_2M, hash_ctl_1G; + u8 hashed_bit, intlv_bit, intlv_bit_pos; + + if (!map_bits_valid(ctx, 8, 9, 1, 1)) + return -EINVAL; + + hash_ctl_64k = FIELD_GET(DF3_HASH_CTL_64K, ctx->map.ctl); + hash_ctl_2M = FIELD_GET(DF3_HASH_CTL_2M, ctx->map.ctl); + hash_ctl_1G = FIELD_GET(DF3_HASH_CTL_1G, ctx->map.ctl); + + intlv_bit_pos = ctx->map.intlv_bit_pos; + intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(14), ctx->ret_addr); + hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(intlv_bit_pos); + + /* Calculation complete for 2 channels. Continue for 4 and 8 channels. */ + if (ctx->map.intlv_mode == DF3_COD4_2CHAN_HASH) + return 0; + + intlv_bit = FIELD_GET(BIT_ULL(12), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(16), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(12); + + /* Calculation complete for 4 channels. Continue for 8 channels. */ + if (ctx->map.intlv_mode == DF3_COD2_4CHAN_HASH) + return 0; + + intlv_bit = FIELD_GET(BIT_ULL(13), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(17), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(13); + + return 0; +} + +static int df3_6chan_dehash_addr(struct addr_ctx *ctx) +{ + u8 intlv_bit_pos = ctx->map.intlv_bit_pos; + u8 hashed_bit, intlv_bit, num_intlv_bits; + bool hash_ctl_2M, hash_ctl_1G; + + if (ctx->map.intlv_mode != DF3_6CHAN) { + atl_debug_on_bad_intlv_mode(ctx); + return -EINVAL; + } + + num_intlv_bits = ilog2(ctx->map.num_intlv_chan) + 1; + + hash_ctl_2M = FIELD_GET(DF3_HASH_CTL_2M, ctx->map.ctl); + hash_ctl_1G = FIELD_GET(DF3_HASH_CTL_1G, ctx->map.ctl); + + intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= !!(BIT_ULL(intlv_bit_pos + num_intlv_bits) & ctx->ret_addr); + hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(intlv_bit_pos); + + intlv_bit_pos++; + intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(intlv_bit_pos); + + intlv_bit_pos++; + intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(intlv_bit_pos); + + return 0; +} + +static int df4_dehash_addr(struct addr_ctx *ctx) +{ + bool hash_ctl_64k, hash_ctl_2M, hash_ctl_1G; + u8 hashed_bit, intlv_bit; + + if (!map_bits_valid(ctx, 8, 8, 1, 2)) + return -EINVAL; + + hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl); + hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl); + hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl); + + intlv_bit = FIELD_GET(BIT_ULL(8), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(16), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G; + + if (ctx->map.num_intlv_sockets == 1) + hashed_bit ^= FIELD_GET(BIT_ULL(14), ctx->ret_addr); + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(8); + + /* + * Hashing is possible with socket interleaving, so check the total number + * of channels in the system rather than DRAM map interleaving mode. + * + * Calculation complete for 2 channels. Continue for 4, 8, and 16 channels. + */ + if (ctx->map.total_intlv_chan <= 2) + return 0; + + intlv_bit = FIELD_GET(BIT_ULL(12), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(17), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(12); + + /* Calculation complete for 4 channels. Continue for 8 and 16 channels. */ + if (ctx->map.total_intlv_chan <= 4) + return 0; + + intlv_bit = FIELD_GET(BIT_ULL(13), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(13); + + /* Calculation complete for 8 channels. Continue for 16 channels. */ + if (ctx->map.total_intlv_chan <= 8) + return 0; + + intlv_bit = FIELD_GET(BIT_ULL(14), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(19), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(24), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(33), ctx->ret_addr) & hash_ctl_1G; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(14); + + return 0; +} + +static int df4p5_dehash_addr(struct addr_ctx *ctx) +{ + bool hash_ctl_64k, hash_ctl_2M, hash_ctl_1G, hash_ctl_1T; + u8 hashed_bit, intlv_bit; + u64 rehash_vector; + + if (!map_bits_valid(ctx, 8, 8, 1, 2)) + return -EINVAL; + + hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl); + hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl); + hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl); + hash_ctl_1T = FIELD_GET(DF4_HASH_CTL_1T, ctx->map.ctl); + + /* + * Generate a unique address to determine which bits + * need to be dehashed. + * + * Start with a contiguous bitmask for the total + * number of channels starting at bit 8. + * + * Then make a gap in the proper place based on + * interleave mode. + */ + rehash_vector = ctx->map.total_intlv_chan - 1; + rehash_vector <<= 8; + + if (ctx->map.intlv_mode == DF4p5_NPS2_4CHAN_1K_HASH || + ctx->map.intlv_mode == DF4p5_NPS1_8CHAN_1K_HASH || + ctx->map.intlv_mode == DF4p5_NPS1_16CHAN_1K_HASH) + rehash_vector = expand_bits(10, 2, rehash_vector); + else + rehash_vector = expand_bits(9, 3, rehash_vector); + + if (rehash_vector & BIT_ULL(8)) { + intlv_bit = FIELD_GET(BIT_ULL(8), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(16), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G; + hashed_bit ^= FIELD_GET(BIT_ULL(40), ctx->ret_addr) & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(8); + } + + if (rehash_vector & BIT_ULL(9)) { + intlv_bit = FIELD_GET(BIT_ULL(9), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(17), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G; + hashed_bit ^= FIELD_GET(BIT_ULL(41), ctx->ret_addr) & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(9); + } + + if (rehash_vector & BIT_ULL(12)) { + intlv_bit = FIELD_GET(BIT_ULL(12), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G; + hashed_bit ^= FIELD_GET(BIT_ULL(42), ctx->ret_addr) & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(12); + } + + if (rehash_vector & BIT_ULL(13)) { + intlv_bit = FIELD_GET(BIT_ULL(13), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(19), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(24), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(33), ctx->ret_addr) & hash_ctl_1G; + hashed_bit ^= FIELD_GET(BIT_ULL(43), ctx->ret_addr) & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(13); + } + + if (rehash_vector & BIT_ULL(14)) { + intlv_bit = FIELD_GET(BIT_ULL(14), ctx->ret_addr); + + hashed_bit = intlv_bit; + hashed_bit ^= FIELD_GET(BIT_ULL(20), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(25), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(34), ctx->ret_addr) & hash_ctl_1G; + hashed_bit ^= FIELD_GET(BIT_ULL(44), ctx->ret_addr) & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(14); + } + + return 0; +} + +int dehash_address(struct addr_ctx *ctx) +{ + switch (ctx->map.intlv_mode) { + /* No hashing cases. */ + case NONE: + case NOHASH_2CHAN: + case NOHASH_4CHAN: + case NOHASH_8CHAN: + case NOHASH_16CHAN: + case NOHASH_32CHAN: + /* Hashing bits handled earlier during CS ID calculation. */ + case DF4_NPS4_3CHAN_HASH: + case DF4_NPS2_5CHAN_HASH: + case DF4_NPS2_6CHAN_HASH: + case DF4_NPS1_10CHAN_HASH: + case DF4_NPS1_12CHAN_HASH: + case DF4p5_NPS2_6CHAN_1K_HASH: + case DF4p5_NPS2_6CHAN_2K_HASH: + case DF4p5_NPS1_10CHAN_1K_HASH: + case DF4p5_NPS1_10CHAN_2K_HASH: + case DF4p5_NPS1_12CHAN_1K_HASH: + case DF4p5_NPS1_12CHAN_2K_HASH: + case DF4p5_NPS0_24CHAN_1K_HASH: + case DF4p5_NPS0_24CHAN_2K_HASH: + /* No hash physical address bits, so nothing to do. */ + case DF4p5_NPS4_3CHAN_1K_HASH: + case DF4p5_NPS4_3CHAN_2K_HASH: + case DF4p5_NPS2_5CHAN_1K_HASH: + case DF4p5_NPS2_5CHAN_2K_HASH: + return 0; + + case DF2_2CHAN_HASH: + return df2_dehash_addr(ctx); + + case DF3_COD4_2CHAN_HASH: + case DF3_COD2_4CHAN_HASH: + case DF3_COD1_8CHAN_HASH: + return df3_dehash_addr(ctx); + + case DF3_6CHAN: + return df3_6chan_dehash_addr(ctx); + + case DF4_NPS4_2CHAN_HASH: + case DF4_NPS2_4CHAN_HASH: + case DF4_NPS1_8CHAN_HASH: + return df4_dehash_addr(ctx); + + case DF4p5_NPS4_2CHAN_1K_HASH: + case DF4p5_NPS4_2CHAN_2K_HASH: + case DF4p5_NPS2_4CHAN_2K_HASH: + case DF4p5_NPS2_4CHAN_1K_HASH: + case DF4p5_NPS1_8CHAN_1K_HASH: + case DF4p5_NPS1_8CHAN_2K_HASH: + case DF4p5_NPS1_16CHAN_1K_HASH: + case DF4p5_NPS1_16CHAN_2K_HASH: + return df4p5_dehash_addr(ctx); + + default: + atl_debug_on_bad_intlv_mode(ctx); + return -EINVAL; + } +} diff --git a/drivers/ras/amd/atl/denormalize.c b/drivers/ras/amd/atl/denormalize.c new file mode 100644 index 000000000000..01f1d0fb6799 --- /dev/null +++ b/drivers/ras/amd/atl/denormalize.c @@ -0,0 +1,617 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * denormalize.c : Functions to account for interleaving bits + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include "internal.h" + +/* + * Returns the Destination Fabric ID. This is the first (lowest) + * COH_ST Fabric ID used within a DRAM Address map. + */ +static u16 get_dst_fabric_id(struct addr_ctx *ctx) +{ + switch (df_cfg.rev) { + case DF2: return FIELD_GET(DF2_DST_FABRIC_ID, ctx->map.limit); + case DF3: return FIELD_GET(DF3_DST_FABRIC_ID, ctx->map.limit); + case DF3p5: return FIELD_GET(DF3p5_DST_FABRIC_ID, ctx->map.limit); + case DF4: return FIELD_GET(DF4_DST_FABRIC_ID, ctx->map.ctl); + case DF4p5: return FIELD_GET(DF4p5_DST_FABRIC_ID, ctx->map.ctl); + default: + atl_debug_on_bad_df_rev(); + return 0; + } +} + +/* + * Make a contiguous gap in address for N bits starting at bit P. + * + * Example: + * address bits: [20:0] + * # of interleave bits (n): 3 + * starting interleave bit (p): 8 + * + * expanded address bits: [20+n : n+p][n+p-1 : p][p-1 : 0] + * [23 : 11][10 : 8][7 : 0] + */ +static u64 make_space_for_coh_st_id_at_intlv_bit(struct addr_ctx *ctx) +{ + return expand_bits(ctx->map.intlv_bit_pos, + ctx->map.total_intlv_bits, + ctx->ret_addr); +} + +/* + * Make two gaps in address for N bits. + * First gap is a single bit at bit P. + * Second gap is the remaining N-1 bits at bit 12. + * + * Example: + * address bits: [20:0] + * # of interleave bits (n): 3 + * starting interleave bit (p): 8 + * + * First gap + * expanded address bits: [20+1 : p+1][p][p-1 : 0] + * [21 : 9][8][7 : 0] + * + * Second gap uses result from first. + * r = n - 1; remaining interleave bits + * expanded address bits: [21+r : 12+r][12+r-1: 12][11 : 0] + * [23 : 14][13 : 12][11 : 0] + */ +static u64 make_space_for_coh_st_id_split_2_1(struct addr_ctx *ctx) +{ + /* Make a single space at the interleave bit. */ + u64 denorm_addr = expand_bits(ctx->map.intlv_bit_pos, 1, ctx->ret_addr); + + /* Done if there's only a single interleave bit. */ + if (ctx->map.total_intlv_bits <= 1) + return denorm_addr; + + /* Make spaces for the remaining interleave bits starting at bit 12. */ + return expand_bits(12, ctx->map.total_intlv_bits - 1, denorm_addr); +} + +/* + * Take the current calculated address and shift enough bits in the middle + * to make a gap where the interleave bits will be inserted. + */ +static u64 make_space_for_coh_st_id(struct addr_ctx *ctx) +{ + switch (ctx->map.intlv_mode) { + case NOHASH_2CHAN: + case NOHASH_4CHAN: + case NOHASH_8CHAN: + case NOHASH_16CHAN: + case NOHASH_32CHAN: + case DF2_2CHAN_HASH: + return make_space_for_coh_st_id_at_intlv_bit(ctx); + + case DF3_COD4_2CHAN_HASH: + case DF3_COD2_4CHAN_HASH: + case DF3_COD1_8CHAN_HASH: + case DF4_NPS4_2CHAN_HASH: + case DF4_NPS2_4CHAN_HASH: + case DF4_NPS1_8CHAN_HASH: + case DF4p5_NPS4_2CHAN_1K_HASH: + case DF4p5_NPS4_2CHAN_2K_HASH: + case DF4p5_NPS2_4CHAN_2K_HASH: + case DF4p5_NPS1_8CHAN_2K_HASH: + case DF4p5_NPS1_16CHAN_2K_HASH: + return make_space_for_coh_st_id_split_2_1(ctx); + default: + atl_debug_on_bad_intlv_mode(ctx); + return ~0ULL; + } +} + +static u16 get_coh_st_id_df2(struct addr_ctx *ctx) +{ + u8 num_socket_intlv_bits = ilog2(ctx->map.num_intlv_sockets); + u8 num_die_intlv_bits = ilog2(ctx->map.num_intlv_dies); + u8 num_intlv_bits; + u16 coh_st_id, mask; + + coh_st_id = ctx->coh_st_fabric_id - get_dst_fabric_id(ctx); + + /* Channel interleave bits */ + num_intlv_bits = order_base_2(ctx->map.num_intlv_chan); + mask = GENMASK(num_intlv_bits - 1, 0); + coh_st_id &= mask; + + /* Die interleave bits */ + if (num_die_intlv_bits) { + u16 die_bits; + + mask = GENMASK(num_die_intlv_bits - 1, 0); + die_bits = ctx->coh_st_fabric_id & df_cfg.die_id_mask; + die_bits >>= df_cfg.die_id_shift; + + coh_st_id |= (die_bits & mask) << num_intlv_bits; + num_intlv_bits += num_die_intlv_bits; + } + + /* Socket interleave bits */ + if (num_socket_intlv_bits) { + u16 socket_bits; + + mask = GENMASK(num_socket_intlv_bits - 1, 0); + socket_bits = ctx->coh_st_fabric_id & df_cfg.socket_id_mask; + socket_bits >>= df_cfg.socket_id_shift; + + coh_st_id |= (socket_bits & mask) << num_intlv_bits; + } + + return coh_st_id; +} + +static u16 get_coh_st_id_df4(struct addr_ctx *ctx) +{ + /* + * Start with the original component mask and the number of interleave + * bits for the channels in this map. + */ + u8 num_intlv_bits = ilog2(ctx->map.num_intlv_chan); + u16 mask = df_cfg.component_id_mask; + + u16 socket_bits; + + /* Set the derived Coherent Station ID to the input Coherent Station Fabric ID. */ + u16 coh_st_id = ctx->coh_st_fabric_id & mask; + + /* + * Subtract the "base" Destination Fabric ID. + * This accounts for systems with disabled Coherent Stations. + */ + coh_st_id -= get_dst_fabric_id(ctx) & mask; + + /* + * Generate and use a new mask based on the number of bits + * needed for channel interleaving in this map. + */ + mask = GENMASK(num_intlv_bits - 1, 0); + coh_st_id &= mask; + + /* Done if socket interleaving is not enabled. */ + if (ctx->map.num_intlv_sockets <= 1) + return coh_st_id; + + /* + * Figure out how many bits are needed for the number of + * interleaved sockets. And shift the derived Coherent Station ID to account + * for these. + */ + num_intlv_bits = ilog2(ctx->map.num_intlv_sockets); + coh_st_id <<= num_intlv_bits; + + /* Generate a new mask for the socket interleaving bits. */ + mask = GENMASK(num_intlv_bits - 1, 0); + + /* Get the socket interleave bits from the original Coherent Station Fabric ID. */ + socket_bits = (ctx->coh_st_fabric_id & df_cfg.socket_id_mask) >> df_cfg.socket_id_shift; + + /* Apply the appropriate socket bits to the derived Coherent Station ID. */ + coh_st_id |= socket_bits & mask; + + return coh_st_id; +} + +/* + * Derive the correct Coherent Station ID that represents the interleave bits + * used within the system physical address. This accounts for the + * interleave mode, number of interleaved channels/dies/sockets, and + * other system/mode-specific bit swizzling. + * + * Returns: Coherent Station ID on success. + * All bits set on error. + */ +static u16 calculate_coh_st_id(struct addr_ctx *ctx) +{ + switch (ctx->map.intlv_mode) { + case NOHASH_2CHAN: + case NOHASH_4CHAN: + case NOHASH_8CHAN: + case NOHASH_16CHAN: + case NOHASH_32CHAN: + case DF3_COD4_2CHAN_HASH: + case DF3_COD2_4CHAN_HASH: + case DF3_COD1_8CHAN_HASH: + case DF2_2CHAN_HASH: + return get_coh_st_id_df2(ctx); + + case DF4_NPS4_2CHAN_HASH: + case DF4_NPS2_4CHAN_HASH: + case DF4_NPS1_8CHAN_HASH: + case DF4p5_NPS4_2CHAN_1K_HASH: + case DF4p5_NPS4_2CHAN_2K_HASH: + case DF4p5_NPS2_4CHAN_2K_HASH: + case DF4p5_NPS1_8CHAN_2K_HASH: + case DF4p5_NPS1_16CHAN_2K_HASH: + return get_coh_st_id_df4(ctx); + + /* COH_ST ID is simply the COH_ST Fabric ID adjusted by the Destination Fabric ID. */ + case DF4p5_NPS2_4CHAN_1K_HASH: + case DF4p5_NPS1_8CHAN_1K_HASH: + case DF4p5_NPS1_16CHAN_1K_HASH: + return ctx->coh_st_fabric_id - get_dst_fabric_id(ctx); + + default: + atl_debug_on_bad_intlv_mode(ctx); + return ~0; + } +} + +static u64 insert_coh_st_id_at_intlv_bit(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id) +{ + return denorm_addr | (coh_st_id << ctx->map.intlv_bit_pos); +} + +static u64 insert_coh_st_id_split_2_1(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id) +{ + /* Insert coh_st_id[0] at the interleave bit. */ + denorm_addr |= (coh_st_id & BIT(0)) << ctx->map.intlv_bit_pos; + + /* Insert coh_st_id[2:1] at bit 12. */ + denorm_addr |= (coh_st_id & GENMASK(2, 1)) << 11; + + return denorm_addr; +} + +static u64 insert_coh_st_id_split_2_2(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id) +{ + /* Insert coh_st_id[1:0] at bit 8. */ + denorm_addr |= (coh_st_id & GENMASK(1, 0)) << 8; + + /* + * Insert coh_st_id[n:2] at bit 12. 'n' could be 2 or 3. + * Grab both because bit 3 will be clear if unused. + */ + denorm_addr |= (coh_st_id & GENMASK(3, 2)) << 10; + + return denorm_addr; +} + +static u64 insert_coh_st_id(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id) +{ + switch (ctx->map.intlv_mode) { + case NOHASH_2CHAN: + case NOHASH_4CHAN: + case NOHASH_8CHAN: + case NOHASH_16CHAN: + case NOHASH_32CHAN: + case DF2_2CHAN_HASH: + return insert_coh_st_id_at_intlv_bit(ctx, denorm_addr, coh_st_id); + + case DF3_COD4_2CHAN_HASH: + case DF3_COD2_4CHAN_HASH: + case DF3_COD1_8CHAN_HASH: + case DF4_NPS4_2CHAN_HASH: + case DF4_NPS2_4CHAN_HASH: + case DF4_NPS1_8CHAN_HASH: + case DF4p5_NPS4_2CHAN_1K_HASH: + case DF4p5_NPS4_2CHAN_2K_HASH: + case DF4p5_NPS2_4CHAN_2K_HASH: + case DF4p5_NPS1_8CHAN_2K_HASH: + case DF4p5_NPS1_16CHAN_2K_HASH: + return insert_coh_st_id_split_2_1(ctx, denorm_addr, coh_st_id); + + case DF4p5_NPS2_4CHAN_1K_HASH: + case DF4p5_NPS1_8CHAN_1K_HASH: + case DF4p5_NPS1_16CHAN_1K_HASH: + return insert_coh_st_id_split_2_2(ctx, denorm_addr, coh_st_id); + + default: + atl_debug_on_bad_intlv_mode(ctx); + return ~0ULL; + } +} + +static u16 get_logical_coh_st_fabric_id(struct addr_ctx *ctx) +{ + u16 component_id, log_fabric_id; + + /* Start with the physical COH_ST Fabric ID. */ + u16 phys_fabric_id = ctx->coh_st_fabric_id; + + /* Skip logical ID lookup if remapping is disabled. */ + if (!FIELD_GET(DF4_REMAP_EN, ctx->map.ctl) && + ctx->map.intlv_mode != DF3_6CHAN) + return phys_fabric_id; + + /* Mask off the Node ID bits to get the "local" Component ID. */ + component_id = phys_fabric_id & df_cfg.component_id_mask; + + /* + * Search the list of logical Component IDs for the one that + * matches this physical Component ID. + */ + for (log_fabric_id = 0; log_fabric_id < MAX_COH_ST_CHANNELS; log_fabric_id++) { + if (ctx->map.remap_array[log_fabric_id] == component_id) + break; + } + + if (log_fabric_id == MAX_COH_ST_CHANNELS) + atl_debug(ctx, "COH_ST remap entry not found for 0x%x", + log_fabric_id); + + /* Get the Node ID bits from the physical and apply to the logical. */ + return (phys_fabric_id & df_cfg.node_id_mask) | log_fabric_id; +} + +static int denorm_addr_common(struct addr_ctx *ctx) +{ + u64 denorm_addr; + u16 coh_st_id; + + /* + * Convert the original physical COH_ST Fabric ID to a logical value. + * This is required for non-power-of-two and other interleaving modes. + */ + ctx->coh_st_fabric_id = get_logical_coh_st_fabric_id(ctx); + + denorm_addr = make_space_for_coh_st_id(ctx); + coh_st_id = calculate_coh_st_id(ctx); + ctx->ret_addr = insert_coh_st_id(ctx, denorm_addr, coh_st_id); + return 0; +} + +static int denorm_addr_df3_6chan(struct addr_ctx *ctx) +{ + u16 coh_st_id = ctx->coh_st_fabric_id & df_cfg.component_id_mask; + u8 total_intlv_bits = ctx->map.total_intlv_bits; + u8 low_bit, intlv_bit = ctx->map.intlv_bit_pos; + u64 msb_intlv_bits, temp_addr_a, temp_addr_b; + u8 np2_bits = ctx->map.np2_bits; + + if (ctx->map.intlv_mode != DF3_6CHAN) + return -EINVAL; + + /* + * 'np2_bits' holds the number of bits needed to cover the + * amount of memory (rounded up) in this map using 64K chunks. + * + * Example: + * Total memory in map: 6GB + * Rounded up to next power-of-2: 8GB + * Number of 64K chunks: 0x20000 + * np2_bits = log2(# of chunks): 17 + * + * Get the two most-significant interleave bits from the + * input address based on the following: + * + * [15 + np2_bits - total_intlv_bits : 14 + np2_bits - total_intlv_bits] + */ + low_bit = 14 + np2_bits - total_intlv_bits; + msb_intlv_bits = ctx->ret_addr >> low_bit; + msb_intlv_bits &= 0x3; + + /* + * If MSB are 11b, then logical COH_ST ID is 6 or 7. + * Need to adjust based on the mod3 result. + */ + if (msb_intlv_bits == 3) { + u8 addr_mod, phys_addr_msb, msb_coh_st_id; + + /* Get the remaining interleave bits from the input address. */ + temp_addr_b = GENMASK_ULL(low_bit - 1, intlv_bit) & ctx->ret_addr; + temp_addr_b >>= intlv_bit; + + /* Calculate the logical COH_ST offset based on mod3. */ + addr_mod = temp_addr_b % 3; + + /* Get COH_ST ID bits [2:1]. */ + msb_coh_st_id = (coh_st_id >> 1) & 0x3; + + /* Get the bit that starts the physical address bits. */ + phys_addr_msb = (intlv_bit + np2_bits + 1); + phys_addr_msb &= BIT(0); + phys_addr_msb++; + phys_addr_msb *= 3 - addr_mod + msb_coh_st_id; + phys_addr_msb %= 3; + + /* Move the physical address MSB to the correct place. */ + temp_addr_b |= phys_addr_msb << (low_bit - total_intlv_bits - intlv_bit); + + /* Generate a new COH_ST ID as follows: coh_st_id = [1, 1, coh_st_id[0]] */ + coh_st_id &= BIT(0); + coh_st_id |= GENMASK(2, 1); + } else { + temp_addr_b = GENMASK_ULL(63, intlv_bit) & ctx->ret_addr; + temp_addr_b >>= intlv_bit; + } + + temp_addr_a = GENMASK_ULL(intlv_bit - 1, 0) & ctx->ret_addr; + temp_addr_b <<= intlv_bit + total_intlv_bits; + + ctx->ret_addr = temp_addr_a | temp_addr_b; + ctx->ret_addr |= coh_st_id << intlv_bit; + return 0; +} + +static int denorm_addr_df4_np2(struct addr_ctx *ctx) +{ + bool hash_ctl_64k, hash_ctl_2M, hash_ctl_1G; + u16 group, group_offset, log_coh_st_offset; + unsigned int mod_value, shift_value; + u16 mask = df_cfg.component_id_mask; + u64 temp_addr_a, temp_addr_b; + u8 hash_pa8, hashed_bit; + + switch (ctx->map.intlv_mode) { + case DF4_NPS4_3CHAN_HASH: + mod_value = 3; + shift_value = 13; + break; + case DF4_NPS2_6CHAN_HASH: + mod_value = 3; + shift_value = 12; + break; + case DF4_NPS1_12CHAN_HASH: + mod_value = 3; + shift_value = 11; + break; + case DF4_NPS2_5CHAN_HASH: + mod_value = 5; + shift_value = 13; + break; + case DF4_NPS1_10CHAN_HASH: + mod_value = 5; + shift_value = 12; + break; + default: + atl_debug_on_bad_intlv_mode(ctx); + return -EINVAL; + }; + + if (ctx->map.num_intlv_sockets == 1) { + hash_pa8 = BIT_ULL(shift_value) & ctx->ret_addr; + temp_addr_a = remove_bits(shift_value, shift_value, ctx->ret_addr); + } else { + hash_pa8 = (ctx->coh_st_fabric_id & df_cfg.socket_id_mask); + hash_pa8 >>= df_cfg.socket_id_shift; + temp_addr_a = ctx->ret_addr; + } + + /* Make a gap for the real bit [8]. */ + temp_addr_a = expand_bits(8, 1, temp_addr_a); + + /* Make an additional gap for bits [13:12], as appropriate.*/ + if (ctx->map.intlv_mode == DF4_NPS2_6CHAN_HASH || + ctx->map.intlv_mode == DF4_NPS1_10CHAN_HASH) { + temp_addr_a = expand_bits(13, 1, temp_addr_a); + } else if (ctx->map.intlv_mode == DF4_NPS1_12CHAN_HASH) { + temp_addr_a = expand_bits(12, 2, temp_addr_a); + } + + /* Keep bits [13:0]. */ + temp_addr_a &= GENMASK_ULL(13, 0); + + /* Get the appropriate high bits. */ + shift_value += 1 - ilog2(ctx->map.num_intlv_sockets); + temp_addr_b = GENMASK_ULL(63, shift_value) & ctx->ret_addr; + temp_addr_b >>= shift_value; + temp_addr_b *= mod_value; + + /* + * Coherent Stations are divided into groups. + * + * Multiples of 3 (mod3) are divided into quadrants. + * e.g. NP4_3CHAN -> [0, 1, 2] [6, 7, 8] + * [3, 4, 5] [9, 10, 11] + * + * Multiples of 5 (mod5) are divided into sides. + * e.g. NP2_5CHAN -> [0, 1, 2, 3, 4] [5, 6, 7, 8, 9] + */ + + /* + * Calculate the logical offset for the COH_ST within its DRAM Address map. + * e.g. if map includes [5, 6, 7, 8, 9] and target instance is '8', then + * log_coh_st_offset = 8 - 5 = 3 + */ + log_coh_st_offset = (ctx->coh_st_fabric_id & mask) - (get_dst_fabric_id(ctx) & mask); + + /* + * Figure out the group number. + * + * Following above example, + * log_coh_st_offset = 3 + * mod_value = 5 + * group = 3 / 5 = 0 + */ + group = log_coh_st_offset / mod_value; + + /* + * Figure out the offset within the group. + * + * Following above example, + * log_coh_st_offset = 3 + * mod_value = 5 + * group_offset = 3 % 5 = 3 + */ + group_offset = log_coh_st_offset % mod_value; + + /* Adjust group_offset if the hashed bit [8] is set. */ + if (hash_pa8) { + if (!group_offset) + group_offset = mod_value - 1; + else + group_offset--; + } + + /* Add in the group offset to the high bits. */ + temp_addr_b += group_offset; + + /* Shift the high bits to the proper starting position. */ + temp_addr_b <<= 14; + + /* Combine the high and low bits together. */ + ctx->ret_addr = temp_addr_a | temp_addr_b; + + /* Account for hashing here instead of in dehash_address(). */ + hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl); + hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl); + hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl); + + hashed_bit = !!hash_pa8; + hashed_bit ^= FIELD_GET(BIT_ULL(14), ctx->ret_addr); + hashed_bit ^= FIELD_GET(BIT_ULL(16), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G; + + ctx->ret_addr |= hashed_bit << 8; + + /* Done for 3 and 5 channel. */ + if (ctx->map.intlv_mode == DF4_NPS4_3CHAN_HASH || + ctx->map.intlv_mode == DF4_NPS2_5CHAN_HASH) + return 0; + + /* Select the proper 'group' bit to use for Bit 13. */ + if (ctx->map.intlv_mode == DF4_NPS1_12CHAN_HASH) + hashed_bit = !!(group & BIT(1)); + else + hashed_bit = group & BIT(0); + + hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G; + + ctx->ret_addr |= hashed_bit << 13; + + /* Done for 6 and 10 channel. */ + if (ctx->map.intlv_mode != DF4_NPS1_12CHAN_HASH) + return 0; + + hashed_bit = group & BIT(0); + hashed_bit ^= FIELD_GET(BIT_ULL(17), ctx->ret_addr) & hash_ctl_64k; + hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M; + hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G; + + ctx->ret_addr |= hashed_bit << 12; + return 0; +} + +int denormalize_address(struct addr_ctx *ctx) +{ + switch (ctx->map.intlv_mode) { + case NONE: + return 0; + case DF4_NPS4_3CHAN_HASH: + case DF4_NPS2_6CHAN_HASH: + case DF4_NPS1_12CHAN_HASH: + case DF4_NPS2_5CHAN_HASH: + case DF4_NPS1_10CHAN_HASH: + return denorm_addr_df4_np2(ctx); + case DF3_6CHAN: + return denorm_addr_df3_6chan(ctx); + default: + return denorm_addr_common(ctx); + } +} diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h new file mode 100644 index 000000000000..13f1b6098c96 --- /dev/null +++ b/drivers/ras/amd/atl/internal.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD Address Translation Library + * + * internal.h : Helper functions and common defines + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#ifndef __AMD_ATL_INTERNAL_H__ +#define __AMD_ATL_INTERNAL_H__ + +#include +#include +#include + +#include + +#include "reg_fields.h" + +/* Maximum possible number of Coherent Stations within a single Data Fabric. */ +#define MAX_COH_ST_CHANNELS 32 + +/* PCI ID for Zen4 Server DF Function 0. */ +#define DF_FUNC0_ID_ZEN4_SERVER 0x14AD1022 + +/* Shift needed for adjusting register values to true values. */ +#define DF_DRAM_BASE_LIMIT_LSB 28 + +enum df_revisions { + UNKNOWN, + DF2, + DF3, + DF3p5, + DF4, + DF4p5, +}; + +/* These are mapped 1:1 to the hardware values. Special cases are set at > 0x20. */ +enum intlv_modes { + NONE = 0x00, + NOHASH_2CHAN = 0x01, + NOHASH_4CHAN = 0x03, + NOHASH_8CHAN = 0x05, + DF3_6CHAN = 0x06, + NOHASH_16CHAN = 0x07, + NOHASH_32CHAN = 0x08, + DF3_COD4_2CHAN_HASH = 0x0C, + DF3_COD2_4CHAN_HASH = 0x0D, + DF3_COD1_8CHAN_HASH = 0x0E, + DF4_NPS4_2CHAN_HASH = 0x10, + DF4_NPS2_4CHAN_HASH = 0x11, + DF4_NPS1_8CHAN_HASH = 0x12, + DF4_NPS4_3CHAN_HASH = 0x13, + DF4_NPS2_6CHAN_HASH = 0x14, + DF4_NPS1_12CHAN_HASH = 0x15, + DF4_NPS2_5CHAN_HASH = 0x16, + DF4_NPS1_10CHAN_HASH = 0x17, + DF2_2CHAN_HASH = 0x21, + /* DF4.5 modes are all IntLvNumChan + 0x20 */ + DF4p5_NPS1_16CHAN_1K_HASH = 0x2C, + DF4p5_NPS0_24CHAN_1K_HASH = 0x2E, + DF4p5_NPS4_2CHAN_1K_HASH = 0x30, + DF4p5_NPS2_4CHAN_1K_HASH = 0x31, + DF4p5_NPS1_8CHAN_1K_HASH = 0x32, + DF4p5_NPS4_3CHAN_1K_HASH = 0x33, + DF4p5_NPS2_6CHAN_1K_HASH = 0x34, + DF4p5_NPS1_12CHAN_1K_HASH = 0x35, + DF4p5_NPS2_5CHAN_1K_HASH = 0x36, + DF4p5_NPS1_10CHAN_1K_HASH = 0x37, + DF4p5_NPS4_2CHAN_2K_HASH = 0x40, + DF4p5_NPS2_4CHAN_2K_HASH = 0x41, + DF4p5_NPS1_8CHAN_2K_HASH = 0x42, + DF4p5_NPS1_16CHAN_2K_HASH = 0x43, + DF4p5_NPS4_3CHAN_2K_HASH = 0x44, + DF4p5_NPS2_6CHAN_2K_HASH = 0x45, + DF4p5_NPS1_12CHAN_2K_HASH = 0x46, + DF4p5_NPS0_24CHAN_2K_HASH = 0x47, + DF4p5_NPS2_5CHAN_2K_HASH = 0x48, + DF4p5_NPS1_10CHAN_2K_HASH = 0x49, +}; + +struct df_flags { + __u8 legacy_ficaa : 1, + socket_id_shift_quirk : 1, + __reserved_0 : 6; +}; + +struct df_config { + enum df_revisions rev; + + /* + * These masks operate on the 16-bit Coherent Station IDs, + * e.g. Instance, Fabric, Destination, etc. + */ + u16 component_id_mask; + u16 die_id_mask; + u16 node_id_mask; + u16 socket_id_mask; + + /* + * Least-significant bit of Node ID portion of the + * system-wide Coherent Station Fabric ID. + */ + u8 node_id_shift; + + /* + * Least-significant bit of Die portion of the Node ID. + * Adjusted to include the Node ID shift in order to apply + * to the Coherent Station Fabric ID. + */ + u8 die_id_shift; + + /* + * Least-significant bit of Socket portion of the Node ID. + * Adjusted to include the Node ID shift in order to apply + * to the Coherent Station Fabric ID. + */ + u8 socket_id_shift; + + /* Number of DRAM Address maps visible in a Coherent Station. */ + u8 num_coh_st_maps; + + /* Global flags to handle special cases. */ + struct df_flags flags; +}; + +extern struct df_config df_cfg; + +struct dram_addr_map { + /* + * Each DRAM Address Map can operate independently + * in different interleaving modes. + */ + enum intlv_modes intlv_mode; + + /* System-wide number for this address map. */ + u8 num; + + /* Raw register values */ + u32 base; + u32 limit; + u32 ctl; + u32 intlv; + + /* + * Logical to Physical Coherent Station Remapping array + * + * Index: Logical Coherent Station Instance ID + * Value: Physical Coherent Station Instance ID + * + * phys_coh_st_inst_id = remap_array[log_coh_st_inst_id] + */ + u8 remap_array[MAX_COH_ST_CHANNELS]; + + /* + * Number of bits covering DRAM Address map 0 + * when interleaving is non-power-of-2. + * + * Used only for DF3_6CHAN. + */ + u8 np2_bits; + + /* Position of the 'interleave bit'. */ + u8 intlv_bit_pos; + /* Number of channels interleaved in this map. */ + u8 num_intlv_chan; + /* Number of dies interleaved in this map. */ + u8 num_intlv_dies; + /* Number of sockets interleaved in this map. */ + u8 num_intlv_sockets; + /* + * Total number of channels interleaved accounting + * for die and socket interleaving. + */ + u8 total_intlv_chan; + /* Total bits needed to cover 'total_intlv_chan'. */ + u8 total_intlv_bits; +}; + +/* Original input values cached for debug printing. */ +struct addr_ctx_inputs { + u64 norm_addr; + u8 socket_id; + u8 die_id; + u8 coh_st_inst_id; +}; + +struct addr_ctx { + u64 ret_addr; + + struct addr_ctx_inputs inputs; + struct dram_addr_map map; + + /* AMD Node ID calculated from Socket and Die IDs. */ + u8 node_id; + + /* + * Coherent Station Instance ID + * Local ID used within a 'node'. + */ + u16 inst_id; + + /* + * Coherent Station Fabric ID + * System-wide ID that includes 'node' bits. + */ + u16 coh_st_fabric_id; +}; + +int df_indirect_read_instance(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo); +int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo); + +int get_df_system_info(void); +int determine_node_id(struct addr_ctx *ctx, u8 socket_num, u8 die_num); + +int get_address_map(struct addr_ctx *ctx); + +int denormalize_address(struct addr_ctx *ctx); +int dehash_address(struct addr_ctx *ctx); + +unsigned long norm_to_sys_addr(u8 socket_id, u8 die_id, u8 coh_st_inst_id, unsigned long addr); +unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err); + +/* + * Make a gap in @data that is @num_bits long starting at @bit_num. + * e.g. data = 11111111'b + * bit_num = 3 + * num_bits = 2 + * result = 1111100111'b + */ +static inline u64 expand_bits(u8 bit_num, u8 num_bits, u64 data) +{ + u64 temp1, temp2; + + if (!num_bits) + return data; + + if (!bit_num) { + WARN_ON_ONCE(num_bits >= BITS_PER_LONG); + return data << num_bits; + } + + WARN_ON_ONCE(bit_num >= BITS_PER_LONG); + + temp1 = data & GENMASK_ULL(bit_num - 1, 0); + + temp2 = data & GENMASK_ULL(63, bit_num); + temp2 <<= num_bits; + + return temp1 | temp2; +} + +/* + * Remove bits in @data between @low_bit and @high_bit inclusive. + * e.g. data = XXXYYZZZ'b + * low_bit = 3 + * high_bit = 4 + * result = XXXZZZ'b + */ +static inline u64 remove_bits(u8 low_bit, u8 high_bit, u64 data) +{ + u64 temp1, temp2; + + WARN_ON_ONCE(high_bit >= BITS_PER_LONG); + WARN_ON_ONCE(low_bit >= BITS_PER_LONG); + WARN_ON_ONCE(low_bit > high_bit); + + if (!low_bit) + return data >> (high_bit++); + + temp1 = GENMASK_ULL(low_bit - 1, 0) & data; + temp2 = GENMASK_ULL(63, high_bit + 1) & data; + temp2 >>= high_bit - low_bit + 1; + + return temp1 | temp2; +} + +#define atl_debug(ctx, fmt, arg...) \ + pr_debug("socket_id=%u die_id=%u coh_st_inst_id=%u norm_addr=0x%016llx: " fmt,\ + (ctx)->inputs.socket_id, (ctx)->inputs.die_id,\ + (ctx)->inputs.coh_st_inst_id, (ctx)->inputs.norm_addr, ##arg) + +static inline void atl_debug_on_bad_df_rev(void) +{ + pr_debug("Unrecognized DF rev: %u", df_cfg.rev); +} + +static inline void atl_debug_on_bad_intlv_mode(struct addr_ctx *ctx) +{ + atl_debug(ctx, "Unrecognized interleave mode: %u", ctx->map.intlv_mode); +} + +#endif /* __AMD_ATL_INTERNAL_H__ */ diff --git a/drivers/ras/amd/atl/map.c b/drivers/ras/amd/atl/map.c new file mode 100644 index 000000000000..33f549b6255a --- /dev/null +++ b/drivers/ras/amd/atl/map.c @@ -0,0 +1,665 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * map.c : Functions to read and decode DRAM address maps + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include "internal.h" + +static int df2_get_intlv_mode(struct addr_ctx *ctx) +{ + ctx->map.intlv_mode = FIELD_GET(DF2_INTLV_NUM_CHAN, ctx->map.base); + + if (ctx->map.intlv_mode == 8) + ctx->map.intlv_mode = DF2_2CHAN_HASH; + + if (ctx->map.intlv_mode != NONE && + ctx->map.intlv_mode != NOHASH_2CHAN && + ctx->map.intlv_mode != DF2_2CHAN_HASH) + return -EINVAL; + + return 0; +} + +static int df3_get_intlv_mode(struct addr_ctx *ctx) +{ + ctx->map.intlv_mode = FIELD_GET(DF3_INTLV_NUM_CHAN, ctx->map.base); + return 0; +} + +static int df3p5_get_intlv_mode(struct addr_ctx *ctx) +{ + ctx->map.intlv_mode = FIELD_GET(DF3p5_INTLV_NUM_CHAN, ctx->map.base); + + if (ctx->map.intlv_mode == DF3_6CHAN) + return -EINVAL; + + return 0; +} + +static int df4_get_intlv_mode(struct addr_ctx *ctx) +{ + ctx->map.intlv_mode = FIELD_GET(DF4_INTLV_NUM_CHAN, ctx->map.intlv); + + if (ctx->map.intlv_mode == DF3_COD4_2CHAN_HASH || + ctx->map.intlv_mode == DF3_COD2_4CHAN_HASH || + ctx->map.intlv_mode == DF3_COD1_8CHAN_HASH || + ctx->map.intlv_mode == DF3_6CHAN) + return -EINVAL; + + return 0; +} + +static int df4p5_get_intlv_mode(struct addr_ctx *ctx) +{ + ctx->map.intlv_mode = FIELD_GET(DF4p5_INTLV_NUM_CHAN, ctx->map.intlv); + + if (ctx->map.intlv_mode <= NOHASH_32CHAN) + return 0; + + /* + * Modes matching the ranges above are returned as-is. + * + * All other modes are "fixed up" by adding 20h to make a unique value. + */ + ctx->map.intlv_mode += 0x20; + + return 0; +} + +static int get_intlv_mode(struct addr_ctx *ctx) +{ + int ret; + + switch (df_cfg.rev) { + case DF2: + ret = df2_get_intlv_mode(ctx); + break; + case DF3: + ret = df3_get_intlv_mode(ctx); + break; + case DF3p5: + ret = df3p5_get_intlv_mode(ctx); + break; + case DF4: + ret = df4_get_intlv_mode(ctx); + break; + case DF4p5: + ret = df4p5_get_intlv_mode(ctx); + break; + default: + ret = -EINVAL; + } + + if (ret) + atl_debug_on_bad_df_rev(); + + return ret; +} + +static u64 get_hi_addr_offset(u32 reg_dram_offset) +{ + u8 shift = DF_DRAM_BASE_LIMIT_LSB; + u64 hi_addr_offset; + + switch (df_cfg.rev) { + case DF2: + hi_addr_offset = FIELD_GET(DF2_HI_ADDR_OFFSET, reg_dram_offset); + break; + case DF3: + case DF3p5: + hi_addr_offset = FIELD_GET(DF3_HI_ADDR_OFFSET, reg_dram_offset); + break; + case DF4: + case DF4p5: + hi_addr_offset = FIELD_GET(DF4_HI_ADDR_OFFSET, reg_dram_offset); + break; + default: + hi_addr_offset = 0; + atl_debug_on_bad_df_rev(); + } + + return hi_addr_offset << shift; +} + +/* + * Returns: 0 if offset is disabled. + * 1 if offset is enabled. + * -EINVAL on error. + */ +static int get_dram_offset(struct addr_ctx *ctx, u64 *norm_offset) +{ + u32 reg_dram_offset; + u8 map_num; + + /* Should not be called for map 0. */ + if (!ctx->map.num) { + atl_debug(ctx, "Trying to find DRAM offset for map 0"); + return -EINVAL; + } + + /* + * DramOffset registers don't exist for map 0, so the base register + * actually refers to map 1. + * Adjust the map_num for the register offsets. + */ + map_num = ctx->map.num - 1; + + if (df_cfg.rev >= DF4) { + /* Read D18F7x140 (DramOffset) */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x140 + (4 * map_num), + ctx->inst_id, ®_dram_offset)) + return -EINVAL; + + } else { + /* Read D18F0x1B4 (DramOffset) */ + if (df_indirect_read_instance(ctx->node_id, 0, 0x1B4 + (4 * map_num), + ctx->inst_id, ®_dram_offset)) + return -EINVAL; + } + + if (!FIELD_GET(DF_HI_ADDR_OFFSET_EN, reg_dram_offset)) + return 0; + + *norm_offset = get_hi_addr_offset(reg_dram_offset); + + return 1; +} + +static int df3_6ch_get_dram_addr_map(struct addr_ctx *ctx) +{ + u16 dst_fabric_id = FIELD_GET(DF3_DST_FABRIC_ID, ctx->map.limit); + u8 i, j, shift = 4, mask = 0xF; + u32 reg, offset = 0x60; + u16 dst_node_id; + + /* Get Socket 1 register. */ + if (dst_fabric_id & df_cfg.socket_id_mask) + offset = 0x68; + + /* Read D18F0x06{0,8} (DF::Skt0CsTargetRemap0)/(DF::Skt0CsTargetRemap1) */ + if (df_indirect_read_broadcast(ctx->node_id, 0, offset, ®)) + return -EINVAL; + + /* Save 8 remap entries. */ + for (i = 0, j = 0; i < 8; i++, j++) + ctx->map.remap_array[i] = (reg >> (j * shift)) & mask; + + dst_node_id = dst_fabric_id & df_cfg.node_id_mask; + dst_node_id >>= df_cfg.node_id_shift; + + /* Read D18F2x090 (DF::Np2ChannelConfig) */ + if (df_indirect_read_broadcast(dst_node_id, 2, 0x90, ®)) + return -EINVAL; + + ctx->map.np2_bits = FIELD_GET(DF_LOG2_ADDR_64K_SPACE0, reg); + return 0; +} + +static int df2_get_dram_addr_map(struct addr_ctx *ctx) +{ + /* Read D18F0x110 (DramBaseAddress). */ + if (df_indirect_read_instance(ctx->node_id, 0, 0x110 + (8 * ctx->map.num), + ctx->inst_id, &ctx->map.base)) + return -EINVAL; + + /* Read D18F0x114 (DramLimitAddress). */ + if (df_indirect_read_instance(ctx->node_id, 0, 0x114 + (8 * ctx->map.num), + ctx->inst_id, &ctx->map.limit)) + return -EINVAL; + + return 0; +} + +static int df3_get_dram_addr_map(struct addr_ctx *ctx) +{ + if (df2_get_dram_addr_map(ctx)) + return -EINVAL; + + /* Read D18F0x3F8 (DfGlobalCtl). */ + if (df_indirect_read_instance(ctx->node_id, 0, 0x3F8, + ctx->inst_id, &ctx->map.ctl)) + return -EINVAL; + + return 0; +} + +static int df4_get_dram_addr_map(struct addr_ctx *ctx) +{ + u8 remap_sel, i, j, shift = 4, mask = 0xF; + u32 remap_reg; + + /* Read D18F7xE00 (DramBaseAddress). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0xE00 + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.base)) + return -EINVAL; + + /* Read D18F7xE04 (DramLimitAddress). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0xE04 + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.limit)) + return -EINVAL; + + /* Read D18F7xE08 (DramAddressCtl). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0xE08 + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.ctl)) + return -EINVAL; + + /* Read D18F7xE0C (DramAddressIntlv). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0xE0C + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.intlv)) + return -EINVAL; + + /* Check if Remap Enable bit is valid. */ + if (!FIELD_GET(DF4_REMAP_EN, ctx->map.ctl)) + return 0; + + /* Fill with bogus values, because '0' is a valid value. */ + memset(&ctx->map.remap_array, 0xFF, sizeof(ctx->map.remap_array)); + + /* Get Remap registers. */ + remap_sel = FIELD_GET(DF4_REMAP_SEL, ctx->map.ctl); + + /* Read D18F7x180 (CsTargetRemap0A). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x180 + (8 * remap_sel), + ctx->inst_id, &remap_reg)) + return -EINVAL; + + /* Save first 8 remap entries. */ + for (i = 0, j = 0; i < 8; i++, j++) + ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask; + + /* Read D18F7x184 (CsTargetRemap0B). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x184 + (8 * remap_sel), + ctx->inst_id, &remap_reg)) + return -EINVAL; + + /* Save next 8 remap entries. */ + for (i = 8, j = 0; i < 16; i++, j++) + ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask; + + return 0; +} + +static int df4p5_get_dram_addr_map(struct addr_ctx *ctx) +{ + u8 remap_sel, i, j, shift = 5, mask = 0x1F; + u32 remap_reg; + + /* Read D18F7x200 (DramBaseAddress). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x200 + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.base)) + return -EINVAL; + + /* Read D18F7x204 (DramLimitAddress). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x204 + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.limit)) + return -EINVAL; + + /* Read D18F7x208 (DramAddressCtl). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x208 + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.ctl)) + return -EINVAL; + + /* Read D18F7x20C (DramAddressIntlv). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x20C + (16 * ctx->map.num), + ctx->inst_id, &ctx->map.intlv)) + return -EINVAL; + + /* Check if Remap Enable bit is valid. */ + if (!FIELD_GET(DF4_REMAP_EN, ctx->map.ctl)) + return 0; + + /* Fill with bogus values, because '0' is a valid value. */ + memset(&ctx->map.remap_array, 0xFF, sizeof(ctx->map.remap_array)); + + /* Get Remap registers. */ + remap_sel = FIELD_GET(DF4p5_REMAP_SEL, ctx->map.ctl); + + /* Read D18F7x180 (CsTargetRemap0A). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x180 + (24 * remap_sel), + ctx->inst_id, &remap_reg)) + return -EINVAL; + + /* Save first 6 remap entries. */ + for (i = 0, j = 0; i < 6; i++, j++) + ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask; + + /* Read D18F7x184 (CsTargetRemap0B). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x184 + (24 * remap_sel), + ctx->inst_id, &remap_reg)) + return -EINVAL; + + /* Save next 6 remap entries. */ + for (i = 6, j = 0; i < 12; i++, j++) + ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask; + + /* Read D18F7x188 (CsTargetRemap0C). */ + if (df_indirect_read_instance(ctx->node_id, 7, 0x188 + (24 * remap_sel), + ctx->inst_id, &remap_reg)) + return -EINVAL; + + /* Save next 6 remap entries. */ + for (i = 12, j = 0; i < 18; i++, j++) + ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask; + + return 0; +} + +static int get_dram_addr_map(struct addr_ctx *ctx) +{ + switch (df_cfg.rev) { + case DF2: return df2_get_dram_addr_map(ctx); + case DF3: + case DF3p5: return df3_get_dram_addr_map(ctx); + case DF4: return df4_get_dram_addr_map(ctx); + case DF4p5: return df4p5_get_dram_addr_map(ctx); + default: + atl_debug_on_bad_df_rev(); + return -EINVAL; + } +} + +static int get_coh_st_fabric_id(struct addr_ctx *ctx) +{ + u32 reg; + + /* Read D18F0x50 (FabricBlockInstanceInformation3). */ + if (df_indirect_read_instance(ctx->node_id, 0, 0x50, ctx->inst_id, ®)) + return -EINVAL; + + if (df_cfg.rev < DF4p5) + ctx->coh_st_fabric_id = FIELD_GET(DF2_COH_ST_FABRIC_ID, reg); + else + ctx->coh_st_fabric_id = FIELD_GET(DF4p5_COH_ST_FABRIC_ID, reg); + + return 0; +} + +static int find_normalized_offset(struct addr_ctx *ctx, u64 *norm_offset) +{ + u64 last_offset = 0; + int ret; + + for (ctx->map.num = 1; ctx->map.num < df_cfg.num_coh_st_maps; ctx->map.num++) { + ret = get_dram_offset(ctx, norm_offset); + if (ret < 0) + return ret; + + /* Continue search if this map's offset is not enabled. */ + if (!ret) + continue; + + /* Enabled offsets should never be 0. */ + if (*norm_offset == 0) { + atl_debug(ctx, "Enabled map %u offset is 0", ctx->map.num); + return -EINVAL; + } + + /* Offsets should always increase from one map to the next. */ + if (*norm_offset <= last_offset) { + atl_debug(ctx, "Map %u offset (0x%016llx) <= previous (0x%016llx)", + ctx->map.num, *norm_offset, last_offset); + return -EINVAL; + } + + /* Match if this map's offset is less than the current calculated address. */ + if (ctx->ret_addr >= *norm_offset) + break; + + last_offset = *norm_offset; + } + + /* + * Finished search without finding a match. + * Reset to map 0 and no offset. + */ + if (ctx->map.num >= df_cfg.num_coh_st_maps) { + ctx->map.num = 0; + *norm_offset = 0; + } + + return 0; +} + +static bool valid_map(struct addr_ctx *ctx) +{ + if (df_cfg.rev >= DF4) + return FIELD_GET(DF_ADDR_RANGE_VAL, ctx->map.ctl); + else + return FIELD_GET(DF_ADDR_RANGE_VAL, ctx->map.base); +} + +static int get_address_map_common(struct addr_ctx *ctx) +{ + u64 norm_offset = 0; + + if (get_coh_st_fabric_id(ctx)) + return -EINVAL; + + if (find_normalized_offset(ctx, &norm_offset)) + return -EINVAL; + + if (get_dram_addr_map(ctx)) + return -EINVAL; + + if (!valid_map(ctx)) + return -EINVAL; + + ctx->ret_addr -= norm_offset; + + return 0; +} + +static u8 get_num_intlv_chan(struct addr_ctx *ctx) +{ + switch (ctx->map.intlv_mode) { + case NONE: + return 1; + case NOHASH_2CHAN: + case DF2_2CHAN_HASH: + case DF3_COD4_2CHAN_HASH: + case DF4_NPS4_2CHAN_HASH: + case DF4p5_NPS4_2CHAN_1K_HASH: + case DF4p5_NPS4_2CHAN_2K_HASH: + return 2; + case DF4_NPS4_3CHAN_HASH: + case DF4p5_NPS4_3CHAN_1K_HASH: + case DF4p5_NPS4_3CHAN_2K_HASH: + return 3; + case NOHASH_4CHAN: + case DF3_COD2_4CHAN_HASH: + case DF4_NPS2_4CHAN_HASH: + case DF4p5_NPS2_4CHAN_1K_HASH: + case DF4p5_NPS2_4CHAN_2K_HASH: + return 4; + case DF4_NPS2_5CHAN_HASH: + case DF4p5_NPS2_5CHAN_1K_HASH: + case DF4p5_NPS2_5CHAN_2K_HASH: + return 5; + case DF3_6CHAN: + case DF4_NPS2_6CHAN_HASH: + case DF4p5_NPS2_6CHAN_1K_HASH: + case DF4p5_NPS2_6CHAN_2K_HASH: + return 6; + case NOHASH_8CHAN: + case DF3_COD1_8CHAN_HASH: + case DF4_NPS1_8CHAN_HASH: + case DF4p5_NPS1_8CHAN_1K_HASH: + case DF4p5_NPS1_8CHAN_2K_HASH: + return 8; + case DF4_NPS1_10CHAN_HASH: + case DF4p5_NPS1_10CHAN_1K_HASH: + case DF4p5_NPS1_10CHAN_2K_HASH: + return 10; + case DF4_NPS1_12CHAN_HASH: + case DF4p5_NPS1_12CHAN_1K_HASH: + case DF4p5_NPS1_12CHAN_2K_HASH: + return 12; + case NOHASH_16CHAN: + case DF4p5_NPS1_16CHAN_1K_HASH: + case DF4p5_NPS1_16CHAN_2K_HASH: + return 16; + case DF4p5_NPS0_24CHAN_1K_HASH: + case DF4p5_NPS0_24CHAN_2K_HASH: + return 24; + case NOHASH_32CHAN: + return 32; + default: + atl_debug_on_bad_intlv_mode(ctx); + return 0; + } +} + +static void calculate_intlv_bits(struct addr_ctx *ctx) +{ + ctx->map.num_intlv_chan = get_num_intlv_chan(ctx); + + ctx->map.total_intlv_chan = ctx->map.num_intlv_chan; + ctx->map.total_intlv_chan *= ctx->map.num_intlv_dies; + ctx->map.total_intlv_chan *= ctx->map.num_intlv_sockets; + + /* + * Get the number of bits needed to cover this many channels. + * order_base_2() rounds up automatically. + */ + ctx->map.total_intlv_bits = order_base_2(ctx->map.total_intlv_chan); +} + +static u8 get_intlv_bit_pos(struct addr_ctx *ctx) +{ + u8 addr_sel = 0; + + switch (df_cfg.rev) { + case DF2: + addr_sel = FIELD_GET(DF2_INTLV_ADDR_SEL, ctx->map.base); + break; + case DF3: + case DF3p5: + addr_sel = FIELD_GET(DF3_INTLV_ADDR_SEL, ctx->map.base); + break; + case DF4: + case DF4p5: + addr_sel = FIELD_GET(DF4_INTLV_ADDR_SEL, ctx->map.intlv); + break; + default: + atl_debug_on_bad_df_rev(); + break; + } + + /* Add '8' to get the 'interleave bit position'. */ + return addr_sel + 8; +} + +static u8 get_num_intlv_dies(struct addr_ctx *ctx) +{ + u8 dies = 0; + + switch (df_cfg.rev) { + case DF2: + dies = FIELD_GET(DF2_INTLV_NUM_DIES, ctx->map.limit); + break; + case DF3: + dies = FIELD_GET(DF3_INTLV_NUM_DIES, ctx->map.base); + break; + case DF3p5: + dies = FIELD_GET(DF3p5_INTLV_NUM_DIES, ctx->map.base); + break; + case DF4: + case DF4p5: + dies = FIELD_GET(DF4_INTLV_NUM_DIES, ctx->map.intlv); + break; + default: + atl_debug_on_bad_df_rev(); + break; + } + + /* Register value is log2, e.g. 0 -> 1 die, 1 -> 2 dies, etc. */ + return 1 << dies; +} + +static u8 get_num_intlv_sockets(struct addr_ctx *ctx) +{ + u8 sockets = 0; + + switch (df_cfg.rev) { + case DF2: + sockets = FIELD_GET(DF2_INTLV_NUM_SOCKETS, ctx->map.limit); + break; + case DF3: + case DF3p5: + sockets = FIELD_GET(DF2_INTLV_NUM_SOCKETS, ctx->map.base); + break; + case DF4: + case DF4p5: + sockets = FIELD_GET(DF4_INTLV_NUM_SOCKETS, ctx->map.intlv); + break; + default: + atl_debug_on_bad_df_rev(); + break; + } + + /* Register value is log2, e.g. 0 -> 1 sockets, 1 -> 2 sockets, etc. */ + return 1 << sockets; +} + +static int get_global_map_data(struct addr_ctx *ctx) +{ + if (get_intlv_mode(ctx)) + return -EINVAL; + + if (ctx->map.intlv_mode == DF3_6CHAN && + df3_6ch_get_dram_addr_map(ctx)) + return -EINVAL; + + ctx->map.intlv_bit_pos = get_intlv_bit_pos(ctx); + ctx->map.num_intlv_dies = get_num_intlv_dies(ctx); + ctx->map.num_intlv_sockets = get_num_intlv_sockets(ctx); + calculate_intlv_bits(ctx); + + return 0; +} + +static void dump_address_map(struct dram_addr_map *map) +{ + u8 i; + + pr_debug("intlv_mode=0x%x", map->intlv_mode); + pr_debug("num=0x%x", map->num); + pr_debug("base=0x%x", map->base); + pr_debug("limit=0x%x", map->limit); + pr_debug("ctl=0x%x", map->ctl); + pr_debug("intlv=0x%x", map->intlv); + + for (i = 0; i < MAX_COH_ST_CHANNELS; i++) + pr_debug("remap_array[%u]=0x%x", i, map->remap_array[i]); + + pr_debug("intlv_bit_pos=%u", map->intlv_bit_pos); + pr_debug("num_intlv_chan=%u", map->num_intlv_chan); + pr_debug("num_intlv_dies=%u", map->num_intlv_dies); + pr_debug("num_intlv_sockets=%u", map->num_intlv_sockets); + pr_debug("total_intlv_chan=%u", map->total_intlv_chan); + pr_debug("total_intlv_bits=%u", map->total_intlv_bits); +} + +int get_address_map(struct addr_ctx *ctx) +{ + int ret; + + ret = get_address_map_common(ctx); + if (ret) + return ret; + + ret = get_global_map_data(ctx); + if (ret) + return ret; + + dump_address_map(&ctx->map); + + return ret; +} diff --git a/drivers/ras/amd/atl/reg_fields.h b/drivers/ras/amd/atl/reg_fields.h new file mode 100644 index 000000000000..6aaa5093f42c --- /dev/null +++ b/drivers/ras/amd/atl/reg_fields.h @@ -0,0 +1,603 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD Address Translation Library + * + * reg_fields.h : Register field definitions + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +/* + * Notes on naming: + * 1) Use "DF_" prefix for fields that are the same for all revisions. + * 2) Use "DFx_" prefix for fields that differ between revisions. + * a) "x" is the first major revision where the new field appears. + * b) E.g., if DF2 and DF3 have the same field, then call it DF2. + * c) E.g., if DF3p5 and DF4 have the same field, then call it DF4. + */ + +/* + * Coherent Station Fabric ID + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x50 [Fabric Block Instance Information 3] + * DF2 BlockFabricId [19:8] + * DF3 BlockFabricId [19:8] + * DF3p5 BlockFabricId [19:8] + * DF4 BlockFabricId [19:8] + * DF4p5 BlockFabricId [15:8] + */ +#define DF2_COH_ST_FABRIC_ID GENMASK(19, 8) +#define DF4p5_COH_ST_FABRIC_ID GENMASK(15, 8) + +/* + * Component ID Mask + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * + * D18F1x208 [System Fabric ID Mask 0] + * DF3 ComponentIdMask [9:0] + * + * D18F1x150 [System Fabric ID Mask 0] + * DF3p5 ComponentIdMask [15:0] + * + * D18F4x1B0 [System Fabric ID Mask 0] + * DF4 ComponentIdMask [15:0] + * DF4p5 ComponentIdMask [15:0] + */ +#define DF3_COMPONENT_ID_MASK GENMASK(9, 0) +#define DF4_COMPONENT_ID_MASK GENMASK(15, 0) + +/* + * Destination Fabric ID + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x114 [DRAM Limit Address] + * DF2 DstFabricID [7:0] + * DF3 DstFabricID [9:0] + * DF3 DstFabricID [11:0] + * + * D18F7xE08 [DRAM Address Control] + * DF4 DstFabricID [27:16] + * + * D18F7x208 [DRAM Address Control] + * DF4p5 DstFabricID [23:16] + */ +#define DF2_DST_FABRIC_ID GENMASK(7, 0) +#define DF3_DST_FABRIC_ID GENMASK(9, 0) +#define DF3p5_DST_FABRIC_ID GENMASK(11, 0) +#define DF4_DST_FABRIC_ID GENMASK(27, 16) +#define DF4p5_DST_FABRIC_ID GENMASK(23, 16) + +/* + * Die ID Mask + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * D18F1x208 [System Fabric ID Mask] + * DF2 DieIdMask [15:8] + * + * D18F1x20C [System Fabric ID Mask 1] + * DF3 DieIdMask [18:16] + * + * D18F1x158 [System Fabric ID Mask 2] + * DF3p5 DieIdMask [15:0] + * + * D18F4x1B8 [System Fabric ID Mask 2] + * DF4 DieIdMask [15:0] + * DF4p5 DieIdMask [15:0] + */ +#define DF2_DIE_ID_MASK GENMASK(15, 8) +#define DF3_DIE_ID_MASK GENMASK(18, 16) +#define DF4_DIE_ID_MASK GENMASK(15, 0) + +/* + * Die ID Shift + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * D18F1x208 [System Fabric ID Mask] + * DF2 DieIdShift [27:24] + * + * DF3 N/A + * DF3p5 N/A + * DF4 N/A + * DF4p5 N/A + */ +#define DF2_DIE_ID_SHIFT GENMASK(27, 24) + +/* + * DRAM Address Range Valid + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x110 [DRAM Base Address] + * DF2 AddrRngVal [0] + * DF3 AddrRngVal [0] + * DF3p5 AddrRngVal [0] + * + * D18F7xE08 [DRAM Address Control] + * DF4 AddrRngVal [0] + * + * D18F7x208 [DRAM Address Control] + * DF4p5 AddrRngVal [0] + */ +#define DF_ADDR_RANGE_VAL BIT(0) + +/* + * DRAM Base Address + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x110 [DRAM Base Address] + * DF2 DramBaseAddr [31:12] + * DF3 DramBaseAddr [31:12] + * DF3p5 DramBaseAddr [31:12] + * + * D18F7xE00 [DRAM Base Address] + * DF4 DramBaseAddr [27:0] + * + * D18F7x200 [DRAM Base Address] + * DF4p5 DramBaseAddr [27:0] + */ +#define DF2_BASE_ADDR GENMASK(31, 12) +#define DF4_BASE_ADDR GENMASK(27, 0) + +/* + * DRAM Hole Base + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * D18F0x104 [DRAM Hole Control] + * DF2 DramHoleBase [31:24] + * DF3 DramHoleBase [31:24] + * DF3p5 DramHoleBase [31:24] + * + * D18F7x104 [DRAM Hole Control] + * DF4 DramHoleBase [31:24] + * DF4p5 DramHoleBase [31:24] + */ +#define DF_DRAM_HOLE_BASE_MASK GENMASK(31, 24) + +/* + * DRAM Limit Address + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x114 [DRAM Limit Address] + * DF2 DramLimitAddr [31:12] + * DF3 DramLimitAddr [31:12] + * DF3p5 DramLimitAddr [31:12] + * + * D18F7xE04 [DRAM Limit Address] + * DF4 DramLimitAddr [27:0] + * + * D18F7x204 [DRAM Limit Address] + * DF4p5 DramLimitAddr [27:0] + */ +#define DF2_DRAM_LIMIT_ADDR GENMASK(31, 12) +#define DF4_DRAM_LIMIT_ADDR GENMASK(27, 0) + +/* + * Hash Interleave Controls + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * + * D18F0x3F8 [DF Global Control] + * DF3 GlbHashIntlvCtl64K [20] + * GlbHashIntlvCtl2M [21] + * GlbHashIntlvCtl1G [22] + * + * DF3p5 GlbHashIntlvCtl64K [20] + * GlbHashIntlvCtl2M [21] + * GlbHashIntlvCtl1G [22] + * + * D18F7xE08 [DRAM Address Control] + * DF4 HashIntlvCtl64K [8] + * HashIntlvCtl2M [9] + * HashIntlvCtl1G [10] + * + * D18F7x208 [DRAM Address Control] + * DF4p5 HashIntlvCtl4K [7] + * HashIntlvCtl64K [8] + * HashIntlvCtl2M [9] + * HashIntlvCtl1G [10] + * HashIntlvCtl1T [15] + */ +#define DF3_HASH_CTL_64K BIT(20) +#define DF3_HASH_CTL_2M BIT(21) +#define DF3_HASH_CTL_1G BIT(22) +#define DF4_HASH_CTL_4K BIT(7) +#define DF4_HASH_CTL_64K BIT(8) +#define DF4_HASH_CTL_2M BIT(9) +#define DF4_HASH_CTL_1G BIT(10) +#define DF4_HASH_CTL_1T BIT(15) + +/* + * High Address Offset + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x1B4 [DRAM Offset] + * DF2 HiAddrOffset [31:20] + * DF3 HiAddrOffset [31:12] + * DF3p5 HiAddrOffset [31:12] + * + * D18F7x140 [DRAM Offset] + * DF4 HiAddrOffset [24:1] + * DF4p5 HiAddrOffset [24:1] + */ +#define DF2_HI_ADDR_OFFSET GENMASK(31, 20) +#define DF3_HI_ADDR_OFFSET GENMASK(31, 12) +#define DF4_HI_ADDR_OFFSET GENMASK(24, 1) + +/* + * High Address Offset Enable + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x1B4 [DRAM Offset] + * DF2 HiAddrOffsetEn [0] + * DF3 HiAddrOffsetEn [0] + * DF3p5 HiAddrOffsetEn [0] + * + * D18F7x140 [DRAM Offset] + * DF4 HiAddrOffsetEn [0] + * DF4p5 HiAddrOffsetEn [0] + */ +#define DF_HI_ADDR_OFFSET_EN BIT(0) + +/* + * Interleave Address Select + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x110 [DRAM Base Address] + * DF2 IntLvAddrSel [10:8] + * DF3 IntLvAddrSel [11:9] + * DF3p5 IntLvAddrSel [11:9] + * + * D18F7xE0C [DRAM Address Interleave] + * DF4 IntLvAddrSel [2:0] + * + * D18F7x20C [DRAM Address Interleave] + * DF4p5 IntLvAddrSel [2:0] + */ +#define DF2_INTLV_ADDR_SEL GENMASK(10, 8) +#define DF3_INTLV_ADDR_SEL GENMASK(11, 9) +#define DF4_INTLV_ADDR_SEL GENMASK(2, 0) + +/* + * Interleave Number of Channels + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x110 [DRAM Base Address] + * DF2 IntLvNumChan [7:4] + * DF3 IntLvNumChan [5:2] + * DF3p5 IntLvNumChan [6:2] + * + * D18F7xE0C [DRAM Address Interleave] + * DF4 IntLvNumChan [8:4] + * + * D18F7x20C [DRAM Address Interleave] + * DF4p5 IntLvNumChan [9:4] + */ +#define DF2_INTLV_NUM_CHAN GENMASK(7, 4) +#define DF3_INTLV_NUM_CHAN GENMASK(5, 2) +#define DF3p5_INTLV_NUM_CHAN GENMASK(6, 2) +#define DF4_INTLV_NUM_CHAN GENMASK(8, 4) +#define DF4p5_INTLV_NUM_CHAN GENMASK(9, 4) + +/* + * Interleave Number of Dies + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x114 [DRAM Limit Address] + * DF2 IntLvNumDies [11:10] + * + * D18F0x110 [DRAM Base Address] + * DF3 IntLvNumDies [7:6] + * DF3p5 IntLvNumDies [7] + * + * D18F7xE0C [DRAM Address Interleave] + * DF4 IntLvNumDies [13:12] + * + * D18F7x20C [DRAM Address Interleave] + * DF4p5 IntLvNumDies [13:12] + */ +#define DF2_INTLV_NUM_DIES GENMASK(11, 10) +#define DF3_INTLV_NUM_DIES GENMASK(7, 6) +#define DF3p5_INTLV_NUM_DIES BIT(7) +#define DF4_INTLV_NUM_DIES GENMASK(13, 12) + +/* + * Interleave Number of Sockets + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x114 [DRAM Limit Address] + * DF2 IntLvNumSockets [8] + * + * D18F0x110 [DRAM Base Address] + * DF3 IntLvNumSockets [8] + * DF3p5 IntLvNumSockets [8] + * + * D18F7xE0C [DRAM Address Interleave] + * DF4 IntLvNumSockets [18] + * + * D18F7x20C [DRAM Address Interleave] + * DF4p5 IntLvNumSockets [18] + */ +#define DF2_INTLV_NUM_SOCKETS BIT(8) +#define DF4_INTLV_NUM_SOCKETS BIT(18) + +/* + * Legacy MMIO Hole Enable + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * D18F0x110 [DRAM Base Address] + * DF2 LgcyMmioHoleEn [1] + * DF3 LgcyMmioHoleEn [1] + * DF3p5 LgcyMmioHoleEn [1] + * + * D18F7xE08 [DRAM Address Control] + * DF4 LgcyMmioHoleEn [1] + * + * D18F7x208 [DRAM Address Control] + * DF4p5 LgcyMmioHoleEn [1] + */ +#define DF_LEGACY_MMIO_HOLE_EN BIT(1) + +/* + * Log2 Address 64K Space 0 + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * + * D18F2x90 [Non-power-of-2 channel Configuration Register for COH_ST DRAM Address Maps] + * DF3 Log2Addr64KSpace0 [5:0] + * + * DF3p5 N/A + * DF4 N/A + * DF4p5 N/A + */ +#define DF_LOG2_ADDR_64K_SPACE0 GENMASK(5, 0) + +/* + * Major Revision + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * DF3 N/A + * DF3p5 N/A + * + * D18F0x040 [Fabric Block Instance Count] + * DF4 MajorRevision [27:24] + * DF4p5 MajorRevision [27:24] + */ +#define DF_MAJOR_REVISION GENMASK(27, 24) + +/* + * Minor Revision + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * DF3 N/A + * DF3p5 N/A + * + * D18F0x040 [Fabric Block Instance Count] + * DF4 MinorRevision [23:16] + * DF4p5 MinorRevision [23:16] + */ +#define DF_MINOR_REVISION GENMASK(23, 16) + +/* + * Node ID Mask + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * + * D18F1x208 [System Fabric ID Mask 0] + * DF3 NodeIdMask [25:16] + * + * D18F1x150 [System Fabric ID Mask 0] + * DF3p5 NodeIdMask [31:16] + * + * D18F4x1B0 [System Fabric ID Mask 0] + * DF4 NodeIdMask [31:16] + * DF4p5 NodeIdMask [31:16] + */ +#define DF3_NODE_ID_MASK GENMASK(25, 16) +#define DF4_NODE_ID_MASK GENMASK(31, 16) + +/* + * Node ID Shift + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * + * D18F1x20C [System Fabric ID Mask 1] + * DF3 NodeIdShift [3:0] + * + * D18F1x154 [System Fabric ID Mask 1] + * DF3p5 NodeIdShift [3:0] + * + * D18F4x1B4 [System Fabric ID Mask 1] + * DF4 NodeIdShift [3:0] + * DF4p5 NodeIdShift [3:0] + */ +#define DF3_NODE_ID_SHIFT GENMASK(3, 0) + +/* + * Remap Enable + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * DF3 N/A + * DF3p5 N/A + * + * D18F7xE08 [DRAM Address Control] + * DF4 RemapEn [4] + * + * D18F7x208 [DRAM Address Control] + * DF4p5 RemapEn [4] + */ +#define DF4_REMAP_EN BIT(4) + +/* + * Remap Select + * + * Access type: Instance + * + * Register + * Rev Fieldname Bits + * + * DF2 N/A + * DF3 N/A + * DF3p5 N/A + * + * D18F7xE08 [DRAM Address Control] + * DF4 RemapSel [7:5] + * + * D18F7x208 [DRAM Address Control] + * DF4p5 RemapSel [6:5] + */ +#define DF4_REMAP_SEL GENMASK(7, 5) +#define DF4p5_REMAP_SEL GENMASK(6, 5) + +/* + * Socket ID Mask + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * D18F1x208 [System Fabric ID Mask] + * DF2 SocketIdMask [23:16] + * + * D18F1x20C [System Fabric ID Mask 1] + * DF3 SocketIdMask [26:24] + * + * D18F1x158 [System Fabric ID Mask 2] + * DF3p5 SocketIdMask [31:16] + * + * D18F4x1B8 [System Fabric ID Mask 2] + * DF4 SocketIdMask [31:16] + * DF4p5 SocketIdMask [31:16] + */ +#define DF2_SOCKET_ID_MASK GENMASK(23, 16) +#define DF3_SOCKET_ID_MASK GENMASK(26, 24) +#define DF4_SOCKET_ID_MASK GENMASK(31, 16) + +/* + * Socket ID Shift + * + * Access type: Broadcast + * + * Register + * Rev Fieldname Bits + * + * D18F1x208 [System Fabric ID Mask] + * DF2 SocketIdShift [31:28] + * + * D18F1x20C [System Fabric ID Mask 1] + * DF3 SocketIdShift [9:8] + * + * D18F1x158 [System Fabric ID Mask 2] + * DF3p5 SocketIdShift [11:8] + * + * D18F4x1B4 [System Fabric ID Mask 1] + * DF4 SocketIdShift [11:8] + * DF4p5 SocketIdShift [11:8] + */ +#define DF2_SOCKET_ID_SHIFT GENMASK(31, 28) +#define DF3_SOCKET_ID_SHIFT GENMASK(9, 8) +#define DF4_SOCKET_ID_SHIFT GENMASK(11, 8) diff --git a/drivers/ras/amd/atl/system.c b/drivers/ras/amd/atl/system.c new file mode 100644 index 000000000000..af61f2f1d6de --- /dev/null +++ b/drivers/ras/amd/atl/system.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * system.c : Functions to read and save system-wide data + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include "internal.h" + +int determine_node_id(struct addr_ctx *ctx, u8 socket_id, u8 die_id) +{ + u16 socket_id_bits, die_id_bits; + + if (socket_id > 0 && df_cfg.socket_id_mask == 0) { + atl_debug(ctx, "Invalid socket inputs: socket_id=%u socket_id_mask=0x%x", + socket_id, df_cfg.socket_id_mask); + return -EINVAL; + } + + /* Do each step independently to avoid shift out-of-bounds issues. */ + socket_id_bits = socket_id; + socket_id_bits <<= df_cfg.socket_id_shift; + socket_id_bits &= df_cfg.socket_id_mask; + + if (die_id > 0 && df_cfg.die_id_mask == 0) { + atl_debug(ctx, "Invalid die inputs: die_id=%u die_id_mask=0x%x", + die_id, df_cfg.die_id_mask); + return -EINVAL; + } + + /* Do each step independently to avoid shift out-of-bounds issues. */ + die_id_bits = die_id; + die_id_bits <<= df_cfg.die_id_shift; + die_id_bits &= df_cfg.die_id_mask; + + ctx->node_id = (socket_id_bits | die_id_bits) >> df_cfg.node_id_shift; + return 0; +} + +static void df2_get_masks_shifts(u32 mask0) +{ + df_cfg.socket_id_shift = FIELD_GET(DF2_SOCKET_ID_SHIFT, mask0); + df_cfg.socket_id_mask = FIELD_GET(DF2_SOCKET_ID_MASK, mask0); + df_cfg.die_id_shift = FIELD_GET(DF2_DIE_ID_SHIFT, mask0); + df_cfg.die_id_mask = FIELD_GET(DF2_DIE_ID_MASK, mask0); + df_cfg.node_id_shift = df_cfg.die_id_shift; + df_cfg.node_id_mask = df_cfg.socket_id_mask | df_cfg.die_id_mask; + df_cfg.component_id_mask = ~df_cfg.node_id_mask; +} + +static void df3_get_masks_shifts(u32 mask0, u32 mask1) +{ + df_cfg.component_id_mask = FIELD_GET(DF3_COMPONENT_ID_MASK, mask0); + df_cfg.node_id_mask = FIELD_GET(DF3_NODE_ID_MASK, mask0); + + df_cfg.node_id_shift = FIELD_GET(DF3_NODE_ID_SHIFT, mask1); + df_cfg.socket_id_shift = FIELD_GET(DF3_SOCKET_ID_SHIFT, mask1); + df_cfg.socket_id_mask = FIELD_GET(DF3_SOCKET_ID_MASK, mask1); + df_cfg.die_id_mask = FIELD_GET(DF3_DIE_ID_MASK, mask1); +} + +static void df3p5_get_masks_shifts(u32 mask0, u32 mask1, u32 mask2) +{ + df_cfg.component_id_mask = FIELD_GET(DF4_COMPONENT_ID_MASK, mask0); + df_cfg.node_id_mask = FIELD_GET(DF4_NODE_ID_MASK, mask0); + + df_cfg.node_id_shift = FIELD_GET(DF3_NODE_ID_SHIFT, mask1); + df_cfg.socket_id_shift = FIELD_GET(DF4_SOCKET_ID_SHIFT, mask1); + + df_cfg.socket_id_mask = FIELD_GET(DF4_SOCKET_ID_MASK, mask2); + df_cfg.die_id_mask = FIELD_GET(DF4_DIE_ID_MASK, mask2); +} + +static void df4_get_masks_shifts(u32 mask0, u32 mask1, u32 mask2) +{ + df3p5_get_masks_shifts(mask0, mask1, mask2); + + if (!(df_cfg.flags.socket_id_shift_quirk && df_cfg.socket_id_shift == 1)) + return; + + df_cfg.socket_id_shift = 0; + df_cfg.socket_id_mask = 1; + df_cfg.die_id_shift = 0; + df_cfg.die_id_mask = 0; + df_cfg.node_id_shift = 8; + df_cfg.node_id_mask = 0x100; +} + +static int df4_get_fabric_id_mask_registers(void) +{ + u32 mask0, mask1, mask2; + + /* Read D18F4x1B0 (SystemFabricIdMask0) */ + if (df_indirect_read_broadcast(0, 4, 0x1B0, &mask0)) + return -EINVAL; + + /* Read D18F4x1B4 (SystemFabricIdMask1) */ + if (df_indirect_read_broadcast(0, 4, 0x1B4, &mask1)) + return -EINVAL; + + /* Read D18F4x1B8 (SystemFabricIdMask2) */ + if (df_indirect_read_broadcast(0, 4, 0x1B8, &mask2)) + return -EINVAL; + + df4_get_masks_shifts(mask0, mask1, mask2); + return 0; +} + +static int df4_determine_df_rev(u32 reg) +{ + df_cfg.rev = FIELD_GET(DF_MINOR_REVISION, reg) < 5 ? DF4 : DF4p5; + + /* Check for special cases or quirks based on Device/Vendor IDs.*/ + + /* Read D18F0x000 (DeviceVendorId0) */ + if (df_indirect_read_broadcast(0, 0, 0, ®)) + return -EINVAL; + + if (reg == DF_FUNC0_ID_ZEN4_SERVER) + df_cfg.flags.socket_id_shift_quirk = 1; + + return df4_get_fabric_id_mask_registers(); +} + +static int determine_df_rev_legacy(void) +{ + u32 fabric_id_mask0, fabric_id_mask1, fabric_id_mask2; + + /* + * Check for DF3.5. + * + * Component ID Mask must be non-zero. Register D18F1x150 is + * reserved pre-DF3.5, so value will be Read-as-Zero. + */ + + /* Read D18F1x150 (SystemFabricIdMask0). */ + if (df_indirect_read_broadcast(0, 1, 0x150, &fabric_id_mask0)) + return -EINVAL; + + if (FIELD_GET(DF4_COMPONENT_ID_MASK, fabric_id_mask0)) { + df_cfg.rev = DF3p5; + + /* Read D18F1x154 (SystemFabricIdMask1) */ + if (df_indirect_read_broadcast(0, 1, 0x154, &fabric_id_mask1)) + return -EINVAL; + + /* Read D18F1x158 (SystemFabricIdMask2) */ + if (df_indirect_read_broadcast(0, 1, 0x158, &fabric_id_mask2)) + return -EINVAL; + + df3p5_get_masks_shifts(fabric_id_mask0, fabric_id_mask1, fabric_id_mask2); + return 0; + } + + /* + * Check for DF3. + * + * Component ID Mask must be non-zero. Field is Read-as-Zero on DF2. + */ + + /* Read D18F1x208 (SystemFabricIdMask). */ + if (df_indirect_read_broadcast(0, 1, 0x208, &fabric_id_mask0)) + return -EINVAL; + + if (FIELD_GET(DF3_COMPONENT_ID_MASK, fabric_id_mask0)) { + df_cfg.rev = DF3; + + /* Read D18F1x20C (SystemFabricIdMask1) */ + if (df_indirect_read_broadcast(0, 1, 0x20C, &fabric_id_mask1)) + return -EINVAL; + + df3_get_masks_shifts(fabric_id_mask0, fabric_id_mask1); + return 0; + } + + /* Default to DF2. */ + df_cfg.rev = DF2; + df2_get_masks_shifts(fabric_id_mask0); + return 0; +} + +static int determine_df_rev(void) +{ + u32 reg; + u8 rev; + + if (df_cfg.rev != UNKNOWN) + return 0; + + /* Read D18F0x40 (FabricBlockInstanceCount). */ + if (df_indirect_read_broadcast(0, 0, 0x40, ®)) + return -EINVAL; + + /* + * Revision fields added for DF4 and later. + * + * Major revision of '0' is found pre-DF4. Field is Read-as-Zero. + */ + rev = FIELD_GET(DF_MAJOR_REVISION, reg); + if (!rev) + return determine_df_rev_legacy(); + + /* + * Fail out for major revisions other than '4'. + * + * Explicit support should be added for newer systems to avoid issues. + */ + if (rev == 4) + return df4_determine_df_rev(reg); + + return -EINVAL; +} + +static void get_num_maps(void) +{ + switch (df_cfg.rev) { + case DF2: + case DF3: + case DF3p5: + df_cfg.num_coh_st_maps = 2; + break; + case DF4: + case DF4p5: + df_cfg.num_coh_st_maps = 4; + break; + default: + atl_debug_on_bad_df_rev(); + } +} + +static void apply_node_id_shift(void) +{ + if (df_cfg.rev == DF2) + return; + + df_cfg.die_id_shift = df_cfg.node_id_shift; + df_cfg.die_id_mask <<= df_cfg.node_id_shift; + df_cfg.socket_id_mask <<= df_cfg.node_id_shift; + df_cfg.socket_id_shift += df_cfg.node_id_shift; +} + +static void dump_df_cfg(void) +{ + pr_debug("rev=0x%x", df_cfg.rev); + + pr_debug("component_id_mask=0x%x", df_cfg.component_id_mask); + pr_debug("die_id_mask=0x%x", df_cfg.die_id_mask); + pr_debug("node_id_mask=0x%x", df_cfg.node_id_mask); + pr_debug("socket_id_mask=0x%x", df_cfg.socket_id_mask); + + pr_debug("die_id_shift=0x%x", df_cfg.die_id_shift); + pr_debug("node_id_shift=0x%x", df_cfg.node_id_shift); + pr_debug("socket_id_shift=0x%x", df_cfg.socket_id_shift); + + pr_debug("num_coh_st_maps=%u", df_cfg.num_coh_st_maps); + + pr_debug("flags.legacy_ficaa=%u", df_cfg.flags.legacy_ficaa); + pr_debug("flags.socket_id_shift_quirk=%u", df_cfg.flags.socket_id_shift_quirk); +} + +int get_df_system_info(void) +{ + if (determine_df_rev()) { + pr_warn("amd_atl: Failed to determine DF Revision"); + df_cfg.rev = UNKNOWN; + return -EINVAL; + } + + apply_node_id_shift(); + + get_num_maps(); + + dump_df_cfg(); + + return 0; +} diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c new file mode 100644 index 000000000000..9d51e4954687 --- /dev/null +++ b/drivers/ras/amd/atl/umc.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * umc.c : Unified Memory Controller (UMC) topology helpers + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam + */ + +#include "internal.h" + +static u8 get_die_id(struct atl_err *err) +{ + /* + * For CPUs, this is the AMD Node ID modulo the number + * of AMD Nodes per socket. + */ + return topology_die_id(err->cpu) % amd_get_nodes_per_socket(); +} + +#define UMC_CHANNEL_NUM GENMASK(31, 20) +static u8 get_coh_st_inst_id(struct atl_err *err) +{ + return FIELD_GET(UMC_CHANNEL_NUM, err->ipid); +} + +unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err) +{ + u8 socket_id = topology_physical_package_id(err->cpu); + u8 coh_st_inst_id = get_coh_st_inst_id(err); + unsigned long addr = err->addr; + u8 die_id = get_die_id(err); + + pr_debug("socket_id=0x%x die_id=0x%x coh_st_inst_id=0x%x addr=0x%016lx", + socket_id, die_id, coh_st_inst_id, addr); + + return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr); +} diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index 95540ea8dd9d..a6e4792a1b2e 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -10,6 +10,37 @@ #include #include +#if IS_ENABLED(CONFIG_AMD_ATL) +/* + * Once set, this function pointer should never be unset. + * + * The library module will set this pointer if it successfully loads. The module + * should not be unloaded except for testing and debug purposes. + */ +static unsigned long (*amd_atl_umc_na_to_spa)(struct atl_err *err); + +void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *)) +{ + amd_atl_umc_na_to_spa = f; +} +EXPORT_SYMBOL_GPL(amd_atl_register_decoder); + +void amd_atl_unregister_decoder(void) +{ + amd_atl_umc_na_to_spa = NULL; +} +EXPORT_SYMBOL_GPL(amd_atl_unregister_decoder); + +unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) +{ + if (!amd_atl_umc_na_to_spa) + return -EINVAL; + + return amd_atl_umc_na_to_spa(err); +} +EXPORT_SYMBOL_GPL(amd_convert_umc_mca_addr_to_sys_addr); +#endif /* CONFIG_AMD_ATL */ + #define CREATE_TRACE_POINTS #define TRACE_INCLUDE_PATH ../../include/ras #include diff --git a/include/linux/ras.h b/include/linux/ras.h index 1f4048bf2674..09c632832bf1 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -25,6 +25,7 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, const char *fru_text, const u8 sev, const u8 *err, const u32 len); void log_arm_hw_error(struct cper_sec_proc_arm *err); + #else static inline void log_non_standard_event(const guid_t *sec_type, @@ -35,4 +36,19 @@ static inline void log_arm_hw_error(struct cper_sec_proc_arm *err) { return; } #endif +struct atl_err { + u64 addr; + u64 ipid; + u32 cpu; +}; + +#if IS_ENABLED(CONFIG_AMD_ATL) +void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *)); +void amd_atl_unregister_decoder(void); +unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err); +#else +static inline unsigned long +amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; } +#endif /* CONFIG_AMD_ATL */ + #endif /* __RAS_H__ */ -- cgit v1.2.3 From 6c9058f49084569d1d816e87185e0a4f9ab1a321 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 22 Jan 2024 22:14:00 -0600 Subject: EDAC/amd64: Use new AMD Address Translation Library Remove old address translation code and use the new AMD Address Translation Library. Use "imply" in Kconfig so that the "AMD_ATL" config option takes the value of "EDAC_AMD64" as its default. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240123041401.79812-3-yazen.ghannam@amd.com --- drivers/edac/Kconfig | 1 + drivers/edac/amd64_edac.c | 286 ++-------------------------------------------- 2 files changed, 10 insertions(+), 277 deletions(-) diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 5a7f3fabee22..16c8de5050e5 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -78,6 +78,7 @@ config EDAC_GHES config EDAC_AMD64 tristate "AMD64 (Opteron, Athlon64)" depends on AMD_NB && EDAC_DECODE_MCE + imply AMD_ATL help Support for error detection and correction of DRAM ECC errors on the AMD64 families (>= K8) of memory controllers. diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 537b9987a431..ca9a8641652d 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include "amd64_edac.h" #include @@ -1051,281 +1052,6 @@ static int fixup_node_id(int node_id, struct mce *m) return nid - gpu_node_map.base_node_id + 1; } -/* Protect the PCI config register pairs used for DF indirect access. */ -static DEFINE_MUTEX(df_indirect_mutex); - -/* - * Data Fabric Indirect Access uses FICAA/FICAD. - * - * Fabric Indirect Configuration Access Address (FICAA): Constructed based - * on the device's Instance Id and the PCI function and register offset of - * the desired register. - * - * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO - * and FICAD HI registers but so far we only need the LO register. - * - * Use Instance Id 0xFF to indicate a broadcast read. - */ -#define DF_BROADCAST 0xFF -static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) -{ - struct pci_dev *F4; - u32 ficaa; - int err = -ENODEV; - - if (node >= amd_nb_num()) - goto out; - - F4 = node_to_amd_nb(node)->link; - if (!F4) - goto out; - - ficaa = (instance_id == DF_BROADCAST) ? 0 : 1; - ficaa |= reg & 0x3FC; - ficaa |= (func & 0x7) << 11; - ficaa |= instance_id << 16; - - mutex_lock(&df_indirect_mutex); - - err = pci_write_config_dword(F4, 0x5C, ficaa); - if (err) { - pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); - goto out_unlock; - } - - err = pci_read_config_dword(F4, 0x98, lo); - if (err) - pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); - -out_unlock: - mutex_unlock(&df_indirect_mutex); - -out: - return err; -} - -static int df_indirect_read_instance(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) -{ - return __df_indirect_read(node, func, reg, instance_id, lo); -} - -static int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo) -{ - return __df_indirect_read(node, func, reg, DF_BROADCAST, lo); -} - -struct addr_ctx { - u64 ret_addr; - u32 tmp; - u16 nid; - u8 inst_id; -}; - -static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) -{ - u64 dram_base_addr, dram_limit_addr, dram_hole_base; - - u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; - u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; - u8 intlv_addr_sel, intlv_addr_bit; - u8 num_intlv_bits, hashed_bit; - u8 lgcy_mmio_hole_en, base = 0; - u8 cs_mask, cs_id = 0; - bool hash_enabled = false; - - struct addr_ctx ctx; - - memset(&ctx, 0, sizeof(ctx)); - - /* Start from the normalized address */ - ctx.ret_addr = norm_addr; - - ctx.nid = nid; - ctx.inst_id = umc; - - /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ - if (df_indirect_read_instance(nid, 0, 0x1B4, umc, &ctx.tmp)) - goto out_err; - - /* Remove HiAddrOffset from normalized address, if enabled: */ - if (ctx.tmp & BIT(0)) { - u64 hi_addr_offset = (ctx.tmp & GENMASK_ULL(31, 20)) << 8; - - if (norm_addr >= hi_addr_offset) { - ctx.ret_addr -= hi_addr_offset; - base = 1; - } - } - - /* Read D18F0x110 (DramBaseAddress). */ - if (df_indirect_read_instance(nid, 0, 0x110 + (8 * base), umc, &ctx.tmp)) - goto out_err; - - /* Check if address range is valid. */ - if (!(ctx.tmp & BIT(0))) { - pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", - __func__, ctx.tmp); - goto out_err; - } - - lgcy_mmio_hole_en = ctx.tmp & BIT(1); - intlv_num_chan = (ctx.tmp >> 4) & 0xF; - intlv_addr_sel = (ctx.tmp >> 8) & 0x7; - dram_base_addr = (ctx.tmp & GENMASK_ULL(31, 12)) << 16; - - /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ - if (intlv_addr_sel > 3) { - pr_err("%s: Invalid interleave address select %d.\n", - __func__, intlv_addr_sel); - goto out_err; - } - - /* Read D18F0x114 (DramLimitAddress). */ - if (df_indirect_read_instance(nid, 0, 0x114 + (8 * base), umc, &ctx.tmp)) - goto out_err; - - intlv_num_sockets = (ctx.tmp >> 8) & 0x1; - intlv_num_dies = (ctx.tmp >> 10) & 0x3; - dram_limit_addr = ((ctx.tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); - - intlv_addr_bit = intlv_addr_sel + 8; - - /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ - switch (intlv_num_chan) { - case 0: intlv_num_chan = 0; break; - case 1: intlv_num_chan = 1; break; - case 3: intlv_num_chan = 2; break; - case 5: intlv_num_chan = 3; break; - case 7: intlv_num_chan = 4; break; - - case 8: intlv_num_chan = 1; - hash_enabled = true; - break; - default: - pr_err("%s: Invalid number of interleaved channels %d.\n", - __func__, intlv_num_chan); - goto out_err; - } - - num_intlv_bits = intlv_num_chan; - - if (intlv_num_dies > 2) { - pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", - __func__, intlv_num_dies); - goto out_err; - } - - num_intlv_bits += intlv_num_dies; - - /* Add a bit if sockets are interleaved. */ - num_intlv_bits += intlv_num_sockets; - - /* Assert num_intlv_bits <= 4 */ - if (num_intlv_bits > 4) { - pr_err("%s: Invalid interleave bits %d.\n", - __func__, num_intlv_bits); - goto out_err; - } - - if (num_intlv_bits > 0) { - u64 temp_addr_x, temp_addr_i, temp_addr_y; - u8 die_id_bit, sock_id_bit, cs_fabric_id; - - /* - * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. - * This is the fabric id for this coherent slave. Use - * umc/channel# as instance id of the coherent slave - * for FICAA. - */ - if (df_indirect_read_instance(nid, 0, 0x50, umc, &ctx.tmp)) - goto out_err; - - cs_fabric_id = (ctx.tmp >> 8) & 0xFF; - die_id_bit = 0; - - /* If interleaved over more than 1 channel: */ - if (intlv_num_chan) { - die_id_bit = intlv_num_chan; - cs_mask = (1 << die_id_bit) - 1; - cs_id = cs_fabric_id & cs_mask; - } - - sock_id_bit = die_id_bit; - - /* Read D18F1x208 (SystemFabricIdMask). */ - if (intlv_num_dies || intlv_num_sockets) - if (df_indirect_read_broadcast(nid, 1, 0x208, &ctx.tmp)) - goto out_err; - - /* If interleaved over more than 1 die. */ - if (intlv_num_dies) { - sock_id_bit = die_id_bit + intlv_num_dies; - die_id_shift = (ctx.tmp >> 24) & 0xF; - die_id_mask = (ctx.tmp >> 8) & 0xFF; - - cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; - } - - /* If interleaved over more than 1 socket. */ - if (intlv_num_sockets) { - socket_id_shift = (ctx.tmp >> 28) & 0xF; - socket_id_mask = (ctx.tmp >> 16) & 0xFF; - - cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; - } - - /* - * The pre-interleaved address consists of XXXXXXIIIYYYYY - * where III is the ID for this CS, and XXXXXXYYYYY are the - * address bits from the post-interleaved address. - * "num_intlv_bits" has been calculated to tell us how many "I" - * bits there are. "intlv_addr_bit" tells us how many "Y" bits - * there are (where "I" starts). - */ - temp_addr_y = ctx.ret_addr & GENMASK_ULL(intlv_addr_bit - 1, 0); - temp_addr_i = (cs_id << intlv_addr_bit); - temp_addr_x = (ctx.ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; - ctx.ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; - } - - /* Add dram base address */ - ctx.ret_addr += dram_base_addr; - - /* If legacy MMIO hole enabled */ - if (lgcy_mmio_hole_en) { - if (df_indirect_read_broadcast(nid, 0, 0x104, &ctx.tmp)) - goto out_err; - - dram_hole_base = ctx.tmp & GENMASK(31, 24); - if (ctx.ret_addr >= dram_hole_base) - ctx.ret_addr += (BIT_ULL(32) - dram_hole_base); - } - - if (hash_enabled) { - /* Save some parentheses and grab ls-bit at the end. */ - hashed_bit = (ctx.ret_addr >> 12) ^ - (ctx.ret_addr >> 18) ^ - (ctx.ret_addr >> 21) ^ - (ctx.ret_addr >> 30) ^ - cs_id; - - hashed_bit &= BIT(0); - - if (hashed_bit != ((ctx.ret_addr >> intlv_addr_bit) & BIT(0))) - ctx.ret_addr ^= BIT(intlv_addr_bit); - } - - /* Is calculated system address is above DRAM limit address? */ - if (ctx.ret_addr > dram_limit_addr) - goto out_err; - - *sys_addr = ctx.ret_addr; - return 0; - -out_err: - return -EINVAL; -} - static int get_channel_from_ecc_syndrome(struct mem_ctl_info *, u16); /* @@ -3073,9 +2799,10 @@ static void decode_umc_error(int node_id, struct mce *m) { u8 ecc_type = (m->status >> 45) & 0x3; struct mem_ctl_info *mci; + unsigned long sys_addr; struct amd64_pvt *pvt; + struct atl_err a_err; struct err_info err; - u64 sys_addr; node_id = fixup_node_id(node_id, m); @@ -3106,7 +2833,12 @@ static void decode_umc_error(int node_id, struct mce *m) pvt->ops->get_err_info(m, &err); - if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) { + a_err.addr = m->addr; + a_err.ipid = m->ipid; + a_err.cpu = m->extcpu; + + sys_addr = amd_convert_umc_mca_addr_to_sys_addr(&a_err); + if (IS_ERR_VALUE(sys_addr)) { err.err_code = ERR_NORM_ADDR; goto log_error; } -- cgit v1.2.3 From 1289c431641f8beacc47db506210154dcea2492a Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 22 Jan 2024 22:14:01 -0600 Subject: Documentation: RAS: Add index and address translation section There are a lot of RAS topic to document, and there are a lot of details for each topic. Prep for this by adding an index for the RAS directory. This will provide a top-level document and table of contents. It also provides the option to build the RAS directory individually using "make SPHINXDIRS=". Start a section on address translation. This will be expanded with details for future translation methods and how they're used in the kernel. Move the error decoding topic to its own section. Links to other error decoding kernel docs will be added. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240123041401.79812-4-yazen.ghannam@amd.com --- Documentation/RAS/address-translation.rst | 24 ++++++++++++++++++++++++ Documentation/RAS/error-decoding.rst | 21 +++++++++++++++++++++ Documentation/RAS/index.rst | 14 ++++++++++++++ Documentation/RAS/ras.rst | 26 -------------------------- Documentation/index.rst | 2 +- MAINTAINERS | 1 + 6 files changed, 61 insertions(+), 27 deletions(-) create mode 100644 Documentation/RAS/address-translation.rst create mode 100644 Documentation/RAS/error-decoding.rst create mode 100644 Documentation/RAS/index.rst delete mode 100644 Documentation/RAS/ras.rst diff --git a/Documentation/RAS/address-translation.rst b/Documentation/RAS/address-translation.rst new file mode 100644 index 000000000000..f0ca17b43cd3 --- /dev/null +++ b/Documentation/RAS/address-translation.rst @@ -0,0 +1,24 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Address translation +=================== + +x86 AMD +------- + +Zen-based AMD systems include a Data Fabric that manages the layout of +physical memory. Devices attached to the Fabric, like memory controllers, +I/O, etc., may not have a complete view of the system physical memory map. +These devices may provide a "normalized", i.e. device physical, address +when reporting memory errors. Normalized addresses must be translated to +a system physical address for the kernel to action on the memory. + +AMD Address Translation Library (CONFIG_AMD_ATL) provides translation for +this case. + +Glossary of acronyms used in address translation for Zen-based systems + +* CCM = Cache Coherent Moderator +* COD = Cluster-on-Die +* COH_ST = Coherent Station +* DF = Data Fabric diff --git a/Documentation/RAS/error-decoding.rst b/Documentation/RAS/error-decoding.rst new file mode 100644 index 000000000000..26a72f3fe5de --- /dev/null +++ b/Documentation/RAS/error-decoding.rst @@ -0,0 +1,21 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Error decoding +============== + +x86 +--- + +Error decoding on AMD systems should be done using the rasdaemon tool: +https://github.com/mchehab/rasdaemon/ + +While the daemon is running, it would automatically log and decode +errors. If not, one can still decode such errors by supplying the +hardware information from the error:: + + $ rasdaemon -p --status --ipid --smca + +Also, the user can pass particular family and model to decode the error +string:: + + $ rasdaemon -p --status --ipid --smca --family --model --bank diff --git a/Documentation/RAS/index.rst b/Documentation/RAS/index.rst new file mode 100644 index 000000000000..2794c1816e90 --- /dev/null +++ b/Documentation/RAS/index.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================================================== +Reliability, Availability and Serviceability (RAS) features +=========================================================== + +This documents different aspects of the RAS functionality present in the +kernel. + +.. toctree:: + :maxdepth: 2 + + error-decoding + address-translation diff --git a/Documentation/RAS/ras.rst b/Documentation/RAS/ras.rst deleted file mode 100644 index 2556b397cd27..000000000000 --- a/Documentation/RAS/ras.rst +++ /dev/null @@ -1,26 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Reliability, Availability and Serviceability features -===================================================== - -This documents different aspects of the RAS functionality present in the -kernel. - -Error decoding ---------------- - -* x86 - -Error decoding on AMD systems should be done using the rasdaemon tool: -https://github.com/mchehab/rasdaemon/ - -While the daemon is running, it would automatically log and decode -errors. If not, one can still decode such errors by supplying the -hardware information from the error:: - - $ rasdaemon -p --status --ipid --smca - -Also, the user can pass particular family and model to decode the error -string:: - - $ rasdaemon -p --status --ipid --smca --family --model --bank diff --git a/Documentation/index.rst b/Documentation/index.rst index 36e61783437c..07f2aa07f0fa 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -113,7 +113,7 @@ to ReStructured Text format, or are simply too old. :maxdepth: 1 staging/index - RAS/ras + RAS/index Translations diff --git a/MAINTAINERS b/MAINTAINERS index 25537a37338e..5b945fd5a3b9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18359,6 +18359,7 @@ M: Tony Luck M: Borislav Petkov L: linux-edac@vger.kernel.org S: Maintained +F: Documentation/RAS/ F: Documentation/admin-guide/ras.rst F: drivers/ras/ F: include/linux/ras.h -- cgit v1.2.3 From 453f0ae797328e675840466c80e5b268d7feb9ba Mon Sep 17 00:00:00 2001 From: Muralidhara M K Date: Sun, 28 Jan 2024 09:59:50 -0600 Subject: RAS/AMD/ATL: Add MI300 support AMD MI300 systems include on-die HBM3 memory and a unique topology. And they fall under Data Fabric version 4.5 in overall design. Generally, topology information (IDs, etc.) is gathered from Data Fabric registers. However, the unique topology for MI300 means that some topology information is fixed in hardware and follows arbitrary mappings. Furthermore, not all hardware instances are software-visible, so register accesses must be adjusted. Recognize and add helper functions for the new MI300 interleave modes. Add lookup tables for fixed values where appropriate. Adjust how Die and Node IDs are found and used. Also, fix some register bitmasks that were mislabeled. Signed-off-by: Muralidhara M K Co-developed-by: Yazen Ghannam Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240128155950.1434067-1-yazen.ghannam@amd.com --- drivers/ras/amd/atl/access.c | 27 ++++++++++ drivers/ras/amd/atl/dehash.c | 95 ++++++++++++++++++++++++++++++++++- drivers/ras/amd/atl/denormalize.c | 102 ++++++++++++++++++++++++++++++++++++++ drivers/ras/amd/atl/internal.h | 10 +++- drivers/ras/amd/atl/map.c | 17 +++++++ drivers/ras/amd/atl/reg_fields.h | 9 ++-- drivers/ras/amd/atl/system.c | 3 ++ drivers/ras/amd/atl/umc.c | 51 +++++++++++++++++++ 8 files changed, 309 insertions(+), 5 deletions(-) diff --git a/drivers/ras/amd/atl/access.c b/drivers/ras/amd/atl/access.c index f6dd87bb2c35..ee4661ed28ba 100644 --- a/drivers/ras/amd/atl/access.c +++ b/drivers/ras/amd/atl/access.c @@ -36,6 +36,32 @@ static DEFINE_MUTEX(df_indirect_mutex); #define DF_FICAA_REG_NUM_LEGACY GENMASK(10, 2) +static u16 get_accessible_node(u16 node) +{ + /* + * On heterogeneous systems, not all AMD Nodes are accessible + * through software-visible registers. The Node ID needs to be + * adjusted for register accesses. But its value should not be + * changed for the translation methods. + */ + if (df_cfg.flags.heterogeneous) { + /* Only Node 0 is accessible on DF3.5 systems. */ + if (df_cfg.rev == DF3p5) + node = 0; + + /* + * Only the first Node in each Socket is accessible on + * DF4.5 systems, and this is visible to software as one + * Fabric per Socket. The Socket ID can be derived from + * the Node ID and global shift values. + */ + if (df_cfg.rev == DF4p5) + node >>= df_cfg.socket_id_shift - df_cfg.node_id_shift; + } + + return node; +} + static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) { u32 ficaa_addr = 0x8C, ficad_addr = 0xB8; @@ -43,6 +69,7 @@ static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *l int err = -ENODEV; u32 ficaa = 0; + node = get_accessible_node(node); if (node >= amd_nb_num()) goto out; diff --git a/drivers/ras/amd/atl/dehash.c b/drivers/ras/amd/atl/dehash.c index 6f414926e6fe..4ea46262c4f5 100644 --- a/drivers/ras/amd/atl/dehash.c +++ b/drivers/ras/amd/atl/dehash.c @@ -253,7 +253,7 @@ static int df4p5_dehash_addr(struct addr_ctx *ctx) hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl); hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl); hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl); - hash_ctl_1T = FIELD_GET(DF4_HASH_CTL_1T, ctx->map.ctl); + hash_ctl_1T = FIELD_GET(DF4p5_HASH_CTL_1T, ctx->map.ctl); /* * Generate a unique address to determine which bits @@ -343,6 +343,94 @@ static int df4p5_dehash_addr(struct addr_ctx *ctx) return 0; } +/* + * MI300 hash bits + * 4K 64K 2M 1G 1T 1T + * COH_ST_Select[0] = XOR of addr{8, 12, 15, 22, 29, 36, 43} + * COH_ST_Select[1] = XOR of addr{9, 13, 16, 23, 30, 37, 44} + * COH_ST_Select[2] = XOR of addr{10, 14, 17, 24, 31, 38, 45} + * COH_ST_Select[3] = XOR of addr{11, 18, 25, 32, 39, 46} + * COH_ST_Select[4] = XOR of addr{14, 19, 26, 33, 40, 47} aka Stack + * DieID[0] = XOR of addr{12, 20, 27, 34, 41 } + * DieID[1] = XOR of addr{13, 21, 28, 35, 42 } + */ +static int mi300_dehash_addr(struct addr_ctx *ctx) +{ + bool hash_ctl_4k, hash_ctl_64k, hash_ctl_2M, hash_ctl_1G, hash_ctl_1T; + bool hashed_bit, intlv_bit, test_bit; + u8 num_intlv_bits, base_bit, i; + + if (!map_bits_valid(ctx, 8, 8, 4, 1)) + return -EINVAL; + + hash_ctl_4k = FIELD_GET(DF4p5_HASH_CTL_4K, ctx->map.ctl); + hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl); + hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl); + hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl); + hash_ctl_1T = FIELD_GET(DF4p5_HASH_CTL_1T, ctx->map.ctl); + + /* Channel bits */ + num_intlv_bits = ilog2(ctx->map.num_intlv_chan); + + for (i = 0; i < num_intlv_bits; i++) { + base_bit = 8 + i; + + /* COH_ST_Select[4] jumps to a base bit of 14. */ + if (i == 4) + base_bit = 14; + + intlv_bit = BIT_ULL(base_bit) & ctx->ret_addr; + + hashed_bit = intlv_bit; + + /* 4k hash bit only applies to the first 3 bits. */ + if (i <= 2) { + test_bit = BIT_ULL(12 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_4k; + } + + /* Use temporary 'test_bit' value to avoid Sparse warnings. */ + test_bit = BIT_ULL(15 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_64k; + test_bit = BIT_ULL(22 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_2M; + test_bit = BIT_ULL(29 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_1G; + test_bit = BIT_ULL(36 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_1T; + test_bit = BIT_ULL(43 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(base_bit); + } + + /* Die bits */ + num_intlv_bits = ilog2(ctx->map.num_intlv_dies); + + for (i = 0; i < num_intlv_bits; i++) { + base_bit = 12 + i; + + intlv_bit = BIT_ULL(base_bit) & ctx->ret_addr; + + hashed_bit = intlv_bit; + + test_bit = BIT_ULL(20 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_64k; + test_bit = BIT_ULL(27 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_2M; + test_bit = BIT_ULL(34 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_1G; + test_bit = BIT_ULL(41 + i) & ctx->ret_addr; + hashed_bit ^= test_bit & hash_ctl_1T; + + if (hashed_bit != intlv_bit) + ctx->ret_addr ^= BIT_ULL(base_bit); + } + + return 0; +} + int dehash_address(struct addr_ctx *ctx) { switch (ctx->map.intlv_mode) { @@ -400,6 +488,11 @@ int dehash_address(struct addr_ctx *ctx) case DF4p5_NPS1_16CHAN_2K_HASH: return df4p5_dehash_addr(ctx); + case MI3_HASH_8CHAN: + case MI3_HASH_16CHAN: + case MI3_HASH_32CHAN: + return mi300_dehash_addr(ctx); + default: atl_debug_on_bad_intlv_mode(ctx); return -EINVAL; diff --git a/drivers/ras/amd/atl/denormalize.c b/drivers/ras/amd/atl/denormalize.c index 01f1d0fb6799..d5d0e1fda159 100644 --- a/drivers/ras/amd/atl/denormalize.c +++ b/drivers/ras/amd/atl/denormalize.c @@ -80,6 +80,40 @@ static u64 make_space_for_coh_st_id_split_2_1(struct addr_ctx *ctx) return expand_bits(12, ctx->map.total_intlv_bits - 1, denorm_addr); } +/* + * Make space for CS ID at bits [14:8] as follows: + * + * 8 channels -> bits [10:8] + * 16 channels -> bits [11:8] + * 32 channels -> bits [14,11:8] + * + * 1 die -> N/A + * 2 dies -> bit [12] + * 4 dies -> bits [13:12] + */ +static u64 make_space_for_coh_st_id_mi300(struct addr_ctx *ctx) +{ + u8 num_intlv_bits = ilog2(ctx->map.num_intlv_chan); + u64 denorm_addr; + + if (ctx->map.intlv_bit_pos != 8) { + pr_debug("Invalid interleave bit: %u", ctx->map.intlv_bit_pos); + return ~0ULL; + } + + /* Channel bits. Covers up to 4 bits at [11:8]. */ + denorm_addr = expand_bits(8, min(num_intlv_bits, 4), ctx->ret_addr); + + /* Die bits. Always starts at [12]. */ + denorm_addr = expand_bits(12, ilog2(ctx->map.num_intlv_dies), denorm_addr); + + /* Additional channel bit at [14]. */ + if (num_intlv_bits > 4) + denorm_addr = expand_bits(14, 1, denorm_addr); + + return denorm_addr; +} + /* * Take the current calculated address and shift enough bits in the middle * to make a gap where the interleave bits will be inserted. @@ -107,6 +141,12 @@ static u64 make_space_for_coh_st_id(struct addr_ctx *ctx) case DF4p5_NPS1_8CHAN_2K_HASH: case DF4p5_NPS1_16CHAN_2K_HASH: return make_space_for_coh_st_id_split_2_1(ctx); + + case MI3_HASH_8CHAN: + case MI3_HASH_16CHAN: + case MI3_HASH_32CHAN: + return make_space_for_coh_st_id_mi300(ctx); + default: atl_debug_on_bad_intlv_mode(ctx); return ~0ULL; @@ -204,6 +244,32 @@ static u16 get_coh_st_id_df4(struct addr_ctx *ctx) return coh_st_id; } +/* + * MI300 hash has: + * (C)hannel[3:0] = coh_st_id[3:0] + * (S)tack[0] = coh_st_id[4] + * (D)ie[1:0] = coh_st_id[6:5] + * + * Hashed coh_st_id is swizzled so that Stack bit is at the end. + * coh_st_id = SDDCCCC + */ +static u16 get_coh_st_id_mi300(struct addr_ctx *ctx) +{ + u8 channel_bits, die_bits, stack_bit; + u16 die_id; + + /* Subtract the "base" Destination Fabric ID. */ + ctx->coh_st_fabric_id -= get_dst_fabric_id(ctx); + + die_id = (ctx->coh_st_fabric_id & df_cfg.die_id_mask) >> df_cfg.die_id_shift; + + channel_bits = FIELD_GET(GENMASK(3, 0), ctx->coh_st_fabric_id); + stack_bit = FIELD_GET(BIT(4), ctx->coh_st_fabric_id) << 6; + die_bits = die_id << 4; + + return stack_bit | die_bits | channel_bits; +} + /* * Derive the correct Coherent Station ID that represents the interleave bits * used within the system physical address. This accounts for the @@ -237,6 +303,11 @@ static u16 calculate_coh_st_id(struct addr_ctx *ctx) case DF4p5_NPS1_16CHAN_2K_HASH: return get_coh_st_id_df4(ctx); + case MI3_HASH_8CHAN: + case MI3_HASH_16CHAN: + case MI3_HASH_32CHAN: + return get_coh_st_id_mi300(ctx); + /* COH_ST ID is simply the COH_ST Fabric ID adjusted by the Destination Fabric ID. */ case DF4p5_NPS2_4CHAN_1K_HASH: case DF4p5_NPS1_8CHAN_1K_HASH: @@ -287,6 +358,9 @@ static u64 insert_coh_st_id(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id case NOHASH_8CHAN: case NOHASH_16CHAN: case NOHASH_32CHAN: + case MI3_HASH_8CHAN: + case MI3_HASH_16CHAN: + case MI3_HASH_32CHAN: case DF2_2CHAN_HASH: return insert_coh_st_id_at_intlv_bit(ctx, denorm_addr, coh_st_id); @@ -314,6 +388,31 @@ static u64 insert_coh_st_id(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id } } +/* + * MI300 systems have a fixed, hardware-defined physical-to-logical + * Coherent Station mapping. The Remap registers are not used. + */ +static const u16 phy_to_log_coh_st_map_mi300[] = { + 12, 13, 14, 15, + 8, 9, 10, 11, + 4, 5, 6, 7, + 0, 1, 2, 3, + 28, 29, 30, 31, + 24, 25, 26, 27, + 20, 21, 22, 23, + 16, 17, 18, 19, +}; + +static u16 get_logical_coh_st_fabric_id_mi300(struct addr_ctx *ctx) +{ + if (ctx->inst_id >= sizeof(phy_to_log_coh_st_map_mi300)) { + atl_debug(ctx, "Instance ID out of range"); + return ~0; + } + + return phy_to_log_coh_st_map_mi300[ctx->inst_id] | (ctx->node_id << df_cfg.node_id_shift); +} + static u16 get_logical_coh_st_fabric_id(struct addr_ctx *ctx) { u16 component_id, log_fabric_id; @@ -321,6 +420,9 @@ static u16 get_logical_coh_st_fabric_id(struct addr_ctx *ctx) /* Start with the physical COH_ST Fabric ID. */ u16 phys_fabric_id = ctx->coh_st_fabric_id; + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + return get_logical_coh_st_fabric_id_mi300(ctx); + /* Skip logical ID lookup if remapping is disabled. */ if (!FIELD_GET(DF4_REMAP_EN, ctx->map.ctl) && ctx->map.intlv_mode != DF3_6CHAN) diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h index 13f1b6098c96..21d45755e5f2 100644 --- a/drivers/ras/amd/atl/internal.h +++ b/drivers/ras/amd/atl/internal.h @@ -27,8 +27,12 @@ /* PCI ID for Zen4 Server DF Function 0. */ #define DF_FUNC0_ID_ZEN4_SERVER 0x14AD1022 +/* PCI IDs for MI300 DF Function 0. */ +#define DF_FUNC0_ID_MI300 0x15281022 + /* Shift needed for adjusting register values to true values. */ #define DF_DRAM_BASE_LIMIT_LSB 28 +#define MI300_DRAM_LIMIT_LSB 20 enum df_revisions { UNKNOWN, @@ -59,6 +63,9 @@ enum intlv_modes { DF4_NPS1_12CHAN_HASH = 0x15, DF4_NPS2_5CHAN_HASH = 0x16, DF4_NPS1_10CHAN_HASH = 0x17, + MI3_HASH_8CHAN = 0x18, + MI3_HASH_16CHAN = 0x19, + MI3_HASH_32CHAN = 0x1A, DF2_2CHAN_HASH = 0x21, /* DF4.5 modes are all IntLvNumChan + 0x20 */ DF4p5_NPS1_16CHAN_1K_HASH = 0x2C, @@ -86,7 +93,8 @@ enum intlv_modes { struct df_flags { __u8 legacy_ficaa : 1, socket_id_shift_quirk : 1, - __reserved_0 : 6; + heterogeneous : 1, + __reserved_0 : 5; }; struct df_config { diff --git a/drivers/ras/amd/atl/map.c b/drivers/ras/amd/atl/map.c index 33f549b6255a..8b908e8d7495 100644 --- a/drivers/ras/amd/atl/map.c +++ b/drivers/ras/amd/atl/map.c @@ -63,6 +63,10 @@ static int df4p5_get_intlv_mode(struct addr_ctx *ctx) if (ctx->map.intlv_mode <= NOHASH_32CHAN) return 0; + if (ctx->map.intlv_mode >= MI3_HASH_8CHAN && + ctx->map.intlv_mode <= MI3_HASH_32CHAN) + return 0; + /* * Modes matching the ranges above are returned as-is. * @@ -125,6 +129,9 @@ static u64 get_hi_addr_offset(u32 reg_dram_offset) atl_debug_on_bad_df_rev(); } + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + shift = MI300_DRAM_LIMIT_LSB; + return hi_addr_offset << shift; } @@ -369,6 +376,13 @@ static int get_coh_st_fabric_id(struct addr_ctx *ctx) { u32 reg; + /* + * On MI300 systems, the Coherent Station Fabric ID is derived + * later. And it does not depend on the register value. + */ + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + return 0; + /* Read D18F0x50 (FabricBlockInstanceInformation3). */ if (df_indirect_read_instance(ctx->node_id, 0, 0x50, ctx->inst_id, ®)) return -EINVAL; @@ -490,6 +504,7 @@ static u8 get_num_intlv_chan(struct addr_ctx *ctx) case NOHASH_8CHAN: case DF3_COD1_8CHAN_HASH: case DF4_NPS1_8CHAN_HASH: + case MI3_HASH_8CHAN: case DF4p5_NPS1_8CHAN_1K_HASH: case DF4p5_NPS1_8CHAN_2K_HASH: return 8; @@ -502,6 +517,7 @@ static u8 get_num_intlv_chan(struct addr_ctx *ctx) case DF4p5_NPS1_12CHAN_2K_HASH: return 12; case NOHASH_16CHAN: + case MI3_HASH_16CHAN: case DF4p5_NPS1_16CHAN_1K_HASH: case DF4p5_NPS1_16CHAN_2K_HASH: return 16; @@ -509,6 +525,7 @@ static u8 get_num_intlv_chan(struct addr_ctx *ctx) case DF4p5_NPS0_24CHAN_2K_HASH: return 24; case NOHASH_32CHAN: + case MI3_HASH_32CHAN: return 32; default: atl_debug_on_bad_intlv_mode(ctx); diff --git a/drivers/ras/amd/atl/reg_fields.h b/drivers/ras/amd/atl/reg_fields.h index 6aaa5093f42c..9dcdf6e4a856 100644 --- a/drivers/ras/amd/atl/reg_fields.h +++ b/drivers/ras/amd/atl/reg_fields.h @@ -246,11 +246,11 @@ #define DF3_HASH_CTL_64K BIT(20) #define DF3_HASH_CTL_2M BIT(21) #define DF3_HASH_CTL_1G BIT(22) -#define DF4_HASH_CTL_4K BIT(7) #define DF4_HASH_CTL_64K BIT(8) #define DF4_HASH_CTL_2M BIT(9) #define DF4_HASH_CTL_1G BIT(10) -#define DF4_HASH_CTL_1T BIT(15) +#define DF4p5_HASH_CTL_4K BIT(7) +#define DF4p5_HASH_CTL_1T BIT(15) /* * High Address Offset @@ -268,10 +268,13 @@ * D18F7x140 [DRAM Offset] * DF4 HiAddrOffset [24:1] * DF4p5 HiAddrOffset [24:1] + * MI300 HiAddrOffset [31:1] */ #define DF2_HI_ADDR_OFFSET GENMASK(31, 20) #define DF3_HI_ADDR_OFFSET GENMASK(31, 12) -#define DF4_HI_ADDR_OFFSET GENMASK(24, 1) + +/* Follow reference code by including reserved bits for simplicity. */ +#define DF4_HI_ADDR_OFFSET GENMASK(31, 1) /* * High Address Offset Enable diff --git a/drivers/ras/amd/atl/system.c b/drivers/ras/amd/atl/system.c index af61f2f1d6de..46493ed405d6 100644 --- a/drivers/ras/amd/atl/system.c +++ b/drivers/ras/amd/atl/system.c @@ -124,6 +124,9 @@ static int df4_determine_df_rev(u32 reg) if (reg == DF_FUNC0_ID_ZEN4_SERVER) df_cfg.flags.socket_id_shift_quirk = 1; + if (reg == DF_FUNC0_ID_MI300) + df_cfg.flags.heterogeneous = 1; + return df4_get_fabric_id_mask_registers(); } diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c index 9d51e4954687..7bfa21a582f0 100644 --- a/drivers/ras/amd/atl/umc.c +++ b/drivers/ras/amd/atl/umc.c @@ -12,8 +12,56 @@ #include "internal.h" +/* + * MI300 has a fixed, model-specific mapping between a UMC instance and + * its related Data Fabric Coherent Station instance. + * + * The MCA_IPID_UMC[InstanceId] field holds a unique identifier for the + * UMC instance within a Node. Use this to find the appropriate Coherent + * Station ID. + * + * Redundant bits were removed from the map below. + */ +static const u16 umc_coh_st_map[32] = { + 0x393, 0x293, 0x193, 0x093, + 0x392, 0x292, 0x192, 0x092, + 0x391, 0x291, 0x191, 0x091, + 0x390, 0x290, 0x190, 0x090, + 0x793, 0x693, 0x593, 0x493, + 0x792, 0x692, 0x592, 0x492, + 0x791, 0x691, 0x591, 0x491, + 0x790, 0x690, 0x590, 0x490, +}; + +#define UMC_ID_MI300 GENMASK(23, 12) +static u8 get_coh_st_inst_id_mi300(struct atl_err *err) +{ + u16 umc_id = FIELD_GET(UMC_ID_MI300, err->ipid); + u8 i; + + for (i = 0; i < ARRAY_SIZE(umc_coh_st_map); i++) { + if (umc_id == umc_coh_st_map[i]) + break; + } + + WARN_ON_ONCE(i >= ARRAY_SIZE(umc_coh_st_map)); + + return i; +} + +#define MCA_IPID_INST_ID_HI GENMASK_ULL(47, 44) static u8 get_die_id(struct atl_err *err) { + /* + * AMD Node ID is provided in MCA_IPID[InstanceIdHi], and this + * needs to be divided by 4 to get the internal Die ID. + */ + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) { + u8 node_id = FIELD_GET(MCA_IPID_INST_ID_HI, err->ipid); + + return node_id >> 2; + } + /* * For CPUs, this is the AMD Node ID modulo the number * of AMD Nodes per socket. @@ -24,6 +72,9 @@ static u8 get_die_id(struct atl_err *err) #define UMC_CHANNEL_NUM GENMASK(31, 20) static u8 get_coh_st_inst_id(struct atl_err *err) { + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + return get_coh_st_inst_id_mi300(err); + return FIELD_GET(UMC_CHANNEL_NUM, err->ipid); } -- cgit v1.2.3 From a7b57372e1c5c848cbe9169574f07a9ee2177a1b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 31 Jan 2024 11:24:25 +0300 Subject: RAS/AMD/ATL: Fix array overflow in get_logical_coh_st_fabric_id_mi300() Check against ARRAY_SIZE() which is the number of elements instead of sizeof() which is the number of bytes. Fixes: 453f0ae79732 ("RAS/AMD/ATL: Add MI300 support") Signed-off-by: Dan Carpenter Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/279c8b5e-6c00-467a-9071-9c67926abea4@moroto.mountain --- drivers/ras/amd/atl/denormalize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ras/amd/atl/denormalize.c b/drivers/ras/amd/atl/denormalize.c index d5d0e1fda159..49a900e066f1 100644 --- a/drivers/ras/amd/atl/denormalize.c +++ b/drivers/ras/amd/atl/denormalize.c @@ -405,7 +405,7 @@ static const u16 phy_to_log_coh_st_map_mi300[] = { static u16 get_logical_coh_st_fabric_id_mi300(struct addr_ctx *ctx) { - if (ctx->inst_id >= sizeof(phy_to_log_coh_st_map_mi300)) { + if (ctx->inst_id >= ARRAY_SIZE(phy_to_log_coh_st_map_mi300)) { atl_debug(ctx, "Instance ID out of range"); return ~0; } -- cgit v1.2.3 From 87a61237530769d5a7a750fbc747ac0d1b2e18c1 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Wed, 31 Jan 2024 10:57:32 -0600 Subject: RAS/AMD/ATL: Add MI300 DRAM to normalized address translation support Zen-based AMD systems report DRAM ECC errors through Unified Memory Controller (UMC) MCA banks. The value provided in MCA_ADDR is a "normalized" address which represents the UMC's view of its managed memory. The normalized address must be translated to a system physical address for software to take action. MI300 systems, uniquely, do not provide a normalized address in MCA_ADDR for DRAM ECC errors. Rather, the "DRAM" address is reported. This value includes identifiers for the bank, row, column, pseudochannel and stack of the memory location. The DRAM address must be converted to a normalized address in order to be further translated to a system physical address. Add helper functions to do the DRAM to normalized translation for MI300 systems. The method is based on the fixed hardware layout of the on-chip memory. [ bp: Massage commit message, decapitalize some, rename function. ] Signed-off-by: Yazen Ghannam Co-developed-by: Muralidhara M K Signed-off-by: Muralidhara M K Signed-off-by: Borislav Petkov (AMD) Tested-by: Muralidhara M K Link: https://lore.kernel.org/r/20240131165732.88297-1-yazen.ghannam@amd.com --- drivers/ras/amd/atl/internal.h | 1 + drivers/ras/amd/atl/system.c | 6 +- drivers/ras/amd/atl/umc.c | 200 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 205 insertions(+), 2 deletions(-) diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h index 21d45755e5f2..5de69e0bb0f9 100644 --- a/drivers/ras/amd/atl/internal.h +++ b/drivers/ras/amd/atl/internal.h @@ -224,6 +224,7 @@ int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo); int get_df_system_info(void); int determine_node_id(struct addr_ctx *ctx, u8 socket_num, u8 die_num); +int get_addr_hash_mi300(void); int get_address_map(struct addr_ctx *ctx); diff --git a/drivers/ras/amd/atl/system.c b/drivers/ras/amd/atl/system.c index 46493ed405d6..701349e84942 100644 --- a/drivers/ras/amd/atl/system.c +++ b/drivers/ras/amd/atl/system.c @@ -124,9 +124,13 @@ static int df4_determine_df_rev(u32 reg) if (reg == DF_FUNC0_ID_ZEN4_SERVER) df_cfg.flags.socket_id_shift_quirk = 1; - if (reg == DF_FUNC0_ID_MI300) + if (reg == DF_FUNC0_ID_MI300) { df_cfg.flags.heterogeneous = 1; + if (get_addr_hash_mi300()) + return -EINVAL; + } + return df4_get_fabric_id_mask_registers(); } diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c index 7bfa21a582f0..7e310d1dfcfc 100644 --- a/drivers/ras/amd/atl/umc.c +++ b/drivers/ras/amd/atl/umc.c @@ -49,6 +49,204 @@ static u8 get_coh_st_inst_id_mi300(struct atl_err *err) return i; } +/* XOR the bits in @val. */ +static u16 bitwise_xor_bits(u16 val) +{ + u16 tmp = 0; + u8 i; + + for (i = 0; i < 16; i++) + tmp ^= (val >> i) & 0x1; + + return tmp; +} + +struct xor_bits { + bool xor_enable; + u16 col_xor; + u32 row_xor; +}; + +#define NUM_BANK_BITS 4 + +static struct { + /* UMC::CH::AddrHashBank */ + struct xor_bits bank[NUM_BANK_BITS]; + + /* UMC::CH::AddrHashPC */ + struct xor_bits pc; + + /* UMC::CH::AddrHashPC2 */ + u8 bank_xor; +} addr_hash; + +#define MI300_UMC_CH_BASE 0x90000 +#define MI300_ADDR_HASH_BANK0 (MI300_UMC_CH_BASE + 0xC8) +#define MI300_ADDR_HASH_PC (MI300_UMC_CH_BASE + 0xE0) +#define MI300_ADDR_HASH_PC2 (MI300_UMC_CH_BASE + 0xE4) + +#define ADDR_HASH_XOR_EN BIT(0) +#define ADDR_HASH_COL_XOR GENMASK(13, 1) +#define ADDR_HASH_ROW_XOR GENMASK(31, 14) +#define ADDR_HASH_BANK_XOR GENMASK(5, 0) + +/* + * Read UMC::CH::AddrHash{Bank,PC,PC2} registers to get XOR bits used + * for hashing. Do this during module init, since the values will not + * change during run time. + * + * These registers are instantiated for each UMC across each AMD Node. + * However, they should be identically programmed due to the fixed hardware + * design of MI300 systems. So read the values from Node 0 UMC 0 and keep a + * single global structure for simplicity. + */ +int get_addr_hash_mi300(void) +{ + u32 temp; + int ret; + u8 i; + + for (i = 0; i < NUM_BANK_BITS; i++) { + ret = amd_smn_read(0, MI300_ADDR_HASH_BANK0 + (i * 4), &temp); + if (ret) + return ret; + + addr_hash.bank[i].xor_enable = FIELD_GET(ADDR_HASH_XOR_EN, temp); + addr_hash.bank[i].col_xor = FIELD_GET(ADDR_HASH_COL_XOR, temp); + addr_hash.bank[i].row_xor = FIELD_GET(ADDR_HASH_ROW_XOR, temp); + } + + ret = amd_smn_read(0, MI300_ADDR_HASH_PC, &temp); + if (ret) + return ret; + + addr_hash.pc.xor_enable = FIELD_GET(ADDR_HASH_XOR_EN, temp); + addr_hash.pc.col_xor = FIELD_GET(ADDR_HASH_COL_XOR, temp); + addr_hash.pc.row_xor = FIELD_GET(ADDR_HASH_ROW_XOR, temp); + + ret = amd_smn_read(0, MI300_ADDR_HASH_PC2, &temp); + if (ret) + return ret; + + addr_hash.bank_xor = FIELD_GET(ADDR_HASH_BANK_XOR, temp); + + return 0; +} + +/* + * MI300 systems report a DRAM address in MCA_ADDR for DRAM ECC errors. This must + * be converted to the intermediate normalized address (NA) before translating to a + * system physical address. + * + * The DRAM address includes bank, row, and column. Also included are bits for + * pseudochannel (PC) and stack ID (SID). + * + * Abbreviations: (S)tack ID, (P)seudochannel, (R)ow, (B)ank, (C)olumn, (Z)ero + * + * The MCA address format is as follows: + * MCA_ADDR[27:0] = {S[1:0], P[0], R[14:0], B[3:0], C[4:0], Z[0]} + * + * The normalized address format is fixed in hardware and is as follows: + * NA[30:0] = {S[1:0], R[13:0], C4, B[1:0], B[3:2], C[3:2], P, C[1:0], Z[4:0]} + * + * Additionally, the PC and Bank bits may be hashed. This must be accounted for before + * reconstructing the normalized address. + */ +#define MI300_UMC_MCA_COL GENMASK(5, 1) +#define MI300_UMC_MCA_BANK GENMASK(9, 6) +#define MI300_UMC_MCA_ROW GENMASK(24, 10) +#define MI300_UMC_MCA_PC BIT(25) +#define MI300_UMC_MCA_SID GENMASK(27, 26) + +#define MI300_NA_COL_1_0 GENMASK(6, 5) +#define MI300_NA_PC BIT(7) +#define MI300_NA_COL_3_2 GENMASK(9, 8) +#define MI300_NA_BANK_3_2 GENMASK(11, 10) +#define MI300_NA_BANK_1_0 GENMASK(13, 12) +#define MI300_NA_COL_4 BIT(14) +#define MI300_NA_ROW GENMASK(28, 15) +#define MI300_NA_SID GENMASK(30, 29) + +static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr) +{ + u16 i, col, row, bank, pc, sid, temp; + + col = FIELD_GET(MI300_UMC_MCA_COL, addr); + bank = FIELD_GET(MI300_UMC_MCA_BANK, addr); + row = FIELD_GET(MI300_UMC_MCA_ROW, addr); + pc = FIELD_GET(MI300_UMC_MCA_PC, addr); + sid = FIELD_GET(MI300_UMC_MCA_SID, addr); + + /* Calculate hash for each Bank bit. */ + for (i = 0; i < NUM_BANK_BITS; i++) { + if (!addr_hash.bank[i].xor_enable) + continue; + + temp = bitwise_xor_bits(col & addr_hash.bank[i].col_xor); + temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor); + bank ^= temp << i; + } + + /* Calculate hash for PC bit. */ + if (addr_hash.pc.xor_enable) { + /* Bits SID[1:0] act as Bank[6:5] for PC hash, so apply them here. */ + bank |= sid << 5; + + temp = bitwise_xor_bits(col & addr_hash.pc.col_xor); + temp ^= bitwise_xor_bits(row & addr_hash.pc.row_xor); + temp ^= bitwise_xor_bits(bank & addr_hash.bank_xor); + pc ^= temp; + + /* Drop SID bits for the sake of debug printing later. */ + bank &= 0x1F; + } + + /* Reconstruct the normalized address starting with NA[4:0] = 0 */ + addr = 0; + + /* NA[6:5] = Column[1:0] */ + temp = col & 0x3; + addr |= FIELD_PREP(MI300_NA_COL_1_0, temp); + + /* NA[7] = PC */ + addr |= FIELD_PREP(MI300_NA_PC, pc); + + /* NA[9:8] = Column[3:2] */ + temp = (col >> 2) & 0x3; + addr |= FIELD_PREP(MI300_NA_COL_3_2, temp); + + /* NA[11:10] = Bank[3:2] */ + temp = (bank >> 2) & 0x3; + addr |= FIELD_PREP(MI300_NA_BANK_3_2, temp); + + /* NA[13:12] = Bank[1:0] */ + temp = bank & 0x3; + addr |= FIELD_PREP(MI300_NA_BANK_1_0, temp); + + /* NA[14] = Column[4] */ + temp = (col >> 4) & 0x1; + addr |= FIELD_PREP(MI300_NA_COL_4, temp); + + /* NA[28:15] = Row[13:0] */ + addr |= FIELD_PREP(MI300_NA_ROW, row); + + /* NA[30:29] = SID[1:0] */ + addr |= FIELD_PREP(MI300_NA_SID, sid); + + pr_debug("Addr=0x%016lx", addr); + pr_debug("Bank=%u Row=%u Column=%u PC=%u SID=%u", bank, row, col, pc, sid); + + return addr; +} + +static unsigned long get_addr(unsigned long addr) +{ + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + return convert_dram_to_norm_addr_mi300(addr); + + return addr; +} + #define MCA_IPID_INST_ID_HI GENMASK_ULL(47, 44) static u8 get_die_id(struct atl_err *err) { @@ -82,7 +280,7 @@ unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { u8 socket_id = topology_physical_package_id(err->cpu); u8 coh_st_inst_id = get_coh_st_inst_id(err); - unsigned long addr = err->addr; + unsigned long addr = get_addr(err->addr); u8 die_id = get_die_id(err); pr_debug("socket_id=0x%x die_id=0x%x coh_st_inst_id=0x%x addr=0x%016lx", -- cgit v1.2.3 From 65c441ec582247757059e0662fb6f9ebce4965f2 Mon Sep 17 00:00:00 2001 From: Lili Li Date: Mon, 29 Jan 2024 14:20:39 +0800 Subject: EDAC/igen6: Add one more Intel Alder Lake-N SoC support Add a new Intel Alder Lake-N SoC compute die ID for EDAC support. Signed-off-by: Lili Li Signed-off-by: Tony Luck Reviewed-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20240129062040.60809-2-qiuxu.zhuo@intel.com --- drivers/edac/igen6_edac.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index 2b0ecdeba5cd..cdd8480e7368 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -238,6 +238,7 @@ static struct work_struct ecclog_work; #define DID_ADL_N_SKU9 0x4678 #define DID_ADL_N_SKU10 0x4679 #define DID_ADL_N_SKU11 0x467c +#define DID_ADL_N_SKU12 0x4632 /* Compute die IDs for Raptor Lake-P with IBECC */ #define DID_RPL_P_SKU1 0xa706 @@ -583,6 +584,7 @@ static const struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_ADL_N_SKU9), (kernel_ulong_t)&adl_n_cfg }, { PCI_VDEVICE(INTEL, DID_ADL_N_SKU10), (kernel_ulong_t)&adl_n_cfg }, { PCI_VDEVICE(INTEL, DID_ADL_N_SKU11), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU12), (kernel_ulong_t)&adl_n_cfg }, { PCI_VDEVICE(INTEL, DID_RPL_P_SKU1), (kernel_ulong_t)&rpl_p_cfg }, { PCI_VDEVICE(INTEL, DID_RPL_P_SKU2), (kernel_ulong_t)&rpl_p_cfg }, { PCI_VDEVICE(INTEL, DID_RPL_P_SKU3), (kernel_ulong_t)&rpl_p_cfg }, -- cgit v1.2.3 From e77086c3750834553cf6fd2255c5f3ee04843ed8 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Mon, 29 Jan 2024 14:20:40 +0800 Subject: EDAC/i10nm: Add Intel Grand Ridge micro-server support The Grand Ridge CPU model uses similar memory controller registers with Granite Rapids server. Add Grand Ridge CPU model ID for EDAC support. Tested-by: Ricardo Neri Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20240129062040.60809-3-qiuxu.zhuo@intel.com --- drivers/edac/i10nm_base.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 2b83d6de9352..3fd22a1eb1a9 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -951,6 +951,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(EMERALDRAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(GRANITERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_CRESTMONT_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_CRESTMONT, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), {} }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); -- cgit v1.2.3 From 83bf24051a60d867e7633e07343913593c242f5d Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Thu, 8 Feb 2024 15:16:53 +0530 Subject: EDAC/versal: Make the bit position of injected errors configurable Currently, the bit positions to inject correctable and uncorrectable errors are hardcoded. To make that configurable add separate sysfs entries to set the bit positions for injecting CE and UE errors. Allow for single bit error for CE and two bits errors for UE injection. [ bp: Massage. ] Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240208094653.11704-1-shubhrajyoti.datta@amd.com --- drivers/edac/versal_edac.c | 193 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 161 insertions(+), 32 deletions(-) diff --git a/drivers/edac/versal_edac.c b/drivers/edac/versal_edac.c index 62caf454b567..3016870689f1 100644 --- a/drivers/edac/versal_edac.c +++ b/drivers/edac/versal_edac.c @@ -42,8 +42,11 @@ #define ECCW0_FLIP_CTRL 0x109C #define ECCW0_FLIP0_OFFSET 0x10A0 +#define ECCW0_FLIP0_BITS 31 +#define ECCW0_FLIP1_OFFSET 0x10A4 #define ECCW1_FLIP_CTRL 0x10AC #define ECCW1_FLIP0_OFFSET 0x10B0 +#define ECCW1_FLIP1_OFFSET 0x10B4 #define ECCR0_CERR_STAT_OFFSET 0x10BC #define ECCR0_CE_ADDR_LO_OFFSET 0x10C0 #define ECCR0_CE_ADDR_HI_OFFSET 0x10C4 @@ -116,9 +119,6 @@ #define XDDR_BUS_WIDTH_32 1 #define XDDR_BUS_WIDTH_16 2 -#define ECC_CEPOISON_MASK 0x1 -#define ECC_UEPOISON_MASK 0x3 - #define XDDR_MAX_ROW_CNT 18 #define XDDR_MAX_COL_CNT 10 #define XDDR_MAX_RANK_CNT 2 @@ -133,6 +133,7 @@ * https://docs.xilinx.com/r/en-US/am012-versal-register-reference/PCSR_LOCK-XRAM_SLCR-Register */ #define PCSR_UNLOCK_VAL 0xF9E8D7C6 +#define PCSR_LOCK_VAL 1 #define XDDR_ERR_TYPE_CE 0 #define XDDR_ERR_TYPE_UE 1 @@ -142,6 +143,7 @@ #define XILINX_DRAM_SIZE_12G 3 #define XILINX_DRAM_SIZE_16G 4 #define XILINX_DRAM_SIZE_32G 5 +#define NUM_UE_BITPOS 2 /** * struct ecc_error_info - ECC error log information. @@ -479,7 +481,7 @@ static void err_callback(const u32 *payload, void *data) writel(regval, priv->ddrmc_baseaddr + XDDR_ISR_OFFSET); /* Lock the PCSR registers */ - writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET); + writel(PCSR_LOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET); edac_dbg(3, "Total error count CE %d UE %d\n", priv->ce_cnt, priv->ue_cnt); } @@ -650,7 +652,7 @@ static void enable_intr(struct edac_priv *priv) writel(XDDR_IRQ_UE_MASK, priv->ddrmc_baseaddr + XDDR_IRQ1_EN_OFFSET); /* Lock the PCSR registers */ - writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET); + writel(PCSR_LOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET); } static void disable_intr(struct edac_priv *priv) @@ -663,7 +665,7 @@ static void disable_intr(struct edac_priv *priv) priv->ddrmc_baseaddr + XDDR_IRQ_DIS_OFFSET); /* Lock the PCSR registers */ - writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET); + writel(PCSR_LOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET); } #define to_mci(k) container_of(k, struct mem_ctl_info, dev) @@ -734,38 +736,63 @@ static void poison_setup(struct edac_priv *priv) writel(regval, priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC15_OFFSET); } -static ssize_t xddr_inject_data_poison_store(struct mem_ctl_info *mci, - const char __user *data) +static void xddr_inject_data_ce_store(struct mem_ctl_info *mci, u8 ce_bitpos) { + u32 ecc0_flip0, ecc1_flip0, ecc0_flip1, ecc1_flip1; struct edac_priv *priv = mci->pvt_info; - writel(0, priv->ddrmc_baseaddr + ECCW0_FLIP0_OFFSET); - writel(0, priv->ddrmc_baseaddr + ECCW1_FLIP0_OFFSET); - - if (strncmp(data, "CE", 2) == 0) { - writel(ECC_CEPOISON_MASK, priv->ddrmc_baseaddr + - ECCW0_FLIP0_OFFSET); - writel(ECC_CEPOISON_MASK, priv->ddrmc_baseaddr + - ECCW1_FLIP0_OFFSET); + if (ce_bitpos < ECCW0_FLIP0_BITS) { + ecc0_flip0 = BIT(ce_bitpos); + ecc1_flip0 = BIT(ce_bitpos); + ecc0_flip1 = 0; + ecc1_flip1 = 0; } else { - writel(ECC_UEPOISON_MASK, priv->ddrmc_baseaddr + - ECCW0_FLIP0_OFFSET); - writel(ECC_UEPOISON_MASK, priv->ddrmc_baseaddr + - ECCW1_FLIP0_OFFSET); + ce_bitpos = ce_bitpos - ECCW0_FLIP0_BITS; + ecc0_flip1 = BIT(ce_bitpos); + ecc1_flip1 = BIT(ce_bitpos); + ecc0_flip0 = 0; + ecc1_flip0 = 0; } - /* Lock the PCSR registers */ - writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET); - - return 0; + writel(ecc0_flip0, priv->ddrmc_baseaddr + ECCW0_FLIP0_OFFSET); + writel(ecc1_flip0, priv->ddrmc_baseaddr + ECCW1_FLIP0_OFFSET); + writel(ecc0_flip1, priv->ddrmc_baseaddr + ECCW0_FLIP1_OFFSET); + writel(ecc1_flip1, priv->ddrmc_baseaddr + ECCW1_FLIP1_OFFSET); } -static ssize_t inject_data_poison_store(struct file *file, const char __user *data, - size_t count, loff_t *ppos) +/* + * To inject a correctable error, the following steps are needed: + * + * - Write the correctable error bit position value: + * echo > /sys/kernel/debug/edac//inject_ce + * + * poison_setup() derives the row, column, bank, group and rank and + * writes to the ADEC registers based on the address given by the user. + * + * The ADEC12 and ADEC13 are mask registers; write 0 to make sure default + * configuration is there and no addresses are masked. + * + * The row, column, bank, group and rank registers are written to the + * match ADEC bit to generate errors at the particular address. ADEC14 + * and ADEC15 have the match bits. + * + * xddr_inject_data_ce_store() updates the ECC FLIP registers with the + * bits to be corrupted based on the bit position given by the user. + * + * Upon doing a read to the address the errors are injected. + */ +static ssize_t inject_data_ce_store(struct file *file, const char __user *data, + size_t count, loff_t *ppos) { struct device *dev = file->private_data; struct mem_ctl_info *mci = to_mci(dev); struct edac_priv *priv = mci->pvt_info; + u8 ce_bitpos; + int ret; + + ret = kstrtou8_from_user(data, count, 0, &ce_bitpos); + if (ret) + return ret; /* Unlock the PCSR registers */ writel(PCSR_UNLOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET); @@ -773,17 +800,110 @@ static ssize_t inject_data_poison_store(struct file *file, const char __user *da poison_setup(priv); + xddr_inject_data_ce_store(mci, ce_bitpos); + ret = count; + /* Lock the PCSR registers */ - writel(1, priv->ddrmc_noc_baseaddr + XDDR_PCSR_OFFSET); + writel(PCSR_LOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET); + writel(PCSR_LOCK_VAL, priv->ddrmc_noc_baseaddr + XDDR_PCSR_OFFSET); + + return ret; +} + +static const struct file_operations xddr_inject_ce_fops = { + .open = simple_open, + .write = inject_data_ce_store, + .llseek = generic_file_llseek, +}; + +static void xddr_inject_data_ue_store(struct mem_ctl_info *mci, u32 val0, u32 val1) +{ + struct edac_priv *priv = mci->pvt_info; + + writel(val0, priv->ddrmc_baseaddr + ECCW0_FLIP0_OFFSET); + writel(val0, priv->ddrmc_baseaddr + ECCW0_FLIP1_OFFSET); + writel(val1, priv->ddrmc_baseaddr + ECCW1_FLIP1_OFFSET); + writel(val1, priv->ddrmc_baseaddr + ECCW1_FLIP1_OFFSET); +} + +/* + * To inject an uncorrectable error, the following steps are needed: + * echo > /sys/kernel/debug/edac//inject_ue + * + * poison_setup() derives the row, column, bank, group and rank and + * writes to the ADEC registers based on the address given by the user. + * + * The ADEC12 and ADEC13 are mask registers; write 0 so that none of the + * addresses are masked. The row, column, bank, group and rank registers + * are written to the match ADEC bit to generate errors at the + * particular address. ADEC14 and ADEC15 have the match bits. + * + * xddr_inject_data_ue_store() updates the ECC FLIP registers with the + * bits to be corrupted based on the bit position given by the user. For + * uncorrectable errors + * 2 bit errors are injected. + * + * Upon doing a read to the address the errors are injected. + */ +static ssize_t inject_data_ue_store(struct file *file, const char __user *data, + size_t count, loff_t *ppos) +{ + struct device *dev = file->private_data; + struct mem_ctl_info *mci = to_mci(dev); + struct edac_priv *priv = mci->pvt_info; + char buf[6], *pbuf, *token[2]; + u32 val0 = 0, val1 = 0; + u8 len, ue0, ue1; + int i, ret; + + len = min_t(size_t, count, sizeof(buf)); + if (copy_from_user(buf, data, len)) + return -EFAULT; + + buf[len] = '\0'; + pbuf = &buf[0]; + for (i = 0; i < NUM_UE_BITPOS; i++) + token[i] = strsep(&pbuf, ","); + + ret = kstrtou8(token[0], 0, &ue0); + if (ret) + return ret; + + ret = kstrtou8(token[1], 0, &ue1); + if (ret) + return ret; + + if (ue0 < ECCW0_FLIP0_BITS) { + val0 = BIT(ue0); + } else { + ue0 = ue0 - ECCW0_FLIP0_BITS; + val1 = BIT(ue0); + } - xddr_inject_data_poison_store(mci, data); + if (ue1 < ECCW0_FLIP0_BITS) { + val0 |= BIT(ue1); + } else { + ue1 = ue1 - ECCW0_FLIP0_BITS; + val1 |= BIT(ue1); + } + /* Unlock the PCSR registers */ + writel(PCSR_UNLOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET); + writel(PCSR_UNLOCK_VAL, priv->ddrmc_noc_baseaddr + XDDR_PCSR_OFFSET); + + poison_setup(priv); + + xddr_inject_data_ue_store(mci, val0, val1); + + /* Lock the PCSR registers */ + writel(PCSR_LOCK_VAL, priv->ddrmc_noc_baseaddr + XDDR_PCSR_OFFSET); + writel(PCSR_LOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET); return count; } -static const struct file_operations xddr_inject_enable_fops = { +static const struct file_operations xddr_inject_ue_fops = { .open = simple_open, - .write = inject_data_poison_store, + .write = inject_data_ue_store, .llseek = generic_file_llseek, }; @@ -795,8 +915,17 @@ static void create_debugfs_attributes(struct mem_ctl_info *mci) if (!priv->debugfs) return; - edac_debugfs_create_file("inject_error", 0200, priv->debugfs, - &mci->dev, &xddr_inject_enable_fops); + if (!edac_debugfs_create_file("inject_ce", 0200, priv->debugfs, + &mci->dev, &xddr_inject_ce_fops)) { + debugfs_remove_recursive(priv->debugfs); + return; + } + + if (!edac_debugfs_create_file("inject_ue", 0200, priv->debugfs, + &mci->dev, &xddr_inject_ue_fops)) { + debugfs_remove_recursive(priv->debugfs); + return; + } debugfs_create_x64("address", 0600, priv->debugfs, &priv->err_inject_addr); mci->debugfs = priv->debugfs; -- cgit v1.2.3 From 0e4fd816b08e85484e4dbe06e91466c85273f8e0 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 24 Jan 2024 13:37:52 +0100 Subject: Documentation: Move RAS section to admin-guide This is where this stuff should be. Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/87a5pes8jy.fsf@meer.lwn.net --- Documentation/RAS/address-translation.rst | 24 - Documentation/RAS/error-decoding.rst | 21 - Documentation/RAS/index.rst | 14 - .../admin-guide/RAS/address-translation.rst | 24 + Documentation/admin-guide/RAS/error-decoding.rst | 21 + Documentation/admin-guide/RAS/index.rst | 7 + Documentation/admin-guide/RAS/main.rst | 1223 ++++++++++++++++++++ Documentation/admin-guide/index.rst | 2 +- Documentation/admin-guide/ras.rst | 1219 ------------------- Documentation/index.rst | 1 - MAINTAINERS | 4 +- 11 files changed, 1277 insertions(+), 1283 deletions(-) delete mode 100644 Documentation/RAS/address-translation.rst delete mode 100644 Documentation/RAS/error-decoding.rst delete mode 100644 Documentation/RAS/index.rst create mode 100644 Documentation/admin-guide/RAS/address-translation.rst create mode 100644 Documentation/admin-guide/RAS/error-decoding.rst create mode 100644 Documentation/admin-guide/RAS/index.rst create mode 100644 Documentation/admin-guide/RAS/main.rst delete mode 100644 Documentation/admin-guide/ras.rst diff --git a/Documentation/RAS/address-translation.rst b/Documentation/RAS/address-translation.rst deleted file mode 100644 index f0ca17b43cd3..000000000000 --- a/Documentation/RAS/address-translation.rst +++ /dev/null @@ -1,24 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Address translation -=================== - -x86 AMD -------- - -Zen-based AMD systems include a Data Fabric that manages the layout of -physical memory. Devices attached to the Fabric, like memory controllers, -I/O, etc., may not have a complete view of the system physical memory map. -These devices may provide a "normalized", i.e. device physical, address -when reporting memory errors. Normalized addresses must be translated to -a system physical address for the kernel to action on the memory. - -AMD Address Translation Library (CONFIG_AMD_ATL) provides translation for -this case. - -Glossary of acronyms used in address translation for Zen-based systems - -* CCM = Cache Coherent Moderator -* COD = Cluster-on-Die -* COH_ST = Coherent Station -* DF = Data Fabric diff --git a/Documentation/RAS/error-decoding.rst b/Documentation/RAS/error-decoding.rst deleted file mode 100644 index 26a72f3fe5de..000000000000 --- a/Documentation/RAS/error-decoding.rst +++ /dev/null @@ -1,21 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Error decoding -============== - -x86 ---- - -Error decoding on AMD systems should be done using the rasdaemon tool: -https://github.com/mchehab/rasdaemon/ - -While the daemon is running, it would automatically log and decode -errors. If not, one can still decode such errors by supplying the -hardware information from the error:: - - $ rasdaemon -p --status --ipid --smca - -Also, the user can pass particular family and model to decode the error -string:: - - $ rasdaemon -p --status --ipid --smca --family --model --bank diff --git a/Documentation/RAS/index.rst b/Documentation/RAS/index.rst deleted file mode 100644 index 2794c1816e90..000000000000 --- a/Documentation/RAS/index.rst +++ /dev/null @@ -1,14 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=========================================================== -Reliability, Availability and Serviceability (RAS) features -=========================================================== - -This documents different aspects of the RAS functionality present in the -kernel. - -.. toctree:: - :maxdepth: 2 - - error-decoding - address-translation diff --git a/Documentation/admin-guide/RAS/address-translation.rst b/Documentation/admin-guide/RAS/address-translation.rst new file mode 100644 index 000000000000..f0ca17b43cd3 --- /dev/null +++ b/Documentation/admin-guide/RAS/address-translation.rst @@ -0,0 +1,24 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Address translation +=================== + +x86 AMD +------- + +Zen-based AMD systems include a Data Fabric that manages the layout of +physical memory. Devices attached to the Fabric, like memory controllers, +I/O, etc., may not have a complete view of the system physical memory map. +These devices may provide a "normalized", i.e. device physical, address +when reporting memory errors. Normalized addresses must be translated to +a system physical address for the kernel to action on the memory. + +AMD Address Translation Library (CONFIG_AMD_ATL) provides translation for +this case. + +Glossary of acronyms used in address translation for Zen-based systems + +* CCM = Cache Coherent Moderator +* COD = Cluster-on-Die +* COH_ST = Coherent Station +* DF = Data Fabric diff --git a/Documentation/admin-guide/RAS/error-decoding.rst b/Documentation/admin-guide/RAS/error-decoding.rst new file mode 100644 index 000000000000..26a72f3fe5de --- /dev/null +++ b/Documentation/admin-guide/RAS/error-decoding.rst @@ -0,0 +1,21 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Error decoding +============== + +x86 +--- + +Error decoding on AMD systems should be done using the rasdaemon tool: +https://github.com/mchehab/rasdaemon/ + +While the daemon is running, it would automatically log and decode +errors. If not, one can still decode such errors by supplying the +hardware information from the error:: + + $ rasdaemon -p --status --ipid --smca + +Also, the user can pass particular family and model to decode the error +string:: + + $ rasdaemon -p --status --ipid --smca --family --model --bank diff --git a/Documentation/admin-guide/RAS/index.rst b/Documentation/admin-guide/RAS/index.rst new file mode 100644 index 000000000000..f4087040a7c0 --- /dev/null +++ b/Documentation/admin-guide/RAS/index.rst @@ -0,0 +1,7 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. toctree:: + :maxdepth: 2 + + main + error-decoding + address-translation diff --git a/Documentation/admin-guide/RAS/main.rst b/Documentation/admin-guide/RAS/main.rst new file mode 100644 index 000000000000..7ac1d4ccc509 --- /dev/null +++ b/Documentation/admin-guide/RAS/main.rst @@ -0,0 +1,1223 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +================================================== +Reliability, Availability and Serviceability (RAS) +================================================== + +This documents different aspects of the RAS functionality present in the +kernel. + +RAS concepts +************ + +Reliability, Availability and Serviceability (RAS) is a concept used on +servers meant to measure their robustness. + +Reliability + is the probability that a system will produce correct outputs. + + * Generally measured as Mean Time Between Failures (MTBF) + * Enhanced by features that help to avoid, detect and repair hardware faults + +Availability + is the probability that a system is operational at a given time + + * Generally measured as a percentage of downtime per a period of time + * Often uses mechanisms to detect and correct hardware faults in + runtime; + +Serviceability (or maintainability) + is the simplicity and speed with which a system can be repaired or + maintained + + * Generally measured on Mean Time Between Repair (MTBR) + +Improving RAS +------------- + +In order to reduce systems downtime, a system should be capable of detecting +hardware errors, and, when possible correcting them in runtime. It should +also provide mechanisms to detect hardware degradation, in order to warn +the system administrator to take the action of replacing a component before +it causes data loss or system downtime. + +Among the monitoring measures, the most usual ones include: + +* CPU – detect errors at instruction execution and at L1/L2/L3 caches; +* Memory – add error correction logic (ECC) to detect and correct errors; +* I/O – add CRC checksums for transferred data; +* Storage – RAID, journal file systems, checksums, + Self-Monitoring, Analysis and Reporting Technology (SMART). + +By monitoring the number of occurrences of error detections, it is possible +to identify if the probability of hardware errors is increasing, and, on such +case, do a preventive maintenance to replace a degraded component while +those errors are correctable. + +Types of errors +--------------- + +Most mechanisms used on modern systems use technologies like Hamming +Codes that allow error correction when the number of errors on a bit packet +is below a threshold. If the number of errors is above, those mechanisms +can indicate with a high degree of confidence that an error happened, but +they can't correct. + +Also, sometimes an error occur on a component that it is not used. For +example, a part of the memory that it is not currently allocated. + +That defines some categories of errors: + +* **Correctable Error (CE)** - the error detection mechanism detected and + corrected the error. Such errors are usually not fatal, although some + Kernel mechanisms allow the system administrator to consider them as fatal. + +* **Uncorrected Error (UE)** - the amount of errors happened above the error + correction threshold, and the system was unable to auto-correct. + +* **Fatal Error** - when an UE error happens on a critical component of the + system (for example, a piece of the Kernel got corrupted by an UE), the + only reliable way to avoid data corruption is to hang or reboot the machine. + +* **Non-fatal Error** - when an UE error happens on an unused component, + like a CPU in power down state or an unused memory bank, the system may + still run, eventually replacing the affected hardware by a hot spare, + if available. + + Also, when an error happens on a userspace process, it is also possible to + kill such process and let userspace restart it. + +The mechanism for handling non-fatal errors is usually complex and may +require the help of some userspace application, in order to apply the +policy desired by the system administrator. + +Identifying a bad hardware component +------------------------------------ + +Just detecting a hardware flaw is usually not enough, as the system needs +to pinpoint to the minimal replaceable unit (MRU) that should be exchanged +to make the hardware reliable again. + +So, it requires not only error logging facilities, but also mechanisms that +will translate the error message to the silkscreen or component label for +the MRU. + +Typically, it is very complex for memory, as modern CPUs interlace memory +from different memory modules, in order to provide a better performance. The +DMI BIOS usually have a list of memory module labels, with can be obtained +using the ``dmidecode`` tool. For example, on a desktop machine, it shows:: + + Memory Device + Total Width: 64 bits + Data Width: 64 bits + Size: 16384 MB + Form Factor: SODIMM + Set: None + Locator: ChannelA-DIMM0 + Bank Locator: BANK 0 + Type: DDR4 + Type Detail: Synchronous + Speed: 2133 MHz + Rank: 2 + Configured Clock Speed: 2133 MHz + +On the above example, a DDR4 SO-DIMM memory module is located at the +system's memory labeled as "BANK 0", as given by the *bank locator* field. +Please notice that, on such system, the *total width* is equal to the +*data width*. It means that such memory module doesn't have error +detection/correction mechanisms. + +Unfortunately, not all systems use the same field to specify the memory +bank. On this example, from an older server, ``dmidecode`` shows:: + + Memory Device + Array Handle: 0x1000 + Error Information Handle: Not Provided + Total Width: 72 bits + Data Width: 64 bits + Size: 8192 MB + Form Factor: DIMM + Set: 1 + Locator: DIMM_A1 + Bank Locator: Not Specified + Type: DDR3 + Type Detail: Synchronous Registered (Buffered) + Speed: 1600 MHz + Rank: 2 + Configured Clock Speed: 1600 MHz + +There, the DDR3 RDIMM memory module is located at the system's memory labeled +as "DIMM_A1", as given by the *locator* field. Please notice that this +memory module has 64 bits of *data width* and 72 bits of *total width*. So, +it has 8 extra bits to be used by error detection and correction mechanisms. +Such kind of memory is called Error-correcting code memory (ECC memory). + +To make things even worse, it is not uncommon that systems with different +labels on their system's board to use exactly the same BIOS, meaning that +the labels provided by the BIOS won't match the real ones. + +ECC memory +---------- + +As mentioned in the previous section, ECC memory has extra bits to be +used for error correction. In the above example, a memory module has +64 bits of *data width*, and 72 bits of *total width*. The extra 8 +bits which are used for the error detection and correction mechanisms +are referred to as the *syndrome*\ [#f1]_\ [#f2]_. + +So, when the cpu requests the memory controller to write a word with +*data width*, the memory controller calculates the *syndrome* in real time, +using Hamming code, or some other error correction code, like SECDED+, +producing a code with *total width* size. Such code is then written +on the memory modules. + +At read, the *total width* bits code is converted back, using the same +ECC code used on write, producing a word with *data width* and a *syndrome*. +The word with *data width* is sent to the CPU, even when errors happen. + +The memory controller also looks at the *syndrome* in order to check if +there was an error, and if the ECC code was able to fix such error. +If the error was corrected, a Corrected Error (CE) happened. If not, an +Uncorrected Error (UE) happened. + +The information about the CE/UE errors is stored on some special registers +at the memory controller and can be accessed by reading such registers, +either by BIOS, by some special CPUs or by Linux EDAC driver. On x86 64 +bit CPUs, such errors can also be retrieved via the Machine Check +Architecture (MCA)\ [#f3]_. + +.. [#f1] Please notice that several memory controllers allow operation on a + mode called "Lock-Step", where it groups two memory modules together, + doing 128-bit reads/writes. That gives 16 bits for error correction, with + significantly improves the error correction mechanism, at the expense + that, when an error happens, there's no way to know what memory module is + to blame. So, it has to blame both memory modules. + +.. [#f2] Some memory controllers also allow using memory in mirror mode. + On such mode, the same data is written to two memory modules. At read, + the system checks both memory modules, in order to check if both provide + identical data. On such configuration, when an error happens, there's no + way to know what memory module is to blame. So, it has to blame both + memory modules (or 4 memory modules, if the system is also on Lock-step + mode). + +.. [#f3] For more details about the Machine Check Architecture (MCA), + please read Documentation/arch/x86/x86_64/machinecheck.rst at the Kernel tree. + +EDAC - Error Detection And Correction +************************************* + +.. note:: + + "bluesmoke" was the name for this device driver subsystem when it + was "out-of-tree" and maintained at http://bluesmoke.sourceforge.net. + That site is mostly archaic now and can be used only for historical + purposes. + + When the subsystem was pushed upstream for the first time, on + Kernel 2.6.16, it was renamed to ``EDAC``. + +Purpose +------- + +The ``edac`` kernel module's goal is to detect and report hardware errors +that occur within the computer system running under linux. + +Memory +------ + +Memory Correctable Errors (CE) and Uncorrectable Errors (UE) are the +primary errors being harvested. These types of errors are harvested by +the ``edac_mc`` device. + +Detecting CE events, then harvesting those events and reporting them, +**can** but must not necessarily be a predictor of future UE events. With +CE events only, the system can and will continue to operate as no data +has been damaged yet. + +However, preventive maintenance and proactive part replacement of memory +modules exhibiting CEs can reduce the likelihood of the dreaded UE events +and system panics. + +Other hardware elements +----------------------- + +A new feature for EDAC, the ``edac_device`` class of device, was added in +the 2.6.23 version of the kernel. + +This new device type allows for non-memory type of ECC hardware detectors +to have their states harvested and presented to userspace via the sysfs +interface. + +Some architectures have ECC detectors for L1, L2 and L3 caches, +along with DMA engines, fabric switches, main data path switches, +interconnections, and various other hardware data paths. If the hardware +reports it, then a edac_device device probably can be constructed to +harvest and present that to userspace. + + +PCI bus scanning +---------------- + +In addition, PCI devices are scanned for PCI Bus Parity and SERR Errors +in order to determine if errors are occurring during data transfers. + +The presence of PCI Parity errors must be examined with a grain of salt. +There are several add-in adapters that do **not** follow the PCI specification +with regards to Parity generation and reporting. The specification says +the vendor should tie the parity status bits to 0 if they do not intend +to generate parity. Some vendors do not do this, and thus the parity bit +can "float" giving false positives. + +There is a PCI device attribute located in sysfs that is checked by +the EDAC PCI scanning code. If that attribute is set, PCI parity/error +scanning is skipped for that device. The attribute is:: + + broken_parity_status + +and is located in ``/sys/devices/pci/0000:XX:YY.Z`` directories for +PCI devices. + + +Versioning +---------- + +EDAC is composed of a "core" module (``edac_core.ko``) and several Memory +Controller (MC) driver modules. On a given system, the CORE is loaded +and one MC driver will be loaded. Both the CORE and the MC driver (or +``edac_device`` driver) have individual versions that reflect current +release level of their respective modules. + +Thus, to "report" on what version a system is running, one must report +both the CORE's and the MC driver's versions. + + +Loading +------- + +If ``edac`` was statically linked with the kernel then no loading +is necessary. If ``edac`` was built as modules then simply modprobe +the ``edac`` pieces that you need. You should be able to modprobe +hardware-specific modules and have the dependencies load the necessary +core modules. + +Example:: + + $ modprobe amd76x_edac + +loads both the ``amd76x_edac.ko`` memory controller module and the +``edac_mc.ko`` core module. + + +Sysfs interface +--------------- + +EDAC presents a ``sysfs`` interface for control and reporting purposes. It +lives in the /sys/devices/system/edac directory. + +Within this directory there currently reside 2 components: + + ======= ============================== + mc memory controller(s) system + pci PCI control and status system + ======= ============================== + + + +Memory Controller (mc) Model +---------------------------- + +Each ``mc`` device controls a set of memory modules [#f4]_. These modules +are laid out in a Chip-Select Row (``csrowX``) and Channel table (``chX``). +There can be multiple csrows and multiple channels. + +.. [#f4] Nowadays, the term DIMM (Dual In-line Memory Module) is widely + used to refer to a memory module, although there are other memory + packaging alternatives, like SO-DIMM, SIMM, etc. The UEFI + specification (Version 2.7) defines a memory module in the Common + Platform Error Record (CPER) section to be an SMBIOS Memory Device + (Type 17). Along this document, and inside the EDAC subsystem, the term + "dimm" is used for all memory modules, even when they use a + different kind of packaging. + +Memory controllers allow for several csrows, with 8 csrows being a +typical value. Yet, the actual number of csrows depends on the layout of +a given motherboard, memory controller and memory module characteristics. + +Dual channels allow for dual data length (e. g. 128 bits, on 64 bit systems) +data transfers to/from the CPU from/to memory. Some newer chipsets allow +for more than 2 channels, like Fully Buffered DIMMs (FB-DIMMs) memory +controllers. The following example will assume 2 channels: + + +------------+-----------------------+ + | CS Rows | Channels | + +------------+-----------+-----------+ + | | ``ch0`` | ``ch1`` | + +============+===========+===========+ + | |**DIMM_A0**|**DIMM_B0**| + +------------+-----------+-----------+ + | ``csrow0`` | rank0 | rank0 | + +------------+-----------+-----------+ + | ``csrow1`` | rank1 | rank1 | + +------------+-----------+-----------+ + | |**DIMM_A1**|**DIMM_B1**| + +------------+-----------+-----------+ + | ``csrow2`` | rank0 | rank0 | + +------------+-----------+-----------+ + | ``csrow3`` | rank1 | rank1 | + +------------+-----------+-----------+ + +In the above example, there are 4 physical slots on the motherboard +for memory DIMMs: + + +---------+---------+ + | DIMM_A0 | DIMM_B0 | + +---------+---------+ + | DIMM_A1 | DIMM_B1 | + +---------+---------+ + +Labels for these slots are usually silk-screened on the motherboard. +Slots labeled ``A`` are channel 0 in this example. Slots labeled ``B`` are +channel 1. Notice that there are two csrows possible on a physical DIMM. +These csrows are allocated their csrow assignment based on the slot into +which the memory DIMM is placed. Thus, when 1 DIMM is placed in each +Channel, the csrows cross both DIMMs. + +Memory DIMMs come single or dual "ranked". A rank is a populated csrow. +In the example above 2 dual ranked DIMMs are similarly placed. Thus, +both csrow0 and csrow1 are populated. On the other hand, when 2 single +ranked DIMMs are placed in slots DIMM_A0 and DIMM_B0, then they will +have just one csrow (csrow0) and csrow1 will be empty. The pattern +repeats itself for csrow2 and csrow3. Also note that some memory +controllers don't have any logic to identify the memory module, see +``rankX`` directories below. + +The representation of the above is reflected in the directory +tree in EDAC's sysfs interface. Starting in directory +``/sys/devices/system/edac/mc``, each memory controller will be +represented by its own ``mcX`` directory, where ``X`` is the +index of the MC:: + + ..../edac/mc/ + | + |->mc0 + |->mc1 + |->mc2 + .... + +Under each ``mcX`` directory each ``csrowX`` is again represented by a +``csrowX``, where ``X`` is the csrow index:: + + .../mc/mc0/ + | + |->csrow0 + |->csrow2 + |->csrow3 + .... + +Notice that there is no csrow1, which indicates that csrow0 is composed +of a single ranked DIMMs. This should also apply in both Channels, in +order to have dual-channel mode be operational. Since both csrow2 and +csrow3 are populated, this indicates a dual ranked set of DIMMs for +channels 0 and 1. + +Within each of the ``mcX`` and ``csrowX`` directories are several EDAC +control and attribute files. + +``mcX`` directories +------------------- + +In ``mcX`` directories are EDAC control and attribute files for +this ``X`` instance of the memory controllers. + +For a description of the sysfs API, please see: + + Documentation/ABI/testing/sysfs-devices-edac + + +``dimmX`` or ``rankX`` directories +---------------------------------- + +The recommended way to use the EDAC subsystem is to look at the information +provided by the ``dimmX`` or ``rankX`` directories [#f5]_. + +A typical EDAC system has the following structure under +``/sys/devices/system/edac/``\ [#f6]_:: + + /sys/devices/system/edac/ + ├── mc + │   ├── mc0 + │   │   ├── ce_count + │   │   ├── ce_noinfo_count + │   │   ├── dimm0 + │   │   │   ├── dimm_ce_count + │   │   │   ├── dimm_dev_type + │   │   │   ├── dimm_edac_mode + │   │   │   ├── dimm_label + │   │   │   ├── dimm_location + │   │   │   ├── dimm_mem_type + │   │   │   ├── dimm_ue_count + │   │   │   ├── size + │   │   │   └── uevent + │   │   ├── max_location + │   │   ├── mc_name + │   │   ├── reset_counters + │   │   ├── seconds_since_reset + │   │   ├── size_mb + │   │   ├── ue_count + │   │   ├── ue_noinfo_count + │   │   └── uevent + │   ├── mc1 + │   │   ├── ce_count + │   │   ├── ce_noinfo_count + │   │   ├── dimm0 + │   │   │   ├── dimm_ce_count + │   │   │   ├── dimm_dev_type + │   │   │   ├── dimm_edac_mode + │   │   │   ├── dimm_label + │   │   │   ├── dimm_location + │   │   │   ├── dimm_mem_type + │   │   │   ├── dimm_ue_count + │   │   │   ├── size + │   │   │   └── uevent + │   │   ├── max_location + │   │   ├── mc_name + │   │   ├── reset_counters + │   │   ├── seconds_since_reset + │   │   ├── size_mb + │   │   ├── ue_count + │   │   ├── ue_noinfo_count + │   │   └── uevent + │   └── uevent + └── uevent + +In the ``dimmX`` directories are EDAC control and attribute files for +this ``X`` memory module: + +- ``size`` - Total memory managed by this csrow attribute file + + This attribute file displays, in count of megabytes, the memory + that this csrow contains. + +- ``dimm_ue_count`` - Uncorrectable Errors count attribute file + + This attribute file displays the total count of uncorrectable + errors that have occurred on this DIMM. If panic_on_ue is set + this counter will not have a chance to increment, since EDAC + will panic the system. + +- ``dimm_ce_count`` - Correctable Errors count attribute file + + This attribute file displays the total count of correctable + errors that have occurred on this DIMM. This count is very + important to examine. CEs provide early indications that a + DIMM is beginning to fail. This count field should be + monitored for non-zero values and report such information + to the system administrator. + +- ``dimm_dev_type`` - Device type attribute file + + This attribute file will display what type of DRAM device is + being utilized on this DIMM. + Examples: + + - x1 + - x2 + - x4 + - x8 + +- ``dimm_edac_mode`` - EDAC Mode of operation attribute file + + This attribute file will display what type of Error detection + and correction is being utilized. + +- ``dimm_label`` - memory module label control file + + This control file allows this DIMM to have a label assigned + to it. With this label in the module, when errors occur + the output can provide the DIMM label in the system log. + This becomes vital for panic events to isolate the + cause of the UE event. + + DIMM Labels must be assigned after booting, with information + that correctly identifies the physical slot with its + silk screen label. This information is currently very + motherboard specific and determination of this information + must occur in userland at this time. + +- ``dimm_location`` - location of the memory module + + The location can have up to 3 levels, and describe how the + memory controller identifies the location of a memory module. + Depending on the type of memory and memory controller, it + can be: + + - *csrow* and *channel* - used when the memory controller + doesn't identify a single DIMM - e. g. in ``rankX`` dir; + - *branch*, *channel*, *slot* - typically used on FB-DIMM memory + controllers; + - *channel*, *slot* - used on Nehalem and newer Intel drivers. + +- ``dimm_mem_type`` - Memory Type attribute file + + This attribute file will display what type of memory is currently + on this csrow. Normally, either buffered or unbuffered memory. + Examples: + + - Registered-DDR + - Unbuffered-DDR + +.. [#f5] On some systems, the memory controller doesn't have any logic + to identify the memory module. On such systems, the directory is called ``rankX`` and works on a similar way as the ``csrowX`` directories. + On modern Intel memory controllers, the memory controller identifies the + memory modules directly. On such systems, the directory is called ``dimmX``. + +.. [#f6] There are also some ``power`` directories and ``subsystem`` + symlinks inside the sysfs mapping that are automatically created by + the sysfs subsystem. Currently, they serve no purpose. + +``csrowX`` directories +---------------------- + +When CONFIG_EDAC_LEGACY_SYSFS is enabled, sysfs will contain the ``csrowX`` +directories. As this API doesn't work properly for Rambus, FB-DIMMs and +modern Intel Memory Controllers, this is being deprecated in favor of +``dimmX`` directories. + +In the ``csrowX`` directories are EDAC control and attribute files for +this ``X`` instance of csrow: + + +- ``ue_count`` - Total Uncorrectable Errors count attribute file + + This attribute file displays the total count of uncorrectable + errors that have occurred on this csrow. If panic_on_ue is set + this counter will not have a chance to increment, since EDAC + will panic the system. + + +- ``ce_count`` - Total Correctable Errors count attribute file + + This attribute file displays the total count of correctable + errors that have occurred on this csrow. This count is very + important to examine. CEs provide early indications that a + DIMM is beginning to fail. This count field should be + monitored for non-zero values and report such information + to the system administrator. + + +- ``size_mb`` - Total memory managed by this csrow attribute file + + This attribute file displays, in count of megabytes, the memory + that this csrow contains. + + +- ``mem_type`` - Memory Type attribute file + + This attribute file will display what type of memory is currently + on this csrow. Normally, either buffered or unbuffered memory. + Examples: + + - Registered-DDR + - Unbuffered-DDR + + +- ``edac_mode`` - EDAC Mode of operation attribute file + + This attribute file will display what type of Error detection + and correction is being utilized. + + +- ``dev_type`` - Device type attribute file + + This attribute file will display what type of DRAM device is + being utilized on this DIMM. + Examples: + + - x1 + - x2 + - x4 + - x8 + + +- ``ch0_ce_count`` - Channel 0 CE Count attribute file + + This attribute file will display the count of CEs on this + DIMM located in channel 0. + + +- ``ch0_ue_count`` - Channel 0 UE Count attribute file + + This attribute file will display the count of UEs on this + DIMM located in channel 0. + + +- ``ch0_dimm_label`` - Channel 0 DIMM Label control file + + + This control file allows this DIMM to have a label assigned + to it. With this label in the module, when errors occur + the output can provide the DIMM label in the system log. + This becomes vital for panic events to isolate the + cause of the UE event. + + DIMM Labels must be assigned after booting, with information + that correctly identifies the physical slot with its + silk screen label. This information is currently very + motherboard specific and determination of this information + must occur in userland at this time. + + +- ``ch1_ce_count`` - Channel 1 CE Count attribute file + + + This attribute file will display the count of CEs on this + DIMM located in channel 1. + + +- ``ch1_ue_count`` - Channel 1 UE Count attribute file + + + This attribute file will display the count of UEs on this + DIMM located in channel 0. + + +- ``ch1_dimm_label`` - Channel 1 DIMM Label control file + + This control file allows this DIMM to have a label assigned + to it. With this label in the module, when errors occur + the output can provide the DIMM label in the system log. + This becomes vital for panic events to isolate the + cause of the UE event. + + DIMM Labels must be assigned after booting, with information + that correctly identifies the physical slot with its + silk screen label. This information is currently very + motherboard specific and determination of this information + must occur in userland at this time. + + +System Logging +-------------- + +If logging for UEs and CEs is enabled, then system logs will contain +information indicating that errors have been detected:: + + EDAC MC0: CE page 0x283, offset 0xce0, grain 8, syndrome 0x6ec3, row 0, channel 1 "DIMM_B1": amd76x_edac + EDAC MC0: CE page 0x1e5, offset 0xfb0, grain 8, syndrome 0xb741, row 0, channel 1 "DIMM_B1": amd76x_edac + + +The structure of the message is: + + +---------------------------------------+-------------+ + | Content | Example | + +=======================================+=============+ + | The memory controller | MC0 | + +---------------------------------------+-------------+ + | Error type | CE | + +---------------------------------------+-------------+ + | Memory page | 0x283 | + +---------------------------------------+-------------+ + | Offset in the page | 0xce0 | + +---------------------------------------+-------------+ + | The byte granularity | grain 8 | + | or resolution of the error | | + +---------------------------------------+-------------+ + | The error syndrome | 0xb741 | + +---------------------------------------+-------------+ + | Memory row | row 0 | + +---------------------------------------+-------------+ + | Memory channel | channel 1 | + +---------------------------------------+-------------+ + | DIMM label, if set prior | DIMM B1 | + +---------------------------------------+-------------+ + | And then an optional, driver-specific | | + | message that may have additional | | + | information. | | + +---------------------------------------+-------------+ + +Both UEs and CEs with no info will lack all but memory controller, error +type, a notice of "no info" and then an optional, driver-specific error +message. + + +PCI Bus Parity Detection +------------------------ + +On Header Type 00 devices, the primary status is looked at for any +parity error regardless of whether parity is enabled on the device or +not. (The spec indicates parity is generated in some cases). On Header +Type 01 bridges, the secondary status register is also looked at to see +if parity occurred on the bus on the other side of the bridge. + + +Sysfs configuration +------------------- + +Under ``/sys/devices/system/edac/pci`` are control and attribute files as +follows: + + +- ``check_pci_parity`` - Enable/Disable PCI Parity checking control file + + This control file enables or disables the PCI Bus Parity scanning + operation. Writing a 1 to this file enables the scanning. Writing + a 0 to this file disables the scanning. + + Enable:: + + echo "1" >/sys/devices/system/edac/pci/check_pci_parity + + Disable:: + + echo "0" >/sys/devices/system/edac/pci/check_pci_parity + + +- ``pci_parity_count`` - Parity Count + + This attribute file will display the number of parity errors that + have been detected. + + +Module parameters +----------------- + +- ``edac_mc_panic_on_ue`` - Panic on UE control file + + An uncorrectable error will cause a machine panic. This is usually + desirable. It is a bad idea to continue when an uncorrectable error + occurs - it is indeterminate what was uncorrected and the operating + system context might be so mangled that continuing will lead to further + corruption. If the kernel has MCE configured, then EDAC will never + notice the UE. + + LOAD TIME:: + + module/kernel parameter: edac_mc_panic_on_ue=[0|1] + + RUN TIME:: + + echo "1" > /sys/module/edac_core/parameters/edac_mc_panic_on_ue + + +- ``edac_mc_log_ue`` - Log UE control file + + + Generate kernel messages describing uncorrectable errors. These errors + are reported through the system message log system. UE statistics + will be accumulated even when UE logging is disabled. + + LOAD TIME:: + + module/kernel parameter: edac_mc_log_ue=[0|1] + + RUN TIME:: + + echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ue + + +- ``edac_mc_log_ce`` - Log CE control file + + + Generate kernel messages describing correctable errors. These + errors are reported through the system message log system. + CE statistics will be accumulated even when CE logging is disabled. + + LOAD TIME:: + + module/kernel parameter: edac_mc_log_ce=[0|1] + + RUN TIME:: + + echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ce + + +- ``edac_mc_poll_msec`` - Polling period control file + + + The time period, in milliseconds, for polling for error information. + Too small a value wastes resources. Too large a value might delay + necessary handling of errors and might loose valuable information for + locating the error. 1000 milliseconds (once each second) is the current + default. Systems which require all the bandwidth they can get, may + increase this. + + LOAD TIME:: + + module/kernel parameter: edac_mc_poll_msec=[0|1] + + RUN TIME:: + + echo "1000" > /sys/module/edac_core/parameters/edac_mc_poll_msec + + +- ``panic_on_pci_parity`` - Panic on PCI PARITY Error + + + This control file enables or disables panicking when a parity + error has been detected. + + + module/kernel parameter:: + + edac_panic_on_pci_pe=[0|1] + + Enable:: + + echo "1" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe + + Disable:: + + echo "0" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe + + + +EDAC device type +---------------- + +In the header file, edac_pci.h, there is a series of edac_device structures +and APIs for the EDAC_DEVICE. + +User space access to an edac_device is through the sysfs interface. + +At the location ``/sys/devices/system/edac`` (sysfs) new edac_device devices +will appear. + +There is a three level tree beneath the above ``edac`` directory. For example, +the ``test_device_edac`` device (found at the http://bluesmoke.sourceforget.net +website) installs itself as:: + + /sys/devices/system/edac/test-instance + +in this directory are various controls, a symlink and one or more ``instance`` +directories. + +The standard default controls are: + + ============== ======================================================= + log_ce boolean to log CE events + log_ue boolean to log UE events + panic_on_ue boolean to ``panic`` the system if an UE is encountered + (default off, can be set true via startup script) + poll_msec time period between POLL cycles for events + ============== ======================================================= + +The test_device_edac device adds at least one of its own custom control: + + ============== ================================================== + test_bits which in the current test driver does nothing but + show how it is installed. A ported driver can + add one or more such controls and/or attributes + for specific uses. + One out-of-tree driver uses controls here to allow + for ERROR INJECTION operations to hardware + injection registers + ============== ================================================== + +The symlink points to the 'struct dev' that is registered for this edac_device. + +Instances +--------- + +One or more instance directories are present. For the ``test_device_edac`` +case: + + +----------------+ + | test-instance0 | + +----------------+ + + +In this directory there are two default counter attributes, which are totals of +counter in deeper subdirectories. + + ============== ==================================== + ce_count total of CE events of subdirectories + ue_count total of UE events of subdirectories + ============== ==================================== + +Blocks +------ + +At the lowest directory level is the ``block`` directory. There can be 0, 1 +or more blocks specified in each instance: + + +-------------+ + | test-block0 | + +-------------+ + +In this directory the default attributes are: + + ============== ================================================ + ce_count which is counter of CE events for this ``block`` + of hardware being monitored + ue_count which is counter of UE events for this ``block`` + of hardware being monitored + ============== ================================================ + + +The ``test_device_edac`` device adds 4 attributes and 1 control: + + ================== ==================================================== + test-block-bits-0 for every POLL cycle this counter + is incremented + test-block-bits-1 every 10 cycles, this counter is bumped once, + and test-block-bits-0 is set to 0 + test-block-bits-2 every 100 cycles, this counter is bumped once, + and test-block-bits-1 is set to 0 + test-block-bits-3 every 1000 cycles, this counter is bumped once, + and test-block-bits-2 is set to 0 + ================== ==================================================== + + + ================== ==================================================== + reset-counters writing ANY thing to this control will + reset all the above counters. + ================== ==================================================== + + +Use of the ``test_device_edac`` driver should enable any others to create their own +unique drivers for their hardware systems. + +The ``test_device_edac`` sample driver is located at the +http://bluesmoke.sourceforge.net project site for EDAC. + + +Usage of EDAC APIs on Nehalem and newer Intel CPUs +-------------------------------------------------- + +On older Intel architectures, the memory controller was part of the North +Bridge chipset. Nehalem, Sandy Bridge, Ivy Bridge, Haswell, Sky Lake and +newer Intel architectures integrated an enhanced version of the memory +controller (MC) inside the CPUs. + +This chapter will cover the differences of the enhanced memory controllers +found on newer Intel CPUs, such as ``i7core_edac``, ``sb_edac`` and +``sbx_edac`` drivers. + +.. note:: + + The Xeon E7 processor families use a separate chip for the memory + controller, called Intel Scalable Memory Buffer. This section doesn't + apply for such families. + +1) There is one Memory Controller per Quick Patch Interconnect + (QPI). At the driver, the term "socket" means one QPI. This is + associated with a physical CPU socket. + + Each MC have 3 physical read channels, 3 physical write channels and + 3 logic channels. The driver currently sees it as just 3 channels. + Each channel can have up to 3 DIMMs. + + The minimum known unity is DIMMs. There are no information about csrows. + As EDAC API maps the minimum unity is csrows, the driver sequentially + maps channel/DIMM into different csrows. + + For example, supposing the following layout:: + + Ch0 phy rd0, wr0 (0x063f4031): 2 ranks, UDIMMs + dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 + dimm 1 1024 Mb offset: 4, bank: 8, rank: 1, row: 0x4000, col: 0x400 + Ch1 phy rd1, wr1 (0x063f4031): 2 ranks, UDIMMs + dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 + Ch2 phy rd3, wr3 (0x063f4031): 2 ranks, UDIMMs + dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 + + The driver will map it as:: + + csrow0: channel 0, dimm0 + csrow1: channel 0, dimm1 + csrow2: channel 1, dimm0 + csrow3: channel 2, dimm0 + + exports one DIMM per csrow. + + Each QPI is exported as a different memory controller. + +2) The MC has the ability to inject errors to test drivers. The drivers + implement this functionality via some error injection nodes: + + For injecting a memory error, there are some sysfs nodes, under + ``/sys/devices/system/edac/mc/mc?/``: + + - ``inject_addrmatch/*``: + Controls the error injection mask register. It is possible to specify + several characteristics of the address to match an error code:: + + dimm = the affected dimm. Numbers are relative to a channel; + rank = the memory rank; + channel = the channel that will generate an error; + bank = the affected bank; + page = the page address; + column (or col) = the address column. + + each of the above values can be set to "any" to match any valid value. + + At driver init, all values are set to any. + + For example, to generate an error at rank 1 of dimm 2, for any channel, + any bank, any page, any column:: + + echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm + echo 1 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank + + To return to the default behaviour of matching any, you can do:: + + echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm + echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank + + - ``inject_eccmask``: + specifies what bits will have troubles, + + - ``inject_section``: + specifies what ECC cache section will get the error:: + + 3 for both + 2 for the highest + 1 for the lowest + + - ``inject_type``: + specifies the type of error, being a combination of the following bits:: + + bit 0 - repeat + bit 1 - ecc + bit 2 - parity + + - ``inject_enable``: + starts the error generation when something different than 0 is written. + + All inject vars can be read. root permission is needed for write. + + Datasheet states that the error will only be generated after a write on an + address that matches inject_addrmatch. It seems, however, that reading will + also produce an error. + + For example, the following code will generate an error for any write access + at socket 0, on any DIMM/address on channel 2:: + + echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/channel + echo 2 >/sys/devices/system/edac/mc/mc0/inject_type + echo 64 >/sys/devices/system/edac/mc/mc0/inject_eccmask + echo 3 >/sys/devices/system/edac/mc/mc0/inject_section + echo 1 >/sys/devices/system/edac/mc/mc0/inject_enable + dd if=/dev/mem of=/dev/null seek=16k bs=4k count=1 >& /dev/null + + For socket 1, it is needed to replace "mc0" by "mc1" at the above + commands. + + The generated error message will look like:: + + EDAC MC0: UE row 0, channel-a= 0 channel-b= 0 labels "-": NON_FATAL (addr = 0x0075b980, socket=0, Dimm=0, Channel=2, syndrome=0x00000040, count=1, Err=8c0000400001009f:4000080482 (read error: read ECC error)) + +3) Corrected Error memory register counters + + Those newer MCs have some registers to count memory errors. The driver + uses those registers to report Corrected Errors on devices with Registered + DIMMs. + + However, those counters don't work with Unregistered DIMM. As the chipset + offers some counters that also work with UDIMMs (but with a worse level of + granularity than the default ones), the driver exposes those registers for + UDIMM memories. + + They can be read by looking at the contents of ``all_channel_counts/``:: + + $ for i in /sys/devices/system/edac/mc/mc0/all_channel_counts/*; do echo $i; cat $i; done + /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm0 + 0 + /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm1 + 0 + /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm2 + 0 + + What happens here is that errors on different csrows, but at the same + dimm number will increment the same counter. + So, in this memory mapping:: + + csrow0: channel 0, dimm0 + csrow1: channel 0, dimm1 + csrow2: channel 1, dimm0 + csrow3: channel 2, dimm0 + + The hardware will increment udimm0 for an error at the first dimm at either + csrow0, csrow2 or csrow3; + + The hardware will increment udimm1 for an error at the second dimm at either + csrow0, csrow2 or csrow3; + + The hardware will increment udimm2 for an error at the third dimm at either + csrow0, csrow2 or csrow3; + +4) Standard error counters + + The standard error counters are generated when an mcelog error is received + by the driver. Since, with UDIMM, this is counted by software, it is + possible that some errors could be lost. With RDIMM's, they display the + contents of the registers + +Reference documents used on ``amd64_edac`` +------------------------------------------ + +``amd64_edac`` module is based on the following documents +(available from http://support.amd.com/en-us/search/tech-docs): + +1. :Title: BIOS and Kernel Developer's Guide for AMD Athlon 64 and AMD + Opteron Processors + :AMD publication #: 26094 + :Revision: 3.26 + :Link: http://support.amd.com/TechDocs/26094.PDF + +2. :Title: BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh + Processors + :AMD publication #: 32559 + :Revision: 3.00 + :Issue Date: May 2006 + :Link: http://support.amd.com/TechDocs/32559.pdf + +3. :Title: BIOS and Kernel Developer's Guide (BKDG) For AMD Family 10h + Processors + :AMD publication #: 31116 + :Revision: 3.00 + :Issue Date: September 07, 2007 + :Link: http://support.amd.com/TechDocs/31116.pdf + +4. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h + Models 30h-3Fh Processors + :AMD publication #: 49125 + :Revision: 3.06 + :Issue Date: 2/12/2015 (latest release) + :Link: http://support.amd.com/TechDocs/49125_15h_Models_30h-3Fh_BKDG.pdf + +5. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h + Models 60h-6Fh Processors + :AMD publication #: 50742 + :Revision: 3.01 + :Issue Date: 7/23/2015 (latest release) + :Link: http://support.amd.com/TechDocs/50742_15h_Models_60h-6Fh_BKDG.pdf + +6. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 16h + Models 00h-0Fh Processors + :AMD publication #: 48751 + :Revision: 3.03 + :Issue Date: 2/23/2015 (latest release) + :Link: http://support.amd.com/TechDocs/48751_16h_bkdg.pdf + +Credits +======= + +* Written by Doug Thompson + + - 7 Dec 2005 + - 17 Jul 2007 Updated + +* |copy| Mauro Carvalho Chehab + + - 05 Aug 2009 Nehalem interface + - 26 Oct 2016 Converted to ReST and cleanups at the Nehalem section + +* EDAC authors/maintainers: + + - Doug Thompson, Dave Jiang, Dave Peterson et al, + - Mauro Carvalho Chehab + - Borislav Petkov + - original author: Thayne Harbaugh diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index fb40a1f6f79e..dfc06fab9432 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -122,7 +122,7 @@ configure specific aspects of kernel behavior to your liking. pmf pnp rapidio - ras + RAS/index rtc serial-console svga diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst deleted file mode 100644 index 8e03751d126d..000000000000 --- a/Documentation/admin-guide/ras.rst +++ /dev/null @@ -1,1219 +0,0 @@ -.. include:: - -============================================ -Reliability, Availability and Serviceability -============================================ - -RAS concepts -************ - -Reliability, Availability and Serviceability (RAS) is a concept used on -servers meant to measure their robustness. - -Reliability - is the probability that a system will produce correct outputs. - - * Generally measured as Mean Time Between Failures (MTBF) - * Enhanced by features that help to avoid, detect and repair hardware faults - -Availability - is the probability that a system is operational at a given time - - * Generally measured as a percentage of downtime per a period of time - * Often uses mechanisms to detect and correct hardware faults in - runtime; - -Serviceability (or maintainability) - is the simplicity and speed with which a system can be repaired or - maintained - - * Generally measured on Mean Time Between Repair (MTBR) - -Improving RAS -------------- - -In order to reduce systems downtime, a system should be capable of detecting -hardware errors, and, when possible correcting them in runtime. It should -also provide mechanisms to detect hardware degradation, in order to warn -the system administrator to take the action of replacing a component before -it causes data loss or system downtime. - -Among the monitoring measures, the most usual ones include: - -* CPU – detect errors at instruction execution and at L1/L2/L3 caches; -* Memory – add error correction logic (ECC) to detect and correct errors; -* I/O – add CRC checksums for transferred data; -* Storage – RAID, journal file systems, checksums, - Self-Monitoring, Analysis and Reporting Technology (SMART). - -By monitoring the number of occurrences of error detections, it is possible -to identify if the probability of hardware errors is increasing, and, on such -case, do a preventive maintenance to replace a degraded component while -those errors are correctable. - -Types of errors ---------------- - -Most mechanisms used on modern systems use technologies like Hamming -Codes that allow error correction when the number of errors on a bit packet -is below a threshold. If the number of errors is above, those mechanisms -can indicate with a high degree of confidence that an error happened, but -they can't correct. - -Also, sometimes an error occur on a component that it is not used. For -example, a part of the memory that it is not currently allocated. - -That defines some categories of errors: - -* **Correctable Error (CE)** - the error detection mechanism detected and - corrected the error. Such errors are usually not fatal, although some - Kernel mechanisms allow the system administrator to consider them as fatal. - -* **Uncorrected Error (UE)** - the amount of errors happened above the error - correction threshold, and the system was unable to auto-correct. - -* **Fatal Error** - when an UE error happens on a critical component of the - system (for example, a piece of the Kernel got corrupted by an UE), the - only reliable way to avoid data corruption is to hang or reboot the machine. - -* **Non-fatal Error** - when an UE error happens on an unused component, - like a CPU in power down state or an unused memory bank, the system may - still run, eventually replacing the affected hardware by a hot spare, - if available. - - Also, when an error happens on a userspace process, it is also possible to - kill such process and let userspace restart it. - -The mechanism for handling non-fatal errors is usually complex and may -require the help of some userspace application, in order to apply the -policy desired by the system administrator. - -Identifying a bad hardware component ------------------------------------- - -Just detecting a hardware flaw is usually not enough, as the system needs -to pinpoint to the minimal replaceable unit (MRU) that should be exchanged -to make the hardware reliable again. - -So, it requires not only error logging facilities, but also mechanisms that -will translate the error message to the silkscreen or component label for -the MRU. - -Typically, it is very complex for memory, as modern CPUs interlace memory -from different memory modules, in order to provide a better performance. The -DMI BIOS usually have a list of memory module labels, with can be obtained -using the ``dmidecode`` tool. For example, on a desktop machine, it shows:: - - Memory Device - Total Width: 64 bits - Data Width: 64 bits - Size: 16384 MB - Form Factor: SODIMM - Set: None - Locator: ChannelA-DIMM0 - Bank Locator: BANK 0 - Type: DDR4 - Type Detail: Synchronous - Speed: 2133 MHz - Rank: 2 - Configured Clock Speed: 2133 MHz - -On the above example, a DDR4 SO-DIMM memory module is located at the -system's memory labeled as "BANK 0", as given by the *bank locator* field. -Please notice that, on such system, the *total width* is equal to the -*data width*. It means that such memory module doesn't have error -detection/correction mechanisms. - -Unfortunately, not all systems use the same field to specify the memory -bank. On this example, from an older server, ``dmidecode`` shows:: - - Memory Device - Array Handle: 0x1000 - Error Information Handle: Not Provided - Total Width: 72 bits - Data Width: 64 bits - Size: 8192 MB - Form Factor: DIMM - Set: 1 - Locator: DIMM_A1 - Bank Locator: Not Specified - Type: DDR3 - Type Detail: Synchronous Registered (Buffered) - Speed: 1600 MHz - Rank: 2 - Configured Clock Speed: 1600 MHz - -There, the DDR3 RDIMM memory module is located at the system's memory labeled -as "DIMM_A1", as given by the *locator* field. Please notice that this -memory module has 64 bits of *data width* and 72 bits of *total width*. So, -it has 8 extra bits to be used by error detection and correction mechanisms. -Such kind of memory is called Error-correcting code memory (ECC memory). - -To make things even worse, it is not uncommon that systems with different -labels on their system's board to use exactly the same BIOS, meaning that -the labels provided by the BIOS won't match the real ones. - -ECC memory ----------- - -As mentioned in the previous section, ECC memory has extra bits to be -used for error correction. In the above example, a memory module has -64 bits of *data width*, and 72 bits of *total width*. The extra 8 -bits which are used for the error detection and correction mechanisms -are referred to as the *syndrome*\ [#f1]_\ [#f2]_. - -So, when the cpu requests the memory controller to write a word with -*data width*, the memory controller calculates the *syndrome* in real time, -using Hamming code, or some other error correction code, like SECDED+, -producing a code with *total width* size. Such code is then written -on the memory modules. - -At read, the *total width* bits code is converted back, using the same -ECC code used on write, producing a word with *data width* and a *syndrome*. -The word with *data width* is sent to the CPU, even when errors happen. - -The memory controller also looks at the *syndrome* in order to check if -there was an error, and if the ECC code was able to fix such error. -If the error was corrected, a Corrected Error (CE) happened. If not, an -Uncorrected Error (UE) happened. - -The information about the CE/UE errors is stored on some special registers -at the memory controller and can be accessed by reading such registers, -either by BIOS, by some special CPUs or by Linux EDAC driver. On x86 64 -bit CPUs, such errors can also be retrieved via the Machine Check -Architecture (MCA)\ [#f3]_. - -.. [#f1] Please notice that several memory controllers allow operation on a - mode called "Lock-Step", where it groups two memory modules together, - doing 128-bit reads/writes. That gives 16 bits for error correction, with - significantly improves the error correction mechanism, at the expense - that, when an error happens, there's no way to know what memory module is - to blame. So, it has to blame both memory modules. - -.. [#f2] Some memory controllers also allow using memory in mirror mode. - On such mode, the same data is written to two memory modules. At read, - the system checks both memory modules, in order to check if both provide - identical data. On such configuration, when an error happens, there's no - way to know what memory module is to blame. So, it has to blame both - memory modules (or 4 memory modules, if the system is also on Lock-step - mode). - -.. [#f3] For more details about the Machine Check Architecture (MCA), - please read Documentation/arch/x86/x86_64/machinecheck.rst at the Kernel tree. - -EDAC - Error Detection And Correction -************************************* - -.. note:: - - "bluesmoke" was the name for this device driver subsystem when it - was "out-of-tree" and maintained at http://bluesmoke.sourceforge.net. - That site is mostly archaic now and can be used only for historical - purposes. - - When the subsystem was pushed upstream for the first time, on - Kernel 2.6.16, it was renamed to ``EDAC``. - -Purpose -------- - -The ``edac`` kernel module's goal is to detect and report hardware errors -that occur within the computer system running under linux. - -Memory ------- - -Memory Correctable Errors (CE) and Uncorrectable Errors (UE) are the -primary errors being harvested. These types of errors are harvested by -the ``edac_mc`` device. - -Detecting CE events, then harvesting those events and reporting them, -**can** but must not necessarily be a predictor of future UE events. With -CE events only, the system can and will continue to operate as no data -has been damaged yet. - -However, preventive maintenance and proactive part replacement of memory -modules exhibiting CEs can reduce the likelihood of the dreaded UE events -and system panics. - -Other hardware elements ------------------------ - -A new feature for EDAC, the ``edac_device`` class of device, was added in -the 2.6.23 version of the kernel. - -This new device type allows for non-memory type of ECC hardware detectors -to have their states harvested and presented to userspace via the sysfs -interface. - -Some architectures have ECC detectors for L1, L2 and L3 caches, -along with DMA engines, fabric switches, main data path switches, -interconnections, and various other hardware data paths. If the hardware -reports it, then a edac_device device probably can be constructed to -harvest and present that to userspace. - - -PCI bus scanning ----------------- - -In addition, PCI devices are scanned for PCI Bus Parity and SERR Errors -in order to determine if errors are occurring during data transfers. - -The presence of PCI Parity errors must be examined with a grain of salt. -There are several add-in adapters that do **not** follow the PCI specification -with regards to Parity generation and reporting. The specification says -the vendor should tie the parity status bits to 0 if they do not intend -to generate parity. Some vendors do not do this, and thus the parity bit -can "float" giving false positives. - -There is a PCI device attribute located in sysfs that is checked by -the EDAC PCI scanning code. If that attribute is set, PCI parity/error -scanning is skipped for that device. The attribute is:: - - broken_parity_status - -and is located in ``/sys/devices/pci/0000:XX:YY.Z`` directories for -PCI devices. - - -Versioning ----------- - -EDAC is composed of a "core" module (``edac_core.ko``) and several Memory -Controller (MC) driver modules. On a given system, the CORE is loaded -and one MC driver will be loaded. Both the CORE and the MC driver (or -``edac_device`` driver) have individual versions that reflect current -release level of their respective modules. - -Thus, to "report" on what version a system is running, one must report -both the CORE's and the MC driver's versions. - - -Loading -------- - -If ``edac`` was statically linked with the kernel then no loading -is necessary. If ``edac`` was built as modules then simply modprobe -the ``edac`` pieces that you need. You should be able to modprobe -hardware-specific modules and have the dependencies load the necessary -core modules. - -Example:: - - $ modprobe amd76x_edac - -loads both the ``amd76x_edac.ko`` memory controller module and the -``edac_mc.ko`` core module. - - -Sysfs interface ---------------- - -EDAC presents a ``sysfs`` interface for control and reporting purposes. It -lives in the /sys/devices/system/edac directory. - -Within this directory there currently reside 2 components: - - ======= ============================== - mc memory controller(s) system - pci PCI control and status system - ======= ============================== - - - -Memory Controller (mc) Model ----------------------------- - -Each ``mc`` device controls a set of memory modules [#f4]_. These modules -are laid out in a Chip-Select Row (``csrowX``) and Channel table (``chX``). -There can be multiple csrows and multiple channels. - -.. [#f4] Nowadays, the term DIMM (Dual In-line Memory Module) is widely - used to refer to a memory module, although there are other memory - packaging alternatives, like SO-DIMM, SIMM, etc. The UEFI - specification (Version 2.7) defines a memory module in the Common - Platform Error Record (CPER) section to be an SMBIOS Memory Device - (Type 17). Along this document, and inside the EDAC subsystem, the term - "dimm" is used for all memory modules, even when they use a - different kind of packaging. - -Memory controllers allow for several csrows, with 8 csrows being a -typical value. Yet, the actual number of csrows depends on the layout of -a given motherboard, memory controller and memory module characteristics. - -Dual channels allow for dual data length (e. g. 128 bits, on 64 bit systems) -data transfers to/from the CPU from/to memory. Some newer chipsets allow -for more than 2 channels, like Fully Buffered DIMMs (FB-DIMMs) memory -controllers. The following example will assume 2 channels: - - +------------+-----------------------+ - | CS Rows | Channels | - +------------+-----------+-----------+ - | | ``ch0`` | ``ch1`` | - +============+===========+===========+ - | |**DIMM_A0**|**DIMM_B0**| - +------------+-----------+-----------+ - | ``csrow0`` | rank0 | rank0 | - +------------+-----------+-----------+ - | ``csrow1`` | rank1 | rank1 | - +------------+-----------+-----------+ - | |**DIMM_A1**|**DIMM_B1**| - +------------+-----------+-----------+ - | ``csrow2`` | rank0 | rank0 | - +------------+-----------+-----------+ - | ``csrow3`` | rank1 | rank1 | - +------------+-----------+-----------+ - -In the above example, there are 4 physical slots on the motherboard -for memory DIMMs: - - +---------+---------+ - | DIMM_A0 | DIMM_B0 | - +---------+---------+ - | DIMM_A1 | DIMM_B1 | - +---------+---------+ - -Labels for these slots are usually silk-screened on the motherboard. -Slots labeled ``A`` are channel 0 in this example. Slots labeled ``B`` are -channel 1. Notice that there are two csrows possible on a physical DIMM. -These csrows are allocated their csrow assignment based on the slot into -which the memory DIMM is placed. Thus, when 1 DIMM is placed in each -Channel, the csrows cross both DIMMs. - -Memory DIMMs come single or dual "ranked". A rank is a populated csrow. -In the example above 2 dual ranked DIMMs are similarly placed. Thus, -both csrow0 and csrow1 are populated. On the other hand, when 2 single -ranked DIMMs are placed in slots DIMM_A0 and DIMM_B0, then they will -have just one csrow (csrow0) and csrow1 will be empty. The pattern -repeats itself for csrow2 and csrow3. Also note that some memory -controllers don't have any logic to identify the memory module, see -``rankX`` directories below. - -The representation of the above is reflected in the directory -tree in EDAC's sysfs interface. Starting in directory -``/sys/devices/system/edac/mc``, each memory controller will be -represented by its own ``mcX`` directory, where ``X`` is the -index of the MC:: - - ..../edac/mc/ - | - |->mc0 - |->mc1 - |->mc2 - .... - -Under each ``mcX`` directory each ``csrowX`` is again represented by a -``csrowX``, where ``X`` is the csrow index:: - - .../mc/mc0/ - | - |->csrow0 - |->csrow2 - |->csrow3 - .... - -Notice that there is no csrow1, which indicates that csrow0 is composed -of a single ranked DIMMs. This should also apply in both Channels, in -order to have dual-channel mode be operational. Since both csrow2 and -csrow3 are populated, this indicates a dual ranked set of DIMMs for -channels 0 and 1. - -Within each of the ``mcX`` and ``csrowX`` directories are several EDAC -control and attribute files. - -``mcX`` directories -------------------- - -In ``mcX`` directories are EDAC control and attribute files for -this ``X`` instance of the memory controllers. - -For a description of the sysfs API, please see: - - Documentation/ABI/testing/sysfs-devices-edac - - -``dimmX`` or ``rankX`` directories ----------------------------------- - -The recommended way to use the EDAC subsystem is to look at the information -provided by the ``dimmX`` or ``rankX`` directories [#f5]_. - -A typical EDAC system has the following structure under -``/sys/devices/system/edac/``\ [#f6]_:: - - /sys/devices/system/edac/ - ├── mc - │   ├── mc0 - │   │   ├── ce_count - │   │   ├── ce_noinfo_count - │   │   ├── dimm0 - │   │   │   ├── dimm_ce_count - │   │   │   ├── dimm_dev_type - │   │   │   ├── dimm_edac_mode - │   │   │   ├── dimm_label - │   │   │   ├── dimm_location - │   │   │   ├── dimm_mem_type - │   │   │   ├── dimm_ue_count - │   │   │   ├── size - │   │   │   └── uevent - │   │   ├── max_location - │   │   ├── mc_name - │   │   ├── reset_counters - │   │   ├── seconds_since_reset - │   │   ├── size_mb - │   │   ├── ue_count - │   │   ├── ue_noinfo_count - │   │   └── uevent - │   ├── mc1 - │   │   ├── ce_count - │   │   ├── ce_noinfo_count - │   │   ├── dimm0 - │   │   │   ├── dimm_ce_count - │   │   │   ├── dimm_dev_type - │   │   │   ├── dimm_edac_mode - │   │   │   ├── dimm_label - │   │   │   ├── dimm_location - │   │   │   ├── dimm_mem_type - │   │   │   ├── dimm_ue_count - │   │   │   ├── size - │   │   │   └── uevent - │   │   ├── max_location - │   │   ├── mc_name - │   │   ├── reset_counters - │   │   ├── seconds_since_reset - │   │   ├── size_mb - │   │   ├── ue_count - │   │   ├── ue_noinfo_count - │   │   └── uevent - │   └── uevent - └── uevent - -In the ``dimmX`` directories are EDAC control and attribute files for -this ``X`` memory module: - -- ``size`` - Total memory managed by this csrow attribute file - - This attribute file displays, in count of megabytes, the memory - that this csrow contains. - -- ``dimm_ue_count`` - Uncorrectable Errors count attribute file - - This attribute file displays the total count of uncorrectable - errors that have occurred on this DIMM. If panic_on_ue is set - this counter will not have a chance to increment, since EDAC - will panic the system. - -- ``dimm_ce_count`` - Correctable Errors count attribute file - - This attribute file displays the total count of correctable - errors that have occurred on this DIMM. This count is very - important to examine. CEs provide early indications that a - DIMM is beginning to fail. This count field should be - monitored for non-zero values and report such information - to the system administrator. - -- ``dimm_dev_type`` - Device type attribute file - - This attribute file will display what type of DRAM device is - being utilized on this DIMM. - Examples: - - - x1 - - x2 - - x4 - - x8 - -- ``dimm_edac_mode`` - EDAC Mode of operation attribute file - - This attribute file will display what type of Error detection - and correction is being utilized. - -- ``dimm_label`` - memory module label control file - - This control file allows this DIMM to have a label assigned - to it. With this label in the module, when errors occur - the output can provide the DIMM label in the system log. - This becomes vital for panic events to isolate the - cause of the UE event. - - DIMM Labels must be assigned after booting, with information - that correctly identifies the physical slot with its - silk screen label. This information is currently very - motherboard specific and determination of this information - must occur in userland at this time. - -- ``dimm_location`` - location of the memory module - - The location can have up to 3 levels, and describe how the - memory controller identifies the location of a memory module. - Depending on the type of memory and memory controller, it - can be: - - - *csrow* and *channel* - used when the memory controller - doesn't identify a single DIMM - e. g. in ``rankX`` dir; - - *branch*, *channel*, *slot* - typically used on FB-DIMM memory - controllers; - - *channel*, *slot* - used on Nehalem and newer Intel drivers. - -- ``dimm_mem_type`` - Memory Type attribute file - - This attribute file will display what type of memory is currently - on this csrow. Normally, either buffered or unbuffered memory. - Examples: - - - Registered-DDR - - Unbuffered-DDR - -.. [#f5] On some systems, the memory controller doesn't have any logic - to identify the memory module. On such systems, the directory is called ``rankX`` and works on a similar way as the ``csrowX`` directories. - On modern Intel memory controllers, the memory controller identifies the - memory modules directly. On such systems, the directory is called ``dimmX``. - -.. [#f6] There are also some ``power`` directories and ``subsystem`` - symlinks inside the sysfs mapping that are automatically created by - the sysfs subsystem. Currently, they serve no purpose. - -``csrowX`` directories ----------------------- - -When CONFIG_EDAC_LEGACY_SYSFS is enabled, sysfs will contain the ``csrowX`` -directories. As this API doesn't work properly for Rambus, FB-DIMMs and -modern Intel Memory Controllers, this is being deprecated in favor of -``dimmX`` directories. - -In the ``csrowX`` directories are EDAC control and attribute files for -this ``X`` instance of csrow: - - -- ``ue_count`` - Total Uncorrectable Errors count attribute file - - This attribute file displays the total count of uncorrectable - errors that have occurred on this csrow. If panic_on_ue is set - this counter will not have a chance to increment, since EDAC - will panic the system. - - -- ``ce_count`` - Total Correctable Errors count attribute file - - This attribute file displays the total count of correctable - errors that have occurred on this csrow. This count is very - important to examine. CEs provide early indications that a - DIMM is beginning to fail. This count field should be - monitored for non-zero values and report such information - to the system administrator. - - -- ``size_mb`` - Total memory managed by this csrow attribute file - - This attribute file displays, in count of megabytes, the memory - that this csrow contains. - - -- ``mem_type`` - Memory Type attribute file - - This attribute file will display what type of memory is currently - on this csrow. Normally, either buffered or unbuffered memory. - Examples: - - - Registered-DDR - - Unbuffered-DDR - - -- ``edac_mode`` - EDAC Mode of operation attribute file - - This attribute file will display what type of Error detection - and correction is being utilized. - - -- ``dev_type`` - Device type attribute file - - This attribute file will display what type of DRAM device is - being utilized on this DIMM. - Examples: - - - x1 - - x2 - - x4 - - x8 - - -- ``ch0_ce_count`` - Channel 0 CE Count attribute file - - This attribute file will display the count of CEs on this - DIMM located in channel 0. - - -- ``ch0_ue_count`` - Channel 0 UE Count attribute file - - This attribute file will display the count of UEs on this - DIMM located in channel 0. - - -- ``ch0_dimm_label`` - Channel 0 DIMM Label control file - - - This control file allows this DIMM to have a label assigned - to it. With this label in the module, when errors occur - the output can provide the DIMM label in the system log. - This becomes vital for panic events to isolate the - cause of the UE event. - - DIMM Labels must be assigned after booting, with information - that correctly identifies the physical slot with its - silk screen label. This information is currently very - motherboard specific and determination of this information - must occur in userland at this time. - - -- ``ch1_ce_count`` - Channel 1 CE Count attribute file - - - This attribute file will display the count of CEs on this - DIMM located in channel 1. - - -- ``ch1_ue_count`` - Channel 1 UE Count attribute file - - - This attribute file will display the count of UEs on this - DIMM located in channel 0. - - -- ``ch1_dimm_label`` - Channel 1 DIMM Label control file - - This control file allows this DIMM to have a label assigned - to it. With this label in the module, when errors occur - the output can provide the DIMM label in the system log. - This becomes vital for panic events to isolate the - cause of the UE event. - - DIMM Labels must be assigned after booting, with information - that correctly identifies the physical slot with its - silk screen label. This information is currently very - motherboard specific and determination of this information - must occur in userland at this time. - - -System Logging --------------- - -If logging for UEs and CEs is enabled, then system logs will contain -information indicating that errors have been detected:: - - EDAC MC0: CE page 0x283, offset 0xce0, grain 8, syndrome 0x6ec3, row 0, channel 1 "DIMM_B1": amd76x_edac - EDAC MC0: CE page 0x1e5, offset 0xfb0, grain 8, syndrome 0xb741, row 0, channel 1 "DIMM_B1": amd76x_edac - - -The structure of the message is: - - +---------------------------------------+-------------+ - | Content | Example | - +=======================================+=============+ - | The memory controller | MC0 | - +---------------------------------------+-------------+ - | Error type | CE | - +---------------------------------------+-------------+ - | Memory page | 0x283 | - +---------------------------------------+-------------+ - | Offset in the page | 0xce0 | - +---------------------------------------+-------------+ - | The byte granularity | grain 8 | - | or resolution of the error | | - +---------------------------------------+-------------+ - | The error syndrome | 0xb741 | - +---------------------------------------+-------------+ - | Memory row | row 0 | - +---------------------------------------+-------------+ - | Memory channel | channel 1 | - +---------------------------------------+-------------+ - | DIMM label, if set prior | DIMM B1 | - +---------------------------------------+-------------+ - | And then an optional, driver-specific | | - | message that may have additional | | - | information. | | - +---------------------------------------+-------------+ - -Both UEs and CEs with no info will lack all but memory controller, error -type, a notice of "no info" and then an optional, driver-specific error -message. - - -PCI Bus Parity Detection ------------------------- - -On Header Type 00 devices, the primary status is looked at for any -parity error regardless of whether parity is enabled on the device or -not. (The spec indicates parity is generated in some cases). On Header -Type 01 bridges, the secondary status register is also looked at to see -if parity occurred on the bus on the other side of the bridge. - - -Sysfs configuration -------------------- - -Under ``/sys/devices/system/edac/pci`` are control and attribute files as -follows: - - -- ``check_pci_parity`` - Enable/Disable PCI Parity checking control file - - This control file enables or disables the PCI Bus Parity scanning - operation. Writing a 1 to this file enables the scanning. Writing - a 0 to this file disables the scanning. - - Enable:: - - echo "1" >/sys/devices/system/edac/pci/check_pci_parity - - Disable:: - - echo "0" >/sys/devices/system/edac/pci/check_pci_parity - - -- ``pci_parity_count`` - Parity Count - - This attribute file will display the number of parity errors that - have been detected. - - -Module parameters ------------------ - -- ``edac_mc_panic_on_ue`` - Panic on UE control file - - An uncorrectable error will cause a machine panic. This is usually - desirable. It is a bad idea to continue when an uncorrectable error - occurs - it is indeterminate what was uncorrected and the operating - system context might be so mangled that continuing will lead to further - corruption. If the kernel has MCE configured, then EDAC will never - notice the UE. - - LOAD TIME:: - - module/kernel parameter: edac_mc_panic_on_ue=[0|1] - - RUN TIME:: - - echo "1" > /sys/module/edac_core/parameters/edac_mc_panic_on_ue - - -- ``edac_mc_log_ue`` - Log UE control file - - - Generate kernel messages describing uncorrectable errors. These errors - are reported through the system message log system. UE statistics - will be accumulated even when UE logging is disabled. - - LOAD TIME:: - - module/kernel parameter: edac_mc_log_ue=[0|1] - - RUN TIME:: - - echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ue - - -- ``edac_mc_log_ce`` - Log CE control file - - - Generate kernel messages describing correctable errors. These - errors are reported through the system message log system. - CE statistics will be accumulated even when CE logging is disabled. - - LOAD TIME:: - - module/kernel parameter: edac_mc_log_ce=[0|1] - - RUN TIME:: - - echo "1" > /sys/module/edac_core/parameters/edac_mc_log_ce - - -- ``edac_mc_poll_msec`` - Polling period control file - - - The time period, in milliseconds, for polling for error information. - Too small a value wastes resources. Too large a value might delay - necessary handling of errors and might loose valuable information for - locating the error. 1000 milliseconds (once each second) is the current - default. Systems which require all the bandwidth they can get, may - increase this. - - LOAD TIME:: - - module/kernel parameter: edac_mc_poll_msec=[0|1] - - RUN TIME:: - - echo "1000" > /sys/module/edac_core/parameters/edac_mc_poll_msec - - -- ``panic_on_pci_parity`` - Panic on PCI PARITY Error - - - This control file enables or disables panicking when a parity - error has been detected. - - - module/kernel parameter:: - - edac_panic_on_pci_pe=[0|1] - - Enable:: - - echo "1" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe - - Disable:: - - echo "0" > /sys/module/edac_core/parameters/edac_panic_on_pci_pe - - - -EDAC device type ----------------- - -In the header file, edac_pci.h, there is a series of edac_device structures -and APIs for the EDAC_DEVICE. - -User space access to an edac_device is through the sysfs interface. - -At the location ``/sys/devices/system/edac`` (sysfs) new edac_device devices -will appear. - -There is a three level tree beneath the above ``edac`` directory. For example, -the ``test_device_edac`` device (found at the http://bluesmoke.sourceforget.net -website) installs itself as:: - - /sys/devices/system/edac/test-instance - -in this directory are various controls, a symlink and one or more ``instance`` -directories. - -The standard default controls are: - - ============== ======================================================= - log_ce boolean to log CE events - log_ue boolean to log UE events - panic_on_ue boolean to ``panic`` the system if an UE is encountered - (default off, can be set true via startup script) - poll_msec time period between POLL cycles for events - ============== ======================================================= - -The test_device_edac device adds at least one of its own custom control: - - ============== ================================================== - test_bits which in the current test driver does nothing but - show how it is installed. A ported driver can - add one or more such controls and/or attributes - for specific uses. - One out-of-tree driver uses controls here to allow - for ERROR INJECTION operations to hardware - injection registers - ============== ================================================== - -The symlink points to the 'struct dev' that is registered for this edac_device. - -Instances ---------- - -One or more instance directories are present. For the ``test_device_edac`` -case: - - +----------------+ - | test-instance0 | - +----------------+ - - -In this directory there are two default counter attributes, which are totals of -counter in deeper subdirectories. - - ============== ==================================== - ce_count total of CE events of subdirectories - ue_count total of UE events of subdirectories - ============== ==================================== - -Blocks ------- - -At the lowest directory level is the ``block`` directory. There can be 0, 1 -or more blocks specified in each instance: - - +-------------+ - | test-block0 | - +-------------+ - -In this directory the default attributes are: - - ============== ================================================ - ce_count which is counter of CE events for this ``block`` - of hardware being monitored - ue_count which is counter of UE events for this ``block`` - of hardware being monitored - ============== ================================================ - - -The ``test_device_edac`` device adds 4 attributes and 1 control: - - ================== ==================================================== - test-block-bits-0 for every POLL cycle this counter - is incremented - test-block-bits-1 every 10 cycles, this counter is bumped once, - and test-block-bits-0 is set to 0 - test-block-bits-2 every 100 cycles, this counter is bumped once, - and test-block-bits-1 is set to 0 - test-block-bits-3 every 1000 cycles, this counter is bumped once, - and test-block-bits-2 is set to 0 - ================== ==================================================== - - - ================== ==================================================== - reset-counters writing ANY thing to this control will - reset all the above counters. - ================== ==================================================== - - -Use of the ``test_device_edac`` driver should enable any others to create their own -unique drivers for their hardware systems. - -The ``test_device_edac`` sample driver is located at the -http://bluesmoke.sourceforge.net project site for EDAC. - - -Usage of EDAC APIs on Nehalem and newer Intel CPUs --------------------------------------------------- - -On older Intel architectures, the memory controller was part of the North -Bridge chipset. Nehalem, Sandy Bridge, Ivy Bridge, Haswell, Sky Lake and -newer Intel architectures integrated an enhanced version of the memory -controller (MC) inside the CPUs. - -This chapter will cover the differences of the enhanced memory controllers -found on newer Intel CPUs, such as ``i7core_edac``, ``sb_edac`` and -``sbx_edac`` drivers. - -.. note:: - - The Xeon E7 processor families use a separate chip for the memory - controller, called Intel Scalable Memory Buffer. This section doesn't - apply for such families. - -1) There is one Memory Controller per Quick Patch Interconnect - (QPI). At the driver, the term "socket" means one QPI. This is - associated with a physical CPU socket. - - Each MC have 3 physical read channels, 3 physical write channels and - 3 logic channels. The driver currently sees it as just 3 channels. - Each channel can have up to 3 DIMMs. - - The minimum known unity is DIMMs. There are no information about csrows. - As EDAC API maps the minimum unity is csrows, the driver sequentially - maps channel/DIMM into different csrows. - - For example, supposing the following layout:: - - Ch0 phy rd0, wr0 (0x063f4031): 2 ranks, UDIMMs - dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 - dimm 1 1024 Mb offset: 4, bank: 8, rank: 1, row: 0x4000, col: 0x400 - Ch1 phy rd1, wr1 (0x063f4031): 2 ranks, UDIMMs - dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 - Ch2 phy rd3, wr3 (0x063f4031): 2 ranks, UDIMMs - dimm 0 1024 Mb offset: 0, bank: 8, rank: 1, row: 0x4000, col: 0x400 - - The driver will map it as:: - - csrow0: channel 0, dimm0 - csrow1: channel 0, dimm1 - csrow2: channel 1, dimm0 - csrow3: channel 2, dimm0 - - exports one DIMM per csrow. - - Each QPI is exported as a different memory controller. - -2) The MC has the ability to inject errors to test drivers. The drivers - implement this functionality via some error injection nodes: - - For injecting a memory error, there are some sysfs nodes, under - ``/sys/devices/system/edac/mc/mc?/``: - - - ``inject_addrmatch/*``: - Controls the error injection mask register. It is possible to specify - several characteristics of the address to match an error code:: - - dimm = the affected dimm. Numbers are relative to a channel; - rank = the memory rank; - channel = the channel that will generate an error; - bank = the affected bank; - page = the page address; - column (or col) = the address column. - - each of the above values can be set to "any" to match any valid value. - - At driver init, all values are set to any. - - For example, to generate an error at rank 1 of dimm 2, for any channel, - any bank, any page, any column:: - - echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm - echo 1 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank - - To return to the default behaviour of matching any, you can do:: - - echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm - echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank - - - ``inject_eccmask``: - specifies what bits will have troubles, - - - ``inject_section``: - specifies what ECC cache section will get the error:: - - 3 for both - 2 for the highest - 1 for the lowest - - - ``inject_type``: - specifies the type of error, being a combination of the following bits:: - - bit 0 - repeat - bit 1 - ecc - bit 2 - parity - - - ``inject_enable``: - starts the error generation when something different than 0 is written. - - All inject vars can be read. root permission is needed for write. - - Datasheet states that the error will only be generated after a write on an - address that matches inject_addrmatch. It seems, however, that reading will - also produce an error. - - For example, the following code will generate an error for any write access - at socket 0, on any DIMM/address on channel 2:: - - echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/channel - echo 2 >/sys/devices/system/edac/mc/mc0/inject_type - echo 64 >/sys/devices/system/edac/mc/mc0/inject_eccmask - echo 3 >/sys/devices/system/edac/mc/mc0/inject_section - echo 1 >/sys/devices/system/edac/mc/mc0/inject_enable - dd if=/dev/mem of=/dev/null seek=16k bs=4k count=1 >& /dev/null - - For socket 1, it is needed to replace "mc0" by "mc1" at the above - commands. - - The generated error message will look like:: - - EDAC MC0: UE row 0, channel-a= 0 channel-b= 0 labels "-": NON_FATAL (addr = 0x0075b980, socket=0, Dimm=0, Channel=2, syndrome=0x00000040, count=1, Err=8c0000400001009f:4000080482 (read error: read ECC error)) - -3) Corrected Error memory register counters - - Those newer MCs have some registers to count memory errors. The driver - uses those registers to report Corrected Errors on devices with Registered - DIMMs. - - However, those counters don't work with Unregistered DIMM. As the chipset - offers some counters that also work with UDIMMs (but with a worse level of - granularity than the default ones), the driver exposes those registers for - UDIMM memories. - - They can be read by looking at the contents of ``all_channel_counts/``:: - - $ for i in /sys/devices/system/edac/mc/mc0/all_channel_counts/*; do echo $i; cat $i; done - /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm0 - 0 - /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm1 - 0 - /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm2 - 0 - - What happens here is that errors on different csrows, but at the same - dimm number will increment the same counter. - So, in this memory mapping:: - - csrow0: channel 0, dimm0 - csrow1: channel 0, dimm1 - csrow2: channel 1, dimm0 - csrow3: channel 2, dimm0 - - The hardware will increment udimm0 for an error at the first dimm at either - csrow0, csrow2 or csrow3; - - The hardware will increment udimm1 for an error at the second dimm at either - csrow0, csrow2 or csrow3; - - The hardware will increment udimm2 for an error at the third dimm at either - csrow0, csrow2 or csrow3; - -4) Standard error counters - - The standard error counters are generated when an mcelog error is received - by the driver. Since, with UDIMM, this is counted by software, it is - possible that some errors could be lost. With RDIMM's, they display the - contents of the registers - -Reference documents used on ``amd64_edac`` ------------------------------------------- - -``amd64_edac`` module is based on the following documents -(available from http://support.amd.com/en-us/search/tech-docs): - -1. :Title: BIOS and Kernel Developer's Guide for AMD Athlon 64 and AMD - Opteron Processors - :AMD publication #: 26094 - :Revision: 3.26 - :Link: http://support.amd.com/TechDocs/26094.PDF - -2. :Title: BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh - Processors - :AMD publication #: 32559 - :Revision: 3.00 - :Issue Date: May 2006 - :Link: http://support.amd.com/TechDocs/32559.pdf - -3. :Title: BIOS and Kernel Developer's Guide (BKDG) For AMD Family 10h - Processors - :AMD publication #: 31116 - :Revision: 3.00 - :Issue Date: September 07, 2007 - :Link: http://support.amd.com/TechDocs/31116.pdf - -4. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h - Models 30h-3Fh Processors - :AMD publication #: 49125 - :Revision: 3.06 - :Issue Date: 2/12/2015 (latest release) - :Link: http://support.amd.com/TechDocs/49125_15h_Models_30h-3Fh_BKDG.pdf - -5. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 15h - Models 60h-6Fh Processors - :AMD publication #: 50742 - :Revision: 3.01 - :Issue Date: 7/23/2015 (latest release) - :Link: http://support.amd.com/TechDocs/50742_15h_Models_60h-6Fh_BKDG.pdf - -6. :Title: BIOS and Kernel Developer's Guide (BKDG) for AMD Family 16h - Models 00h-0Fh Processors - :AMD publication #: 48751 - :Revision: 3.03 - :Issue Date: 2/23/2015 (latest release) - :Link: http://support.amd.com/TechDocs/48751_16h_bkdg.pdf - -Credits -======= - -* Written by Doug Thompson - - - 7 Dec 2005 - - 17 Jul 2007 Updated - -* |copy| Mauro Carvalho Chehab - - - 05 Aug 2009 Nehalem interface - - 26 Oct 2016 Converted to ReST and cleanups at the Nehalem section - -* EDAC authors/maintainers: - - - Doug Thompson, Dave Jiang, Dave Peterson et al, - - Mauro Carvalho Chehab - - Borislav Petkov - - original author: Thayne Harbaugh diff --git a/Documentation/index.rst b/Documentation/index.rst index 07f2aa07f0fa..9dfdc826618c 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -113,7 +113,6 @@ to ReStructured Text format, or are simply too old. :maxdepth: 1 staging/index - RAS/index Translations diff --git a/MAINTAINERS b/MAINTAINERS index 5b945fd5a3b9..fc5996feba70 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7584,7 +7584,6 @@ R: Robert Richter L: linux-edac@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next -F: Documentation/admin-guide/ras.rst F: Documentation/driver-api/edac.rst F: drivers/edac/ F: include/linux/edac.h @@ -18359,8 +18358,7 @@ M: Tony Luck M: Borislav Petkov L: linux-edac@vger.kernel.org S: Maintained -F: Documentation/RAS/ -F: Documentation/admin-guide/ras.rst +F: Documentation/admin-guide/RAS F: drivers/ras/ F: include/linux/ras.h F: include/ras/ras_event.h -- cgit v1.2.3 From 3b566b30b41401888ee0e8eb904a1e7a6693794b Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 Feb 2024 21:35:15 -0600 Subject: RAS/AMD/ATL: Add MI300 row retirement support DRAM row retirement depends on model-specific information that is best done within the AMD Address Translation Library. Export a generic wrapper function for other modules to use. Add any model-specific helpers here. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240214033516.1344948-2-yazen.ghannam@amd.com --- drivers/ras/amd/atl/Kconfig | 1 + drivers/ras/amd/atl/umc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/ras.h | 2 ++ 3 files changed, 54 insertions(+) diff --git a/drivers/ras/amd/atl/Kconfig b/drivers/ras/amd/atl/Kconfig index a43513a700f1..df49c23e7f62 100644 --- a/drivers/ras/amd/atl/Kconfig +++ b/drivers/ras/amd/atl/Kconfig @@ -10,6 +10,7 @@ config AMD_ATL tristate "AMD Address Translation Library" depends on AMD_NB && X86_64 && RAS + depends on MEMORY_FAILURE default N help This library includes support for implementation-specific diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c index 7e310d1dfcfc..08c6dbd44c62 100644 --- a/drivers/ras/amd/atl/umc.c +++ b/drivers/ras/amd/atl/umc.c @@ -239,6 +239,57 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr) return addr; } +/* + * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire + * all memory within that DRAM row. This applies to the memory with a DRAM + * bank. + * + * To find the memory addresses, loop through permutations of the DRAM column + * bits and find the System Physical address of each. The column bits are used + * to calculate the intermediate Normalized address, so all permutations should + * be checked. + * + * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats. + */ +#define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL)) +static void retire_row_mi300(struct atl_err *a_err) +{ + unsigned long addr; + struct page *p; + u8 col; + + for (col = 0; col < MI300_NUM_COL; col++) { + a_err->addr &= ~MI300_UMC_MCA_COL; + a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col); + + addr = amd_convert_umc_mca_addr_to_sys_addr(a_err); + if (IS_ERR_VALUE(addr)) + continue; + + addr = PHYS_PFN(addr); + + /* + * Skip invalid or already poisoned pages to avoid unnecessary + * error messages from memory_failure(). + */ + p = pfn_to_online_page(addr); + if (!p) + continue; + + if (PageHWPoison(p)) + continue; + + memory_failure(addr, 0); + } +} + +void amd_retire_dram_row(struct atl_err *a_err) +{ + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + return retire_row_mi300(a_err); +} +EXPORT_SYMBOL_GPL(amd_retire_dram_row); + static unsigned long get_addr(unsigned long addr) { if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) diff --git a/include/linux/ras.h b/include/linux/ras.h index 09c632832bf1..a64182bc72ad 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -45,8 +45,10 @@ struct atl_err { #if IS_ENABLED(CONFIG_AMD_ATL) void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *)); void amd_atl_unregister_decoder(void); +void amd_retire_dram_row(struct atl_err *err); unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err); #else +static inline void amd_retire_dram_row(struct atl_err *err) { } static inline unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; } #endif /* CONFIG_AMD_ATL */ -- cgit v1.2.3 From 6f15e617cc99323339dc241d19956f0d640c4354 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 Feb 2024 21:35:16 -0600 Subject: RAS: Introduce a FRU memory poison manager Memory errors are an expected occurrence on systems with high memory density. Generally, errors within a small number of unique physical locations are acceptable, based on manufacturer and/or admin policy. During run time, memory with errors may be retired so it is no longer used by the system. This is done in mm through page poisoning, and the effect will remain until the system is restarted. If a memory location is consistently faulty, then the same run time error handling may occur in the next reboot cycle, leading to terminating jobs due to that already known bad memory. This could be prevented if information from the previous boot was not lost. Some add-in cards with driver-managed memory have on-board persistent storage. Their driver saves memory error information to the persistent storage during run time. The information is then restored after reset, and known bad memory will be retired before the hardware is used. A running log of bad memory locations is kept across multiple resets. A similar solution is desirable for CPUs. However, this solution should leverage industry-standard components as much as possible, rather than a bespoke platform driver. Two components are needed: a record format and a persistent storage interface. Implement a new module to manage the record formats on persistent storage. Use the requirements for an AMD MI300-based system to start. Vendor- and platform-specific details can be abstracted later as needed. [ bp: Massage commit message and code, squash 30-ish more fixes from Yazen and me. ] Signed-off-by: Yazen Ghannam Co-developed-by: Signed-off-by: Co-developed-by: Signed-off-by: Tested-by: Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240214033516.1344948-3-yazen.ghannam@amd.com --- MAINTAINERS | 6 + drivers/ras/Kconfig | 12 + drivers/ras/Makefile | 1 + drivers/ras/amd/fmpm.c | 812 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 831 insertions(+) create mode 100644 drivers/ras/amd/fmpm.c diff --git a/MAINTAINERS b/MAINTAINERS index fc5996feba70..76163f09e4e2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18363,6 +18363,12 @@ F: drivers/ras/ F: include/linux/ras.h F: include/ras/ras_event.h +RAS FRU MEMORY POISON MANAGER (FMPM) +M: Yazen Ghannam +L: linux-edac@vger.kernel.org +S: Maintained +F: drivers/ras/amd/fmpm.c + RC-CORE / LIRC FRAMEWORK M: Sean Young L: linux-media@vger.kernel.org diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig index 2e969f59c0ca..fc4f4bb94a4c 100644 --- a/drivers/ras/Kconfig +++ b/drivers/ras/Kconfig @@ -34,4 +34,16 @@ if RAS source "arch/x86/ras/Kconfig" source "drivers/ras/amd/atl/Kconfig" +config RAS_FMPM + tristate "FRU Memory Poison Manager" + default m + depends on AMD_ATL && ACPI_APEI + help + Support saving and restoring memory error information across reboot + using ACPI ERST as persistent storage. Error information is saved with + the UEFI CPER "FRU Memory Poison" section format. + + Memory will be retired during boot time and run time depending on + platform-specific policies. + endif diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile index 3fac80f58005..11f95d59d397 100644 --- a/drivers/ras/Makefile +++ b/drivers/ras/Makefile @@ -3,4 +3,5 @@ obj-$(CONFIG_RAS) += ras.o obj-$(CONFIG_DEBUG_FS) += debugfs.o obj-$(CONFIG_RAS_CEC) += cec.o +obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o obj-y += amd/atl/ diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c new file mode 100644 index 000000000000..80dd112b720a --- /dev/null +++ b/drivers/ras/amd/fmpm.c @@ -0,0 +1,812 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * FRU (Field-Replaceable Unit) Memory Poison Manager + * + * Copyright (c) 2024, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Authors: + * Naveen Krishna Chatradhi + * Muralidhara M K + * Yazen Ghannam + * + * Implementation notes, assumptions, and limitations: + * + * - FRU memory poison section and memory poison descriptor definitions are not yet + * included in the UEFI specification. So they are defined here. Afterwards, they + * may be moved to linux/cper.h, if appropriate. + * + * - Platforms based on AMD MI300 systems will be the first to use these structures. + * There are a number of assumptions made here that will need to be generalized + * to support other platforms. + * + * AMD MI300-based platform(s) assumptions: + * - Memory errors are reported through x86 MCA. + * - The entire DRAM row containing a memory error should be retired. + * - There will be (1) FRU memory poison section per CPER. + * - The FRU will be the CPU package (processor socket). + * - The default number of memory poison descriptor entries should be (8). + * - The platform will use ACPI ERST for persistent storage. + * - All FRU records should be saved to persistent storage. Module init will + * fail if any FRU record is not successfully written. + * + * - Boot time memory retirement may occur later than ideal due to dependencies + * on other libraries and drivers. This leaves a gap where bad memory may be + * accessed during early boot stages. + * + * - Enough memory should be pre-allocated for each FRU record to be able to hold + * the expected number of descriptor entries. This, mostly empty, record is + * written to storage during init time. Subsequent writes to the same record + * should allow the Platform to update the stored record in-place. Otherwise, + * if the record is extended, then the Platform may need to perform costly memory + * management operations on the storage. For example, the Platform may spend time + * in Firmware copying and invalidating memory on a relatively slow SPI ROM. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +#include + +#include +#include + +#define INVALID_CPU UINT_MAX + +/* Validation Bits */ +#define FMP_VALID_ARCH_TYPE BIT_ULL(0) +#define FMP_VALID_ARCH BIT_ULL(1) +#define FMP_VALID_ID_TYPE BIT_ULL(2) +#define FMP_VALID_ID BIT_ULL(3) +#define FMP_VALID_LIST_ENTRIES BIT_ULL(4) +#define FMP_VALID_LIST BIT_ULL(5) + +/* FRU Architecture Types */ +#define FMP_ARCH_TYPE_X86_CPUID_1_EAX 0 + +/* FRU ID Types */ +#define FMP_ID_TYPE_X86_PPIN 0 + +/* FRU Memory Poison Section */ +struct cper_sec_fru_mem_poison { + u32 checksum; + u64 validation_bits; + u32 fru_arch_type; + u64 fru_arch; + u32 fru_id_type; + u64 fru_id; + u32 nr_entries; +} __packed; + +/* FRU Descriptor ID Types */ +#define FPD_HW_ID_TYPE_MCA_IPID 0 + +/* FRU Descriptor Address Types */ +#define FPD_ADDR_TYPE_MCA_ADDR 0 + +/* Memory Poison Descriptor */ +struct cper_fru_poison_desc { + u64 timestamp; + u32 hw_id_type; + u64 hw_id; + u32 addr_type; + u64 addr; +} __packed; + +/* Collection of headers and sections for easy pointer use. */ +struct fru_rec { + struct cper_record_header hdr; + struct cper_section_descriptor sec_desc; + struct cper_sec_fru_mem_poison fmp; + struct cper_fru_poison_desc entries[]; +} __packed; + +/* + * Pointers to the complete CPER record of each FRU. + * + * Memory allocation will include padded space for descriptor entries. + */ +static struct fru_rec **fru_records; + +#define CPER_CREATOR_FMP \ + GUID_INIT(0xcd5c2993, 0xf4b2, 0x41b2, 0xb5, 0xd4, 0xf9, 0xc3, \ + 0xa0, 0x33, 0x08, 0x75) + +#define CPER_SECTION_TYPE_FMP \ + GUID_INIT(0x5e4706c1, 0x5356, 0x48c6, 0x93, 0x0b, 0x52, 0xf2, \ + 0x12, 0x0a, 0x44, 0x58) + +/** + * DOC: fru_poison_entries (byte) + * Maximum number of descriptor entries possible for each FRU. + * + * Values between '1' and '255' are valid. + * No input or '0' will default to FMPM_DEFAULT_MAX_NR_ENTRIES. + */ +static u8 max_nr_entries; +module_param(max_nr_entries, byte, 0644); +MODULE_PARM_DESC(max_nr_entries, + "Maximum number of memory poison descriptor entries per FRU"); + +#define FMPM_DEFAULT_MAX_NR_ENTRIES 8 + +/* Maximum number of FRUs in the system. */ +#define FMPM_MAX_NR_FRU 256 +static unsigned int max_nr_fru; + +/* Total length of record including headers and list of descriptor entries. */ +static size_t max_rec_len; + +/* + * Protect the local records cache in fru_records and prevent concurrent + * writes to storage. This is only needed after init once notifier block + * registration is done. + */ +static DEFINE_MUTEX(fmpm_update_mutex); + +#define for_each_fru(i, rec) \ + for (i = 0; rec = fru_records[i], i < max_nr_fru; i++) + +static inline u32 get_fmp_len(struct fru_rec *rec) +{ + return rec->sec_desc.section_length - sizeof(struct cper_section_descriptor); +} + +static struct fru_rec *get_fru_record(u64 fru_id) +{ + struct fru_rec *rec; + unsigned int i; + + for_each_fru(i, rec) { + if (rec->fmp.fru_id == fru_id) + return rec; + } + + pr_debug("Record not found for FRU 0x%016llx\n", fru_id); + + return NULL; +} + +/* + * Sum up all bytes within the FRU Memory Poison Section including the Memory + * Poison Descriptor entries. + * + * Don't include the old checksum here. It's a u32 value, so summing each of its + * bytes will give the wrong total. + */ +static u32 do_fmp_checksum(struct cper_sec_fru_mem_poison *fmp, u32 len) +{ + u32 checksum = 0; + u8 *buf, *end; + + /* Skip old checksum. */ + buf = (u8 *)fmp + sizeof(u32); + end = buf + len; + + while (buf < end) + checksum += (u8)(*(buf++)); + + return checksum; +} + +static int update_record_on_storage(struct fru_rec *rec) +{ + u32 len, checksum; + int ret; + + /* Calculate a new checksum. */ + len = get_fmp_len(rec); + + /* Get the current total. */ + checksum = do_fmp_checksum(&rec->fmp, len); + + /* Use the complement value. */ + rec->fmp.checksum = -checksum; + + pr_debug("Writing to storage\n"); + + ret = erst_write(&rec->hdr); + if (ret) { + pr_warn("Storage update failed for FRU 0x%016llx\n", rec->fmp.fru_id); + + if (ret == -ENOSPC) + pr_warn("Not enough space on storage\n"); + } + + return ret; +} + +static bool rec_has_valid_entries(struct fru_rec *rec) +{ + if (!(rec->fmp.validation_bits & FMP_VALID_LIST_ENTRIES)) + return false; + + if (!(rec->fmp.validation_bits & FMP_VALID_LIST)) + return false; + + return true; +} + +static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_desc *new) +{ + /* + * Ignore timestamp field. + * The same physical error may be reported multiple times due to stuck bits, etc. + * + * Also, order the checks from most->least likely to fail to shortcut the code. + */ + if (old->addr != new->addr) + return false; + + if (old->hw_id != new->hw_id) + return false; + + if (old->addr_type != new->addr_type) + return false; + + if (old->hw_id_type != new->hw_id_type) + return false; + + return true; +} + +static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd) +{ + unsigned int i; + + for (i = 0; i < rec->fmp.nr_entries; i++) { + struct cper_fru_poison_desc *fpd_i = &rec->entries[i]; + + if (fpds_equal(fpd_i, fpd)) { + pr_debug("Found duplicate record\n"); + return true; + } + } + + return false; +} + +static void update_fru_record(struct fru_rec *rec, struct mce *m) +{ + struct cper_sec_fru_mem_poison *fmp = &rec->fmp; + struct cper_fru_poison_desc fpd, *fpd_dest; + u32 entry = 0; + + mutex_lock(&fmpm_update_mutex); + + memset(&fpd, 0, sizeof(struct cper_fru_poison_desc)); + + fpd.timestamp = m->time; + fpd.hw_id_type = FPD_HW_ID_TYPE_MCA_IPID; + fpd.hw_id = m->ipid; + fpd.addr_type = FPD_ADDR_TYPE_MCA_ADDR; + fpd.addr = m->addr; + + /* This is the first entry, so just save it. */ + if (!rec_has_valid_entries(rec)) + goto save_fpd; + + /* Ignore already recorded errors. */ + if (rec_has_fpd(rec, &fpd)) + goto out_unlock; + + if (rec->fmp.nr_entries >= max_nr_entries) { + pr_warn("Exceeded number of entries for FRU 0x%016llx\n", rec->fmp.fru_id); + goto out_unlock; + } + + entry = fmp->nr_entries; + +save_fpd: + fpd_dest = &rec->entries[entry]; + memcpy(fpd_dest, &fpd, sizeof(struct cper_fru_poison_desc)); + + fmp->nr_entries = entry + 1; + fmp->validation_bits |= FMP_VALID_LIST_ENTRIES; + fmp->validation_bits |= FMP_VALID_LIST; + + pr_debug("Updated FRU 0x%016llx entry #%u\n", fmp->fru_id, entry); + + update_record_on_storage(rec); + +out_unlock: + mutex_unlock(&fmpm_update_mutex); +} + +static void retire_dram_row(u64 addr, u64 id, u32 cpu) +{ + struct atl_err a_err; + + memset(&a_err, 0, sizeof(struct atl_err)); + + a_err.addr = addr; + a_err.ipid = id; + a_err.cpu = cpu; + + amd_retire_dram_row(&a_err); +} + +static int fru_handle_mem_poison(struct notifier_block *nb, unsigned long val, void *data) +{ + struct mce *m = (struct mce *)data; + struct fru_rec *rec; + + if (!mce_is_memory_error(m)) + return NOTIFY_DONE; + + retire_dram_row(m->addr, m->ipid, m->extcpu); + + /* + * An invalid FRU ID should not happen on real errors. But it + * could happen from software error injection, etc. + */ + rec = get_fru_record(m->ppin); + if (!rec) + return NOTIFY_DONE; + + update_fru_record(rec, m); + + return NOTIFY_OK; +} + +static struct notifier_block fru_mem_poison_nb = { + .notifier_call = fru_handle_mem_poison, + .priority = MCE_PRIO_LOWEST, +}; + +static void retire_mem_fmp(struct fru_rec *rec) +{ + struct cper_sec_fru_mem_poison *fmp = &rec->fmp; + unsigned int i, cpu; + + for (i = 0; i < fmp->nr_entries; i++) { + struct cper_fru_poison_desc *fpd = &rec->entries[i]; + unsigned int err_cpu = INVALID_CPU; + + if (fpd->hw_id_type != FPD_HW_ID_TYPE_MCA_IPID) + continue; + + if (fpd->addr_type != FPD_ADDR_TYPE_MCA_ADDR) + continue; + + cpus_read_lock(); + for_each_online_cpu(cpu) { + if (topology_ppin(cpu) == fmp->fru_id) { + err_cpu = cpu; + break; + } + } + cpus_read_unlock(); + + if (err_cpu == INVALID_CPU) + continue; + + retire_dram_row(fpd->addr, fpd->hw_id, err_cpu); + } +} + +static void retire_mem_records(void) +{ + struct fru_rec *rec; + unsigned int i; + + for_each_fru(i, rec) { + if (!rec_has_valid_entries(rec)) + continue; + + retire_mem_fmp(rec); + } +} + +/* Set the CPER Record Header and CPER Section Descriptor fields. */ +static void set_rec_fields(struct fru_rec *rec) +{ + struct cper_section_descriptor *sec_desc = &rec->sec_desc; + struct cper_record_header *hdr = &rec->hdr; + + memcpy(hdr->signature, CPER_SIG_RECORD, CPER_SIG_SIZE); + hdr->revision = CPER_RECORD_REV; + hdr->signature_end = CPER_SIG_END; + + /* + * Currently, it is assumed that there is one FRU Memory Poison + * section per CPER. But this may change for other implementations. + */ + hdr->section_count = 1; + + /* The logged errors are recoverable. Otherwise, they'd never make it here. */ + hdr->error_severity = CPER_SEV_RECOVERABLE; + + hdr->validation_bits = 0; + hdr->record_length = max_rec_len; + hdr->creator_id = CPER_CREATOR_FMP; + hdr->notification_type = CPER_NOTIFY_MCE; + hdr->record_id = cper_next_record_id(); + hdr->flags = CPER_HW_ERROR_FLAGS_PREVERR; + + sec_desc->section_offset = sizeof(struct cper_record_header); + sec_desc->section_length = max_rec_len - sizeof(struct cper_record_header); + sec_desc->revision = CPER_SEC_REV; + sec_desc->validation_bits = 0; + sec_desc->flags = CPER_SEC_PRIMARY; + sec_desc->section_type = CPER_SECTION_TYPE_FMP; + sec_desc->section_severity = CPER_SEV_RECOVERABLE; +} + +static int save_new_records(void) +{ + DECLARE_BITMAP(new_records, FMPM_MAX_NR_FRU); + struct fru_rec *rec; + unsigned int i; + int ret = 0; + + for_each_fru(i, rec) { + if (rec->hdr.record_length) + continue; + + set_rec_fields(rec); + + ret = update_record_on_storage(rec); + if (ret) + goto out_clear; + + set_bit(i, new_records); + } + + return ret; + +out_clear: + for_each_fru(i, rec) { + if (!test_bit(i, new_records)) + continue; + + erst_clear(rec->hdr.record_id); + } + + return ret; +} + +/* Check that the record matches expected types for the current system.*/ +static bool fmp_is_usable(struct fru_rec *rec) +{ + struct cper_sec_fru_mem_poison *fmp = &rec->fmp; + u64 cpuid; + + pr_debug("Validation bits: 0x%016llx\n", fmp->validation_bits); + + if (!(fmp->validation_bits & FMP_VALID_ARCH_TYPE)) { + pr_debug("Arch type unknown\n"); + return false; + } + + if (fmp->fru_arch_type != FMP_ARCH_TYPE_X86_CPUID_1_EAX) { + pr_debug("Arch type not 'x86 Family/Model/Stepping'\n"); + return false; + } + + if (!(fmp->validation_bits & FMP_VALID_ARCH)) { + pr_debug("Arch value unknown\n"); + return false; + } + + cpuid = cpuid_eax(1); + if (fmp->fru_arch != cpuid) { + pr_debug("Arch value mismatch: record = 0x%016llx, system = 0x%016llx\n", + fmp->fru_arch, cpuid); + return false; + } + + if (!(fmp->validation_bits & FMP_VALID_ID_TYPE)) { + pr_debug("FRU ID type unknown\n"); + return false; + } + + if (fmp->fru_id_type != FMP_ID_TYPE_X86_PPIN) { + pr_debug("FRU ID type is not 'x86 PPIN'\n"); + return false; + } + + if (!(fmp->validation_bits & FMP_VALID_ID)) { + pr_debug("FRU ID value unknown\n"); + return false; + } + + return true; +} + +static bool fmp_is_valid(struct fru_rec *rec) +{ + struct cper_sec_fru_mem_poison *fmp = &rec->fmp; + u32 checksum, len; + + len = get_fmp_len(rec); + if (len < sizeof(struct cper_sec_fru_mem_poison)) { + pr_debug("fmp length is too small\n"); + return false; + } + + /* Checksum must sum to zero for the entire section. */ + checksum = do_fmp_checksum(fmp, len) + fmp->checksum; + if (checksum) { + pr_debug("fmp checksum failed: sum = 0x%x\n", checksum); + print_hex_dump_debug("fmp record: ", DUMP_PREFIX_NONE, 16, 1, fmp, len, false); + return false; + } + + if (!fmp_is_usable(rec)) + return false; + + return true; +} + +static struct fru_rec *get_valid_record(struct fru_rec *old) +{ + struct fru_rec *new; + + if (!fmp_is_valid(old)) { + pr_debug("Ignoring invalid record\n"); + return NULL; + } + + new = get_fru_record(old->fmp.fru_id); + if (!new) + pr_debug("Ignoring record for absent FRU\n"); + + return new; +} + +/* + * Fetch saved records from persistent storage. + * + * For each found record: + * - If it was not created by this module, then ignore it. + * - If it is valid, then copy its data to the local cache. + * - If it is not valid, then erase it. + */ +static int get_saved_records(void) +{ + struct fru_rec *old, *new; + u64 record_id; + int ret, pos; + ssize_t len; + + /* + * Assume saved records match current max size. + * + * However, this may not be true depending on module parameters. + */ + old = kmalloc(max_rec_len, GFP_KERNEL); + if (!old) { + ret = -ENOMEM; + goto out; + } + + ret = erst_get_record_id_begin(&pos); + if (ret < 0) + goto out_end; + + while (!erst_get_record_id_next(&pos, &record_id)) { + if (record_id == APEI_ERST_INVALID_RECORD_ID) + goto out_end; + /* + * Make sure to clear temporary buffer between reads to avoid + * leftover data from records of various sizes. + */ + memset(old, 0, max_rec_len); + + len = erst_read_record(record_id, &old->hdr, max_rec_len, + sizeof(struct fru_rec), &CPER_CREATOR_FMP); + if (len < 0) + continue; + + if (len > max_rec_len) { + pr_debug("Found record larger than max_rec_len\n"); + continue; + } + + new = get_valid_record(old); + if (!new) + erst_clear(record_id); + + /* Restore the record */ + memcpy(new, old, len); + } + +out_end: + erst_get_record_id_end(); + kfree(old); +out: + return ret; +} + +static void set_fmp_fields(struct fru_rec *rec, unsigned int cpu) +{ + struct cper_sec_fru_mem_poison *fmp = &rec->fmp; + + fmp->fru_arch_type = FMP_ARCH_TYPE_X86_CPUID_1_EAX; + fmp->validation_bits |= FMP_VALID_ARCH_TYPE; + + /* Assume all CPUs in the system have the same value for now. */ + fmp->fru_arch = cpuid_eax(1); + fmp->validation_bits |= FMP_VALID_ARCH; + + fmp->fru_id_type = FMP_ID_TYPE_X86_PPIN; + fmp->validation_bits |= FMP_VALID_ID_TYPE; + + fmp->fru_id = topology_ppin(cpu); + fmp->validation_bits |= FMP_VALID_ID; +} + +static int init_fmps(void) +{ + struct fru_rec *rec; + unsigned int i, cpu; + int ret = 0; + + for_each_fru(i, rec) { + unsigned int fru_cpu = INVALID_CPU; + + cpus_read_lock(); + for_each_online_cpu(cpu) { + if (topology_physical_package_id(cpu) == i) { + fru_cpu = cpu; + break; + } + } + cpus_read_unlock(); + + if (fru_cpu == INVALID_CPU) { + pr_debug("Failed to find matching CPU for FRU #%u\n", i); + ret = -ENODEV; + break; + } + + set_fmp_fields(rec, fru_cpu); + } + + return ret; +} + +static int get_system_info(void) +{ + /* Only load on MI300A systems for now. */ + if (!(boot_cpu_data.x86_model >= 0x90 && + boot_cpu_data.x86_model <= 0x9f)) + return -ENODEV; + + if (!cpu_feature_enabled(X86_FEATURE_AMD_PPIN)) { + pr_debug("PPIN feature not available\n"); + return -ENODEV; + } + + /* Use CPU socket as FRU for MI300 systems. */ + max_nr_fru = topology_max_packages(); + if (!max_nr_fru) + return -ENODEV; + + if (max_nr_fru > FMPM_MAX_NR_FRU) { + pr_warn("Too many FRUs to manage: found: %u, max: %u\n", + max_nr_fru, FMPM_MAX_NR_FRU); + return -ENODEV; + } + + if (!max_nr_entries) + max_nr_entries = FMPM_DEFAULT_MAX_NR_ENTRIES; + + max_rec_len = sizeof(struct fru_rec); + max_rec_len += sizeof(struct cper_fru_poison_desc) * max_nr_entries; + + pr_info("max FRUs: %u, max entries: %u, max record length: %lu\n", + max_nr_fru, max_nr_entries, max_rec_len); + + return 0; +} + +static void free_records(void) +{ + struct fru_rec *rec; + int i; + + for_each_fru(i, rec) + kfree(rec); + + kfree(fru_records); +} + +static int allocate_records(void) +{ + int i, ret = 0; + + fru_records = kcalloc(max_nr_fru, sizeof(struct fru_rec *), GFP_KERNEL); + if (!fru_records) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < max_nr_fru; i++) { + fru_records[i] = kzalloc(max_rec_len, GFP_KERNEL); + if (!fru_records[i]) { + ret = -ENOMEM; + goto out_free; + } + } + + return ret; + +out_free: + for (; i >= 0; i--) + kfree(fru_records[i]); + + kfree(fru_records); +out: + return ret; +} + +static const struct x86_cpu_id fmpm_cpuids[] = { + X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL), + { } +}; +MODULE_DEVICE_TABLE(x86cpu, fmpm_cpuids); + +static int __init fru_mem_poison_init(void) +{ + int ret; + + if (!x86_match_cpu(fmpm_cpuids)) { + ret = -ENODEV; + goto out; + } + + if (erst_disable) { + pr_debug("ERST not available\n"); + ret = -ENODEV; + goto out; + } + + ret = get_system_info(); + if (ret) + goto out; + + ret = allocate_records(); + if (ret) + goto out; + + ret = init_fmps(); + if (ret) + goto out_free; + + ret = get_saved_records(); + if (ret) + goto out_free; + + ret = save_new_records(); + if (ret) + goto out_free; + + retire_mem_records(); + + mce_register_decode_chain(&fru_mem_poison_nb); + + pr_info("FRU Memory Poison Manager initialized\n"); + return 0; + +out_free: + free_records(); +out: + return ret; +} + +static void __exit fru_mem_poison_exit(void) +{ + mce_unregister_decode_chain(&fru_mem_poison_nb); + free_records(); +} + +module_init(fru_mem_poison_init); +module_exit(fru_mem_poison_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("FRU Memory Poison Manager"); -- cgit v1.2.3 From dd61b55d733eee9bbe51abe7ab0e6f2ce1fae332 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 22 Feb 2024 10:54:49 -0600 Subject: RAS/AMD/ATL: Fix bit overflow in denorm_addr_df4_np2() The hash_pa8 and hashed_bit values in denorm_addr_df4_np2() are currently defined as u8 types. These variables represent single bits. 'hash_pa8' is set based on logical AND operations using masks with more than 8 bits. So the calculated value will not fit in this variable. It will always be '0'. The 'hash_pa8' check later in the function will fail which produces incorrect results for some cases. Change these variables to bool type. This clarifies that they are single bit values. Also, this allows the compiler to ensure they hold the proper results. Remove an unnecessary shift operation. [ bp: Remove the unnecessary brackets in the else-branch of the hash_pa8 assignment. ] Fixes: 3f3174996be6 ("RAS: Introduce AMD Address Translation Library") Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240222165449.23582-1-yazen.ghannam@amd.com --- drivers/ras/amd/atl/denormalize.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/ras/amd/atl/denormalize.c b/drivers/ras/amd/atl/denormalize.c index 49a900e066f1..e279224288d6 100644 --- a/drivers/ras/amd/atl/denormalize.c +++ b/drivers/ras/amd/atl/denormalize.c @@ -545,7 +545,7 @@ static int denorm_addr_df4_np2(struct addr_ctx *ctx) unsigned int mod_value, shift_value; u16 mask = df_cfg.component_id_mask; u64 temp_addr_a, temp_addr_b; - u8 hash_pa8, hashed_bit; + bool hash_pa8, hashed_bit; switch (ctx->map.intlv_mode) { case DF4_NPS4_3CHAN_HASH: @@ -577,8 +577,7 @@ static int denorm_addr_df4_np2(struct addr_ctx *ctx) hash_pa8 = BIT_ULL(shift_value) & ctx->ret_addr; temp_addr_a = remove_bits(shift_value, shift_value, ctx->ret_addr); } else { - hash_pa8 = (ctx->coh_st_fabric_id & df_cfg.socket_id_mask); - hash_pa8 >>= df_cfg.socket_id_shift; + hash_pa8 = ctx->coh_st_fabric_id & df_cfg.socket_id_mask; temp_addr_a = ctx->ret_addr; } -- cgit v1.2.3 From 9d2b6fa09d15d021fb83ec6f1336176ebaebbeec Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 1 Mar 2024 08:37:46 -0600 Subject: RAS: Export helper to get ras_debugfs_dir Export a getter instead of the debugfs node directly so that, other in-tree-only RAS modules can use it. Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Yazen Ghannam Link: https://lore.kernel.org/r/20240301143748.854090-2-yazen.ghannam@amd.com --- drivers/ras/cec.c | 10 ++++++++-- drivers/ras/debugfs.c | 8 +++++++- drivers/ras/debugfs.h | 2 +- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index 321af498ee11..e440b15fbabc 100644 --- a/drivers/ras/cec.c +++ b/drivers/ras/cec.c @@ -480,9 +480,15 @@ DEFINE_SHOW_ATTRIBUTE(array); static int __init create_debugfs_nodes(void) { - struct dentry *d, *pfn, *decay, *count, *array; + struct dentry *d, *pfn, *decay, *count, *array, *dfs; - d = debugfs_create_dir("cec", ras_debugfs_dir); + dfs = ras_get_debugfs_root(); + if (!dfs) { + pr_warn("Error getting RAS debugfs root!\n"); + return -1; + } + + d = debugfs_create_dir("cec", dfs); if (!d) { pr_warn("Error creating cec debugfs node!\n"); return -1; diff --git a/drivers/ras/debugfs.c b/drivers/ras/debugfs.c index ffb973c328e3..42afd3de68b2 100644 --- a/drivers/ras/debugfs.c +++ b/drivers/ras/debugfs.c @@ -3,10 +3,16 @@ #include #include "debugfs.h" -struct dentry *ras_debugfs_dir; +static struct dentry *ras_debugfs_dir; static atomic_t trace_count = ATOMIC_INIT(0); +struct dentry *ras_get_debugfs_root(void) +{ + return ras_debugfs_dir; +} +EXPORT_SYMBOL_GPL(ras_get_debugfs_root); + int ras_userspace_consumers(void) { return atomic_read(&trace_count); diff --git a/drivers/ras/debugfs.h b/drivers/ras/debugfs.h index c07443b462ad..4749ccdeeba1 100644 --- a/drivers/ras/debugfs.h +++ b/drivers/ras/debugfs.h @@ -4,6 +4,6 @@ #include -extern struct dentry *ras_debugfs_dir; +struct dentry *ras_get_debugfs_root(void); #endif /* __RAS_DEBUGFS_H__ */ -- cgit v1.2.3 From 838850c50884cdd1c96fce1063ef918c394d4bdc Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 1 Mar 2024 08:37:47 -0600 Subject: RAS/AMD/FMPM: Save SPA values The system physical address (SPA) of an error is not a stable value. It will change depending on the location of the memory: parts can be swapped. And it will change depending on memory topology: NUMA nodes and/or interleaving can be adjusted. Therefore, the SPA value is not part of the "FRU Memory Poison" record format. And it will not be saved to persistent storage. However, the SPA values can be helpful during debug and for system admins during run time. Save the SPA values in a separate structure. This is updated when records are restored and when new errors are saved. [ bp: Make error messages more user friendly and add and correct comments. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240301143748.854090-3-yazen.ghannam@amd.com --- drivers/ras/amd/fmpm.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c index 80dd112b720a..8c3188488673 100644 --- a/drivers/ras/amd/fmpm.c +++ b/drivers/ras/amd/fmpm.c @@ -111,6 +111,11 @@ struct fru_rec { */ static struct fru_rec **fru_records; +/* system physical addresses array */ +static u64 *spa_entries; + +#define INVALID_SPA ~0ULL + #define CPER_CREATOR_FMP \ GUID_INIT(0xcd5c2993, 0xf4b2, 0x41b2, 0xb5, 0xd4, 0xf9, 0xc3, \ 0xa0, 0x33, 0x08, 0x75) @@ -120,7 +125,7 @@ static struct fru_rec **fru_records; 0x12, 0x0a, 0x44, 0x58) /** - * DOC: fru_poison_entries (byte) + * DOC: max_nr_entries (byte) * Maximum number of descriptor entries possible for each FRU. * * Values between '1' and '255' are valid. @@ -140,6 +145,9 @@ static unsigned int max_nr_fru; /* Total length of record including headers and list of descriptor entries. */ static size_t max_rec_len; +/* Total number of SPA entries across all FRUs. */ +static unsigned int spa_nr_entries; + /* * Protect the local records cache in fru_records and prevent concurrent * writes to storage. This is only needed after init once notifier block @@ -269,6 +277,54 @@ static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd) return false; } +static void save_spa(struct fru_rec *rec, unsigned int entry, + u64 addr, u64 id, unsigned int cpu) +{ + unsigned int i, fru_idx, spa_entry; + struct atl_err a_err; + unsigned long spa; + + if (entry >= max_nr_entries) { + pr_warn_once("FRU descriptor entry %d out-of-bounds (max: %d)\n", + entry, max_nr_entries); + return; + } + + /* spa_nr_entries is always multiple of max_nr_entries */ + for (i = 0; i < spa_nr_entries; i += max_nr_entries) { + fru_idx = i / max_nr_entries; + if (fru_records[fru_idx] == rec) + break; + } + + if (i >= spa_nr_entries) { + pr_warn_once("FRU record %d not found\n", i); + return; + } + + spa_entry = i + entry; + if (spa_entry >= spa_nr_entries) { + pr_warn_once("spa_entries[] index out-of-bounds\n"); + return; + } + + memset(&a_err, 0, sizeof(struct atl_err)); + + a_err.addr = addr; + a_err.ipid = id; + a_err.cpu = cpu; + + spa = amd_convert_umc_mca_addr_to_sys_addr(&a_err); + if (IS_ERR_VALUE(spa)) { + pr_debug("Failed to get system address\n"); + return; + } + + spa_entries[spa_entry] = spa; + pr_debug("fru_idx: %u, entry: %u, spa_entry: %u, spa: 0x%016llx\n", + fru_idx, entry, spa_entry, spa_entries[spa_entry]); +} + static void update_fru_record(struct fru_rec *rec, struct mce *m) { struct cper_sec_fru_mem_poison *fmp = &rec->fmp; @@ -301,6 +357,7 @@ static void update_fru_record(struct fru_rec *rec, struct mce *m) entry = fmp->nr_entries; save_fpd: + save_spa(rec, entry, m->addr, m->ipid, m->extcpu); fpd_dest = &rec->entries[entry]; memcpy(fpd_dest, &fpd, sizeof(struct cper_fru_poison_desc)); @@ -385,6 +442,7 @@ static void retire_mem_fmp(struct fru_rec *rec) continue; retire_dram_row(fpd->addr, fpd->hw_id, err_cpu); + save_spa(rec, i, fpd->addr, fpd->hw_id, err_cpu); } } @@ -696,6 +754,8 @@ static int get_system_info(void) if (!max_nr_entries) max_nr_entries = FMPM_DEFAULT_MAX_NR_ENTRIES; + spa_nr_entries = max_nr_fru * max_nr_entries; + max_rec_len = sizeof(struct fru_rec); max_rec_len += sizeof(struct cper_fru_poison_desc) * max_nr_entries; @@ -714,6 +774,7 @@ static void free_records(void) kfree(rec); kfree(fru_records); + kfree(spa_entries); } static int allocate_records(void) @@ -734,6 +795,15 @@ static int allocate_records(void) } } + spa_entries = kcalloc(spa_nr_entries, sizeof(u64), GFP_KERNEL); + if (!spa_entries) { + ret = -ENOMEM; + goto out_free; + } + + for (i = 0; i < spa_nr_entries; i++) + spa_entries[i] = INVALID_SPA; + return ret; out_free: -- cgit v1.2.3 From 7d19eea51757ad72faf4b0493e5bde85ca62012e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 1 Mar 2024 08:37:48 -0600 Subject: RAS/AMD/FMPM: Add debugfs interface to print record entries It is helpful to see the saved record entries during run time in human-readable format. This is useful for testing during module development. It can also be used by system admins to quickly and easily see the state of the system. Provide a sequential file in debugfs to print fields of interest from the FRU records and their entries. Don't fail to load the module if the debugfs interface is not available. This is a convenience feature which does not affect other module functionality. The new interface reads the record entries and should hold the mutex. Expand the mutex code comment to clarify when it should be held. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240301143748.854090-4-yazen.ghannam@amd.com --- drivers/ras/amd/fmpm.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c index 8c3188488673..0963c9e7b853 100644 --- a/drivers/ras/amd/fmpm.c +++ b/drivers/ras/amd/fmpm.c @@ -54,6 +54,8 @@ #include #include +#include "../debugfs.h" + #define INVALID_CPU UINT_MAX /* Validation Bits */ @@ -116,6 +118,9 @@ static u64 *spa_entries; #define INVALID_SPA ~0ULL +static struct dentry *fmpm_dfs_dir; +static struct dentry *fmpm_dfs_entries; + #define CPER_CREATOR_FMP \ GUID_INIT(0xcd5c2993, 0xf4b2, 0x41b2, 0xb5, 0xd4, 0xf9, 0xc3, \ 0xa0, 0x33, 0x08, 0x75) @@ -152,6 +157,11 @@ static unsigned int spa_nr_entries; * Protect the local records cache in fru_records and prevent concurrent * writes to storage. This is only needed after init once notifier block * registration is done. + * + * The majority of a record is fixed at module init and will not change + * during run time. The entries within a record will be updated as new + * errors are reported. The mutex should be held whenever the entries are + * accessed during run time. */ static DEFINE_MUTEX(fmpm_update_mutex); @@ -815,6 +825,124 @@ out: return ret; } +static void *fmpm_start(struct seq_file *f, loff_t *pos) +{ + if (*pos >= (spa_nr_entries + 1)) + return NULL; + return pos; +} + +static void *fmpm_next(struct seq_file *f, void *data, loff_t *pos) +{ + if (++(*pos) >= (spa_nr_entries + 1)) + return NULL; + return pos; +} + +static void fmpm_stop(struct seq_file *f, void *data) +{ +} + +#define SHORT_WIDTH 8 +#define U64_WIDTH 18 +#define TIMESTAMP_WIDTH 19 +#define LONG_WIDTH 24 +#define U64_PAD (LONG_WIDTH - U64_WIDTH) +#define TS_PAD (LONG_WIDTH - TIMESTAMP_WIDTH) +static int fmpm_show(struct seq_file *f, void *data) +{ + unsigned int fru_idx, entry, spa_entry, line; + struct cper_fru_poison_desc *fpd; + struct fru_rec *rec; + + line = *(loff_t *)data; + if (line == 0) { + seq_printf(f, "%-*s", SHORT_WIDTH, "fru_idx"); + seq_printf(f, "%-*s", LONG_WIDTH, "fru_id"); + seq_printf(f, "%-*s", SHORT_WIDTH, "entry"); + seq_printf(f, "%-*s", LONG_WIDTH, "timestamp"); + seq_printf(f, "%-*s", LONG_WIDTH, "hw_id"); + seq_printf(f, "%-*s", LONG_WIDTH, "addr"); + seq_printf(f, "%-*s", LONG_WIDTH, "spa"); + goto out_newline; + } + + spa_entry = line - 1; + fru_idx = spa_entry / max_nr_entries; + entry = spa_entry % max_nr_entries; + + rec = fru_records[fru_idx]; + if (!rec) + goto out; + + seq_printf(f, "%-*u", SHORT_WIDTH, fru_idx); + seq_printf(f, "0x%016llx%-*s", rec->fmp.fru_id, U64_PAD, ""); + seq_printf(f, "%-*u", SHORT_WIDTH, entry); + + mutex_lock(&fmpm_update_mutex); + + if (entry >= rec->fmp.nr_entries) { + seq_printf(f, "%-*s", LONG_WIDTH, "*"); + seq_printf(f, "%-*s", LONG_WIDTH, "*"); + seq_printf(f, "%-*s", LONG_WIDTH, "*"); + seq_printf(f, "%-*s", LONG_WIDTH, "*"); + goto out_unlock; + } + + fpd = &rec->entries[entry]; + + seq_printf(f, "%ptT%-*s", &fpd->timestamp, TS_PAD, ""); + seq_printf(f, "0x%016llx%-*s", fpd->hw_id, U64_PAD, ""); + seq_printf(f, "0x%016llx%-*s", fpd->addr, U64_PAD, ""); + + if (spa_entries[spa_entry] == INVALID_SPA) + seq_printf(f, "%-*s", LONG_WIDTH, "*"); + else + seq_printf(f, "0x%016llx%-*s", spa_entries[spa_entry], U64_PAD, ""); + +out_unlock: + mutex_unlock(&fmpm_update_mutex); +out_newline: + seq_putc(f, '\n'); +out: + return 0; +} + +static const struct seq_operations fmpm_seq_ops = { + .start = fmpm_start, + .next = fmpm_next, + .stop = fmpm_stop, + .show = fmpm_show, +}; + +static int fmpm_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fmpm_seq_ops); +} + +static const struct file_operations fmpm_fops = { + .open = fmpm_open, + .release = seq_release, + .read = seq_read, + .llseek = seq_lseek, +}; + +static void setup_debugfs(void) +{ + struct dentry *dfs = ras_get_debugfs_root(); + + if (!dfs) + return; + + fmpm_dfs_dir = debugfs_create_dir("fmpm", dfs); + if (!fmpm_dfs_dir) + return; + + fmpm_dfs_entries = debugfs_create_file("entries", 0400, fmpm_dfs_dir, NULL, &fmpm_fops); + if (!fmpm_dfs_entries) + debugfs_remove(fmpm_dfs_dir); +} + static const struct x86_cpu_id fmpm_cpuids[] = { X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL), { } @@ -856,6 +984,8 @@ static int __init fru_mem_poison_init(void) if (ret) goto out_free; + setup_debugfs(); + retire_mem_records(); mce_register_decode_chain(&fru_mem_poison_nb); @@ -872,6 +1002,7 @@ out: static void __exit fru_mem_poison_exit(void) { mce_unregister_decode_chain(&fru_mem_poison_nb); + debugfs_remove(fmpm_dfs_dir); free_records(); } -- cgit v1.2.3 From bd17b7c34fadef645becde1245b9394f69f31702 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Mar 2024 08:30:46 +0300 Subject: RAS/AMD/FMPM: Fix off by one when unwinding on error Decrement the index variable i before the first iteration when freeing the remaining elements on error. Depending on where this fails it could free something from one element beyond the end of the fru_records[] array. [ bp: Massage commit message. ] Fixes: 6f15e617cc99 ("RAS: Introduce a FRU memory poison manager") Signed-off-by: Dan Carpenter Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/6fdec71a-846b-4cd0-af69-e5f6cd12f4f6@moroto.mountain --- drivers/ras/amd/fmpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c index 0963c9e7b853..2f4ac9591c8f 100644 --- a/drivers/ras/amd/fmpm.c +++ b/drivers/ras/amd/fmpm.c @@ -817,7 +817,7 @@ static int allocate_records(void) return ret; out_free: - for (; i >= 0; i--) + while (--i >= 0) kfree(fru_records[i]); kfree(fru_records); -- cgit v1.2.3 From 4527a2194e7c2f88e940f9071084daa307ce08af Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 8 Mar 2024 09:51:06 +0100 Subject: EDAC/versal: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve this, there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shubhrajyoti Datta Link: https://lore.kernel.org/r/83deca1ce260f7e17ff3cb106c9a6946d4ca4505.1709886922.git.u.kleine-koenig@pengutronix.de --- drivers/edac/versal_edac.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/edac/versal_edac.c b/drivers/edac/versal_edac.c index 3016870689f1..1688a5050f63 100644 --- a/drivers/edac/versal_edac.c +++ b/drivers/edac/versal_edac.c @@ -1160,7 +1160,7 @@ free_edac_mc: return rc; } -static int mc_remove(struct platform_device *pdev) +static void mc_remove(struct platform_device *pdev) { struct mem_ctl_info *mci = platform_get_drvdata(pdev); struct edac_priv *priv = mci->pvt_info; @@ -1178,8 +1178,6 @@ static int mc_remove(struct platform_device *pdev) XPM_EVENT_ERROR_MASK_DDRMC_NCR, err_callback, mci); edac_mc_del_mc(&pdev->dev); edac_mc_free(mci); - - return 0; } static struct platform_driver xilinx_ddr_edac_mc_driver = { @@ -1188,7 +1186,7 @@ static struct platform_driver xilinx_ddr_edac_mc_driver = { .of_match_table = xlnx_edac_match, }, .probe = mc_probe, - .remove = mc_remove, + .remove_new = mc_remove, }; module_platform_driver(xilinx_ddr_edac_mc_driver); -- cgit v1.2.3