diff options
author | Avadhut Naik <avadhut.naik@amd.com> | 2024-10-22 19:36:29 +0000 |
---|---|---|
committer | Borislav Petkov (AMD) <bp@alien8.de> | 2024-10-31 10:36:07 +0100 |
commit | d4fca1358ea9096f2f6ed942e2cb3a820073dfc1 (patch) | |
tree | f4832caf73d7728455cc663fd86ef1186701f564 | |
parent | e52750fb1458ae9ea5860a08ed7a149185bc5b97 (diff) | |
download | lwn-d4fca1358ea9096f2f6ed942e2cb3a820073dfc1.tar.gz lwn-d4fca1358ea9096f2f6ed942e2cb3a820073dfc1.zip |
x86/MCE/AMD: Add support for new MCA_SYND{1,2} registers
Starting with Zen4, AMD's Scalable MCA systems incorporate two new registers:
MCA_SYND1 and MCA_SYND2.
These registers will include supplemental error information in addition to the
existing MCA_SYND register. The data within these registers is considered
valid if MCA_STATUS[SyndV] is set.
Userspace error decoding tools like rasdaemon gather related hardware error
information through the tracepoints.
Therefore, export these two registers through the mce_record tracepoint so
that tools like rasdaemon can parse them and output the supplemental error
information like FRU text contained in them.
[ bp: Massage. ]
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Link: https://lore.kernel.org/r/20241022194158.110073-4-avadhut.naik@amd.com
-rw-r--r-- | arch/x86/include/asm/mce.h | 21 | ||||
-rw-r--r-- | arch/x86/include/uapi/asm/mce.h | 3 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/amd.c | 5 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/core.c | 9 | ||||
-rw-r--r-- | drivers/edac/mce_amd.c | 8 | ||||
-rw-r--r-- | include/trace/events/mce.h | 7 |
6 files changed, 46 insertions, 7 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 4e45f45673a3..4d936ee20e24 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -122,6 +122,9 @@ #define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008 #define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009 #define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a +/* Registers MISC2 to MISC4 are at offsets B to D. */ +#define MSR_AMD64_SMCA_MC0_SYND1 0xc000200e +#define MSR_AMD64_SMCA_MC0_SYND2 0xc000200f #define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x)) @@ -132,6 +135,8 @@ #define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) +#define MSR_AMD64_SMCA_MCx_SYND1(x) (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_SYND2(x) (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x)) #define XEC(x, mask) (((x) >> 16) & mask) @@ -190,9 +195,25 @@ enum mce_notifier_prios { /** * struct mce_hw_err - Hardware Error Record. * @m: Machine Check record. + * @vendor: Vendor-specific error information. + * + * Vendor-specific fields should not be added to struct mce. Instead, vendors + * should export their vendor-specific data through their structure in the + * vendor union below. + * + * AMD's vendor data is parsed by error decoding tools for supplemental error + * information. Thus, current offsets of existing fields must be maintained. + * Only add new fields at the end of AMD's vendor structure. */ struct mce_hw_err { struct mce m; + + union vendor_info { + struct { + u64 synd1; /* MCA_SYND1 MSR */ + u64 synd2; /* MCA_SYND2 MSR */ + } amd; + } vendor; }; #define to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m) diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index db9adc081c5a..cb6b48a7c22b 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -8,7 +8,8 @@ /* * Fields are zero when not available. Also, this struct is shared with * userspace mcelog and thus must keep existing fields at current offsets. - * Only add new fields to the end of the structure + * Only add new, shared fields to the end of the structure. + * Do not add vendor-specific fields. */ struct mce { __u64 status; /* Bank's MCi_STATUS MSR */ diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 5b4d266500b2..6ca80fff1fea 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -797,8 +797,11 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) if (mce_flags.smca) { rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); - if (m->status & MCI_STATUS_SYNDV) + if (m->status & MCI_STATUS_SYNDV) { rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); + rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); + rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); + } } mce_log(&err); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 28e28b69d84d..7fb5556a0b53 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -202,6 +202,10 @@ static void __print_mce(struct mce_hw_err *err) if (mce_flags.smca) { if (m->synd) pr_cont("SYND %llx ", m->synd); + if (err->vendor.amd.synd1) + pr_cont("SYND1 %llx ", err->vendor.amd.synd1); + if (err->vendor.amd.synd2) + pr_cont("SYND2 %llx ", err->vendor.amd.synd2); if (m->ipid) pr_cont("IPID %llx ", m->ipid); } @@ -678,8 +682,11 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) if (mce_flags.smca) { m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); - if (m->status & MCI_STATUS_SYNDV) + if (m->status & MCI_STATUS_SYNDV) { m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); + err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i)); + err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i)); + } } } diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 8130c3dc64da..194d9fd47d20 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -793,6 +793,7 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(m); unsigned int fam = x86_family(m->cpuid); int ecc; @@ -850,8 +851,11 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (boot_cpu_has(X86_FEATURE_SMCA)) { pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid); - if (m->status & MCI_STATUS_SYNDV) - pr_cont(", Syndrome: 0x%016llx", m->synd); + if (m->status & MCI_STATUS_SYNDV) { + pr_cont(", Syndrome: 0x%016llx\n", m->synd); + pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx", + err->vendor.amd.synd1, err->vendor.amd.synd2); + } pr_cont("\n"); diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h index 65aba1afcd07..c1c50df9ecfd 100644 --- a/include/trace/events/mce.h +++ b/include/trace/events/mce.h @@ -43,6 +43,7 @@ TRACE_EVENT(mce_record, __field( u8, bank ) __field( u8, cpuvendor ) __field( u32, microcode ) + __dynamic_array(u8, v_data, sizeof(err->vendor)) ), TP_fast_assign( @@ -65,9 +66,10 @@ TRACE_EVENT(mce_record, __entry->bank = err->m.bank; __entry->cpuvendor = err->m.cpuvendor; __entry->microcode = err->m.microcode; + memcpy(__get_dynamic_array(v_data), &err->vendor, sizeof(err->vendor)); ), - TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x", + TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016llx, IPID: %016llx, ADDR: %016llx, MISC: %016llx, SYND: %016llx, RIP: %02x:<%016llx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x, vendor data: %s", __entry->cpu, __entry->mcgcap, __entry->mcgstatus, __entry->bank, __entry->status, @@ -83,7 +85,8 @@ TRACE_EVENT(mce_record, __entry->walltime, __entry->socketid, __entry->apicid, - __entry->microcode) + __entry->microcode, + __print_dynamic_array(v_data, sizeof(u8))) ); #endif /* _TRACE_MCE_H */ |