diff options
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/cxl/core/Makefile | 3 | ||||
-rw-r--r-- | drivers/cxl/core/pci.c | 112 | ||||
-rw-r--r-- | drivers/cxl/core/trace.c | 5 | ||||
-rw-r--r-- | drivers/cxl/core/trace.h | 109 | ||||
-rw-r--r-- | drivers/cxl/cxl.h | 2 | ||||
-rw-r--r-- | drivers/cxl/cxlpci.h | 3 | ||||
-rw-r--r-- | drivers/cxl/pci.c | 111 |
7 files changed, 234 insertions, 111 deletions
diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 79c7257f4107..ca4ae31d8f57 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -3,6 +3,8 @@ obj-$(CONFIG_CXL_BUS) += cxl_core.o obj-$(CONFIG_CXL_SUSPEND) += suspend.o ccflags-y += -I$(srctree)/drivers/cxl +CFLAGS_trace.o = -DTRACE_INCLUDE_PATH=. -I$(src) + cxl_core-y := port.o cxl_core-y += pmem.o cxl_core-y += regs.o @@ -10,4 +12,5 @@ cxl_core-y += memdev.o cxl_core-y += mbox.o cxl_core-y += pci.o cxl_core-y += hdm.o +cxl_core-$(CONFIG_TRACING) += trace.o cxl_core-$(CONFIG_CXL_REGION) += region.o diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 57764e9cd19d..1d1492440287 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -9,6 +9,7 @@ #include <cxlmem.h> #include <cxl.h> #include "core.h" +#include "trace.h" /** * DOC: cxl core pci @@ -622,3 +623,114 @@ void read_cdat_data(struct cxl_port *port) } } EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL); + +void cxl_cor_error_detected(struct pci_dev *pdev) +{ + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct cxl_memdev *cxlmd = cxlds->cxlmd; + struct device *dev = &cxlmd->dev; + void __iomem *addr; + u32 status; + + if (!cxlds->regs.ras) + return; + + addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET; + status = readl(addr); + if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); + trace_cxl_aer_correctable_error(dev, status); + } +} +EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL); + +/* CXL spec rev3.0 8.2.4.16.1 */ +static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log) +{ + void __iomem *addr; + u32 *log_addr; + int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); + + addr = cxlds->regs.ras + CXL_RAS_HEADER_LOG_OFFSET; + log_addr = log; + + for (i = 0; i < log_u32_size; i++) { + *log_addr = readl(addr); + log_addr++; + addr += sizeof(u32); + } +} + +/* + * Log the state of the RAS status registers and prepare them to log the + * next error status. Return 1 if reset needed. + */ +static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) +{ + struct cxl_memdev *cxlmd = cxlds->cxlmd; + struct device *dev = &cxlmd->dev; + u32 hl[CXL_HEADERLOG_SIZE_U32]; + void __iomem *addr; + u32 status; + u32 fe; + + if (!cxlds->regs.ras) + return false; + + addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; + status = readl(addr); + if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) + return false; + + /* If multiple errors, log header points to first error from ctrl reg */ + if (hweight32(status) > 1) { + addr = cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET; + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, readl(addr))); + } else { + fe = status; + } + + header_log_copy(cxlds, hl); + trace_cxl_aer_uncorrectable_error(dev, status, fe, hl); + writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); + + return true; +} + +pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct cxl_memdev *cxlmd = cxlds->cxlmd; + struct device *dev = &cxlmd->dev; + bool ue; + + /* + * A frozen channel indicates an impending reset which is fatal to + * CXL.mem operation, and will likely crash the system. On the off + * chance the situation is recoverable dump the status of the RAS + * capability registers and bounce the active state of the memdev. + */ + ue = cxl_report_and_clear(cxlds); + + switch (state) { + case pci_channel_io_normal: + if (ue) { + device_release_driver(dev); + return PCI_ERS_RESULT_NEED_RESET; + } + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + dev_warn(&pdev->dev, + "%s: frozen state error detected, disable CXL.mem\n", + dev_name(dev)); + device_release_driver(dev); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + dev_warn(&pdev->dev, + "failure state error detected, request disconnect\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; +} +EXPORT_SYMBOL_NS_GPL(cxl_error_detected, CXL); diff --git a/drivers/cxl/core/trace.c b/drivers/cxl/core/trace.c new file mode 100644 index 000000000000..29ae7ce81dc5 --- /dev/null +++ b/drivers/cxl/core/trace.c @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h new file mode 100644 index 000000000000..20ca2fe2ca8e --- /dev/null +++ b/drivers/cxl/core/trace.h @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cxl + +#if !defined(_CXL_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _CXL_EVENTS_H + +#include <cxl.h> +#include <linux/tracepoint.h> + +#define CXL_RAS_UC_CACHE_DATA_PARITY BIT(0) +#define CXL_RAS_UC_CACHE_ADDR_PARITY BIT(1) +#define CXL_RAS_UC_CACHE_BE_PARITY BIT(2) +#define CXL_RAS_UC_CACHE_DATA_ECC BIT(3) +#define CXL_RAS_UC_MEM_DATA_PARITY BIT(4) +#define CXL_RAS_UC_MEM_ADDR_PARITY BIT(5) +#define CXL_RAS_UC_MEM_BE_PARITY BIT(6) +#define CXL_RAS_UC_MEM_DATA_ECC BIT(7) +#define CXL_RAS_UC_REINIT_THRESH BIT(8) +#define CXL_RAS_UC_RSVD_ENCODE BIT(9) +#define CXL_RAS_UC_POISON BIT(10) +#define CXL_RAS_UC_RECV_OVERFLOW BIT(11) +#define CXL_RAS_UC_INTERNAL_ERR BIT(14) +#define CXL_RAS_UC_IDE_TX_ERR BIT(15) +#define CXL_RAS_UC_IDE_RX_ERR BIT(16) + +#define show_uc_errs(status) __print_flags(status, " | ", \ + { CXL_RAS_UC_CACHE_DATA_PARITY, "Cache Data Parity Error" }, \ + { CXL_RAS_UC_CACHE_ADDR_PARITY, "Cache Address Parity Error" }, \ + { CXL_RAS_UC_CACHE_BE_PARITY, "Cache Byte Enable Parity Error" }, \ + { CXL_RAS_UC_CACHE_DATA_ECC, "Cache Data ECC Error" }, \ + { CXL_RAS_UC_MEM_DATA_PARITY, "Memory Data Parity Error" }, \ + { CXL_RAS_UC_MEM_ADDR_PARITY, "Memory Address Parity Error" }, \ + { CXL_RAS_UC_MEM_BE_PARITY, "Memory Byte Enable Parity Error" }, \ + { CXL_RAS_UC_MEM_DATA_ECC, "Memory Data ECC Error" }, \ + { CXL_RAS_UC_REINIT_THRESH, "REINIT Threshold Hit" }, \ + { CXL_RAS_UC_RSVD_ENCODE, "Received Unrecognized Encoding" }, \ + { CXL_RAS_UC_POISON, "Received Poison From Peer" }, \ + { CXL_RAS_UC_RECV_OVERFLOW, "Receiver Overflow" }, \ + { CXL_RAS_UC_INTERNAL_ERR, "Component Specific Error" }, \ + { CXL_RAS_UC_IDE_TX_ERR, "IDE Tx Error" }, \ + { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \ +) + +TRACE_EVENT(cxl_aer_uncorrectable_error, + TP_PROTO(const struct device *dev, u32 status, u32 fe, u32 *hl), + TP_ARGS(dev, status, fe, hl), + TP_STRUCT__entry( + __string(dev_name, dev_name(dev)) + __field(u32, status) + __field(u32, first_error) + __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) + ), + TP_fast_assign( + __assign_str(dev_name, dev_name(dev)); + __entry->status = status; + __entry->first_error = fe; + /* + * Embed the 512B headerlog data for user app retrieval and + * parsing, but no need to print this in the trace buffer. + */ + memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); + ), + TP_printk("%s: status: '%s' first_error: '%s'", + __get_str(dev_name), + show_uc_errs(__entry->status), + show_uc_errs(__entry->first_error) + ) +); + +#define CXL_RAS_CE_CACHE_DATA_ECC BIT(0) +#define CXL_RAS_CE_MEM_DATA_ECC BIT(1) +#define CXL_RAS_CE_CRC_THRESH BIT(2) +#define CLX_RAS_CE_RETRY_THRESH BIT(3) +#define CXL_RAS_CE_CACHE_POISON BIT(4) +#define CXL_RAS_CE_MEM_POISON BIT(5) +#define CXL_RAS_CE_PHYS_LAYER_ERR BIT(6) + +#define show_ce_errs(status) __print_flags(status, " | ", \ + { CXL_RAS_CE_CACHE_DATA_ECC, "Cache Data ECC Error" }, \ + { CXL_RAS_CE_MEM_DATA_ECC, "Memory Data ECC Error" }, \ + { CXL_RAS_CE_CRC_THRESH, "CRC Threshold Hit" }, \ + { CLX_RAS_CE_RETRY_THRESH, "Retry Threshold" }, \ + { CXL_RAS_CE_CACHE_POISON, "Received Cache Poison From Peer" }, \ + { CXL_RAS_CE_MEM_POISON, "Received Memory Poison From Peer" }, \ + { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \ +) + +TRACE_EVENT(cxl_aer_correctable_error, + TP_PROTO(const struct device *dev, u32 status), + TP_ARGS(dev, status), + TP_STRUCT__entry( + __string(dev_name, dev_name(dev)) + __field(u32, status) + ), + TP_fast_assign( + __assign_str(dev_name, dev_name(dev)); + __entry->status = status; + ), + TP_printk("%s: status: '%s'", + __get_str(dev_name), show_ce_errs(__entry->status) + ) +); + +#endif /* _CXL_EVENTS_H */ + +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 1b1cf459ac77..aa3af3bb73b2 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -140,6 +140,8 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw) #define CXL_RAS_CAP_CONTROL_FE_MASK GENMASK(5, 0) #define CXL_RAS_HEADER_LOG_OFFSET 0x18 #define CXL_RAS_CAPABILITY_LENGTH 0x58 +#define CXL_HEADERLOG_SIZE SZ_512 +#define CXL_HEADERLOG_SIZE_U32 SZ_512 / sizeof(u32) /* CXL 2.0 8.2.8.1 Device Capabilities Array Register */ #define CXLDEV_CAP_ARRAY_OFFSET 0x0 diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 920909791bb9..77dbdb980b12 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -66,4 +66,7 @@ int devm_cxl_port_enumerate_dports(struct cxl_port *port); struct cxl_dev_state; int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm); void read_cdat_data(struct cxl_port *port); +void cxl_cor_error_detected(struct pci_dev *pdev); +pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, + pci_channel_state_t state); #endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 33083a522fd1..3a66aadb4df0 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -14,8 +14,6 @@ #include "cxlmem.h" #include "cxlpci.h" #include "cxl.h" -#define CREATE_TRACE_POINTS -#include <trace/events/cxl.h> /** * DOC: cxl pci @@ -514,96 +512,6 @@ static const struct pci_device_id cxl_mem_pci_tbl[] = { }; MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl); -/* CXL spec rev3.0 8.2.4.16.1 */ -static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log) -{ - void __iomem *addr; - u32 *log_addr; - int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); - - addr = cxlds->regs.ras + CXL_RAS_HEADER_LOG_OFFSET; - log_addr = log; - - for (i = 0; i < log_u32_size; i++) { - *log_addr = readl(addr); - log_addr++; - addr += sizeof(u32); - } -} - -/* - * Log the state of the RAS status registers and prepare them to log the - * next error status. Return 1 if reset needed. - */ -static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) -{ - struct cxl_memdev *cxlmd = cxlds->cxlmd; - struct device *dev = &cxlmd->dev; - u32 hl[CXL_HEADERLOG_SIZE_U32]; - void __iomem *addr; - u32 status; - u32 fe; - - if (!cxlds->regs.ras) - return false; - - addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; - status = readl(addr); - if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) - return false; - - /* If multiple errors, log header points to first error from ctrl reg */ - if (hweight32(status) > 1) { - addr = cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET; - fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, readl(addr))); - } else { - fe = status; - } - - header_log_copy(cxlds, hl); - trace_cxl_aer_uncorrectable_error(dev, status, fe, hl); - writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); - - return true; -} - -static pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, - pci_channel_state_t state) -{ - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); - struct cxl_memdev *cxlmd = cxlds->cxlmd; - struct device *dev = &cxlmd->dev; - bool ue; - - /* - * A frozen channel indicates an impending reset which is fatal to - * CXL.mem operation, and will likely crash the system. On the off - * chance the situation is recoverable dump the status of the RAS - * capability registers and bounce the active state of the memdev. - */ - ue = cxl_report_and_clear(cxlds); - - switch (state) { - case pci_channel_io_normal: - if (ue) { - device_release_driver(dev); - return PCI_ERS_RESULT_NEED_RESET; - } - return PCI_ERS_RESULT_CAN_RECOVER; - case pci_channel_io_frozen: - dev_warn(&pdev->dev, - "%s: frozen state error detected, disable CXL.mem\n", - dev_name(dev)); - device_release_driver(dev); - return PCI_ERS_RESULT_NEED_RESET; - case pci_channel_io_perm_failure: - dev_warn(&pdev->dev, - "failure state error detected, request disconnect\n"); - return PCI_ERS_RESULT_DISCONNECT; - } - return PCI_ERS_RESULT_NEED_RESET; -} - static pci_ers_result_t cxl_slot_reset(struct pci_dev *pdev) { struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); @@ -628,25 +536,6 @@ static void cxl_error_resume(struct pci_dev *pdev) dev->driver ? "successful" : "failed"); } -static void cxl_cor_error_detected(struct pci_dev *pdev) -{ - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); - struct cxl_memdev *cxlmd = cxlds->cxlmd; - struct device *dev = &cxlmd->dev; - void __iomem *addr; - u32 status; - - if (!cxlds->regs.ras) - return; - - addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET; - status = readl(addr); - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); - trace_cxl_aer_correctable_error(dev, status); - } -} - static const struct pci_error_handlers cxl_error_handlers = { .error_detected = cxl_error_detected, .slot_reset = cxl_slot_reset, |