summaryrefslogtreecommitdiff
path: root/drivers/accel/amdxdna/aie2_error.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/accel/amdxdna/aie2_error.c')
-rw-r--r--drivers/accel/amdxdna/aie2_error.c358
1 files changed, 358 insertions, 0 deletions
diff --git a/drivers/accel/amdxdna/aie2_error.c b/drivers/accel/amdxdna/aie2_error.c
new file mode 100644
index 000000000000..5ee905632a39
--- /dev/null
+++ b/drivers/accel/amdxdna/aie2_error.c
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+ */
+
+#include <drm/drm_cache.h>
+#include <drm/drm_device.h>
+#include <drm/drm_print.h>
+#include <drm/gpu_scheduler.h>
+#include <linux/dma-mapping.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+
+#include "aie2_msg_priv.h"
+#include "aie2_pci.h"
+#include "amdxdna_mailbox.h"
+#include "amdxdna_pci_drv.h"
+
+struct async_event {
+ struct amdxdna_dev_hdl *ndev;
+ struct async_event_msg_resp resp;
+ struct workqueue_struct *wq;
+ struct work_struct work;
+ u8 *buf;
+ dma_addr_t addr;
+ u32 size;
+};
+
+struct async_events {
+ struct workqueue_struct *wq;
+ u8 *buf;
+ dma_addr_t addr;
+ u32 size;
+ u32 event_cnt;
+ struct async_event event[] __counted_by(event_cnt);
+};
+
+/*
+ * Below enum, struct and lookup tables are porting from XAIE util header file.
+ *
+ * Below data is defined by AIE device and it is used for decode error message
+ * from the device.
+ */
+
+enum aie_module_type {
+ AIE_MEM_MOD = 0,
+ AIE_CORE_MOD,
+ AIE_PL_MOD,
+};
+
+enum aie_error_category {
+ AIE_ERROR_SATURATION = 0,
+ AIE_ERROR_FP,
+ AIE_ERROR_STREAM,
+ AIE_ERROR_ACCESS,
+ AIE_ERROR_BUS,
+ AIE_ERROR_INSTRUCTION,
+ AIE_ERROR_ECC,
+ AIE_ERROR_LOCK,
+ AIE_ERROR_DMA,
+ AIE_ERROR_MEM_PARITY,
+ /* Unknown is not from XAIE, added for better category */
+ AIE_ERROR_UNKNOWN,
+};
+
+/* Don't pack, unless XAIE side changed */
+struct aie_error {
+ __u8 row;
+ __u8 col;
+ __u32 mod_type;
+ __u8 event_id;
+};
+
+struct aie_err_info {
+ u32 err_cnt;
+ u32 ret_code;
+ u32 rsvd;
+ struct aie_error payload[] __counted_by(err_cnt);
+};
+
+struct aie_event_category {
+ u8 event_id;
+ enum aie_error_category category;
+};
+
+#define EVENT_CATEGORY(id, cat) { id, cat }
+static const struct aie_event_category aie_ml_mem_event_cat[] = {
+ EVENT_CATEGORY(88U, AIE_ERROR_ECC),
+ EVENT_CATEGORY(90U, AIE_ERROR_ECC),
+ EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY),
+ EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY),
+ EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY),
+ EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY),
+ EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY),
+ EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY),
+ EVENT_CATEGORY(97U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(98U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(99U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(100U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
+};
+
+static const struct aie_event_category aie_ml_core_event_cat[] = {
+ EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
+ EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(58U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
+ EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
+ EVENT_CATEGORY(62U, AIE_ERROR_ECC),
+ EVENT_CATEGORY(64U, AIE_ERROR_ECC),
+ EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
+ EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
+ EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
+ EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
+ EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(72U, AIE_ERROR_BUS),
+};
+
+static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
+ EVENT_CATEGORY(130U, AIE_ERROR_ECC),
+ EVENT_CATEGORY(132U, AIE_ERROR_ECC),
+ EVENT_CATEGORY(133U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(134U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(138U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
+};
+
+static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
+ EVENT_CATEGORY(64U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
+ EVENT_CATEGORY(67U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(68U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(69U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(70U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(71U, AIE_ERROR_BUS),
+ EVENT_CATEGORY(72U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(73U, AIE_ERROR_DMA),
+ EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
+};
+
+static enum aie_error_category
+aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
+{
+ const struct aie_event_category *lut;
+ int num_entry;
+ int i;
+
+ switch (mod_type) {
+ case AIE_PL_MOD:
+ lut = aie_ml_shim_tile_event_cat;
+ num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
+ break;
+ case AIE_CORE_MOD:
+ lut = aie_ml_core_event_cat;
+ num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
+ break;
+ case AIE_MEM_MOD:
+ if (row == 1) {
+ lut = aie_ml_mem_tile_event_cat;
+ num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
+ } else {
+ lut = aie_ml_mem_event_cat;
+ num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
+ }
+ break;
+ default:
+ return AIE_ERROR_UNKNOWN;
+ }
+
+ for (i = 0; i < num_entry; i++) {
+ if (event_id != lut[i].event_id)
+ continue;
+
+ return lut[i].category;
+ }
+
+ return AIE_ERROR_UNKNOWN;
+}
+
+static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
+{
+ struct aie_error *errs = err_info;
+ u32 err_col = 0; /* assume that AIE has less than 32 columns */
+ int i;
+
+ /* Get err column bitmap */
+ for (i = 0; i < num_err; i++) {
+ struct aie_error *err = &errs[i];
+ enum aie_error_category cat;
+
+ cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
+ XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
+ err->row, err->col, err->mod_type,
+ err->event_id, cat);
+
+ if (err->col >= 32) {
+ XDNA_WARN(ndev->xdna, "Invalid column number");
+ break;
+ }
+
+ err_col |= (1 << err->col);
+ }
+
+ return err_col;
+}
+
+static int aie2_error_async_cb(void *handle, void __iomem *data, size_t size)
+{
+ struct async_event *e = handle;
+
+ if (data) {
+ e->resp.type = readl(data + offsetof(struct async_event_msg_resp, type));
+ wmb(); /* Update status in the end, so that no lock for here */
+ e->resp.status = readl(data + offsetof(struct async_event_msg_resp, status));
+ }
+ queue_work(e->wq, &e->work);
+ return 0;
+}
+
+static int aie2_error_event_send(struct async_event *e)
+{
+ drm_clflush_virt_range(e->buf, e->size); /* device can access */
+ return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
+ aie2_error_async_cb);
+}
+
+static void aie2_error_worker(struct work_struct *err_work)
+{
+ struct aie_err_info *info;
+ struct amdxdna_dev *xdna;
+ struct async_event *e;
+ u32 max_err;
+ u32 err_col;
+
+ e = container_of(err_work, struct async_event, work);
+
+ xdna = e->ndev->xdna;
+
+ if (e->resp.status == MAX_AIE2_STATUS_CODE)
+ return;
+
+ e->resp.status = MAX_AIE2_STATUS_CODE;
+
+ print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
+ e->buf, 0x100, false);
+
+ info = (struct aie_err_info *)e->buf;
+ XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);
+
+ max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
+ if (unlikely(info->err_cnt > max_err)) {
+ WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
+ return;
+ }
+ err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
+ if (!err_col) {
+ XDNA_WARN(xdna, "Did not get error column");
+ return;
+ }
+
+ mutex_lock(&xdna->dev_lock);
+ /* Re-sent this event to firmware */
+ if (aie2_error_event_send(e))
+ XDNA_WARN(xdna, "Unable to register async event");
+ mutex_unlock(&xdna->dev_lock);
+}
+
+int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
+{
+ struct amdxdna_dev *xdna = ndev->xdna;
+ struct async_event *e;
+ int i, ret;
+
+ drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
+ for (i = 0; i < ndev->async_events->event_cnt; i++) {
+ e = &ndev->async_events->event[i];
+ ret = aie2_error_event_send(e);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
+{
+ struct amdxdna_dev *xdna = ndev->xdna;
+ struct async_events *events;
+
+ events = ndev->async_events;
+
+ mutex_unlock(&xdna->dev_lock);
+ destroy_workqueue(events->wq);
+ mutex_lock(&xdna->dev_lock);
+
+ dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
+ events->addr, DMA_FROM_DEVICE);
+ kfree(events);
+}
+
+int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
+{
+ struct amdxdna_dev *xdna = ndev->xdna;
+ u32 total_col = ndev->total_col;
+ u32 total_size = ASYNC_BUF_SIZE * total_col;
+ struct async_events *events;
+ int i, ret;
+
+ events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL);
+ if (!events)
+ return -ENOMEM;
+
+ events->buf = dma_alloc_noncoherent(xdna->ddev.dev, total_size, &events->addr,
+ DMA_FROM_DEVICE, GFP_KERNEL);
+ if (!events->buf) {
+ ret = -ENOMEM;
+ goto free_events;
+ }
+ events->size = total_size;
+ events->event_cnt = total_col;
+
+ events->wq = alloc_ordered_workqueue("async_wq", 0);
+ if (!events->wq) {
+ ret = -ENOMEM;
+ goto free_buf;
+ }
+
+ for (i = 0; i < events->event_cnt; i++) {
+ struct async_event *e = &events->event[i];
+ u32 offset = i * ASYNC_BUF_SIZE;
+
+ e->ndev = ndev;
+ e->wq = events->wq;
+ e->buf = &events->buf[offset];
+ e->addr = events->addr + offset;
+ e->size = ASYNC_BUF_SIZE;
+ e->resp.status = MAX_AIE2_STATUS_CODE;
+ INIT_WORK(&e->work, aie2_error_worker);
+ }
+
+ ndev->async_events = events;
+
+ XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
+ events->event_cnt, events->size);
+ return 0;
+
+free_buf:
+ dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
+ events->addr, DMA_FROM_DEVICE);
+free_events:
+ kfree(events);
+ return ret;
+}