// SPDX-License-Identifier: MIT /* * Copyright © 2026 Intel Corporation */ #include #include #include #include #include #include #include "drm_ras_nl.h" /** * DOC: DRM RAS Node Management * * This module provides the infrastructure to manage RAS (Reliability, * Availability, and Serviceability) nodes for DRM drivers. Each * DRM driver may register one or more RAS nodes, which represent * logical components capable of reporting error counters and other * reliability metrics. * * The nodes are stored in a global xarray `drm_ras_xa` to allow * efficient lookup by ID. Nodes can be registered or unregistered * dynamically at runtime. * * A Generic Netlink family `drm_ras` exposes two main operations to * userspace: * * 1. LIST_NODES: Dump all currently registered RAS nodes. * The user receives an array of node IDs, names, and types. * * 2. GET_ERROR_COUNTER: Get error counters of a given node. * Userspace must provide Node ID, Error ID (Optional for specific counter). * Returns all counters of a node if only Node ID is provided or specific * error counters. * * Node registration: * * - drm_ras_node_register(): Registers a new node and assigns * it a unique ID in the xarray. * - drm_ras_node_unregister(): Removes a previously registered * node from the xarray. * * Node type: * * - ERROR_COUNTER: * + Currently, only error counters are supported. * + The driver must implement the query_error_counter() callback to provide * the name and the value of the error counter. * + The driver must provide a error_counter_range.last value informing the * last valid error ID. * + The driver can provide a error_counter_range.first value informing the * first valid error ID. * + The error counters in the driver doesn't need to be contiguous, but the * driver must return -ENOENT to the query_error_counter as an indication * that the ID should be skipped and not listed in the netlink API. * * Netlink handlers: * * - drm_ras_nl_list_nodes_dumpit(): Implements the LIST_NODES * operation, iterating over the xarray. * - drm_ras_nl_get_error_counter_dumpit(): Implements the GET_ERROR_COUNTER dumpit * operation, fetching all counters from a specific node. * - drm_ras_nl_get_error_counter_doit(): Implements the GET_ERROR_COUNTER doit * operation, fetching a counter value from a specific node. */ static DEFINE_XARRAY_ALLOC(drm_ras_xa); /* * The netlink callback context carries dump state across multiple dumpit calls */ struct drm_ras_ctx { /* Which xarray id to restart the dump from */ unsigned long restart; }; /** * drm_ras_nl_list_nodes_dumpit() - Dump all registered RAS nodes * @skb: Netlink message buffer * @cb: Callback context for multi-part dumps * * Iterates over all registered RAS nodes in the global xarray and appends * their attributes (ID, name, type) to the given netlink message buffer. * Uses @cb->ctx to track progress in case the message buffer fills up, allowing * multi-part dump support. On buffer overflow, updates the context to resume * from the last node on the next invocation. * * Return: 0 if all nodes fit in @skb, number of bytes added to @skb if * the buffer filled up (requires multi-part continuation), or * a negative error code on failure. */ int drm_ras_nl_list_nodes_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_info *info = genl_info_dump(cb); struct drm_ras_ctx *ctx = (void *)cb->ctx; struct drm_ras_node *node; struct nlattr *hdr; unsigned long id; int ret; xa_for_each_start(&drm_ras_xa, id, node, ctx->restart) { hdr = genlmsg_iput(skb, info); if (!hdr) { ret = -EMSGSIZE; break; } ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_ID, node->id); if (ret) { genlmsg_cancel(skb, hdr); break; } ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_DEVICE_NAME, node->device_name); if (ret) { genlmsg_cancel(skb, hdr); break; } ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_NODE_NAME, node->node_name); if (ret) { genlmsg_cancel(skb, hdr); break; } ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_TYPE, node->type); if (ret) { genlmsg_cancel(skb, hdr); break; } genlmsg_end(skb, hdr); } if (ret == -EMSGSIZE) ctx->restart = id; return ret; } static int get_node_error_counter(u32 node_id, u32 error_id, const char **name, u32 *value) { struct drm_ras_node *node; node = xa_load(&drm_ras_xa, node_id); if (!node || !node->query_error_counter) return -ENOENT; if (error_id < node->error_counter_range.first || error_id > node->error_counter_range.last) return -EINVAL; return node->query_error_counter(node, error_id, name, value); } static int msg_reply_value(struct sk_buff *msg, u32 error_id, const char *error_name, u32 value) { int ret; ret = nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, error_id); if (ret) return ret; ret = nla_put_string(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_NAME, error_name); if (ret) return ret; return nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_VALUE, value); } static int doit_reply_value(struct genl_info *info, u32 node_id, u32 error_id) { struct sk_buff *msg; struct nlattr *hdr; const char *error_name; u32 value; int ret; msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_iput(msg, info); if (!hdr) { nlmsg_free(msg); return -EMSGSIZE; } ret = get_node_error_counter(node_id, error_id, &error_name, &value); if (ret) return ret; ret = msg_reply_value(msg, error_id, error_name, value); if (ret) { genlmsg_cancel(msg, hdr); nlmsg_free(msg); return ret; } genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); } /** * drm_ras_nl_get_error_counter_dumpit() - Dump all Error Counters * @skb: Netlink message buffer * @cb: Callback context for multi-part dumps * * Iterates over all error counters in a given Node and appends * their attributes (ID, name, value) to the given netlink message buffer. * Uses @cb->ctx to track progress in case the message buffer fills up, allowing * multi-part dump support. On buffer overflow, updates the context to resume * from the last node on the next invocation. * * Return: 0 if all errors fit in @skb, number of bytes added to @skb if * the buffer filled up (requires multi-part continuation), or * a negative error code on failure. */ int drm_ras_nl_get_error_counter_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_info *info = genl_info_dump(cb); struct drm_ras_ctx *ctx = (void *)cb->ctx; struct drm_ras_node *node; struct nlattr *hdr; const char *error_name; u32 node_id, error_id, value; int ret; if (!info->attrs || GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID)) return -EINVAL; node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]); node = xa_load(&drm_ras_xa, node_id); if (!node) return -ENOENT; for (error_id = max(node->error_counter_range.first, ctx->restart); error_id <= node->error_counter_range.last; error_id++) { ret = get_node_error_counter(node_id, error_id, &error_name, &value); /* * For non-contiguous range, driver return -ENOENT as indication * to skip this ID when listing all errors. */ if (ret == -ENOENT) continue; if (ret) return ret; hdr = genlmsg_iput(skb, info); if (!hdr) { ret = -EMSGSIZE; break; } ret = msg_reply_value(skb, error_id, error_name, value); if (ret) { genlmsg_cancel(skb, hdr); break; } genlmsg_end(skb, hdr); } if (ret == -EMSGSIZE) ctx->restart = error_id; return ret; } /** * drm_ras_nl_get_error_counter_doit() - Query an error counter of an node * @skb: Netlink message buffer * @info: Generic Netlink info containing attributes of the request * * Extracts the node ID and error ID from the netlink attributes and * retrieves the current value of the corresponding error counter. Sends the * result back to the requesting user via the standard Genl reply. * * Return: 0 on success, or negative errno on failure. */ int drm_ras_nl_get_error_counter_doit(struct sk_buff *skb, struct genl_info *info) { u32 node_id, error_id; if (!info->attrs || GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID) || GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID)) return -EINVAL; node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]); error_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]); return doit_reply_value(info, node_id, error_id); } /** * drm_ras_node_register() - Register a new RAS node * @node: Node structure to register * * Adds the given RAS node to the global node xarray and assigns it * a unique ID. Both @node->name and @node->type must be valid. * * Return: 0 on success, or negative errno on failure: */ int drm_ras_node_register(struct drm_ras_node *node) { if (!node->device_name || !node->node_name) return -EINVAL; /* Currently, only Error Counter Endpoints are supported */ if (node->type != DRM_RAS_NODE_TYPE_ERROR_COUNTER) return -EINVAL; /* Mandatory entries for Error Counter Node */ if (node->type == DRM_RAS_NODE_TYPE_ERROR_COUNTER && (!node->error_counter_range.last || !node->query_error_counter)) return -EINVAL; return xa_alloc(&drm_ras_xa, &node->id, node, xa_limit_32b, GFP_KERNEL); } EXPORT_SYMBOL(drm_ras_node_register); /** * drm_ras_node_unregister() - Unregister a previously registered node * @node: Node structure to unregister * * Removes the given node from the global node xarray using its ID. */ void drm_ras_node_unregister(struct drm_ras_node *node) { xa_erase(&drm_ras_xa, node->id); } EXPORT_SYMBOL(drm_ras_node_unregister);