summary | refs | log | tree | commit | diff
path: root/drivers/gpu
diff options
context:
space:
mode:
author Dave Airlie <airlied@redhat.com> 2026-06-12 13:57:16 +1000
committer Dave Airlie <airlied@redhat.com> 2026-06-12 13:57:17 +1000
commit 6fe5552f678bdbb925388ecff30a257b382cb9f6 (patch)
tree c63ead6390afc3bfdba9d3ee89e8c9946e15b71f /drivers/gpu
parent c7be308858890007b4da9b6498a1c1f3e6647dee (diff)
parent 347ccc0453fca2c669e8dc8a72000e76ca4adf10 (diff)
download lwn-6fe5552f678bdbb925388ecff30a257b382cb9f6.tar.gz
download lwn-6fe5552f678bdbb925388ecff30a257b382cb9f6.zip
Merge tag 'drm-xe-fixes-2026-06-11' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-fixes
UAPI Changes:

Cross-subsystem Changes:

Core Changes:

Driver Changes:
- fix oops in suspend/shutdown without display (Jani)
- RAS fixes (Raag)
- Use HW_ERR prefix in log (Raag)
- include all registered queues in TLB invalidation (Tangudu)
- Fix refcount leak in xe_range_tree in error paths (Wentao)
- fix job timeout recovery for unstarted jobs and kernel queues (Rodrigo)

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Matthew Brost <matthew.brost@intel.com>
Link: https://patch.msgid.link/aitt8ZkYmxIT9cdP@gsse-cloud1.jf.intel.com
Diffstat (limited to 'drivers/gpu')
-rw-r--r-- drivers/gpu/drm/xe/display/xe_display.c 11
-rw-r--r-- drivers/gpu/drm/xe/xe_drm_ras.c 61
-rw-r--r-- drivers/gpu/drm/xe/xe_guc_submit.c 49
-rw-r--r-- drivers/gpu/drm/xe/xe_guc_tlb_inval.c 7
-rw-r--r-- drivers/gpu/drm/xe/xe_hw_error.c 12
-rw-r--r-- drivers/gpu/drm/xe/xe_range_fence.c 2
6 files changed, 79 insertions, 63 deletions
diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c
index 00dfa68af29a..b17fb698d2f8 100644
--- a/drivers/gpu/drm/xe/display/xe_display.c
+++ b/drivers/gpu/drm/xe/display/xe_display.c
@@ -124,6 +124,15 @@ int xe_display_init_early(struct xe_device *xe)
intel_display_driver_early_probe(display);
+ intel_display_device_info_runtime_init(display);
+
+ /* Display may have been disabled at runtime init */
+ if (!intel_display_device_present(display)) {
+ xe->info.probe_display = false;
+ unset_display_features(xe);
+ return 0;
+ }
+
/* Early display init.. */
intel_opregion_setup(display);
@@ -137,8 +146,6 @@ int xe_display_init_early(struct xe_device *xe)
intel_bw_init_hw(display);
- intel_display_device_info_runtime_init(display);
-
err = intel_display_driver_probe_noirq(display);
if (err)
goto err_opregion;
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c
index e07dc23a155e..c6cd32b7eeda 100644
--- a/drivers/gpu/drm/xe/xe_drm_ras.c
+++ b/drivers/gpu/drm/xe/xe_drm_ras.c
@@ -52,7 +52,7 @@ static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *x
struct xe_drm_ras_counter *counter;
int i;
- counter = kcalloc(DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL);
+ counter = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL);
if (!counter)
return ERR_PTR(-ENOMEM);
@@ -100,54 +100,47 @@ static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node,
return 0;
}
-static void cleanup_node_param(struct xe_drm_ras *ras, const enum drm_xe_ras_error_severity severity)
+static void cleanup_node_param(struct drm_ras_node *node)
{
- struct drm_ras_node *node = &ras->node[severity];
-
- kfree(ras->info[severity]);
- ras->info[severity] = NULL;
-
kfree(node->device_name);
node->device_name = NULL;
}
+static void cleanup_node(struct drm_device *drm, void *node)
+{
+ drm_ras_node_unregister(node);
+ cleanup_node_param(node);
+}
+
static int register_nodes(struct xe_device *xe)
{
struct xe_drm_ras *ras = &xe->ras;
- int i;
+ struct drm_ras_node *node;
+ int i, ret;
for_each_error_severity(i) {
- struct drm_ras_node *node = &ras->node[i];
- int ret;
+ node = &ras->node[i];
ret = assign_node_params(xe, node, i);
- if (ret) {
- cleanup_node_param(ras, i);
- return ret;
- }
+ if (ret)
+ goto free_param;
ret = drm_ras_node_register(node);
- if (ret) {
- cleanup_node_param(ras, i);
- return ret;
- }
+ if (ret)
+ goto free_param;
+
+ ret = drmm_add_action_or_reset(&xe->drm, cleanup_node, node);
+ if (ret)
+ goto null_info;
}
return 0;
-}
-
-static void xe_drm_ras_unregister_nodes(struct drm_device *device, void *arg)
-{
- struct xe_device *xe = arg;
- struct xe_drm_ras *ras = &xe->ras;
- int i;
-
- for_each_error_severity(i) {
- struct drm_ras_node *node = &ras->node[i];
- drm_ras_node_unregister(node);
- cleanup_node_param(ras, i);
- }
+free_param:
+ cleanup_node_param(node);
+null_info:
+ ras->info[i] = NULL;
+ return ret;
}
/**
@@ -176,11 +169,5 @@ int xe_drm_ras_init(struct xe_device *xe)
return err;
}
- err = drmm_add_action_or_reset(&xe->drm, xe_drm_ras_unregister_nodes, xe);
- if (err) {
- drm_err(&xe->drm, "Failed to add action for Xe DRM RAS (%pe)\n", ERR_PTR(err));
- return err;
- }
-
return 0;
}
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index a4a8f0d41fe8..42110e01b7d0 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -157,6 +157,11 @@ static void set_exec_queue_banned(struct xe_exec_queue *q)
atomic_or(EXEC_QUEUE_STATE_BANNED, &q->guc->state);
}
+static void clear_exec_queue_banned(struct xe_exec_queue *q)
+{
+ atomic_andnot(EXEC_QUEUE_STATE_BANNED, &q->guc->state);
+}
+
static bool exec_queue_suspended(struct xe_exec_queue *q)
{
return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_SUSPENDED;
@@ -1361,7 +1366,8 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
q->guc->id);
- return xe_sched_invalidate_job(job, 2);
+ /* GuC never scheduled this job - let the caller trigger a GT reset. */
+ return true;
}
ctx_timestamp = lower_32_bits(xe_lrc_timestamp(q->lrc[0]));
@@ -1458,6 +1464,21 @@ static void disable_scheduling(struct xe_exec_queue *q, bool immediate)
G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
}
+/*
+ * Recover via GT reset for a kernel queue, or for a GuC scheduling failure (job
+ * never started) on a queue that was not already killed or banned. An already
+ * banned queue must stay banned, so its unstarted jobs do not clear the ban or
+ * trigger a reset.
+ */
+static bool timeout_needs_gt_reset(struct xe_exec_queue *q, struct xe_sched_job *job,
+ bool skip_timeout_check)
+{
+ if (q->flags & EXEC_QUEUE_FLAG_KERNEL)
+ return true;
+
+ return !skip_timeout_check && !xe_sched_job_started(job);
+}
+
static enum drm_gpu_sched_stat
guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
{
@@ -1606,19 +1627,19 @@ trigger_reset:
xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
q->guc->id, q->flags);
- /*
- * Kernel jobs should never fail, nor should VM jobs if they do
- * somethings has gone wrong and the GT needs a reset
- */
- xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
- "Kernel-submitted job timed out\n");
- xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
- "VM job timed out on non-killed execqueue\n");
- if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
- (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
- if (!xe_sched_invalidate_job(job, 2)) {
- xe_gt_reset_async(q->gt);
- goto rearm;
+ if (!wedged) {
+ if (timeout_needs_gt_reset(q, job, skip_timeout_check)) {
+ if (!xe_sched_invalidate_job(job, 2)) {
+ clear_exec_queue_banned(q);
+ xe_gt_reset_async(q->gt);
+ goto rearm;
+ }
+ if (q->flags & EXEC_QUEUE_FLAG_KERNEL) {
+ xe_gt_WARN(q->gt, true, "Kernel-submitted job timed out\n");
+ xe_device_declare_wedged(gt_to_xe(q->gt));
+ }
+ } else if (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)) {
+ xe_gt_WARN(q->gt, true, "VM job timed out on non-killed execqueue\n");
}
}
diff --git a/drivers/gpu/drm/xe/xe_guc_tlb_inval.c b/drivers/gpu/drm/xe/xe_guc_tlb_inval.c
index ced58f46f846..cf6d106e6036 100644
--- a/drivers/gpu/drm/xe/xe_guc_tlb_inval.c
+++ b/drivers/gpu/drm/xe/xe_guc_tlb_inval.c
@@ -255,9 +255,8 @@ static int send_tlb_inval_ctx_ppgtt(struct xe_tlb_inval *tlb_inval, u32 seqno,
#undef EXEC_QUEUE_COUNT_FULL_THRESHOLD
/*
- * Move exec queues to a temporary list to issue invalidations. The exec
- * queue must active and a reference must be taken to prevent concurrent
- * deregistrations.
+ * Move exec queues to a temporary list to issue invalidations. A
+ * reference must be taken to prevent concurrent deregistrations.
*
* List modification is safe because we hold 'vm->exec_queues.lock' for
* reading, which prevents external modifications. Using a per-GT list
@@ -266,7 +265,7 @@ static int send_tlb_inval_ctx_ppgtt(struct xe_tlb_inval *tlb_inval, u32 seqno,
*/
list_for_each_entry_safe(q, next, &vm->exec_queues.list[id],
vm_exec_queue_link) {
- if (q->ops->active(q) && xe_exec_queue_get_unless_zero(q)) {
+ if (xe_exec_queue_get_unless_zero(q)) {
last_q = q;
list_move_tail(&q->vm_exec_queue_link, &tlb_inval_list);
}
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 2a31b430570e..e869bc3948d9 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -219,9 +219,9 @@ static void log_hw_error(struct xe_tile *tile, const char *name,
struct xe_device *xe = tile_to_xe(tile);
if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
- drm_warn(&xe->drm, "%s %s detected\n", name, severity_str);
+ drm_warn(&xe->drm, HW_ERR "%s %s detected\n", name, severity_str);
else
- drm_err_ratelimited(&xe->drm, "%s %s detected\n", name, severity_str);
+ drm_err_ratelimited(&xe->drm, HW_ERR "%s %s detected\n", name, severity_str);
}
static void log_gt_err(struct xe_tile *tile, const char *name, int i, u32 err,
@@ -231,10 +231,10 @@ static void log_gt_err(struct xe_tile *tile, const char *name, int i, u32 err,
struct xe_device *xe = tile_to_xe(tile);
if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
- drm_warn(&xe->drm, "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
+ drm_warn(&xe->drm, HW_ERR "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
name, severity_str, i, err);
else
- drm_err_ratelimited(&xe->drm, "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
+ drm_err_ratelimited(&xe->drm, HW_ERR "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
name, severity_str, i, err);
}
@@ -251,9 +251,9 @@ static void log_soc_error(struct xe_tile *tile, const char * const *reg_info,
if (strcmp(name, "Undefined")) {
if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
- drm_warn(&xe->drm, "%s SOC %s detected", name, severity_str);
+ drm_warn(&xe->drm, HW_ERR "%s SOC %s detected", name, severity_str);
else
- drm_err_ratelimited(&xe->drm, "%s SOC %s detected", name, severity_str);
+ drm_err_ratelimited(&xe->drm, HW_ERR "%s SOC %s detected", name, severity_str);
atomic_inc(&info[index].counter);
}
}
diff --git a/drivers/gpu/drm/xe/xe_range_fence.c b/drivers/gpu/drm/xe/xe_range_fence.c
index 372378e89e98..3d8fa194a7b0 100644
--- a/drivers/gpu/drm/xe/xe_range_fence.c
+++ b/drivers/gpu/drm/xe/xe_range_fence.c
@@ -77,6 +77,8 @@ int xe_range_fence_insert(struct xe_range_fence_tree *tree,
} else if (err == 0) {
xe_range_fence_tree_insert(rfence, &tree->root);
return 0;
+ } else {
+ dma_fence_put(fence);
}
free: