diff options
| author | Dave Airlie <airlied@redhat.com> | 2026-06-12 13:57:16 +1000 |
|---|---|---|
| committer | Dave Airlie <airlied@redhat.com> | 2026-06-12 13:57:17 +1000 |
| commit | 6fe5552f678bdbb925388ecff30a257b382cb9f6 (patch) | |
| tree | c63ead6390afc3bfdba9d3ee89e8c9946e15b71f /drivers/gpu | |
| parent | c7be308858890007b4da9b6498a1c1f3e6647dee (diff) | |
| parent | 347ccc0453fca2c669e8dc8a72000e76ca4adf10 (diff) | |
| download | lwn-6fe5552f678bdbb925388ecff30a257b382cb9f6.tar.gz lwn-6fe5552f678bdbb925388ecff30a257b382cb9f6.zip | |
Merge tag 'drm-xe-fixes-2026-06-11' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-fixes
UAPI Changes:
Cross-subsystem Changes:
Core Changes:
Driver Changes:
- fix oops in suspend/shutdown without display (Jani)
- RAS fixes (Raag)
- Use HW_ERR prefix in log (Raag)
- include all registered queues in TLB invalidation (Tangudu)
- Fix refcount leak in xe_range_tree in error paths (Wentao)
- fix job timeout recovery for unstarted jobs and kernel queues (Rodrigo)
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Matthew Brost <matthew.brost@intel.com>
Link: https://patch.msgid.link/aitt8ZkYmxIT9cdP@gsse-cloud1.jf.intel.com
Diffstat (limited to 'drivers/gpu')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | drivers/gpu/drm/xe/display/xe_display.c | 11 |
| -rw-r--r-- | drivers/gpu/drm/xe/xe_drm_ras.c | 61 |
| -rw-r--r-- | drivers/gpu/drm/xe/xe_guc_submit.c | 49 |
| -rw-r--r-- | drivers/gpu/drm/xe/xe_guc_tlb_inval.c | 7 |
| -rw-r--r-- | drivers/gpu/drm/xe/xe_hw_error.c | 12 |
| -rw-r--r-- | drivers/gpu/drm/xe/xe_range_fence.c | 2 |
6 files changed, 79 insertions, 63 deletions
```diff
diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c
index 00dfa68af29a..b17fb698d2f8 100644
--- a/drivers/gpu/drm/xe/display/xe_display.c
+++ b/drivers/gpu/drm/xe/display/xe_display.c
@@ -124,6 +124,15 @@ int xe_display_init_early(struct xe_device *xe)
 
 	intel_display_driver_early_probe(display);
 
+	intel_display_device_info_runtime_init(display);
+
+	/* Display may have been disabled at runtime init */
+	if (!intel_display_device_present(display)) {
+		xe->info.probe_display = false;
+		unset_display_features(xe);
+		return 0;
+	}
+
 	/* Early display init.. */
 	intel_opregion_setup(display);
 
@@ -137,8 +146,6 @@ int xe_display_init_early(struct xe_device *xe)
 
 	intel_bw_init_hw(display);
 
-	intel_display_device_info_runtime_init(display);
-
 	err = intel_display_driver_probe_noirq(display);
 	if (err)
 		goto err_opregion;
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c
index e07dc23a155e..c6cd32b7eeda 100644
--- a/drivers/gpu/drm/xe/xe_drm_ras.c
+++ b/drivers/gpu/drm/xe/xe_drm_ras.c
@@ -52,7 +52,7 @@ static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *x
 	struct xe_drm_ras_counter *counter;
 	int i;
 
-	counter = kcalloc(DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL);
+	counter = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL);
 	if (!counter)
 		return ERR_PTR(-ENOMEM);
 
@@ -100,54 +100,47 @@ static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node,
 	return 0;
 }
 
-static void cleanup_node_param(struct xe_drm_ras *ras, const enum drm_xe_ras_error_severity severity)
+static void cleanup_node_param(struct drm_ras_node *node)
 {
-	struct drm_ras_node *node = &ras->node[severity];
-
-	kfree(ras->info[severity]);
-	ras->info[severity] = NULL;
-
 	kfree(node->device_name);
 	node->device_name = NULL;
 }
 
+static void cleanup_node(struct drm_device *drm, void *node)
+{
+	drm_ras_node_unregister(node);
+	cleanup_node_param(node);
+}
+
 static int register_nodes(struct xe_device *xe)
 {
 	struct xe_drm_ras *ras = &xe->ras;
-	int i;
+	struct drm_ras_node *node;
+	int i, ret;
 
 	for_each_error_severity(i) {
-		struct drm_ras_node *node = &ras->node[i];
-		int ret;
+		node = &ras->node[i];
 
 		ret = assign_node_params(xe, node, i);
-		if (ret) {
-			cleanup_node_param(ras, i);
-			return ret;
-		}
+		if (ret)
+			goto free_param;
 
 		ret = drm_ras_node_register(node);
-		if (ret) {
-			cleanup_node_param(ras, i);
-			return ret;
-		}
+		if (ret)
+			goto free_param;
+
+		ret = drmm_add_action_or_reset(&xe->drm, cleanup_node, node);
+		if (ret)
+			goto null_info;
 	}
 
 	return 0;
-}
-
-static void xe_drm_ras_unregister_nodes(struct drm_device *device, void *arg)
-{
-	struct xe_device *xe = arg;
-	struct xe_drm_ras *ras = &xe->ras;
-	int i;
-
-	for_each_error_severity(i) {
-		struct drm_ras_node *node = &ras->node[i];
 
-		drm_ras_node_unregister(node);
-		cleanup_node_param(ras, i);
-	}
+free_param:
+	cleanup_node_param(node);
+null_info:
+	ras->info[i] = NULL;
+	return ret;
 }
 
 /**
@@ -176,11 +169,5 @@ int xe_drm_ras_init(struct xe_device *xe)
 		return err;
 	}
 
-	err = drmm_add_action_or_reset(&xe->drm, xe_drm_ras_unregister_nodes, xe);
-	if (err) {
-		drm_err(&xe->drm, "Failed to add action for Xe DRM RAS (%pe)\n", ERR_PTR(err));
-		return err;
-	}
-
 	return 0;
 }
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index a4a8f0d41fe8..42110e01b7d0 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -157,6 +157,11 @@ static void set_exec_queue_banned(struct xe_exec_queue *q)
 	atomic_or(EXEC_QUEUE_STATE_BANNED, &q->guc->state);
 }
 
+static void clear_exec_queue_banned(struct xe_exec_queue *q)
+{
+	atomic_andnot(EXEC_QUEUE_STATE_BANNED, &q->guc->state);
+}
+
 static bool exec_queue_suspended(struct xe_exec_queue *q)
 {
 	return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_SUSPENDED;
@@ -1361,7 +1366,8 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
 			   xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
 			   q->guc->id);
 
-		return xe_sched_invalidate_job(job, 2);
+		/* GuC never scheduled this job - let the caller trigger a GT reset. */
+		return true;
 	}
 
 	ctx_timestamp = lower_32_bits(xe_lrc_timestamp(q->lrc[0]));
@@ -1458,6 +1464,21 @@ static void disable_scheduling(struct xe_exec_queue *q, bool immediate)
 		       G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
 }
 
+/*
+ * Recover via GT reset for a kernel queue, or for a GuC scheduling failure (job
+ * never started) on a queue that was not already killed or banned. An already
+ * banned queue must stay banned, so its unstarted jobs do not clear the ban or
+ * trigger a reset.
+ */
+static bool timeout_needs_gt_reset(struct xe_exec_queue *q, struct xe_sched_job *job,
+				   bool skip_timeout_check)
+{
+	if (q->flags & EXEC_QUEUE_FLAG_KERNEL)
+		return true;
+
+	return !skip_timeout_check && !xe_sched_job_started(job);
+}
+
 static enum drm_gpu_sched_stat
 guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 {
@@ -1606,19 +1627,19 @@ trigger_reset:
 			   xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
 			   q->guc->id, q->flags);
 
-	/*
-	 * Kernel jobs should never fail, nor should VM jobs if they do
-	 * somethings has gone wrong and the GT needs a reset
-	 */
-	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
-		   "Kernel-submitted job timed out\n");
-	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
-		   "VM job timed out on non-killed execqueue\n");
-	if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
-			(q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
-		if (!xe_sched_invalidate_job(job, 2)) {
-			xe_gt_reset_async(q->gt);
-			goto rearm;
+	if (!wedged) {
+		if (timeout_needs_gt_reset(q, job, skip_timeout_check)) {
+			if (!xe_sched_invalidate_job(job, 2)) {
+				clear_exec_queue_banned(q);
+				xe_gt_reset_async(q->gt);
+				goto rearm;
+			}
+			if (q->flags & EXEC_QUEUE_FLAG_KERNEL) {
+				xe_gt_WARN(q->gt, true, "Kernel-submitted job timed out\n");
+				xe_device_declare_wedged(gt_to_xe(q->gt));
+			}
+		} else if (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)) {
+			xe_gt_WARN(q->gt, true, "VM job timed out on non-killed execqueue\n");
 		}
 	}
 
diff --git a/drivers/gpu/drm/xe/xe_guc_tlb_inval.c b/drivers/gpu/drm/xe/xe_guc_tlb_inval.c
index ced58f46f846..cf6d106e6036 100644
--- a/drivers/gpu/drm/xe/xe_guc_tlb_inval.c
+++ b/drivers/gpu/drm/xe/xe_guc_tlb_inval.c
@@ -255,9 +255,8 @@ static int send_tlb_inval_ctx_ppgtt(struct xe_tlb_inval *tlb_inval, u32 seqno,
 #undef EXEC_QUEUE_COUNT_FULL_THRESHOLD
 
 	/*
-	 * Move exec queues to a temporary list to issue invalidations. The exec
-	 * queue must active and a reference must be taken to prevent concurrent
-	 * deregistrations.
+	 * Move exec queues to a temporary list to issue invalidations. A
+	 * reference must be taken to prevent concurrent deregistrations.
 	 *
 	 * List modification is safe because we hold 'vm->exec_queues.lock' for
 	 * reading, which prevents external modifications. Using a per-GT list
@@ -266,7 +265,7 @@ static int send_tlb_inval_ctx_ppgtt(struct xe_tlb_inval *tlb_inval, u32 seqno,
 	 */
 	list_for_each_entry_safe(q, next, &vm->exec_queues.list[id],
 				 vm_exec_queue_link) {
-		if (q->ops->active(q) && xe_exec_queue_get_unless_zero(q)) {
+		if (xe_exec_queue_get_unless_zero(q)) {
 			last_q = q;
 			list_move_tail(&q->vm_exec_queue_link, &tlb_inval_list);
 		}
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 2a31b430570e..e869bc3948d9 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -219,9 +219,9 @@ static void log_hw_error(struct xe_tile *tile, const char *name,
 	struct xe_device *xe = tile_to_xe(tile);
 
 	if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
-		drm_warn(&xe->drm, "%s %s detected\n", name, severity_str);
+		drm_warn(&xe->drm, HW_ERR "%s %s detected\n", name, severity_str);
 	else
-		drm_err_ratelimited(&xe->drm, "%s %s detected\n", name, severity_str);
+		drm_err_ratelimited(&xe->drm, HW_ERR "%s %s detected\n", name, severity_str);
 }
 
 static void log_gt_err(struct xe_tile *tile, const char *name, int i, u32 err,
@@ -231,10 +231,10 @@ static void log_gt_err(struct xe_tile *tile, const char *name, int i, u32 err,
 	struct xe_device *xe = tile_to_xe(tile);
 
 	if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
-		drm_warn(&xe->drm, "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
+		drm_warn(&xe->drm, HW_ERR "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
 			 name, severity_str, i, err);
 	else
-		drm_err_ratelimited(&xe->drm, "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
+		drm_err_ratelimited(&xe->drm, HW_ERR "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
 				    name, severity_str, i, err);
 }
 
@@ -251,9 +251,9 @@ static void log_soc_error(struct xe_tile *tile, const char * const *reg_info,
 
 	if (strcmp(name, "Undefined")) {
 		if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
-			drm_warn(&xe->drm, "%s SOC %s detected", name, severity_str);
+			drm_warn(&xe->drm, HW_ERR "%s SOC %s detected", name, severity_str);
 		else
-			drm_err_ratelimited(&xe->drm, "%s SOC %s detected", name, severity_str);
+			drm_err_ratelimited(&xe->drm, HW_ERR "%s SOC %s detected", name, severity_str);
 		atomic_inc(&info[index].counter);
 	}
 }
diff --git a/drivers/gpu/drm/xe/xe_range_fence.c b/drivers/gpu/drm/xe/xe_range_fence.c
index 372378e89e98..3d8fa194a7b0 100644
--- a/drivers/gpu/drm/xe/xe_range_fence.c
+++ b/drivers/gpu/drm/xe/xe_range_fence.c
@@ -77,6 +77,8 @@ int xe_range_fence_insert(struct xe_range_fence_tree *tree,
 	} else if (err == 0) {
 		xe_range_fence_tree_insert(rfence, &tree->root);
 		return 0;
+	} else {
+		dma_fence_put(fence);
 	}
 
 free:
```
