summary | refs | log | tree | commit | diff
path: root/drivers/gpu
diff options
context:
space:
mode:
author: Alex Deucher <alexander.deucher@amd.com> 2026-01-01 17:20:18 -0500
committer: Alex Deucher <alexander.deucher@amd.com> 2026-05-11 16:15:31 -0400
commit: c184df870db1e328691ea0fbb7d0e59efd9d3f9f (patch)
tree: cbdea9ebd1a258bc03738419f3dd1bc0c6641107 /drivers/gpu
parent: b0054327595767aec4726929e6ddb94b5d31334f (diff)
download: lwn-c184df870db1e328691ea0fbb7d0e59efd9d3f9f.tar.gz
download: lwn-c184df870db1e328691ea0fbb7d0e59efd9d3f9f.zip
drm/amdgpu: plumb timedout fence through to force completion
When we do a full adapter reset and we know which fence timed out, mark that fence with -ETIME rather than -ECANCELED so it gets properly handled by userspace.

v2: rebase

Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu')
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 2
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 6
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 26
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 3
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 4
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c 2
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 21
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c 2
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c 2
9 files changed, 48 insertions, 20 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 092fd3309099..b951b42d66bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -2049,7 +2049,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
/* swap out the old fences */
amdgpu_ib_preempt_fences_swap(ring, fences);
- amdgpu_fence_driver_force_completion(ring);
+ amdgpu_fence_driver_force_completion(ring, NULL);
/* resubmit unfinished jobs */
amdgpu_ib_preempt_job_recovery(&ring->sched);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 34f933350ccf..be42e8f01def 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5093,6 +5093,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
{
int i, r = 0;
struct amdgpu_job *job = NULL;
+ struct dma_fence *fence = NULL;
struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
bool need_full_reset =
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
@@ -5105,6 +5106,9 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
amdgpu_fence_driver_isr_toggle(adev, true);
+ if (job)
+ fence = &job->hw_fence->base;
+
/* block all schedulers and reset given job's ring */
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
@@ -5113,7 +5117,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
continue;
/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
- amdgpu_fence_driver_force_completion(ring);
+ amdgpu_fence_driver_force_completion(ring, fence);
}
amdgpu_fence_driver_isr_toggle(adev, false);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 8048a4c04b47..ea69b1bac7c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -547,7 +547,7 @@ void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
r = -ENODEV;
/* no need to trigger GPU reset as we are unloading */
if (r)
- amdgpu_fence_driver_force_completion(ring);
+ amdgpu_fence_driver_force_completion(ring, NULL);
if (!drm_dev_is_unplugged(adev_to_drm(adev)) &&
ring->fence_drv.irq_src &&
@@ -662,16 +662,34 @@ void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error)
* amdgpu_fence_driver_force_completion - force signal latest fence of ring
*
* @ring: fence of the ring to signal
+ * @timedout_fence: fence of the timedout job
*
*/
-void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
+void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring,
+ struct dma_fence *timedout_fence)
{
- amdgpu_fence_driver_set_error(ring, -ECANCELED);
+ struct amdgpu_fence_driver *drv = &ring->fence_drv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&drv->lock, flags);
+ for (unsigned int i = 0; i <= drv->num_fences_mask; ++i) {
+ struct dma_fence *fence;
+
+ fence = rcu_dereference_protected(drv->fences[i],
+ lockdep_is_held(&drv->lock));
+ if (fence && !dma_fence_is_signaled_locked(fence)) {
+ if (fence == timedout_fence)
+ dma_fence_set_error(fence, -ETIME);
+ else
+ dma_fence_set_error(fence, -ECANCELED);
+ }
+ }
+ spin_unlock_irqrestore(&drv->lock, flags);
+
amdgpu_fence_write(ring, ring->fence_drv.sync_seq);
amdgpu_fence_process(ring);
}
-
/*
* Kernel queue reset handling
*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 715c9e43e13a..8f28b3bd7010 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -159,7 +159,8 @@ struct amdgpu_fence {
extern const struct drm_sched_backend_ops amdgpu_sched_ops;
void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error);
-void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
+void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring,
+ struct dma_fence *timedout_fence);
void amdgpu_ring_set_fence_errors_and_reemit(struct amdgpu_ring *ring,
struct amdgpu_fence *guilty_fence);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 321310ba2c08..fcd81242059e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -600,10 +600,10 @@ exit:
* to be submitted to the queues after the reset is complete.
*/
if (!ret) {
- amdgpu_fence_driver_force_completion(gfx_ring);
+ amdgpu_fence_driver_force_completion(gfx_ring, NULL);
drm_sched_wqueue_start(&gfx_ring->sched);
if (adev->sdma.has_page_queue) {
- amdgpu_fence_driver_force_completion(page_ring);
+ amdgpu_fence_driver_force_completion(page_ring, NULL);
drm_sched_wqueue_start(&page_ring->sched);
}
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index 9d5cca7da1d9..3a3bc0d370fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
@@ -512,7 +512,7 @@ int amdgpu_uvd_resume(struct amdgpu_device *adev)
}
memset_io(ptr, 0, size);
/* to restore uvd fence seq */
- amdgpu_fence_driver_force_completion(&adev->uvd.inst[i].ring);
+ amdgpu_fence_driver_force_completion(&adev->uvd.inst[i].ring, NULL);
}
}
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index debb82a2e031..7cbd330643cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1487,15 +1487,16 @@ int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block,
/**
* amdgpu_vcn_reset_engine - Reset a specific VCN engine
- * @adev: Pointer to the AMDGPU device
- * @instance_id: VCN engine instance to reset
+ * @ring: Pointer to the VCN ring
+ * @timedout_fence: fence that timed out
*
* Returns: 0 on success, or a negative error code on failure.
*/
-static int amdgpu_vcn_reset_engine(struct amdgpu_device *adev,
- uint32_t instance_id)
+static int amdgpu_vcn_reset_engine(struct amdgpu_ring *ring,
+ struct amdgpu_fence *timedout_fence)
{
- struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[instance_id];
+ struct amdgpu_device *adev = ring->adev;
+ struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[ring->me];
int r, i;
mutex_lock(&vinst->engine_reset_mutex);
@@ -1519,9 +1520,13 @@ static int amdgpu_vcn_reset_engine(struct amdgpu_device *adev,
if (r)
goto unlock;
}
- amdgpu_fence_driver_force_completion(&vinst->ring_dec);
+ amdgpu_fence_driver_force_completion(&vinst->ring_dec,
+ (&vinst->ring_dec == ring) ?
+ &timedout_fence->base : NULL);
for (i = 0; i < vinst->num_enc_rings; i++)
- amdgpu_fence_driver_force_completion(&vinst->ring_enc[i]);
+ amdgpu_fence_driver_force_completion(&vinst->ring_enc[i],
+ (&vinst->ring_enc[i] == ring) ?
+ &timedout_fence->base : NULL);
/* Restart the scheduler's work queue for the dec and enc rings
* if they were stopped by this function. This allows new tasks
@@ -1557,7 +1562,7 @@ int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
if (adev->vcn.inst[ring->me].using_unified_queue)
return -EINVAL;
- return amdgpu_vcn_reset_engine(adev, ring->me);
+ return amdgpu_vcn_reset_engine(ring, timedout_fence);
}
int amdgpu_vcn_reg_dump_init(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index 10e8fc2821f3..7f001c32e911 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -1686,7 +1686,7 @@ static int vcn_v4_0_3_reset_jpeg_post_helper(struct amdgpu_device *adev, int ins
for (i = 0; i < adev->jpeg.num_jpeg_rings; ++i) {
ring = &adev->jpeg.inst[inst].ring_dec[i];
/* Force completion of any remaining jobs */
- amdgpu_fence_driver_force_completion(ring);
+ amdgpu_fence_driver_force_completion(ring, NULL);
if (ring->use_doorbell)
WREG32_SOC15_OFFSET(
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
index 54fbf8d73ca6..d3db0494341e 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
@@ -1332,7 +1332,7 @@ static int vcn_v5_0_1_reset_jpeg_post_helper(struct amdgpu_device *adev, int ins
for (i = 0; i < adev->jpeg.num_jpeg_rings; ++i) {
ring = &adev->jpeg.inst[inst].ring_dec[i];
/* Force completion of any remaining jobs */
- amdgpu_fence_driver_force_completion(ring);
+ amdgpu_fence_driver_force_completion(ring, NULL);
if (ring->use_doorbell)
WREG32_SOC15_OFFSET(