drm/amdgpu: add gang submit backend v2

Allows submitting jobs as gang which needs to run on multiple engines at the same time. Basic idea is that we have a global gang submit fence representing when the gang leader is finally pushed to run on the hardware last. Jobs submitted as gang are never re-submitted in case of a GPU reset since this won't work and will just deadlock the hardware immediately again. v2: fix logic inversion, improve documentation, fix rcu Signed-off-by: Christian König <christian.koenig@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
author: Christian König <christian.koenig@amd.com> 2022-03-02 16:26:53 +0100
committer: Alex Deucher <alexander.deucher@amd.com> 2022-09-20 12:40:32 -0400
commit: 68ce8b242242651eb3cb4ff29b79c44d02f752c9 (patch)
tree: 2405418f61d1a63f04f8502d8c85df90e3b32b03 /drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
parent: c05d789fed948ed6a45963ae0d5d79c67b87aebf (diff)
download: lwn-68ce8b242242651eb3cb4ff29b79c44d02f752c9.tar.gz
lwn-68ce8b242242651eb3cb4ff29b79c44d02f752c9.zip
1 files changed, 26 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 877b3c22a8a8..cfbe19cfe9af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -173,11 +173,29 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
 	dma_fence_put(&job->hw_fence);
 }
 
+void amdgpu_job_set_gang_leader(struct amdgpu_job *job,
+				struct amdgpu_job *leader)
+{
+	struct dma_fence *fence = &leader->base.s_fence->scheduled;
+
+	WARN_ON(job->gang_submit);
+
+	/*
+	 * Don't add a reference when we are the gang leader to avoid circle
+	 * dependency.
+	 */
+	if (job != leader)
+		dma_fence_get(fence);
+	job->gang_submit = fence;
+}
+
 void amdgpu_job_free(struct amdgpu_job *job)
 {
 	amdgpu_job_free_resources(job);
 	amdgpu_sync_free(&job->sync);
 	amdgpu_sync_free(&job->sched_sync);
+	if (job->gang_submit != &job->base.s_fence->scheduled)
+		dma_fence_put(job->gang_submit);
 
 	if (!job->hw_fence.ops)
 		kfree(job);
@@ -247,12 +265,16 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
 		fence = amdgpu_sync_get_fence(&job->sync);
 	}
 
+	if (!fence && job->gang_submit)
+		fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit);
+
 	return fence;
 }
 
 static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 {
 	struct amdgpu_ring *ring = to_amdgpu_ring(sched_job->sched);
+	struct amdgpu_device *adev = ring->adev;
 	struct dma_fence *fence = NULL, *finished;
 	struct amdgpu_job *job;
 	int r = 0;
@@ -264,8 +286,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 
 	trace_amdgpu_sched_run_job(job);
 
-	if (job->vram_lost_counter != atomic_read(&ring->adev->vram_lost_counter))
-		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
+	/* Skip job if VRAM is lost and never resubmit gangs */
+	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter) ||
+	    (job->job_run_counter && job->gang_submit))
+		dma_fence_set_error(finished, -ECANCELED);
 
 	if (finished->error < 0) {
 		DRM_INFO("Skip scheduling IBs!\n");
author	Christian König <christian.koenig@amd.com>	2022-03-02 16:26:53 +0100
committer	Alex Deucher <alexander.deucher@amd.com>	2022-09-20 12:40:32 -0400
commit	68ce8b242242651eb3cb4ff29b79c44d02f752c9 (patch)
tree	2405418f61d1a63f04f8502d8c85df90e3b32b03 /drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
parent	c05d789fed948ed6a45963ae0d5d79c67b87aebf (diff)
download	lwn-68ce8b242242651eb3cb4ff29b79c44d02f752c9.tar.gz lwn-68ce8b242242651eb3cb4ff29b79c44d02f752c9.zip