From a952f1ab696873be124e31ce5ef964d36bce817f Mon Sep 17 00:00:00 2001 From: qianyi liu Date: Tue, 11 Mar 2025 14:02:51 +0800 Subject: drm/sched: Fix fence reference count leak The last_scheduled fence leaks when an entity is being killed and adding the cleanup callback fails. Decrement the reference count of prev when dma_fence_add_callback() fails, ensuring proper balance. Cc: stable@vger.kernel.org # v6.2+ [phasta: add git tag info for stable kernel] Fixes: 2fdb8a8f07c2 ("drm/scheduler: rework entity flush, kill and fini") Signed-off-by: qianyi liu Signed-off-by: Philipp Stanner Link: https://patchwork.freedesktop.org/patch/msgid/20250311060251.4041101-1-liuqianyi125@gmail.com --- drivers/gpu/drm/scheduler/sched_entity.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'drivers/gpu') diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index 69bcf0e99d57..da00572d7d42 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -259,9 +259,16 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity) struct drm_sched_fence *s_fence = job->s_fence; dma_fence_get(&s_fence->finished); - if (!prev || dma_fence_add_callback(prev, &job->finish_cb, - drm_sched_entity_kill_jobs_cb)) + if (!prev || + dma_fence_add_callback(prev, &job->finish_cb, + drm_sched_entity_kill_jobs_cb)) { + /* + * Adding callback above failed. + * dma_fence_put() checks for NULL. + */ + dma_fence_put(prev); drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); + } prev = &s_fence->finished; } -- cgit v1.2.3 From 80cbee810e4e13cdbd3ae9654e9ecddf17f3e828 Mon Sep 17 00:00:00 2001 From: Maíra Canal Date: Thu, 13 Mar 2025 11:43:26 -0300 Subject: drm/v3d: Don't run jobs that have errors flagged in its fence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The V3D driver still relies on `drm_sched_increase_karma()` and `drm_sched_resubmit_jobs()` for resubmissions when a timeout occurs. The function `drm_sched_increase_karma()` marks the job as guilty, while `drm_sched_resubmit_jobs()` sets an error (-ECANCELED) in the DMA fence of that guilty job. Because of this, we must check whether the job’s DMA fence has been flagged with an error before executing the job. Otherwise, the same guilty job may be resubmitted indefinitely, causing repeated GPU resets. This patch adds a check for an error on the job's fence to prevent running a guilty job that was previously flagged when the GPU timed out. Note that the CPU and CACHE_CLEAN queues do not require this check, as their jobs are executed synchronously once the DRM scheduler starts them. Cc: stable@vger.kernel.org Fixes: d223f98f0209 ("drm/v3d: Add support for compute shader dispatch.") Fixes: 1584f16ca96e ("drm/v3d: Add support for submitting jobs to the TFU.") Reviewed-by: Iago Toral Quiroga Signed-off-by: Maíra Canal Link: https://patchwork.freedesktop.org/patch/msgid/20250313-v3d-gpu-reset-fixes-v4-1-c1e780d8e096@igalia.com --- drivers/gpu/drm/v3d/v3d_sched.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'drivers/gpu') diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index da08ddb01d21..84c0c142d760 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -327,11 +327,15 @@ v3d_tfu_job_run(struct drm_sched_job *sched_job) struct drm_device *dev = &v3d->drm; struct dma_fence *fence; + if (unlikely(job->base.base.s_fence->finished.error)) + return NULL; + + v3d->tfu_job = job; + fence = v3d_fence_create(v3d, V3D_TFU); if (IS_ERR(fence)) return NULL; - v3d->tfu_job = job; if (job->base.irq_fence) dma_fence_put(job->base.irq_fence); job->base.irq_fence = dma_fence_get(fence); @@ -369,6 +373,9 @@ v3d_csd_job_run(struct drm_sched_job *sched_job) struct dma_fence *fence; int i, csd_cfg0_reg; + if (unlikely(job->base.base.s_fence->finished.error)) + return NULL; + v3d->csd_job = job; v3d_invalidate_caches(v3d); -- cgit v1.2.3 From c3e4a25602f8b941b154f52a4da13ae77b4664c4 Mon Sep 17 00:00:00 2001 From: Maíra Canal Date: Thu, 13 Mar 2025 11:43:27 -0300 Subject: drm/v3d: Set job pointer to NULL when the job's fence has an error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to commit e4b5ccd392b9 ("drm/v3d: Ensure job pointer is set to NULL after job completion"), ensure the job pointer is set to `NULL` when a job's fence has an error. Failing to do so can trigger kernel warnings in specific scenarios, such as: 1. v3d_csd_job_run() assigns `v3d->csd_job = job` 2. CSD job exceeds hang limit, causing a timeout → v3d_gpu_reset_for_timeout() 3. GPU reset 4. drm_sched_resubmit_jobs() sets the job's fence to `-ECANCELED`. 5. v3d_csd_job_run() detects the fence error and returns NULL, not submitting the job to the GPU 6. User-space runs `modprobe -r v3d` 7. v3d_gem_destroy() v3d_gem_destroy() triggers a warning indicating that the CSD job never ended, as we didn't set `v3d->csd_job` to NULL after the timeout. The same can also happen to BIN, RENDER, and TFU jobs. Reviewed-by: Iago Toral Quiroga Signed-off-by: Maíra Canal Link: https://patchwork.freedesktop.org/patch/msgid/20250313-v3d-gpu-reset-fixes-v4-2-c1e780d8e096@igalia.com --- drivers/gpu/drm/v3d/v3d_sched.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'drivers/gpu') diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index 84c0c142d760..05608c894ed9 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -226,8 +226,12 @@ static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job) struct dma_fence *fence; unsigned long irqflags; - if (unlikely(job->base.base.s_fence->finished.error)) + if (unlikely(job->base.base.s_fence->finished.error)) { + spin_lock_irqsave(&v3d->job_lock, irqflags); + v3d->bin_job = NULL; + spin_unlock_irqrestore(&v3d->job_lock, irqflags); return NULL; + } /* Lock required around bin_job update vs * v3d_overflow_mem_work(). @@ -281,8 +285,10 @@ static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job) struct drm_device *dev = &v3d->drm; struct dma_fence *fence; - if (unlikely(job->base.base.s_fence->finished.error)) + if (unlikely(job->base.base.s_fence->finished.error)) { + v3d->render_job = NULL; return NULL; + } v3d->render_job = job; @@ -327,8 +333,10 @@ v3d_tfu_job_run(struct drm_sched_job *sched_job) struct drm_device *dev = &v3d->drm; struct dma_fence *fence; - if (unlikely(job->base.base.s_fence->finished.error)) + if (unlikely(job->base.base.s_fence->finished.error)) { + v3d->tfu_job = NULL; return NULL; + } v3d->tfu_job = job; @@ -373,8 +381,10 @@ v3d_csd_job_run(struct drm_sched_job *sched_job) struct dma_fence *fence; int i, csd_cfg0_reg; - if (unlikely(job->base.base.s_fence->finished.error)) + if (unlikely(job->base.base.s_fence->finished.error)) { + v3d->csd_job = NULL; return NULL; + } v3d->csd_job = job; -- cgit v1.2.3 From cb83f4b965a66d85e9a03621ef3b22c044f4a033 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Feb 2025 15:18:19 -0400 Subject: gpu: host1x: Do not assume that a NULL domain means no DMA IOMMU Previously with tegra-smmu, even with CONFIG_IOMMU_DMA, the default domain could have been left as NULL. The NULL domain is specially recognized by host1x_iommu_attach() as meaning it is not the DMA domain and should be replaced with the special shared domain. This happened prior to the below commit because tegra-smmu was using the NULL domain to mean IDENTITY. Now that the domain is properly labled the test in DRM doesn't see NULL. Check for IDENTITY as well to enable the special domains. This is the same issue and basic fix as seen in commit fae6e669cdc5 ("drm/tegra: Do not assume that a NULL domain means no DMA IOMMU"). Fixes: c8cc2655cc6c ("iommu/tegra-smmu: Implement an IDENTITY domain") Reported-by: Diogo Ivo Closes: https://lore.kernel.org/all/c6a6f114-3acd-4d56-a13b-b88978e927dc@tecnico.ulisboa.pt/ Tested-by: Diogo Ivo Signed-off-by: Jason Gunthorpe Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/0-v1-10dcc8ce3869+3a7-host1x_identity_jgg@nvidia.com --- drivers/gpu/host1x/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/gpu') diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c index 46cae925b095..1f93e5e276c0 100644 --- a/drivers/gpu/host1x/dev.c +++ b/drivers/gpu/host1x/dev.c @@ -361,6 +361,10 @@ static bool host1x_wants_iommu(struct host1x *host1x) return true; } +/* + * Returns ERR_PTR on failure, NULL if the translation is IDENTITY, otherwise a + * valid paging domain. + */ static struct iommu_domain *host1x_iommu_attach(struct host1x *host) { struct iommu_domain *domain = iommu_get_domain_for_dev(host->dev); @@ -385,6 +389,8 @@ static struct iommu_domain *host1x_iommu_attach(struct host1x *host) * Similarly, if host1x is already attached to an IOMMU (via the DMA * API), don't try to attach again. */ + if (domain && domain->type == IOMMU_DOMAIN_IDENTITY) + domain = NULL; if (!host1x_wants_iommu(host) || domain) return domain; -- cgit v1.2.3