From 5f98f9d1a2d423ef5adcaa6783a351f728b7f373 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 30 Apr 2026 12:49:06 -0400 Subject: drm/amdgpu/userq: add mes userq reset callback Enable per queue reset for MES managed queues. Reviewed-by: Jesse Zhang Reviewed-by: Prike Liang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/mes_userqueue.c') diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c index 16625c31bfd3..ebcb829f7d04 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c @@ -179,6 +179,26 @@ static int mes_userq_unmap(struct amdgpu_usermode_queue *queue) return r; } +static int mes_userq_reset(struct amdgpu_usermode_queue *queue) +{ + struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr; + struct amdgpu_device *adev = uq_mgr->adev; + struct mes_reset_queue_input queue_input; + int r; + + /* XXX: add a FW version check for SDMA per queue reset */ + memset(&queue_input, 0x0, sizeof(struct mes_reset_queue_input)); + queue_input.doorbell_offset = queue->doorbell_index; + queue_input.queue_type = queue->queue_type; + + amdgpu_mes_lock(&adev->mes); + r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input); + amdgpu_mes_unlock(&adev->mes); + if (r) + return r; + return mes_userq_unmap(queue); +} + static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_queue *queue, struct drm_amdgpu_userq_in *mqd_user) @@ -552,4 +572,5 @@ const struct amdgpu_userq_funcs userq_mes_funcs = { .detect_and_reset = mes_userq_detect_and_reset, .preempt = mes_userq_preempt, .restore = mes_userq_restore, + .reset = mes_userq_reset, }; -- cgit v1.2.3 From 7f9569006302c764e692831ef0095aaa9b1eff85 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 30 Apr 2026 14:57:59 -0400 Subject: drm/amdgpu/userq: drop detect_and_reset 
callback No longer needed. Reviewed-by: Jesse Zhang Reviewed-by: Prike Liang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 2 -- drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 53 ------------------------------ 2 files changed, 55 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/mes_userqueue.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index 4559f7440788..9df1b78407f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -111,8 +111,6 @@ struct amdgpu_userq_funcs { int (*map)(struct amdgpu_usermode_queue *queue); int (*preempt)(struct amdgpu_usermode_queue *queue); int (*restore)(struct amdgpu_usermode_queue *queue); - int (*detect_and_reset)(struct amdgpu_device *adev, - int queue_type); int (*reset)(struct amdgpu_usermode_queue *queue); }; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c index ebcb829f7d04..b8f77ac5760a 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c @@ -225,58 +225,6 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr, return 0; } -static int mes_userq_detect_and_reset(struct amdgpu_device *adev, - int queue_type) -{ - int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev); - struct mes_detect_and_reset_queue_input input; - struct amdgpu_usermode_queue *queue; - unsigned int hung_db_num = 0; - unsigned long queue_id; - u32 db_array[8]; - bool found_hung_queue = false; - int r, i; - - if (db_array_size > 8) { - dev_err(adev->dev, "DB array size (%d vs 8) too small\n", - db_array_size); - return -EINVAL; - } - - memset(&input, 0x0, sizeof(struct mes_detect_and_reset_queue_input)); - - input.queue_type = queue_type; - - amdgpu_mes_lock(&adev->mes); - r = amdgpu_mes_detect_and_reset_hung_queues(adev, queue_type, false, - &hung_db_num, db_array, 0); - 
amdgpu_mes_unlock(&adev->mes); - if (r) { - dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r); - } else if (hung_db_num) { - xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { - if (queue->queue_type == queue_type) { - for (i = 0; i < hung_db_num; i++) { - if (queue->doorbell_index == db_array[i]) { - queue->state = AMDGPU_USERQ_STATE_HUNG; - found_hung_queue = true; - atomic_inc(&adev->gpu_reset_counter); - amdgpu_userq_fence_driver_force_completion(queue); - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL); - } - } - } - } - } - - if (found_hung_queue) { - /* Resume scheduling after hang recovery */ - r = amdgpu_mes_resume(adev, input.xcc_id); - } - - return r; -} - static int mes_userq_mqd_create(struct amdgpu_usermode_queue *queue, struct drm_amdgpu_userq_in *args_in) { @@ -569,7 +517,6 @@ const struct amdgpu_userq_funcs userq_mes_funcs = { .mqd_destroy = mes_userq_mqd_destroy, .unmap = mes_userq_unmap, .map = mes_userq_map, - .detect_and_reset = mes_userq_detect_and_reset, .preempt = mes_userq_preempt, .restore = mes_userq_restore, .reset = mes_userq_reset, -- cgit v1.2.3 From b86e1ea9e2290088d676442ddec29da9663416c2 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 20 May 2026 16:11:40 -0400 Subject: drm/amdgpu/userq: add MES userq reset helper Will be used by the common compute queue reset handler. 
Reviewed-by: Jesse Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 39 +++++++++++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/mes_userqueue.h | 9 +++++++ 2 files changed, 47 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/mes_userqueue.c') diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c index b8f77ac5760a..3e5f3ee0a82c 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c @@ -179,7 +179,7 @@ static int mes_userq_unmap(struct amdgpu_usermode_queue *queue) return r; } -static int mes_userq_reset(struct amdgpu_usermode_queue *queue) +int mes_userq_reset(struct amdgpu_usermode_queue *queue) { struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr; struct amdgpu_device *adev = uq_mgr->adev; @@ -199,6 +199,43 @@ static int mes_userq_reset(struct amdgpu_usermode_queue *queue) return mes_userq_unmap(queue); } +int mes_userq_reset_queue(struct amdgpu_device *adev, + struct amdgpu_usermode_queue *guilty_uq, + int queue_type, + unsigned int pipe, + unsigned int queue, + unsigned int db) +{ + struct amdgpu_usermode_queue *uq; + bool use_mmio = false; + unsigned long uq_id; + int r; + + xa_for_each(&adev->userq_doorbell_xa, uq_id, uq) { + if (uq->queue_type == queue_type) { + if (uq == guilty_uq) + continue; + if (uq->doorbell_index == db) { + uq->state = AMDGPU_USERQ_STATE_HUNG; + if (use_mmio) + r = amdgpu_mes_reset_queue_mmio(adev, queue_type, 0, 1, pipe, queue, 0); + else + r = amdgpu_mes_reset_user_queue(adev, queue_type, db, 0); + if (r) + return r; + r = mes_userq_unmap(uq); + if (r) + return r; + atomic_inc(&adev->gpu_reset_counter); + amdgpu_userq_fence_driver_force_completion(uq); + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL); + break; + } + } + } + return 0; +} + static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_queue *queue, struct 
drm_amdgpu_userq_in *mqd_user) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.h b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.h index 090ae8897770..a473360d6a8b 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.h +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.h @@ -27,4 +27,13 @@ #include "amdgpu_userq.h" extern const struct amdgpu_userq_funcs userq_mes_funcs; + +int mes_userq_reset(struct amdgpu_usermode_queue *queue); +int mes_userq_reset_queue(struct amdgpu_device *adev, + struct amdgpu_usermode_queue *guilty_uq, + int queue_type, + unsigned int pipe, + unsigned int queue, + unsigned int db); + #endif -- cgit v1.2.3 From f94bbd648bb499a96aab6fd90d44fb4b1ddcd9e3 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 19 May 2026 18:34:00 -0400 Subject: drm/amdgpu: use a single entry point for mes compute reset When we reset MES queues we need to coordinate across KGD and KFD. Use a single function to handle the queue resets across KFD and KGD. v2: squash in fixes for userqs Co-developed-by: Jesse Zhang Co-developed-by: Amber Lin Signed-off-by: Amber Lin Signed-off-by: Jesse Zhang Reviewed-by: Jesse Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 7 +- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 3 +- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 3 +- drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 2 +- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 75 ++++------------------ 5 files changed, 22 insertions(+), 68 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/mes_userqueue.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index c29d97b786b9..5f0f8a5e3b7d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -138,7 +138,12 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work) if (amdgpu_userq_is_reset_type_supported(adev, queue->queue_type, AMDGPU_RESET_TYPE_PER_QUEUE)) { - int r = userq_funcs->reset(queue); 
+ int r; + + if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) + r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, NULL, NULL); + else + r = userq_funcs->reset(queue); if (r) gpu_reset = true; } else { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index f5840358460d..244c51c70c7e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -6834,9 +6834,8 @@ static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring, struct amdgpu_fence *timedout_fence) { struct amdgpu_device *adev = ring->adev; - bool use_mmio = adev->gfx.mec.use_mmio_for_reset; - return amdgpu_gfx_mes_reset_queue(ring, vmid, timedout_fence, use_mmio); + return amdgpu_gfx_reset_mes_compute(adev, ring, timedout_fence, NULL, NULL); } static void gfx_v11_ip_print(struct amdgpu_ip_block *ip_block, struct drm_printer *p) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index f222deef4047..1334402d211d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -5260,9 +5260,8 @@ static int gfx_v12_0_reset_kcq(struct amdgpu_ring *ring, struct amdgpu_fence *timedout_fence) { struct amdgpu_device *adev = ring->adev; - bool use_mmio = adev->gfx.mec.use_mmio_for_reset; - return amdgpu_gfx_mes_reset_queue(ring, vmid, timedout_fence, use_mmio); + return amdgpu_gfx_reset_mes_compute(adev, ring, timedout_fence, NULL, NULL); } static void gfx_v12_0_ring_begin_use(struct amdgpu_ring *ring) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c index 3e5f3ee0a82c..e9bd5ad98265 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c @@ -207,7 +207,7 @@ int mes_userq_reset_queue(struct amdgpu_device *adev, unsigned int db) { struct amdgpu_usermode_queue *uq; - bool use_mmio = false; + bool use_mmio = adev->gfx.mec.use_mmio_for_reset; unsigned long uq_id; int r; 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index d2c81a79b614..6054c8e216b8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -412,7 +412,7 @@ static int reset_queue_mes(struct device_queue_manager *dqm, struct queue *q, { struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; struct kfd_process_device *pdd; - bool use_mmio = false; + bool use_mmio = adev->gfx.mec.use_mmio_for_reset; int r; pdd = kfd_get_process_device_data(q->device, q->process); @@ -447,11 +447,8 @@ int kfd_reset_queue_mes(struct device_queue_manager *dqm, int queue_type, static int reset_queues_mes(struct device_queue_manager *dqm) { struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; - int hqd_info_size = adev->mes.hung_queue_hqd_info_offset; - int num_hung = 0, r = 0, i, pipe, queue, queue_type; - u32 *hung_array = dqm->hung_db_array; - struct amdgpu_mes_hung_queue_hqd_info *hqd_info = dqm->hqd_info; - struct queue *q; + unsigned int num_hung = 0; + int r = 0; if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) { r = -ENOTRECOVERABLE; @@ -467,51 +464,9 @@ static int reset_queues_mes(struct device_queue_manager *dqm) goto fail; } - if (!hung_array || !hqd_info) { - r = -ENOMEM; - goto fail; - } - - memset(hqd_info, 0, hqd_info_size * sizeof(struct amdgpu_mes_hung_queue_hqd_info)); - - /* - * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called - * post suspend_all as reset & detect will return all hung queue types. - * - * Passed parameter is for targeting queues not scheduled by MES add_queue. 
- */ - r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE, - true, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1); - - if (!num_hung || r) { - r = -ENOTRECOVERABLE; + r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, NULL, &num_hung); + if (r) goto fail; - } - - /* MES resets queue/pipe and cleans up internally */ - for (i = 0; i < num_hung; i++) { - hqd_info[i].bit0_31 = hung_array[i + hqd_info_size]; - pipe = hqd_info[i].pipe_index; - queue = hqd_info[i].queue_index; - queue_type = hqd_info[i].queue_type; - - if (queue_type != MES_QUEUE_TYPE_COMPUTE && - queue_type != MES_QUEUE_TYPE_SDMA) { - pr_warn("Unsupported hung queue reset type: %d\n", queue_type); - hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET; - continue; - } - - q = find_queue_by_doorbell_offset(dqm, hung_array[i]); - /* skip queues not owned by KFD */ - if (!q) { - continue; - } else { - r = reset_queue_mes(dqm, q, queue_type, pipe, queue, hung_array[i]); - if (r) - goto fail; - } - } dqm->detect_hang_count = num_hung; kfd_signal_reset_event(dqm->dev); @@ -529,22 +484,18 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm) if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; - r = amdgpu_mes_suspend(adev, ffs(dqm->dev->xcc_mask) - 1); - up_read(&adev->reset_domain->sem); - - if (r) { - if (!reset_queues_mes(dqm)) { - r = 0; - goto out; - } - dev_err(adev->dev, "failed to suspend gangs from MES\n"); - dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); - kfd_hws_hang(dqm); + if (!reset_queues_mes(dqm)) { + r = 0; + goto out; } + + dev_err(adev->dev, "failed to suspend gangs from MES\n"); + dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); + kfd_hws_hang(dqm); out: - resume_all_queues_mes(dqm); + up_read(&adev->reset_domain->sem); return r; } -- cgit v1.2.3 From 97bcaf15ad25b14bd272fdff3616f9af5a8820c5 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Fri, 12 Jun 2026 14:02:49 +0800 
Subject: drm/amdgpu: implement per-process MES context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MES process context is a process-level page where process-specific context is saved for the MES scheduler. However, the current user-queue code path assigns the fw_obj of a queue to the MES process_context_addr when adding the queue to MES. This means every new queue from the same process would replace the previous process context address with that queue's fw_obj address. What's worse, when user space frees a queue, its fw_obj is freed as well, causing MES to work on a NULL page pointer. This issue leads to inconsistency and crashes in the scheduler. This commit allocates a process-level page for the MES process context of a process, rather than a queue-level one. Signed-off-by: Zhu Lingshan Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 6 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 2 ++ drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 51 +++++++++++++++++++++++------- 3 files changed, 47 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/mes_userqueue.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 3bcde67aa092..3644e9193f58 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -1165,6 +1165,7 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f xa_init_flags(&userq_mgr->userq_xa, XA_FLAGS_ALLOC); userq_mgr->adev = adev; userq_mgr->file = file_priv; + mutex_init(&userq_mgr->proc_ctx_lock); INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker); INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work); @@ -1218,6 +1219,11 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr) */ cancel_work_sync(&userq_mgr->reset_work); + amdgpu_bo_free_kernel(&userq_mgr->proc_ctx_obj.obj, + 
&userq_mgr->proc_ctx_obj.gpu_addr, + &userq_mgr->proc_ctx_obj.cpu_ptr); + + mutex_destroy(&userq_mgr->proc_ctx_lock); mutex_destroy(&userq_mgr->userq_mutex); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index 9df1b78407f5..7a5f8ed794b8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -126,6 +126,8 @@ struct amdgpu_userq_mgr { struct amdgpu_device *adev; struct delayed_work resume_work; struct drm_file *file; + struct mutex proc_ctx_lock; + struct amdgpu_userq_obj proc_ctx_obj; /** * @reset_work: diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c index e9bd5ad98265..dba3707c2659 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c @@ -133,8 +133,8 @@ static int mes_userq_map(struct amdgpu_usermode_queue *queue) queue_input.gang_quantum = 10000; queue_input.paging = false; - queue_input.process_context_addr = ctx->gpu_addr; - queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ; + queue_input.process_context_addr = uq_mgr->proc_ctx_obj.gpu_addr; + queue_input.gang_context_addr = ctx->gpu_addr; queue_input.inprocess_gang_priority = AMDGPU_MES_PRIORITY_LEVEL_NORMAL; queue_input.gang_global_priority_level = convert_to_mes_priority(queue->priority); @@ -169,7 +169,7 @@ static int mes_userq_unmap(struct amdgpu_usermode_queue *queue) memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); queue_input.doorbell_offset = queue->doorbell_index; - queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ; + queue_input.gang_context_addr = ctx->gpu_addr; amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input); @@ -243,12 +243,8 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_userq_obj *ctx = &queue->fw_obj; int r, size; - /* - * The FW expects at 
least one page space allocated for - * process ctx and gang ctx each. Create an object - * for the same. - */ - size = AMDGPU_USERQ_PROC_CTX_SZ + AMDGPU_USERQ_GANG_CTX_SZ; + /* The FW expects at least one page space allocated for gang ctx. */ + size = AMDGPU_USERQ_GANG_CTX_SZ; r = amdgpu_bo_create_kernel(uq_mgr->adev, size, 0, AMDGPU_GEM_DOMAIN_GTT, &ctx->obj, &ctx->gpu_addr, @@ -262,6 +258,30 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr, return 0; } +static int mes_userq_create_proc_ctx_space(struct amdgpu_userq_mgr *uq_mgr) +{ + int r = 0; + + mutex_lock(&uq_mgr->proc_ctx_lock); + /* This check is a necessary because amdgpu_bo_create_kernel() + * calls helpers like amdgpu_bo_pin() and memset() unconditionally + */ + if (!uq_mgr->proc_ctx_obj.obj) { + r = amdgpu_bo_create_kernel(uq_mgr->adev, AMDGPU_USERQ_PROC_CTX_SZ, + 0, AMDGPU_GEM_DOMAIN_GTT, + &uq_mgr->proc_ctx_obj.obj, + &uq_mgr->proc_ctx_obj.gpu_addr, + &uq_mgr->proc_ctx_obj.cpu_ptr); + + if (!r) + memset(uq_mgr->proc_ctx_obj.cpu_ptr, 0, AMDGPU_USERQ_PROC_CTX_SZ); + } + + mutex_unlock(&uq_mgr->proc_ctx_lock); + + return r; +} + static int mes_userq_mqd_create(struct amdgpu_usermode_queue *queue, struct drm_amdgpu_userq_in *args_in) { @@ -434,7 +454,14 @@ static int mes_userq_mqd_create(struct amdgpu_usermode_queue *queue, goto free_mqd; } - /* Create BO for FW operations */ + /* Create per-process MES process context BO */ + r = mes_userq_create_proc_ctx_space(uq_mgr); + if (r) { + DRM_ERROR("Failed to allocate MES process context space bo, error: %d\n", r); + goto free_mqd; + } + + /* Create BO of a gang for FW operations */ r = mes_userq_create_ctx_space(uq_mgr, queue, mqd_user); if (r) { DRM_ERROR("Failed to allocate BO for userqueue (%d)", r); @@ -502,7 +529,7 @@ static int mes_userq_preempt(struct amdgpu_usermode_queue *queue) *fence_ptr = 0; memset(&queue_input, 0x0, sizeof(struct mes_suspend_gang_input)); - queue_input.gang_context_addr = ctx->gpu_addr + 
AMDGPU_USERQ_PROC_CTX_SZ; + queue_input.gang_context_addr = ctx->gpu_addr; queue_input.suspend_fence_addr = fence_gpu_addr; queue_input.suspend_fence_value = 1; amdgpu_mes_lock(&adev->mes); @@ -539,7 +566,7 @@ static int mes_userq_restore(struct amdgpu_usermode_queue *queue) return 0; memset(&queue_input, 0x0, sizeof(struct mes_resume_gang_input)); - queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ; + queue_input.gang_context_addr = ctx->gpu_addr; amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->resume_gang(&adev->mes, &queue_input); -- cgit v1.2.3 From 89db46e455abf1654f88d36e5429cb408abbc95e Mon Sep 17 00:00:00 2001 From: Geoffrey McRae Date: Wed, 24 Jun 2026 12:32:18 +1000 Subject: drm/amdgpu,amdkfd: correct setting MES queue type MES ADD_QUEUE programs the firmware with the queue type from the driver input, but MES REMOVE_QUEUE leaves queue_type at the zero-initialized value. Zero decodes as GFX in the MES REMOVE_QUEUE packet. That means removing a KFD compute queue can be submitted to MES as a GFX queue. In a debug-trap suspend/remove sequence this can leave MES looking for the doorbell in the wrong queue class and the REMOVE_QUEUE command may never complete. The observed failing packet removed doorbell 0x1002 with queue_type=GFX even though the corresponding ADD_QUEUE for the same doorbell was queue_type=COMPUTE. Populate REMOVE_QUEUE.queue_type the same way ADD_QUEUE does. 
Signed-off-by: Geoffrey McRae Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 1 + drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 2 ++ drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 2 ++ drivers/gpu/drm/amd/amdgpu/mes_v12_1.c | 3 +++ drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 ++ 6 files changed, 11 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/mes_userqueue.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 5255360353f4..dbedb1e47c3f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -274,6 +274,7 @@ struct mes_remove_queue_input { uint32_t xcc_id; uint32_t doorbell_offset; uint64_t gang_context_addr; + uint32_t queue_type; bool remove_queue_after_reset; }; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c index dba3707c2659..e947c16e694d 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c @@ -170,6 +170,7 @@ static int mes_userq_unmap(struct amdgpu_usermode_queue *queue) memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); queue_input.doorbell_offset = queue->doorbell_index; queue_input.gang_context_addr = ctx->gpu_addr; + queue_input.queue_type = queue->queue_type; amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 9e27d01cbfa3..76e6769cf7ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -383,6 +383,8 @@ static int mes_v11_0_remove_hw_queue(struct amdgpu_mes *mes, mes_remove_queue_pkt.doorbell_offset = input->doorbell_offset; mes_remove_queue_pkt.gang_context_addr = input->gang_context_addr; + mes_remove_queue_pkt.queue_type = + 
convert_to_mes_queue_type(input->queue_type); if (mes_rev >= 0x60) mes_remove_queue_pkt.remove_queue_after_reset = input->remove_queue_after_reset; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 20f4fd57b1da..1b0c649d97a2 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -371,6 +371,8 @@ static int mes_v12_0_remove_hw_queue(struct amdgpu_mes *mes, mes_remove_queue_pkt.doorbell_offset = input->doorbell_offset; mes_remove_queue_pkt.gang_context_addr = input->gang_context_addr; + mes_remove_queue_pkt.queue_type = + convert_to_mes_queue_type(input->queue_type); if (mes_rev >= 0x5a) mes_remove_queue_pkt.remove_queue_after_reset = input->remove_queue_after_reset; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c index 8007a6e69305..c449efa70b60 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c @@ -362,6 +362,8 @@ static int mes_v12_1_remove_hw_queue(struct amdgpu_mes *mes, mes_remove_queue_pkt.doorbell_offset = input->doorbell_offset; mes_remove_queue_pkt.gang_context_addr = input->gang_context_addr; + mes_remove_queue_pkt.queue_type = + convert_to_mes_queue_type(input->queue_type); return mes_v12_1_submit_pkt_and_poll_completion(mes, xcc_id, AMDGPU_MES_SCHED_PIPE, @@ -2270,6 +2272,7 @@ static int mes_v12_1_test_queue(struct amdgpu_device *adev, int xcc_id, remove_queue.xcc_id = xcc_id; remove_queue.doorbell_offset = doorbell_idx; remove_queue.gang_context_addr = add_queue.gang_context_addr; + remove_queue.queue_type = queue_type; r = mes_v12_1_remove_hw_queue(&adev->mes, &remove_queue); error: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index ce28a7c77704..9dc65d5fb2b3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -299,6 
+299,7 @@ static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, st memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); queue_input.doorbell_offset = q->properties.doorbell_off; queue_input.gang_context_addr = q->gang_ctx_gpu_addr; + queue_input.queue_type = convert_to_mes_queue_type(q->properties.type); queue_input.remove_queue_after_reset = flush_mes_queue; queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1; @@ -467,6 +468,7 @@ static int reset_queues_mes(struct device_queue_manager *dqm, struct queue *q) memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); queue_input.doorbell_offset = q->properties.doorbell_off; queue_input.gang_context_addr = q->gang_ctx_gpu_addr; + queue_input.queue_type = convert_to_mes_queue_type(q->properties.type); queue_input.remove_queue_after_reset = false; queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1; /* pass the known bad queue info to the reset function */ -- cgit v1.2.3