diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 159 |
1 files changed, 76 insertions, 83 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index bd36a75309e1..44e39ce222b7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -69,8 +69,8 @@ static int find_available_queue_slot(struct process_queue_manager *pqm, pr_debug("The new slot id %lu\n", found); if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { - pr_info("Cannot open more queues for process with pasid 0x%x\n", - pqm->process->pasid); + pr_info("Cannot open more queues for process with pid %d\n", + pqm->process->lead_thread->pid); return -ENOMEM; } @@ -94,7 +94,8 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) if (dev->kfd->shared_resources.enable_mes && !!pdd->proc_ctx_gpu_addr && down_read_trylock(&dev->adev->reset_domain->sem)) { amdgpu_mes_flush_shader_debugger(dev->adev, - pdd->proc_ctx_gpu_addr); + pdd->proc_ctx_gpu_addr, + ffs(pdd->dev->xcc_mask) - 1); up_read(&dev->adev->reset_domain->sem); } pdd->already_dequeued = true; @@ -209,8 +210,8 @@ static void pqm_clean_queue_resource(struct process_queue_manager *pqm, } if (dev->kfd->shared_resources.enable_mes) { - amdgpu_amdkfd_free_gtt_mem(dev->adev, &pqn->q->gang_ctx_bo); - amdgpu_amdkfd_free_gtt_mem(dev->adev, (void **)&pqn->q->wptr_bo_gart); + amdgpu_amdkfd_free_kernel_mem(dev->adev, &pqn->q->gang_ctx_bo); + amdgpu_amdkfd_free_kernel_mem(dev->adev, (void **)&pqn->q->wptr_bo_gart); } } @@ -264,8 +265,9 @@ static int init_user_queue(struct process_queue_manager *pqm, (*q)->process = pqm->process; if (dev->kfd->shared_resources.enable_mes) { - retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, + retval = amdgpu_amdkfd_alloc_kernel_mem(dev->adev, AMDGPU_MES_GANG_CTX_SIZE, + AMDGPU_GEM_DOMAIN_GTT, &(*q)->gang_ctx_bo, &(*q)->gang_ctx_gpu_addr, &(*q)->gang_ctx_cpu_ptr, @@ -279,20 +281,17 @@ static int init_user_queue(struct process_queue_manager *pqm, /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work * on unmapped queues for usermode queue oversubscription (no aggregated doorbell) */ - if (((dev->adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK) - >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) { - if (dev->adev != amdgpu_ttm_adev(q_properties->wptr_bo->tbo.bdev)) { - pr_err("Queue memory allocated to wrong device\n"); - retval = -EINVAL; - goto free_gang_ctx_bo; - } + if (dev->adev != amdgpu_ttm_adev(q_properties->wptr_bo->tbo.bdev)) { + pr_err("Queue memory allocated to wrong device\n"); + retval = -EINVAL; + goto free_gang_ctx_bo; + } - retval = amdgpu_amdkfd_map_gtt_bo_to_gart(q_properties->wptr_bo, - &(*q)->wptr_bo_gart); - if (retval) { - pr_err("Failed to map wptr bo to GART\n"); - goto free_gang_ctx_bo; - } + retval = amdgpu_amdkfd_map_gtt_bo_to_gart(q_properties->wptr_bo, + &(*q)->wptr_bo_gart); + if (retval) { + pr_err("Failed to map wptr bo to GART\n"); + goto free_gang_ctx_bo; } } @@ -300,7 +299,7 @@ static int init_user_queue(struct process_queue_manager *pqm, return 0; free_gang_ctx_bo: - amdgpu_amdkfd_free_gtt_mem(dev->adev, &(*q)->gang_ctx_bo); + amdgpu_amdkfd_free_kernel_mem(dev->adev, &(*q)->gang_ctx_bo); cleanup: uninit_queue(*q); *q = NULL; @@ -348,7 +347,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, * If we are just about to create DIQ, the is_debug flag is not set yet * Hence we also check the type as well */ - if ((pdd->qpd.is_debug) || (type == KFD_QUEUE_TYPE_DIQ)) + if (pdd->qpd.is_debug) max_queues = dev->kfd->device_info.max_no_of_hqd/2; if (pdd->qpd.queue_count >= max_queues) @@ -363,11 +362,28 @@ int pqm_create_queue(struct process_queue_manager *pqm, if (retval != 0) return retval; + /* Register process if this is the first queue */ if (list_empty(&pdd->qpd.queues_list) && list_empty(&pdd->qpd.priv_queue_list)) dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); - pqn = kzalloc(sizeof(*pqn), GFP_KERNEL); + /* Allocate proc_ctx_bo only if MES is enabled and this is the first queue */ + if (!pdd->proc_ctx_cpu_ptr && dev->kfd->shared_resources.enable_mes) { + retval = amdgpu_amdkfd_alloc_kernel_mem(dev->adev, + AMDGPU_MES_PROC_CTX_SIZE, + AMDGPU_GEM_DOMAIN_GTT, + &pdd->proc_ctx_bo, + &pdd->proc_ctx_gpu_addr, + &pdd->proc_ctx_cpu_ptr, + false); + if (retval) { + dev_err(dev->adev->dev, "failed to allocate process context bo\n"); + return retval; + } + memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); + } + + pqn = kzalloc_obj(*pqn); if (!pqn) { retval = -ENOMEM; goto err_allocate_pqn; @@ -413,30 +429,21 @@ int pqm_create_queue(struct process_queue_manager *pqm, restore_mqd, restore_ctl_stack); print_queue(q); break; - case KFD_QUEUE_TYPE_DIQ: - kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_DIQ); - if (!kq) { - retval = -ENOMEM; - goto err_create_queue; - } - kq->queue->properties.queue_id = *qid; - pqn->kq = kq; - pqn->q = NULL; - retval = kfd_process_drain_interrupts(pdd); - if (retval) - break; - - retval = dev->dqm->ops.create_kernel_queue(dev->dqm, - kq, &pdd->qpd); - break; default: WARN(1, "Invalid queue type %d", type); retval = -EINVAL; } if (retval != 0) { - pr_err("Pasid 0x%x DQM create queue type %d failed. ret %d\n", - pqm->process->pasid, type, retval); + if ((type == KFD_QUEUE_TYPE_SDMA || + type == KFD_QUEUE_TYPE_SDMA_XGMI || + type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) && + retval == -ENOMEM) + pr_warn("process pid %d DQM create queue type %d failed. ret %d\n", + pqm->process->lead_thread->pid, type, retval); + else + pr_err("process pid %d DQM create queue type %d failed. ret %d\n", + pqm->process->lead_thread->pid, type, retval); goto err_create_queue; } @@ -530,9 +537,9 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); if (retval) { pr_err("Pasid 0x%x destroy queue %d failed, ret %d\n", - pqm->process->pasid, + pdd->pasid, pqn->q->properties.queue_id, retval); - if (retval != -ETIME) + if (retval != -ETIME && retval != -EIO) goto err_destroy_queue; } kfd_procfs_del_queue(pqn->q); @@ -583,9 +590,11 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm, return err; if (kfd_queue_buffer_get(vm, (void *)p->queue_address, &p->ring_bo, - p->queue_size)) { + p->queue_size + + pqn->q->properties.metadata_queue_size)) { pr_debug("ring buf 0x%llx size 0x%llx not mapped on GPU\n", p->queue_address, p->queue_size); + amdgpu_bo_unreserve(vm->root.bo); return -EFAULT; } @@ -652,19 +661,6 @@ int pqm_update_mqd(struct process_queue_manager *pqm, return 0; } -struct kernel_queue *pqm_get_kernel_queue( - struct process_queue_manager *pqm, - unsigned int qid) -{ - struct process_queue_node *pqn; - - pqn = get_queue_by_qid(pqm, qid); - if (pqn && pqn->kq) - return pqn->kq; - - return NULL; -} - struct queue *pqm_get_user_queue(struct process_queue_manager *pqm, unsigned int qid) { @@ -907,7 +903,10 @@ static int criu_checkpoint_queues_device(struct kfd_process_device *pdd, q_data = (struct kfd_criu_queue_priv_data *)q_private_data; - /* data stored in this order: priv_data, mqd, ctl_stack */ + /* + * data stored in this order: + * priv_data, mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0], ctl_stack[xcc1]... + */ q_data->mqd_size = mqd_size; q_data->ctl_stack_size = ctl_stack_size; @@ -956,7 +955,7 @@ int kfd_criu_checkpoint_queues(struct kfd_process *p, } static void set_queue_properties_from_criu(struct queue_properties *qp, - struct kfd_criu_queue_priv_data *q_data) + struct kfd_criu_queue_priv_data *q_data, uint32_t num_xcc) { qp->is_interop = false; qp->queue_percent = q_data->q_percent; @@ -969,7 +968,11 @@ static void set_queue_properties_from_criu(struct queue_properties *qp, qp->eop_ring_buffer_size = q_data->eop_ring_buffer_size; qp->ctx_save_restore_area_address = q_data->ctx_save_restore_area_address; qp->ctx_save_restore_area_size = q_data->ctx_save_restore_area_size; - qp->ctl_stack_size = q_data->ctl_stack_size; + if (q_data->type == KFD_QUEUE_TYPE_COMPUTE) + qp->ctl_stack_size = q_data->ctl_stack_size / num_xcc; + else + qp->ctl_stack_size = q_data->ctl_stack_size; + qp->type = q_data->type; qp->format = q_data->format; } @@ -990,7 +993,7 @@ int kfd_criu_restore_queue(struct kfd_process *p, if (*priv_data_offset + sizeof(*q_data) > max_priv_data_size) return -EINVAL; - q_data = kmalloc(sizeof(*q_data), GFP_KERNEL); + q_data = kmalloc_obj(*q_data); if (!q_data) return -ENOMEM; @@ -1029,12 +1032,15 @@ int kfd_criu_restore_queue(struct kfd_process *p, goto exit; } - /* data stored in this order: mqd, ctl_stack */ + /* + * data stored in this order: + * mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0], ctl_stack[xcc1]... + */ mqd = q_extra_data; ctl_stack = mqd + q_data->mqd_size; memset(&qp, 0, sizeof(qp)); - set_queue_properties_from_criu(&qp, q_data); + set_queue_properties_from_criu(&qp, q_data, NUM_XCC(pdd->dev->adev->gfx.xcc_mask)); print_queue_properties(&qp); @@ -1065,6 +1071,7 @@ int pqm_get_queue_checkpoint_info(struct process_queue_manager *pqm, uint32_t *ctl_stack_size) { struct process_queue_node *pqn; + int ret; pqn = get_queue_by_qid(pqm, qid); if (!pqn) { @@ -1077,9 +1084,14 @@ int pqm_get_queue_checkpoint_info(struct process_queue_manager *pqm, return -EOPNOTSUPP; } - pqn->q->device->dqm->ops.get_queue_checkpoint_info(pqn->q->device->dqm, + ret = pqn->q->device->dqm->ops.get_queue_checkpoint_info(pqn->q->device->dqm, pqn->q, mqd_size, ctl_stack_size); + if (ret) { + pr_debug("amdkfd: Overflow while computing stack size for queue %d\n", qid); + return ret; + } + return 0; } @@ -1114,32 +1126,13 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data) break; default: seq_printf(m, - " Bad user queue type %d on device %x\n", + " Queue node with bad user queue type %d on device %x\n", q->properties.type, q->device->id); continue; } mqd_mgr = q->device->dqm->mqd_mgrs[mqd_type]; size = mqd_mgr->mqd_stride(mqd_mgr, &q->properties); - } else if (pqn->kq) { - q = pqn->kq->queue; - mqd_mgr = pqn->kq->mqd_mgr; - switch (q->properties.type) { - case KFD_QUEUE_TYPE_DIQ: - seq_printf(m, " DIQ on device %x\n", - pqn->kq->dev->id); - break; - default: - seq_printf(m, - " Bad kernel queue type %d on device %x\n", - q->properties.type, - pqn->kq->dev->id); - continue; - } - } else { - seq_printf(m, - " Weird: Queue node with neither kernel nor user queue\n"); - continue; } for (xcc = 0; xcc < num_xccs; xcc++) { |
