diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_events.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_events.c | 136 |
1 files changed, 94 insertions, 42 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index d075f24e5f9f..44150a71ffd5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -67,7 +67,7 @@ static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p)
 	void *backing_store;
 	struct kfd_signal_page *page;
 
-	page = kzalloc(sizeof(*page), GFP_KERNEL);
+	page = kzalloc_obj(*page);
 	if (!page)
 		return NULL;
 
@@ -142,6 +142,7 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
  * @p: Pointer to struct kfd_process
  * @id: ID to look up
  * @bits: Number of valid bits in @id
+ * @signal_mailbox_updated: flag indicates if FW updates signal mailbox entry
  *
  * Finds the first signaled event with a matching partial ID. If no
  * matching signaled event is found, returns NULL. In that case the
@@ -155,7 +156,8 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
  * driver.
  */
 static struct kfd_event *lookup_signaled_event_by_partial_id(
-	struct kfd_process *p, uint32_t id, uint32_t bits)
+	struct kfd_process *p, uint32_t id, uint32_t bits,
+	bool signal_mailbox_updated)
 {
 	struct kfd_event *ev;
 
@@ -166,7 +168,8 @@ static struct kfd_event *lookup_signaled_event_by_partial_id(
 	 * and we only need a single lookup.
 	 */
 	if (bits > 31 || (1U << bits) >= KFD_SIGNAL_EVENT_LIMIT) {
-		if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT)
+		if (signal_mailbox_updated &&
+		    page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT)
 			return NULL;
 
 		return idr_find(&p->event_idr, id);
@@ -331,7 +334,13 @@ static int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
 	if (p->signal_page)
 		return -EBUSY;
 
-	page = kzalloc(sizeof(*page), GFP_KERNEL);
+	if (size < KFD_SIGNAL_EVENT_LIMIT * 8) {
+		pr_err("Event page size %llu is too small, need at least %lu bytes\n",
+				size, (unsigned long)(KFD_SIGNAL_EVENT_LIMIT * 8));
+		return -EINVAL;
+	}
+
+	page = kzalloc_obj(*page);
 	if (!page)
 		return -ENOMEM;
 
@@ -399,7 +408,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
 		     uint64_t *event_page_offset, uint32_t *event_slot_index)
 {
 	int ret = 0;
-	struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+	struct kfd_event *ev = kzalloc_obj(*ev);
 
 	if (!ev)
 		return -ENOMEM;
@@ -452,11 +461,11 @@ int kfd_criu_restore_event(struct file *devkfd,
 	struct kfd_event *ev = NULL;
 	int ret = 0;
 
-	ev_priv = kmalloc(sizeof(*ev_priv), GFP_KERNEL);
+	ev_priv = kmalloc_obj(*ev_priv);
 	if (!ev_priv)
 		return -ENOMEM;
 
-	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+	ev = kzalloc_obj(*ev);
 	if (!ev) {
 		ret = -ENOMEM;
 		goto exit;
@@ -718,7 +727,7 @@ static void set_event_from_interrupt(struct kfd_process *p,
 }
 
 void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
-				       uint32_t valid_id_bits)
+				       uint32_t valid_id_bits, bool signal_mailbox_updated)
 {
 	struct kfd_event *ev = NULL;
 
@@ -727,7 +736,7 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
 	 * to process context, kfd_process could attempt to exit while we are
 	 * running so the lookup function increments the process ref count.
 	 */
-	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
 
 	if (!p)
 		return; /* Presumably process exited. */
@@ -736,7 +745,8 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
 
 	if (valid_id_bits)
 		ev = lookup_signaled_event_by_partial_id(p, partial_id,
-							 valid_id_bits);
+							 valid_id_bits,
+							 signal_mailbox_updated);
 	if (ev) {
 		set_event_from_interrupt(p, ev);
 	} else if (p->signal_page) {
@@ -748,16 +758,6 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
 		uint64_t *slots = page_slots(p->signal_page);
 		uint32_t id;
 
-		/*
-		 * If id is valid but slot is not signaled, GPU may signal the same event twice
-		 * before driver have chance to process the first interrupt, then signal slot is
-		 * auto-reset after set_event wakeup the user space, just drop the second event as
-		 * the application only need wakeup once.
-		 */
-		if ((valid_id_bits > 31 || (1U << valid_id_bits) >= KFD_SIGNAL_EVENT_LIMIT) &&
-		    partial_id < KFD_SIGNAL_EVENT_LIMIT && slots[partial_id] == UNSIGNALED_EVENT_SLOT)
-			goto out_unlock;
-
 		if (valid_id_bits)
 			pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n",
 					     partial_id, valid_id_bits);
@@ -786,7 +786,6 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
 		}
 	}
 
-out_unlock:
 	rcu_read_unlock();
 	kfd_unref_process(p);
 }
@@ -796,8 +795,7 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
 	struct kfd_event_waiter *event_waiters;
 	uint32_t i;
 
-	event_waiters = kcalloc(num_events, sizeof(struct kfd_event_waiter),
-				GFP_KERNEL);
+	event_waiters = kzalloc_objs(struct kfd_event_waiter, num_events);
 	if (!event_waiters)
 		return NULL;
 
@@ -1139,8 +1137,8 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
 
 	if (type == KFD_EVENT_TYPE_MEMORY) {
 		dev_warn(kfd_device,
-			"Sending SIGSEGV to process %d (pasid 0x%x)",
-				p->lead_thread->pid, p->pasid);
+			"Sending SIGSEGV to process pid %d",
+				p->lead_thread->pid);
 		send_sig(SIGSEGV, p->lead_thread, 0);
 	}
 
@@ -1148,13 +1146,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
 	if (send_signal) {
 		if (send_sigterm) {
 			dev_warn(kfd_device,
-				"Sending SIGTERM to process %d (pasid 0x%x)",
-					p->lead_thread->pid, p->pasid);
+				"Sending SIGTERM to process pid %d",
+					p->lead_thread->pid);
 			send_sig(SIGTERM, p->lead_thread, 0);
 		} else {
 			dev_err(kfd_device,
-				"Process %d (pasid 0x%x) got unhandled exception",
-				p->lead_thread->pid, p->pasid);
+				"Process pid %d got unhandled exception",
+				p->lead_thread->pid);
 		}
 	}
 
@@ -1168,7 +1166,7 @@ void kfd_signal_hw_exception_event(u32 pasid)
 	 * to process context, kfd_process could attempt to exit while we are
 	 * running so the lookup function increments the process ref count.
 	 */
-	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
 
 	if (!p)
 		return; /* Presumably process exited. */
@@ -1177,22 +1175,39 @@ void kfd_signal_hw_exception_event(u32 pasid)
 	kfd_unref_process(p);
 }
 
-void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
+void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_hsa_memory_exception_data exception_data;
+	int i;
+
+	memset(&exception_data, 0, sizeof(exception_data));
+	exception_data.va = gpu_va;
+	exception_data.failure.NotPresent = 1;
+
+	// Send VM seg fault to all kfd process device
+	for (i = 0; i < p->n_pdds; i++) {
+		pdd = p->pdds[i];
+		exception_data.gpu_id = pdd->user_gpu_id;
+		kfd_evict_process_device(pdd);
+		kfd_signal_vm_fault_event(pdd, NULL, &exception_data);
+	}
+}
+
+void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
 				struct kfd_vm_fault_info *info,
 				struct kfd_hsa_memory_exception_data *data)
 {
 	struct kfd_event *ev;
 	uint32_t id;
-	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+	struct kfd_process *p = pdd->process;
 	struct kfd_hsa_memory_exception_data memory_exception_data;
 	int user_gpu_id;
 
-	if (!p)
-		return; /* Presumably process exited. */
-
-	user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+	user_gpu_id = kfd_process_get_user_gpu_id(p, pdd->dev->id);
 	if (unlikely(user_gpu_id == -EINVAL)) {
-		WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+		WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n",
+			  pdd->dev->id);
 		return;
 	}
 
@@ -1229,7 +1244,6 @@ void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
 	}
 
 	rcu_read_unlock();
-	kfd_unref_process(p);
 }
 
 void kfd_signal_reset_event(struct kfd_node *dev)
@@ -1264,7 +1278,8 @@ void kfd_signal_reset_event(struct kfd_node *dev)
 		}
 
 		if (unlikely(!pdd)) {
-			WARN_ONCE(1, "Could not get device data from pasid:0x%x\n", p->pasid);
+			WARN_ONCE(1, "Could not get device data from process pid:%d\n",
+				  p->lead_thread->pid);
 			continue;
 		}
 
@@ -1273,12 +1288,19 @@ void kfd_signal_reset_event(struct kfd_node *dev)
 		if (dev->dqm->detect_hang_count) {
 			struct amdgpu_task_info *ti;
+			struct amdgpu_fpriv *drv_priv;
 
-			ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid);
+			if (unlikely(amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv))) {
+				WARN_ONCE(1, "Could not get vm for device %x from pid:%d\n",
+					  dev->id, p->lead_thread->pid);
+				continue;
+			}
+
+			ti = amdgpu_vm_get_task_info_vm(&drv_priv->vm);
 			if (ti) {
 				dev_err(dev->adev->dev,
 					"Queues reset on process %s tid %d thread %s pid %d\n",
-					ti->process_name, ti->tgid, ti->task_name, ti->pid);
+					ti->process_name, ti->tgid, ti->task.comm, ti->task.pid);
 				amdgpu_vm_put_task_info(ti);
 			}
 		}
 
@@ -1311,7 +1333,7 @@ void kfd_signal_reset_event(struct kfd_node *dev)
 
 void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
 {
-	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
 	struct kfd_hsa_memory_exception_data memory_exception_data;
 	struct kfd_hsa_hw_exception_data hw_exception_data;
 	struct kfd_event *ev;
@@ -1326,6 +1348,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
 	user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
 	if (unlikely(user_gpu_id == -EINVAL)) {
 		WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+		kfd_unref_process(p);
 		return;
 	}
 
@@ -1366,3 +1389,32 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
 
 	kfd_unref_process(p);
 }
+
+/* signal KFD_EVENT_TYPE_SIGNAL events from process p
+ * send signal SIGBUS to correspondent user space process
+ */
+void kfd_signal_process_terminate_event(struct kfd_process *p)
+{
+	struct kfd_event *ev;
+	u32 id;
+
+	rcu_read_lock();
+
+	/* iterate from id 1 for KFD_EVENT_TYPE_SIGNAL events */
+	id = 1;
+	idr_for_each_entry_continue(&p->event_idr, ev, id)
+		if (ev->type == KFD_EVENT_TYPE_SIGNAL) {
+			spin_lock(&ev->lock);
+			set_event(ev);
+			spin_unlock(&ev->lock);
+		}
+
+	/* Send SIGBUS to p->lead_thread */
+	dev_notice(kfd_device,
+		"Sending SIGBUS to process %d",
+		p->lead_thread->pid);
+
+	send_sig(SIGBUS, p->lead_thread, 0);
+
+	rcu_read_unlock();
+}
