summary | refs | log | tree | commit | diff
path: root/drivers/gpu/drm/amd/amdkfd/kfd_events.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_events.c')
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 136
1 files changed, 94 insertions, 42 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index d075f24e5f9f..44150a71ffd5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -67,7 +67,7 @@ static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p)
void *backing_store;
struct kfd_signal_page *page;
- page = kzalloc(sizeof(*page), GFP_KERNEL);
+ page = kzalloc_obj(*page);
if (!page)
return NULL;
@@ -142,6 +142,7 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
* @p: Pointer to struct kfd_process
* @id: ID to look up
* @bits: Number of valid bits in @id
+ * @signal_mailbox_updated: flag indicates if FW updates signal mailbox entry
*
* Finds the first signaled event with a matching partial ID. If no
* matching signaled event is found, returns NULL. In that case the
@@ -155,7 +156,8 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
* driver.
*/
static struct kfd_event *lookup_signaled_event_by_partial_id(
- struct kfd_process *p, uint32_t id, uint32_t bits)
+ struct kfd_process *p, uint32_t id, uint32_t bits,
+ bool signal_mailbox_updated)
{
struct kfd_event *ev;
@@ -166,7 +168,8 @@ static struct kfd_event *lookup_signaled_event_by_partial_id(
* and we only need a single lookup.
*/
if (bits > 31 || (1U << bits) >= KFD_SIGNAL_EVENT_LIMIT) {
- if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT)
+ if (signal_mailbox_updated &&
+ page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT)
return NULL;
return idr_find(&p->event_idr, id);
@@ -331,7 +334,13 @@ static int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
if (p->signal_page)
return -EBUSY;
- page = kzalloc(sizeof(*page), GFP_KERNEL);
+ if (size < KFD_SIGNAL_EVENT_LIMIT * 8) {
+ pr_err("Event page size %llu is too small, need at least %lu bytes\n",
+ size, (unsigned long)(KFD_SIGNAL_EVENT_LIMIT * 8));
+ return -EINVAL;
+ }
+
+ page = kzalloc_obj(*page);
if (!page)
return -ENOMEM;
@@ -399,7 +408,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
uint64_t *event_page_offset, uint32_t *event_slot_index)
{
int ret = 0;
- struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+ struct kfd_event *ev = kzalloc_obj(*ev);
if (!ev)
return -ENOMEM;
@@ -452,11 +461,11 @@ int kfd_criu_restore_event(struct file *devkfd,
struct kfd_event *ev = NULL;
int ret = 0;
- ev_priv = kmalloc(sizeof(*ev_priv), GFP_KERNEL);
+ ev_priv = kmalloc_obj(*ev_priv);
if (!ev_priv)
return -ENOMEM;
- ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+ ev = kzalloc_obj(*ev);
if (!ev) {
ret = -ENOMEM;
goto exit;
@@ -718,7 +727,7 @@ static void set_event_from_interrupt(struct kfd_process *p,
}
void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
- uint32_t valid_id_bits)
+ uint32_t valid_id_bits, bool signal_mailbox_updated)
{
struct kfd_event *ev = NULL;
@@ -727,7 +736,7 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
* to process context, kfd_process could attempt to exit while we are
* running so the lookup function increments the process ref count.
*/
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
if (!p)
return; /* Presumably process exited. */
@@ -736,7 +745,8 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
if (valid_id_bits)
ev = lookup_signaled_event_by_partial_id(p, partial_id,
- valid_id_bits);
+ valid_id_bits,
+ signal_mailbox_updated);
if (ev) {
set_event_from_interrupt(p, ev);
} else if (p->signal_page) {
@@ -748,16 +758,6 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
uint64_t *slots = page_slots(p->signal_page);
uint32_t id;
- /*
- * If id is valid but slot is not signaled, GPU may signal the same event twice
- * before driver have chance to process the first interrupt, then signal slot is
- * auto-reset after set_event wakeup the user space, just drop the second event as
- * the application only need wakeup once.
- */
- if ((valid_id_bits > 31 || (1U << valid_id_bits) >= KFD_SIGNAL_EVENT_LIMIT) &&
- partial_id < KFD_SIGNAL_EVENT_LIMIT && slots[partial_id] == UNSIGNALED_EVENT_SLOT)
- goto out_unlock;
-
if (valid_id_bits)
pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n",
partial_id, valid_id_bits);
@@ -786,7 +786,6 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
}
}
-out_unlock:
rcu_read_unlock();
kfd_unref_process(p);
}
@@ -796,8 +795,7 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
struct kfd_event_waiter *event_waiters;
uint32_t i;
- event_waiters = kcalloc(num_events, sizeof(struct kfd_event_waiter),
- GFP_KERNEL);
+ event_waiters = kzalloc_objs(struct kfd_event_waiter, num_events);
if (!event_waiters)
return NULL;
@@ -1139,8 +1137,8 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
if (type == KFD_EVENT_TYPE_MEMORY) {
dev_warn(kfd_device,
- "Sending SIGSEGV to process %d (pasid 0x%x)",
- p->lead_thread->pid, p->pasid);
+ "Sending SIGSEGV to process pid %d",
+ p->lead_thread->pid);
send_sig(SIGSEGV, p->lead_thread, 0);
}
@@ -1148,13 +1146,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
if (send_signal) {
if (send_sigterm) {
dev_warn(kfd_device,
- "Sending SIGTERM to process %d (pasid 0x%x)",
- p->lead_thread->pid, p->pasid);
+ "Sending SIGTERM to process pid %d",
+ p->lead_thread->pid);
send_sig(SIGTERM, p->lead_thread, 0);
} else {
dev_err(kfd_device,
- "Process %d (pasid 0x%x) got unhandled exception",
- p->lead_thread->pid, p->pasid);
+ "Process pid %d got unhandled exception",
+ p->lead_thread->pid);
}
}
@@ -1168,7 +1166,7 @@ void kfd_signal_hw_exception_event(u32 pasid)
* to process context, kfd_process could attempt to exit while we are
* running so the lookup function increments the process ref count.
*/
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
if (!p)
return; /* Presumably process exited. */
@@ -1177,22 +1175,39 @@ void kfd_signal_hw_exception_event(u32 pasid)
kfd_unref_process(p);
}
-void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
+void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va)
+{ /* Evicts every process device of @p and signals a VM fault for @gpu_va on each. */
+ struct kfd_process_device *pdd;
+ struct kfd_hsa_memory_exception_data exception_data;
+ int i;
+
+ memset(&exception_data, 0, sizeof(exception_data)); /* all fields not set below stay zero */
+ exception_data.va = gpu_va;
+ exception_data.failure.NotPresent = 1; /* mark the failure as NotPresent */
+
+ // Evict each of the process's devices (p->pdds[0..n_pdds-1]) and signal the VM fault on it
+ for (i = 0; i < p->n_pdds; i++) {
+ pdd = p->pdds[i];
+ exception_data.gpu_id = pdd->user_gpu_id; /* the only field that differs per device */
+ kfd_evict_process_device(pdd);
+ kfd_signal_vm_fault_event(pdd, NULL, &exception_data); /* NULL: no kfd_vm_fault_info, the pre-built exception data is passed instead */
+ }
+}
+
+void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
struct kfd_vm_fault_info *info,
struct kfd_hsa_memory_exception_data *data)
{
struct kfd_event *ev;
uint32_t id;
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = pdd->process;
struct kfd_hsa_memory_exception_data memory_exception_data;
int user_gpu_id;
- if (!p)
- return; /* Presumably process exited. */
-
- user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+ user_gpu_id = kfd_process_get_user_gpu_id(p, pdd->dev->id);
if (unlikely(user_gpu_id == -EINVAL)) {
- WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+ WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n",
+ pdd->dev->id);
return;
}
@@ -1229,7 +1244,6 @@ void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
}
rcu_read_unlock();
- kfd_unref_process(p);
}
void kfd_signal_reset_event(struct kfd_node *dev)
@@ -1264,7 +1278,8 @@ void kfd_signal_reset_event(struct kfd_node *dev)
}
if (unlikely(!pdd)) {
- WARN_ONCE(1, "Could not get device data from pasid:0x%x\n", p->pasid);
+ WARN_ONCE(1, "Could not get device data from process pid:%d\n",
+ p->lead_thread->pid);
continue;
}
@@ -1273,12 +1288,19 @@ void kfd_signal_reset_event(struct kfd_node *dev)
if (dev->dqm->detect_hang_count) {
struct amdgpu_task_info *ti;
+ struct amdgpu_fpriv *drv_priv;
- ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid);
+ if (unlikely(amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv))) {
+ WARN_ONCE(1, "Could not get vm for device %x from pid:%d\n",
+ dev->id, p->lead_thread->pid);
+ continue;
+ }
+
+ ti = amdgpu_vm_get_task_info_vm(&drv_priv->vm);
if (ti) {
dev_err(dev->adev->dev,
"Queues reset on process %s tid %d thread %s pid %d\n",
- ti->process_name, ti->tgid, ti->task_name, ti->pid);
+ ti->process_name, ti->tgid, ti->task.comm, ti->task.pid);
amdgpu_vm_put_task_info(ti);
}
}
@@ -1311,7 +1333,7 @@ void kfd_signal_reset_event(struct kfd_node *dev)
void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
{
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
struct kfd_hsa_memory_exception_data memory_exception_data;
struct kfd_hsa_hw_exception_data hw_exception_data;
struct kfd_event *ev;
@@ -1326,6 +1348,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) {
WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+ kfd_unref_process(p);
return;
}
@@ -1366,3 +1389,32 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
kfd_unref_process(p);
}
+
+/* Signal every KFD_EVENT_TYPE_SIGNAL event in process @p's event IDR (scanning IDs from 1 upward),
+ * then log a notice and send SIGBUS to the corresponding user-space process via p->lead_thread.
+ */
+void kfd_signal_process_terminate_event(struct kfd_process *p)
+{
+ struct kfd_event *ev;
+ u32 id;
+
+ rcu_read_lock();
+
+ /* Start the IDR walk at id 1; only KFD_EVENT_TYPE_SIGNAL entries are acted on */
+ id = 1;
+ idr_for_each_entry_continue(&p->event_idr, ev, id)
+ if (ev->type == KFD_EVENT_TYPE_SIGNAL) {
+ spin_lock(&ev->lock); /* set_event() is called with the per-event lock held */
+ set_event(ev);
+ spin_unlock(&ev->lock);
+ }
+
+ /* Log the target pid, then send SIGBUS to p->lead_thread */
+ dev_notice(kfd_device,
+ "Sending SIGBUS to process %d",
+ p->lead_thread->pid);
+
+ send_sig(SIGBUS, p->lead_thread, 0);
+
+ rcu_read_unlock();
+}