summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd/kfd_events.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_events.c')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_events.c54
1 files changed, 35 insertions, 19 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index ea3792249209..fecdb6794075 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -727,7 +727,7 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
* to process context, kfd_process could attempt to exit while we are
* running so the lookup function increments the process ref count.
*/
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
if (!p)
return; /* Presumably process exited. */
@@ -748,6 +748,16 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
uint64_t *slots = page_slots(p->signal_page);
uint32_t id;
+ /*
+ * If id is valid but slot is not signaled, GPU may signal the same event twice
+ * before driver have chance to process the first interrupt, then signal slot is
+ * auto-reset after set_event wakeup the user space, just drop the second event as
+ * the application only need wakeup once.
+ */
+ if ((valid_id_bits > 31 || (1U << valid_id_bits) >= KFD_SIGNAL_EVENT_LIMIT) &&
+ partial_id < KFD_SIGNAL_EVENT_LIMIT && slots[partial_id] == UNSIGNALED_EVENT_SLOT)
+ goto out_unlock;
+
if (valid_id_bits)
pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n",
partial_id, valid_id_bits);
@@ -776,6 +786,7 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
}
}
+out_unlock:
rcu_read_unlock();
kfd_unref_process(p);
}
@@ -1128,8 +1139,8 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
if (type == KFD_EVENT_TYPE_MEMORY) {
dev_warn(kfd_device,
- "Sending SIGSEGV to process %d (pasid 0x%x)",
- p->lead_thread->pid, p->pasid);
+ "Sending SIGSEGV to process pid %d",
+ p->lead_thread->pid);
send_sig(SIGSEGV, p->lead_thread, 0);
}
@@ -1137,13 +1148,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
if (send_signal) {
if (send_sigterm) {
dev_warn(kfd_device,
- "Sending SIGTERM to process %d (pasid 0x%x)",
- p->lead_thread->pid, p->pasid);
+ "Sending SIGTERM to process pid %d",
+ p->lead_thread->pid);
send_sig(SIGTERM, p->lead_thread, 0);
} else {
dev_err(kfd_device,
- "Process %d (pasid 0x%x) got unhandled exception",
- p->lead_thread->pid, p->pasid);
+ "Process pid %d got unhandled exception",
+ p->lead_thread->pid);
}
}
@@ -1157,7 +1168,7 @@ void kfd_signal_hw_exception_event(u32 pasid)
* to process context, kfd_process could attempt to exit while we are
* running so the lookup function increments the process ref count.
*/
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
if (!p)
return; /* Presumably process exited. */
@@ -1166,22 +1177,20 @@ void kfd_signal_hw_exception_event(u32 pasid)
kfd_unref_process(p);
}
-void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
+void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
struct kfd_vm_fault_info *info,
struct kfd_hsa_memory_exception_data *data)
{
struct kfd_event *ev;
uint32_t id;
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = pdd->process;
struct kfd_hsa_memory_exception_data memory_exception_data;
int user_gpu_id;
- if (!p)
- return; /* Presumably process exited. */
-
- user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+ user_gpu_id = kfd_process_get_user_gpu_id(p, pdd->dev->id);
if (unlikely(user_gpu_id == -EINVAL)) {
- WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+ WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n",
+ pdd->dev->id);
return;
}
@@ -1218,7 +1227,6 @@ void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
}
rcu_read_unlock();
- kfd_unref_process(p);
}
void kfd_signal_reset_event(struct kfd_node *dev)
@@ -1253,7 +1261,8 @@ void kfd_signal_reset_event(struct kfd_node *dev)
}
if (unlikely(!pdd)) {
- WARN_ONCE(1, "Could not get device data from pasid:0x%x\n", p->pasid);
+ WARN_ONCE(1, "Could not get device data from process pid:%d\n",
+ p->lead_thread->pid);
continue;
}
@@ -1262,8 +1271,15 @@ void kfd_signal_reset_event(struct kfd_node *dev)
if (dev->dqm->detect_hang_count) {
struct amdgpu_task_info *ti;
+ struct amdgpu_fpriv *drv_priv;
+
+ if (unlikely(amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv))) {
+ WARN_ONCE(1, "Could not get vm for device %x from pid:%d\n",
+ dev->id, p->lead_thread->pid);
+ continue;
+ }
- ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid);
+ ti = amdgpu_vm_get_task_info_vm(&drv_priv->vm);
if (ti) {
dev_err(dev->adev->dev,
"Queues reset on process %s tid %d thread %s pid %d\n",
@@ -1300,7 +1316,7 @@ void kfd_signal_reset_event(struct kfd_node *dev)
void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
{
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
struct kfd_hsa_memory_exception_data memory_exception_data;
struct kfd_hsa_hw_exception_data hw_exception_data;
struct kfd_event *ev;