diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
55 files changed, 5799 insertions, 2087 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig index d3c3d3ab7225..a5d7467c2f34 100644 --- a/drivers/gpu/drm/amd/amdkfd/Kconfig +++ b/drivers/gpu/drm/amd/amdkfd/Kconfig @@ -5,7 +5,7 @@ config HSA_AMD bool "HSA kernel driver for AMD GPU devices" - depends on DRM_AMDGPU && (X86_64 || ARM64 || PPC64) + depends on DRM_AMDGPU && (X86_64 || ARM64 || PPC64 || (RISCV && 64BIT) || (LOONGARCH && 64BIT)) select HMM_MIRROR select MMU_NOTIFIER select DRM_AMDGPU_USERPTR @@ -27,7 +27,7 @@ config HSA_AMD_SVM config HSA_AMD_P2P bool "HSA kernel driver support for peer-to-peer for AMD GPU devices" - depends on HSA_AMD && PCI_P2PDMA && DMABUF_MOVE_NOTIFY + depends on HSA_AMD && PCI_P2PDMA help Enable peer-to-peer (P2P) communication between AMD GPUs over the PCIe bus. This can improve performance of multi-GPU compute diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 0d3d8972240d..85fc67d521e5 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -27,7 +27,6 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_device.o \ $(AMDKFD_PATH)/kfd_chardev.o \ $(AMDKFD_PATH)/kfd_topology.o \ - $(AMDKFD_PATH)/kfd_pasid.o \ $(AMDKFD_PATH)/kfd_doorbell.o \ $(AMDKFD_PATH)/kfd_flat_memory.o \ $(AMDKFD_PATH)/kfd_process.o \ @@ -39,6 +38,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_mqd_manager_v10.o \ $(AMDKFD_PATH)/kfd_mqd_manager_v11.o \ $(AMDKFD_PATH)/kfd_mqd_manager_v12.o \ + $(AMDKFD_PATH)/kfd_mqd_manager_v12_1.o \ $(AMDKFD_PATH)/kfd_kernel_queue.o \ $(AMDKFD_PATH)/kfd_packet_manager.o \ $(AMDKFD_PATH)/kfd_packet_manager_vi.o \ @@ -51,12 +51,14 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_v10.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_v11.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_v12.o \ + $(AMDKFD_PATH)/kfd_device_queue_manager_v12_1.o \ $(AMDKFD_PATH)/kfd_interrupt.o \ 
$(AMDKFD_PATH)/kfd_events.o \ $(AMDKFD_PATH)/cik_event_interrupt.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_int_process_v10.o \ $(AMDKFD_PATH)/kfd_int_process_v11.o \ + $(AMDKFD_PATH)/kfd_int_process_v12_1.o \ $(AMDKFD_PATH)/kfd_smi_events.o \ $(AMDKFD_PATH)/kfd_crat.o \ $(AMDKFD_PATH)/kfd_debug.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 795382b55e0a..b799c70f5742 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -91,36 +91,41 @@ static void cik_event_interrupt_wq(struct kfd_node *dev, const struct cik_ih_ring_entry *ihre = (const struct cik_ih_ring_entry *)ih_ring_entry; uint32_t context_id = ihre->data & 0xfffffff; - unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8; u32 pasid = (ihre->ring_id & 0xffff0000) >> 16; if (pasid == 0) return; if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) - kfd_signal_event_interrupt(pasid, context_id, 28); + kfd_signal_event_interrupt(pasid, context_id, 28, true); else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP) - kfd_signal_event_interrupt(pasid, context_id, 28); + kfd_signal_event_interrupt(pasid, context_id, 28, true); else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) - kfd_signal_event_interrupt(pasid, context_id & 0xff, 8); + kfd_signal_event_interrupt(pasid, context_id & 0xff, 8, true); else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) kfd_signal_hw_exception_event(pasid); else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { + struct kfd_process_device *pdd = NULL; struct kfd_vm_fault_info info; + struct kfd_process *p; kfd_smi_event_update_vmfault(dev, pasid); - kfd_dqm_evict_pasid(dev->dqm, pasid); + p = kfd_lookup_process_by_pasid(pasid, &pdd); + if (!pdd) + return; + + kfd_evict_process_device(pdd); memset(&info, 0, sizeof(info)); amdgpu_amdkfd_gpuvm_get_vm_fault_info(dev->adev, 
&info); - if (!info.page_addr && !info.status) + if (!info.page_addr && !info.status) { + kfd_unref_process(p); return; + } - if (info.vmid == vmid) - kfd_signal_vm_fault_event(dev, pasid, &info, NULL); - else - kfd_signal_vm_fault_event(dev, pasid, NULL, NULL); + kfd_signal_vm_fault_event(pdd, &info, NULL); + kfd_unref_process(p); } } diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 651660958e5b..54fa76f374c9 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -3644,14 +3644,18 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { }; static const uint32_t cwsr_trap_gfx12_hex[] = { - 0xbfa00001, 0xbfa0024b, - 0xb0804009, 0xb8f8f804, + 0xbfa00001, 0xbfa00239, + 0xb0804009, 0xb8eef81a, + 0xbf880000, 0xb980081a, + 0x00000000, 0xb8f8f804, + 0x9177ff77, 0x0c000000, + 0x846e9a6e, 0x8c776e77, 0x9178ff78, 0x00008c00, 0xb8fbf811, 0x8b6eff78, 0x00004000, 0xbfa10008, 0x8b6eff7b, 0x00000080, 0xbfa20018, 0x8b6ea07b, - 0xbfa20042, 0xbf830010, + 0xbfa2004a, 0xbf830010, 0xb8fbf811, 0xbfa0fffb, 0x8b6eff7b, 0x00000bd0, 0xbfa20010, 0xb8eef812, @@ -3662,28 +3666,32 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0xf0000000, 0xbfa20005, 0x8b6fff6f, 0x00000200, 0xbfa20002, 0x8b6ea07b, - 0xbfa2002c, 0xbefa4d82, + 0xbfa20034, 0xbefa4d82, 0xbf8a0000, 0x84fa887a, 0xbf0d8f7b, 0xbfa10002, 0x8c7bff7b, 0xffff0000, - 0xf4601bbd, 0xf8000010, - 0xbf8a0000, 0x846e976e, - 0x9177ff77, 0x00800000, - 0x8c776e77, 0xf4603bbd, - 0xf8000000, 0xbf8a0000, - 0xf4603ebd, 0xf8000008, - 0xbf8a0000, 0x8bee6e6e, - 0xbfa10001, 0xbe80486e, - 0x8b6eff6d, 0xf0000000, - 0xbfa20009, 0xb8eef811, - 0x8b6eff6e, 0x00000080, - 0xbfa20007, 0x8c78ff78, - 0x00004000, 0x80ec886c, - 0x82ed806d, 0xbfa00002, - 0x806c846c, 0x826d806d, - 0x8b6dff6d, 0x0000ffff, - 0x8bfe7e7e, 0x8bea6a6a, - 0x85788978, 0xb9783244, + 0x8b6eff77, 0x0c000000, + 0x916dff6d, 0x0c000000, + 0x8c6d6e6d, 0xf4601bbd, + 0xf8000010, 
0xbf8a0000, + 0x846e976e, 0x9177ff77, + 0x00800000, 0x8c776e77, + 0xf4603bbd, 0xf8000000, + 0xbf8a0000, 0xf4603ebd, + 0xf8000008, 0xbf8a0000, + 0x8bee6e6e, 0xbfa10001, + 0xbe80486e, 0x8b6eff6d, + 0xf0000000, 0xbfa20009, + 0xb8eef811, 0x8b6eff6e, + 0x00000080, 0xbfa20007, + 0x8c78ff78, 0x00004000, + 0x80ec886c, 0x82ed806d, + 0xbfa00002, 0x806c846c, + 0x826d806d, 0x8b6dff6d, + 0x0000ffff, 0x8bfe7e7e, + 0x8bea6a6a, 0x85788978, + 0x936eff77, 0x0002001a, + 0xb96ef81a, 0xb9783244, 0xbe804a6c, 0xb8faf802, 0xbf0d987a, 0xbfa10001, 0xbfb00000, 0x8b6dff6d, @@ -3703,66 +3711,57 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0x807a817a, 0xbf0d997b, 0xbfa20002, 0x847a897a, 0xbfa00001, 0x847a8a7a, - 0xb8fb1e06, 0x847b8a7b, - 0x807a7b7a, 0x8b7bff7f, - 0x0000ffff, 0x807aff7a, - 0x00000200, 0x807a7e7a, - 0x827b807b, 0xd7610000, - 0x00010870, 0xd7610000, - 0x00010a71, 0xd7610000, - 0x00010c72, 0xd7610000, - 0x00010e73, 0xd7610000, - 0x00011074, 0xd7610000, - 0x00011275, 0xd7610000, - 0x00011476, 0xd7610000, - 0x00011677, 0xd7610000, - 0x00011a79, 0xd7610000, - 0x00011c7e, 0xd7610000, - 0x00011e7f, 0xbefe00ff, - 0x00003fff, 0xbeff0080, - 0xee0a407a, 0x000c0000, - 0x00004000, 0xd760007a, - 0x00011d00, 0xd760007b, - 0x00011f00, 0xbefe007a, - 0xbeff007b, 0xbef4007e, - 0x8b75ff7f, 0x0000ffff, - 0x8c75ff75, 0x00040000, - 0xbef60080, 0xbef700ff, - 0x10807fac, 0xbef1007d, - 0xbef00080, 0xb8f30742, - 0x84739973, 0xbefe00c1, - 0x857d9973, 0x8b7d817d, - 0xbf06817d, 0xbfa20002, - 0xbeff0080, 0xbfa00002, - 0xbeff00c1, 0xbfa0000c, - 0xbef600ff, 0x01000000, - 0xc4068070, 0x008ce801, - 0x00008000, 0xc4068070, - 0x008ce802, 0x00010000, - 0xc4068070, 0x008ce803, - 0x00018000, 0xbfa0000b, - 0xbef600ff, 0x01000000, - 0xc4068070, 0x008ce801, - 0x00010000, 0xc4068070, - 0x008ce802, 0x00020000, - 0xc4068070, 0x008ce803, - 0x00030000, 0xb8f03b05, - 0x80708170, 0xbf0d9973, - 0xbfa20002, 0x84708970, - 0xbfa00001, 0x84708a70, - 0xb8fa1e06, 0x847a8a7a, - 0x80707a70, 0x8070ff70, - 0x00000200, 0xbef600ff, - 
0x01000000, 0x7e000280, + 0x8b7bff7f, 0x0000ffff, + 0x807aff7a, 0x00000240, + 0x807a7e7a, 0x827b807b, + 0xd7610000, 0x00010870, + 0xd7610000, 0x00010a71, + 0xd7610000, 0x00010c72, + 0xd7610000, 0x00010e73, + 0xd7610000, 0x00011074, + 0xd7610000, 0x00011275, + 0xd7610000, 0x00011476, + 0xd7610000, 0x00011677, + 0xd7610000, 0x00011a79, + 0xd7610000, 0x00011c7e, + 0xd7610000, 0x00011e7f, + 0xbefe00ff, 0x00003fff, + 0xbeff0080, 0xee0a407a, + 0x000c0000, 0x00000000, + 0xd760007a, 0x00011d00, + 0xd760007b, 0x00011f00, + 0xbefe007a, 0xbeff007b, + 0xbef4007e, 0x8b75ff7f, + 0x0000ffff, 0xbef1007d, + 0xb8f30742, 0x84739973, + 0xbefe00c1, 0x857d9973, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00002, 0xbeff00c1, + 0xbfa0000a, 0xee0a4074, + 0x008c0000, 0x00008000, + 0xee0a4074, 0x010c0000, + 0x00010000, 0xee0a4074, + 0x018c0000, 0x00018000, + 0xbfa00009, 0xee0a4074, + 0x008c0000, 0x00010000, + 0xee0a4074, 0x010c0000, + 0x00020000, 0xee0a4074, + 0x018c0000, 0x00030000, + 0xb8f03b05, 0x80708170, + 0xbf0d9973, 0xbfa20002, + 0x84708970, 0xbfa00001, + 0x84708a70, 0x8070ff70, + 0x00000200, 0x7e000280, 0x7e020280, 0x7e040280, - 0xbefd0080, 0xbe804ec2, - 0xbf94fffe, 0xb8faf804, - 0x8b7a847a, 0x91788478, - 0x8c787a78, 0xd7610002, + 0xbefd0080, 0xd7610002, 0x0000fa71, 0x807d817d, + 0xbe804ec2, 0xbf94fffe, + 0xb8faf804, 0x8b7a847a, + 0x91788478, 0x8c787a78, 0xd7610002, 0x0000fa6c, - 0x807d817d, 0x917aff6d, - 0x80000000, 0xd7610002, + 0x807d817d, 0x8b7aff6d, + 0x0000ffff, 0xd7610002, 0x0000fa7a, 0x807d817d, 0xd7610002, 0x0000fa6e, 0x807d817d, 0xd7610002, @@ -3770,32 +3769,31 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0xd7610002, 0x0000fa78, 0x807d817d, 0xb8faf811, 0xd7610002, 0x0000fa7a, - 0x807d817d, 0xd7610002, - 0x0000fa7b, 0x807d817d, - 0xb8f1f801, 0xd7610002, - 0x0000fa71, 0x807d817d, - 0xb8f1f814, 0xd7610002, - 0x0000fa71, 0x807d817d, - 0xb8f1f815, 0xd7610002, - 0x0000fa71, 0x807d817d, - 0xb8f1f812, 0xd7610002, - 0x0000fa71, 0x807d817d, - 0xb8f1f813, 
0xd7610002, - 0x0000fa71, 0x807d817d, - 0xb8faf802, 0xd7610002, - 0x0000fa7a, 0x807d817d, - 0xbefa50c1, 0xbfc70000, + 0x807d817d, 0xbefa0080, 0xd7610002, 0x0000fa7a, - 0x807d817d, 0xbefe00ff, - 0x0000ffff, 0xbeff0080, - 0xc4068070, 0x008ce802, - 0x00000000, 0xbefe00c1, - 0xb8f03b05, 0x80708170, - 0xbf0d9973, 0xbfa20002, - 0x84708970, 0xbfa00001, - 0x84708a70, 0xb8fa1e06, - 0x847a8a7a, 0x80707a70, - 0xbef600ff, 0x01000000, + 0x807d817d, 0xb8f1f801, + 0xd7610002, 0x0000fa71, + 0x807d817d, 0xb8f1f814, + 0xd7610002, 0x0000fa71, + 0x807d817d, 0xb8f1f815, + 0xd7610002, 0x0000fa71, + 0x807d817d, 0xb8f1f812, + 0xd7610002, 0x0000fa71, + 0x807d817d, 0xb8f1f813, + 0xd7610002, 0x0000fa71, + 0x807d817d, 0xb8faf802, + 0xd7610002, 0x0000fa7a, + 0x807d817d, 0xbefa50c1, + 0xbfc70000, 0xd7610002, + 0x0000fa7a, 0x807d817d, + 0xbefe00ff, 0x0000ffff, + 0xbeff0080, 0x80767074, + 0x82778075, 0xee0a4076, + 0x010c0000, 0x00000000, + 0xbefe00c1, 0xb8f03b05, + 0x80708170, 0xbf0d9973, + 0xbfa20002, 0x84708970, + 0xbfa00001, 0x84708a70, 0xbef90080, 0xbefd0080, 0xbf800000, 0xbe804100, 0xbe824102, 0xbe844104, @@ -3826,12 +3824,13 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0x0000f20e, 0x80798179, 0xd7610002, 0x0000f20f, 0x80798179, 0xbf06a079, - 0xbfa10007, 0xc4068070, - 0x008ce802, 0x00000000, + 0xbfa10009, 0x80767074, + 0x82778075, 0xee0a4076, + 0x010c0000, 0x00000000, 0x8070ff70, 0x00000080, 0xbef90080, 0x7e040280, 0x807d907d, 0xbf0aff7d, - 0x00000060, 0xbfa2ffbb, + 0x00000060, 0xbfa2ffb9, 0xbe804100, 0xbe824102, 0xbe844104, 0xbe864106, 0xbe884108, 0xbe8a410a, @@ -3853,47 +3852,47 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0xd7610002, 0x0000f20a, 0x80798179, 0xd7610002, 0x0000f20b, 0x80798179, - 0xc4068070, 0x008ce802, + 0x80767074, 0x82778075, + 0xee0a4076, 0x010c0000, 0x00000000, 0xbefe00c1, 0x857d9973, 0x8b7d817d, 0xbf06817d, 0xbfa20002, 0xbeff0080, 0xbfa00001, 0xbeff00c1, 0xb8fb4306, - 0x8b7bc17b, 0xbfa10044, + 0x8b7bc17b, 0xbfa10042, 0x8b7aff6d, 0x80000000, - 0xbfa10041, 
0x847b897b, - 0xbef6007b, 0xb8f03b05, - 0x80708170, 0xbf0d9973, - 0xbfa20002, 0x84708970, - 0xbfa00001, 0x84708a70, - 0xb8fa1e06, 0x847a8a7a, - 0x80707a70, 0x8070ff70, + 0xbfa1003f, 0x847b897b, + 0xb8f03b05, 0x80708170, + 0xbf0d9973, 0xbfa20002, + 0x84708970, 0xbfa00001, + 0x84708a70, 0x8070ff70, 0x00000200, 0x8070ff70, - 0x00000080, 0xbef600ff, - 0x01000000, 0xd71f0000, + 0x00000080, 0xd71f0000, 0x000100c1, 0xd7200000, 0x000200c1, 0x16000084, 0x857d9973, 0x8b7d817d, 0xbf06817d, 0xbefd0080, - 0xbfa20013, 0xbe8300ff, + 0xbfa20015, 0xbe8300ff, 0x00000080, 0xbf800000, 0xbf800000, 0xbf800000, 0xd8d80000, 0x01000000, - 0xbf8a0000, 0xc4068070, - 0x008ce801, 0x00000000, + 0xbf8a0000, 0x80767074, + 0x82778075, 0xee0a4076, + 0x008c0000, 0x00000000, 0x807d037d, 0x80700370, 0xd5250000, 0x0001ff00, 0x00000080, 0xbf0a7b7d, - 0xbfa2fff3, 0xbfa00012, + 0xbfa2fff1, 0xbfa00014, 0xbe8300ff, 0x00000100, 0xbf800000, 0xbf800000, 0xbf800000, 0xd8d80000, 0x01000000, 0xbf8a0000, - 0xc4068070, 0x008ce801, + 0x80767074, 0x82778075, + 0xee0a4076, 0x008c0000, 0x00000000, 0x807d037d, 0x80700370, 0xd5250000, 0x0001ff00, 0x00000100, - 0xbf0a7b7d, 0xbfa2fff3, + 0xbf0a7b7d, 0xbfa2fff1, 0xbefe00c1, 0x857d9973, 0x8b7d817d, 0xbf06817d, 0xbfa20004, 0xbef000ff, @@ -3903,78 +3902,72 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0xb8fb3b05, 0x807b817b, 0x847b827b, 0x857d9973, 0x8b7d817d, 0xbf06817d, - 0xbfa2001b, 0xbef600ff, - 0x01000000, 0xbefd0084, - 0xbf0a7b7d, 0xbfa10040, + 0xbfa2001b, 0xbefd0084, + 0xbf0a7b7d, 0xbfa10032, 0x7e008700, 0x7e028701, 0x7e048702, 0x7e068703, - 0xc4068070, 0x008ce800, - 0x00000000, 0xc4068070, - 0x008ce801, 0x00008000, - 0xc4068070, 0x008ce802, - 0x00010000, 0xc4068070, - 0x008ce803, 0x00018000, + 0x80767074, 0x82778075, + 0xee0a4076, 0x000c0000, + 0x00000000, 0xee0a4076, + 0x008c0000, 0x00008000, + 0xee0a4076, 0x010c0000, + 0x00010000, 0xee0a4076, + 0x018c0000, 0x00018000, 0x807d847d, 0x8070ff70, 0x00000200, 0xbf0a7b7d, - 0xbfa2ffeb, 0xbfa0002a, - 0xbef600ff, 
0x01000000, + 0xbfa2ffe9, 0xbfa0001a, 0xbefd0084, 0xbf0a7b7d, - 0xbfa10015, 0x7e008700, + 0xbfa10017, 0x7e008700, 0x7e028701, 0x7e048702, - 0x7e068703, 0xc4068070, - 0x008ce800, 0x00000000, - 0xc4068070, 0x008ce801, - 0x00010000, 0xc4068070, - 0x008ce802, 0x00020000, - 0xc4068070, 0x008ce803, + 0x7e068703, 0x80767074, + 0x82778075, 0xee0a4076, + 0x000c0000, 0x00000000, + 0xee0a4076, 0x008c0000, + 0x00010000, 0xee0a4076, + 0x010c0000, 0x00020000, + 0xee0a4076, 0x018c0000, 0x00030000, 0x807d847d, 0x8070ff70, 0x00000400, - 0xbf0a7b7d, 0xbfa2ffeb, - 0xb8fb1e06, 0x8b7bc17b, - 0xbfa1000d, 0x847b837b, - 0x807b7d7b, 0xbefe00c1, - 0xbeff0080, 0x7e008700, - 0xc4068070, 0x008ce800, - 0x00000000, 0x807d817d, - 0x8070ff70, 0x00000080, - 0xbf0a7b7d, 0xbfa2fff7, - 0xbfa0016e, 0xbef4007e, + 0xbf0a7b7d, 0xbfa2ffe9, + 0xbfa0014c, 0xbef4007e, 0x8b75ff7f, 0x0000ffff, - 0x8c75ff75, 0x00040000, - 0xbef60080, 0xbef700ff, - 0x10807fac, 0xbef1007f, - 0xb8f20742, 0x84729972, - 0x8b6eff7f, 0x04000000, - 0xbfa1003b, 0xbefe00c1, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbfa20002, - 0xbeff0080, 0xbfa00001, - 0xbeff00c1, 0xb8ef4306, - 0x8b6fc16f, 0xbfa10030, - 0x846f896f, 0xbef6006f, + 0xbef1007f, 0xb8f20742, + 0x84729972, 0x8b6eff7f, + 0x04000000, 0xbfa10044, + 0xbefe00c1, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8ef4306, 0x8b6fc16f, + 0xbfa10039, 0x846f896f, 0xb8f83b05, 0x80788178, 0xbf0d9972, 0xbfa20002, 0x84788978, 0xbfa00001, - 0x84788a78, 0xb8ee1e06, - 0x846e8a6e, 0x80786e78, - 0x8078ff78, 0x00000200, + 0x84788a78, 0x8078ff78, + 0x00000200, 0x8078ff78, + 0x00000080, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbefd0080, 0xd71f0001, + 0x000100c1, 0xd7200001, + 0x000202c1, 0x30020282, + 0xbfa20012, 0x80767874, + 0x82778075, 0xee0a0076, + 0x000c0000, 0x00000000, + 0xbf8a0000, 0xd8340000, + 0x00000001, 0xd5250001, + 0x0001ff01, 0x00000080, + 0x807dff7d, 0x00000080, 0x8078ff78, 0x00000080, - 0xbef600ff, 0x01000000, - 0x857d9972, 0x8b7d817d, - 
0xbf06817d, 0xbefd0080, - 0xbfa2000d, 0xc4050078, - 0x0080e800, 0x00000000, - 0xbf8a0000, 0xdac00000, - 0x00000000, 0x807dff7d, - 0x00000080, 0x8078ff78, - 0x00000080, 0xbf0a6f7d, - 0xbfa2fff4, 0xbfa0000c, - 0xc4050078, 0x0080e800, - 0x00000000, 0xbf8a0000, - 0xdac00000, 0x00000000, + 0xbf0a6f7d, 0xbfa2ffef, + 0xbfa00011, 0x80767874, + 0x82778075, 0xee0a0076, + 0x000c0000, 0x00000000, + 0xbf8a0000, 0xd8340000, + 0x00000001, 0xd5250001, + 0x0001ff01, 0x00000100, 0x807dff7d, 0x00000100, 0x8078ff78, 0x00000100, - 0xbf0a6f7d, 0xbfa2fff4, + 0xbf0a6f7d, 0xbfa2ffef, 0xbef80080, 0xbefe00c1, 0x857d9972, 0x8b7d817d, 0xbf06817d, 0xbfa20002, @@ -3983,120 +3976,102 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0x806f816f, 0x846f826f, 0x857d9972, 0x8b7d817d, 0xbf06817d, 0xbfa2002c, - 0xbef600ff, 0x01000000, 0xbeee0078, 0x8078ff78, 0x00000200, 0xbefd0084, - 0xbf0a6f7d, 0xbfa10061, - 0xc4050078, 0x008ce800, - 0x00000000, 0xc4050078, - 0x008ce801, 0x00008000, - 0xc4050078, 0x008ce802, - 0x00010000, 0xc4050078, - 0x008ce803, 0x00018000, + 0x80767874, 0x82778075, + 0xee0a0076, 0x000c0000, + 0x00000000, 0xee0a0076, + 0x000c0001, 0x00008000, + 0xee0a0076, 0x000c0002, + 0x00010000, 0xee0a0076, + 0x000c0003, 0x00018000, 0xbf8a0000, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807d847d, 0x8078ff78, 0x00000200, - 0xbf0a6f7d, 0xbfa2ffea, - 0xc405006e, 0x008ce800, - 0x00000000, 0xc405006e, - 0x008ce801, 0x00008000, - 0xc405006e, 0x008ce802, - 0x00010000, 0xc405006e, - 0x008ce803, 0x00018000, - 0xbf8a0000, 0xbfa0003d, - 0xbef600ff, 0x01000000, + 0xbf0a6f7d, 0xbfa2ffe8, + 0x80766e74, 0x82778075, + 0xee0a0076, 0x000c0000, + 0x00000000, 0xee0a0076, + 0x000c0001, 0x00008000, + 0xee0a0076, 0x000c0002, + 0x00010000, 0xee0a0076, + 0x000c0003, 0x00018000, + 0xbf8a0000, 0xbfa0002d, 0xbeee0078, 0x8078ff78, 0x00000400, 0xbefd0084, - 0xbf0a6f7d, 0xbfa10016, - 0xc4050078, 0x008ce800, - 0x00000000, 0xc4050078, - 0x008ce801, 0x00010000, - 0xc4050078, 0x008ce802, - 0x00020000, 0xc4050078, - 
0x008ce803, 0x00030000, + 0xbf0a6f7d, 0xbfa10018, + 0x80767874, 0x82778075, + 0xee0a0076, 0x000c0000, + 0x00000000, 0xee0a0076, + 0x000c0001, 0x00010000, + 0xee0a0076, 0x000c0002, + 0x00020000, 0xee0a0076, + 0x000c0003, 0x00030000, 0xbf8a0000, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807d847d, 0x8078ff78, 0x00000400, - 0xbf0a6f7d, 0xbfa2ffea, - 0xb8ef1e06, 0x8b6fc16f, - 0xbfa1000f, 0x846f836f, - 0x806f7d6f, 0xbefe00c1, - 0xbeff0080, 0xc4050078, - 0x008ce800, 0x00000000, - 0xbf8a0000, 0x7e008500, - 0x807d817d, 0x8078ff78, - 0x00000080, 0xbf0a6f7d, - 0xbfa2fff6, 0xbeff00c1, - 0xc405006e, 0x008ce800, - 0x00000000, 0xc405006e, - 0x008ce801, 0x00010000, - 0xc405006e, 0x008ce802, - 0x00020000, 0xc405006e, - 0x008ce803, 0x00030000, + 0xbf0a6f7d, 0xbfa2ffe8, + 0x80766e74, 0x82778075, + 0xee0a0076, 0x000c0000, + 0x00000000, 0xee0a0076, + 0x000c0001, 0x00010000, + 0xee0a0076, 0x000c0002, + 0x00020000, 0xee0a0076, + 0x000c0003, 0x00030000, 0xbf8a0000, 0xb8f83b05, 0x80788178, 0xbf0d9972, 0xbfa20002, 0x84788978, 0xbfa00001, 0x84788a78, - 0xb8ee1e06, 0x846e8a6e, - 0x80786e78, 0x8078ff78, - 0x00000200, 0x80f8ff78, - 0x00000050, 0xbef600ff, - 0x01000000, 0xbefd00ff, - 0x0000006c, 0x80f89078, - 0xf462403a, 0xf0000000, + 0x8078ff78, 0x00000200, + 0x80f8ff78, 0x00000060, + 0x80767874, 0x82778075, + 0xbefd00ff, 0x0000006c, + 0xf460403b, 0xf8000000, 0xbf8a0000, 0x80fd847d, 0xbf800000, 0xbe804300, - 0xbe824302, 0x80f8a078, - 0xf462603a, 0xf0000000, - 0xbf8a0000, 0x80fd887d, - 0xbf800000, 0xbe804300, - 0xbe824302, 0xbe844304, - 0xbe864306, 0x80f8c078, - 0xf462803a, 0xf0000000, + 0xbe824302, 0x80f6a076, + 0x82f78077, 0xf460603b, + 0xf8000000, 0xbf8a0000, + 0x80fd887d, 0xbf800000, + 0xbe804300, 0xbe824302, + 0xbe844304, 0xbe864306, + 0x80f6c076, 0x82f78077, + 0xf460803b, 0xf8000000, 0xbf8a0000, 0x80fd907d, 0xbf800000, 0xbe804300, 0xbe824302, 0xbe844304, 0xbe864306, 0xbe884308, 0xbe8a430a, 0xbe8c430c, 0xbe8e430e, 0xbf06807d, - 0xbfa1fff0, 0xb980f801, + 0xbfa1ffef, 0xb980f801, 
0x00000000, 0xb8f83b05, 0x80788178, 0xbf0d9972, 0xbfa20002, 0x84788978, 0xbfa00001, 0x84788a78, - 0xb8ee1e06, 0x846e8a6e, - 0x80786e78, 0x8078ff78, - 0x00000200, 0xbef600ff, - 0x01000000, 0xbeff0071, - 0xf4621bfa, 0xf0000000, - 0x80788478, 0xf4621b3a, - 0xf0000000, 0x80788478, - 0xf4621b7a, 0xf0000000, - 0x80788478, 0xf4621c3a, - 0xf0000000, 0x80788478, - 0xf4621c7a, 0xf0000000, - 0x80788478, 0xf4621eba, - 0xf0000000, 0x80788478, - 0xf4621efa, 0xf0000000, - 0x80788478, 0xf4621e7a, - 0xf0000000, 0x80788478, - 0xf4621cfa, 0xf0000000, - 0x80788478, 0xf4621bba, - 0xf0000000, 0x80788478, - 0xbf8a0000, 0xb96ef814, - 0xf4621bba, 0xf0000000, - 0x80788478, 0xbf8a0000, - 0xb96ef815, 0xf4621bba, - 0xf0000000, 0x80788478, - 0xbf8a0000, 0xb96ef812, - 0xf4621bba, 0xf0000000, - 0x80788478, 0xbf8a0000, + 0x8078ff78, 0x00000200, + 0x80767874, 0x82778075, + 0xbeff0071, 0xf4601bfb, + 0xf8000000, 0xf4601b3b, + 0xf8000004, 0xf4601b7b, + 0xf8000008, 0xf4601c3b, + 0xf800000c, 0xf4601c7b, + 0xf8000010, 0xf4601ebb, + 0xf8000014, 0xf4601efb, + 0xf8000018, 0xf4601e7b, + 0xf800001c, 0xf4601cfb, + 0xf8000020, 0xf4601bbb, + 0xf8000024, 0xbf8a0000, + 0xb96ef814, 0xf4601bbb, + 0xf8000028, 0xbf8a0000, + 0xb96ef815, 0xf4601bbb, + 0xf800002c, 0xbf8a0000, + 0xb96ef812, 0xf4601bbb, + 0xf8000030, 0xbf8a0000, 0xb96ef813, 0x8b6eff7f, - 0x04000000, 0xbfa1000d, - 0x80788478, 0xf4621bba, - 0xf0000000, 0x80788478, + 0x04000000, 0xbfa1000b, + 0xf4601bbb, 0xf8000038, 0xbf8a0000, 0xbf0d806e, 0xbfa10006, 0x856e906e, 0x8b6e6e6e, 0xbfa10003, @@ -4109,17 +4084,16 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0xb8ee3b05, 0x806e816e, 0xbf0d9972, 0xbfa20002, 0x846e896e, 0xbfa00001, - 0x846e8a6e, 0xb8ef1e06, - 0x846f8a6f, 0x806e6f6e, - 0x806eff6e, 0x00000200, - 0x806e746e, 0x826f8075, - 0x8b6fff6f, 0x0000ffff, - 0xf4605c37, 0xf8000050, - 0xf4605d37, 0xf8000060, - 0xf4601e77, 0xf8000074, - 0xbf8a0000, 0x8b6dff6d, - 0x0000ffff, 0x8bfe7e7e, - 0x8bea6a6a, 0xb97af804, + 0x846e8a6e, 0x806eff6e, + 0x00000240, 0x806e746e, 
+ 0x826f8075, 0xf4605c37, + 0xf8000010, 0xf4605d37, + 0xf8000020, 0xf4601e77, + 0xf8000034, 0xbf8a0000, + 0x8b6dff6d, 0x0000ffff, + 0x8bfe7e7e, 0x8bea6a6a, + 0x936eff77, 0x0002001a, + 0xb96ef81a, 0xb97af804, 0xbe804ec2, 0xbf94fffe, 0xbe804a6c, 0xbe804ec2, 0xbf94fffe, 0xbfb10000, @@ -4611,3 +4585,687 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = { 0xbf8a0000, 0xbe801f6c, 0xbf9b0000, 0x00000000, }; + +static const uint32_t cwsr_trap_gfx12_1_0_hex[] = { + 0xbfa00001, 0xbfa003be, + 0xb0804009, 0xb8f8f804, + 0x9178ff78, 0x00008c00, + 0xb8fbf811, 0x8b6eff78, + 0x00004000, 0xbfa10008, + 0x8b6eff7b, 0x00000080, + 0xbfa20018, 0x8b6ea07b, + 0xbfa200da, 0xbf830010, + 0xb8fbf811, 0xbfa0fffb, + 0x8b6eff7b, 0x00000bd0, + 0xbfa20010, 0xb8eef812, + 0x8b6f8f7b, 0xbfa10002, + 0x8c6eff6e, 0x00000080, + 0xb8eff813, 0x8b6e6e6f, + 0xbfa20008, 0x8b6eff6d, + 0xf0000000, 0xbfa20005, + 0x8b6fff6f, 0x00000200, + 0xbfa20002, 0x8b6ea07b, + 0xbfa200c4, 0x9177ff77, + 0x007fc000, 0xb8fa04a1, + 0x847a967a, 0x8c777a77, + 0xb8fa0421, 0x847a957a, + 0x8c777a77, 0xb8fa3021, + 0x847a8e7a, 0x8c777a77, + 0xb980f821, 0x00000000, + 0xbefa4d82, 0xbf8a0000, + 0x84fa887a, 0xbf0d987b, + 0xbfa10002, 0x8c7bff7b, + 0xfe000000, 0xf4601bbd, + 0xf8000010, 0xbf8a0000, + 0x846e976e, 0x9177ff77, + 0x00800000, 0x8c776e77, + 0xf4603bbd, 0xf8000000, + 0xbf8a0000, 0xf4603ebd, + 0xf8000008, 0xbf8a0000, + 0x8bee6e6e, 0xbfa10001, + 0xbe80486e, 0x8b6eff6d, + 0xf0000000, 0xbfa20009, + 0xb8eef811, 0x8b6eff6e, + 0x00000080, 0xbfa20007, + 0x8c78ff78, 0x00004000, + 0x80ec886c, 0x82ed806d, + 0xbfa00002, 0x806c846c, + 0x826d806d, 0x8b6dff6d, + 0x01ffffff, 0xb8fbf811, + 0xbf0d847b, 0xbfa20081, + 0xf4003eb6, 0xf8000000, + 0xbfc70000, 0xf4003bb6, + 0xf8000008, 0x8b76ff7a, + 0x80000000, 0xbfa20027, + 0x9376ff7a, 0x00060019, + 0x81f9a376, 0xbf0b8179, + 0xbfa2006e, 0x81f9ac76, + 0xbf0b8179, 0xbfa20068, + 0x81f9b776, 0xbf0b8179, + 0xbfa20065, 0x8b76ff7a, + 0x000001ff, 0xbf06ff76, + 0x000000fe, 0xbfa20063, + 0xbf06ff76, 0x000000ff, + 
0xbfa2005d, 0xbf06ff76, + 0x000000fa, 0xbfa2005a, + 0x81f9ff76, 0x000000e9, + 0xbf0b8179, 0xbfa20056, + 0x8b76ff7b, 0xffff0000, + 0xbf06ff76, 0xbf860000, + 0xbfa1005a, 0x9376ff7b, + 0x0002000e, 0x8b79ff7b, + 0x00003f00, 0x85798679, + 0x8c767976, 0xb9763b01, + 0xbfa00052, 0x8b76ff7a, + 0xfc000000, 0xbf06ff76, + 0xd4000000, 0xbfa20019, + 0xbf06ff76, 0xc8000000, + 0xbfa2002d, 0x8b76ff7a, + 0xff000000, 0xbf06ff76, + 0xcf000000, 0xbfa2003f, + 0x8b79ff7a, 0xffff0000, + 0xbf06ff79, 0xcc330000, + 0xbfa2003d, 0xbf06ff79, + 0xcc880000, 0xbfa2003a, + 0xbf06ff79, 0xcc350000, + 0xbfa2003a, 0xbf06ff79, + 0xcc3a0000, 0xbfa20037, + 0xbf06ff76, 0xcc000000, + 0xbfa10034, 0x8b76ff7b, + 0x000001ff, 0xbf06ff76, + 0x000000ff, 0xbfa20029, + 0xbf06ff76, 0x000000fa, + 0xbfa20026, 0x81f6ff76, + 0x000000e9, 0xbf0b8176, + 0xbfa20022, 0x8b76ff7b, + 0x0003fe00, 0xbf06ff76, + 0x0001fe00, 0xbfa2001d, + 0x8b76ff7b, 0x07fc0000, + 0xbf06ff76, 0x03fc0000, + 0xbfa20018, 0xbfa00014, + 0x9376ff7a, 0x00040016, + 0x81f68176, 0xbf0b8176, + 0xbfa20012, 0x9376ff7a, + 0x00050011, 0x81f68176, + 0xbf0b8176, 0xbfa2000d, + 0x8b76ff7a, 0x000001ff, + 0xbf06ff76, 0x000000ff, + 0xbfa20008, 0x8b76ff7b, + 0x000001ff, 0xbf06ff76, + 0x000000ff, 0xbfa20003, + 0xbfc70000, 0xbefb006e, + 0xbfa0ffa7, 0xbfc70000, + 0xbefb006f, 0xbfa0ffa4, + 0x80ec886c, 0x82ed806d, + 0xbfa0fff7, 0xbfc70000, + 0x857a9677, 0xb97a04a1, + 0x857a9577, 0xb97a0421, + 0x857a8e77, 0xb97a3021, + 0x8bfe7e7e, 0x8bea6a6a, + 0x85788978, 0xb9783244, + 0xbe804a6c, 0xb8faf802, + 0xbf0d987a, 0xbfa10001, + 0xbfb00000, 0x8b6dff6d, + 0x01ffffff, 0xbefa0080, + 0xb97a0151, 0x9177ff77, + 0x007fc000, 0xb8fa04a1, + 0x847a967a, 0x8c777a77, + 0xb8fa0421, 0x847a957a, + 0x8c777a77, 0xb8fa3021, + 0x847a8e7a, 0x8c777a77, + 0xb980f821, 0x00000000, + 0xbf0d847b, 0xbfa20081, + 0xf4003eb6, 0xf8000000, + 0xbfc70000, 0xf4003bb6, + 0xf8000008, 0x8b76ff7a, + 0x80000000, 0xbfa20027, + 0x9376ff7a, 0x00060019, + 0x81f9a376, 0xbf0b8179, + 0xbfa2006e, 0x81f9ac76, + 0xbf0b8179, 0xbfa20068, 
+ 0x81f9b776, 0xbf0b8179, + 0xbfa20065, 0x8b76ff7a, + 0x000001ff, 0xbf06ff76, + 0x000000fe, 0xbfa20063, + 0xbf06ff76, 0x000000ff, + 0xbfa2005d, 0xbf06ff76, + 0x000000fa, 0xbfa2005a, + 0x81f9ff76, 0x000000e9, + 0xbf0b8179, 0xbfa20056, + 0x8b76ff7b, 0xffff0000, + 0xbf06ff76, 0xbf860000, + 0xbfa1005a, 0x9376ff7b, + 0x0002000e, 0x8b79ff7b, + 0x00003f00, 0x85798679, + 0x8c767976, 0xb9763b01, + 0xbfa00052, 0x8b76ff7a, + 0xfc000000, 0xbf06ff76, + 0xd4000000, 0xbfa20019, + 0xbf06ff76, 0xc8000000, + 0xbfa2002d, 0x8b76ff7a, + 0xff000000, 0xbf06ff76, + 0xcf000000, 0xbfa2003f, + 0x8b79ff7a, 0xffff0000, + 0xbf06ff79, 0xcc330000, + 0xbfa2003d, 0xbf06ff79, + 0xcc880000, 0xbfa2003a, + 0xbf06ff79, 0xcc350000, + 0xbfa2003a, 0xbf06ff79, + 0xcc3a0000, 0xbfa20037, + 0xbf06ff76, 0xcc000000, + 0xbfa10034, 0x8b76ff7b, + 0x000001ff, 0xbf06ff76, + 0x000000ff, 0xbfa20029, + 0xbf06ff76, 0x000000fa, + 0xbfa20026, 0x81f6ff76, + 0x000000e9, 0xbf0b8176, + 0xbfa20022, 0x8b76ff7b, + 0x0003fe00, 0xbf06ff76, + 0x0001fe00, 0xbfa2001d, + 0x8b76ff7b, 0x07fc0000, + 0xbf06ff76, 0x03fc0000, + 0xbfa20018, 0xbfa00014, + 0x9376ff7a, 0x00040016, + 0x81f68176, 0xbf0b8176, + 0xbfa20012, 0x9376ff7a, + 0x00050011, 0x81f68176, + 0xbf0b8176, 0xbfa2000d, + 0x8b76ff7a, 0x000001ff, + 0xbf06ff76, 0x000000ff, + 0xbfa20008, 0x8b76ff7b, + 0x000001ff, 0xbf06ff76, + 0x000000ff, 0xbfa20003, + 0xbfc70000, 0xbefb006e, + 0xbfa0ffa7, 0xbfc70000, + 0xbefb006f, 0xbfa0ffa4, + 0x80ec886c, 0x82ed806d, + 0xbfa0fff7, 0xbfc70000, + 0xbeee007e, 0xbeef007f, + 0xbefe0180, 0xbefe4d84, + 0xbf8a0000, 0x8b7aff7f, + 0x04000000, 0x847a857a, + 0x8c6d7a6d, 0xb8eff822, + 0xb980f822, 0x00000000, + 0xb8fa2b01, 0x847a997a, + 0x8c6d7a6d, 0xbefa0080, + 0xb97a2b01, 0xbefa007e, + 0x8b7bff7f, 0x01ffffff, + 0xbefe00c1, 0xbeff00c1, + 0xee0a407a, 0x000c0000, + 0x00000000, 0x7e000280, + 0xbefe007a, 0xbeff007b, + 0xb8fb0742, 0x847b997b, + 0xb8fa3b05, 0x807a817a, + 0xbf0d997b, 0xbfa20002, + 0x847a897a, 0xbfa00001, + 0x847a8a7a, 0x8b7bff7f, + 0x01ffffff, 
0x807aff7a, + 0x000001c0, 0x807a7e7a, + 0x827b807b, 0xd7610000, + 0x00010870, 0xd7610000, + 0x00010a71, 0xd7610000, + 0x00010c72, 0xd7610000, + 0x00010e73, 0xd7610000, + 0x00011074, 0xd7610000, + 0x00011275, 0xd7610000, + 0x00011476, 0xd7610000, + 0x00011677, 0xd7610000, + 0x00011a79, 0xd7610000, + 0x00011c7e, 0xd7610000, + 0x00011e7f, 0xbefe00ff, + 0x00003fff, 0xbeff0080, + 0xee0a407a, 0x000c0000, + 0x00000000, 0xd760007a, + 0x00011d00, 0xd760007b, + 0x00011f00, 0xbefe007a, + 0xbeff007b, 0xbef4007e, + 0x8b75ff7f, 0x01ffffff, + 0xbef1007d, 0xb8f30742, + 0x84739973, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00002, + 0xbeff00c1, 0xbfa0000a, + 0xee0a4074, 0x008c0000, + 0x00008000, 0xee0a4074, + 0x010c0000, 0x00010000, + 0xee0a4074, 0x018c0000, + 0x00018000, 0xbfa00009, + 0xee0a4074, 0x008c0000, + 0x00010000, 0xee0a4074, + 0x010c0000, 0x00020000, + 0xee0a4074, 0x018c0000, + 0x00030000, 0xb8f03b05, + 0x80708170, 0xbf0d9973, + 0xbfa20002, 0x84708970, + 0xbfa00001, 0x84708a70, + 0x8070ff70, 0x00000200, + 0x7e000280, 0x7e020280, + 0x7e040280, 0xbefd0080, + 0xd7610002, 0x0000fa71, + 0x807d817d, 0xb8faf802, + 0xbf0c8b7a, 0xbfa20003, + 0xbe804fc2, 0xbf94fffe, + 0xbfa10001, 0xbe804ec4, + 0xbf94fffc, 0xbefa4c88, + 0xbfc70000, 0xbf0c807a, + 0xbfa20006, 0x9371ff7a, + 0x00070004, 0x937aff7a, + 0x00070010, 0xbf06717a, + 0xbfa2fff6, 0xb8faf804, + 0x8b7aff7a, 0x0001000c, + 0x9178ff78, 0x0001000c, + 0x8c787a78, 0xd7610002, + 0x0000fa6c, 0x807d817d, + 0x8b7aff6d, 0x01ffffff, + 0xd7610002, 0x0000fa7a, + 0x807d817d, 0xd7610002, + 0x0000fa6e, 0x807d817d, + 0xbefa0080, 0xd7610002, + 0x0000fa7a, 0x807d817d, + 0xd7610002, 0x0000fa78, + 0x807d817d, 0xb8faf811, + 0xd7610002, 0x0000fa7a, + 0x807d817d, 0xd7610002, + 0x0000fa6f, 0x807d817d, + 0xb8f1f801, 0x937aff6d, + 0x00060019, 0x847a8c7a, + 0x8c717a71, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8f1f814, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8f1f815, 0xd7610002, + 0x0000fa71, 0x807d817d, + 
0xb8f1f812, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8f1f813, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8faf802, 0xd7610002, + 0x0000fa7a, 0x807d817d, + 0xbefa50c1, 0xbfc70000, + 0xd7610002, 0x0000fa7a, + 0x807d817d, 0xbefa4c88, + 0xbfc70000, 0xd7610002, + 0x0000fa7a, 0x807d817d, + 0xb8faf81a, 0xd7610002, + 0x0000fa7a, 0x807d817d, + 0xbefe00c1, 0xbeff0080, + 0x80767074, 0x82778075, + 0xee0a4076, 0x010c0000, + 0x00000000, 0xbefe00c1, + 0x7e040280, 0xbefa5081, + 0xbfc70000, 0xd7610002, + 0x0001007a, 0xbefa5082, + 0xbfc70000, 0xd7610002, + 0x0001027a, 0xbefa5083, + 0xbfc70000, 0xd7610002, + 0x0001047a, 0xbefa5084, + 0xbfc70000, 0xd7610002, + 0x0001067a, 0xbefa5085, + 0xbfc70000, 0xd7610002, + 0x0001087a, 0xbefa5086, + 0xbfc70000, 0xd7610002, + 0x00010a7a, 0xbefa5087, + 0xbfc70000, 0xd7610002, + 0x00010c7a, 0xbefa5088, + 0xbfc70000, 0xd7610002, + 0x00010e7a, 0xbefa5089, + 0xbfc70000, 0xd7610002, + 0x0001107a, 0xbefa508a, + 0xbfc70000, 0xd7610002, + 0x0001127a, 0xbefa508b, + 0xbfc70000, 0xd7610002, + 0x0001147a, 0xbefa508c, + 0xbfc70000, 0xd7610002, + 0x0001167a, 0xbefa508d, + 0xbfc70000, 0xd7610002, + 0x0001187a, 0xbefa508e, + 0xbfc70000, 0xd7610002, + 0x00011a7a, 0xbefa508f, + 0xbfc70000, 0xd7610002, + 0x00011c7a, 0xbefa5090, + 0xbfc70000, 0xd7610002, + 0x00011e7a, 0xee0a4076, + 0x010c0000, 0x00008000, + 0xb8f03b05, 0x80708170, + 0xbf0d9973, 0xbfa20002, + 0x84708970, 0xbfa00001, + 0x84708a70, 0xbef90080, + 0xbefd0080, 0xbf800000, + 0xbe804100, 0xbe824102, + 0xbe844104, 0xbe864106, + 0xbe884108, 0xbe8a410a, + 0xbe8c410c, 0xbe8e410e, + 0xd7610002, 0x0000f200, + 0x80798179, 0xd7610002, + 0x0000f201, 0x80798179, + 0xd7610002, 0x0000f202, + 0x80798179, 0xd7610002, + 0x0000f203, 0x80798179, + 0xd7610002, 0x0000f204, + 0x80798179, 0xd7610002, + 0x0000f205, 0x80798179, + 0xd7610002, 0x0000f206, + 0x80798179, 0xd7610002, + 0x0000f207, 0x80798179, + 0xd7610002, 0x0000f208, + 0x80798179, 0xd7610002, + 0x0000f209, 0x80798179, + 0xd7610002, 0x0000f20a, + 0x80798179, 0xd7610002, 
+ 0x0000f20b, 0x80798179, + 0xd7610002, 0x0000f20c, + 0x80798179, 0xd7610002, + 0x0000f20d, 0x80798179, + 0xd7610002, 0x0000f20e, + 0x80798179, 0xd7610002, + 0x0000f20f, 0x80798179, + 0xbf06a079, 0xbfa10009, + 0x80767074, 0x82778075, + 0xee0a4076, 0x010c0000, + 0x00000000, 0x8070ff70, + 0x00000080, 0xbef90080, + 0x7e040280, 0x807d907d, + 0xbf0aff7d, 0x00000060, + 0xbfa2ffb9, 0xbe804100, + 0xbe824102, 0xbe844104, + 0xbe864106, 0xbe884108, + 0xbe8a410a, 0xd7610002, + 0x0000f200, 0x80798179, + 0xd7610002, 0x0000f201, + 0x80798179, 0xd7610002, + 0x0000f202, 0x80798179, + 0xd7610002, 0x0000f203, + 0x80798179, 0xd7610002, + 0x0000f204, 0x80798179, + 0xd7610002, 0x0000f205, + 0x80798179, 0xd7610002, + 0x0000f206, 0x80798179, + 0xd7610002, 0x0000f207, + 0x80798179, 0xd7610002, + 0x0000f208, 0x80798179, + 0xd7610002, 0x0000f209, + 0x80798179, 0xd7610002, + 0x0000f20a, 0x80798179, + 0xd7610002, 0x0000f20b, + 0x80798179, 0xbefe00ff, + 0x0000ffff, 0x80767074, + 0x82778075, 0xee0a4076, + 0x010c0000, 0x00000000, + 0xbefe00c1, 0x857d9973, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8fb4306, 0x8b7bc17b, + 0xbfa10042, 0x8b7aff6d, + 0x80000000, 0xbfa1003f, + 0x847b8a7b, 0xb8f03b05, + 0x80708170, 0xbf0d9973, + 0xbfa20002, 0x84708970, + 0xbfa00001, 0x84708a70, + 0x8070ff70, 0x00000200, + 0x8070ff70, 0x00000200, + 0xd71f0000, 0x000100c1, + 0xd7200000, 0x000200c1, + 0x16000084, 0x857d9973, + 0x8b7d817d, 0xbf06817d, + 0xbefd0080, 0xbfa20015, + 0xbe8300ff, 0x00000080, + 0xbf800000, 0xbf800000, + 0xbf800000, 0xd8d80000, + 0x01000000, 0xbf8a0000, + 0x80767074, 0x82778075, + 0xee0a4076, 0x008c0000, + 0x00000000, 0x807d037d, + 0x80700370, 0xd5250000, + 0x0001ff00, 0x00000080, + 0xbf0a7b7d, 0xbfa2fff1, + 0xbfa00014, 0xbe8300ff, + 0x00000100, 0xbf800000, + 0xbf800000, 0xbf800000, + 0xd8d80000, 0x01000000, + 0xbf8a0000, 0x80767074, + 0x82778075, 0xee0a4076, + 0x008c0000, 0x00000000, + 0x807d037d, 0x80700370, + 0xd5250000, 0x0001ff00, + 0x00000100, 
0xbf0a7b7d, + 0xbfa2fff1, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20004, + 0xbef000ff, 0x00000200, + 0xbeff0080, 0xbfa00003, + 0xbef000ff, 0x00000400, + 0xbeff00c1, 0xb8fb3b05, + 0x807b817b, 0x847b827b, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa2001b, + 0xbefd0084, 0xbf0a7b7d, + 0xbfa10032, 0x7e008700, + 0x7e028701, 0x7e048702, + 0x7e068703, 0x80767074, + 0x82778075, 0xee0a4076, + 0x000c0000, 0x00000000, + 0xee0a4076, 0x008c0000, + 0x00008000, 0xee0a4076, + 0x010c0000, 0x00010000, + 0xee0a4076, 0x018c0000, + 0x00018000, 0x807d847d, + 0x8070ff70, 0x00000200, + 0xbf0a7b7d, 0xbfa2ffe9, + 0xbfa0001a, 0xbefd0084, + 0xbf0a7b7d, 0xbfa10017, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0x80767074, 0x82778075, + 0xee0a4076, 0x000c0000, + 0x00000000, 0xee0a4076, + 0x008c0000, 0x00010000, + 0xee0a4076, 0x010c0000, + 0x00020000, 0xee0a4076, + 0x018c0000, 0x00030000, + 0x807d847d, 0x8070ff70, + 0x00000400, 0xbf0a7b7d, + 0xbfa2ffe9, 0xbfa00184, + 0xbef4007e, 0x8b75ff7f, + 0x01ffffff, 0xbef1007f, + 0xb8f20742, 0x84729972, + 0x8b6eff7f, 0x04000000, + 0xbfa10044, 0xbefe00c1, + 0x857d9972, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00001, + 0xbeff00c1, 0xb8ef4306, + 0x8b6fc16f, 0xbfa10039, + 0x846f8a6f, 0xb8f83b05, + 0x80788178, 0xbf0d9972, + 0xbfa20002, 0x84788978, + 0xbfa00001, 0x84788a78, + 0x8078ff78, 0x00000200, + 0x8078ff78, 0x00000200, + 0x857d9972, 0x8b7d817d, + 0xbf06817d, 0xbefd0080, + 0xd71f0001, 0x000100c1, + 0xd7200001, 0x000202c1, + 0x30020282, 0xbfa20012, + 0x80767874, 0x82778075, + 0xee0a0076, 0x000c0000, + 0x00000000, 0xbf8a0000, + 0xd8340000, 0x00000001, + 0xd5250001, 0x0001ff01, + 0x00000080, 0x807dff7d, + 0x00000080, 0x8078ff78, + 0x00000080, 0xbf0a6f7d, + 0xbfa2ffef, 0xbfa00011, + 0x80767874, 0x82778075, + 0xee0a0076, 0x000c0000, + 0x00000000, 0xbf8a0000, + 0xd8340000, 0x00000001, + 0xd5250001, 0x0001ff01, + 0x00000100, 0x807dff7d, + 0x00000100, 0x8078ff78, + 0x00000100, 0xbf0a6f7d, + 0xbfa2ffef, 0xbef80080, + 
0xbefe00c1, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8ef3b05, 0x806f816f, + 0x846f826f, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa2002c, 0xbeee0078, + 0x8078ff78, 0x00000200, + 0xbefd0084, 0x80767874, + 0x82778075, 0xee0a0076, + 0x000c0000, 0x00000000, + 0xee0a0076, 0x000c0001, + 0x00008000, 0xee0a0076, + 0x000c0002, 0x00010000, + 0xee0a0076, 0x000c0003, + 0x00018000, 0xbf8a0000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000200, 0xbf0a6f7d, + 0xbfa2ffe8, 0x80766e74, + 0x82778075, 0xee0a0076, + 0x000c0000, 0x00000000, + 0xee0a0076, 0x000c0001, + 0x00008000, 0xee0a0076, + 0x000c0002, 0x00010000, + 0xee0a0076, 0x000c0003, + 0x00018000, 0xbf8a0000, + 0xbfa0002d, 0xbeee0078, + 0x8078ff78, 0x00000400, + 0xbefd0084, 0xbf0a6f7d, + 0xbfa10018, 0x80767874, + 0x82778075, 0xee0a0076, + 0x000c0000, 0x00000000, + 0xee0a0076, 0x000c0001, + 0x00010000, 0xee0a0076, + 0x000c0002, 0x00020000, + 0xee0a0076, 0x000c0003, + 0x00030000, 0xbf8a0000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000400, 0xbf0a6f7d, + 0xbfa2ffe8, 0x80766e74, + 0x82778075, 0xee0a0076, + 0x000c0000, 0x00000000, + 0xee0a0076, 0x000c0001, + 0x00010000, 0xee0a0076, + 0x000c0002, 0x00020000, + 0xee0a0076, 0x000c0003, + 0x00030000, 0xbf8a0000, + 0xb8f83b05, 0x80788178, + 0xbf0d9972, 0xbfa20002, + 0x84788978, 0xbfa00001, + 0x84788a78, 0x8078ff78, + 0x00000200, 0x80f8ff78, + 0x00000060, 0x80767874, + 0x82778075, 0xbefd00ff, + 0x0000006c, 0xf460403b, + 0xf8000000, 0xbf8a0000, + 0x80fd847d, 0xbf800000, + 0xbe804300, 0xbe824302, + 0x80f6a076, 0x82f78077, + 0xf460603b, 0xf8000000, + 0xbf8a0000, 0x80fd887d, + 0xbf800000, 0xbe804300, + 0xbe824302, 0xbe844304, + 0xbe864306, 0x80f6c076, + 0x82f78077, 0xf460803b, + 0xf8000000, 0xbf8a0000, + 0x80fd907d, 0xbf800000, + 0xbe804300, 0xbe824302, + 0xbe844304, 0xbe864306, + 0xbe884308, 0xbe8a430a, + 0xbe8c430c, 0xbe8e430e, + 0xbf06807d, 0xbfa1ffef, 
+ 0xb980f801, 0x00000000, + 0xb8f83b05, 0x80788178, + 0xbf0d9972, 0xbfa20002, + 0x84788978, 0xbfa00001, + 0x84788a78, 0x8078ff78, + 0x00000200, 0x80767874, + 0x82778075, 0xbeff0071, + 0xf4601bfb, 0xf8000000, + 0xf4601b3b, 0xf8000004, + 0xf4601b7b, 0xf8000008, + 0xf4601c3b, 0xf800000c, + 0xf4601c7b, 0xf8000010, + 0xf4601ebb, 0xf8000014, + 0xf4601efb, 0xf8000018, + 0xf4601e7b, 0xf800001c, + 0xf4601cfb, 0xf8000020, + 0xf4601bbb, 0xf8000024, + 0xbf8a0000, 0xb96ef814, + 0xf4601bbb, 0xf8000028, + 0xbf8a0000, 0xb96ef815, + 0xf4601bbb, 0xf800002c, + 0xbf8a0000, 0xb96ef812, + 0xf4601bbb, 0xf8000030, + 0xbf8a0000, 0xb96ef813, + 0x8b6eff7f, 0x04000000, + 0xbfa10022, 0xf4601bbb, + 0xf8000038, 0xbf8a0000, + 0xbf0d806e, 0xbfa1001d, + 0x856e906e, 0x8b6e6e6e, + 0xbfa10003, 0xbe804ec1, + 0x816ec16e, 0xbfa0fffb, + 0xbef800ff, 0x00000080, + 0xbefd0081, 0xf4601bbb, + 0xf0000000, 0xbfc70000, + 0x80788478, 0x937eff6e, + 0x00070004, 0x847e907e, + 0x8c7d7e7d, 0xbe80517d, + 0x917dff7d, 0x007f0000, + 0x856e906e, 0x8b6e6e6e, + 0xbfa10003, 0xbe804e7d, + 0x816ec16e, 0xbfa0fffb, + 0x807d817d, 0xbf08907d, + 0xbfa1ffec, 0xf4601bbb, + 0xf800003c, 0xbfc70000, + 0xbf0d806e, 0xbfa1000c, + 0xbf0d9a7f, 0xbfa10002, + 0xbf068180, 0xbe804fc4, + 0xbf94fffc, 0xbfa10006, + 0x856e906e, 0x8b6e6e6e, + 0xbfa10003, 0xbe804ec3, + 0x816ec16e, 0xbfa0fffb, + 0xf4601bbb, 0xf8000040, + 0xbfc70000, 0xb96ef81a, + 0xbefd006f, 0xbefe0070, + 0xbeff0071, 0xb979f822, + 0xb97b2011, 0x857b867b, + 0xb97b0191, 0x857b827b, + 0xb97bba11, 0xb973f801, + 0xb8ee3b05, 0x806e816e, + 0xbf0d9972, 0xbfa20002, + 0x846e896e, 0xbfa00001, + 0x846e8a6e, 0x806eff6e, + 0x000001c0, 0x806e746e, + 0x826f8075, 0xf4605c37, + 0xf8000010, 0xf4605d37, + 0xf8000020, 0xf4601e77, + 0xf8000034, 0xbf8a0000, + 0x856e9677, 0xb96e04a1, + 0x856e9577, 0xb96e0421, + 0x856e8e77, 0xb96e3021, + 0x8b6dff6d, 0x01ffffff, + 0x8bfe7e7e, 0x8bea6a6a, + 0xb97af804, 0xb8eef802, + 0xbf0c8b6e, 0xbfa20003, + 0xbe804fc2, 0xbf94fffe, + 0xbfa10001, 0xbe804ec4, + 0xbf94fffc, 
0x857a897a, + 0xb97a0244, 0xbe804a6c, + 0xb8eef802, 0xbf0c8b6e, + 0xbfa20003, 0xbe804fc2, + 0xbf94fffe, 0xbfa10001, + 0xbe804ec4, 0xbf94fffc, + 0xbfb10000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm index 7b9d36e5fa43..d38ff404277b 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm @@ -28,10 +28,30 @@ */ #define CHIP_GFX12 37 +#define CHIP_GC_12_0_3 38 + +#define HAVE_XNACK (ASIC_FAMILY == CHIP_GC_12_0_3) +#define HAVE_57BIT_ADDRESS (ASIC_FAMILY == CHIP_GC_12_0_3) +#define HAVE_BANKED_VGPRS (ASIC_FAMILY == CHIP_GC_12_0_3) +#define NUM_NAMED_BARRIERS (ASIC_FAMILY == CHIP_GC_12_0_3 ? 0x10 : 0) +#define HAVE_CLUSTER_BARRIER (ASIC_FAMILY == CHIP_GC_12_0_3) +#define CLUSTER_BARRIER_SERIALIZE_WORKAROUND (ASIC_FAMILY == CHIP_GC_12_0_3) +#define RELAXED_SCHEDULING_IN_TRAP (ASIC_FAMILY == CHIP_GFX12) +#define HAVE_INSTRUCTION_FIXUP (ASIC_FAMILY == CHIP_GC_12_0_3) #define SINGLE_STEP_MISSED_WORKAROUND 1 //workaround for lost TRAP_AFTER_INST exception when SAVECTX raised +#define HAVE_VALU_SGPR_HAZARD (ASIC_FAMILY == CHIP_GFX12) +#define WAVE32_ONLY (ASIC_FAMILY == CHIP_GC_12_0_3) +#define SAVE_TTMPS_IN_SGPR_BLOCK (ASIC_FAMILY >= CHIP_GC_12_0_3) + +#if HAVE_XNACK && !WAVE32_ONLY +# error +#endif -var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK = 0x4 +#define ADDRESS_HI32_NUM_BITS ((HAVE_57BIT_ADDRESS ? 57 : 48) - 32) +#define ADDRESS_HI32_MASK ((1 << ADDRESS_HI32_NUM_BITS) - 1) + +var SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK = 0x4 | (NUM_NAMED_BARRIERS ? 0x8 : 0) | (HAVE_CLUSTER_BARRIER ? 
0x10000 : 0) var SQ_WAVE_STATE_PRIV_SCC_SHIFT = 9 var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK = 0xC00 var SQ_WAVE_STATE_PRIV_HALT_MASK = 0x4000 @@ -40,6 +60,7 @@ var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT = 15 var SQ_WAVE_STATUS_WAVE64_SHIFT = 29 var SQ_WAVE_STATUS_WAVE64_SIZE = 1 var SQ_WAVE_STATUS_NO_VGPRS_SHIFT = 24 +var SQ_WAVE_STATUS_IN_WG_SHIFT = 11 var SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK = SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK var S_SAVE_PC_HI_TRAP_ID_MASK = 0xF0000000 @@ -47,11 +68,15 @@ var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 12 -var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24 -var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4 + +#if ASIC_FAMILY < CHIP_GC_12_0_3 var SQ_WAVE_LDS_ALLOC_GRANULARITY = 9 +#else +var SQ_WAVE_LDS_ALLOC_GRANULARITY = 10 +#endif var SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK = 0xF +var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_SHIFT = 4 var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK = 0x10 var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT = 5 var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK = 0x20 @@ -77,11 +102,46 @@ var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE = 32 - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT + +var SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT = 0 +var SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE = 2 + var BARRIER_STATE_SIGNAL_OFFSET = 16 +var BARRIER_STATE_SIGNAL_SIZE = 7 +var BARRIER_STATE_MEMBER_OFFSET = 4 +var BARRIER_STATE_MEMBER_SIZE = 7 var BARRIER_STATE_VALID_OFFSET = 0 +#if RELAXED_SCHEDULING_IN_TRAP +var TTMP11_SCHED_MODE_SHIFT = 26 +var TTMP11_SCHED_MODE_SIZE = 2 +var TTMP11_SCHED_MODE_MASK = 0xC000000 +#endif 
+ +var NAMED_BARRIERS_SR_OFFSET_FROM_HWREG = 0x80 +var S_BARRIER_INIT_MEMBERCNT_MASK = 0x7F0000 +var S_BARRIER_INIT_MEMBERCNT_SHIFT = 0x10 + +var SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT = 18 +var SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE = 1 +var SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT = 16 +var SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE = 1 +var SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT = 0 +var SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE = 7 + +#if HAVE_BANKED_VGPRS +var SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT = 12 +var SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE = 6 +#endif + var TTMP11_DEBUG_TRAP_ENABLED_SHIFT = 23 var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000 +var TTMP11_FIRST_REPLAY_SHIFT = 22 +var TTMP11_FIRST_REPLAY_MASK = 0x400000 +var TTMP11_REPLAY_W64H_SHIFT = 21 +var TTMP11_REPLAY_W64H_MASK = 0x200000 +var TTMP11_FXPTR_SHIFT = 14 +var TTMP11_FXPTR_MASK = 0x1FC000 // SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] // when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE @@ -93,6 +153,11 @@ var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 var S_SAVE_PC_HI_FIRST_WAVE_MASK = 0x80000000 var S_SAVE_PC_HI_FIRST_WAVE_SHIFT = 31 +#if HAVE_BANKED_VGPRS +var S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT = 25 +var S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SIZE = 6 +#endif + var s_sgpr_save_num = 108 var s_save_spi_init_lo = exec_lo @@ -103,12 +168,12 @@ var s_save_exec_lo = ttmp2 var s_save_exec_hi = ttmp3 var s_save_state_priv = ttmp12 var s_save_excp_flag_priv = ttmp15 -var s_save_xnack_mask = s_save_excp_flag_priv +var s_save_xnack_mask = s_save_exec_hi var s_wave_size = ttmp7 -var s_save_buf_rsrc0 = ttmp8 -var s_save_buf_rsrc1 = ttmp9 -var s_save_buf_rsrc2 = ttmp10 -var s_save_buf_rsrc3 = ttmp11 +var s_save_base_addr_lo = ttmp8 +var s_save_base_addr_hi = ttmp9 +var s_save_addr_lo = ttmp10 +var s_save_addr_hi = ttmp11 var s_save_mem_offset = ttmp4 var s_save_alloc_size = s_save_excp_flag_priv var s_save_tmp = ttmp14 @@ -116,9 +181,6 @@ var s_save_m0 = 
ttmp5 var s_save_ttmps_lo = s_save_tmp var s_save_ttmps_hi = s_save_excp_flag_priv -var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE -var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC - var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 var S_WAVE_SIZE = 25 @@ -139,15 +201,21 @@ var s_restore_exec_hi = ttmp5 var s_restore_state_priv = ttmp14 var s_restore_excp_flag_priv = ttmp15 var s_restore_xnack_mask = ttmp13 -var s_restore_buf_rsrc0 = ttmp8 -var s_restore_buf_rsrc1 = ttmp9 -var s_restore_buf_rsrc2 = ttmp10 -var s_restore_buf_rsrc3 = ttmp11 +var s_restore_base_addr_lo = ttmp8 +var s_restore_base_addr_hi = ttmp9 +var s_restore_addr_lo = ttmp10 +var s_restore_addr_hi = ttmp11 var s_restore_size = ttmp6 var s_restore_ttmps_lo = s_restore_tmp var s_restore_ttmps_hi = s_restore_alloc_size var s_restore_spi_init_hi_save = s_restore_exec_hi +#if SAVE_TTMPS_IN_SGPR_BLOCK +var TTMP_SR_OFFSET_FROM_HWREG = -0x40 +#else +var TTMP_SR_OFFSET_FROM_HWREG = 0x40 +#endif + shader main asic(DEFAULT) type(CS) @@ -159,8 +227,23 @@ L_JUMP_TO_RESTORE: s_branch L_RESTORE L_SKIP_RESTORE: +#if RELAXED_SCHEDULING_IN_TRAP + // Assume most relaxed scheduling mode is set. Save and revert to normal mode. + s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_SCHED_MODE) + s_wait_alu 0 + s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, \ + SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT, SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE), 0 +#endif + s_getreg_b32 s_save_state_priv, hwreg(HW_REG_WAVE_STATE_PRIV) //save STATUS since we will change SCC +#if RELAXED_SCHEDULING_IN_TRAP + // Save SCHED_MODE[1:0] into ttmp11[27:26]. + s_andn2_b32 ttmp11, ttmp11, TTMP11_SCHED_MODE_MASK + s_lshl_b32 ttmp2, ttmp2, TTMP11_SCHED_MODE_SHIFT + s_or_b32 ttmp11, ttmp11, ttmp2 +#endif + // Clear SPI_PRIO: do not save with elevated priority. // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd. 
s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK @@ -226,6 +309,10 @@ L_CHECK_TRAP_ID: s_cbranch_scc1 L_SAVE L_FETCH_2ND_TRAP: +#if HAVE_XNACK + save_and_clear_xnack_state_priv(ttmp14) +#endif + // Read second-level TBA/TMA from first-level TMA and jump if available. // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) // ttmp12 holds SQ_WAVE_STATUS @@ -233,10 +320,17 @@ L_FETCH_2ND_TRAP: s_wait_idle s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 - s_bitcmp1_b32 ttmp15, 0xF + s_bitcmp1_b32 ttmp15, (ADDRESS_HI32_NUM_BITS - 1) s_cbranch_scc0 L_NO_SIGN_EXTEND_TMA - s_or_b32 ttmp15, ttmp15, 0xFFFF0000 + s_or_b32 ttmp15, ttmp15, ~ADDRESS_HI32_MASK L_NO_SIGN_EXTEND_TMA: +#if RELAXED_SCHEDULING_IN_TRAP + // Move SCHED_MODE[1:0] from ttmp11 to unused bits in ttmp1[27:26] (return PC_HI). + // The second-level trap will restore from ttmp1 for backwards compatibility. + s_and_b32 ttmp2, ttmp11, TTMP11_SCHED_MODE_MASK + s_andn2_b32 ttmp1, ttmp1, TTMP11_SCHED_MODE_MASK + s_or_b32 ttmp1, ttmp1, ttmp2 +#endif s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 scope:SCOPE_SYS // debug trap enabled flag s_wait_idle @@ -277,15 +371,30 @@ L_TRAP_CASE: s_addc_u32 ttmp1, ttmp1, 0x0 L_EXIT_TRAP: - s_and_b32 ttmp1, ttmp1, 0xFFFF + s_and_b32 ttmp1, ttmp1, ADDRESS_HI32_MASK + +#if HAVE_INSTRUCTION_FIXUP + s_getreg_b32 s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) + fixup_instruction() +#endif + +#if HAVE_XNACK + restore_xnack_state_priv(s_save_tmp) +#endif // Restore SQ_WAVE_STATUS. s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it. + // STATE_PRIV.*BARRIER_COMPLETE may have changed since we read it. // Only restore fields which the trap handler changes. 
s_lshr_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT + +#if RELAXED_SCHEDULING_IN_TRAP + // Assume relaxed scheduling mode after this point. + restore_sched_mode(ttmp2) +#endif + s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \ SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_state_priv @@ -299,11 +408,18 @@ L_SAVE: s_cbranch_scc0 L_HAVE_VGPRS s_endpgm L_HAVE_VGPRS: - - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_and_b32 s_save_pc_hi, s_save_pc_hi, ADDRESS_HI32_MASK s_mov_b32 s_save_tmp, 0 s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT, 1), s_save_tmp //clear saveCtx bit +#if HAVE_XNACK + save_and_clear_xnack_state_priv(s_save_tmp) +#endif + +#if HAVE_INSTRUCTION_FIXUP + fixup_instruction() +#endif + /* inform SPI the readiness and wait for SPI's go signal */ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI s_mov_b32 s_save_exec_hi, exec_hi @@ -317,11 +433,25 @@ L_HAVE_VGPRS: s_lshl_b32 s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT) s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp +#if HAVE_XNACK + s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_WAVE_XNACK_MASK) + s_setreg_imm32_b32 hwreg(HW_REG_WAVE_XNACK_MASK), 0 +#endif + +#if HAVE_BANKED_VGPRS + // Save and clear shader's DST/SRC0/SRC1 VGPR bank selection so we can use v[0-255]. + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_MODE, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE) + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_mov_b32 s_save_tmp, 0 + s_setreg_b32 hwreg(HW_REG_WAVE_MODE, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE), s_save_tmp +#endif + // Trap temporaries must be saved via VGPR but all VGPRs are in use. 
// There is no ttmp space to hold the resource constant for VGPR save. // Save v0 by itself since it requires only two SGPRs. s_mov_b32 s_save_ttmps_lo, exec_lo - s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF + s_and_b32 s_save_ttmps_hi, exec_hi, ADDRESS_HI32_MASK s_mov_b32 exec_lo, 0xFFFFFFFF s_mov_b32 exec_hi, 0xFFFFFFFF global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS @@ -330,13 +460,13 @@ L_HAVE_VGPRS: s_mov_b32 exec_hi, s_save_ttmps_hi // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic - // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40 + // ttmp SR memory offset: + // - gfx12: size(VGPR)+size(SGPR)+0x40 + // - gfx12.5: size(VGPR)+size(SGPR)-0x40 get_wave_size2(s_save_ttmps_hi) get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi) - get_svgpr_size_bytes(s_save_ttmps_hi) - s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi - s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF - s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes() + s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, ADDRESS_HI32_MASK + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, (get_sgpr_size_bytes() + TTMP_SR_OFFSET_FROM_HWREG) s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo s_addc_u32 s_save_ttmps_hi, s_save_ttmps_hi, 0x0 @@ -354,23 +484,16 @@ L_HAVE_VGPRS: s_mov_b32 exec_lo, 0x3FFF s_mov_b32 exec_hi, 0x0 - global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] offset:0x40 scope:SCOPE_SYS + global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS v_readlane_b32 ttmp14, v0, 0xE v_readlane_b32 ttmp15, v0, 0xF s_mov_b32 exec_lo, ttmp14 s_mov_b32 exec_hi, ttmp15 - /* setup Resource Contants */ - s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo - s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE - s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial 
value = 0 (in bytes) although not neccessarily inited - s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC - + s_mov_b32 s_save_base_addr_lo, s_save_spi_init_lo + s_and_b32 s_save_base_addr_hi, s_save_spi_init_hi, ADDRESS_HI32_MASK s_mov_b32 s_save_m0, m0 - /* global mem offset */ - s_mov_b32 s_save_mem_offset, 0x0 get_wave_size2(s_wave_size) /* save first 4 VGPRs, needed for SGPR save */ @@ -385,65 +508,72 @@ L_ENABLE_SAVE_4VGPR_EXEC_HI: s_mov_b32 exec_hi, 0xFFFFFFFF s_branch L_SAVE_4VGPR_WAVE64 L_SAVE_4VGPR_WAVE32: - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - // VGPR Allocated in 4-GPR granularity - - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3 + global_store_addtid_b32 v1, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128 + global_store_addtid_b32 v2, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128*2 + global_store_addtid_b32 v3, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128*3 s_branch L_SAVE_HWREG L_SAVE_4VGPR_WAVE64: - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - // VGPR Allocated in 4-GPR granularity - - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3 + global_store_addtid_b32 v1, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256 + global_store_addtid_b32 v2, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256*2 + global_store_addtid_b32 v3, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256*3 /* save HW registers */ L_SAVE_HWREG: - // HWREG 
SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR) + // HWREG SR memory offset : size(VGPR)+size(SGPR) get_vgpr_size_bytes(s_save_mem_offset, s_wave_size) - get_svgpr_size_bytes(s_save_tmp) - s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes() - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - v_mov_b32 v0, 0x0 //Offset[31:0] from buffer resource v_mov_b32 v1, 0x0 //Offset[63:32] from buffer resource v_mov_b32 v2, 0x0 //Set of SGPRs for TCP store s_mov_b32 m0, 0x0 //Next lane of v2 to write to + write_hwreg_to_v2(s_save_m0) + // Ensure no further changes to barrier or LDS state. - // STATE_PRIV.BARRIER_COMPLETE may change up to this point. - s_barrier_signal -2 - s_barrier_wait -2 + // STATE_PRIV.*BARRIER_COMPLETE may change up to this point. + wait_trap_barriers(s_save_tmp, s_save_m0, 1) - // Re-read final state of BARRIER_COMPLETE field for save. + // Re-read final state of *BARRIER_COMPLETE fields for save. 
s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATE_PRIV) - s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK - s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK + s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK s_or_b32 s_save_state_priv, s_save_state_priv, s_save_tmp - write_hwreg_to_v2(s_save_m0) write_hwreg_to_v2(s_save_pc_lo) - s_andn2_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK + s_and_b32 s_save_tmp, s_save_pc_hi, ADDRESS_HI32_MASK write_hwreg_to_v2(s_save_tmp) write_hwreg_to_v2(s_save_exec_lo) +#if WAVE32_ONLY + s_mov_b32 s_save_tmp, 0 + write_hwreg_to_v2(s_save_tmp) +#else write_hwreg_to_v2(s_save_exec_hi) +#endif write_hwreg_to_v2(s_save_state_priv) s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) write_hwreg_to_v2(s_save_tmp) +#if HAVE_XNACK write_hwreg_to_v2(s_save_xnack_mask) +#else + s_mov_b32 s_save_tmp, 0 + write_hwreg_to_v2(s_save_tmp) +#endif s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_MODE) + +#if HAVE_BANKED_VGPRS + s_bfe_u32 s_save_tmp, s_save_pc_hi, (S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT | (S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SIZE << 0x10)) + s_lshl_b32 s_save_tmp, s_save_tmp, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT + s_or_b32 s_save_m0, s_save_m0, s_save_tmp +#endif + write_hwreg_to_v2(s_save_m0) s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO) @@ -465,22 +595,49 @@ L_SAVE_HWREG: s_wait_kmcnt (0) write_hwreg_to_v2(s_save_tmp) +#if HAVE_CLUSTER_BARRIER + s_sendmsg_rtn_b32 s_save_tmp, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) + s_wait_kmcnt 0 + write_hwreg_to_v2(s_save_tmp) +#endif + +#if ASIC_FAMILY >= CHIP_GC_12_0_3 + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_SCHED_MODE) + write_hwreg_to_v2(s_save_tmp) +#endif + +#if ! SAVE_TTMPS_IN_SGPR_BLOCK // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this. 
s_mov_b32 exec_lo, 0xFFFF +#else + // All 128 bytes are available for HWREGs. + s_mov_b32 exec_lo, 0xFFFFFFFF +#endif s_mov_b32 exec_hi, 0x0 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS + s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset + s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0 + global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS // Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode. s_mov_b32 exec_lo, 0xFFFFFFFF +#if NUM_NAMED_BARRIERS + v_mov_b32 v2, 0 + + for var bar_idx = 0; bar_idx < NUM_NAMED_BARRIERS; bar_idx ++ + s_get_barrier_state s_save_tmp, (bar_idx + 1) + s_wait_kmcnt 0 + v_writelane_b32 v2, s_save_tmp, bar_idx + end + + global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:NAMED_BARRIERS_SR_OFFSET_FROM_HWREG +#endif + /* save SGPRs */ // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... - // SGPR SR memory offset : size(VGPR)+size(SVGPR) + // SGPR SR memory offset : size(VGPR) get_vgpr_size_bytes(s_save_mem_offset, s_wave_size) - get_svgpr_size_bytes(s_save_tmp) - s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes s_mov_b32 ttmp13, 0x0 //next VGPR lane to copy SGPR into @@ -502,7 +659,9 @@ L_SAVE_SGPR_LOOP: s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled? 
s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS + s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset + s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0 + global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80 s_mov_b32 ttmp13, 0x0 v_mov_b32 v2, 0x0 @@ -521,7 +680,14 @@ L_SAVE_SGPR_SKIP_TCP_STORE: s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] write_12sgpr_to_v2(s0) - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS +#if SAVE_TTMPS_IN_SGPR_BLOCK + // Last 16 dwords of the SGPR block already contain the TTMPS. Make + // sure to not override them. + s_mov_b32 exec_lo, 0xFFFF +#endif + s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset + s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0 + global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS /* save LDS */ @@ -547,18 +713,13 @@ L_SAVE_LDS_NORMAL: // first wave do LDS save; s_lshl_b32 s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY - s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes - // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) // get_vgpr_size_bytes(s_save_mem_offset, s_wave_size) - get_svgpr_size_bytes(s_save_tmp) - s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes() s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - //load 0~63*4(byte address) to vgpr v0 v_mbcnt_lo_u32_b32 v0, -1, 0 v_mbcnt_hi_u32_b32 v0, -1, v0 @@ -578,7 +739,9 @@ L_SAVE_LDS_W32: L_SAVE_LDS_LOOP_W32: ds_read_b32 v1, v0 s_wait_idle - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS + s_add_u32 s_save_addr_lo, 
s_save_base_addr_lo, s_save_mem_offset + s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0 + global_store_addtid_b32 v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS s_add_u32 m0, m0, s3 //every buffer_store_lds does 128 bytes s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 @@ -596,7 +759,9 @@ L_SAVE_LDS_W64: L_SAVE_LDS_LOOP_W64: ds_read_b32 v1, v0 s_wait_idle - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS + s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset + s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0 + global_store_addtid_b32 v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 @@ -629,8 +794,6 @@ L_SAVE_VGPR_NORMAL: s_cmp_eq_u32 m0, 1 s_cbranch_scc1 L_SAVE_VGPR_WAVE64 - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - // VGPR Allocated in 4-GPR granularity // VGPR store using dw burst @@ -644,10 +807,12 @@ L_SAVE_VGPR_W32_LOOP: v_movrels_b32 v2, v2 //v2 = v[2+m0] v_movrels_b32 v3, v3 //v3 = v[3+m0] - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3 + s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset + s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0 + global_store_addtid_b32 v0, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS + global_store_addtid_b32 v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128 + global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128*2 + global_store_addtid_b32 v3, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128*3 s_add_u32 m0, m0, 4 //next vgpr index s_add_u32 
s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes @@ -657,12 +822,10 @@ L_SAVE_VGPR_W32_LOOP: s_branch L_SAVE_VGPR_END L_SAVE_VGPR_WAVE64: - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - // VGPR store using dw burst s_mov_b32 m0, 0x4 //VGPR initial index value =4 s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc0 L_SAVE_SHARED_VGPR + s_cbranch_scc0 L_SAVE_VGPR_END L_SAVE_VGPR_W64_LOOP: v_movrels_b32 v0, v0 //v0 = v[0+m0] @@ -670,45 +833,24 @@ L_SAVE_VGPR_W64_LOOP: v_movrels_b32 v2, v2 //v2 = v[2+m0] v_movrels_b32 v3, v3 //v3 = v[3+m0] - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3 + s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset + s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0 + global_store_addtid_b32 v0, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS + global_store_addtid_b32 v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256 + global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256*2 + global_store_addtid_b32 v3, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256*3 s_add_u32 m0, m0, 4 //next vgpr index s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP //VGPR save is complete? -L_SAVE_SHARED_VGPR: - s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) - s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero? 
- s_cbranch_scc0 L_SAVE_VGPR_END //no shared_vgpr used? jump to L_SAVE_LDS - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value) - //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count. - //save shared_vgpr will start from the index of m0 - s_add_u32 s_save_alloc_size, s_save_alloc_size, m0 - s_mov_b32 exec_lo, 0xFFFFFFFF - s_mov_b32 exec_hi, 0x00000000 - -L_SAVE_SHARED_VGPR_WAVE64_LOOP: - v_movrels_b32 v0, v0 //v0 = v[0+m0] - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS - s_add_u32 m0, m0, 1 //next vgpr index - s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 - s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete? - L_SAVE_VGPR_END: s_branch L_END_PGM L_RESTORE: - /* Setup Resource Contants */ - s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo - s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE - s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) - s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC + s_mov_b32 s_restore_base_addr_lo, s_restore_spi_init_lo + s_and_b32 s_restore_base_addr_hi, s_restore_spi_init_hi, ADDRESS_HI32_MASK // Save s_restore_spi_init_hi for later use. s_mov_b32 s_restore_spi_init_hi_save, s_restore_spi_init_hi @@ -735,28 +877,31 @@ L_RESTORE_LDS_NORMAL: s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? s_cbranch_scc0 L_RESTORE_VGPR //no lds used? 
jump to L_RESTORE_VGPR s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY - s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes - // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) // get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) - get_svgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE s_and_b32 m0, m0, 1 s_cmp_eq_u32 m0, 1 s_mov_b32 m0, 0x0 + + v_mbcnt_lo_u32_b32 v1, -1, 0 + v_mbcnt_hi_u32_b32 v1, -1, v1 + v_lshlrev_b32 v1, 2, v1 // 0, 4, 8, ... 124 (W32) or 252 (W64) + s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 L_RESTORE_LDS_LOOP_W32: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset + s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset + s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0 + global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS s_wait_idle - ds_store_addtid_b32 v0 + ds_store_b32 v1, v0 + v_add_nc_u32 v1, v1, 128 s_add_u32 m0, m0, 128 // 128 DW s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 128DW s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 
1 : 0 @@ -764,9 +909,12 @@ L_RESTORE_LDS_LOOP_W32: s_branch L_RESTORE_VGPR L_RESTORE_LDS_LOOP_W64: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset + s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset + s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0 + global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS s_wait_idle - ds_store_addtid_b32 v0 + ds_store_b32 v1, v0 + v_add_nc_u32 v1, v1, 256 s_add_u32 m0, m0, 256 // 256 DW s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256DW s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 @@ -795,20 +943,18 @@ L_RESTORE_VGPR_NORMAL: s_cmp_eq_u32 m0, 1 s_cbranch_scc1 L_RESTORE_VGPR_WAVE64 - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - // VGPR load using dw burst s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4 s_mov_b32 m0, 4 //VGPR initial index value = 4 - s_cmp_lt_u32 m0, s_restore_alloc_size - s_cbranch_scc0 L_RESTORE_SGPR L_RESTORE_VGPR_WAVE32_LOOP: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*3 + s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset + s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0 + global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS + global_load_addtid_b32 v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128 + global_load_addtid_b32 v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*2 + 
global_load_addtid_b32 v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*3 s_wait_idle v_movreld_b32 v0, v0 //v[0+m0] = v0 v_movreld_b32 v1, v1 @@ -820,29 +966,31 @@ L_RESTORE_VGPR_WAVE32_LOOP: s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete? /* VGPR restore on v0 */ - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*3 + s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset_save + s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0 + global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS + global_load_addtid_b32 v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128 + global_load_addtid_b32 v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*2 + global_load_addtid_b32 v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*3 s_wait_idle s_branch L_RESTORE_SGPR L_RESTORE_VGPR_WAVE64: - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - // VGPR load using dw burst s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v4, v0 will be the last s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 s_mov_b32 m0, 4 //VGPR initial index value = 4 s_cmp_lt_u32 m0, s_restore_alloc_size - s_cbranch_scc0 L_RESTORE_SHARED_VGPR + s_cbranch_scc0 L_RESTORE_V0 L_RESTORE_VGPR_WAVE64_LOOP: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset 
scope:SCOPE_SYS offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256*3 + s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset + s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0 + global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS + global_load_addtid_b32 v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256 + global_load_addtid_b32 v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*2 + global_load_addtid_b32 v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*3 s_wait_idle v_movreld_b32 v0, v0 //v[0+m0] = v0 v_movreld_b32 v1, v1 @@ -853,50 +1001,29 @@ L_RESTORE_VGPR_WAVE64_LOOP: s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete? -L_RESTORE_SHARED_VGPR: - s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size - s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero? - s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used? - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value) - //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count. 
- //restore shared_vgpr will start from the index of m0 - s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0 - s_mov_b32 exec_lo, 0xFFFFFFFF - s_mov_b32 exec_hi, 0x00000000 -L_RESTORE_SHARED_VGPR_WAVE64_LOOP: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS - s_wait_idle - v_movreld_b32 v0, v0 //v[0+m0] = v0 - s_add_u32 m0, m0, 1 //next vgpr index - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 - s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete? - - s_mov_b32 exec_hi, 0xFFFFFFFF //restore back exec_hi before restoring V0!! - /* VGPR restore on v0 */ L_RESTORE_V0: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*3 + s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset_save + s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0 + global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS + global_load_addtid_b32 v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256 + global_load_addtid_b32 v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*2 + global_load_addtid_b32 v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*3 s_wait_idle /* restore SGPRs */ //will be 2+8+16*6 - // SGPR SR memory offset : size(VGPR)+size(SVGPR) + // SGPR SR memory offset : size(VGPR) L_RESTORE_SGPR: get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) - get_svgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp s_add_u32 
s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 20*4 //s108~s127 is not saved - - s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 24*4 // s[104:107] + s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset + s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0 s_mov_b32 m0, s_sgpr_save_num - read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) + s_load_b128 s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS s_wait_idle s_sub_u32 m0, m0, 4 // Restore from S[0] to S[104] @@ -905,7 +1032,9 @@ L_RESTORE_SGPR: s_movreld_b64 s0, s0 //s[0+m0] = s0 s_movreld_b64 s2, s2 - read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) + s_sub_co_u32 s_restore_addr_lo, s_restore_addr_lo, 8*4 // s[96:103] + s_sub_co_ci_u32 s_restore_addr_hi, s_restore_addr_hi, 0 + s_load_b256 s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS s_wait_idle s_sub_u32 m0, m0, 8 // Restore from S[0] to S[96] @@ -917,7 +1046,9 @@ L_RESTORE_SGPR: s_movreld_b64 s6, s6 L_RESTORE_SGPR_LOOP: - read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) + s_sub_co_u32 s_restore_addr_lo, s_restore_addr_lo, 16*4 // s[0,16,32,48,64,80] + s_sub_co_ci_u32 s_restore_addr_hi, s_restore_addr_hi, 0 + s_load_b512 s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS s_wait_idle s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] @@ -941,76 +1072,123 @@ L_RESTORE_SGPR: /* restore HW registers */ L_RESTORE_HWREG: - // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR) + // HWREG SR memory offset : size(VGPR)+size(SGPR) get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) - get_svgpr_size_bytes(s_restore_tmp) - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() - - s_mov_b32 
s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset + s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0 // Restore s_restore_spi_init_hi before the saved value gets clobbered. s_mov_b32 s_restore_spi_init_hi, s_restore_spi_init_hi_save - read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_state_priv, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_excp_flag_priv, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) - read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) + s_load_b32 s_restore_m0, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS + s_load_b32 s_restore_pc_lo, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x4 + s_load_b32 s_restore_pc_hi, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x8 + s_load_b32 s_restore_exec_lo, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0xC + s_load_b32 s_restore_exec_hi, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x10 + s_load_b32 s_restore_state_priv, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x14 + s_load_b32 s_restore_excp_flag_priv, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x18 + s_load_b32 s_restore_xnack_mask, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x1C + 
s_load_b32 s_restore_mode, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x20 + s_load_b32 s_restore_flat_scratch, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x24 s_wait_idle s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_LO), s_restore_flat_scratch - read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) + s_load_b32 s_restore_flat_scratch, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x28 s_wait_idle s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_HI), s_restore_flat_scratch - read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) + s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x2C s_wait_idle s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp - read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) + s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x30 s_wait_idle s_setreg_b32 hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp - // Only the first wave needs to restore the workgroup barrier. + // Only the first wave needs to restore group barriers. s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK - s_cbranch_scc0 L_SKIP_BARRIER_RESTORE + s_cbranch_scc0 L_SKIP_GROUP_BARRIER_RESTORE // Skip over WAVE_STATUS, since there is no state to restore from it - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 4 - read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) + s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x38 s_wait_idle + // Skip group barriers if wave is not part of a group. 
s_bitcmp1_b32 s_restore_tmp, BARRIER_STATE_VALID_OFFSET - s_cbranch_scc0 L_SKIP_BARRIER_RESTORE + s_cbranch_scc0 L_SKIP_GROUP_BARRIER_RESTORE - // extract the saved signal count from s_restore_tmp - s_lshr_b32 s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET + // Restore workgroup barrier signal count. + restore_barrier_signal_count(-1) - // We need to call s_barrier_signal repeatedly to restore the signal - // count of the work group barrier. The member count is already - // initialized with the number of waves in the work group. -L_BARRIER_RESTORE_LOOP: - s_and_b32 s_restore_tmp, s_restore_tmp, s_restore_tmp - s_cbranch_scc0 L_SKIP_BARRIER_RESTORE - s_barrier_signal -1 - s_add_i32 s_restore_tmp, s_restore_tmp, -1 - s_branch L_BARRIER_RESTORE_LOOP +#if NUM_NAMED_BARRIERS + s_mov_b32 s_restore_mem_offset, NAMED_BARRIERS_SR_OFFSET_FROM_HWREG + s_mov_b32 m0, 1 + +L_RESTORE_NAMED_BARRIER_LOOP: + s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], s_restore_mem_offset scope:SCOPE_SYS + s_wait_kmcnt 0 + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 0x4 -L_SKIP_BARRIER_RESTORE: + // Restore named barrier member count. + s_bfe_u32 exec_lo, s_restore_tmp, (BARRIER_STATE_MEMBER_OFFSET | (BARRIER_STATE_MEMBER_SIZE << 16)) + s_lshl_b32 exec_lo, exec_lo, S_BARRIER_INIT_MEMBERCNT_SHIFT + s_or_b32 m0, m0, exec_lo + s_barrier_init m0 + s_andn2_b32 m0, m0, S_BARRIER_INIT_MEMBERCNT_MASK + + // Restore named barrier signal count. + restore_barrier_signal_count(m0) + + s_add_u32 m0, m0, 1 + s_cmp_gt_u32 m0, NUM_NAMED_BARRIERS + s_cbranch_scc0 L_RESTORE_NAMED_BARRIER_LOOP +#endif + +L_SKIP_GROUP_BARRIER_RESTORE: +#if HAVE_CLUSTER_BARRIER + s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x3C + s_wait_kmcnt 0 + + // Skip cluster barrier restore if wave is not part of a cluster. 
+ s_bitcmp1_b32 s_restore_tmp, BARRIER_STATE_VALID_OFFSET + s_cbranch_scc0 L_SKIP_CLUSTER_BARRIER_RESTORE + + // Only the first wave in the group signals the trap cluster barrier. + s_bitcmp1_b32 s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT + s_cbranch_scc0 L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL + + // Clear SCC: s_barrier_signal_isfirst -4 writes SCC=>1 but not SCC=>0. + s_cmp_eq_u32 0, 1 + s_barrier_signal_isfirst -4 +L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL: + s_barrier_wait -4 + + // Only the first wave in the cluster restores the barrier. + s_cbranch_scc0 L_SKIP_CLUSTER_BARRIER_RESTORE + + // Restore cluster barrier signal count. + restore_barrier_signal_count(-3) +L_SKIP_CLUSTER_BARRIER_RESTORE: +#endif + +#if ASIC_FAMILY >= CHIP_GC_12_0_3 + s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x40 + s_wait_kmcnt 0 + s_setreg_b32 hwreg(HW_REG_WAVE_SCHED_MODE), s_restore_tmp +#endif s_mov_b32 m0, s_restore_m0 s_mov_b32 exec_lo, s_restore_exec_lo s_mov_b32 exec_hi, s_restore_exec_hi +#if HAVE_XNACK + s_setreg_b32 hwreg(HW_REG_WAVE_XNACK_MASK), s_restore_xnack_mask +#endif + // EXCP_FLAG_PRIV.SAVE_CONTEXT and HOST_TRAP may have changed. // Only restore the other fields to avoid clobbering them. 
s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, 0, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE), s_restore_excp_flag_priv @@ -1022,37 +1200,50 @@ L_SKIP_BARRIER_RESTORE: s_setreg_b32 hwreg(HW_REG_WAVE_MODE), s_restore_mode // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic - // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40 + // ttmp SR memory offset : + // - gfx12: size(VGPR)+size(SGPR)+0x40 + // - gfx12.5: size(VGPR)+size(SGPR)-0x40 get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size) - get_svgpr_size_bytes(s_restore_ttmps_hi) - s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi - s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes() - s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 - s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 - s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF - s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 scope:SCOPE_SYS - s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 scope:SCOPE_SYS - s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 scope:SCOPE_SYS + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, (get_sgpr_size_bytes() + TTMP_SR_OFFSET_FROM_HWREG) + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_base_addr_lo + s_addc_u32 s_restore_ttmps_hi, s_restore_base_addr_hi, 0x0 + s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x10 scope:SCOPE_SYS + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x20 scope:SCOPE_SYS + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x34 scope:SCOPE_SYS s_wait_idle - s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS +#if HAVE_XNACK + restore_xnack_state_priv(s_restore_tmp) +#endif + + s_and_b32 s_restore_pc_hi, 
s_restore_pc_hi, ADDRESS_HI32_MASK //Do it here in order not to affect STATUS s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 +#if RELAXED_SCHEDULING_IN_TRAP + // Assume relaxed scheduling mode after this point. + restore_sched_mode(s_restore_tmp) +#endif + s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV), s_restore_state_priv // SCC is included, which is changed by previous salu - // Make barrier and LDS state visible to all waves in the group. - // STATE_PRIV.BARRIER_COMPLETE may change after this point. - s_barrier_signal -2 - s_barrier_wait -2 + // Make barrier and LDS state visible to all waves in the group/cluster. + // STATE_PRIV.*BARRIER_COMPLETE may change after this point. + wait_trap_barriers(s_restore_tmp, 0, 0) + +#if HAVE_CLUSTER_BARRIER + // SCC is changed by wait_trap_barriers, restore it separately. + s_lshr_b32 s_restore_state_priv, s_restore_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT + s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, 1), s_restore_state_priv +#endif s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution L_END_PGM: - // Make sure that no wave of the workgroup can exit the trap handler - // before the workgroup barrier state is saved. - s_barrier_signal -2 - s_barrier_wait -2 + // Make sure that no wave of the group/cluster can exit the trap handler + // before the group/cluster barrier state is saved. 
+ wait_trap_barriers(s_restore_tmp, 0, 0) + s_endpgm_saved end @@ -1079,26 +1270,6 @@ function write_12sgpr_to_v2(s) end end -function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) - s_buffer_load_dword s, s_rsrc, s_mem_offset scope:SCOPE_SYS - s_add_u32 s_mem_offset, s_mem_offset, 4 -end - -function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) - s_sub_u32 s_mem_offset, s_mem_offset, 4*16 - s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset scope:SCOPE_SYS -end - -function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset) - s_sub_u32 s_mem_offset, s_mem_offset, 4*8 - s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset scope:SCOPE_SYS -end - -function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset) - s_sub_u32 s_mem_offset, s_mem_offset, 4*4 - s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset scope:SCOPE_SYS -end - function get_vgpr_size_bytes(s_vgpr_size_byte, s_size) s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 @@ -1111,20 +1282,275 @@ L_ENABLE_SHIFT_W64: L_SHIFT_DONE: end -function get_svgpr_size_bytes(s_svgpr_size_byte) - s_getreg_b32 s_svgpr_size_byte, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) - s_lshl_b32 s_svgpr_size_byte, s_svgpr_size_byte, (3+7) -end - function get_sgpr_size_bytes return 512 end function get_hwreg_size_bytes +#if ASIC_FAMILY >= CHIP_GC_12_0_3 + return 512 +#else return 128 +#endif end function get_wave_size2(s_reg) s_getreg_b32 s_reg, hwreg(HW_REG_WAVE_STATUS,SQ_WAVE_STATUS_WAVE64_SHIFT,SQ_WAVE_STATUS_WAVE64_SIZE) s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE end + +#if HAVE_XNACK +function save_and_clear_xnack_state_priv(s_tmp) + // Preserve and clear XNACK state before issuing further translations. + // Save XNACK_STATE_PRIV.{FIRST_REPLAY, REPLAY_W64H, FXPTR} into ttmp11[22:14]. 
+ s_andn2_b32 ttmp11, ttmp11, (TTMP11_FIRST_REPLAY_MASK | TTMP11_REPLAY_W64H_MASK | TTMP11_FXPTR_MASK) + + s_getreg_b32 s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE) + s_lshl_b32 s_tmp, s_tmp, TTMP11_FIRST_REPLAY_SHIFT + s_or_b32 ttmp11, ttmp11, s_tmp + + s_getreg_b32 s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE) + s_lshl_b32 s_tmp, s_tmp, TTMP11_REPLAY_W64H_SHIFT + s_or_b32 ttmp11, ttmp11, s_tmp + + s_getreg_b32 s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE) + s_lshl_b32 s_tmp, s_tmp, TTMP11_FXPTR_SHIFT + s_or_b32 ttmp11, ttmp11, s_tmp + + s_setreg_imm32_b32 hwreg(HW_REG_WAVE_XNACK_STATE_PRIV), 0 +end + +function restore_xnack_state_priv(s_tmp) + s_lshr_b32 s_tmp, ttmp11, TTMP11_FIRST_REPLAY_SHIFT + s_setreg_b32 hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE), s_tmp + + s_lshr_b32 s_tmp, ttmp11, TTMP11_REPLAY_W64H_SHIFT + s_setreg_b32 hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE), s_tmp + + s_lshr_b32 s_tmp, ttmp11, TTMP11_FXPTR_SHIFT + s_setreg_b32 hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE), s_tmp +end +#endif + +function wait_trap_barriers(s_tmp1, s_tmp2, serialize_wa) +#if HAVE_CLUSTER_BARRIER + // If not in a WG then wave cannot use s_barrier_signal_isfirst. + s_getreg_b32 s_tmp1, hwreg(HW_REG_WAVE_STATUS) + s_bitcmp0_b32 s_tmp1, SQ_WAVE_STATUS_IN_WG_SHIFT + s_cbranch_scc1 L_TRAP_CLUSTER_BARRIER_SIGNAL + + s_barrier_signal_isfirst -2 + s_barrier_wait -2 + + // Only the first wave in the group signals the trap cluster barrier. 
+ s_cbranch_scc0 L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL + +L_TRAP_CLUSTER_BARRIER_SIGNAL: + s_barrier_signal -4 + +L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL: + s_barrier_wait -4 + +#if CLUSTER_BARRIER_SERIALIZE_WORKAROUND +if serialize_wa + // Trap cluster barrier may complete with a user cluster barrier in-flight. + // This is indicated if user cluster member count and signal count are equal. +L_WAIT_USER_CLUSTER_BARRIER_COMPLETE: + s_sendmsg_rtn_b32 s_tmp1, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) + s_wait_kmcnt 0 + s_bitcmp0_b32 s_tmp1, BARRIER_STATE_VALID_OFFSET + s_cbranch_scc1 L_NOT_IN_CLUSTER + + s_bfe_u32 s_tmp2, s_tmp1, (BARRIER_STATE_MEMBER_OFFSET | (BARRIER_STATE_MEMBER_SIZE << 0x10)) + s_bfe_u32 s_tmp1, s_tmp1, (BARRIER_STATE_SIGNAL_OFFSET | (BARRIER_STATE_SIGNAL_SIZE << 0x10)) + s_cmp_eq_u32 s_tmp1, s_tmp2 + s_cbranch_scc1 L_WAIT_USER_CLUSTER_BARRIER_COMPLETE +end +L_NOT_IN_CLUSTER: +#endif + +#else + s_barrier_signal -2 + s_barrier_wait -2 +#endif +end + +#if RELAXED_SCHEDULING_IN_TRAP +function restore_sched_mode(s_tmp) + s_bfe_u32 s_tmp, ttmp11, (TTMP11_SCHED_MODE_SHIFT | (TTMP11_SCHED_MODE_SIZE << 0x10)) + s_setreg_b32 hwreg(HW_REG_WAVE_SCHED_MODE), s_tmp +end +#endif + +function restore_barrier_signal_count(barrier_id) + // extract the saved signal count from s_restore_tmp + s_lshr_b32 s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET + + // We need to call s_barrier_signal repeatedly to restore the signal count + // of the group/cluster barrier. The member count is already initialized. +L_BARRIER_RESTORE_LOOP: + s_and_b32 s_restore_tmp, s_restore_tmp, s_restore_tmp + s_cbranch_scc0 L_BARRIER_RESTORE_DONE + s_barrier_signal barrier_id + s_add_i32 s_restore_tmp, s_restore_tmp, -1 + s_branch L_BARRIER_RESTORE_LOOP + +L_BARRIER_RESTORE_DONE: +end + +#if HAVE_INSTRUCTION_FIXUP +function fixup_instruction + // PC read may fault if memory violation has been asserted. + // In this case no further progress is expected so fixup is not needed. 
+ s_bitcmp1_b32 s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_SHIFT + s_cbranch_scc1 L_FIXUP_DONE + + // ttmp[0:1]: {7b'0} PC[56:0] + // ttmp2, 3, 10, 13, 14, 15: free + s_load_b64 [ttmp14, ttmp15], [ttmp0, ttmp1], 0 scope:SCOPE_CU // Load the 2 instruction DW we are returning to + s_wait_kmcnt 0 + s_load_b64 [ttmp2, ttmp3], [ttmp0, ttmp1], 8 scope:SCOPE_CU // Load the next 2 instruction DW, just in case + s_and_b32 ttmp10, ttmp14, 0x80000000 // Check bit 31 in the first DWORD + // SCC set if ttmp10 is != 0, i.e. if bit 31 == 1 + s_cbranch_scc1 L_FIXUP_NOT_VOP12C // If bit 31 is 1, we are not VOP1, VOP2, or VOPC + // Fall through here means bit 31 == 0, meaning we are VOP1, VOP2, or VOPC + // Size of instruction depends on Opcode or SRC0_9 + // Check for VOP2 opcode + s_bfe_u32 ttmp10, ttmp14, (25 | (6 << 0x10)) // Check bits 30:25 for VOP2 Opcode + // VOP2 V_FMAMK_F64 or V_FMAAK_F64 has implied 64-bit literal, 3 DW + s_sub_co_i32 ttmp13, ttmp10, 0x23 // V_FMAMK_F64 is 0x23, V_FMAAK_F64 is 0x24 + s_cmp_le_u32 ttmp13, 0x1 // 0==0x23, 1==0x24 + s_cbranch_scc1 L_FIXUP_THREE_DWORD // If either, this is 3 DWORD inst + // VOP2 V_FMAMK_F32, V_FMAAK_F32, V_FMAMK_F16, V_FMAAK_F16, 2 DW + s_sub_co_i32 ttmp13, ttmp10, 0x2c // V_FMAMK_F32 is 0x2c, V_FMAAK_F32 is 0x2d + s_cmp_le_u32 ttmp13, 0x1 // 0==0x2c, 1==0x2d + s_cbranch_scc1 L_FIXUP_TWO_DWORD // If either, this is 2 DWORD inst + s_sub_co_i32 ttmp13, ttmp10, 0x37 // V_FMAMK_F16 is 0x37, V_FMAAK_F16 is 0x38 + s_cmp_le_u32 ttmp13, 0x1 // 0==0x37, 1==0x38 + s_cbranch_scc1 L_FIXUP_TWO_DWORD // If either, this is 2 DWORD inst + // Check SRC0_9 for VOP1, VOP2, and VOPC + s_and_b32 ttmp10, ttmp14, 0x1ff // Check bits 8:0 for SRC0_9 + // Literal constant 64 is 3 DWORDs + s_cmp_eq_u32 ttmp10, 0xfe // 0xfe == 254 == Literal constant64 + s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst + // Literal constant 32, DPP16, DPP8, and DPP8FI are 2 DWORDs + s_cmp_eq_u32 ttmp10, 0xff // 0xff == 255 = Literal constant32 + 
s_cbranch_scc1 L_FIXUP_TWO_DWORD // 2 DWORD inst + s_cmp_eq_u32 ttmp10, 0xfa // 0xfa == 250 = DPP16 + s_cbranch_scc1 L_FIXUP_TWO_DWORD // 2 DWORD inst + s_sub_co_i32 ttmp13, ttmp10, 0xe9 // DPP8 is 0xe9, DPP8FI is 0xea + s_cmp_le_u32 ttmp13, 0x1 // 0==0xe9, 1==0xea + s_cbranch_scc1 L_FIXUP_TWO_DWORD // If either, this is 2 DWORD inst + // Instruction is 1 DWORD otherwise + +L_FIXUP_ONE_DWORD: + // Check if TTMP15 contains the value for S_SET_VGPR_MSB instruction + s_and_b32 ttmp10, ttmp15, 0xffff0000 // Check encoding in upper 16 bits + s_cmp_eq_u32 ttmp10, 0xbf860000 // Check if SOPP (9b'10_1111111) and S_SET_VGPR_MSB (7b'0000110) + s_cbranch_scc0 L_FIXUP_DONE // No problem, no fixup needed + // VALU op followed by a S_SET_VGPR_MSB. Need to pull SIMM[15:8] to fix up MODE.*_VGPR_MSB + s_bfe_u32 ttmp10, ttmp15, (14 | (2 << 0x10)) // Shift SIMM[15:14] over to 1:0, Dst + s_and_b32 ttmp13, ttmp15, 0x3f00 // Mask to get SIMM[13:8] only + s_lshr_b32 ttmp13, ttmp13, 6 // Shift SIMM[13:8] into 7:2, Src2, Src1, Src0 + s_or_b32 ttmp10, ttmp10, ttmp13 // Src2, Src1, Src0, Dst --> format in MODE register + s_setreg_b32 hwreg(HW_REG_WAVE_MODE, 12, 8), ttmp10 // Write value into MODE[19:12] + s_branch L_FIXUP_DONE + +L_FIXUP_NOT_VOP12C: + // ttmp[0:1]: {7b'0} PC[56:0] + // ttmp2: PC+2 value (not waitcnt'ed yet) + // ttmp3: PC+3 value (not waitcnt'ed yet) + // ttmp10, ttmp13: free + // ttmp14: PC+0 value + // ttmp15: PC+1 value + // Not VOP1, VOP2, or VOPC. + // Check if we are VOP3 or VOP3SD + s_and_b32 ttmp10, ttmp14, 0xfc000000 // Bits 31:26 + s_cmp_eq_u32 ttmp10, 0xd4000000 // If 31:26 = 0x35, this is VOP3 or VOP3SD + s_cbranch_scc1 L_FIXUP_CHECK_VOP3 // If VOP3 or VOP3SD, need to check SRC2_9, SRC1_9, SRC0_9 + // Not VOP1, VOP2, VOPC, VOP3, or VOP3SD. + // Check for VOPD + s_cmp_eq_u32 ttmp10, 0xc8000000 // If 31:26 = 0x32, this is VOPD + s_cbranch_scc1 L_FIXUP_CHECK_VOPD // If VOPD, need to check OpX, OpY, SRCX0 and SRCY0 + // Not VOP1, VOP2, VOPC, VOP3, VOP3SD, VOPD. 
+ // Check if we are VOPD3 + s_and_b32 ttmp10, ttmp14, 0xff000000 // Bits 31:24 + s_cmp_eq_u32 ttmp10, 0xcf000000 // If 31:24 = 0xcf, this is VOPD3 + s_cbranch_scc1 L_FIXUP_THREE_DWORD // If VOPD3, 3 DWORD inst + // Not VOP1, VOP2, VOPC, VOP3, VOP3SD, VOPD, or VOPD3. + // Check if we are in the middle of VOP3PX. + s_and_b32 ttmp13, ttmp14, 0xffff0000 // Bits 31:16 + s_cmp_eq_u32 ttmp13, 0xcc330000 // If 31:16 = 0xcc33, this is 8 bytes past VOP3PX + s_cbranch_scc1 L_FIXUP_VOP3PX_MIDDLE + s_cmp_eq_u32 ttmp13, 0xcc880000 // If 31:16 = 0xcc88, this is 8 bytes past VOP3PX + s_cbranch_scc1 L_FIXUP_VOP3PX_MIDDLE + // Might be in VOP3P, but we must ensure we are not VOP3PX2 + s_cmp_eq_u32 ttmp13, 0xcc350000 // If 31:16 = 0xcc35, this is VOP3PX2 + s_cbranch_scc1 L_FIXUP_DONE // If VOP3PX2, no fixup needed + s_cmp_eq_u32 ttmp13, 0xcc3a0000 // If 31:16 = 0xcc3a, this is VOP3PX2 + s_cbranch_scc1 L_FIXUP_DONE // If VOP3PX2, no fixup needed + // Check if we are VOP3P + s_cmp_eq_u32 ttmp10, 0xcc000000 // If 31:24 = 0xcc, this is VOP3P + s_cbranch_scc0 L_FIXUP_DONE // Not in VOP3P, so instruction is not VOP1, VOP2, + // VOPC, VOP3, VOP3SD, VOP3P, VOPD, or VOPD3 + // No fixup needed. 
+ // Fall-through if we are in VOP3P to check SRC2_9, SRC1_9, and SRC0_9 +L_FIXUP_CHECK_VOP3: + // Start with Src0, which is in bits 8:0 of second instruction DW, ttmp15 + s_and_b32 ttmp10, ttmp15, 0x1ff // Mask out unused bits + // Src0_9 == Literal constant 32, DPP16, DPP8, and DPP8FI means 3 DWORDs + s_cmp_eq_u32 ttmp10, 0xff // 0xff == 255 = Literal constant32 + s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst + s_cmp_eq_u32 ttmp10, 0xfa // 0xfa == 250 = DPP16 + s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst + s_sub_co_i32 ttmp10, ttmp10, 0xe9 // DPP8 is 0xe9, DPP8FI is 0xea + s_cmp_le_u32 ttmp10, 0x1 // 0==0xe9, 1==0xea + s_cbranch_scc1 L_FIXUP_THREE_DWORD // If either, this is 3 DWORD inst + s_and_b32 ttmp10, ttmp15, 0x3fe00 // Next is Src1, which is in 17:9 + s_cmp_eq_u32 ttmp10, 0x1fe00 // 0xff == 255 = Literal constant32 + s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst + s_and_b32 ttmp10, ttmp15, 0x7fc0000 // Next is Src2, which is in 26:18 + s_cmp_eq_u32 ttmp10, 0x3fc0000 // 0xff == 255 = Literal constant32 + s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst + s_branch L_FIXUP_TWO_DWORD // No special encodings, VOP3* is 2 Dword + +L_FIXUP_CHECK_VOPD: + // OpX being V_DUAL_FMA*K_F32 means 3 DWORDs + s_bfe_u32 ttmp10, ttmp14, (22 | (4 << 0x10)) // OPX is bits 25:22 + s_sub_co_i32 ttmp10, ttmp10, 0x1 // V_DUAL_FMAAK_F32 is 0x1, V_DUAL_FMAMK_F32 is 0x2 + s_cmp_le_u32 ttmp10, 0x1 // 0==0x1, 1==0x2 + s_cbranch_scc1 L_FIXUP_THREE_DWORD // If either, this is 3 DWORD inst + // OpY being V_DUAL_FMA*K_F32 means 3 DWORDs + s_bfe_u32 ttmp10, ttmp14, (17 | (5 << 0x10)) // OPY is bits 21:17 + s_sub_co_i32 ttmp10, ttmp10, 0x1 // V_DUAL_FMAAK_F32 is 0x1, V_DUAL_FMAMK_F32 is 0x2 + s_cmp_le_u32 ttmp10, 0x1 // 0==0x1, 1==0x2 + s_cbranch_scc1 L_FIXUP_THREE_DWORD // If either, this is 3 DWORD inst + // SRCX0 == Literal constant 32 means 3 DWORDs + s_and_b32 ttmp10, ttmp14, 0x1ff // SRCX0 is in bits 8:0 of 1st DWORD + s_cmp_eq_u32 ttmp10, 0xff // 0xff == 255 = 
Literal constant32 + s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst + // SRCY0 == Literal constant 32 means 3 DWORDs + s_and_b32 ttmp10, ttmp15, 0x1ff // SRCY0 is in bits 8:0 of 2nd DWORD + s_cmp_eq_u32 ttmp10, 0xff // 0xff == 255 = Literal constant32 + s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst + // If otherwise, no special encodings. Default VOPD is 2 Dword + // Fall-thru if true, because this is a 2 DWORD inst +L_FIXUP_TWO_DWORD: + s_wait_kmcnt 0 // Wait for PC+2 and PC+3 to arrive in ttmp2 and ttmp3 + s_mov_b32 ttmp15, ttmp2 // Move possible S_SET_VGPR_MSB into ttmp15 + s_branch L_FIXUP_ONE_DWORD // Go to common logic that checks if it is S_SET_VGPR_MSB + +L_FIXUP_THREE_DWORD: + s_wait_kmcnt 0 // Wait for PC+2 and PC+3 to arrive in ttmp2 and ttmp3 + s_mov_b32 ttmp15, ttmp3 // Move possible S_SET_VGPR_MSB into ttmp15 + s_branch L_FIXUP_ONE_DWORD // Go to common logic that checks if it is S_SET_VGPR_MSB + +L_FIXUP_VOP3PX_MIDDLE: + s_sub_co_u32 ttmp0, ttmp0, 8 // Rewind PC 8 bytes to beginning of instruction + s_sub_co_ci_u32 ttmp1, ttmp1, 0 + s_branch L_FIXUP_TWO_DWORD // 2 DWORD inst (2nd half of a 4 DWORD inst) + +L_FIXUP_DONE: + s_wait_kmcnt 0 // Ensure load of ttmp2 and ttmp3 is done +end +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 065d87841459..03b266b26738 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -22,10 +22,10 @@ */ #include <linux/device.h> -#include <linux/export.h> #include <linux/err.h> #include <linux/fs.h> #include <linux/file.h> +#include <linux/overflow.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/uaccess.h> @@ -67,6 +67,21 @@ static const struct class kfd_class = { .name = kfd_dev_name, }; +/* + * Cache the address space of the chardev on first open so that the reset + * path can drop all userspace mappings of doorbell and MMIO ranges via + * unmap_mapping_range(). 
+ */ +static struct address_space *kfd_dev_mapping; + +void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen) +{ + struct address_space *mapping = READ_ONCE(kfd_dev_mapping); + + if (mapping) + unmap_mapping_range(mapping, holebegin, holelen, 1); +} + static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct kfd_process *p, __u32 gpu_id) { struct kfd_process_device *pdd; @@ -133,6 +148,13 @@ static int kfd_open(struct inode *inode, struct file *filep) if (iminor(inode) != 0) return -ENODEV; + /* + * /dev/kfd is a single chardev so all opens share one inode. Cache + * its address_space on the first open for use by the reset path. + */ + if (!READ_ONCE(kfd_dev_mapping)) + cmpxchg(&kfd_dev_mapping, NULL, inode->i_mapping); + is_32bit_user_mode = in_compat_syscall(); if (is_32bit_user_mode) { @@ -155,8 +177,8 @@ static int kfd_open(struct inode *inode, struct file *filep) /* filep now owns the reference returned by kfd_create_process */ filep->private_data = process; - dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n", - process->pasid, process->is_32bit_user_mode); + dev_dbg(kfd_device, "process pid %d opened kfd node, compat mode (32 bit) - %d\n", + process->lead_thread->pid, process->is_32bit_user_mode); return 0; } @@ -165,8 +187,13 @@ static int kfd_release(struct inode *inode, struct file *filep) { struct kfd_process *process = filep->private_data; - if (process) - kfd_unref_process(process); + if (!process) + return 0; + + if (process->context_id != KFD_CONTEXT_ID_PRIMARY) + kfd_process_notifier_release_internal(process); + + kfd_unref_process(process); return 0; } @@ -212,6 +239,16 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, return -EINVAL; } + if (args->ring_size < KFD_MIN_QUEUE_RING_SIZE) { + args->ring_size = KFD_MIN_QUEUE_RING_SIZE; + pr_debug("Size lower. 
clamped to KFD_MIN_QUEUE_RING_SIZE"); + } + + if ((args->metadata_ring_size != 0) && !is_power_of_2(args->metadata_ring_size)) { + pr_err("Metadata ring size must be a power of 2 or 0\n"); + return -EINVAL; + } + if (!access_ok((const void __user *) args->read_pointer_address, sizeof(uint32_t))) { pr_err("Can't access read pointer\n"); @@ -246,6 +283,9 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->priority = args->queue_priority; q_properties->queue_address = args->ring_base_address; q_properties->queue_size = args->ring_size; + if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) + q_properties->metadata_queue_size = args->metadata_ring_size; + q_properties->read_ptr = (void __user *)args->read_pointer_address; q_properties->write_ptr = (void __user *)args->write_pointer_address; q_properties->eop_ring_buffer_address = args->eop_buffer_address; @@ -361,8 +401,8 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, goto err_acquire_queue_buf; } - pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n", - p->pasid, + pr_debug("Creating queue for process pid %d on gpu 0x%x\n", + p->lead_thread->pid, dev->id); err = pqm_create_queue(&p->pqm, dev, &q_properties, &queue_id, @@ -415,9 +455,9 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, int retval; struct kfd_ioctl_destroy_queue_args *args = data; - pr_debug("Destroying queue id %d for pasid 0x%x\n", + pr_debug("Destroying queue id %d for process pid %d\n", args->queue_id, - p->pasid); + p->lead_thread->pid); mutex_lock(&p->mutex); @@ -461,6 +501,11 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, return -EINVAL; } + if (args->ring_size < KFD_MIN_QUEUE_RING_SIZE) { + args->ring_size = KFD_MIN_QUEUE_RING_SIZE; + pr_debug("Size lower. 
clamped to KFD_MIN_QUEUE_RING_SIZE"); + } + properties.queue_address = args->ring_base_address; properties.queue_size = args->ring_size; properties.queue_percent = args->queue_percentage & 0xFF; @@ -468,8 +513,8 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, properties.pm4_target_xcc = (args->queue_percentage >> 8) & 0xFF; properties.priority = args->queue_priority; - pr_debug("Updating queue id %d for pasid 0x%x\n", - args->queue_id, p->pasid); + pr_debug("Updating queue id %d for process pid %d\n", + args->queue_id, p->lead_thread->pid); mutex_lock(&p->mutex); @@ -512,15 +557,10 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, cu_mask_size = sizeof(uint32_t) * (max_num_cus/32); } - minfo.cu_mask.ptr = kzalloc(cu_mask_size, GFP_KERNEL); - if (!minfo.cu_mask.ptr) - return -ENOMEM; - - retval = copy_from_user(minfo.cu_mask.ptr, cu_mask_ptr, cu_mask_size); - if (retval) { + minfo.cu_mask.ptr = memdup_user(cu_mask_ptr, cu_mask_size); + if (IS_ERR(minfo.cu_mask.ptr)) { pr_debug("Could not copy CU mask from userspace"); - retval = -EFAULT; - goto out; + return PTR_ERR(minfo.cu_mask.ptr); } mutex_lock(&p->mutex); @@ -529,7 +569,6 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, mutex_unlock(&p->mutex); -out: kfree(minfo.cu_mask.ptr); return retval; } @@ -596,7 +635,8 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, default_policy, alternate_policy, (void __user *)args->alternate_aperture_base, - args->alternate_aperture_size)) + args->alternate_aperture_size, + args->misc_process_flag)) err = -EINVAL; out: @@ -695,7 +735,7 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, struct kfd_process_device_apertures *pAperture; int i; - dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid); + dev_dbg(kfd_device, "get apertures for process pid %d", p->lead_thread->pid); args->num_of_nodes = 0; @@ -747,7 +787,8 @@ static int 
kfd_ioctl_get_process_apertures_new(struct file *filp, int ret; int i; - dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid); + dev_dbg(kfd_device, "get apertures for process pid %d", + p->lead_thread->pid); if (args->num_of_nodes == 0) { /* Return number of nodes, so that user space can alloacate @@ -758,12 +799,15 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp, goto out_unlock; } + if (args->num_of_nodes > kfd_topology_get_num_devices()) + return -EINVAL; + /* Fill in process-aperture information for all available * nodes, but not more than args->num_of_nodes as that is * the amount of memory allocated by user */ - pa = kcalloc(args->num_of_nodes, sizeof(struct kfd_process_device_apertures), - GFP_KERNEL); + pa = kzalloc_objs(struct kfd_process_device_apertures, + args->num_of_nodes); if (!pa) return -ENOMEM; @@ -1052,6 +1096,12 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, if (args->size == 0) return -EINVAL; + if (p->context_id != KFD_CONTEXT_ID_PRIMARY && (flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR)) { + pr_debug("USERPTR is not supported on non-primary kfd_process\n"); + + return -EOPNOTSUPP; + } + #if IS_ENABLED(CONFIG_HSA_AMD_SVM) /* Flush pending deferred work to avoid racing with deferred actions * from previous memory map changes (e.g. munmap). @@ -1059,7 +1109,12 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, svm_range_list_lock_and_flush_work(&p->svms, current->mm); mutex_lock(&p->svms.lock); mmap_write_unlock(current->mm); - if (interval_tree_iter_first(&p->svms.objects, + + /* Skip a special case that allocates VRAM without VA, + * VA will be invalid of 0. 
+ */ + if (!(!args->va_addr && (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)) && + interval_tree_iter_first(&p->svms.objects, args->va_addr >> PAGE_SHIFT, (args->va_addr + args->size - 1) >> PAGE_SHIFT)) { pr_err("Address: 0x%llx already allocated by SVM\n", @@ -1327,7 +1382,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]); if (WARN_ON_ONCE(!peer_pdd)) continue; - kfd_flush_tlb(peer_pdd, TLB_FLUSH_LEGACY); + kfd_flush_tlb(peer_pdd); } kfree(devices_arr); @@ -1422,7 +1477,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, if (WARN_ON_ONCE(!peer_pdd)) continue; if (flush_tlb) - kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT); + kfd_flush_tlb(peer_pdd); /* Remove dma mapping after tlb flush to avoid IO_PAGE_FAULT */ err = amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv); @@ -1663,6 +1718,16 @@ static int kfd_ioctl_smi_events(struct file *filep, return kfd_smi_event_open(pdd->dev, &args->anon_fd); } +static int kfd_ioctl_svm_validate(void *kdata, unsigned int usize) +{ + struct kfd_ioctl_svm_args *args = kdata; + size_t expected = struct_size(args, attrs, args->nattr); + + if (expected == SIZE_MAX || usize < expected) + return -EINVAL; + return 0; +} + #if IS_ENABLED(CONFIG_HSA_AMD_SVM) static int kfd_ioctl_set_xnack_mode(struct file *filep, @@ -1703,6 +1768,12 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) struct kfd_ioctl_svm_args *args = data; int r = 0; + if (p->context_id != KFD_CONTEXT_ID_PRIMARY) { + pr_debug("SVM ioctl not supported on non-primary kfd process\n"); + + return -EOPNOTSUPP; + } + pr_debug("start 0x%llx size 0x%llx op 0x%x nattr 0x%x\n", args->start_addr, args->size, args->op, args->nattr); @@ -2027,9 +2098,7 @@ static int criu_get_process_object_info(struct kfd_process *p, num_events = kfd_get_num_events(p); - ret = svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size); - if (ret) - return ret; + 
svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size); *num_objects = num_queues + num_events + num_svm_ranges; @@ -2191,7 +2260,7 @@ static int criu_restore_devices(struct kfd_process *p, if (*priv_offset + (args->num_devices * sizeof(*device_privs)) > max_priv_data_size) return -EINVAL; - device_buckets = kmalloc_array(args->num_devices, sizeof(*device_buckets), GFP_KERNEL); + device_buckets = kmalloc_objs(*device_buckets, args->num_devices); if (!device_buckets) return -ENOMEM; @@ -2434,7 +2503,7 @@ static int criu_restore_bos(struct kfd_process *p, /* Prevent MMU notifications until stage-4 IOCTL (CRIU_RESUME) is received */ amdgpu_amdkfd_block_mmu_notifications(p->kgd_process_info); - bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL); + bo_buckets = kvmalloc_objs(*bo_buckets, args->num_bos); if (!bo_buckets) return -ENOMEM; @@ -2452,7 +2521,7 @@ static int criu_restore_bos(struct kfd_process *p, goto exit; } - bo_privs = kvmalloc_array(args->num_bos, sizeof(*bo_privs), GFP_KERNEL); + bo_privs = kvmalloc_objs(*bo_privs, args->num_bos); if (!bo_privs) { ret = -ENOMEM; goto exit; @@ -2557,8 +2626,8 @@ static int criu_restore(struct file *filep, pr_debug("CRIU restore (num_devices:%u num_bos:%u num_objects:%u priv_data_size:%llu)\n", args->num_devices, args->num_bos, args->num_objects, args->priv_data_size); - if (!args->bos || !args->devices || !args->priv_data || !args->priv_data_size || - !args->num_devices || !args->num_bos) + if ((args->num_bos > 0 && !args->bos) || !args->devices || !args->priv_data || + !args->priv_data_size || !args->num_devices) return -EINVAL; mutex_lock(&p->mutex); @@ -2771,8 +2840,12 @@ static int runtime_enable(struct kfd_process *p, uint64_t r_debug, * SET_SHADER_DEBUGGER clears any stale process context data * saved in MES. 
*/ - if (pdd->dev->kfd->shared_resources.enable_mes) - kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev)); + if (pdd->dev->kfd->shared_resources.enable_mes) { + ret = kfd_dbg_set_mes_debug_mode( + pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev)); + if (ret) + return ret; + } } p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED; @@ -2818,7 +2891,7 @@ retry: static int runtime_disable(struct kfd_process *p) { - int i = 0, ret; + int i = 0, ret = 0; bool was_enabled = p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED; p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_DISABLED; @@ -2855,6 +2928,7 @@ static int runtime_disable(struct kfd_process *p) /* disable ttmp setup */ for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; + int last_err = 0; if (kfd_dbg_is_per_vmid_supported(pdd->dev)) { pdd->spi_dbg_override = @@ -2864,14 +2938,17 @@ static int runtime_disable(struct kfd_process *p) pdd->dev->vm_info.last_vmid_kfd); if (!pdd->dev->kfd->shared_resources.enable_mes) - debug_refresh_runlist(pdd->dev->dqm); + last_err = debug_refresh_runlist(pdd->dev->dqm); else - kfd_dbg_set_mes_debug_mode(pdd, + last_err = kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev)); + + if (last_err) + ret = last_err; } } - return 0; + return ret; } static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data) @@ -2902,6 +2979,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v struct kfd_process_device *pdd = NULL; int r = 0; + if (p->context_id != KFD_CONTEXT_ID_PRIMARY) { + pr_debug("Set debug trap ioctl can not be invoked on non-primary kfd process\n"); + + return -EOPNOTSUPP; + } + if (sched_policy == KFD_SCHED_POLICY_NO_HWS) { pr_err("Debugging does not support sched_policy %i", sched_policy); return -EINVAL; @@ -2946,6 +3029,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v goto out; } + if 
(target->context_id != KFD_CONTEXT_ID_PRIMARY) { + pr_debug("Set debug trap ioctl not supported on non-primary kfd process\n"); + r = -EOPNOTSUPP; + goto out; + } + /* Check if target is still PTRACED. */ rcu_read_lock(); if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE @@ -3109,9 +3198,55 @@ out: return r; } +/* Userspace programs need to invoke this ioctl explicitly on a FD to + * create a secondary kfd_process which replaces its primary kfd_process + */ +static int kfd_ioctl_create_process(struct file *filep, struct kfd_process *p, void *data) +{ + struct kfd_process *process; + int ret; + + if (!filep->private_data || !p) + return -EINVAL; + + /* Each FD owns only one kfd_process */ + if (p->context_id != KFD_CONTEXT_ID_PRIMARY) + return -EINVAL; + + mutex_lock(&kfd_processes_mutex); + if (p != filep->private_data) { + mutex_unlock(&kfd_processes_mutex); + return -EINVAL; + } + + process = create_process(current, false); + if (IS_ERR(process)) { + mutex_unlock(&kfd_processes_mutex); + return PTR_ERR(process); + } + + filep->private_data = process; + mutex_unlock(&kfd_processes_mutex); + + ret = kfd_create_process_sysfs(process); + if (ret) + pr_warn("Failed to create sysfs entry for the kfd_process"); + + /* Each open() increases kref of the primary kfd_process, + * so we need to reduce it here when we create a new secondary process replacing it + */ + kfd_unref_process(p); + + return 0; +} + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ - .cmd_drv = 0, .name = #ioctl} + .validate = NULL, .cmd_drv = 0, .name = #ioctl} + +#define AMDKFD_IOCTL_DEF_V(ioctl, _func, _validate, _flags) \ + [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ + .validate = _validate, .cmd_drv = 0, .name = #ioctl} /** Ioctl table */ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { @@ -3208,7 +3343,8 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, kfd_ioctl_smi_events, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_SVM, kfd_ioctl_svm, 0), + AMDKFD_IOCTL_DEF_V(AMDKFD_IOC_SVM, kfd_ioctl_svm, + kfd_ioctl_svm_validate, 0), AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE, kfd_ioctl_set_xnack_mode, 0), @@ -3227,6 +3363,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP, kfd_ioctl_set_debug_trap, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CREATE_PROCESS, + kfd_ioctl_create_process, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) @@ -3243,8 +3382,10 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) int retcode = -EINVAL; bool ptrace_attached = false; - if (nr >= AMDKFD_CORE_IOCTL_COUNT) + if (nr >= AMDKFD_CORE_IOCTL_COUNT) { + retcode = -ENOTTY; goto err_i1; + } if ((nr >= AMDKFD_COMMAND_START) && (nr < AMDKFD_COMMAND_END)) { u32 amdkfd_size; @@ -3257,8 +3398,10 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) asize = amdkfd_size; cmd = ioctl->cmd; - } else + } else { + retcode = -ENOTTY; goto err_i1; + } dev_dbg(kfd_device, "ioctl cmd 0x%x (#0x%x), arg 0x%lx\n", cmd, nr, arg); @@ -3326,6 +3469,12 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) memset(kdata, 0, usize); } + if (ioctl->validate) { + retcode = ioctl->validate(kdata, usize); + if (retcode) + goto err_i1; + } + retcode = func(filep, process, kdata); if (cmd & IOC_OUT) @@ -3365,12 +3514,12 @@ static int kfd_mmio_mmap(struct kfd_node *dev, struct kfd_process *process, vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - pr_debug("pasid 0x%x mapping mmio page\n" + pr_debug("process pid %d mapping mmio page\n" " target user address == 0x%08llX\n" " physical address == 0x%08llX\n" " vm_flags == 0x%04lX\n" " size == 0x%04lX\n", - process->pasid, (unsigned long long) vma->vm_start, + process->lead_thread->pid, (unsigned long long) vma->vm_start, address, vma->vm_flags, 
PAGE_SIZE); return io_remap_pfn_range(vma, @@ -3381,16 +3530,19 @@ static int kfd_mmio_mmap(struct kfd_node *dev, struct kfd_process *process, } -static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) +static int kfd_mmap(struct file *filep, struct vm_area_struct *vma) { struct kfd_process *process; struct kfd_node *dev = NULL; unsigned long mmap_offset; unsigned int gpu_id; - process = kfd_get_process(current); - if (IS_ERR(process)) - return PTR_ERR(process); + process = filep->private_data; + if (!process) + return -ESRCH; + + if (process->lead_thread != current->group_leader) + return -EBADF; mmap_offset = vma->vm_pgoff << PAGE_SHIFT; gpu_id = KFD_MMAP_GET_GPU_ID(mmap_offset); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 693469c18c60..a1087c13f241 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -1704,6 +1704,8 @@ int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pc case IP_VERSION(11, 5, 0): case IP_VERSION(11, 5, 1): case IP_VERSION(11, 5, 2): + case IP_VERSION(11, 5, 3): + case IP_VERSION(11, 5, 4): /* Cacheline size not available in IP discovery for gc11. * kfd_fill_gpu_cache_info_from_gfx_config to hard code it */ @@ -1711,6 +1713,7 @@ int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pc fallthrough; case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): + case IP_VERSION(12, 1, 0): num_of_cache_types = kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd, cache_line_size_missing, @@ -2132,9 +2135,6 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size, bool ext_cpu = KFD_GC_VERSION(kdev) != IP_VERSION(9, 4, 3); int mem_bw = 819200, weight = ext_cpu ? KFD_CRAT_XGMI_WEIGHT : KFD_CRAT_INTRA_SOCKET_WEIGHT; - uint32_t bandwidth = ext_cpu ? 
amdgpu_amdkfd_get_xgmi_bandwidth_mbytes( - kdev->adev, NULL, true) : mem_bw; - /* * with host gpu xgmi link, host can access gpu memory whether * or not pcie bar type is large, so always create bidirectional @@ -2143,8 +2143,16 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size, sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL; sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI; sub_type_hdr->weight_xgmi = weight; - sub_type_hdr->minimum_bandwidth_mbs = bandwidth; - sub_type_hdr->maximum_bandwidth_mbs = bandwidth; + if (ext_cpu) { + amdgpu_xgmi_get_bandwidth(kdev->adev, NULL, + AMDGPU_XGMI_BW_MODE_PER_LINK, + AMDGPU_XGMI_BW_UNIT_MBYTES, + &sub_type_hdr->minimum_bandwidth_mbs, + &sub_type_hdr->maximum_bandwidth_mbs); + } else { + sub_type_hdr->minimum_bandwidth_mbs = mem_bw; + sub_type_hdr->maximum_bandwidth_mbs = mem_bw; + } } else { sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS; sub_type_hdr->minimum_bandwidth_mbs = @@ -2197,12 +2205,12 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size, if (use_ta_info) { sub_type_hdr->weight_xgmi = KFD_CRAT_XGMI_WEIGHT * - amdgpu_amdkfd_get_xgmi_hops_count(kdev->adev, peer_kdev->adev); - sub_type_hdr->maximum_bandwidth_mbs = - amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev, - peer_kdev->adev, false); - sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ? - amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev, NULL, true) : 0; + amdgpu_xgmi_get_hops_count(kdev->adev, peer_kdev->adev); + amdgpu_xgmi_get_bandwidth(kdev->adev, peer_kdev->adev, + AMDGPU_XGMI_BW_MODE_PER_PEER, + AMDGPU_XGMI_BW_UNIT_MBYTES, + &sub_type_hdr->minimum_bandwidth_mbs, + &sub_type_hdr->maximum_bandwidth_mbs); } else { bool is_single_hop = kdev->kfd == peer_kdev->kfd; int weight = is_single_hop ? 
KFD_CRAT_INTRA_SOCKET_WEIGHT : @@ -2351,7 +2359,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, if (kdev->kfd->hive_id) { for (nid = 0; nid < proximity_domain; ++nid) { peer_dev = kfd_topology_device_by_proximity_domain_no_lock(nid); - if (!peer_dev->gpu) + if (!peer_dev || !peer_dev->gpu) continue; if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id) continue; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c index a8abc3091801..0f7aa51b629e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c @@ -204,11 +204,12 @@ bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev, size_t exception_data_size) { struct kfd_process *p; + struct kfd_process_device *pdd = NULL; bool signaled_to_debugger_or_runtime = false; - p = kfd_lookup_process_by_pasid(pasid); + p = kfd_lookup_process_by_pasid(pasid, &pdd); - if (!p) + if (!pdd) return false; if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true, @@ -238,9 +239,8 @@ bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev, mutex_unlock(&p->mutex); } else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { - kfd_dqm_evict_pasid(dev->dqm, p->pasid); - kfd_signal_vm_fault_event(dev, p->pasid, NULL, - exception_data); + kfd_evict_process_device(pdd); + kfd_signal_vm_fault_event(pdd, NULL, exception_data); signaled_to_debugger_or_runtime = true; } @@ -276,8 +276,8 @@ int kfd_dbg_send_exception_to_runtime(struct kfd_process *p, data = (struct kfd_hsa_memory_exception_data *) pdd->vm_fault_exc_data; - kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid); - kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data); + kfd_evict_process_device(pdd); + kfd_signal_vm_fault_event(pdd, NULL, data); error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION); } @@ -357,12 +357,13 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en) return 0; if (!pdd->proc_ctx_cpu_ptr) { - r = 
amdgpu_amdkfd_alloc_gtt_mem(adev, - AMDGPU_MES_PROC_CTX_SIZE, - &pdd->proc_ctx_bo, - &pdd->proc_ctx_gpu_addr, - &pdd->proc_ctx_cpu_ptr, - false); + r = amdgpu_amdkfd_alloc_kernel_mem(adev, + AMDGPU_MES_PROC_CTX_SIZE, + AMDGPU_GEM_DOMAIN_GTT, + &pdd->proc_ctx_bo, + &pdd->proc_ctx_gpu_addr, + &pdd->proc_ctx_cpu_ptr, + false); if (r) { dev_err(adev->dev, "failed to allocate process context bo\n"); @@ -371,8 +372,10 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en) memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); } - return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl, - pdd->watch_points, flags, sq_trap_en); + return amdgpu_mes_set_shader_debugger(pdd->dev->adev, + pdd->proc_ctx_gpu_addr, spi_dbg_cntl, + pdd->watch_points, flags, sq_trap_en, + ffs(pdd->dev->xcc_mask) - 1); } #define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1 @@ -401,27 +404,25 @@ static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_i return -ENOMEM; } -static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id) +static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, u32 watch_id) { spin_lock(&pdd->dev->watch_points_lock); /* process owns device watch point so safe to clear */ - if ((pdd->alloc_watch_ids >> watch_id) & 0x1) { - pdd->alloc_watch_ids &= ~(0x1 << watch_id); - pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id); + if (pdd->alloc_watch_ids & BIT(watch_id)) { + pdd->alloc_watch_ids &= ~BIT(watch_id); + pdd->dev->alloc_watch_ids &= ~BIT(watch_id); } spin_unlock(&pdd->dev->watch_points_lock); } -static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id) +static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, u32 watch_id) { bool owns_watch_id = false; spin_lock(&pdd->dev->watch_points_lock); - owns_watch_id = watch_id < MAX_WATCH_ADDRESSES && - ((pdd->alloc_watch_ids >> watch_id) & 0x1); - + owns_watch_id = 
pdd->alloc_watch_ids & BIT(watch_id); spin_unlock(&pdd->dev->watch_points_lock); return owns_watch_id; @@ -432,6 +433,9 @@ int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd, { int r; + if (watch_id >= MAX_WATCH_ADDRESSES) + return -EINVAL; + if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id)) return -EINVAL; @@ -469,6 +473,9 @@ int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd, if (r) return r; + if (*watch_id >= MAX_WATCH_ADDRESSES) + return -EINVAL; + if (!pdd->dev->kfd->shared_resources.enable_mes) { r = debug_lock_and_unmap(pdd->dev->dqm); if (r) { @@ -516,9 +523,15 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags) int i, r = 0, rewind_count = 0; for (i = 0; i < target->n_pdds; i++) { + uint32_t caps; + uint32_t caps2; struct kfd_topology_device *topo_dev = - kfd_topology_device_by_id(target->pdds[i]->dev->id); - uint32_t caps = topo_dev->node_props.capability; + kfd_topology_device_by_id(target->pdds[i]->dev->id); + if (!topo_dev) + return -EINVAL; + + caps = topo_dev->node_props.capability; + caps2 = topo_dev->node_props.capability2; if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) && (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) { @@ -531,6 +544,12 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags) *flags = prev_flags; return -EACCES; } + + if (!(caps2 & HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED) && + (*flags & KFD_DBG_TRAP_FLAG_LDS_OUT_OF_ADDR_RANGE)) { + *flags = prev_flags; + return -EACCES; + } } target->dbg_flags = *flags; @@ -565,9 +584,9 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags) continue; if (!pdd->dev->kfd->shared_resources.enable_mes) - debug_refresh_runlist(pdd->dev->dqm); + (void)debug_refresh_runlist(pdd->dev->dqm); else - kfd_dbg_set_mes_debug_mode(pdd, true); + (void)kfd_dbg_set_mes_debug_mode(pdd, true); } } @@ -627,9 +646,10 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, 
int unwind pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id); if (!pdd->dev->kfd->shared_resources.enable_mes) - debug_refresh_runlist(pdd->dev->dqm); + (void)debug_refresh_runlist(pdd->dev->dqm); else - kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev)); + (void)kfd_dbg_set_mes_debug_mode(pdd, + !kfd_dbg_has_cwsr_workaround(pdd->dev)); } kfd_dbg_set_workaround(target, false); @@ -1071,6 +1091,10 @@ int kfd_dbg_trap_device_snapshot(struct kfd_process *target, for (i = 0; i < tmp_num_devices; i++) { struct kfd_process_device *pdd = target->pdds[i]; struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id); + if (!topo_dev) { + r = -EINVAL; + break; + } device_info.gpu_id = pdd->dev->id; device_info.exception_status = pdd->exception_status; @@ -1098,6 +1122,7 @@ int kfd_dbg_trap_device_snapshot(struct kfd_process *target, device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask); device_info.capability = topo_dev->node_props.capability; device_info.debug_prop = topo_dev->node_props.debug_prop; + device_info.capability2 = topo_dev->node_props.capability2; if (exception_clear_mask) pdd->exception_status &= ~exception_clear_mask; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h index 27aa1a5b120f..fbb751821c69 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h @@ -120,8 +120,7 @@ static inline bool kfd_dbg_has_gws_support(struct kfd_node *dev) && dev->kfd->mec2_fw_version < 0x1b6) || (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1) && dev->kfd->mec2_fw_version < 0x30) || - (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) && - KFD_GC_VERSION(dev) < IP_VERSION(12, 0, 0))) + kfd_dbg_has_cwsr_workaround(dev)) return false; /* Assume debugging and cooperative launch supported otherwise. 
*/ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c index 4a5a0a4e00f2..7d4e07452cdb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c @@ -27,6 +27,16 @@ #include "kfd_priv.h" static struct dentry *debugfs_root; +static struct dentry *debugfs_proc; +static struct list_head procs; + +struct debugfs_proc_entry { + struct list_head list; + struct dentry *proc_dentry; + pid_t pid; +}; + +#define MAX_DEBUGFS_FILENAME_LEN 32 static int kfd_debugfs_open(struct inode *inode, struct file *file) { @@ -92,6 +102,8 @@ static const struct file_operations kfd_debugfs_hang_hws_fops = { void kfd_debugfs_init(void) { debugfs_root = debugfs_create_dir("kfd", NULL); + debugfs_proc = debugfs_create_dir("proc", debugfs_root); + INIT_LIST_HEAD(&procs); debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, kfd_debugfs_mqds_by_process, &kfd_debugfs_fops); @@ -107,5 +119,69 @@ void kfd_debugfs_init(void) void kfd_debugfs_fini(void) { + debugfs_remove_recursive(debugfs_proc); debugfs_remove_recursive(debugfs_root); } + +static ssize_t kfd_debugfs_pasid_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct kfd_process_device *pdd = file_inode(file)->i_private; + char tmp[32]; + int len; + + len = snprintf(tmp, sizeof(tmp), "%u\n", pdd->pasid); + + return simple_read_from_buffer(buf, count, ppos, tmp, len); +} + +static const struct file_operations kfd_debugfs_pasid_fops = { + .owner = THIS_MODULE, + .read = kfd_debugfs_pasid_read, +}; + +void kfd_debugfs_add_process(struct kfd_process *p) +{ + int i; + char name[MAX_DEBUGFS_FILENAME_LEN]; + struct debugfs_proc_entry *entry; + + entry = kzalloc_obj(*entry); + if (!entry) + return; + + list_add(&entry->list, &procs); + entry->pid = p->lead_thread->pid; + snprintf(name, MAX_DEBUGFS_FILENAME_LEN, "%d", + (int)entry->pid); + entry->proc_dentry = debugfs_create_dir(name, debugfs_proc); + + /* Create debugfs files for 
each GPU: + * - proc/<pid>/pasid_<gpuid> + */ + for (i = 0; i < p->n_pdds; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + + snprintf(name, MAX_DEBUGFS_FILENAME_LEN, "pasid_%u", + pdd->dev->id); + debugfs_create_file((const char *)name, S_IFREG | 0444, + entry->proc_dentry, pdd, + &kfd_debugfs_pasid_fops); + } +} + +void kfd_debugfs_remove_process(struct kfd_process *p) +{ + struct debugfs_proc_entry *entry, *next; + + mutex_lock(&kfd_processes_mutex); + list_for_each_entry_safe(entry, next, &procs, list) { + if (entry->pid != p->lead_thread->pid) + continue; + + debugfs_remove_recursive(entry->proc_dentry); + list_del(&entry->list); + kfree(entry); + } + mutex_unlock(&kfd_processes_mutex); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index a29374c86405..b7f8f7ff8198 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -57,6 +57,7 @@ extern const struct kfd2kgd_calls gfx_v10_kfd2kgd; extern const struct kfd2kgd_calls gfx_v10_3_kfd2kgd; extern const struct kfd2kgd_calls gfx_v11_kfd2kgd; extern const struct kfd2kgd_calls gfx_v12_kfd2kgd; +extern const struct kfd2kgd_calls gfx_v12_1_kfd2kgd; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, unsigned int chunk_size); @@ -94,6 +95,8 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) case IP_VERSION(5, 2, 2):/* NAVY_FLOUNDER */ case IP_VERSION(5, 2, 4):/* DIMGREY_CAVEFISH */ case IP_VERSION(5, 2, 5):/* BEIGE_GOBY */ + kfd->device_info.num_sdma_queues_per_engine = 8; + break; case IP_VERSION(6, 0, 0): case IP_VERSION(6, 0, 1): case IP_VERSION(6, 0, 2): @@ -101,9 +104,14 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) case IP_VERSION(6, 1, 0): case IP_VERSION(6, 1, 1): case IP_VERSION(6, 1, 2): + case IP_VERSION(6, 1, 3): + case IP_VERSION(6, 1, 4): case IP_VERSION(7, 0, 0): case IP_VERSION(7, 0, 1): + case IP_VERSION(7, 1, 0): 
kfd->device_info.num_sdma_queues_per_engine = 8; + /* Reserve 1 for paging and 1 for gfx */ + kfd->device_info.num_reserved_sdma_queues_per_engine = 2; break; default: dev_warn(kfd_device, @@ -111,29 +119,6 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) sdma_version); kfd->device_info.num_sdma_queues_per_engine = 8; } - - bitmap_zero(kfd->device_info.reserved_sdma_queues_bitmap, KFD_MAX_SDMA_QUEUES); - - switch (sdma_version) { - case IP_VERSION(6, 0, 0): - case IP_VERSION(6, 0, 1): - case IP_VERSION(6, 0, 2): - case IP_VERSION(6, 0, 3): - case IP_VERSION(6, 1, 0): - case IP_VERSION(6, 1, 1): - case IP_VERSION(6, 1, 2): - case IP_VERSION(7, 0, 0): - case IP_VERSION(7, 0, 1): - /* Reserve 1 for paging and 1 for gfx */ - kfd->device_info.num_reserved_sdma_queues_per_engine = 2; - /* BIT(0)=engine-0 queue-0; BIT(1)=engine-1 queue-0; BIT(2)=engine-0 queue-1; ... */ - bitmap_set(kfd->device_info.reserved_sdma_queues_bitmap, 0, - kfd->adev->sdma.num_instances * - kfd->device_info.num_reserved_sdma_queues_per_engine); - break; - default: - break; - } } static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd) @@ -180,6 +165,8 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd) case IP_VERSION(11, 5, 0): case IP_VERSION(11, 5, 1): case IP_VERSION(11, 5, 2): + case IP_VERSION(11, 5, 3): + case IP_VERSION(11, 5, 4): kfd->device_info.event_interrupt_class = &event_interrupt_class_v11; break; case IP_VERSION(12, 0, 0): @@ -187,6 +174,10 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd) /* GFX12_TODO: Change to v12 version. 
*/ kfd->device_info.event_interrupt_class = &event_interrupt_class_v11; break; + case IP_VERSION(12, 1, 0): + kfd->device_info.event_interrupt_class = + &event_interrupt_class_v12_1; + break; default: dev_warn(kfd_device, "v9 event interrupt handler is set due to " "mismatch of gc ip block(GC_HWIP:0x%x).\n", gc_version); @@ -349,11 +340,6 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) f2g = &aldebaran_kfd2kgd; break; case IP_VERSION(9, 4, 3): - gfx_target_version = adev->rev_id >= 1 ? 90402 - : adev->flags & AMD_IS_APU ? 90400 - : 90401; - f2g = &gc_9_4_3_kfd2kgd; - break; case IP_VERSION(9, 4, 4): gfx_target_version = 90402; f2g = &gc_9_4_3_kfd2kgd; @@ -454,6 +440,14 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) gfx_target_version = 110502; f2g = &gfx_v11_kfd2kgd; break; + case IP_VERSION(11, 5, 3): + gfx_target_version = 110503; + f2g = &gfx_v11_kfd2kgd; + break; + case IP_VERSION(11, 5, 4): + gfx_target_version = 110504; + f2g = &gfx_v11_kfd2kgd; + break; case IP_VERSION(12, 0, 0): gfx_target_version = 120000; f2g = &gfx_v12_kfd2kgd; @@ -462,6 +456,10 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) gfx_target_version = 120001; f2g = &gfx_v12_kfd2kgd; break; + case IP_VERSION(12, 1, 0): + gfx_target_version = 120500; + f2g = &gfx_v12_1_kfd2kgd; + break; default: break; } @@ -480,7 +478,7 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) return NULL; } - kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); + kfd = kzalloc_obj(*kfd); if (!kfd) return NULL; @@ -493,6 +491,7 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) mutex_init(&kfd->doorbell_mutex); ida_init(&kfd->doorbell_ida); + atomic_set(&kfd->kfd_processes_count, 0); return kfd; } @@ -546,11 +545,16 @@ static void kfd_cwsr_init(struct kfd_dev *kfd) BUILD_BUG_ON(sizeof(cwsr_trap_gfx11_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx11_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx11_hex); - } else { + } else 
if (KFD_GC_VERSION(kfd) < IP_VERSION(12, 1, 0)) { BUILD_BUG_ON(sizeof(cwsr_trap_gfx12_hex) > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_gfx12_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx12_hex); + } else { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx12_1_0_hex) + > KFD_CWSR_TMA_OFFSET); + kfd->cwsr_isa = cwsr_trap_gfx12_1_0_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx12_1_0_hex); } kfd->cwsr_enabled = true; @@ -583,9 +587,13 @@ static int kfd_gws_init(struct kfd_node *node) && kfd->mec2_fw_version >= 0x6b) || (KFD_GC_VERSION(node) >= IP_VERSION(11, 0, 0) && KFD_GC_VERSION(node) < IP_VERSION(12, 0, 0) - && mes_rev >= 68)))) + && mes_rev >= 68) || + (KFD_GC_VERSION(node) >= IP_VERSION(12, 0, 0))))) { + if (KFD_GC_VERSION(node) >= IP_VERSION(12, 0, 0)) + node->adev->gds.gws_size = 64; ret = amdgpu_amdkfd_alloc_gws(node->adev, node->adev->gds.gws_size, &node->gws); + } return ret; } @@ -675,6 +683,7 @@ static void kfd_setup_interrupt_bitmap(struct kfd_node *node, struct amdgpu_device *adev = node->adev; uint32_t xcc_mask = node->xcc_mask; uint32_t xcc, mapped_xcc; + uint32_t bitmap; /* * Interrupt bitmap is setup for processing interrupts from * different XCDs and AIDs. @@ -696,9 +705,22 @@ static void kfd_setup_interrupt_bitmap(struct kfd_node *node, * - AND VMID reported in the interrupt lies within the * VMID range of the node. */ - for_each_inst(xcc, xcc_mask) { - mapped_xcc = GET_INST(GC, xcc); - node->interrupt_bitmap |= (mapped_xcc % 2 ? 5 : 3) << (4 * (mapped_xcc / 2)); + switch (KFD_GC_VERSION(node)) { + case IP_VERSION(12, 1, 0): + for_each_inst(xcc, xcc_mask) { + mapped_xcc = GET_INST(GC, xcc); + bitmap = 0x2 | (0x4 << (mapped_xcc % 4)); + if (mapped_xcc/4) + bitmap = bitmap << 8; + node->interrupt_bitmap |= bitmap; + } + break; + default: + for_each_inst(xcc, xcc_mask) { + mapped_xcc = GET_INST(GC, xcc); + node->interrupt_bitmap |= (mapped_xcc % 2 ? 
5 : 3) << (4 * (mapped_xcc / 2)); + } + break; } dev_info(kfd_device, "Node: %d, interrupt_bitmap: %x\n", kfd_node_idx, node->interrupt_bitmap); @@ -761,7 +783,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, * If the VMID range changes for multi-partition capable GPUs, then * this code MUST be revisited. */ - if (kfd->adev->xcp_mgr) { + if (kfd->adev->xcp_mgr && (KFD_GC_VERSION(kfd) != IP_VERSION(12, 1, 0))) { partition_mode = amdgpu_xcp_query_partition_mode(kfd->adev->xcp_mgr, AMDGPU_XCP_FL_LOCKED); if (partition_mode == AMDGPU_CPX_PARTITION_MODE && @@ -798,12 +820,13 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, /* add another 512KB for all other allocations on gart (HPD, fences) */ size += 512 * 1024; - if (amdgpu_amdkfd_alloc_gtt_mem( - kfd->adev, size, &kfd->gtt_mem, + if (amdgpu_amdkfd_alloc_kernel_mem( + kfd->adev, size, AMDGPU_GEM_DOMAIN_GTT, + &kfd->gtt_mem, &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr, false)) { dev_err(kfd_device, "Could not allocate %d bytes\n", size); - goto alloc_gtt_mem_failure; + goto alloc_kernel_mem_failure; } dev_info(kfd_device, "Allocated %d bytes on gart\n", size); @@ -841,7 +864,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, /* Allocate the KFD nodes */ for (i = 0, xcp_idx = 0; i < kfd->num_nodes; i++) { - node = kzalloc(sizeof(struct kfd_node), GFP_KERNEL); + node = kzalloc_obj(struct kfd_node); if (!node) goto node_alloc_error; @@ -868,7 +891,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, } if (partition_mode == AMDGPU_CPX_PARTITION_MODE && - kfd->num_nodes != 1) { + kfd->num_nodes != 1 && + (KFD_GC_VERSION(kfd) != IP_VERSION(12, 1, 0))) { /* For multi-partition capable GPUs and CPX mode, first * XCD gets VMID range 4-9 and second XCD gets VMID * range 10-15. 
@@ -889,6 +913,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, node->compute_vmid_bitmap = gpu_resources->compute_vmid_bitmap; } + node->max_proc_per_quantum = max_proc_per_quantum; atomic_set(&node->sram_ecc_flag, 0); @@ -927,8 +952,8 @@ node_alloc_error: kfd_doorbell_error: kfd_gtt_sa_fini(kfd); kfd_gtt_sa_init_error: - amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem); -alloc_gtt_mem_failure: + amdgpu_amdkfd_free_kernel_mem(kfd->adev, &kfd->gtt_mem); +alloc_kernel_mem_failure: dev_err(kfd_device, "device %x:%x NOT added due to errors\n", kfd->adev->pdev->vendor, kfd->adev->pdev->device); @@ -945,10 +970,13 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) kfd_doorbell_fini(kfd); ida_destroy(&kfd->doorbell_ida); kfd_gtt_sa_fini(kfd); - amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem); + amdgpu_amdkfd_free_kernel_mem(kfd->adev, &kfd->gtt_mem); } kfree(kfd); + + /* after remove a kfd device unlock kfd driver */ + kgd2kfd_unlock_kfd(NULL); } int kgd2kfd_pre_reset(struct kfd_dev *kfd, @@ -965,7 +993,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd, kfd_smi_event_update_gpu_reset(node, false, reset_context); } - kgd2kfd_suspend(kfd, false); + kgd2kfd_suspend(kfd, true); for (i = 0; i < kfd->num_nodes; i++) kfd_signal_reset_event(kfd->nodes[i]); @@ -1007,13 +1035,33 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) return 0; } -bool kfd_is_locked(void) +bool kfd_is_locked(struct kfd_dev *kfd) { + uint8_t id = 0; + struct kfd_node *dev; + lockdep_assert_held(&kfd_processes_mutex); - return (kfd_locked > 0); + + /* check reset/suspend lock */ + if (kfd_locked > 0) + return true; + + if (kfd) + return kfd->kfd_dev_lock > 0; + + /* check lock on all cgroup accessible devices */ + while (kfd_topology_enum_kfd_devices(id++, &dev) == 0) { + if (!dev || kfd_devcgroup_check_permission(dev)) + continue; + + if (dev->kfd->kfd_dev_lock > 0) + return true; + } + + return false; } -void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) +void kgd2kfd_suspend(struct kfd_dev *kfd, 
bool suspend_proc) { struct kfd_node *node; int i; @@ -1021,14 +1069,8 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) if (!kfd->init_complete) return; - /* for runtime suspend, skip locking kfd */ - if (!run_pm) { - mutex_lock(&kfd_processes_mutex); - /* For first KFD device suspend all the KFD processes */ - if (++kfd_locked == 1) - kfd_suspend_all_processes(); - mutex_unlock(&kfd_processes_mutex); - } + if (suspend_proc) + kgd2kfd_suspend_process(kfd); for (i = 0; i < kfd->num_nodes; i++) { node = kfd->nodes[i]; @@ -1036,9 +1078,9 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) } } -int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) +int kgd2kfd_resume(struct kfd_dev *kfd, bool resume_proc) { - int ret, i; + int ret = 0, i; if (!kfd->init_complete) return 0; @@ -1049,14 +1091,36 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) return ret; } - /* for runtime resume, skip unlocking kfd */ - if (!run_pm) { - mutex_lock(&kfd_processes_mutex); - if (--kfd_locked == 0) - ret = kfd_resume_all_processes(); - WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error"); - mutex_unlock(&kfd_processes_mutex); - } + if (resume_proc) + ret = kgd2kfd_resume_process(kfd); + + return ret; +} + +void kgd2kfd_suspend_process(struct kfd_dev *kfd) +{ + if (!kfd->init_complete) + return; + + mutex_lock(&kfd_processes_mutex); + /* For first KFD device suspend all the KFD processes */ + if (++kfd_locked == 1) + kfd_suspend_all_processes(); + mutex_unlock(&kfd_processes_mutex); +} + +int kgd2kfd_resume_process(struct kfd_dev *kfd) +{ + int ret = 0; + + if (!kfd->init_complete) + return 0; + + mutex_lock(&kfd_processes_mutex); + if (--kfd_locked == 0) + ret = kfd_resume_all_processes(); + WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. 
error"); + mutex_unlock(&kfd_processes_mutex); return ret; } @@ -1091,7 +1155,15 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) } for (i = 0; i < kfd->num_nodes; i++) { - node = kfd->nodes[i]; + /* Race if another thread in b/w + * kfd_cleanup_nodes and kfree(kfd), + * when kfd->nodes[i] = NULL + */ + if (kfd->nodes[i]) + node = kfd->nodes[i]; + else + return; + spin_lock_irqsave(&node->interrupt_lock, flags); if (node->interrupts_active @@ -1151,12 +1223,13 @@ int kgd2kfd_resume_mm(struct mm_struct *mm) * prepare for safe eviction of KFD BOs that belong to the specified * process. * - * @mm: mm_struct that identifies the specified KFD process + * @mm: mm_struct that identifies a group of KFD processes + * @context_id: an id that identifies a specific KFD context in the above kfd process group * @fence: eviction fence attached to KFD process BOs * */ int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, - struct dma_fence *fence) + u16 context_id, struct dma_fence *fence) { struct kfd_process *p; unsigned long active_time; @@ -1168,7 +1241,7 @@ int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, if (dma_fence_is_signaled(fence)) return 0; - p = kfd_lookup_process_by_mm(mm); + p = kfd_lookup_process_by_id(mm, context_id); if (!p) return -ENODEV; @@ -1255,7 +1328,7 @@ int kfd_gtt_sa_allocate(struct kfd_node *node, unsigned int size, if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) return -ENOMEM; - *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + *mem_obj = kzalloc_obj(struct kfd_mem_obj); if (!(*mem_obj)) return -ENOMEM; @@ -1436,24 +1509,66 @@ unsigned int kfd_get_num_xgmi_sdma_engines(struct kfd_node *node) kfd_get_num_sdma_engines(node); } -int kgd2kfd_check_and_lock_kfd(void) +int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd) { + struct kfd_process *p; + int r = 0, temp, idx; + mutex_lock(&kfd_processes_mutex); - if (!hash_empty(kfd_processes_table) || kfd_is_locked()) { - 
mutex_unlock(&kfd_processes_mutex); - return -EBUSY; + + /* kfd_processes_count is per kfd_dev, return -EBUSY without + * further check + */ + if (!!atomic_read(&kfd->kfd_processes_count)) { + pr_debug("process_wq_release not finished\n"); + r = -EBUSY; + goto out; } - ++kfd_locked; + if (hash_empty(kfd_processes_table) && !kfd_is_locked(kfd)) + goto out; + + /* fail under system reset/resume or kfd device is partition switching. */ + if (kfd_is_locked(kfd)) { + r = -EBUSY; + goto out; + } + + /* + * ensure all running processes are cgroup excluded from device before mode switch. + * i.e. no pdd was created on the process socket. + */ + idx = srcu_read_lock(&kfd_processes_srcu); + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + int i; + + for (i = 0; i < p->n_pdds; i++) { + if (p->pdds[i]->dev->kfd != kfd) + continue; + + r = -EBUSY; + goto proc_check_unlock; + } + } + +proc_check_unlock: + srcu_read_unlock(&kfd_processes_srcu, idx); +out: + if (!r) + ++kfd->kfd_dev_lock; mutex_unlock(&kfd_processes_mutex); - return 0; + return r; } -void kgd2kfd_unlock_kfd(void) +/* unlock a kfd dev or kfd driver */ +void kgd2kfd_unlock_kfd(struct kfd_dev *kfd) { mutex_lock(&kfd_processes_mutex); - --kfd_locked; + if (kfd) + --kfd->kfd_dev_lock; + else + --kfd_locked; mutex_unlock(&kfd_processes_mutex); } @@ -1479,6 +1594,25 @@ int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id) return ret; } +int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd) +{ + struct kfd_node *node; + int i, r; + + if (!kfd->init_complete) + return 0; + + for (i = 0; i < kfd->num_nodes; i++) { + node = kfd->nodes[i]; + r = node->dqm->ops.unhalt(node->dqm); + if (r) { + dev_err(kfd_device, "Error in starting scheduler\n"); + return r; + } + } + return 0; +} + int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) { struct kfd_node *node; @@ -1496,6 +1630,23 @@ int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) return node->dqm->ops.halt(node->dqm); } +int 
kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd) +{ + struct kfd_node *node; + int i, r; + + if (!kfd->init_complete) + return 0; + + for (i = 0; i < kfd->num_nodes; i++) { + node = kfd->nodes[i]; + r = node->dqm->ops.halt(node->dqm); + if (r) + return r; + } + return 0; +} + bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id) { struct kfd_node *node; @@ -1556,15 +1707,20 @@ bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entr { struct kfd_process *p; u32 cam_index; + u32 src_data_idx; + + src_data_idx = (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0)) ? + 3 : 2; if (entry->ih == &adev->irq.ih_soft || entry->ih == &adev->irq.ih1) { - p = kfd_lookup_process_by_pasid(entry->pasid); + p = kfd_lookup_process_by_pasid(entry->pasid, NULL); if (!p) return true; if (p->gpu_page_fault && !p->debug_trap_enabled) { if (retry_fault && adev->irq.retry_cam_enabled) { - cam_index = entry->src_data[2] & 0x3ff; + cam_index = entry->src_data[src_data_idx] & 0x3ff; + WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index); } @@ -1581,6 +1737,42 @@ bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entr return false; } +/** kgd2kfd_teardown_processes - gracefully tear down existing + * kfd processes that use adev + * + * @adev: amdgpu_device where kfd processes run on and will be + * teardown + * + */ +void kgd2kfd_teardown_processes(struct amdgpu_device *adev) +{ + struct hlist_node *p_temp; + struct kfd_process *p; + struct kfd_node *dev; + unsigned int temp; + + mutex_lock(&kfd_processes_mutex); + + if (hash_empty(kfd_processes_table)) { + mutex_unlock(&kfd_processes_mutex); + return; + } + + hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) { + for (int i = 0; i < p->n_pdds; i++) { + dev = p->pdds[i]->dev; + if (dev->adev == adev) + kfd_signal_process_terminate_event(p); + } + } + + mutex_unlock(&kfd_processes_mutex); + + /* wait all kfd processes use adev terminate */ + 
while (!!atomic_read(&adev->kfd.dev->kfd_processes_count)) + cond_resched(); +} + #if defined(CONFIG_DEBUG_FS) /* This function will send a package to HIQ to hang the HWS @@ -1593,6 +1785,11 @@ int kfd_debugfs_hang_hws(struct kfd_node *dev) return -EINVAL; } + if (dev->kfd->shared_resources.enable_mes) { + dev_err(dev->adev->dev, "Inducing MES hang is not supported\n"); + return -EINVAL; + } + return dqm_debugfs_hang_hws(dev->dqm); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 34c2c42c0f95..e0a31e11f0ff 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -36,12 +36,15 @@ #include "kfd_kernel_queue.h" #include "amdgpu_amdkfd.h" #include "amdgpu_reset.h" +#include "amdgpu_sdma.h" #include "mes_v11_api_def.h" #include "kfd_debug.h" /* Size of the per-pipe EOP queue */ #define CIK_HPD_EOP_BYTES_LOG2 11 #define CIK_HPD_EOP_BYTES (1U << CIK_HPD_EOP_BYTES_LOG2) +/* See unmap_queues_cpsch() */ +#define USE_DEFAULT_GRACE_PERIOD 0xffffffff static int set_pasid_vmid_mapping(struct device_queue_manager *dqm, u32 pasid, unsigned int vmid); @@ -66,7 +69,8 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm, static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q); static int allocate_sdma_queue(struct device_queue_manager *dqm, struct queue *q, const uint32_t *restore_sdma_id); -static void kfd_process_hw_exception(struct work_struct *work); + +static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma); static inline enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) @@ -133,9 +137,10 @@ static void init_sdma_bitmaps(struct device_queue_manager *dqm) bitmap_set(dqm->xgmi_sdma_bitmap, 0, get_num_xgmi_sdma_queues(dqm)); /* Mask out the reserved queues */ - bitmap_andnot(dqm->sdma_bitmap, dqm->sdma_bitmap, - 
dqm->dev->kfd->device_info.reserved_sdma_queues_bitmap, - KFD_MAX_SDMA_QUEUES); + bitmap_clear(dqm->sdma_bitmap, 0, kfd_get_num_sdma_engines(dqm->dev) * + dqm->dev->kfd->device_info.num_reserved_sdma_queues_per_engine); + bitmap_clear(dqm->xgmi_sdma_bitmap, 0, kfd_get_num_xgmi_sdma_engines(dqm->dev) * + dqm->dev->kfd->device_info.num_reserved_sdma_queues_per_engine); } void program_sh_mem_settings(struct device_queue_manager *dqm, @@ -170,7 +175,7 @@ static void kfd_hws_hang(struct device_queue_manager *dqm) /* * Issue a GPU reset if HWS is unresponsive */ - schedule_work(&dqm->hw_exception_work); + amdgpu_amdkfd_gpu_reset(dqm->dev->adev); } static int convert_to_mes_queue_type(int queue_type) @@ -207,23 +212,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; - if (!pdd->proc_ctx_cpu_ptr) { - r = amdgpu_amdkfd_alloc_gtt_mem(adev, - AMDGPU_MES_PROC_CTX_SIZE, - &pdd->proc_ctx_bo, - &pdd->proc_ctx_gpu_addr, - &pdd->proc_ctx_cpu_ptr, - false); - if (r) { - dev_err(adev->dev, - "failed to allocate process context bo\n"); - return r; - } - memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); - } - memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input)); - queue_input.process_id = qpd->pqm->process->pasid; + queue_input.process_id = pdd->pasid; queue_input.page_table_base_addr = qpd->page_table_base; queue_input.process_va_start = 0; queue_input.process_va_end = adev->vm_manager.max_pfn - 1; @@ -265,6 +255,9 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, queue_input.queue_type = (uint32_t)queue_type; queue_input.exclusively_scheduled = q->properties.is_gws; + queue_input.sh_mem_config_data = qpd->sh_mem_config; + queue_input.vm_cntx_cntl = qpd->vm_cntx_cntl; + queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1; amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input); @@ -295,6 +288,7 @@ static int 
remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); queue_input.doorbell_offset = q->properties.doorbell_off; queue_input.gang_context_addr = q->gang_ctx_gpu_addr; + queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1; amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input); @@ -410,8 +404,7 @@ static void increment_queue_count(struct device_queue_manager *dqm, struct queue *q) { dqm->active_queue_count++; - if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || - q->properties.type == KFD_QUEUE_TYPE_DIQ) + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) dqm->active_cp_queue_count++; if (q->properties.is_gws) { @@ -425,8 +418,7 @@ static void decrement_queue_count(struct device_queue_manager *dqm, struct queue *q) { dqm->active_queue_count--; - if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || - q->properties.type == KFD_QUEUE_TYPE_DIQ) + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) dqm->active_cp_queue_count--; if (q->properties.is_gws) { @@ -483,6 +475,9 @@ static int allocate_doorbell(struct qcm_process_device *qpd, } else { /* For CP queues on SOC15 */ if (restore_id) { + if (*restore_id >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) + return -EINVAL; + /* make sure that ID is free */ if (__test_and_set_bit(*restore_id, qpd->doorbell_bitmap)) return -EINVAL; @@ -542,6 +537,7 @@ static int allocate_vmid(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) { + struct kfd_process_device *pdd = qpd_to_pdd(qpd); struct device *dev = dqm->dev->adev->dev; int allocated_vmid = -1, i; @@ -560,9 +556,9 @@ static int allocate_vmid(struct device_queue_manager *dqm, pr_debug("vmid allocated: %d\n", allocated_vmid); - dqm->vmid_pasid[allocated_vmid] = q->process->pasid; + dqm->vmid_pasid[allocated_vmid] = pdd->pasid; - set_pasid_vmid_mapping(dqm, q->process->pasid, allocated_vmid); + set_pasid_vmid_mapping(dqm, pdd->pasid, 
allocated_vmid); qpd->vmid = allocated_vmid; q->properties.vmid = allocated_vmid; @@ -579,7 +575,7 @@ static int allocate_vmid(struct device_queue_manager *dqm, qpd->vmid, qpd->page_table_base); /* invalidate the VM context after pasid and vmid mapping is set up */ - kfd_flush_tlb(qpd_to_pdd(qpd), TLB_FLUSH_LEGACY); + kfd_flush_tlb(qpd_to_pdd(qpd)); if (dqm->dev->kfd2kgd->set_scratch_backing_va) dqm->dev->kfd2kgd->set_scratch_backing_va(dqm->dev->adev, @@ -617,7 +613,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm, if (flush_texture_cache_nocpsch(q->device, qpd)) dev_err(dev, "Failed to flush TC\n"); - kfd_flush_tlb(qpd_to_pdd(qpd), TLB_FLUSH_LEGACY); + kfd_flush_tlb(qpd_to_pdd(qpd)); /* Release the vmid mapping */ set_pasid_vmid_mapping(dqm, 0, qpd->vmid); @@ -683,7 +679,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, /* Temporarily release dqm lock to avoid a circular lock dependency */ dqm_unlock(dqm); - q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties); + q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr, &q->properties); dqm_lock(dqm); if (!q->mqd_mem_obj) { @@ -814,6 +810,11 @@ static int dbgdev_wave_reset_wavefronts(struct kfd_node *dev, struct kfd_process return -EOPNOTSUPP; } + /* taking the VMID for that process on the safe way using PDD */ + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) + return -EFAULT; + /* Scan all registers in the range ATC_VMID8_PASID_MAPPING .. * ATC_VMID15_PASID_MAPPING * to check which VMID the current process is mapped to. 
@@ -823,23 +824,19 @@ static int dbgdev_wave_reset_wavefronts(struct kfd_node *dev, struct kfd_process status = dev->kfd2kgd->get_atc_vmid_pasid_mapping_info (dev->adev, vmid, &queried_pasid); - if (status && queried_pasid == p->pasid) { - pr_debug("Killing wave fronts of vmid %d and pasid 0x%x\n", - vmid, p->pasid); + if (status && queried_pasid == pdd->pasid) { + pr_debug("Killing wave fronts of vmid %d and process pid %d\n", + vmid, p->lead_thread->pid); break; } } if (vmid > last_vmid_to_scan) { - dev_err(dev->adev->dev, "Didn't find vmid for pasid 0x%x\n", p->pasid); + dev_err(dev->adev->dev, "Didn't find vmid for process pid %d\n", + p->lead_thread->pid); return -EFAULT; } - /* taking the VMID for that process on the safe way using PDD */ - pdd = kfd_get_process_device_data(dev, p); - if (!pdd) - return -EFAULT; - reg_gfx_index.bits.sh_broadcast_writes = 1; reg_gfx_index.bits.se_broadcast_writes = 1; reg_gfx_index.bits.instance_broadcast_writes = 1; @@ -865,8 +862,7 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, int retval; struct mqd_manager *mqd_mgr; - mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( - q->properties.type)]; + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(q->properties.type)]; if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) deallocate_hqd(dqm, q); @@ -1075,8 +1071,8 @@ static int suspend_single_queue(struct device_queue_manager *dqm, if (q->properties.is_suspended) return 0; - pr_debug("Suspending PASID %u queue [%i]\n", - pdd->process->pasid, + pr_debug("Suspending process pid %d queue [%i]\n", + pdd->process->lead_thread->pid, q->properties.queue_id); is_new = q->properties.exception_status & KFD_EC_MASK(EC_QUEUE_NEW); @@ -1123,8 +1119,8 @@ static int resume_single_queue(struct device_queue_manager *dqm, pdd = qpd_to_pdd(qpd); - pr_debug("Restoring from suspend PASID %u queue [%i]\n", - pdd->process->pasid, + pr_debug("Restoring from suspend process pid %d queue [%i]\n", + 
pdd->process->lead_thread->pid, q->properties.queue_id); q->properties.is_suspended = false; @@ -1157,8 +1153,8 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, goto out; pdd = qpd_to_pdd(qpd); - pr_debug_ratelimited("Evicting PASID 0x%x queues\n", - pdd->process->pasid); + pr_debug_ratelimited("Evicting process pid %d queues\n", + pdd->process->lead_thread->pid); pdd->last_evict_timestamp = get_jiffies_64(); /* Mark all queues as evicted. Deactivate all active queues on @@ -1215,8 +1211,11 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm, if (!pdd->drm_priv) goto out; - pr_debug_ratelimited("Evicting PASID 0x%x queues\n", - pdd->process->pasid); + pr_debug_ratelimited("Evicting process pid %d queues\n", + pdd->process->lead_thread->pid); + + if (dqm->dev->kfd->shared_resources.enable_mes) + pdd->last_evict_timestamp = get_jiffies_64(); /* Mark all queues as evicted. Deactivate all active queues on * the qpd. @@ -1230,23 +1229,23 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm, decrement_queue_count(dqm, qpd, q); if (dqm->dev->kfd->shared_resources.enable_mes) { - int err; - - err = remove_queue_mes(dqm, q, qpd); - if (err) { + retval = remove_queue_mes(dqm, q, qpd); + if (retval) { dev_err(dev, "Failed to evict queue %d\n", q->properties.queue_id); - retval = err; + goto out; } } } - pdd->last_evict_timestamp = get_jiffies_64(); - if (!dqm->dev->kfd->shared_resources.enable_mes) + + if (!dqm->dev->kfd->shared_resources.enable_mes) { + pdd->last_evict_timestamp = get_jiffies_64(); retval = execute_queues_cpsch(dqm, qpd->is_debug ? 
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES : KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD); + } out: dqm_unlock(dqm); @@ -1276,8 +1275,8 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, goto out; } - pr_debug_ratelimited("Restoring PASID 0x%x queues\n", - pdd->process->pasid); + pr_debug_ratelimited("Restoring process pid %d queues\n", + pdd->process->lead_thread->pid); /* Update PD Base in QPD */ qpd->page_table_base = pd_base; @@ -1288,7 +1287,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, dqm->dev->adev, qpd->vmid, qpd->page_table_base); - kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY); + kfd_flush_tlb(pdd); } /* Take a safe reference to the mm_struct, which may otherwise @@ -1360,8 +1359,8 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm, if (!pdd->drm_priv) goto vm_not_acquired; - pr_debug_ratelimited("Restoring PASID 0x%x queues\n", - pdd->process->pasid); + pr_debug_ratelimited("Restoring process pid %d queues\n", + pdd->process->lead_thread->pid); /* Update PD Base in QPD */ qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv); @@ -1405,7 +1404,7 @@ static int register_process(struct device_queue_manager *dqm, uint64_t pd_base; int retval; - n = kzalloc(sizeof(*n), GFP_KERNEL); + n = kzalloc_obj(*n); if (!n) return -ENOMEM; @@ -1439,13 +1438,12 @@ static int register_process(struct device_queue_manager *dqm, static int unregister_process(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { - int retval; + int retval = 0; struct device_process_node *cur, *next; pr_debug("qpd->queues_list is %s\n", list_empty(&qpd->queues_list) ? 
"empty" : "not empty"); - retval = 0; dqm_lock(dqm); list_for_each_entry_safe(cur, next, &dqm->queues, list) { @@ -1475,7 +1473,7 @@ set_pasid_vmid_mapping(struct device_queue_manager *dqm, u32 pasid, unsigned int vmid) { uint32_t xcc_mask = dqm->dev->xcc_mask; - int xcc_id, ret; + int xcc_id, ret = 0; for_each_inst(xcc_id, xcc_mask) { ret = dqm->dev->kfd2kgd->set_pasid_vmid_mapping( @@ -1585,12 +1583,16 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, int bit; if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { - if (bitmap_empty(dqm->sdma_bitmap, KFD_MAX_SDMA_QUEUES)) { - dev_err(dev, "No more SDMA queue to allocate\n"); + if (bitmap_empty(dqm->sdma_bitmap, get_num_sdma_queues(dqm))) { + dev_warn(dev, "No more SDMA queue to allocate (%d total queues)\n", + get_num_sdma_queues(dqm)); return -ENOMEM; } if (restore_sdma_id) { + if (*restore_sdma_id >= get_num_sdma_queues(dqm)) + return -EINVAL; + /* Re-use existing sdma_id */ if (!test_bit(*restore_sdma_id, dqm->sdma_bitmap)) { dev_err(dev, "SDMA queue already in use\n"); @@ -1611,11 +1613,15 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, q->properties.sdma_queue_id = q->sdma_id / kfd_get_num_sdma_engines(dqm->dev); } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { - if (bitmap_empty(dqm->xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES)) { - dev_err(dev, "No more XGMI SDMA queue to allocate\n"); + if (bitmap_empty(dqm->xgmi_sdma_bitmap, get_num_xgmi_sdma_queues(dqm))) { + dev_warn(dev, "No more XGMI SDMA queue to allocate (%d total queues)\n", + get_num_xgmi_sdma_queues(dqm)); return -ENOMEM; } if (restore_sdma_id) { + if (*restore_sdma_id >= get_num_xgmi_sdma_queues(dqm)) + return -EINVAL; + /* Re-use existing sdma_id */ if (!test_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap)) { dev_err(dev, "SDMA queue already in use\n"); @@ -1671,8 +1677,8 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, } if (!free_bit_found) { - dev_err(dev, "No more SDMA queue to allocate 
for target ID %i\n", - q->properties.sdma_engine_id); + dev_warn(dev, "No more SDMA queue to allocate for target ID %i (%d total queues)\n", + q->properties.sdma_engine_id, num_queues); return -ENOMEM; } } @@ -1755,15 +1761,11 @@ static int initialize_cpsch(struct device_queue_manager *dqm) dqm->active_cp_queue_count = 0; dqm->gws_queue_count = 0; dqm->active_runlist = false; - INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception); dqm->trap_debug_vmid = 0; init_sdma_bitmaps(dqm); - if (dqm->dev->kfd2kgd->get_iq_wait_times) - dqm->dev->kfd2kgd->get_iq_wait_times(dqm->dev->adev, - &dqm->wait_times, - ffs(dqm->dev->xcc_mask) - 1); + update_dqm_wait_times(dqm); return 0; } @@ -1829,8 +1831,6 @@ static int start_cpsch(struct device_queue_manager *dqm) struct device *dev = dqm->dev->adev->dev; int retval, num_hw_queue_slots; - retval = 0; - dqm_lock(dqm); if (!dqm->dev->kfd->shared_resources.enable_mes) { @@ -1859,25 +1859,11 @@ static int start_cpsch(struct device_queue_manager *dqm) /* clear hang status when driver try to start the hw scheduler */ dqm->sched_running = true; - if (!dqm->dev->kfd->shared_resources.enable_mes) + if (!dqm->dev->kfd->shared_resources.enable_mes) { + if (pm_config_dequeue_wait_counts(&dqm->packet_mgr, + KFD_DEQUEUE_WAIT_INIT, 0 /* unused */)) + dev_err(dev, "Setting optimized dequeue wait failed. 
Using default values\n"); execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD); - - /* Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU */ - if (amdgpu_emu_mode == 0 && dqm->dev->adev->gmc.is_app_apu && - (KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 3))) { - uint32_t reg_offset = 0; - uint32_t grace_period = 1; - - retval = pm_update_grace_period(&dqm->packet_mgr, - grace_period); - if (retval) - dev_err(dev, "Setting grace timeout failed\n"); - else if (dqm->dev->kfd2kgd->build_grace_period_packet_info) - /* Update dqm->wait_times maintained in software */ - dqm->dev->kfd2kgd->build_grace_period_packet_info( - dqm->dev->adev, dqm->wait_times, - grace_period, &reg_offset, - &dqm->wait_times); } /* setup per-queue reset detection buffer */ @@ -1909,6 +1895,8 @@ fail_packet_manager_init: static int stop_cpsch(struct device_queue_manager *dqm) { + int ret = 0; + dqm_lock(dqm); if (!dqm->sched_running) { dqm_unlock(dqm); @@ -1916,9 +1904,10 @@ static int stop_cpsch(struct device_queue_manager *dqm) } if (!dqm->dev->kfd->shared_resources.enable_mes) - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); + ret = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, + 0, USE_DEFAULT_GRACE_PERIOD, false); else - remove_all_kfd_queues_mes(dqm); + ret = remove_all_kfd_queues_mes(dqm); dqm->sched_running = false; @@ -1932,7 +1921,7 @@ static int stop_cpsch(struct device_queue_manager *dqm) dqm->detect_hang_info = NULL; dqm_unlock(dqm); - return 0; + return ret; } static int create_kernel_queue_cpsch(struct device_queue_manager *dqm, @@ -2022,7 +2011,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, dqm->asic_ops.init_sdma_vm(dqm, q, qpd); q->properties.tba_addr = qpd->tba_addr; q->properties.tma_addr = qpd->tma_addr; - q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties); + q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr, 
&q->properties); if (!q->mqd_mem_obj) { retval = -ENOMEM; goto out_deallocate_doorbell; @@ -2103,7 +2092,8 @@ int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm, while (*fence_addr != fence_value) { /* Fatal err detected, this response won't come */ - if (amdgpu_amdkfd_is_fed(dqm->dev->adev)) + if (amdgpu_amdkfd_is_fed(dqm->dev->adev) || + amdgpu_in_reset(dqm->dev->adev)) return -EIO; if (time_after(jiffies, end_jiffies)) { @@ -2152,8 +2142,8 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q { struct kfd_process_device *pdd = qpd_to_pdd(qpd); - dev_err(dqm->dev->adev->dev, "queue id 0x%0x at pasid 0x%0x is reset\n", - q->properties.queue_id, q->process->pasid); + dev_err(dqm->dev->adev->dev, "queue id 0x%0x at pasid %d is reset\n", + q->properties.queue_id, pdd->process->lead_thread->pid); pdd->has_reset_queue = true; if (q->properties.is_active) { @@ -2222,8 +2212,7 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin return NULL; } -/* only for compute queue */ -static int reset_queues_on_hws_hang(struct device_queue_manager *dqm) +static int reset_hung_queues(struct device_queue_manager *dqm) { int r = 0, reset_count = 0, i; @@ -2276,7 +2265,121 @@ reset_fail: return r; } -/* dqm->lock mutex has to be locked before calling this function */ +static bool sdma_has_hang(struct device_queue_manager *dqm) +{ + int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm); + int engine_end = engine_start + get_num_all_sdma_engines(dqm); + int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine; + int i, j; + + for (i = engine_start; i < engine_end; i++) { + for (j = 0; j < num_queues_per_eng; j++) { + if (!dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j)) + continue; + + return true; + } + } + + return false; +} + +static bool set_sdma_queue_as_reset(struct device_queue_manager *dqm, + uint32_t doorbell_off) +{ + struct device_process_node *cur; + 
struct qcm_process_device *qpd; + struct queue *q; + + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + if ((q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) && + q->properties.doorbell_off == doorbell_off) { + set_queue_as_reset(dqm, q, qpd); + return true; + } + } + } + + return false; +} + +static int reset_hung_queues_sdma(struct device_queue_manager *dqm) +{ + int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm); + int engine_end = engine_start + get_num_all_sdma_engines(dqm); + int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine; + int r = 0, i, j; + + if (dqm->is_hws_hang) + return -EIO; + + /* Scan for hung HW queues and reset engine. */ + dqm->detect_hang_count = 0; + for (i = engine_start; i < engine_end; i++) { + for (j = 0; j < num_queues_per_eng; j++) { + uint32_t doorbell_off = + dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j); + + if (!doorbell_off) + continue; + + /* Reset engine and check. */ + if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) || + dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) || + !set_sdma_queue_as_reset(dqm, doorbell_off)) { + r = -ENOTRECOVERABLE; + goto reset_fail; + } + + /* Should only expect one queue active per engine */ + dqm->detect_hang_count++; + break; + } + } + + /* Signal process reset */ + if (dqm->detect_hang_count) + kfd_signal_reset_event(dqm->dev); + else + r = -ENOTRECOVERABLE; + +reset_fail: + dqm->detect_hang_count = 0; + + return r; +} + +static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma) +{ + struct amdgpu_device *adev = dqm->dev->adev; + + while (halt_if_hws_hang) + schedule(); + + if (adev->debug_disable_gpu_ring_reset) { + dev_info_once(adev->dev, + "%s queue hung, but ring reset disabled", + is_sdma ? 
"sdma" : "compute"); + + return -EPERM; + } + if (!amdgpu_gpu_recovery) + return -ENOTRECOVERABLE; + + return is_sdma ? reset_hung_queues_sdma(dqm) : reset_hung_queues(dqm); +} + +/* dqm->lock mutex has to be locked before calling this function + * + * @grace_period: If USE_DEFAULT_GRACE_PERIOD then default wait time + * for context switch latency. Lower values are used by debugger + * since context switching are triggered at high frequency. + * This is configured by setting CP_IQ_WAIT_TIME2.SCH_WAVE + * + */ static int unmap_queues_cpsch(struct device_queue_manager *dqm, enum kfd_unmap_queues_filter filter, uint32_t filter_param, @@ -2295,7 +2398,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, return -EIO; if (grace_period != USE_DEFAULT_GRACE_PERIOD) { - retval = pm_update_grace_period(&dqm->packet_mgr, grace_period); + retval = pm_config_dequeue_wait_counts(&dqm->packet_mgr, + KFD_DEQUEUE_WAIT_SET_SCH_WAVE, grace_period); if (retval) goto out; } @@ -2326,30 +2430,32 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, * check those fields */ mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; - if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) { - while (halt_if_hws_hang) - schedule(); - if (reset_queues_on_hws_hang(dqm)) { - dqm->is_hws_hang = true; - kfd_hws_hang(dqm); - retval = -ETIME; - goto out; - } - } + if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd) && + reset_queues_on_hws_hang(dqm, false)) + goto reset_fail; + + /* Check for SDMA hang and attempt SDMA reset */ + if (sdma_has_hang(dqm) && reset_queues_on_hws_hang(dqm, true)) + goto reset_fail; /* We need to reset the grace period value for this device */ if (grace_period != USE_DEFAULT_GRACE_PERIOD) { - if (pm_update_grace_period(&dqm->packet_mgr, - USE_DEFAULT_GRACE_PERIOD)) + if (pm_config_dequeue_wait_counts(&dqm->packet_mgr, + KFD_DEQUEUE_WAIT_RESET, 0 /* unused */)) dev_err(dev, "Failed to reset 
grace period\n"); } pm_release_ib(&dqm->packet_mgr); dqm->active_runlist = false; - out: up_read(&dqm->dev->adev->reset_domain->sem); return retval; + +reset_fail: + dqm->is_hws_hang = true; + kfd_hws_hang(dqm); + up_read(&dqm->dev->adev->reset_domain->sem); + return -ETIME; } /* only for compute queue */ @@ -2506,20 +2612,13 @@ failed_try_destroy_debugged_queue: return retval; } -/* - * Low bits must be 0000/FFFF as required by HW, high bits must be 0 to - * stay in user mode. - */ -#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL -/* APE1 limit is inclusive and 64K aligned. */ -#define APE1_LIMIT_ALIGNMENT 0xFFFF - static bool set_cache_memory_policy(struct device_queue_manager *dqm, struct qcm_process_device *qpd, enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size) + uint64_t alternate_aperture_size, + u32 misc_process_properties) { bool retval = true; @@ -2528,41 +2627,17 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, dqm_lock(dqm); - if (alternate_aperture_size == 0) { - /* base > limit disables APE1 */ - qpd->sh_mem_ape1_base = 1; - qpd->sh_mem_ape1_limit = 0; - } else { - /* - * In FSA64, APE1_Base[63:0] = { 16{SH_MEM_APE1_BASE[31]}, - * SH_MEM_APE1_BASE[31:0], 0x0000 } - * APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]}, - * SH_MEM_APE1_LIMIT[31:0], 0xFFFF } - * Verify that the base and size parameters can be - * represented in this format and convert them. - * Additionally restrict APE1 to user-mode addresses. 
- */ - - uint64_t base = (uintptr_t)alternate_aperture_base; - uint64_t limit = base + alternate_aperture_size - 1; - - if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 || - (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) { - retval = false; - goto out; - } - - qpd->sh_mem_ape1_base = base >> 16; - qpd->sh_mem_ape1_limit = limit >> 16; - } - retval = dqm->asic_ops.set_cache_memory_policy( dqm, qpd, default_policy, alternate_policy, alternate_aperture_base, - alternate_aperture_size); + alternate_aperture_size, + misc_process_properties); + + if (retval) + goto out; if ((dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) program_sh_mem_settings(dqm, qpd); @@ -2654,7 +2729,7 @@ static int get_wave_state(struct device_queue_manager *dqm, ctl_stack, ctl_stack_used_size, save_area_used_size); } -static void get_queue_checkpoint_info(struct device_queue_manager *dqm, +static int get_queue_checkpoint_info(struct device_queue_manager *dqm, const struct queue *q, u32 *mqd_size, u32 *ctl_stack_size) @@ -2662,16 +2737,19 @@ static void get_queue_checkpoint_info(struct device_queue_manager *dqm, struct mqd_manager *mqd_mgr; enum KFD_MQD_TYPE mqd_type = get_mqd_type_from_queue_type(q->properties.type); + int ret = 0; dqm_lock(dqm); mqd_mgr = dqm->mqd_mgrs[mqd_type]; - *mqd_size = mqd_mgr->mqd_size; + *mqd_size = mqd_mgr->mqd_size * NUM_XCC(mqd_mgr->dev->xcc_mask); *ctl_stack_size = 0; if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE && mqd_mgr->get_checkpoint_info) - mqd_mgr->get_checkpoint_info(mqd_mgr, q->mqd, ctl_stack_size); + ret = mqd_mgr->get_checkpoint_info(mqd_mgr, q->mqd, ctl_stack_size); dqm_unlock(dqm); + + return ret; } static int checkpoint_mqd(struct device_queue_manager *dqm, @@ -2707,7 +2785,7 @@ dqm_unlock: static int process_termination_cpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { - int retval; + int retval = 0; struct queue *q; struct device *dev = dqm->dev->adev->dev; struct kernel_queue *kq, 
*kq_next; @@ -2717,8 +2795,6 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES; bool found = false; - retval = 0; - dqm_lock(dqm); /* Clean all kernel queues */ @@ -2835,20 +2911,29 @@ static int allocate_hiq_sdma_mqd(struct device_queue_manager *dqm) (dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size * NUM_XCC(dqm->dev->xcc_mask)); - retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, size, - &(mem_obj->gtt_mem), &(mem_obj->gpu_addr), + retval = amdgpu_amdkfd_alloc_kernel_mem(dev->adev, size, + AMDGPU_GEM_DOMAIN_GTT, + &(mem_obj->mem), &(mem_obj->gpu_addr), (void *)&(mem_obj->cpu_ptr), false); return retval; } +static void deallocate_hiq_sdma_mqd(struct kfd_node *dev, + struct kfd_mem_obj *mqd) +{ + WARN(!mqd, "No hiq sdma mqd trunk to free"); + + amdgpu_amdkfd_free_kernel_mem(dev->adev, &mqd->mem); +} + struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) { struct device_queue_manager *dqm; pr_debug("Loading device queue manager\n"); - dqm = kzalloc(sizeof(*dqm), GFP_KERNEL); + dqm = kzalloc_obj(*dqm); if (!dqm) return NULL; @@ -2937,7 +3022,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) break; default: - if (KFD_GC_VERSION(dev) >= IP_VERSION(12, 0, 0)) + if (KFD_GC_VERSION(dev) >= IP_VERSION(12, 1, 0)) + device_queue_manager_init_v12_1(&dqm->asic_ops); + else if (KFD_GC_VERSION(dev) >= IP_VERSION(12, 0, 0)) device_queue_manager_init_v12(&dqm->asic_ops); else if (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0)) device_queue_manager_init_v11(&dqm->asic_ops); @@ -2965,19 +3052,14 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) return dqm; } + if (!dev->kfd->shared_resources.enable_mes) + deallocate_hiq_sdma_mqd(dev, &dqm->hiq_sdma_mqd); + out_free: kfree(dqm); return NULL; } -static void deallocate_hiq_sdma_mqd(struct kfd_node *dev, - struct kfd_mem_obj *mqd) -{ - WARN(!mqd, "No hiq sdma mqd trunk to free"); - - 
amdgpu_amdkfd_free_gtt_mem(dev->adev, &mqd->gtt_mem); -} - void device_queue_manager_uninit(struct device_queue_manager *dqm) { dqm->ops.stop(dqm); @@ -2989,20 +3071,19 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id) { - struct kfd_process_device *pdd; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process_device *pdd = NULL; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, &pdd); struct device_queue_manager *dqm = knode->dqm; struct device *dev = dqm->dev->adev->dev; struct qcm_process_device *qpd; struct queue *q = NULL; int ret = 0; - if (!p) + if (!pdd) return -EINVAL; dqm_lock(dqm); - pdd = kfd_get_process_device_data(dqm->dev, p); if (pdd) { qpd = &pdd->qpd; @@ -3035,74 +3116,21 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbel out: dqm_unlock(dqm); + kfd_unref_process(p); return ret; } -static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) +int kfd_evict_process_device(struct kfd_process_device *pdd) { - struct device *dev = dqm->dev->adev->dev; - int ret = 0; - - /* Check if process is already evicted */ - dqm_lock(dqm); - if (qpd->evicted) { - /* Increment the evicted count to make sure the - * process stays evicted before its terminated. 
- */ - qpd->evicted++; - dqm_unlock(dqm); - goto out; - } - dqm_unlock(dqm); - - ret = suspend_all_queues_mes(dqm); - if (ret) { - dev_err(dev, "Suspending all queues failed"); - goto out; - } - - ret = dqm->ops.evict_process_queues(dqm, qpd); - if (ret) { - dev_err(dev, "Evicting process queues failed"); - goto out; - } + struct device_queue_manager *dqm; + struct kfd_process *p; - ret = resume_all_queues_mes(dqm); - if (ret) - dev_err(dev, "Resuming all queues failed"); + p = pdd->process; + dqm = pdd->dev->dqm; -out: - return ret; -} - -int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid) -{ - struct kfd_process_device *pdd; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - int ret = 0; - - if (!p) - return -EINVAL; WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid); - pdd = kfd_get_process_device_data(dqm->dev, p); - if (pdd) { - if (dqm->dev->kfd->shared_resources.enable_mes) - ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd); - else - ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); - } - - kfd_unref_process(p); - - return ret; -} -static void kfd_process_hw_exception(struct work_struct *work) -{ - struct device_queue_manager *dqm = container_of(work, - struct device_queue_manager, hw_exception_work); - amdgpu_amdkfd_gpu_reset(dqm->dev->adev); + return dqm->ops.evict_process_queues(dqm, &pdd->qpd); } int reserve_debug_trap_vmid(struct device_queue_manager *dqm, @@ -3441,7 +3469,6 @@ int suspend_queues(struct kfd_process *p, else per_device_suspended++; } else if (err != -EBUSY) { - r = err; queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK; break; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 09ab36f8e8c6..3272328da11f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -37,7 +37,6 @@ #define KFD_MES_PROCESS_QUANTUM 100000 #define KFD_MES_GANG_QUANTUM 10000 
-#define USE_DEFAULT_GRACE_PERIOD 0xffffffff struct device_process_node { struct qcm_process_device *qpd; @@ -174,7 +173,8 @@ struct device_queue_manager_ops { enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size); + uint64_t alternate_aperture_size, + u32 misc_process_properties); int (*process_termination)(struct device_queue_manager *dqm, struct qcm_process_device *qpd); @@ -192,7 +192,7 @@ struct device_queue_manager_ops { int (*reset_queues)(struct device_queue_manager *dqm, uint16_t pasid); - void (*get_queue_checkpoint_info)(struct device_queue_manager *dqm, + int (*get_queue_checkpoint_info)(struct device_queue_manager *dqm, const struct queue *q, u32 *mqd_size, u32 *ctl_stack_size); @@ -210,7 +210,8 @@ struct device_queue_manager_asic_ops { enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size); + uint64_t alternate_aperture_size, + u32 misc_process_properties); void (*init_sdma_vm)(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); @@ -269,7 +270,6 @@ struct device_queue_manager { /* hw exception */ bool is_hws_hang; bool is_resetting; - struct work_struct hw_exception_work; struct kfd_mem_obj hiq_sdma_mqd; bool sched_running; bool sched_halt; @@ -299,6 +299,8 @@ void device_queue_manager_init_v11( struct device_queue_manager_asic_ops *asic_ops); void device_queue_manager_init_v12( struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_v12_1( + struct device_queue_manager_asic_ops *asic_ops); void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd); unsigned int get_cp_queues_num(struct device_queue_manager *dqm); @@ -359,4 +361,14 @@ static inline int read_sdma_queue_counter(uint64_t __user *q_rptr, uint64_t *val /* SDMA activity counter is stored at queue's RPTR + 0x8 location. 
*/ return get_user(*val, q_rptr + 1); } + +static inline void update_dqm_wait_times(struct device_queue_manager *dqm) +{ + if (dqm->dev->kfd2kgd->get_iq_wait_times) + dqm->dev->kfd2kgd->get_iq_wait_times(dqm->dev->adev, + &dqm->wait_times, + ffs(dqm->dev->xcc_mask) - 1); +} + + #endif /* KFD_DEVICE_QUEUE_MANAGER_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c index d4d95c7f2e5d..0508ef5a41d7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c @@ -27,12 +27,21 @@ #include "oss/oss_2_4_sh_mask.h" #include "gca/gfx_7_2_sh_mask.h" +/* + * Low bits must be 0000/FFFF as required by HW, high bits must be 0 to + * stay in user mode. + */ +#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL +/* APE1 limit is inclusive and 64K aligned. */ +#define APE1_LIMIT_ALIGNMENT 0xFFFF + static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd, enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size); + uint64_t alternate_aperture_size, + u32 misc_process_properties); static int update_qpd_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static void init_sdma_vm(struct device_queue_manager *dqm, @@ -80,10 +89,41 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size) + uint64_t alternate_aperture_size, + u32 misc_process_properties) { uint32_t default_mtype; uint32_t ape1_mtype; + unsigned int temp; + bool retval = true; + + if (alternate_aperture_size == 0) { + /* base > limit disables APE1 */ + qpd->sh_mem_ape1_base = 1; + qpd->sh_mem_ape1_limit = 0; + } else { + /* + * In FSA64, APE1_Base[63:0] = { 
16{SH_MEM_APE1_BASE[31]}, + * SH_MEM_APE1_BASE[31:0], 0x0000 } + * APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]}, + * SH_MEM_APE1_LIMIT[31:0], 0xFFFF } + * Verify that the base and size parameters can be + * represented in this format and convert them. + * Additionally restrict APE1 to user-mode addresses. + */ + + uint64_t base = (uintptr_t)alternate_aperture_base; + uint64_t limit = base + alternate_aperture_size - 1; + + if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 || + (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) { + retval = false; + goto out; + } + + qpd->sh_mem_ape1_base = base >> 16; + qpd->sh_mem_ape1_limit = limit >> 16; + } default_mtype = (default_policy == cache_policy_coherent) ? MTYPE_NONCACHED : @@ -97,37 +137,22 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, | ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | DEFAULT_MTYPE(default_mtype) | APE1_MTYPE(ape1_mtype); - - return true; -} - -static int update_qpd_cik(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct kfd_process_device *pdd; - unsigned int temp; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | - DEFAULT_MTYPE(MTYPE_NONCACHED) | - APE1_MTYPE(MTYPE_NONCACHED); - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } - /* On dGPU we're always in GPUVM64 addressing mode with 64-bit * aperture addresses. 
*/ - temp = get_sh_mem_bases_nybble_64(pdd); + temp = get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd)); qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); +out: + return retval; +} + +static int update_qpd_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c index 245a90dfc2f6..ba6e3d747ccd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c @@ -31,10 +31,18 @@ static int update_qpd_v10(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); +static bool set_cache_memory_policy_v10(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties); void device_queue_manager_init_v10( struct device_queue_manager_asic_ops *asic_ops) { + asic_ops->set_cache_memory_policy = set_cache_memory_policy_v10; asic_ops->update_qpd = update_qpd_v10; asic_ops->init_sdma_vm = init_sdma_vm_v10; asic_ops->mqd_manager_init = mqd_manager_init_v10; @@ -49,27 +57,28 @@ static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) private_base; } -static int update_qpd_v10(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) +static bool set_cache_memory_policy_v10(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t 
alternate_aperture_size, + u32 misc_process_properties) { - struct kfd_process_device *pdd; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - (SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | - (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } - - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); + qpd->sh_mem_config = (SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | + (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(qpd_to_pdd(qpd)); pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + return true; +} +static int update_qpd_v10(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v11.c index 2e129da7acb4..8b447d04558f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v11.c @@ -30,10 +30,18 @@ static int update_qpd_v11(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static void init_sdma_vm_v11(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); +static bool set_cache_memory_policy_v11(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties); void device_queue_manager_init_v11( struct device_queue_manager_asic_ops *asic_ops) { + asic_ops->set_cache_memory_policy = set_cache_memory_policy_v11; asic_ops->update_qpd = update_qpd_v11; asic_ops->init_sdma_vm = 
init_sdma_vm_v11; asic_ops->mqd_manager_init = mqd_manager_init_v11; @@ -48,28 +56,29 @@ static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) private_base; } -static int update_qpd_v11(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) +static bool set_cache_memory_policy_v11(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties) { - struct kfd_process_device *pdd; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - (SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | - (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); - - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } + qpd->sh_mem_config = (SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | + (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(qpd_to_pdd(qpd)); pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + return true; +} +static int update_qpd_v11(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c index 4f3295b29dfb..3550da3a46f9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c @@ -30,10 +30,18 @@ static int update_qpd_v12(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static void init_sdma_vm_v12(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); +static bool 
set_cache_memory_policy_v12(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties); void device_queue_manager_init_v12( struct device_queue_manager_asic_ops *asic_ops) { + asic_ops->set_cache_memory_policy = set_cache_memory_policy_v12; asic_ops->update_qpd = update_qpd_v12; asic_ops->init_sdma_vm = init_sdma_vm_v12; asic_ops->mqd_manager_init = mqd_manager_init_v12; @@ -48,28 +56,29 @@ static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) private_base; } -static int update_qpd_v12(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) +static bool set_cache_memory_policy_v12(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties) { - struct kfd_process_device *pdd; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - (SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | - (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); - - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } + qpd->sh_mem_config = (SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | + (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); - qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(qpd_to_pdd(qpd)); pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + return true; +} +static int update_qpd_v12(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ return 0; } diff --git 
a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12_1.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12_1.c new file mode 100644 index 000000000000..9e70a5f8a50b --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12_1.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include "kfd_device_queue_manager.h" +#include "gc/gc_12_1_0_sh_mask.h" +#include "soc_v1_0_enum.h" + +static int update_qpd_v12_1(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm_v12_1(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); + +void device_queue_manager_init_v12_1( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->update_qpd = update_qpd_v12_1; + asic_ops->init_sdma_vm = init_sdma_vm_v12_1; + asic_ops->mqd_manager_init = mqd_manager_init_v12_1; +} + +static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) +{ + uint32_t shared_base = pdd->lds_base >> 48; + uint32_t private_base = pdd->scratch_base >> 58; + + return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | + (private_base << SH_MEM_BASES__PRIVATE_BASE__SHIFT); +} + +static int update_qpd_v12_1(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + struct amdgpu_device *adev = dqm->dev->adev; + struct amdgpu_vmhub *hub = &adev->vmhub[AMDGPU_GFXHUB(0)]; + + pdd = qpd_to_pdd(qpd); + qpd->vm_cntx_cntl = hub->vm_cntx_cntl; + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + (SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | + (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); + + qpd->sh_mem_config |= + (1 << SH_MEM_CONFIG__F8_MODE__SHIFT); + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + if (KFD_SUPPORT_XNACK_PER_PROCESS(dqm->dev)) { + if (!pdd->process->xnack_enabled) { + qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; + qpd->vm_cntx_cntl &= + ~(1 << GCVM_CONTEXT0_CNTL__RETRY_PERMISSION_OR_INVALID_PAGE_FAULT__SHIFT); + } else { + qpd->sh_mem_config &= ~(1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT); + qpd->vm_cntx_cntl |= + (1 << GCVM_CONTEXT0_CNTL__RETRY_PERMISSION_OR_INVALID_PAGE_FAULT__SHIFT); + 
} + } + + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); + + pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + + return 0; +} + +static void init_sdma_vm_v12_1(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + /* Not needed on SDMAv4 onwards any more */ + q->properties.sdma_vm_addr = 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c index 67137e674f1d..9fcc8c6e57b7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c @@ -30,10 +30,18 @@ static int update_qpd_v9(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); +static bool set_cache_memory_policy_v9(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties); void device_queue_manager_init_v9( struct device_queue_manager_asic_ops *asic_ops) { + asic_ops->set_cache_memory_policy = set_cache_memory_policy_v9; asic_ops->update_qpd = update_qpd_v9; asic_ops->init_sdma_vm = init_sdma_vm_v9; asic_ops->mqd_manager_init = mqd_manager_init_v9; @@ -48,10 +56,42 @@ static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) private_base; } +static bool set_cache_memory_policy_v9(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size, + u32 misc_process_properties) +{ + qpd->sh_mem_config = SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; + + if (dqm->dev->kfd->noretry) + qpd->sh_mem_config |= 1 << 
SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; + + if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 4)) + qpd->sh_mem_config |= (1 << SH_MEM_CONFIG__F8_MODE__SHIFT); + + if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 5, 0)) { + if (misc_process_properties & KFD_PROC_FLAG_MFMA_HIGH_PRECISION) + qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__PRECISION_MODE__SHIFT; + } + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(qpd_to_pdd(qpd)); + + pr_debug("sh_mem_bases 0x%X sh_mem_config 0x%X\n", qpd->sh_mem_bases, + qpd->sh_mem_config); + return true; +} + static int update_qpd_v9(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { - struct kfd_process_device *pdd; + struct kfd_process_device *pdd = qpd_to_pdd(qpd); pdd = qpd_to_pdd(qpd); @@ -64,8 +104,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm, qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 3) || - KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 4) || - KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 5, 0)) + KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 4)) qpd->sh_mem_config |= (1 << SH_MEM_CONFIG__F8_MODE__SHIFT); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c index b291ee0fab94..dad83356e976 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c @@ -27,12 +27,21 @@ #include "gca/gfx_8_0_sh_mask.h" #include "oss/oss_3_0_sh_mask.h" +/* + * Low bits must be 0000/FFFF as required by HW, high bits must be 0 to + * stay in user mode. + */ +#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL +/* APE1 limit is inclusive and 64K aligned. 
*/ +#define APE1_LIMIT_ALIGNMENT 0xFFFF + static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, struct qcm_process_device *qpd, enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size); + uint64_t alternate_aperture_size, + u32 misc_process_properties); static int update_qpd_vi(struct device_queue_manager *dqm, struct qcm_process_device *qpd); static void init_sdma_vm(struct device_queue_manager *dqm, @@ -81,10 +90,41 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, - uint64_t alternate_aperture_size) + uint64_t alternate_aperture_size, + u32 misc_process_properties) { uint32_t default_mtype; uint32_t ape1_mtype; + unsigned int temp; + bool retval = true; + + if (alternate_aperture_size == 0) { + /* base > limit disables APE1 */ + qpd->sh_mem_ape1_base = 1; + qpd->sh_mem_ape1_limit = 0; + } else { + /* + * In FSA64, APE1_Base[63:0] = { 16{SH_MEM_APE1_BASE[31]}, + * SH_MEM_APE1_BASE[31:0], 0x0000 } + * APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]}, + * SH_MEM_APE1_LIMIT[31:0], 0xFFFF } + * Verify that the base and size parameters can be + * represented in this format and convert them. + * Additionally restrict APE1 to user-mode addresses. + */ + + uint64_t base = (uintptr_t)alternate_aperture_base; + uint64_t limit = base + alternate_aperture_size - 1; + + if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 || + (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) { + retval = false; + goto out; + } + + qpd->sh_mem_ape1_base = base >> 16; + qpd->sh_mem_ape1_limit = limit >> 16; + } default_mtype = (default_policy == cache_policy_coherent) ? 
MTYPE_UC : @@ -100,40 +140,21 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT; - return true; -} - -static int update_qpd_vi(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct kfd_process_device *pdd; - unsigned int temp; - - pdd = qpd_to_pdd(qpd); - - /* check if sh_mem_config register already configured */ - if (qpd->sh_mem_config == 0) { - qpd->sh_mem_config = - SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | - MTYPE_UC << - SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | - MTYPE_UC << - SH_MEM_CONFIG__APE1_MTYPE__SHIFT; - - qpd->sh_mem_ape1_limit = 0; - qpd->sh_mem_ape1_base = 0; - } - /* On dGPU we're always in GPUVM64 addressing mode with 64-bit * aperture addresses. */ - temp = get_sh_mem_bases_nybble_64(pdd); + temp = get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd)); qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); pr_debug("sh_mem_bases nybble: 0x%X and register 0x%X\n", temp, qpd->sh_mem_bases); +out: + return retval; +} +static int update_qpd_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index d075f24e5f9f..44150a71ffd5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -67,7 +67,7 @@ static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p) void *backing_store; struct kfd_signal_page *page; - page = kzalloc(sizeof(*page), GFP_KERNEL); + page = kzalloc_obj(*page); if (!page) return NULL; @@ -142,6 +142,7 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) * @p: Pointer to struct kfd_process * @id: ID to look up * @bits: Number of valid bits in @id + * @signal_mailbox_updated: flag indicates if FW updates signal mailbox entry * * Finds the first 
signaled event with a matching partial ID. If no * matching signaled event is found, returns NULL. In that case the @@ -155,7 +156,8 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) * driver. */ static struct kfd_event *lookup_signaled_event_by_partial_id( - struct kfd_process *p, uint32_t id, uint32_t bits) + struct kfd_process *p, uint32_t id, uint32_t bits, + bool signal_mailbox_updated) { struct kfd_event *ev; @@ -166,7 +168,8 @@ static struct kfd_event *lookup_signaled_event_by_partial_id( * and we only need a single lookup. */ if (bits > 31 || (1U << bits) >= KFD_SIGNAL_EVENT_LIMIT) { - if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT) + if (signal_mailbox_updated && + page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT) return NULL; return idr_find(&p->event_idr, id); @@ -331,7 +334,13 @@ static int kfd_event_page_set(struct kfd_process *p, void *kernel_address, if (p->signal_page) return -EBUSY; - page = kzalloc(sizeof(*page), GFP_KERNEL); + if (size < KFD_SIGNAL_EVENT_LIMIT * 8) { + pr_err("Event page size %llu is too small, need at least %lu bytes\n", + size, (unsigned long)(KFD_SIGNAL_EVENT_LIMIT * 8)); + return -EINVAL; + } + + page = kzalloc_obj(*page); if (!page) return -ENOMEM; @@ -399,7 +408,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, uint64_t *event_page_offset, uint32_t *event_slot_index) { int ret = 0; - struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); + struct kfd_event *ev = kzalloc_obj(*ev); if (!ev) return -ENOMEM; @@ -452,11 +461,11 @@ int kfd_criu_restore_event(struct file *devkfd, struct kfd_event *ev = NULL; int ret = 0; - ev_priv = kmalloc(sizeof(*ev_priv), GFP_KERNEL); + ev_priv = kmalloc_obj(*ev_priv); if (!ev_priv) return -ENOMEM; - ev = kzalloc(sizeof(*ev), GFP_KERNEL); + ev = kzalloc_obj(*ev); if (!ev) { ret = -ENOMEM; goto exit; @@ -718,7 +727,7 @@ static void set_event_from_interrupt(struct kfd_process *p, } void kfd_signal_event_interrupt(u32 
pasid, uint32_t partial_id, - uint32_t valid_id_bits) + uint32_t valid_id_bits, bool signal_mailbox_updated) { struct kfd_event *ev = NULL; @@ -727,7 +736,7 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, * to process context, kfd_process could attempt to exit while we are * running so the lookup function increments the process ref count. */ - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); if (!p) return; /* Presumably process exited. */ @@ -736,7 +745,8 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, if (valid_id_bits) ev = lookup_signaled_event_by_partial_id(p, partial_id, - valid_id_bits); + valid_id_bits, + signal_mailbox_updated); if (ev) { set_event_from_interrupt(p, ev); } else if (p->signal_page) { @@ -748,16 +758,6 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, uint64_t *slots = page_slots(p->signal_page); uint32_t id; - /* - * If id is valid but slot is not signaled, GPU may signal the same event twice - * before driver have chance to process the first interrupt, then signal slot is - * auto-reset after set_event wakeup the user space, just drop the second event as - * the application only need wakeup once. 
- */ - if ((valid_id_bits > 31 || (1U << valid_id_bits) >= KFD_SIGNAL_EVENT_LIMIT) && - partial_id < KFD_SIGNAL_EVENT_LIMIT && slots[partial_id] == UNSIGNALED_EVENT_SLOT) - goto out_unlock; - if (valid_id_bits) pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", partial_id, valid_id_bits); @@ -786,7 +786,6 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, } } -out_unlock: rcu_read_unlock(); kfd_unref_process(p); } @@ -796,8 +795,7 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) struct kfd_event_waiter *event_waiters; uint32_t i; - event_waiters = kcalloc(num_events, sizeof(struct kfd_event_waiter), - GFP_KERNEL); + event_waiters = kzalloc_objs(struct kfd_event_waiter, num_events); if (!event_waiters) return NULL; @@ -1139,8 +1137,8 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, if (type == KFD_EVENT_TYPE_MEMORY) { dev_warn(kfd_device, - "Sending SIGSEGV to process %d (pasid 0x%x)", - p->lead_thread->pid, p->pasid); + "Sending SIGSEGV to process pid %d", + p->lead_thread->pid); send_sig(SIGSEGV, p->lead_thread, 0); } @@ -1148,13 +1146,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, if (send_signal) { if (send_sigterm) { dev_warn(kfd_device, - "Sending SIGTERM to process %d (pasid 0x%x)", - p->lead_thread->pid, p->pasid); + "Sending SIGTERM to process pid %d", + p->lead_thread->pid); send_sig(SIGTERM, p->lead_thread, 0); } else { dev_err(kfd_device, - "Process %d (pasid 0x%x) got unhandled exception", - p->lead_thread->pid, p->pasid); + "Process pid %d got unhandled exception", + p->lead_thread->pid); } } @@ -1168,7 +1166,7 @@ void kfd_signal_hw_exception_event(u32 pasid) * to process context, kfd_process could attempt to exit while we are * running so the lookup function increments the process ref count. 
*/ - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); if (!p) return; /* Presumably process exited. */ @@ -1177,22 +1175,39 @@ void kfd_signal_hw_exception_event(u32 pasid) kfd_unref_process(p); } -void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid, +void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va) +{ + struct kfd_process_device *pdd; + struct kfd_hsa_memory_exception_data exception_data; + int i; + + memset(&exception_data, 0, sizeof(exception_data)); + exception_data.va = gpu_va; + exception_data.failure.NotPresent = 1; + + // Send VM seg fault to all kfd process device + for (i = 0; i < p->n_pdds; i++) { + pdd = p->pdds[i]; + exception_data.gpu_id = pdd->user_gpu_id; + kfd_evict_process_device(pdd); + kfd_signal_vm_fault_event(pdd, NULL, &exception_data); + } +} + +void kfd_signal_vm_fault_event(struct kfd_process_device *pdd, struct kfd_vm_fault_info *info, struct kfd_hsa_memory_exception_data *data) { struct kfd_event *ev; uint32_t id; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = pdd->process; struct kfd_hsa_memory_exception_data memory_exception_data; int user_gpu_id; - if (!p) - return; /* Presumably process exited. 
*/ - - user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); + user_gpu_id = kfd_process_get_user_gpu_id(p, pdd->dev->id); if (unlikely(user_gpu_id == -EINVAL)) { - WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id); + WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", + pdd->dev->id); return; } @@ -1229,7 +1244,6 @@ void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid, } rcu_read_unlock(); - kfd_unref_process(p); } void kfd_signal_reset_event(struct kfd_node *dev) @@ -1264,7 +1278,8 @@ void kfd_signal_reset_event(struct kfd_node *dev) } if (unlikely(!pdd)) { - WARN_ONCE(1, "Could not get device data from pasid:0x%x\n", p->pasid); + WARN_ONCE(1, "Could not get device data from process pid:%d\n", + p->lead_thread->pid); continue; } @@ -1273,12 +1288,19 @@ void kfd_signal_reset_event(struct kfd_node *dev) if (dev->dqm->detect_hang_count) { struct amdgpu_task_info *ti; + struct amdgpu_fpriv *drv_priv; - ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid); + if (unlikely(amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv))) { + WARN_ONCE(1, "Could not get vm for device %x from pid:%d\n", + dev->id, p->lead_thread->pid); + continue; + } + + ti = amdgpu_vm_get_task_info_vm(&drv_priv->vm); if (ti) { dev_err(dev->adev->dev, "Queues reset on process %s tid %d thread %s pid %d\n", - ti->process_name, ti->tgid, ti->task_name, ti->pid); + ti->process_name, ti->tgid, ti->task.comm, ti->task.pid); amdgpu_vm_put_task_info(ti); } } @@ -1311,7 +1333,7 @@ void kfd_signal_reset_event(struct kfd_node *dev) void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) { - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_hsa_hw_exception_data hw_exception_data; struct kfd_event *ev; @@ -1326,6 +1348,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) user_gpu_id = 
kfd_process_get_user_gpu_id(p, dev->id); if (unlikely(user_gpu_id == -EINVAL)) { WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id); + kfd_unref_process(p); return; } @@ -1366,3 +1389,32 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) kfd_unref_process(p); } + +/* signal KFD_EVENT_TYPE_SIGNAL events from process p + * send signal SIGBUS to correspondent user space process + */ +void kfd_signal_process_terminate_event(struct kfd_process *p) +{ + struct kfd_event *ev; + u32 id; + + rcu_read_lock(); + + /* iterate from id 1 for KFD_EVENT_TYPE_SIGNAL events */ + id = 1; + idr_for_each_entry_continue(&p->event_idr, ev, id) + if (ev->type == KFD_EVENT_TYPE_SIGNAL) { + spin_lock(&ev->lock); + set_event(ev); + spin_unlock(&ev->lock); + } + + /* Send SIGBUS to p->lead_thread */ + dev_notice(kfd_device, + "Sending SIGBUS to process %d", + p->lead_thread->pid); + + send_sig(SIGBUS, p->lead_thread, 0); + + rcu_read_unlock(); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h index 52ccfd397c2b..1dc21c13833b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h @@ -85,6 +85,7 @@ struct kfd_event { #define KFD_EVENT_TYPE_MEMORY 8 extern void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, - uint32_t valid_id_bits); + uint32_t valid_id_bits, + bool signal_mailbox_updated); #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index dbcb60eb54b2..04c5e26f01ed 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -23,7 +23,6 @@ */ #include <linux/device.h> -#include <linux/export.h> #include <linux/err.h> #include <linux/fs.h> #include <linux/sched.h> @@ -360,6 +359,25 @@ static void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) pdd->qpd.cwsr_base = AMDGPU_VA_RESERVED_TRAP_START(pdd->dev->adev); } 
+static void kfd_init_apertures_v12(struct kfd_process_device *pdd, uint8_t id) +{ + pdd->lds_base = pdd->dev->adev->gmc.shared_aperture_start; + pdd->lds_limit = pdd->dev->adev->gmc.shared_aperture_end; + + pdd->gpuvm_base = AMDGPU_VA_RESERVED_BOTTOM; + pdd->gpuvm_limit = + pdd->dev->kfd->shared_resources.gpuvm_size - 1; + + pdd->scratch_base = pdd->dev->adev->gmc.private_aperture_start; + pdd->scratch_limit = pdd->dev->adev->gmc.private_aperture_end; + + /* + * Place TBA/TMA on opposite side of VM hole to prevent + * stray faults from triggering SVM on these pages. + */ + pdd->qpd.cwsr_base = AMDGPU_VA_RESERVED_TRAP_START(pdd->dev->adev); +} + int kfd_init_apertures(struct kfd_process *process) { uint8_t id = 0; @@ -407,9 +425,11 @@ int kfd_init_apertures(struct kfd_process *process) kfd_init_apertures_vi(pdd, id); break; default: - if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 0, 1)) + if (KFD_GC_VERSION(dev) >= IP_VERSION(12, 1, 0)) { + kfd_init_apertures_v12(pdd, id); + } else if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 0, 1)) { kfd_init_apertures_v9(pdd, id); - else { + } else { WARN(1, "Unexpected ASIC family %u", dev->adev->asic_type); return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index 37b69fe0ede3..19406ab92c5b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -168,14 +168,14 @@ static bool event_interrupt_isr_v10(struct kfd_node *dev, client_id != SOC15_IH_CLIENTID_SE3SH) return false; - pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", - client_id, source_id, vmid, pasid); - pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", - data[0], data[1], data[2], data[3], - data[4], data[5], data[6], data[7]); + dev_dbg(dev->adev->dev, + "client id 0x%x, source id %d, vmid %d, pasid 0x%x. 
raw data:\n", + client_id, source_id, vmid, pasid); + dev_dbg(dev->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", + data[0], data[1], data[2], data[3], data[4], data[5], data[6], + data[7]); - /* If there is no valid PASID, it's likely a bug */ - if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt")) + if (pasid == 0) return 0; /* Interrupt types we care about: various signals and faults. @@ -211,43 +211,72 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_SE2SH || client_id == SOC15_IH_CLIENTID_SE3SH) { if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) - kfd_signal_event_interrupt(pasid, context_id0, 32); + kfd_signal_event_interrupt(pasid, context_id0, 32, true); else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) { encoding = REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING); switch (encoding) { case SQ_INTERRUPT_WORD_ENCODING_AUTO: - pr_debug_ratelimited( + dev_dbg_ratelimited( + dev->adev->dev, "sq_intr: auto, se %d, ttrace %d, wlt %d, ttrac_buf0_full %d, ttrac_buf1_full %d, ttrace_utc_err %d\n", - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_AUTO_CTXID1, - SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - THREAD_TRACE), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - WLT), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - THREAD_TRACE_BUF0_FULL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - THREAD_TRACE_BUF1_FULL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - THREAD_TRACE_UTC_ERROR)); + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_AUTO_CTXID1, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + WLT), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_BUF0_FULL), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_BUF1_FULL), + REG_GET_FIELD( + 
context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_UTC_ERROR)); break; case SQ_INTERRUPT_WORD_ENCODING_INST: - pr_debug_ratelimited("sq_intr: inst, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, - SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - DATA), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - SA_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - WAVE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - SIMD_ID), - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, - WGP_ID)); + dev_dbg_ratelimited( + dev->adev->dev, + "sq_intr: inst, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_WAVE_CTXID1, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + DATA), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + SA_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + PRIV), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + WAVE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + SIMD_ID), + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_WAVE_CTXID1, + WGP_ID)); if (context_id0 & SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV_MASK) { if (kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_DEBUG_DOORBELL_ID(context_id0), @@ -259,27 +288,43 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, case SQ_INTERRUPT_WORD_ENCODING_ERROR: sq_intr_err_type = REG_GET_FIELD(context_id0, KFD_CTXID0, ERR_TYPE); - pr_warn_ratelimited("sq_intr: error, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d, err_type %d\n", - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, - SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - DATA), - 
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - SA_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - WAVE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - SIMD_ID), - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, - WGP_ID), + dev_warn_ratelimited( + dev->adev->dev, + "sq_intr: error, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d, err_type %d\n", + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_WAVE_CTXID1, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + DATA), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + SA_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + PRIV), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + WAVE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + SIMD_ID), + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_WAVE_CTXID1, + WGP_ID), sq_intr_err_type); break; default: break; } - kfd_signal_event_interrupt(pasid, context_id0 & 0x7fffff, 23); + kfd_signal_event_interrupt(pasid, context_id0 & 0x7fffff, 23, true); } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) { kfd_set_dbg_ev_from_interrupt(dev, pasid, @@ -299,7 +344,7 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_SDMA6 || client_id == SOC15_IH_CLIENTID_SDMA7) { if (source_id == SOC15_INTSRC_SDMA_TRAP) { - kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28); + kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28, true); } } else if (client_id == SOC15_IH_CLIENTID_VMC || client_id == SOC15_IH_CLIENTID_VMC1 || diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c index b3f988b275a8..12d81abed748 100644 --- 
a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c @@ -148,44 +148,69 @@ enum SQ_INTERRUPT_ERROR_TYPE { #define KFD_CTXID0_DOORBELL_ID(ctxid0) ((ctxid0) & \ KFD_CTXID0_DOORBELL_ID_MASK) -static void print_sq_intr_info_auto(uint32_t context_id0, uint32_t context_id1) +static void print_sq_intr_info_auto(struct kfd_node *dev, uint32_t context_id0, + uint32_t context_id1) { - pr_debug_ratelimited( + dev_dbg_ratelimited( + dev->adev->dev, "sq_intr: auto, ttrace %d, wlt %d, ttrace_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n", - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE), REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, WLT), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_BUF_FULL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, REG_TIMESTAMP), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, CMD_TIMESTAMP), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, HOST_CMD_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, HOST_REG_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, IMMED_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_UTC_ERROR)); + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_BUF_FULL), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + REG_TIMESTAMP), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + CMD_TIMESTAMP), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + HOST_CMD_OVERFLOW), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + HOST_REG_OVERFLOW), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + IMMED_OVERFLOW), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_UTC_ERROR)); } 
-static void print_sq_intr_info_inst(uint32_t context_id0, uint32_t context_id1) +static void print_sq_intr_info_inst(struct kfd_node *dev, uint32_t context_id0, + uint32_t context_id1) { - pr_debug_ratelimited( + dev_dbg_ratelimited( + dev->adev->dev, "sq_intr: inst, data 0x%08x, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, DATA), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, SH_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, + SH_ID), REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, WAVE_ID), - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, SIMD_ID), - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, WGP_ID)); + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, + WAVE_ID), + REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, + SIMD_ID), + REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, + WGP_ID)); } -static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1) +static void print_sq_intr_info_error(struct kfd_node *dev, uint32_t context_id0, + uint32_t context_id1) { - pr_warn_ratelimited( + dev_warn_ratelimited( + dev->adev->dev, "sq_intr: error, detail 0x%08x, type %d, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, DETAIL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, SH_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, WAVE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, SIMD_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, WGP_ID)); + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + DETAIL), + REG_GET_FIELD(context_id0, 
SQ_INTERRUPT_WORD_ERROR_CTXID0, + TYPE), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + SH_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + PRIV), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + WAVE_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, + SIMD_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, + WGP_ID)); } static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, @@ -194,7 +219,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, enum amdgpu_ras_block block = 0; int ret = -EINVAL; uint32_t reset = 0; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); if (!p) return; @@ -255,14 +280,14 @@ static bool event_interrupt_isr_v11(struct kfd_node *dev, (context_id0 & AMDGPU_FENCE_MES_QUEUE_FLAG)) return false; - pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", - client_id, source_id, vmid, pasid); - pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", - data[0], data[1], data[2], data[3], - data[4], data[5], data[6], data[7]); + dev_dbg(dev->adev->dev, + "client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", + client_id, source_id, vmid, pasid); + dev_dbg(dev->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", + data[0], data[1], data[2], data[3], data[4], data[5], data[6], + data[7]); - /* If there is no valid PASID, it's likely a bug */ - if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt")) + if (pasid == 0) return false; /* Interrupt types we care about: various signals and faults. 
@@ -328,7 +353,7 @@ static void event_interrupt_wq_v11(struct kfd_node *dev, /* CP */ if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) - kfd_signal_event_interrupt(pasid, context_id0, 32); + kfd_signal_event_interrupt(pasid, context_id0, 32, true); else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) { u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0); @@ -341,7 +366,7 @@ static void event_interrupt_wq_v11(struct kfd_node *dev, /* SDMA */ else if (source_id == SOC21_INTSRC_SDMA_TRAP) - kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28); + kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28, true); else if (source_id == SOC21_INTSRC_SDMA_ECC) { event_interrupt_poison_consumption_v11(dev, pasid, source_id); return; @@ -353,10 +378,10 @@ static void event_interrupt_wq_v11(struct kfd_node *dev, SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING); switch (sq_int_enc) { case SQ_INTERRUPT_WORD_ENCODING_AUTO: - print_sq_intr_info_auto(context_id0, context_id1); + print_sq_intr_info_auto(dev, context_id0, context_id1); break; case SQ_INTERRUPT_WORD_ENCODING_INST: - print_sq_intr_info_inst(context_id0, context_id1); + print_sq_intr_info_inst(dev, context_id0, context_id1); sq_int_priv = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV); if (sq_int_priv && (kfd_set_dbg_ev_from_interrupt(dev, pasid, @@ -366,7 +391,7 @@ static void event_interrupt_wq_v11(struct kfd_node *dev, return; break; case SQ_INTERRUPT_WORD_ENCODING_ERROR: - print_sq_intr_info_error(context_id0, context_id1); + print_sq_intr_info_error(dev, context_id0, context_id1); sq_int_errtype = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE); if (sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && @@ -379,7 +404,7 @@ static void event_interrupt_wq_v11(struct kfd_node *dev, default: break; } - kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24); + kfd_signal_event_interrupt(pasid, context_id0 
& 0xffffff, 24, true); } } else if (KFD_IRQ_IS_FENCE(client_id, source_id)) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c new file mode 100644 index 000000000000..0da7e1db55c9 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c @@ -0,0 +1,405 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "kfd_priv.h" +#include "kfd_events.h" +#include "soc15_int.h" +#include "kfd_device_queue_manager.h" +#include "ivsrcid/vmc/irqsrcs_vmc_1_0.h" +#include "kfd_smi_events.h" +#include "kfd_debug.h" +#include "amdgpu_ras_mgr.h" + +/* + * GFX12.1 SQ Interrupts + * + * There are 3 encoding types of interrupts sourced from SQ sent as a 44-bit + * packet to the Interrupt Handler: + * Auto - Generated by the SQG (various cmd overflows, timestamps etc) + * Wave - Generated by S_SENDMSG through a shader program + * Error - HW generated errors (Illegal instructions, Memviols, EDC etc) + * + * The 44-bit packet is mapped as {context_id1[7:0],context_id0[31:0]} plus + * 4-bits for VMID (SOC15_VMID_FROM_IH_ENTRY) as such: + * + * - context_id1[7:6] + * Encoding type (0 = Auto, 1 = Wave, 2 = Error) + * + * - context_id0[26] + * PRIV bit indicates that Wave S_SEND or error occurred within trap + * + * - context_id0[24:0] + * 25-bit data with the following layout per encoding type: + * Auto - only context_id0[8:0] is used, which reports various interrupts + * generated by SQG. The rest is 0. + * Wave - user data sent from m0 via S_SENDMSG (context_id0[23:0]) + * Error - Error Type (context_id0[24:21]), Error Details (context_id0[20:0]) + * + * The other context_id bits show coordinates (SE/SH/CU/SIMD/WGP) for wave + * S_SENDMSG and Errors. These are 0 for Auto. 
+ */ + +enum SQ_INTERRUPT_WORD_ENCODING { + SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0, + SQ_INTERRUPT_WORD_ENCODING_INST, + SQ_INTERRUPT_WORD_ENCODING_ERROR, +}; + +enum SQ_INTERRUPT_ERROR_TYPE { + SQ_INTERRUPT_ERROR_TYPE_EDC_FUE = 0x0, + SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST, + SQ_INTERRUPT_ERROR_TYPE_MEMVIOL, + SQ_INTERRUPT_ERROR_TYPE_EDC_FED, +}; + +/* SQ_INTERRUPT_WORD_AUTO_CTXID */ +#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE__SHIFT 0 +#define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT__SHIFT 1 +#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF0_FULL__SHIFT 2 +#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF1_FULL__SHIFT 3 +#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR__SHIFT 8 +#define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING__SHIFT 6 + +#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_MASK 0x00000001 +#define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT_MASK 0x00000002 +#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF0_FULL_MASK 0x00000004 +#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF1_FULL_MASK 0x00000008 +#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR_MASK 0x00000100 +#define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING_MASK 0x000000c0 + +/* SQ_INTERRUPT_WORD_WAVE_CTXID */ +#define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA__SHIFT 0 +#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SA_ID__SHIFT 25 +#define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV__SHIFT 26 +#define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID__SHIFT 27 +#define SQ_INTERRUPT_WORD_WAVE_CTXID1__SIMD_ID__SHIFT 0 +#define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID__SHIFT 2 +#define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING__SHIFT 6 + +#define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA_MASK 0x00ffffff /* [23:0] */ +#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SA_ID_MASK 0x02000000 /* [25] */ +#define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV_MASK 0x04000000 /* [26] */ +#define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID_MASK 0xf8000000 /* [31:27] */ +#define SQ_INTERRUPT_WORD_WAVE_CTXID1__SIMD_ID_MASK 
0x00000003 /* [33:32] */ +#define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID_MASK 0x0000003c /* [37:34] */ +#define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING_MASK 0x000000c0 /* [39:38] */ + +/* SQ_INTERRUPT_WORD_ERROR_CTXID */ +#define SQ_INTERRUPT_WORD_ERROR_CTXID0__DETAIL__SHIFT 0 +#define SQ_INTERRUPT_WORD_ERROR_CTXID0__MEM_VIOL__SHIFT 19 +#define SQ_INTERRUPT_WORD_ERROR_CTXID0__TYPE__SHIFT 21 +#define SQ_INTERRUPT_WORD_ERROR_CTXID0__SA_ID__SHIFT 25 +#define SQ_INTERRUPT_WORD_ERROR_CTXID0__PRIV__SHIFT 26 +#define SQ_INTERRUPT_WORD_ERROR_CTXID0__WAVE_ID__SHIFT 27 +#define SQ_INTERRUPT_WORD_ERROR_CTXID1__SIMD_ID__SHIFT 0 +#define SQ_INTERRUPT_WORD_ERROR_CTXID1__WGP_ID__SHIFT 2 +#define SQ_INTERRUPT_WORD_ERROR_CTXID1__ENCODING__SHIFT 6 + +#define SQ_INTERRUPT_WORD_ERROR_CTXID0__DETAIL_MASK 0x0007ffff /* [18:0] */ +#define SQ_INTERRUPT_WORD_ERROR_CTXID0__MEM_VIOL_MASK 0x00180000 /* [20:19] */ +#define SQ_INTERRUPT_WORD_ERROR_CTXID0__TYPE_MASK 0x01e00000 /* [24:21] */ +#define SQ_INTERRUPT_WORD_ERROR_CTXID0__SA_ID_MASK 0x02000000 /* [25] */ +#define SQ_INTERRUPT_WORD_ERROR_CTXID0__PRIV_MASK 0x04000000 /* [26] */ +#define SQ_INTERRUPT_WORD_ERROR_CTXID0__WAVE_ID_MASK 0xf8000000 /* [31:27] */ +#define SQ_INTERRUPT_WORD_ERROR_CTXID1__SIMD_ID_MASK 0x00000003 /* [33:32] */ +#define SQ_INTERRUPT_WORD_ERROR_CTXID1__WGP_ID_MASK 0x0000003c /* [37:34] */ +#define SQ_INTERRUPT_WORD_ERROR_CTXID1__ENCODING_MASK 0x000000c0 /* [39:38] */ + +/* + * The debugger will send user data(m0) with PRIV=1 to indicate it requires + * notification from the KFD with the following queue id (DOORBELL_ID) and + * trap code (TRAP_CODE). 
+ */ +#define KFD_CTXID0_TRAP_CODE_SHIFT 10 +#define KFD_CTXID0_TRAP_CODE_MASK 0xfffc00 +#define KFD_CTXID0_CP_BAD_OP_ECODE_MASK 0x3ffffff +#define KFD_CTXID0_DOORBELL_ID_MASK 0x0003ff + +#define KFD_CTXID0_TRAP_CODE(ctxid0) (((ctxid0) & \ + KFD_CTXID0_TRAP_CODE_MASK) >> \ + KFD_CTXID0_TRAP_CODE_SHIFT) +#define KFD_CTXID0_CP_BAD_OP_ECODE(ctxid0) (((ctxid0) & \ + KFD_CTXID0_CP_BAD_OP_ECODE_MASK) >> \ + KFD_CTXID0_TRAP_CODE_SHIFT) +#define KFD_CTXID0_DOORBELL_ID(ctxid0) ((ctxid0) & \ + KFD_CTXID0_DOORBELL_ID_MASK) + +static void print_sq_intr_info_auto(struct kfd_node *dev, uint32_t context_id0, uint32_t context_id1) +{ + dev_dbg_ratelimited( + dev->adev->dev, + "sq_intr: auto, ttrace %d, wlt %d, ttrace_buf0_full %d, ttrace_buf1_full %d ttrace_utc_err %d\n", + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, WLT), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_BUF0_FULL), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_BUF1_FULL), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_UTC_ERROR)); +} + +static void print_sq_intr_info_inst(struct kfd_node *dev, uint32_t context_id0, uint32_t context_id1) +{ + dev_dbg_ratelimited( + dev->adev->dev, + "sq_intr: inst, data 0x%08x, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, DATA), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, SA_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, WAVE_ID), + REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, SIMD_ID), + REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, WGP_ID)); +} + +static void print_sq_intr_info_error(struct kfd_node *dev, uint32_t context_id0, uint32_t context_id1) +{ + dev_warn_ratelimited( + dev->adev->dev, + "sq_intr: error, detail 
0x%08x, type %d, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, DETAIL), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, SA_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, PRIV), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, WAVE_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, SIMD_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, WGP_ID)); +} + +static void event_interrupt_poison_consumption_v12_1(struct kfd_node *node, + uint16_t pasid, uint16_t source_id) +{ + enum amdgpu_ras_block block = 0; + int ret = -EINVAL; + uint32_t reset = 0; + u64 event_id = RAS_EVENT_INVALID_ID; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); + + if (!p) + return; + + /* all queues of a process will be unmapped in one time */ + if (atomic_read(&p->poison)) { + kfd_unref_process(p); + return; + } + + atomic_set(&p->poison, 1); + kfd_unref_process(p); + + switch (source_id) { + case SOC15_INTSRC_SQ_INTERRUPT_MSG: + if (node->dqm->ops.reset_queues) + ret = node->dqm->ops.reset_queues(node->dqm, pasid); + block = AMDGPU_RAS_BLOCK__GFX; + if (ret) + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + break; + case SOC21_INTSRC_SDMA_ECC: + default: + block = AMDGPU_RAS_BLOCK__GFX; + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + break; + } + + kfd_signal_poison_consumed_event(node, pasid); + + /* + * resetting queue passes, do page retirement without gpu reset + * resetting queue fails, fallback to gpu reset solution + */ + if (amdgpu_uniras_enabled(node->adev)) + event_id = amdgpu_ras_mgr_gen_ras_event_seqno(node->adev, + RAS_SEQNO_TYPE_POISON_CONSUMPTION); + + RAS_EVENT_LOG(node->adev, event_id, + "poison is consumed by source %d, kick off gpu reset flow\n", source_id); + + amdgpu_amdkfd_ras_pasid_poison_consumption_handler(node->adev, + block, pasid, NULL, NULL, 
reset); +} + +static bool event_interrupt_isr_v12_1(struct kfd_node *node, + const uint32_t *ih_ring_entry, + uint32_t *patched_ihre, + bool *patched_flag) +{ + uint16_t source_id, client_id, pasid, vmid, node_id; + const uint32_t *data = ih_ring_entry; + uint32_t context_id0; + + node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry); + vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); + + if (!kfd_irq_is_from_node(node, node_id, vmid)) { + dev_dbg_ratelimited(node->adev->dev, + "Interrupt not for Node, node_id: %d, vmid: %d\n", node_id, vmid); + return false; + } + + source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); + client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); + + /* Only handle interrupts from KFD VMIDs */ + if (!KFD_IRQ_IS_FENCE(client_id, source_id) && + (vmid < node->vm_info.first_vmid_kfd || + vmid > node->vm_info.last_vmid_kfd)) + return false; + + pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); + context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); + + if ((source_id == SOC15_INTSRC_CP_END_OF_PIPE) && + (context_id0 & AMDGPU_FENCE_MES_QUEUE_FLAG)) + return false; + + dev_dbg(node->adev->dev, "client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", + client_id, source_id, vmid, pasid); + dev_dbg(node->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", + data[0], data[1], data[2], data[3], + data[4], data[5], data[6], data[7]); + + /* If there is no valid PASID, it's likely a bug */ + if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt")) + return false; + + /* Interrupt types we care about: various signals and faults. + * They will be forwarded to a work queue (see below). 
+ */ + return source_id == SOC15_INTSRC_CP_END_OF_PIPE || + source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || + source_id == SOC15_INTSRC_CP_BAD_OPCODE || + source_id == SOC21_INTSRC_SDMA_TRAP || + KFD_IRQ_IS_FENCE(client_id, source_id) || + ((client_id == SOC21_IH_CLIENTID_VMC || + client_id == SOC21_IH_CLIENTID_UTCL2) && + !amdgpu_no_queue_eviction_on_vm_fault); +} + +static void event_interrupt_wq_v12_1(struct kfd_node *node, + const uint32_t *ih_ring_entry) +{ + uint16_t source_id, client_id, ring_id, pasid, vmid; + uint32_t context_id0, context_id1; + uint8_t sq_int_enc, sq_int_priv, sq_int_errtype; + struct kfd_vm_fault_info info = {0}; + struct kfd_hsa_memory_exception_data exception_data; + + source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); + client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); + ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); + pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); + vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); + context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); + context_id1 = SOC15_CONTEXT_ID1_FROM_IH_ENTRY(ih_ring_entry); + + /* VMC, UTCL2 */ + if (client_id == SOC21_IH_CLIENTID_VMC || + client_id == SOC21_IH_CLIENTID_UTCL2) { + info.vmid = vmid; + info.mc_id = client_id; + info.page_addr = ih_ring_entry[4] | + (uint64_t)(ih_ring_entry[5] & 0xf) << 32; + info.prot_valid = ring_id & 0x08; + info.prot_read = ring_id & 0x10; + info.prot_write = ring_id & 0x20; + + memset(&exception_data, 0, sizeof(exception_data)); + exception_data.gpu_id = node->id; + exception_data.va = (info.page_addr) << PAGE_SHIFT; + exception_data.failure.NotPresent = info.prot_valid ? 1 : 0; + exception_data.failure.NoExecute = info.prot_exec ? 1 : 0; + exception_data.failure.ReadOnly = info.prot_write ? 
1 : 0; + exception_data.failure.imprecise = 0; + + kfd_set_dbg_ev_from_interrupt(node, pasid, -1, + KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION), + &exception_data, sizeof(exception_data)); + kfd_smi_event_update_vmfault(node, pasid); + + /* GRBM, SDMA, SE, PMM */ + } else if (client_id == SOC21_IH_CLIENTID_GRBM_CP || + client_id == SOC21_IH_CLIENTID_GFX) { + + /* CP */ + if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) + kfd_signal_event_interrupt(pasid, context_id0, 32, false); + else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && + KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) { + u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0); + + kfd_set_dbg_ev_from_interrupt(node, pasid, doorbell_id, + KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)), + NULL, 0); + kfd_dqm_suspend_bad_queue_mes(node, pasid, doorbell_id); + } + + /* SDMA */ + else if (source_id == SOC21_INTSRC_SDMA_TRAP) + kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28, true); + else if (source_id == SOC21_INTSRC_SDMA_ECC) { + event_interrupt_poison_consumption_v12_1(node, pasid, source_id); + return; + } + + /* SQ */ + else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) { + sq_int_enc = REG_GET_FIELD(context_id1, + SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING); + switch (sq_int_enc) { + case SQ_INTERRUPT_WORD_ENCODING_AUTO: + print_sq_intr_info_auto(node, context_id0, context_id1); + break; + case SQ_INTERRUPT_WORD_ENCODING_INST: + print_sq_intr_info_inst(node, context_id0, context_id1); + sq_int_priv = REG_GET_FIELD(context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV); + if (sq_int_priv && (kfd_set_dbg_ev_from_interrupt(node, pasid, + KFD_CTXID0_DOORBELL_ID(context_id0), + KFD_CTXID0_TRAP_CODE(context_id0), + NULL, 0))) + return; + break; + case SQ_INTERRUPT_WORD_ENCODING_ERROR: + print_sq_intr_info_error(node, context_id0, context_id1); + sq_int_errtype = REG_GET_FIELD(context_id0, + SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE); + if (sq_int_errtype != 
SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && + sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { + event_interrupt_poison_consumption_v12_1( + node, pasid, source_id); + return; + } + break; + default: + break; + } + kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24, true); + } + + } else if (KFD_IRQ_IS_FENCE(client_id, source_id)) { + kfd_process_close_interrupt_drain(pasid); + } +} + +const struct kfd_event_interrupt_class event_interrupt_class_v12_1 = { + .interrupt_isr = event_interrupt_isr_v12_1, + .interrupt_wq = event_interrupt_wq_v12_1, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 0cb5c582ce7d..1688d8e595f2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -28,6 +28,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_smi_events.h" #include "amdgpu_ras.h" +#include "amdgpu_ras_mgr.h" /* * GFX9 SQ Interrupts @@ -146,7 +147,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, { enum amdgpu_ras_block block = 0; uint32_t reset = 0; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION; u64 event_id; int old_poison, ret; @@ -228,7 +229,11 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, kfd_signal_poison_consumed_event(dev, pasid); - event_id = amdgpu_ras_acquire_event_id(dev->adev, type); + if (amdgpu_uniras_enabled(dev->adev)) + event_id = amdgpu_ras_mgr_gen_ras_event_seqno(dev->adev, + RAS_SEQNO_TYPE_POISON_CONSUMPTION); + else + event_id = amdgpu_ras_acquire_event_id(dev->adev, type); RAS_EVENT_LOG(dev->adev, event_id, "poison is consumed by client %d, kick off gpu reset flow\n", client_id); @@ -314,11 +319,12 @@ static bool event_interrupt_isr_v9(struct kfd_node *dev, & ~pasid_mask) | pasid); } - pr_debug("client id 
0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", - client_id, source_id, vmid, pasid); - pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", - data[0], data[1], data[2], data[3], - data[4], data[5], data[6], data[7]); + dev_dbg(dev->adev->dev, + "client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", + client_id, source_id, vmid, pasid); + dev_dbg(dev->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", + data[0], data[1], data[2], data[3], data[4], data[5], data[6], + data[7]); /* If there is no valid PASID, it's likely a bug */ if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt")) @@ -373,34 +379,88 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_SE2SH || client_id == SOC15_IH_CLIENTID_SE3SH) { if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) - kfd_signal_event_interrupt(pasid, context_id0, 32); + kfd_signal_event_interrupt(pasid, context_id0, 32, true); else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) { sq_int_data = KFD_CONTEXT_ID_GET_SQ_INT_DATA(context_id0, context_id1); encoding = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, ENCODING); switch (encoding) { case SQ_INTERRUPT_WORD_ENCODING_AUTO: - pr_debug_ratelimited( + dev_dbg_ratelimited( + dev->adev->dev, "sq_intr: auto, se %d, ttrace %d, wlt %d, ttrac_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n", - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, WLT), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE_BUF_FULL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, REG_TIMESTAMP), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, CMD_TIMESTAMP), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, HOST_CMD_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, 
HOST_REG_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, IMMED_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE_UTC_ERROR)); + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + THREAD_TRACE), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + WLT), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + THREAD_TRACE_BUF_FULL), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + REG_TIMESTAMP), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + CMD_TIMESTAMP), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + HOST_CMD_OVERFLOW), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + HOST_REG_OVERFLOW), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + IMMED_OVERFLOW), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID, + THREAD_TRACE_UTC_ERROR)); break; case SQ_INTERRUPT_WORD_ENCODING_INST: - pr_debug_ratelimited("sq_intr: inst, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, intr_data 0x%x\n", - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, WAVE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID), + dev_dbg_ratelimited( + dev->adev->dev, + "sq_intr: inst, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, intr_data 0x%x\n", + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + DATA), + REG_GET_FIELD( + context_id0, + 
SQ_INTERRUPT_WORD_WAVE_CTXID, + SH_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + PRIV), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + WAVE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + SIMD_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + CU_ID), sq_int_data); if (context_id0 & SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK) { if (kfd_set_dbg_ev_from_interrupt(dev, pasid, @@ -412,14 +472,37 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, break; case SQ_INTERRUPT_WORD_ENCODING_ERROR: sq_intr_err = REG_GET_FIELD(sq_int_data, KFD_SQ_INT_DATA, ERR_TYPE); - pr_warn_ratelimited("sq_intr: error, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, err_type %d\n", - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, WAVE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID), + dev_warn_ratelimited( + dev->adev->dev, + "sq_intr: error, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, err_type %d\n", + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + DATA), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + SH_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + PRIV), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + WAVE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + SIMD_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID, + CU_ID), sq_intr_err); if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && sq_intr_err != 
SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { @@ -430,7 +513,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, default: break; } - kfd_signal_event_interrupt(pasid, sq_int_data, 24); + kfd_signal_event_interrupt(pasid, sq_int_data, 24, true); } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) { kfd_set_dbg_ev_from_interrupt(dev, pasid, @@ -447,7 +530,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_SDMA6 || client_id == SOC15_IH_CLIENTID_SDMA7) { if (source_id == SOC15_INTSRC_SDMA_TRAP) { - kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28); + kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28, true); } else if (source_id == SOC15_INTSRC_SDMA_ECC) { event_interrupt_poison_consumption_v9(dev, pasid, client_id); return; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 2b0a830f5b29..3ffa081daaec 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -46,11 +46,7 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, int retval; union PM4_MES_TYPE_3_HEADER nop; - if (WARN_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ)) - return false; - - pr_debug("Initializing queue type %d size %d\n", KFD_QUEUE_TYPE_HIQ, - queue_size); + pr_debug("Initializing queue type %d size %d\n", type, queue_size); memset(&prop, 0, sizeof(prop)); memset(&nop, 0, sizeof(nop)); @@ -61,18 +57,7 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, kq->dev = dev; kq->nop_packet = nop.u32all; - switch (type) { - case KFD_QUEUE_TYPE_DIQ: - kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_DIQ]; - break; - case KFD_QUEUE_TYPE_HIQ: - kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; - break; - default: - dev_err(dev->adev->dev, "Invalid queue type %d\n", type); - return false; - } - + 
kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; if (!kq->mqd_mgr) return false; @@ -144,9 +129,8 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, goto err_init_queue; kq->queue->device = dev; - kq->queue->process = kfd_get_process(current); - kq->queue->mqd_mem_obj = kq->mqd_mgr->allocate_mqd(kq->mqd_mgr->dev, + kq->queue->mqd_mem_obj = kq->mqd_mgr->allocate_mqd(kq->mqd_mgr, &kq->queue->properties); if (!kq->queue->mqd_mem_obj) goto err_allocate_mqd; @@ -162,24 +146,11 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, kq->mqd_mgr->load_mqd(kq->mqd_mgr, kq->queue->mqd, kq->queue->pipe, kq->queue->queue, &kq->queue->properties, NULL); - } else { - /* allocate fence for DIQ */ - - retval = kfd_gtt_sa_allocate(dev, sizeof(uint32_t), - &kq->fence_mem_obj); - - if (retval != 0) - goto err_alloc_fence; - - kq->fence_kernel_address = kq->fence_mem_obj->cpu_ptr; - kq->fence_gpu_addr = kq->fence_mem_obj->gpu_addr; } print_queue(kq->queue); return true; -err_alloc_fence: - kq->mqd_mgr->free_mqd(kq->mqd_mgr, kq->queue->mqd, kq->queue->mqd_mem_obj); err_allocate_mqd: uninit_queue(kq->queue); err_init_queue: @@ -209,8 +180,6 @@ static void kq_uninitialize(struct kernel_queue *kq) kq->queue->queue); up_read(&kq->dev->adev->reset_domain->sem); } - else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ) - kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj); kq->mqd_mgr->free_mqd(kq->mqd_mgr, kq->queue->mqd, kq->queue->mqd_mem_obj); @@ -259,7 +228,7 @@ int kq_acquire_packet_buffer(struct kernel_queue *kq, if (packet_size_in_dwords > available_size) { /* * make sure calling functions know - * acquire_packet_buffer() failed + * kq_acquire_packet_buffer() failed */ goto err_no_space; } @@ -340,7 +309,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_node *dev, { struct kernel_queue *kq; - kq = kzalloc(sizeof(*kq), GFP_KERNEL); + kq = kzalloc_obj(*kq); if (!kq) return NULL; @@ -358,34 +327,3 @@ void kernel_queue_uninit(struct 
kernel_queue *kq) kq_uninitialize(kq); kfree(kq); } - -/* FIXME: Can this test be removed? */ -static __attribute__((unused)) void test_kq(struct kfd_node *dev) -{ - struct kernel_queue *kq; - uint32_t *buffer, i; - int retval; - - dev_err(dev->adev->dev, "Starting kernel queue test\n"); - - kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ); - if (unlikely(!kq)) { - dev_err(dev->adev->dev, " Failed to initialize HIQ\n"); - dev_err(dev->adev->dev, "Kernel queue test failed\n"); - return; - } - - retval = kq_acquire_packet_buffer(kq, 5, &buffer); - if (unlikely(retval != 0)) { - dev_err(dev->adev->dev, " Failed to acquire packet buffer\n"); - dev_err(dev->adev->dev, "Kernel queue test failed\n"); - return; - } - for (i = 0; i < 5; i++) - buffer[i] = kq->nop_packet; - kq_submit_packet(kq); - - dev_err(dev->adev->dev, "Ending kernel queue test\n"); -} - - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index d05d199b5e44..964efa325908 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -21,7 +21,6 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ #include <linux/types.h> -#include <linux/hmm.h> #include <linux/dma-direction.h> #include <linux/dma-mapping.h> #include <linux/migrate.h> @@ -39,36 +38,38 @@ #endif #define dev_fmt(fmt) "kfd_migrate: " fmt -static uint64_t -svm_migrate_direct_mapping_addr(struct amdgpu_device *adev, uint64_t addr) +static u64 +svm_migrate_direct_mapping_addr(struct amdgpu_device *adev, u64 addr) { return addr + amdgpu_ttm_domain_start(adev, TTM_PL_VRAM); } static int -svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages, - dma_addr_t *addr, uint64_t *gart_addr, uint64_t flags) +svm_migrate_gart_map(struct amdgpu_ring *ring, + struct amdgpu_ttm_buffer_entity *entity, + u64 npages, + dma_addr_t *addr, u64 *gart_addr, u64 flags) { struct amdgpu_device *adev = ring->adev; struct amdgpu_job *job; unsigned int num_dw, num_bytes; struct dma_fence *fence; - uint64_t src_addr, dst_addr; - uint64_t pte_flags; + u64 src_addr, dst_addr; + u64 pte_flags; void *cpu_addr; int r; - /* use gart window 0 */ - *gart_addr = adev->gmc.gart_start; + *gart_addr = amdgpu_compute_gart_address(&adev->gmc, entity, 0); num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8); - num_bytes = npages * 8; + num_bytes = npages * 8 * AMDGPU_GPU_PAGES_IN_CPU_PAGE; - r = amdgpu_job_alloc_with_ib(adev, &adev->mman.high_pr, + r = amdgpu_job_alloc_with_ib(adev, &entity->base, AMDGPU_FENCE_OWNER_UNDEFINED, num_dw * 4 + num_bytes, AMDGPU_IB_POOL_DELAYED, - &job); + &job, + AMDGPU_KERNEL_JOB_ID_KFD_GART_MAP); if (r) return r; @@ -76,6 +77,7 @@ svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages, src_addr += job->ibs[0].gpu_addr; dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo); + dst_addr += (entity->gart_window_offs[0] >> AMDGPU_GPU_PAGE_SHIFT) * 8; amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, dst_addr, num_bytes, 0); @@ -114,7 +116,7 @@ svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages, * multiple GTT_MAX_PAGES transfer, all sdma operations are serialized, wait for * the last 
sdma finish fence which is returned to check copy memory is done. * - * Context: Process context, takes and releases gtt_window_lock + * Context: Process context * * Return: * 0 - OK, otherwise error code @@ -122,28 +124,31 @@ svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages, static int svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys, - uint64_t *vram, uint64_t npages, + u64 *vram, u64 npages, enum MIGRATION_COPY_DIR direction, struct dma_fence **mfence) { - const uint64_t GTT_MAX_PAGES = AMDGPU_GTT_MAX_TRANSFER_SIZE; + const u64 GTT_MAX_PAGES = AMDGPU_GTT_MAX_TRANSFER_SIZE; struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; - uint64_t gart_s, gart_d; + struct amdgpu_ttm_buffer_entity *entity; + u64 gart_s, gart_d; struct dma_fence *next; - uint64_t size; + u64 size; int r; - mutex_lock(&adev->mman.gtt_window_lock); + entity = &adev->mman.move_entities[0]; + + mutex_lock(&entity->lock); while (npages) { size = min(GTT_MAX_PAGES, npages); if (direction == FROM_VRAM_TO_RAM) { gart_s = svm_migrate_direct_mapping_addr(adev, *vram); - r = svm_migrate_gart_map(ring, size, sys, &gart_d, 0); + r = svm_migrate_gart_map(ring, entity, size, sys, &gart_d, 0); } else if (direction == FROM_RAM_TO_VRAM) { - r = svm_migrate_gart_map(ring, size, sys, &gart_s, + r = svm_migrate_gart_map(ring, entity, size, sys, &gart_s, KFD_IOCTL_SVM_FLAG_GPU_RO); gart_d = svm_migrate_direct_mapping_addr(adev, *vram); } @@ -152,8 +157,9 @@ svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys, goto out_unlock; } - r = amdgpu_copy_buffer(ring, gart_s, gart_d, size * PAGE_SIZE, - NULL, &next, false, true, 0); + r = amdgpu_copy_buffer(adev, entity, + gart_s, gart_d, size * PAGE_SIZE, + NULL, &next, true, 0); if (r) { dev_err(adev->dev, "fail %d to copy memory\n", r); goto out_unlock; @@ -169,7 +175,7 @@ svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys, } out_unlock: - mutex_unlock(&adev->mman.gtt_window_lock); + 
mutex_unlock(&entity->lock); return r; } @@ -217,7 +223,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - zone_device_page_init(page); + zone_device_page_init(page, page_pgmap(page), 0); } static void @@ -260,39 +266,39 @@ static void svm_migrate_put_sys_page(unsigned long addr) put_page(page); } -static unsigned long svm_migrate_unsuccessful_pages(struct migrate_vma *migrate) +static unsigned long svm_migrate_successful_pages(struct migrate_vma *migrate) { - unsigned long upages = 0; + unsigned long mpages = 0; unsigned long i; for (i = 0; i < migrate->npages; i++) { - if (migrate->src[i] & MIGRATE_PFN_VALID && - !(migrate->src[i] & MIGRATE_PFN_MIGRATE)) - upages++; + if (migrate->dst[i] & MIGRATE_PFN_VALID && + migrate->src[i] & MIGRATE_PFN_MIGRATE) + mpages++; } - return upages; + return mpages; } static int svm_migrate_copy_to_vram(struct kfd_node *node, struct svm_range *prange, struct migrate_vma *migrate, struct dma_fence **mfence, - dma_addr_t *scratch, uint64_t ttm_res_offset) + dma_addr_t *scratch, u64 ttm_res_offset) { - uint64_t npages = migrate->npages; + u64 npages = migrate->npages; struct amdgpu_device *adev = node->adev; struct device *dev = adev->dev; struct amdgpu_res_cursor cursor; - uint64_t mpages = 0; + u64 mpages = 0; dma_addr_t *src; - uint64_t *dst; - uint64_t i, j; + u64 *dst; + u64 i, j; int r; pr_debug("svms 0x%p [0x%lx 0x%lx 0x%llx]\n", prange->svms, prange->start, prange->last, ttm_res_offset); src = scratch; - dst = (uint64_t *)(scratch + npages); + dst = (u64 *)(scratch + npages); amdgpu_res_first(prange->ttm_res, ttm_res_offset, npages << PAGE_SHIFT, &cursor); @@ -385,11 +391,11 @@ out_free_vram_pages: static long svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange, - struct vm_area_struct *vma, uint64_t start, - uint64_t end, uint32_t trigger, uint64_t ttm_res_offset) + struct 
vm_area_struct *vma, u64 start, + u64 end, uint32_t trigger, u64 ttm_res_offset) { struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); - uint64_t npages = (end - start) >> PAGE_SHIFT; + u64 npages = (end - start) >> PAGE_SHIFT; struct amdgpu_device *adev = node->adev; struct kfd_process_device *pdd; struct dma_fence *mfence = NULL; @@ -408,7 +414,7 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange, migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev); buf = kvcalloc(npages, - 2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t), + 2 * sizeof(*migrate.src) + sizeof(u64) + sizeof(dma_addr_t), GFP_KERNEL); if (!buf) goto out; @@ -447,9 +453,9 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange, svm_migrate_copy_done(adev, mfence); migrate_vma_finalize(&migrate); - mpages = cpages - svm_migrate_unsuccessful_pages(&migrate); - pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n", - mpages, cpages, migrate.npages); + mpages = svm_migrate_successful_pages(&migrate); + pr_debug("migrated/collected/requested 0x%lx/0x%lx/0x%lx\n", + mpages, cpages, migrate.npages); svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages); @@ -490,7 +496,7 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, { unsigned long addr, start, end; struct vm_area_struct *vma; - uint64_t ttm_res_offset; + u64 ttm_res_offset; struct kfd_node *node; unsigned long mpages = 0; long r = 0; @@ -567,8 +573,9 @@ out: return r < 0 ? 
r : 0; } -static void svm_migrate_page_free(struct page *page) +static void svm_migrate_folio_free(struct folio *folio) { + struct page *page = &folio->page; struct svm_range_bo *svm_bo = page->zone_device_data; if (svm_bo) { @@ -580,14 +587,14 @@ static void svm_migrate_page_free(struct page *page) static int svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, struct migrate_vma *migrate, struct dma_fence **mfence, - dma_addr_t *scratch, uint64_t npages) + dma_addr_t *scratch, u64 npages) { struct device *dev = adev->dev; - uint64_t *src; + u64 *src; dma_addr_t *dst; struct page *dpage; - uint64_t i = 0, j; - uint64_t addr; + u64 i = 0, j; + u64 addr; int r = 0; pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start, @@ -595,7 +602,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, addr = migrate->start; - src = (uint64_t *)(scratch + npages); + src = (u64 *)(scratch + npages); dst = scratch; for (i = 0, j = 0; i < npages; i++, addr += PAGE_SIZE) { @@ -683,12 +690,11 @@ out_oom: */ static long svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange, - struct vm_area_struct *vma, uint64_t start, uint64_t end, + struct vm_area_struct *vma, u64 start, u64 end, uint32_t trigger, struct page *fault_page) { struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); - uint64_t npages = (end - start) >> PAGE_SHIFT; - unsigned long upages = npages; + u64 npages = (end - start) >> PAGE_SHIFT; unsigned long cpages = 0; unsigned long mpages = 0; struct amdgpu_device *adev = node->adev; @@ -710,7 +716,7 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange, migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; buf = kvcalloc(npages, - 2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t), + 2 * sizeof(*migrate.src) + sizeof(u64) + sizeof(dma_addr_t), GFP_KERNEL); if (!buf) goto out; @@ -736,7 +742,6 @@ svm_migrate_vma_to_ram(struct kfd_node *node, 
struct svm_range *prange, if (!cpages) { pr_debug("failed collect migrate device pages [0x%lx 0x%lx]\n", prange->start, prange->last); - upages = svm_migrate_unsuccessful_pages(&migrate); goto out_free; } if (cpages != npages) @@ -749,9 +754,9 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange, scratch, npages); migrate_vma_pages(&migrate); - upages = svm_migrate_unsuccessful_pages(&migrate); - pr_debug("unsuccessful/cpages/npages 0x%lx/0x%lx/0x%lx\n", - upages, cpages, migrate.npages); + mpages = svm_migrate_successful_pages(&migrate); + pr_debug("migrated/collected/requested 0x%lx/0x%lx/0x%lx\n", + mpages, cpages, migrate.npages); svm_migrate_copy_done(adev, mfence); migrate_vma_finalize(&migrate); @@ -764,8 +769,7 @@ out_free: start >> PAGE_SHIFT, end >> PAGE_SHIFT, node->id, 0, trigger, r); out: - if (!r && cpages) { - mpages = cpages - upages; + if (!r && mpages) { pdd = svm_range_get_pdd_by_node(prange, node); if (pdd) WRITE_ONCE(pdd->page_out, pdd->page_out + mpages); @@ -848,6 +852,9 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, } if (r >= 0) { + WARN_ONCE(prange->vram_pages < mpages, + "Recorded vram pages(0x%llx) should not be less than migration pages(0x%lx).", + prange->vram_pages, mpages); prange->vram_pages -= mpages; /* prange does not have vram page set its actual_loc to system @@ -1008,7 +1015,7 @@ out_mmput: } static const struct dev_pagemap_ops svm_migrate_pgmap_ops = { - .page_free = svm_migrate_page_free, + .folio_free = svm_migrate_folio_free, .migrate_to_ram = svm_migrate_to_ram, }; @@ -1027,7 +1034,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev) if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1)) return -EINVAL; - if (adev->flags & AMD_IS_APU) + if (adev->apu_prefer_gtt) return 0; pgmap = &kfddev->pgmap; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h index 2eebf67f9c2c..2b7fd442d29c 100644 --- 
a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h @@ -31,7 +31,6 @@ #include <linux/list.h> #include <linux/mutex.h> #include <linux/sched/mm.h> -#include <linux/hmm.h> #include "kfd_priv.h" #include "kfd_svm.h" diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index aee2212e52f6..33aa23450b3f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -78,8 +78,8 @@ err_ioctl: static void kfd_exit(void) { kfd_cleanup_processes(); - kfd_debugfs_fini(); kfd_process_destroy_wq(); + kfd_debugfs_fini(); kfd_procfs_shutdown(); kfd_topology_shutdown(); kfd_chardev_exit(); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index d9ae854b6908..723b725d20b8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -46,28 +46,30 @@ int pipe_priority_map[] = { KFD_PIPE_PRIORITY_CS_HIGH }; -struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_node *dev, struct queue_properties *q) +struct kfd_mem_obj *allocate_hiq_mqd(struct mqd_manager *mm, struct queue_properties *q) { struct kfd_mem_obj *mqd_mem_obj; + struct kfd_node *dev = mm->dev; - mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + mqd_mem_obj = kzalloc_obj(struct kfd_mem_obj); if (!mqd_mem_obj) return NULL; - mqd_mem_obj->gtt_mem = dev->dqm->hiq_sdma_mqd.gtt_mem; + mqd_mem_obj->mem = dev->dqm->hiq_sdma_mqd.mem; mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr; mqd_mem_obj->cpu_ptr = dev->dqm->hiq_sdma_mqd.cpu_ptr; return mqd_mem_obj; } -struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_node *dev, +struct kfd_mem_obj *allocate_sdma_mqd(struct mqd_manager *mm, struct queue_properties *q) { struct kfd_mem_obj *mqd_mem_obj; + struct kfd_node *dev = mm->dev; uint64_t offset; - mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + mqd_mem_obj = kzalloc_obj(struct 
kfd_mem_obj); if (!mqd_mem_obj) return NULL; @@ -79,7 +81,7 @@ struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_node *dev, offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size * NUM_XCC(dev->xcc_mask); - mqd_mem_obj->gtt_mem = (void *)((uint64_t)dev->dqm->hiq_sdma_mqd.gtt_mem + mqd_mem_obj->mem = (void *)((uint64_t)dev->dqm->hiq_sdma_mqd.mem + offset); mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr + offset; mqd_mem_obj->cpu_ptr = (uint32_t *)((uint64_t) @@ -91,7 +93,7 @@ struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_node *dev, void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { - WARN_ON(!mqd_mem_obj->gtt_mem); + WARN_ON(!mqd_mem_obj->mem); kfree(mqd_mem_obj); } @@ -224,8 +226,8 @@ int kfd_destroy_mqd_cp(struct mqd_manager *mm, void *mqd, void kfd_free_mqd_cp(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { - if (mqd_mem_obj->gtt_mem) { - amdgpu_amdkfd_free_gtt_mem(mm->dev->adev, &mqd_mem_obj->gtt_mem); + if (mqd_mem_obj->mem) { + amdgpu_amdkfd_free_kernel_mem(mm->dev->adev, &mqd_mem_obj->mem); kfree(mqd_mem_obj); } else { kfd_gtt_sa_free(mm->dev, mqd_mem_obj); @@ -280,8 +282,8 @@ void kfd_get_hiq_xcc_mqd(struct kfd_node *dev, struct kfd_mem_obj *mqd_mem_obj, offset = kfd_hiq_mqd_stride(dev) * virtual_xcc_id; - mqd_mem_obj->gtt_mem = (virtual_xcc_id == 0) ? - dev->dqm->hiq_sdma_mqd.gtt_mem : NULL; + mqd_mem_obj->mem = (virtual_xcc_id == 0) ? 
+ dev->dqm->hiq_sdma_mqd.mem : NULL; mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr + offset; mqd_mem_obj->cpu_ptr = (uint32_t *)((uintptr_t) dev->dqm->hiq_sdma_mqd.cpu_ptr + offset); @@ -290,6 +292,9 @@ void kfd_get_hiq_xcc_mqd(struct kfd_node *dev, struct kfd_mem_obj *mqd_mem_obj, uint64_t kfd_mqd_stride(struct mqd_manager *mm, struct queue_properties *q) { + if (KFD_GC_VERSION(mm->dev) >= IP_VERSION(11, 0, 0)) + return AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size); + return mm->mqd_size; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index 17cc1f25c8d0..06ca6235ff1b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -68,7 +68,7 @@ */ extern int pipe_priority_map[]; struct mqd_manager { - struct kfd_mem_obj* (*allocate_mqd)(struct kfd_node *kfd, + struct kfd_mem_obj* (*allocate_mqd)(struct mqd_manager *mm, struct queue_properties *q); void (*init_mqd)(struct mqd_manager *mm, void **mqd, @@ -102,7 +102,8 @@ struct mqd_manager { u32 *ctl_stack_used_size, u32 *save_area_used_size); - void (*get_checkpoint_info)(struct mqd_manager *mm, void *mqd, uint32_t *ctl_stack_size); + int (*get_checkpoint_info)(struct mqd_manager *mm, void *mqd, + uint32_t *ctl_stack_size); void (*checkpoint_mqd)(struct mqd_manager *mm, void *mqd, @@ -153,10 +154,10 @@ struct mqd_user_context_save_area_header { uint32_t wave_state_size; }; -struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_node *dev, +struct kfd_mem_obj *allocate_hiq_mqd(struct mqd_manager *mm, struct queue_properties *q); -struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_node *dev, +struct kfd_mem_obj *allocate_sdma_mqd(struct mqd_manager *mm, struct queue_properties *q); void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 05f3ac2eaef9..bb70e57ae4d5 
100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -70,12 +70,12 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, static void set_priority(struct cik_mqd *m, struct queue_properties *q) { m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; - m->cp_hqd_queue_priority = q->priority; } -static struct kfd_mem_obj *allocate_mqd(struct kfd_node *kfd, +static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm, struct queue_properties *q) { + struct kfd_node *kfd = mm->dev; struct kfd_mem_obj *mqd_mem_obj; if (kfd_gtt_sa_allocate(kfd, sizeof(struct cik_mqd), @@ -388,7 +388,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) return NULL; - mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + mqd = kzalloc_obj(*mqd); if (!mqd) return NULL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c index 1695dd78ede8..77fb41e2486a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c @@ -70,12 +70,12 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, static void set_priority(struct v10_compute_mqd *m, struct queue_properties *q) { m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; - m->cp_hqd_queue_priority = q->priority; } -static struct kfd_mem_obj *allocate_mqd(struct kfd_node *kfd, +static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm, struct queue_properties *q) { + struct kfd_node *kfd = mm->dev; struct kfd_mem_obj *mqd_mem_obj; if (kfd_gtt_sa_allocate(kfd, sizeof(struct v10_compute_mqd), @@ -450,7 +450,7 @@ struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type, if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) return NULL; - mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + mqd = kzalloc_obj(*mqd); if (!mqd) return NULL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c index 3c0ae28c5923..a1e3cf2384dd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c @@ -96,25 +96,16 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, static void set_priority(struct v11_compute_mqd *m, struct queue_properties *q) { m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; - m->cp_hqd_queue_priority = q->priority; } -static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node, +static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm, struct queue_properties *q) { + u32 mqd_size = AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size); + struct kfd_node *node = mm->dev; struct kfd_mem_obj *mqd_mem_obj; - int size; - - /* - * MES write to areas beyond MQD size. So allocate - * 1 PAGE_SIZE memory for MQD is MES is enabled. - */ - if (node->kfd->shared_resources.enable_mes) - size = PAGE_SIZE; - else - size = sizeof(struct v11_compute_mqd); - if (kfd_gtt_sa_allocate(node, size, &mqd_mem_obj)) + if (kfd_gtt_sa_allocate(node, mqd_size, &mqd_mem_obj)) return NULL; return mqd_mem_obj; @@ -126,18 +117,13 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, { uint64_t addr; struct v11_compute_mqd *m; - int size; + u32 mqd_size = AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size); uint32_t wa_mask = q->is_dbg_wa ? 
0xffff : 0xffffffff; m = (struct v11_compute_mqd *) mqd_mem_obj->cpu_ptr; addr = mqd_mem_obj->gpu_addr; - if (mm->dev->kfd->shared_resources.enable_mes) - size = PAGE_SIZE; - else - size = sizeof(struct v11_compute_mqd); - - memset(m, 0, size); + memset(m, 0, mqd_size); m->header = 0xC0310800; m->compute_pipelinestat_enable = 1; @@ -478,7 +464,7 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) return NULL; - mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + mqd = kzalloc_obj(*mqd); if (!mqd) return NULL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c index 565858b9044d..b3e122d7876e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c @@ -77,19 +77,16 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, static void set_priority(struct v12_compute_mqd *m, struct queue_properties *q) { m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; - m->cp_hqd_queue_priority = q->priority; } -static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node, +static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm, struct queue_properties *q) { + u32 mqd_size = AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size); + struct kfd_node *node = mm->dev; struct kfd_mem_obj *mqd_mem_obj; - /* - * Allocate one PAGE_SIZE memory for MQD as MES writes to areas beyond - * struct MQD size. 
- */ - if (kfd_gtt_sa_allocate(node, PAGE_SIZE, &mqd_mem_obj)) + if (kfd_gtt_sa_allocate(node, mqd_size, &mqd_mem_obj)) return NULL; return mqd_mem_obj; @@ -101,11 +98,12 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, { uint64_t addr; struct v12_compute_mqd *m; + u32 mqd_size = AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size); m = (struct v12_compute_mqd *) mqd_mem_obj->cpu_ptr; addr = mqd_mem_obj->gpu_addr; - memset(m, 0, PAGE_SIZE); + memset(m, 0, mqd_size); m->header = 0xC0310800; m->compute_pipelinestat_enable = 1; @@ -351,6 +349,12 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; + /* Allow context switch so we don't cross-process starve with a massive + * command buffer of long-running SDMA commands + * sdmax_rlcx_ib_cntl represent SDMA_QUEUE0_IB_CNTL register + */ + m->sdmax_rlcx_ib_cntl |= SDMA0_QUEUE0_IB_CNTL__SWITCH_INSIDE_IB_MASK; + q->is_active = QUEUE_IS_ACTIVE(*q); } @@ -380,7 +384,7 @@ struct mqd_manager *mqd_manager_init_v12(enum KFD_MQD_TYPE type, if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) return NULL; - mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + mqd = kzalloc_obj(*mqd); if (!mqd) return NULL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c new file mode 100644 index 000000000000..c90c0d99b1e3 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c @@ -0,0 +1,724 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "v12_structs.h" +#include "gc/gc_12_1_0_sh_mask.h" +#include "amdgpu_amdkfd.h" +#include "kfd_device_queue_manager.h" + +static inline struct v12_1_compute_mqd *get_mqd(void *mqd) +{ + return (struct v12_1_compute_mqd *)mqd; +} + +static inline struct v12_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct v12_sdma_mqd *)mqd; +} + +static void mqd_symmetrically_map_cu_mask_v12_1(struct mqd_manager *mm, + const uint32_t *cu_mask, uint32_t cu_mask_count, + uint32_t *se_mask, uint32_t inst) +{ + struct amdgpu_cu_info *cu_info = &mm->dev->adev->gfx.cu_info; + struct amdgpu_gfx_config *gfx_info = &mm->dev->adev->gfx.config; + uint32_t cu_per_sh[2][2] = {0}; + uint32_t en_mask = 0x3; + int i, se, sh, cu, cu_inc = 0; + uint32_t cu_active_per_node; + int inc = NUM_XCC(mm->dev->xcc_mask); + int xcc_inst = inst + ffs(mm->dev->xcc_mask) - 1; + + cu_active_per_node = cu_info->number / mm->dev->kfd->num_nodes; + if (cu_mask_count > cu_active_per_node) + cu_mask_count = cu_active_per_node; + + /* + * Count active CUs per SE/SH. + */ + for (se = 0; se < gfx_info->max_shader_engines; se++) + for (sh = 0; sh < gfx_info->max_sh_per_se; sh++) + cu_per_sh[se][sh] = hweight32( + cu_info->bitmap[xcc_inst][se][sh]); + + /* Symmetrically map cu_mask to all SEs & SHs: + * For GFX 12.1.0, the following code only looks at a + * subset of the cu_mask corresponding to the inst parameter. + * If we have n XCCs under one GPU node + * cu_mask[0] bit0 -> XCC0 se_mask[0] bit0 (XCC0,SE0,SH0,CU0) + * cu_mask[0] bit1 -> XCC1 se_mask[0] bit0 (XCC1,SE0,SH0,CU0) + * .. + * cu_mask[0] bitn -> XCCn se_mask[0] bit0 (XCCn,SE0,SH0,CU0) + * cu_mask[0] bit n+1 -> XCC0 se_mask[1] bit0 (XCC0,SE1,SH0,CU0) + * + * For example, if there are 6 XCCs under 1 KFD node, this code + * running for each inst, will look at the bits as: + * inst, inst + 6, inst + 12... 
+ * + * First ensure all CUs are disabled, then enable user specified CUs. + */ + for (i = 0; i < gfx_info->max_shader_engines; i++) + se_mask[i] = 0; + + i = inst; + for (cu = 0; cu < 16; cu++) { + for (sh = 0; sh < gfx_info->max_sh_per_se; sh++) { + for (se = 0; se < gfx_info->max_shader_engines; se++) { + if (cu_per_sh[se][sh] > cu) { + if (cu_mask[i / 32] & (1U << (i % 32))) { + if (cu == 8 && sh == 0) + se_mask[se] |= en_mask << 30; + else + se_mask[se] |= en_mask << (cu_inc + sh * 16); + } + i += inc; + if (i >= cu_mask_count) + return; + } + } + } + cu_inc += 2; + } +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct mqd_update_info *minfo, uint32_t inst) +{ + struct v12_1_compute_mqd *m; + uint32_t se_mask[2] = {0}; + + if (!minfo || !minfo->cu_mask.ptr) + return; + + mqd_symmetrically_map_cu_mask_v12_1(mm, + minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask, inst); + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; + m->compute_static_thread_mgmt_se1 = se_mask[1]; + + pr_debug("update cu mask to %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1); +} + +static void set_priority(struct v12_1_compute_mqd *m, struct queue_properties *q) +{ + m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; +} + +static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm, + struct queue_properties *q) +{ + u32 mqd_size = AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size); + struct kfd_node *node = mm->dev; + struct kfd_mem_obj *mqd_mem_obj; + + if (q->type == KFD_QUEUE_TYPE_COMPUTE) + mqd_size *= NUM_XCC(node->xcc_mask); + + if (kfd_gtt_sa_allocate(node, mqd_size, &mqd_mem_obj)) + return NULL; + + return mqd_mem_obj; +} + +static void init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct v12_1_compute_mqd *m; + u32 mqd_size = AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size); + + m = (struct v12_1_compute_mqd 
*) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; + + memset(m, 0, mqd_size); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se8 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | + 0x63 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; + + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + + /* Set cp_hqd_hq_status0.c_queue_debug_en to 1 to have the CP set up the + * DISPATCH_PTR. 
This is required for the kfd debugger + */ + m->cp_hqd_hq_status0 = 1 << 14; + + if (amdgpu_amdkfd_have_atomics_support(mm->dev->adev)) + m->cp_hqd_hq_status0 |= 1 << 29; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_aql_control = + 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; + } + + if (mm->dev->kfd->cwsr_enabled) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + mm->update_mqd(mm, m, q, NULL); +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + int r = 0; + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); + + r = mm->dev->kfd2kgd->hqd_load(mm->dev->adev, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, 0, mms, 0); + return r; +} + +static void update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + struct mqd_update_info *minfo) +{ + struct v12_1_compute_mqd *m; + + m = get_mqd(mqd); + + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= + ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + + if (q->metadata_queue_size) { + /* On GC 12.1 is 64 DWs which is 4 times size of AQL packet */ + if (q->metadata_queue_size == q->queue_size * 4) { + /* + * User application allocates main queue ring and metadata queue ring + * with a single allocation. metadata queue ring starts after main + * queue ring. 
+ */ + m->cp_hqd_kd_base = + lower_32_bits((q->queue_address + q->queue_size) >> 8); + m->cp_hqd_kd_base_hi = + upper_32_bits((q->queue_address + q->queue_size) >> 8); + + m->cp_hqd_kd_cntl |= CP_HQD_KD_CNTL__KD_FETCHER_ENABLE_MASK; + /* KD_SIZE = 2 for metadata packet = 64 DWs */ + m->cp_hqd_kd_cntl |= 2 << CP_HQD_KD_CNTL__KD_SIZE__SHIFT; + } else { + pr_warn("Invalid metadata ring size, metadata queue will be ignored\n"); + } + } + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = + q->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + m->cp_hqd_ib_control = 1 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT; + + /* + * HW does not clamp this field correctly. Maximum EOP queue size + * is constrained by per-SE EOP done signal count, which is 8-bit. + * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit + * more than (EOP entry count - 1) so a queue size of 0x800 dwords + * is safe, giving a maximum field value of 0xA. 
+ */ + m->cp_hqd_eop_control = min(0xA, + ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); + m->cp_hqd_eop_base_addr_lo = + lower_32_bits(q->eop_ring_buffer_address >> 8); + m->cp_hqd_eop_base_addr_hi = + upper_32_bits(q->eop_ring_buffer_address >> 8); + + m->cp_hqd_iq_timer = 0; + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + /* GC 10 removed WPP_CLAMP from PQ Control */ + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | + 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT; + m->cp_hqd_pq_doorbell_control |= + 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; + } + if (mm->dev->kfd->cwsr_enabled) + m->cp_hqd_ctx_save_control = 0; + + set_priority(m, q); + + q->is_active = QUEUE_IS_ACTIVE(*q); +} + +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) +{ + return false; +} + +static int get_wave_state(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + void __user *ctl_stack, + u32 *ctl_stack_used_size, + u32 *save_area_used_size) +{ + struct v12_1_compute_mqd *m; + struct mqd_user_context_save_area_header header; + + m = get_mqd(mqd); + + /* Control stack is written backwards, while workgroup context data + * is written forwards. Both starts from m->cp_hqd_cntl_stack_size. + * Current position is at m->cp_hqd_cntl_stack_offset and + * m->cp_hqd_wg_state_offset, respectively. 
+ */ + *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - + m->cp_hqd_cntl_stack_offset; + *save_area_used_size = m->cp_hqd_wg_state_offset - + m->cp_hqd_cntl_stack_size; + + /* Control stack is not copied to user mode for GFXv12 because + * it's part of the context save area that is already + * accessible to user mode + */ + header.control_stack_size = *ctl_stack_used_size; + header.wave_state_size = *save_area_used_size; + + header.wave_state_offset = m->cp_hqd_wg_state_offset; + header.control_stack_offset = m->cp_hqd_cntl_stack_offset; + + if (copy_to_user(ctl_stack, &header, sizeof(header))) + return -EFAULT; + + return 0; +} + +static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v12_1_compute_mqd *m; + + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + + m = get_mqd(*mqd); + + m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | + 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; +} + +static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v12_sdma_mqd *m; + + m = (struct v12_sdma_mqd *) mqd_mem_obj->cpu_ptr; + + memset(m, 0, PAGE_SIZE); + + *mqd = m; + if (gart_addr) + *gart_addr = mqd_mem_obj->gpu_addr; + + mm->update_mqd(mm, m, q, NULL); +} + +#define SDMA_RLC_DUMMY_DEFAULT 0xf + +static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + struct mqd_update_info *minfo) +{ + struct v12_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) + << SDMA0_SDMA_QUEUE0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_SDMA_QUEUE0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_SDMA_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_SDMA_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT | + 1 << SDMA0_SDMA_QUEUE0_RB_CNTL__MCU_WPTR_POLL_ENABLE__SHIFT; + + 
m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->sdmax_rlcx_rb_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + m->sdmax_rlcx_doorbell_offset = + q->doorbell_off << SDMA0_SDMA_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT; + + m->sdmax_rlcx_sched_cntl = (amdgpu_sdma_phase_quantum + << SDMA0_SDMA_QUEUE0_SCHEDULE_CNTL__CONTEXT_QUANTUM__SHIFT) + & SDMA0_SDMA_QUEUE0_SCHEDULE_CNTL__CONTEXT_QUANTUM_MASK; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + + m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; + + /* Allow context switch so we don't cross-process starve with a massive + * command buffer of long-running SDMA commands + * sdmax_rlcx_ib_cntl represent SDMA_QUEUE0_IB_CNTL register + */ + m->sdmax_rlcx_ib_cntl |= SDMA0_SDMA_QUEUE0_IB_CNTL__SWITCH_INSIDE_IB_MASK; + + q->is_active = QUEUE_IS_ACTIVE(*q); +} + +static void get_xcc_mqd(struct kfd_mem_obj *mqd_mem_obj, + struct kfd_mem_obj *xcc_mqd_mem_obj, + uint64_t offset) +{ + xcc_mqd_mem_obj->mem = (offset == 0) ? 
+ mqd_mem_obj->mem : NULL; + xcc_mqd_mem_obj->gpu_addr = mqd_mem_obj->gpu_addr + offset; + xcc_mqd_mem_obj->cpu_ptr = (uint32_t *)((uintptr_t)mqd_mem_obj->cpu_ptr + + offset); +} + +static void init_mqd_v12_1(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v12_1_compute_mqd *m; + int xcc = 0; + struct kfd_mem_obj xcc_mqd_mem_obj; + uint64_t xcc_gart_addr = 0; + uint64_t xcc_ctx_save_restore_area_address; + uint64_t offset = mm->mqd_stride(mm, q); + uint32_t local_xcc_start = mm->dev->dqm->current_logical_xcc_start++; + + memset(&xcc_mqd_mem_obj, 0x0, sizeof(struct kfd_mem_obj)); + for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) { + get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset*xcc); + + init_mqd(mm, (void **)&m, &xcc_mqd_mem_obj, &xcc_gart_addr, q); + + m->cp_mqd_stride_size = offset; + + /* + * Update the CWSR address for each XCC if CWSR is enabled + * and CWSR area is allocated in thunk + */ + if (mm->dev->kfd->cwsr_enabled && + q->ctx_save_restore_area_address) { + xcc_ctx_save_restore_area_address = + q->ctx_save_restore_area_address + + (xcc * q->ctx_save_restore_area_size); + + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(xcc_ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(xcc_ctx_save_restore_area_address); + } + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->compute_tg_chunk_size = 1; + m->compute_current_logical_xcc_id = + (local_xcc_start + xcc) % + NUM_XCC(mm->dev->xcc_mask); + } else { + /* PM4 Queue */ + m->compute_current_logical_xcc_id = 0; + m->compute_tg_chunk_size = 0; + m->pm4_target_xcc_in_xcp = q->pm4_target_xcc; + } + + if (xcc == 0) { + /* Set the MQD pointer and gart address to XCC0 MQD */ + *mqd = m; + *gart_addr = xcc_gart_addr; + } + } +} + +static void update_mqd_v12_1(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, struct mqd_update_info *minfo) +{ + struct v12_1_compute_mqd *m; + 
int xcc = 0; + uint64_t size = mm->mqd_stride(mm, q); + + for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) { + m = get_mqd(mqd + size * xcc); + update_mqd(mm, m, q, minfo); + + update_cu_mask(mm, m, minfo, xcc); + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->compute_tg_chunk_size = 1; + } else { + /* PM4 Queue */ + m->compute_current_logical_xcc_id = 0; + m->compute_tg_chunk_size = 0; + m->pm4_target_xcc_in_xcp = q->pm4_target_xcc; + } + } +} + +static int destroy_mqd_v12_1(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, unsigned int timeout, + uint32_t pipe_id, uint32_t queue_id) +{ + uint32_t xcc_mask = mm->dev->xcc_mask; + int xcc_id, err, inst = 0; + void *xcc_mqd; + struct v12_1_compute_mqd *m; + uint64_t mqd_offset; + + m = get_mqd(mqd); + mqd_offset = m->cp_mqd_stride_size; + + for_each_inst(xcc_id, xcc_mask) { + xcc_mqd = mqd + mqd_offset * inst; + err = mm->dev->kfd2kgd->hqd_destroy(mm->dev->adev, xcc_mqd, + type, timeout, pipe_id, + queue_id, xcc_id); + if (err) { + pr_debug("Destroy MQD failed for xcc: %d\n", inst); + break; + } + ++inst; + } + + return err; +} + +static int load_mqd_v12_1(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); + uint32_t xcc_mask = mm->dev->xcc_mask; + int xcc_id, err, inst = 0; + void *xcc_mqd; + uint64_t mqd_stride_size = mm->mqd_stride(mm, p); + + for_each_inst(xcc_id, xcc_mask) { + xcc_mqd = mqd + mqd_stride_size * inst; + err = mm->dev->kfd2kgd->hqd_load( + mm->dev->adev, xcc_mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, wptr_shift, 0, mms, + xcc_id); + if (err) { + pr_debug("Load MQD failed for xcc: %d\n", inst); + break; + } + ++inst; + } + + return err; +} + +static int get_wave_state_v12_1(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + void __user *ctl_stack, + u32 *ctl_stack_used_size, + u32 *save_area_used_size) +{ + int xcc, err = 0; + void *xcc_mqd; + void __user *xcc_ctl_stack; + uint64_t mqd_stride_size = mm->mqd_stride(mm, q); + u32 tmp_ctl_stack_used_size = 0, tmp_save_area_used_size = 0; + + for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) { + xcc_mqd = mqd + mqd_stride_size * xcc; + xcc_ctl_stack = (void __user *)((uintptr_t)ctl_stack + + q->ctx_save_restore_area_size * xcc); + + err = get_wave_state(mm, xcc_mqd, q, xcc_ctl_stack, + &tmp_ctl_stack_used_size, + &tmp_save_area_used_size); + if (err) + break; + + /* + * Set the ctl_stack_used_size and save_area_used_size to + * ctl_stack_used_size and save_area_used_size of XCC 0 when + * passing the info to user-space. + * For multi XCC, user-space would have to look at the header + * info of each Control stack area to determine the control + * stack size and save area used. 
+ */ + if (xcc == 0) { + *ctl_stack_used_size = tmp_ctl_stack_used_size; + *save_area_used_size = tmp_save_area_used_size; + } + } + + return err; +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v12_1_compute_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v12_sdma_mqd), false); + return 0; +} + +#endif + +struct mqd_manager *mqd_manager_init_v12_1(enum KFD_MQD_TYPE type, + struct kfd_node *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc_obj(*mqd); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd_v12_1; + mqd->free_mqd = kfd_free_mqd_cp; + mqd->load_mqd = load_mqd_v12_1; + mqd->update_mqd = update_mqd_v12_1; + mqd->destroy_mqd = destroy_mqd_v12_1; + mqd->is_occupied = kfd_is_occupied_cp; + mqd->mqd_size = sizeof(struct v12_1_compute_mqd); + mqd->get_wave_state = get_wave_state_v12_1; + mqd->mqd_stride = kfd_mqd_stride; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_HIQ: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = kfd_hiq_load_mqd_kiq; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; + mqd->mqd_size = sizeof(struct v12_1_compute_mqd); + mqd->mqd_stride = kfd_mqd_stride; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + mqd->check_preemption_failed = check_preemption_failed; + 
pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_DIQ: + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = kfd_free_mqd_cp; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; + mqd->mqd_size = sizeof(struct v12_1_compute_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd_sdma; + mqd->free_mqd = kfd_free_mqd_cp; + mqd->load_mqd = kfd_load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = kfd_destroy_mqd_sdma; + mqd->is_occupied = kfd_is_occupied_sdma; + mqd->mqd_size = sizeof(struct v12_sdma_mqd); + mqd->mqd_stride = kfd_mqd_stride; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 3014925d95ff..f6d9d81003dc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -42,9 +42,16 @@ static uint64_t mqd_stride_v9(struct mqd_manager *mm, struct queue_properties *q) { if (mm->dev->kfd->cwsr_enabled && - q->type == KFD_QUEUE_TYPE_COMPUTE) - return ALIGN(q->ctl_stack_size, PAGE_SIZE) + - ALIGN(sizeof(struct v9_mqd), PAGE_SIZE); + q->type == KFD_QUEUE_TYPE_COMPUTE) { + + /* On gfxv9, the MQD resides in the first 4K page, + * followed by the control stack. Align both to + * AMDGPU_GPU_PAGE_SIZE to maintain the required 4K boundary. 
+ */ + + return ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) + + ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), PAGE_SIZE); + } return mm->mqd_size; } @@ -106,13 +113,27 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, static void set_priority(struct v9_mqd *m, struct queue_properties *q) { m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; - m->cp_hqd_queue_priority = q->priority; } -static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node, +static bool mqd_on_vram(struct amdgpu_device *adev) +{ + if (adev->apu_prefer_gtt) + return false; + + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { + case IP_VERSION(9, 4, 3): + case IP_VERSION(9, 5, 0): + return true; + default: + return false; + } +} + +static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm, struct queue_properties *q) { int retval; + struct kfd_node *node = mm->dev; struct kfd_mem_obj *mqd_mem_obj = NULL; /* For V9 only, due to a HW bug, the control stack of a user mode @@ -132,14 +153,16 @@ static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node, * amdgpu memory functions to do so. */ if (node->kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { - mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + mqd_mem_obj = kzalloc_obj(struct kfd_mem_obj); if (!mqd_mem_obj) return NULL; - retval = amdgpu_amdkfd_alloc_gtt_mem(node->adev, - (ALIGN(q->ctl_stack_size, PAGE_SIZE) + - ALIGN(sizeof(struct v9_mqd), PAGE_SIZE)) * + retval = amdgpu_amdkfd_alloc_kernel_mem(node->adev, + (ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) + + ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), PAGE_SIZE)) * NUM_XCC(node->xcc_mask), - &(mqd_mem_obj->gtt_mem), + mqd_on_vram(node->adev) ? 
AMDGPU_GEM_DOMAIN_VRAM : + AMDGPU_GEM_DOMAIN_GTT, + &(mqd_mem_obj->mem), &(mqd_mem_obj->gpu_addr), (void *)&(mqd_mem_obj->cpu_ptr), true); @@ -341,11 +364,15 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, { struct v9_mqd *m; struct kfd_context_save_area_header header; + u32 cntl_stack_size; + u32 cntl_stack_offset; /* Control stack is located one page after MQD. */ - void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); + void *mqd_ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE); m = get_mqd(mqd); + cntl_stack_size = min_t(u32, m->cp_hqd_cntl_stack_size, q->ctl_stack_size); + cntl_stack_offset = min_t(u32, m->cp_hqd_cntl_stack_offset, cntl_stack_size); *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - m->cp_hqd_cntl_stack_offset; @@ -361,26 +388,30 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, if (copy_to_user(ctl_stack, &header, sizeof(header.wave_state))) return -EFAULT; - if (copy_to_user(ctl_stack + m->cp_hqd_cntl_stack_offset, - mqd_ctl_stack + m->cp_hqd_cntl_stack_offset, - *ctl_stack_used_size)) + *ctl_stack_used_size = cntl_stack_size - cntl_stack_offset; + + if (copy_to_user(ctl_stack + cntl_stack_offset, mqd_ctl_stack + cntl_stack_offset, + *ctl_stack_used_size)) return -EFAULT; return 0; } -static void get_checkpoint_info(struct mqd_manager *mm, void *mqd, u32 *ctl_stack_size) +static int get_checkpoint_info(struct mqd_manager *mm, void *mqd, u32 *ctl_stack_size) { struct v9_mqd *m = get_mqd(mqd); - *ctl_stack_size = m->cp_hqd_cntl_stack_size; + if (check_mul_overflow(m->cp_hqd_cntl_stack_size, NUM_XCC(mm->dev->xcc_mask), ctl_stack_size)) + return -EINVAL; + + return 0; } static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, void *ctl_stack_dst) { struct v9_mqd *m; /* Control stack is located one page after MQD. 
*/ - void *ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); + void *ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE); m = get_mqd(mqd); @@ -388,6 +419,24 @@ static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, voi memcpy(ctl_stack_dst, ctl_stack, m->cp_hqd_cntl_stack_size); } +static void checkpoint_mqd_v9_4_3(struct mqd_manager *mm, + void *mqd, + void *mqd_dst, + void *ctl_stack_dst) +{ + struct v9_mqd *m; + int xcc; + uint64_t size = get_mqd(mqd)->cp_mqd_stride_size; + + for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) { + m = get_mqd(mqd + size * xcc); + + checkpoint_mqd(mm, m, + (uint8_t *)mqd_dst + sizeof(*m) * xcc, + (uint8_t *)ctl_stack_dst + m->cp_hqd_cntl_stack_size * xcc); + } +} + static void restore_mqd(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *qp, @@ -408,7 +457,7 @@ static void restore_mqd(struct mqd_manager *mm, void **mqd, *gart_addr = addr; /* Control stack is located one page after MQD. 
*/ - ctl_stack = (void *)((uintptr_t)*mqd + PAGE_SIZE); + ctl_stack = (void *)((uintptr_t)*mqd + AMDGPU_GPU_PAGE_SIZE); memcpy(ctl_stack, ctl_stack_src, ctl_stack_size); m->cp_hqd_pq_doorbell_control = @@ -495,6 +544,10 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdma_engine_id = q->sdma_engine_id; m->sdma_queue_id = q->sdma_queue_id; m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; + /* Allow context switch so we don't cross-process starve with a massive + * command buffer of long-running SDMA commands + */ + m->sdmax_rlcx_ib_cntl |= SDMA0_GFX_IB_CNTL__SWITCH_INSIDE_IB_MASK; q->is_active = QUEUE_IS_ACTIVE(*q); } @@ -554,7 +607,7 @@ static void init_mqd_hiq_v9_4_3(struct mqd_manager *mm, void **mqd, m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; - if (amdgpu_sriov_vf(mm->dev->adev)) + if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) m->cp_hqd_pq_doorbell_control |= 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT; m->cp_mqd_stride_size = kfd_hiq_mqd_stride(mm->dev); @@ -574,7 +627,7 @@ static int hiq_load_mqd_kiq_v9_4_3(struct mqd_manager *mm, void *mqd, struct queue_properties *p, struct mm_struct *mms) { uint32_t xcc_mask = mm->dev->xcc_mask; - int xcc_id, err, inst = 0; + int xcc_id, err = 0, inst = 0; void *xcc_mqd; uint64_t hiq_mqd_size = kfd_hiq_mqd_stride(mm->dev); @@ -598,7 +651,7 @@ static int destroy_hiq_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, uint32_t queue_id) { uint32_t xcc_mask = mm->dev->xcc_mask; - int xcc_id, err, inst = 0; + int xcc_id, err = 0, inst = 0; uint64_t hiq_mqd_size = kfd_hiq_mqd_stride(mm->dev); struct v9_mqd *m; u32 doorbell_off; @@ -643,8 +696,8 @@ static void get_xcc_mqd(struct kfd_mem_obj *mqd_mem_obj, struct kfd_mem_obj *xcc_mqd_mem_obj, uint64_t offset) { - xcc_mqd_mem_obj->gtt_mem = (offset == 0) ? - mqd_mem_obj->gtt_mem : NULL; + xcc_mqd_mem_obj->mem = (offset == 0) ? 
+ mqd_mem_obj->mem : NULL; xcc_mqd_mem_obj->gpu_addr = mqd_mem_obj->gpu_addr + offset; xcc_mqd_mem_obj->cpu_ptr = (uint32_t *)((uintptr_t)mqd_mem_obj->cpu_ptr + offset); @@ -667,7 +720,9 @@ static void init_mqd_v9_4_3(struct mqd_manager *mm, void **mqd, get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset*xcc); init_mqd(mm, (void **)&m, &xcc_mqd_mem_obj, &xcc_gart_addr, q); - + if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT; m->cp_mqd_stride_size = offset; /* @@ -714,6 +769,9 @@ static void init_mqd_v9_4_3(struct mqd_manager *mm, void **mqd, *gart_addr = xcc_gart_addr; } } + + if (mqd_on_vram(mm->dev->adev)) + amdgpu_device_flush_hdp(mm->dev->adev, NULL); } static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, @@ -727,6 +785,9 @@ static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd + size * xcc); update_mqd(mm, m, q, minfo); + if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT; update_cu_mask(mm, m, minfo, xcc); if (q->format == KFD_QUEUE_FORMAT_AQL) { @@ -747,14 +808,57 @@ static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, m->pm4_target_xcc_in_xcp = q->pm4_target_xcc; } } + + if (mqd_on_vram(mm->dev->adev)) + amdgpu_device_flush_hdp(mm->dev->adev, NULL); } +static void restore_mqd_v9_4_3(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *qp, + const void *mqd_src, + const void *ctl_stack_src, u32 ctl_stack_size) +{ + struct kfd_mem_obj xcc_mqd_mem_obj; + u32 mqd_ctl_stack_size; + struct v9_mqd *m; + u32 num_xcc; + int xcc; + + uint64_t offset = mm->mqd_stride(mm, qp); + + mm->dev->dqm->current_logical_xcc_start++; + + num_xcc = NUM_XCC(mm->dev->xcc_mask); + mqd_ctl_stack_size = ctl_stack_size / num_xcc; + + memset(&xcc_mqd_mem_obj, 0x0, sizeof(struct kfd_mem_obj)); + + /* 
Set the MQD pointer and gart address to XCC0 MQD */ + *mqd = mqd_mem_obj->cpu_ptr; + if (gart_addr) + *gart_addr = mqd_mem_obj->gpu_addr; + + for (xcc = 0; xcc < num_xcc; xcc++) { + get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset * xcc); + restore_mqd(mm, (void **)&m, + &xcc_mqd_mem_obj, + NULL, + qp, + (uint8_t *)mqd_src + xcc * sizeof(*m), + (uint8_t *)ctl_stack_src + xcc * mqd_ctl_stack_size, + mqd_ctl_stack_size); + } + + if (mqd_on_vram(mm->dev->adev)) + amdgpu_device_flush_hdp(mm->dev->adev, NULL); +} static int destroy_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, enum kfd_preempt_type type, unsigned int timeout, uint32_t pipe_id, uint32_t queue_id) { uint32_t xcc_mask = mm->dev->xcc_mask; - int xcc_id, err, inst = 0; + int xcc_id, err = 0, inst = 0; void *xcc_mqd; struct v9_mqd *m; uint64_t mqd_offset; @@ -784,7 +888,7 @@ static int load_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); uint32_t xcc_mask = mm->dev->xcc_mask; - int xcc_id, err, inst = 0; + int xcc_id, err = 0, inst = 0; void *xcc_mqd; uint64_t mqd_stride_size = mm->mqd_stride(mm, p); @@ -870,7 +974,7 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) return NULL; - mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + mqd = kzalloc_obj(*mqd); if (!mqd) return NULL; @@ -882,8 +986,6 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, mqd->free_mqd = kfd_free_mqd_cp; mqd->is_occupied = kfd_is_occupied_cp; mqd->get_checkpoint_info = get_checkpoint_info; - mqd->checkpoint_mqd = checkpoint_mqd; - mqd->restore_mqd = restore_mqd; mqd->mqd_size = sizeof(struct v9_mqd); mqd->mqd_stride = mqd_stride_v9; #if defined(CONFIG_DEBUG_FS) @@ -897,12 +999,16 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_v9_4_3; mqd->destroy_mqd = destroy_mqd_v9_4_3; mqd->get_wave_state = get_wave_state_v9_4_3; + mqd->checkpoint_mqd = checkpoint_mqd_v9_4_3; + mqd->restore_mqd = restore_mqd_v9_4_3; } else { mqd->init_mqd = init_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd; mqd->destroy_mqd = kfd_destroy_mqd_cp; mqd->get_wave_state = get_wave_state; + mqd->checkpoint_mqd = checkpoint_mqd; + mqd->restore_mqd = restore_mqd; } break; case KFD_MQD_TYPE_HIQ: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index c1fafc502515..431a20323146 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -73,12 +73,12 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, static void set_priority(struct vi_mqd *m, struct queue_properties *q) { m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; - m->cp_hqd_queue_priority = q->priority; } -static struct kfd_mem_obj *allocate_mqd(struct kfd_node *kfd, +static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm, 
struct queue_properties *q) { + struct kfd_node *kfd = mm->dev; struct kfd_mem_obj *mqd_mem_obj; if (kfd_gtt_sa_allocate(kfd, sizeof(struct vi_mqd), @@ -274,10 +274,11 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, return 0; } -static void get_checkpoint_info(struct mqd_manager *mm, void *mqd, u32 *ctl_stack_size) +static int get_checkpoint_info(struct mqd_manager *mm, void *mqd, u32 *ctl_stack_size) { /* Control stack is stored in user mode */ *ctl_stack_size = 0; + return 0; } static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, void *ctl_stack_dst) @@ -445,7 +446,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) return NULL; - mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + mqd = kzalloc_obj(*mqd); if (!mqd) return NULL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 4984b41cd372..b1a6eb349bb3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -31,6 +31,7 @@ #define OVER_SUBSCRIPTION_PROCESS_COUNT (1 << 0) #define OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT (1 << 1) #define OVER_SUBSCRIPTION_GWS_QUEUE_COUNT (1 << 2) +#define OVER_SUBSCRIPTION_XNACK_CONFLICT (1 << 3) static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, unsigned int buffer_size_bytes) @@ -44,7 +45,8 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, static void pm_calc_rlib_size(struct packet_manager *pm, unsigned int *rlib_size, - int *over_subscription) + int *over_subscription, + int xnack_conflict) { unsigned int process_count, queue_count, compute_queue_count, gws_queue_count; unsigned int map_queue_size; @@ -73,6 +75,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm, *over_subscription |= OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT; if (gws_queue_count > 1) *over_subscription |= 
OVER_SUBSCRIPTION_GWS_QUEUE_COUNT; + if (xnack_conflict && (node->adev->gmc.xnack_flags & AMDGPU_GMC_XNACK_FLAG_CHAIN)) + *over_subscription |= OVER_SUBSCRIPTION_XNACK_CONFLICT; if (*over_subscription) dev_dbg(dev, "Over subscribed runlist\n"); @@ -96,7 +100,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, unsigned int **rl_buffer, uint64_t *rl_gpu_buffer, unsigned int *rl_buffer_size, - int *is_over_subscription) + int *is_over_subscription, + int xnack_conflict) { struct kfd_node *node = pm->dqm->dev; struct device *dev = node->adev->dev; @@ -105,7 +110,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, if (WARN_ON(pm->allocated)) return -EINVAL; - pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); + pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription, + xnack_conflict); mutex_lock(&pm->lock); @@ -142,11 +148,27 @@ static int pm_create_runlist_ib(struct packet_manager *pm, struct queue *q; struct kernel_queue *kq; int is_over_subscription; + int xnack_enabled = -1; + bool xnack_conflict = 0; rl_wptr = retval = processes_mapped = 0; + /* Check if processes set different xnack modes */ + list_for_each_entry(cur, queues, list) { + qpd = cur->qpd; + if (xnack_enabled < 0) + /* First process */ + xnack_enabled = qpd->pqm->process->xnack_enabled; + else if (qpd->pqm->process->xnack_enabled != xnack_enabled) { + /* Found a process with a different xnack mode */ + xnack_conflict = 1; + break; + } + } + retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr, - &alloc_size_bytes, &is_over_subscription); + &alloc_size_bytes, &is_over_subscription, + xnack_conflict); if (retval) return retval; @@ -156,9 +178,13 @@ static int pm_create_runlist_ib(struct packet_manager *pm, dev_dbg(dev, "Building runlist ib process count: %d queues count %d\n", pm->dqm->processes_count, pm->dqm->active_queue_count); +build_runlist_ib: /* build the run list ib packet */ list_for_each_entry(cur, queues, list) { qpd = cur->qpd; + /* group 
processes with the same xnack mode together */ + if (qpd->pqm->process->xnack_enabled != xnack_enabled) + continue; /* build map process packet */ if (processes_mapped >= pm->dqm->processes_count) { dev_dbg(dev, "Not enough space left in runlist IB\n"); @@ -215,18 +241,26 @@ static int pm_create_runlist_ib(struct packet_manager *pm, alloc_size_bytes); } } + if (xnack_conflict) { + /* pick up processes with the other xnack mode */ + xnack_enabled = !xnack_enabled; + xnack_conflict = 0; + goto build_runlist_ib; + } dev_dbg(dev, "Finished map process and queues to runlist\n"); if (is_over_subscription) { if (!pm->is_over_subscription) - dev_warn(dev, "Runlist is getting oversubscribed due to%s%s%s. Expect reduced ROCm performance.\n", - is_over_subscription & OVER_SUBSCRIPTION_PROCESS_COUNT ? - " too many processes." : "", - is_over_subscription & OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT ? - " too many queues." : "", - is_over_subscription & OVER_SUBSCRIPTION_GWS_QUEUE_COUNT ? - " multiple processes using cooperative launch." : ""); + dev_warn(dev, "Runlist is getting oversubscribed due to%s%s%s%s. Expect reduced ROCm performance.\n", + is_over_subscription & OVER_SUBSCRIPTION_PROCESS_COUNT ? + " too many processes" : "", + is_over_subscription & OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT ? + " too many queues" : "", + is_over_subscription & OVER_SUBSCRIPTION_GWS_QUEUE_COUNT ? + " multiple processes using cooperative launch" : "", + is_over_subscription & OVER_SUBSCRIPTION_XNACK_CONFLICT ? + " xnack on/off processes mixed on gfx9" : ""); retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, @@ -396,14 +430,33 @@ out: return retval; } -int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period) +/* pm_config_dequeue_wait_counts: Configure dequeue timer Wait Counts + * by writing to CP_IQ_WAIT_TIME2 registers. + * + * @cmd: See emum kfd_config_dequeue_wait_counts_cmd definition + * @value: Depends on the cmd. 
This parameter is unused for + * KFD_DEQUEUE_WAIT_INIT and KFD_DEQUEUE_WAIT_RESET. For + * KFD_DEQUEUE_WAIT_SET_SCH_WAVE it holds value to be set + * + */ +int pm_config_dequeue_wait_counts(struct packet_manager *pm, + enum kfd_config_dequeue_wait_counts_cmd cmd, + uint32_t value) { struct kfd_node *node = pm->dqm->dev; struct device *dev = node->adev->dev; int retval = 0; uint32_t *buffer, size; - size = pm->pmf->set_grace_period_size; + if (!pm->pmf->config_dequeue_wait_counts || + !pm->pmf->config_dequeue_wait_counts_size) + return 0; + + if (cmd == KFD_DEQUEUE_WAIT_INIT && (KFD_GC_VERSION(pm->dqm->dev) < IP_VERSION(9, 4, 1) || + KFD_GC_VERSION(pm->dqm->dev) >= IP_VERSION(10, 0, 0))) + return 0; + + size = pm->pmf->config_dequeue_wait_counts_size; mutex_lock(&pm->lock); @@ -419,13 +472,18 @@ int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period) goto out; } - retval = pm->pmf->set_grace_period(pm, buffer, grace_period); - if (!retval) + retval = pm->pmf->config_dequeue_wait_counts(pm, buffer, + cmd, value); + if (!retval) { retval = kq_submit_packet(pm->priv_queue); - else + + /* If default value is modified, cache that in dqm->wait_times */ + if (!retval && cmd == KFD_DEQUEUE_WAIT_INIT) + update_dqm_wait_times(pm->dqm); + } else { kq_rollback_packet(pm->priv_queue); + } } - out: mutex_unlock(&pm->lock); return retval; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c index 1f9f5bfeaf86..3d2375817c3e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c @@ -43,11 +43,11 @@ static int pm_map_process_v9(struct packet_manager *pm, memset(buffer, 0, sizeof(struct pm4_mes_map_process)); packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, sizeof(struct pm4_mes_map_process)); - if (adev->enforce_isolation[kfd->node_id]) + if (adev->enforce_isolation[kfd->node_id] == AMDGPU_ENFORCE_ISOLATION_ENABLE) 
packet->bitfields2.exec_cleaner_shader = 1; packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; packet->bitfields2.process_quantum = 10; - packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields2.pasid = pdd->pasid; packet->bitfields14.gds_size = qpd->gds_size & 0x3F; packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF; packet->bitfields14.num_gws = (qpd->mapped_gws_queue) ? qpd->num_gws : 0; @@ -102,11 +102,12 @@ static int pm_map_process_aldebaran(struct packet_manager *pm, memset(buffer, 0, sizeof(struct pm4_mes_map_process_aldebaran)); packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, sizeof(struct pm4_mes_map_process_aldebaran)); - if (adev->enforce_isolation[knode->node_id]) + if (adev->enforce_isolation[knode->node_id] == + AMDGPU_ENFORCE_ISOLATION_ENABLE) packet->bitfields2.exec_cleaner_shader = 1; packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; packet->bitfields2.process_quantum = 10; - packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields2.pasid = pdd->pasid; packet->bitfields14.gds_size = qpd->gds_size & 0x3F; packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF; packet->bitfields14.num_gws = (qpd->mapped_gws_queue) ? qpd->num_gws : 0; @@ -165,9 +166,9 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, * hws_max_conc_proc has been done in * kgd2kfd_device_init(). */ - concurrent_proc_cnt = adev->enforce_isolation[kfd->node_id] ? - 1 : min(pm->dqm->processes_count, - kfd->max_proc_per_quantum); + concurrent_proc_cnt = (adev->enforce_isolation[kfd->node_id] == + AMDGPU_ENFORCE_ISOLATION_ENABLE) ? 
+ 1 : min(pm->dqm->processes_count, kfd->max_proc_per_quantum); packet = (struct pm4_mes_runlist *)buffer; @@ -202,6 +203,8 @@ static int pm_set_resources_v9(struct packet_manager *pm, uint32_t *buffer, queue_type__mes_set_resources__hsa_interface_queue_hiq; packet->bitfields2.vmid_mask = res->vmid_mask; packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; + if (pm->dqm->dev->adev->gmc.xnack_flags & AMDGPU_GMC_XNACK_FLAG_CHAIN) + packet->bitfields2.enb_xnack_retry_disable_check = 1; packet->bitfields7.oac_mask = res->oac_mask; packet->bitfields8.gds_heap_base = res->gds_heap_base; packet->bitfields8.gds_heap_size = res->gds_heap_size; @@ -237,7 +240,7 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, packet->bitfields2.engine_sel = engine_sel__mes_map_queues__compute_vi; - packet->bitfields2.gws_control_queue = q->gws ? 1 : 0; + packet->bitfields2.gws_control_queue = q->properties.is_gws ? 1 : 0; packet->bitfields2.extended_engine_sel = extended_engine_sel__mes_map_queues__legacy_engine_sel; packet->bitfields2.queue_type = @@ -249,10 +252,6 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, packet->bitfields2.queue_type = queue_type__mes_map_queues__normal_latency_static_queue_vi; break; - case KFD_QUEUE_TYPE_DIQ: - packet->bitfields2.queue_type = - queue_type__mes_map_queues__debug_interface_queue_vi; - break; case KFD_QUEUE_TYPE_SDMA: case KFD_QUEUE_TYPE_SDMA_XGMI: if (q->properties.sdma_engine_id < 2 && @@ -297,23 +296,79 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, return 0; } -static int pm_set_grace_period_v9(struct packet_manager *pm, +static inline void pm_build_dequeue_wait_counts_packet_info(struct packet_manager *pm, + uint32_t sch_value, uint32_t que_sleep, uint32_t *reg_offset, + uint32_t *reg_data) +{ + pm->dqm->dev->kfd2kgd->build_dequeue_wait_counts_packet_info( + pm->dqm->dev->adev, + pm->dqm->wait_times, + sch_value, + que_sleep, + reg_offset, + reg_data); 
+} + +/* pm_config_dequeue_wait_counts_v9: Builds WRITE_DATA packet with + * register/value for configuring dequeue wait counts + * + * @return: -ve for failure and 0 for success and buffer is + * filled in with packet + * + **/ +static int pm_config_dequeue_wait_counts_v9(struct packet_manager *pm, uint32_t *buffer, - uint32_t grace_period) + enum kfd_config_dequeue_wait_counts_cmd cmd, + uint32_t value) { struct pm4_mec_write_data_mmio *packet; uint32_t reg_offset = 0; uint32_t reg_data = 0; - pm->dqm->dev->kfd2kgd->build_grace_period_packet_info( - pm->dqm->dev->adev, - pm->dqm->wait_times, - grace_period, - &reg_offset, - &reg_data); + switch (cmd) { + case KFD_DEQUEUE_WAIT_INIT: { + uint32_t sch_wave = 0, que_sleep = 1; + + /* For all gfx9 ASICs > gfx941, + * Reduce CP_IQ_WAIT_TIME2.QUE_SLEEP to 0x1 from default 0x40. + * On a 1GHz machine this is roughly 1 microsecond, which is + * about how long it takes to load data out of memory during + * queue connect + * QUE_SLEEP: Wait Count for Dequeue Retry. + * + * Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU + */ + if (KFD_GC_VERSION(pm->dqm->dev) < IP_VERSION(9, 4, 1) || + KFD_GC_VERSION(pm->dqm->dev) >= IP_VERSION(10, 0, 0)) + return -EPERM; + + if (amdgpu_emu_mode == 0 && pm->dqm->dev->adev->gmc.is_app_apu && + (KFD_GC_VERSION(pm->dqm->dev) == IP_VERSION(9, 4, 3))) + sch_wave = 1; + + pm_build_dequeue_wait_counts_packet_info(pm, sch_wave, que_sleep, + &reg_offset, &reg_data); - if (grace_period == USE_DEFAULT_GRACE_PERIOD) - reg_data = pm->dqm->wait_times; + break; + } + case KFD_DEQUEUE_WAIT_RESET: + /* reg_data would be set to dqm->wait_times */ + pm_build_dequeue_wait_counts_packet_info(pm, 0, 0, &reg_offset, &reg_data); + break; + + case KFD_DEQUEUE_WAIT_SET_SCH_WAVE: + /* The CP cannot handle value 0 and it will result in + * an infinite grace period being set so set to 1 to prevent this. Also + * avoid debugger API breakage as it sets 0 and expects a low value. 
+ */ + if (!value) + value = 1; + pm_build_dequeue_wait_counts_packet_info(pm, value, 0, &reg_offset, &reg_data); + break; + default: + pr_err("Invalid dequeue wait cmd\n"); + return -EINVAL; + } packet = (struct pm4_mec_write_data_mmio *)buffer; memset(buffer, 0, sizeof(struct pm4_mec_write_data_mmio)); @@ -415,7 +470,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = { .set_resources = pm_set_resources_v9, .map_queues = pm_map_queues_v9, .unmap_queues = pm_unmap_queues_v9, - .set_grace_period = pm_set_grace_period_v9, + .config_dequeue_wait_counts = pm_config_dequeue_wait_counts_v9, .query_status = pm_query_status_v9, .release_mem = NULL, .map_process_size = sizeof(struct pm4_mes_map_process), @@ -423,7 +478,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = { .set_resources_size = sizeof(struct pm4_mes_set_resources), .map_queues_size = sizeof(struct pm4_mes_map_queues), .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), - .set_grace_period_size = sizeof(struct pm4_mec_write_data_mmio), + .config_dequeue_wait_counts_size = sizeof(struct pm4_mec_write_data_mmio), .query_status_size = sizeof(struct pm4_mes_query_status), .release_mem_size = 0, }; @@ -434,7 +489,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = { .set_resources = pm_set_resources_v9, .map_queues = pm_map_queues_v9, .unmap_queues = pm_unmap_queues_v9, - .set_grace_period = pm_set_grace_period_v9, + .config_dequeue_wait_counts = pm_config_dequeue_wait_counts_v9, .query_status = pm_query_status_v9, .release_mem = NULL, .map_process_size = sizeof(struct pm4_mes_map_process_aldebaran), @@ -442,7 +497,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = { .set_resources_size = sizeof(struct pm4_mes_set_resources), .map_queues_size = sizeof(struct pm4_mes_map_queues), .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), - .set_grace_period_size = sizeof(struct pm4_mec_write_data_mmio), + .config_dequeue_wait_counts_size = sizeof(struct pm4_mec_write_data_mmio), 
.query_status_size = sizeof(struct pm4_mes_query_status), .release_mem_size = 0, }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c index c1199d06d131..1f0b8ef7c966 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c @@ -42,6 +42,7 @@ unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer, struct qcm_process_device *qpd) { + struct kfd_process_device *pdd = qpd_to_pdd(qpd); struct pm4_mes_map_process *packet; packet = (struct pm4_mes_map_process *)buffer; @@ -52,7 +53,7 @@ static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer, sizeof(struct pm4_mes_map_process)); packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; packet->bitfields2.process_quantum = 10; - packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields2.pasid = pdd->pasid; packet->bitfields3.page_table_base = qpd->page_table_base; packet->bitfields10.gds_size = qpd->gds_size; packet->bitfields10.num_gws = qpd->num_gws; @@ -165,15 +166,10 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, packet->bitfields2.queue_type = queue_type__mes_map_queues__normal_latency_static_queue_vi; break; - case KFD_QUEUE_TYPE_DIQ: - packet->bitfields2.queue_type = - queue_type__mes_map_queues__debug_interface_queue_vi; - break; case KFD_QUEUE_TYPE_SDMA: case KFD_QUEUE_TYPE_SDMA_XGMI: packet->bitfields2.engine_sel = q->properties.sdma_engine_id + engine_sel__mes_map_queues__sdma0_vi; - use_static = false; /* no static queues under SDMA */ break; default: WARN(1, "queue type %d", q->properties.type); @@ -303,7 +299,7 @@ const struct packet_manager_funcs kfd_vi_pm_funcs = { .set_resources = pm_set_resources_vi, .map_queues = pm_map_queues_vi, .unmap_queues = pm_unmap_queues_vi, - .set_grace_period = NULL, + .config_dequeue_wait_counts = 
NULL, .query_status = pm_query_status_vi, .release_mem = pm_release_mem_vi, .map_process_size = sizeof(struct pm4_mes_map_process), @@ -311,7 +307,7 @@ const struct packet_manager_funcs kfd_vi_pm_funcs = { .set_resources_size = sizeof(struct pm4_mes_set_resources), .map_queues_size = sizeof(struct pm4_mes_map_queues), .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), - .set_grace_period_size = 0, + .config_dequeue_wait_counts_size = 0, .query_status_size = sizeof(struct pm4_mes_query_status), .release_mem_size = sizeof(struct pm4_mec_release_mem) }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c deleted file mode 100644 index e3b250918f39..000000000000 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright 2014-2022 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#include <linux/types.h> -#include "kfd_priv.h" -#include "amdgpu_ids.h" - -static unsigned int pasid_bits = 16; -static bool pasids_allocated; /* = false */ - -bool kfd_set_pasid_limit(unsigned int new_limit) -{ - if (new_limit < 2) - return false; - - if (new_limit < (1U << pasid_bits)) { - if (pasids_allocated) - /* We've already allocated user PASIDs, too late to - * change the limit - */ - return false; - - while (new_limit < (1U << pasid_bits)) - pasid_bits--; - } - - return true; -} - -unsigned int kfd_get_pasid_limit(void) -{ - return 1U << pasid_bits; -} - -u32 kfd_pasid_alloc(void) -{ - int r = amdgpu_pasid_alloc(pasid_bits); - - if (r > 0) { - pasids_allocated = true; - return r; - } - - return 0; -} - -void kfd_pasid_free(u32 pasid) -{ - amdgpu_pasid_free(pasid); -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h index cd8611401a66..e356a207d03c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h @@ -63,7 +63,8 @@ struct pm4_mes_set_resources { struct { uint32_t vmid_mask:16; uint32_t unmap_latency:8; - uint32_t reserved1:5; + uint32_t reserved1:4; + uint32_t enb_xnack_retry_disable_check:1; enum mes_set_resources_queue_type_enum queue_type:3; } bitfields2; uint32_t ordinal2; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index d8cd913aa772..d5b07789eda4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -91,7 +91,7 @@ /* Macro for allocating structures */ #define kfd_alloc_struct(ptr_to_struct) \ - ((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL)) + ((typeof(ptr_to_struct)) kzalloc_obj(*ptr_to_struct)) #define KFD_MAX_NUM_OF_PROCESSES 512 #define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024 @@ -102,8 +102,8 @@ * The first chunk is the TBA used for the CWSR ISA code. 
The second * chunk is used as TMA for user-mode trap handler setup in daisy-chain mode. */ -#define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2) -#define KFD_CWSR_TMA_OFFSET (PAGE_SIZE + 2048) +#define KFD_CWSR_TBA_TMA_SIZE (AMDGPU_GPU_PAGE_SIZE * 2) +#define KFD_CWSR_TMA_OFFSET (AMDGPU_GPU_PAGE_SIZE + 2048) #define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE \ (KFD_MAX_NUM_OF_PROCESSES * \ @@ -111,7 +111,14 @@ #define KFD_KERNEL_QUEUE_SIZE 2048 -#define KFD_UNMAP_LATENCY_MS (4000) +/* KFD_UNMAP_LATENCY_MS is the timeout CP waiting for SDMA preemption. One XCC + * can be associated to 2 SDMA engines. queue_preemption_timeout_ms is the time + * driver waiting for CP returning the UNMAP_QUEUE fence. Thus the math is + * queue_preemption_timeout_ms = sdma_preemption_time * 2 + cp workload + * The format here makes CP workload 10% of total timeout + */ +#define KFD_UNMAP_LATENCY_MS \ + ((queue_preemption_timeout_ms - queue_preemption_timeout_ms / 10) >> 1) #define KFD_MAX_SDMA_QUEUES 128 @@ -208,7 +215,8 @@ enum cache_policy { ((KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2)) || \ (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) || \ (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4)) || \ - (KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0))) + (KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0)) || \ + (KFD_GC_VERSION(dev) == IP_VERSION(12, 1, 0))) struct kfd_node; @@ -234,7 +242,6 @@ struct kfd_device_info { uint32_t no_atomic_fw_version; unsigned int num_sdma_queues_per_engine; unsigned int num_reserved_sdma_queues_per_engine; - DECLARE_BITMAP(reserved_sdma_queues_bitmap, KFD_MAX_SDMA_QUEUES); }; unsigned int kfd_get_num_sdma_engines(struct kfd_node *kdev); @@ -245,7 +252,7 @@ struct kfd_mem_obj { uint32_t range_end; uint64_t gpu_addr; uint32_t *cpu_ptr; - void *gtt_mem; + void *mem; }; struct kfd_vmid_info { @@ -289,7 +296,6 @@ struct kfd_node { /* Global GWS resource shared between processes */ void *gws; - bool gws_debug_workaround; /* Clients watching SMI events */ struct list_head smi_clients; @@ -373,6 
+379,11 @@ struct kfd_dev { /* bitmap for dynamic doorbell allocation from doorbell object */ unsigned long *doorbell_bitmap; + + /* for dynamic partitioning */ + int kfd_dev_lock; + + atomic_t kfd_processes_count; }; enum kfd_mempool { @@ -384,6 +395,7 @@ enum kfd_mempool { /* Character device interface */ int kfd_chardev_init(void); void kfd_chardev_exit(void); +void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen); /** * enum kfd_unmap_queues_filter - Enum for queue filters. @@ -423,7 +435,6 @@ enum kfd_queue_type { KFD_QUEUE_TYPE_COMPUTE, KFD_QUEUE_TYPE_SDMA, KFD_QUEUE_TYPE_HIQ, - KFD_QUEUE_TYPE_DIQ, KFD_QUEUE_TYPE_SDMA_XGMI, KFD_QUEUE_TYPE_SDMA_BY_ENG_ID }; @@ -496,7 +507,8 @@ struct queue_properties { enum kfd_queue_format format; unsigned int queue_id; uint64_t queue_address; - uint64_t queue_size; + uint64_t queue_size; + uint64_t metadata_queue_size; uint32_t priority; uint32_t queue_percent; void __user *read_ptr; @@ -687,6 +699,7 @@ struct qcm_process_device { uint32_t num_gws; uint32_t num_oac; uint32_t sh_hidden_private_base; + uint32_t vm_cntx_cntl; /* CWSR memory */ struct kgd_mem *cwsr_mem; @@ -851,6 +864,8 @@ struct kfd_process_device { /* Tracks queue reset status */ bool has_reset_queue; + + u32 pasid; }; #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) @@ -910,8 +925,6 @@ struct kfd_process { /* We want to receive a notification when the mm_struct is destroyed */ struct mmu_notifier mmu_notifier; - u32 pasid; - /* * Array of kfd_process_device pointers, * one for each device the process is using. 
@@ -1007,9 +1020,19 @@ struct kfd_process { /* if gpu page fault sent to KFD */ bool gpu_page_fault; + + /*kfd context id */ + u16 context_id; + + /* The primary kfd_process allocating IDs for its secondary kfd_process, 0 for primary kfd_process */ + struct ida id_table; + }; -#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ +#define KFD_PROCESS_TABLE_SIZE 8 /* bits: 256 entries */ +#define KFD_CONTEXT_ID_PRIMARY 0xFFFF +#define KFD_CONTEXT_ID_MIN 0 + extern DECLARE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); extern struct srcu_struct kfd_processes_srcu; @@ -1025,22 +1048,28 @@ extern struct srcu_struct kfd_processes_srcu; typedef int amdkfd_ioctl_t(struct file *filep, struct kfd_process *p, void *data); +typedef int amdkfd_ioctl_validate_t(void *kdata, unsigned int usize); + struct amdkfd_ioctl_desc { unsigned int cmd; int flags; amdkfd_ioctl_t *func; + amdkfd_ioctl_validate_t *validate; unsigned int cmd_drv; const char *name; }; bool kfd_dev_is_large_bar(struct kfd_node *dev); +struct kfd_process *create_process(const struct task_struct *thread, bool primary); int kfd_process_create_wq(void); void kfd_process_destroy_wq(void); void kfd_cleanup_processes(void); struct kfd_process *kfd_create_process(struct task_struct *thread); -struct kfd_process *kfd_get_process(const struct task_struct *task); -struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid); +int kfd_create_process_sysfs(struct kfd_process *process); +struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid, + struct kfd_process_device **pdd); struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); +struct kfd_process *kfd_lookup_process_by_id(const struct mm_struct *mm, u16 id); int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, uint32_t gpu_id); int kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node, @@ -1078,6 +1107,7 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported); int kfd_reserved_mem_mmap(struct kfd_node 
*dev, struct kfd_process *process, struct vm_area_struct *vma); +void kfd_process_notifier_release_internal(struct kfd_process *p); /* KFD process API for creating and translating handles */ int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, @@ -1091,8 +1121,6 @@ struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid); /* PASIDs */ int kfd_pasid_init(void); void kfd_pasid_exit(void); -bool kfd_set_pasid_limit(unsigned int new_limit); -unsigned int kfd_get_pasid_limit(void); u32 kfd_pasid_alloc(void); void kfd_pasid_free(u32 pasid); @@ -1142,7 +1170,6 @@ struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock( uint32_t proximity_domain); struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id); struct kfd_node *kfd_device_by_id(uint32_t gpu_id); -struct kfd_node *kfd_device_by_pci_dev(const struct pci_dev *pdev); static inline bool kfd_irq_is_from_node(struct kfd_node *node, uint32_t node_id, uint32_t vmid) { @@ -1154,9 +1181,11 @@ static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev, struct kfd_dev *dev = adev->kfd.dev; uint32_t i; - if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && - KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 4) && - KFD_GC_VERSION(dev) != IP_VERSION(9, 5, 0)) + /* + * On multi-aid system, attempt per-node matching. Otherwise, + * fall back to the first node. 
+ */ + if (!amdgpu_is_multi_aid(adev)) return dev->nodes[0]; for (i = 0; i < dev->num_nodes; i++) @@ -1166,7 +1195,9 @@ static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev, return NULL; } int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_node **kdev); +uint32_t kfd_topology_get_num_devices(void); int kfd_numa_node_to_apic_id(int numa_node_id); +uint32_t kfd_gpu_node_num(void); /* Interrupts */ #define KFD_IRQ_FENCE_CLIENTID 0xff @@ -1332,12 +1363,14 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, struct kfd_node *dev); struct mqd_manager *mqd_manager_init_v12(enum KFD_MQD_TYPE type, struct kfd_node *dev); +struct mqd_manager *mqd_manager_init_v12_1(enum KFD_MQD_TYPE type, + struct kfd_node *dev); struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_node *dev, enum kfd_queue_type type); void kernel_queue_uninit(struct kernel_queue *kq); -int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid); +int kfd_evict_process_device(struct kfd_process_device *pdd); int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id); /* Process Queue Manager */ @@ -1366,8 +1399,6 @@ int pqm_update_mqd(struct process_queue_manager *pqm, unsigned int qid, struct mqd_update_info *minfo); int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, void *gws); -struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, - unsigned int qid); struct queue *pqm_get_user_queue(struct process_queue_manager *pqm, unsigned int qid); int pqm_get_wave_state(struct process_queue_manager *pqm, @@ -1394,6 +1425,24 @@ int pqm_get_queue_checkpoint_info(struct process_queue_manager *pqm, #define KFD_FENCE_COMPLETED (100) #define KFD_FENCE_INIT (10) +/** + * enum kfd_config_dequeue_wait_counts_cmd - Command for configuring + * dequeue wait counts. 
+ * + * @KFD_DEQUEUE_WAIT_INIT: Set optimized dequeue wait counts for a + * certain ASICs. For these ASICs, this is default value used by RESET + * @KFD_DEQUEUE_WAIT_RESET: Reset dequeue wait counts to the optimized value + * for certain ASICs. For others set it to default hardware reset value + * @KFD_DEQUEUE_WAIT_SET_SCH_WAVE: Set context switch latency wait + * + */ +enum kfd_config_dequeue_wait_counts_cmd { + KFD_DEQUEUE_WAIT_INIT = 1, + KFD_DEQUEUE_WAIT_RESET = 2, + KFD_DEQUEUE_WAIT_SET_SCH_WAVE = 3 +}; + + struct packet_manager { struct device_queue_manager *dqm; struct kernel_queue *priv_queue; @@ -1419,8 +1468,8 @@ struct packet_manager_funcs { int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer, enum kfd_unmap_queues_filter mode, uint32_t filter_param, bool reset); - int (*set_grace_period)(struct packet_manager *pm, uint32_t *buffer, - uint32_t grace_period); + int (*config_dequeue_wait_counts)(struct packet_manager *pm, uint32_t *buffer, + enum kfd_config_dequeue_wait_counts_cmd cmd, uint32_t value); int (*query_status)(struct packet_manager *pm, uint32_t *buffer, uint64_t fence_address, uint64_t fence_value); int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); @@ -1431,7 +1480,7 @@ struct packet_manager_funcs { int set_resources_size; int map_queues_size; int unmap_queues_size; - int set_grace_period_size; + int config_dequeue_wait_counts_size; int query_status_size; int release_mem_size; }; @@ -1454,7 +1503,9 @@ int pm_send_unmap_queue(struct packet_manager *pm, void pm_release_ib(struct packet_manager *pm); -int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period); +int pm_config_dequeue_wait_counts(struct packet_manager *pm, + enum kfd_config_dequeue_wait_counts_cmd cmd, + uint32_t wait_counts_config); /* Following PM funcs can be shared among VI and AI */ unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); @@ -1467,6 +1518,7 @@ extern const struct kfd_event_interrupt_class 
event_interrupt_class_v9; extern const struct kfd_event_interrupt_class event_interrupt_class_v9_4_3; extern const struct kfd_event_interrupt_class event_interrupt_class_v10; extern const struct kfd_event_interrupt_class event_interrupt_class_v11; +extern const struct kfd_event_interrupt_class event_interrupt_class_v12_1; extern const struct kfd_device_global_init_class device_global_init_class_cik; @@ -1478,7 +1530,7 @@ int kfd_wait_on_events(struct kfd_process *p, bool all, uint32_t *user_timeout_ms, uint32_t *wait_result); void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, - uint32_t valid_id_bits); + uint32_t valid_id_bits, bool signal_mailbox_updated); void kfd_signal_hw_exception_event(u32 pasid); int kfd_set_event(struct kfd_process *p, uint32_t event_id); int kfd_reset_event(struct kfd_process *p, uint32_t event_id); @@ -1492,21 +1544,24 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, int kfd_get_num_events(struct kfd_process *p); int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); -void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid, +void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va); + +void kfd_signal_vm_fault_event(struct kfd_process_device *pdd, struct kfd_vm_fault_info *info, struct kfd_hsa_memory_exception_data *data); void kfd_signal_reset_event(struct kfd_node *dev); void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid); +void kfd_signal_process_terminate_event(struct kfd_process *p); -static inline void kfd_flush_tlb(struct kfd_process_device *pdd, - enum TLB_FLUSH_TYPE type) +static inline void kfd_flush_tlb(struct kfd_process_device *pdd) { struct amdgpu_device *adev = pdd->dev->adev; struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv); - amdgpu_vm_flush_compute_tlb(adev, vm, type, pdd->dev->xcc_mask); + amdgpu_vm_flush_compute_tlb(adev, vm, TLB_FLUSH_HEAVYWEIGHT, + pdd->dev->xcc_mask); } static inline bool 
kfd_flush_tlb_after_unmap(struct kfd_dev *dev) @@ -1519,7 +1574,7 @@ static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev) int kfd_send_exception_to_runtime(struct kfd_process *p, unsigned int queue_id, uint64_t error_reason); -bool kfd_is_locked(void); +bool kfd_is_locked(struct kfd_dev *kfd); /* Compute profile */ void kfd_inc_compute_active(struct kfd_node *dev); @@ -1566,10 +1621,15 @@ int kfd_debugfs_hang_hws(struct kfd_node *dev); int pm_debugfs_hang_hws(struct packet_manager *pm); int dqm_debugfs_hang_hws(struct device_queue_manager *dqm); +void kfd_debugfs_add_process(struct kfd_process *p); +void kfd_debugfs_remove_process(struct kfd_process *p); + #else static inline void kfd_debugfs_init(void) {} static inline void kfd_debugfs_fini(void) {} +static inline void kfd_debugfs_add_process(struct kfd_process *p) {} +static inline void kfd_debugfs_remove_process(struct kfd_process *p) {} #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 083f83c94531..d28ca581cad0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -35,6 +35,7 @@ #include <linux/pm_runtime.h> #include "amdgpu_amdkfd.h" #include "amdgpu.h" +#include "amdgpu_reset.h" struct mm_struct; @@ -67,7 +68,6 @@ static struct workqueue_struct *kfd_restore_wq; static struct kfd_process *find_process(const struct task_struct *thread, bool ref); static void kfd_process_ref_release(struct kref *ref); -static struct kfd_process *create_process(const struct task_struct *thread); static void evict_process_worker(struct work_struct *work); static void restore_process_worker(struct work_struct *work); @@ -153,7 +153,7 @@ static void kfd_sdma_activity_worker(struct work_struct *work) (q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI)) continue; - sdma_q = kzalloc(sizeof(struct temp_sdma_queue_list), GFP_KERNEL); + sdma_q = kzalloc_obj(struct temp_sdma_queue_list); if (!sdma_q) { dqm_unlock(dqm); 
goto cleanup; @@ -282,8 +282,8 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) cu_cnt = 0; proc = pdd->process; if (pdd->qpd.queue_count == 0) { - pr_debug("Gpu-Id: %d has no active queues for process %d\n", - dev->id, proc->pasid); + pr_debug("Gpu-Id: %d has no active queues for process pid %d\n", + dev->id, (int)proc->lead_thread->pid); return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt); } @@ -291,7 +291,7 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) wave_cnt = 0; max_waves_per_cu = 0; - cu_occupancy = kcalloc(AMDGPU_MAX_QUEUES, sizeof(*cu_occupancy), GFP_KERNEL); + cu_occupancy = kzalloc_objs(*cu_occupancy, AMDGPU_MAX_QUEUES); if (!cu_occupancy) return -ENOMEM; @@ -327,12 +327,9 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, char *buffer) { - if (strcmp(attr->name, "pasid") == 0) { - struct kfd_process *p = container_of(attr, struct kfd_process, - attr_pasid); - - return snprintf(buffer, PAGE_SIZE, "%d\n", p->pasid); - } else if (strncmp(attr->name, "vram_", 5) == 0) { + if (strcmp(attr->name, "pasid") == 0) + return snprintf(buffer, PAGE_SIZE, "%d\n", 0); + else if (strncmp(attr->name, "vram_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_vram); return snprintf(buffer, PAGE_SIZE, "%llu\n", atomic64_read(&pdd->vram_usage)); @@ -588,7 +585,7 @@ static void kfd_procfs_add_sysfs_stats(struct kfd_process *p) ret = kobject_init_and_add(pdd->kobj_stats, &procfs_stats_type, p->kobj, - stats_dir_filename); + "%s", stats_dir_filename); if (ret) { pr_warn("Creating KFD proc/stats_%s folder failed", @@ -635,7 +632,7 @@ static void kfd_procfs_add_sysfs_counters(struct kfd_process *p) return; ret = kobject_init_and_add(kobj_counters, &sysfs_counters_type, - p->kobj, counters_dir_filename); + p->kobj, "%s", counters_dir_filename); if (ret) { pr_warn("Creating KFD proc/%s 
folder failed", counters_dir_filename); @@ -682,7 +679,7 @@ static void kfd_procfs_add_sysfs_files(struct kfd_process *p) void kfd_procfs_del_queue(struct queue *q) { - if (!q) + if (!q || !q->process->kobj) return; kobject_del(&q->kobj); @@ -692,7 +689,8 @@ void kfd_procfs_del_queue(struct queue *q) int kfd_process_create_wq(void) { if (!kfd_process_wq) - kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0); + kfd_process_wq = alloc_workqueue("kfd_process_wq", WQ_UNBOUND, + 0); if (!kfd_restore_wq) kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", WQ_FREEZABLE); @@ -827,6 +825,103 @@ static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd) kfd_process_free_gpuvm(qpd->ib_mem, pdd, &qpd->ib_kaddr); } +int kfd_create_process_sysfs(struct kfd_process *process) +{ + struct kfd_process *primary_process; + int ret; + + if (process->kobj) { + pr_warn("kobject already exists for the kfd_process\n"); + return -EINVAL; + } + + process->kobj = kfd_alloc_struct(process->kobj); + if (!process->kobj) { + pr_warn("Creating procfs kobject failed"); + return -ENOMEM; + } + + if (process->context_id == KFD_CONTEXT_ID_PRIMARY) + ret = kobject_init_and_add(process->kobj, &procfs_type, + procfs.kobj, "%d", + (int)process->lead_thread->pid); + else { + primary_process = kfd_lookup_process_by_mm(process->lead_thread->mm); + if (!primary_process) + return -ESRCH; + + ret = kobject_init_and_add(process->kobj, &procfs_type, + primary_process->kobj, "context_%u", + process->context_id); + kfd_unref_process(primary_process); + } + + if (ret) { + pr_warn("Creating procfs pid directory failed"); + kobject_put(process->kobj); + process->kobj = NULL; + return ret; + } + + kfd_sysfs_create_file(process->kobj, &process->attr_pasid, + "pasid"); + + process->kobj_queues = kobject_create_and_add("queues", + process->kobj); + if (!process->kobj_queues) + pr_warn("Creating KFD proc/queues folder failed"); + + kfd_procfs_add_sysfs_stats(process); + 
kfd_procfs_add_sysfs_files(process); + kfd_procfs_add_sysfs_counters(process); + + return 0; +} + +static int kfd_process_alloc_id(struct kfd_process *process) +{ + int ret; + struct kfd_process *primary_process; + + /* already assign 0xFFFF when create */ + if (process->context_id == KFD_CONTEXT_ID_PRIMARY) + return 0; + + primary_process = kfd_lookup_process_by_mm(process->lead_thread->mm); + if (!primary_process) + return -ESRCH; + + /* id range: KFD_CONTEXT_ID_MIN to 0xFFFE */ + ret = ida_alloc_range(&primary_process->id_table, KFD_CONTEXT_ID_MIN, + KFD_CONTEXT_ID_PRIMARY - 1, GFP_KERNEL); + if (ret < 0) + goto out; + + process->context_id = ret; + ret = 0; + +out: + kfd_unref_process(primary_process); + + return ret; +} + +static void kfd_process_free_id(struct kfd_process *process) +{ + struct kfd_process *primary_process; + + if (process->context_id != KFD_CONTEXT_ID_PRIMARY) + return; + + primary_process = kfd_lookup_process_by_mm(process->lead_thread->mm); + if (!primary_process) + return; + + ida_free(&primary_process->id_table, process->context_id); + + kfd_unref_process(primary_process); +} + struct kfd_process *kfd_create_process(struct task_struct *thread) { struct kfd_process *process; @@ -835,11 +930,13 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) if (!(thread->mm && mmget_not_zero(thread->mm))) return ERR_PTR(-EINVAL); - /* Only the pthreads threading model is supported. */ - if (thread->group_leader->mm != thread->mm) { - mmput(thread->mm); - return ERR_PTR(-EINVAL); - } + /* If the process just called exec(3), it is possible that the + * cleanup of the kfd_process (following the release of the mm + * of the old process image) is still in the cleanup work queue. + * Make sure to drain any job before trying to recreate any + * resource for this process. 
+ */ + flush_workqueue(kfd_process_wq); /* * take kfd processes mutex before starting of process creation @@ -848,7 +945,13 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) */ mutex_lock(&kfd_processes_mutex); - if (kfd_is_locked()) { + if (kfd_gpu_node_num() <= 0) { + pr_warn("no gpu node! Cannot create KFD process"); + process = ERR_PTR(-EINVAL); + goto out; + } + + if (kfd_is_locked(NULL)) { pr_debug("KFD is locked! Cannot create process"); process = ERR_PTR(-EINVAL); goto out; @@ -861,46 +964,18 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) if (process) { pr_debug("Process already found\n"); } else { - /* If the process just called exec(3), it is possible that the - * cleanup of the kfd_process (following the release of the mm - * of the old process image) is still in the cleanup work queue. - * Make sure to drain any job before trying to recreate any - * resource for this process. - */ - flush_workqueue(kfd_process_wq); - - process = create_process(thread); + process = create_process(thread, true); if (IS_ERR(process)) goto out; if (!procfs.kobj) goto out; - process->kobj = kfd_alloc_struct(process->kobj); - if (!process->kobj) { - pr_warn("Creating procfs kobject failed"); - goto out; - } - ret = kobject_init_and_add(process->kobj, &procfs_type, - procfs.kobj, "%d", - (int)process->lead_thread->pid); - if (ret) { - pr_warn("Creating procfs pid directory failed"); - kobject_put(process->kobj); - goto out; - } - - kfd_sysfs_create_file(process->kobj, &process->attr_pasid, - "pasid"); - - process->kobj_queues = kobject_create_and_add("queues", - process->kobj); - if (!process->kobj_queues) - pr_warn("Creating KFD proc/queues folder failed"); + ret = kfd_create_process_sysfs(process); + if (ret) + pr_warn("Failed to create sysfs entry for the kfd_process"); - kfd_procfs_add_sysfs_stats(process); - kfd_procfs_add_sysfs_files(process); - kfd_procfs_add_sysfs_counters(process); + kfd_debugfs_add_process(process); 
init_waitqueue_head(&process->wait_irq_drain); } @@ -911,31 +986,13 @@ out: return process; } -struct kfd_process *kfd_get_process(const struct task_struct *thread) -{ - struct kfd_process *process; - - if (!thread->mm) - return ERR_PTR(-EINVAL); - - /* Only the pthreads threading model is supported. */ - if (thread->group_leader->mm != thread->mm) - return ERR_PTR(-EINVAL); - - process = find_process(thread, false); - if (!process) - return ERR_PTR(-EINVAL); - - return process; -} - static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) { struct kfd_process *process; hash_for_each_possible_rcu(kfd_processes_table, process, kfd_processes, (uintptr_t)mm) - if (process->mm == mm) + if (process->mm == mm && process->context_id == KFD_CONTEXT_ID_PRIMARY) return process; return NULL; @@ -1056,17 +1113,15 @@ static void kfd_process_destroy_pdds(struct kfd_process *p) for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; - pr_debug("Releasing pdd (topology id %d) for process (pasid 0x%x)\n", - pdd->dev->id, p->pasid); + kfd_smi_event_process(pdd, false); + pr_debug("Releasing pdd (topology id %d, for pid %d)\n", + pdd->dev->id, p->lead_thread->pid); kfd_process_device_destroy_cwsr_dgpu(pdd); kfd_process_device_destroy_ib_mem(pdd); - if (pdd->drm_file) { - amdgpu_amdkfd_gpuvm_release_process_vm( - pdd->dev->adev, pdd->drm_priv); + if (pdd->drm_file) fput(pdd->drm_file); - } if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base) free_pages((unsigned long)pdd->qpd.cwsr_kaddr, @@ -1078,18 +1133,19 @@ static void kfd_process_destroy_pdds(struct kfd_process *p) if (pdd->dev->kfd->shared_resources.enable_mes && pdd->proc_ctx_cpu_ptr) - amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev, + amdgpu_amdkfd_free_kernel_mem(pdd->dev->adev, &pdd->proc_ctx_bo); /* * before destroying pdd, make sure to report availability * for auto suspend */ if (pdd->runtime_inuse) { - pm_runtime_mark_last_busy(adev_to_drm(pdd->dev->adev)->dev); 
pm_runtime_put_autosuspend(adev_to_drm(pdd->dev->adev)->dev); pdd->runtime_inuse = false; } + atomic_dec(&pdd->dev->kfd->kfd_processes_count); + kfree(pdd); p->pdds[i] = NULL; } @@ -1140,6 +1196,17 @@ static void kfd_process_remove_sysfs(struct kfd_process *p) p->kobj = NULL; } +/* + * If any GPU is ongoing reset, wait for reset complete. + */ +static void kfd_process_wait_gpu_reset_complete(struct kfd_process *p) +{ + int i; + + for (i = 0; i < p->n_pdds; i++) + flush_workqueue(p->pdds[i]->dev->adev->reset_domain->wq); +} + /* No process locking is needed in this function, because the process * is not findable any more. We must assume that no other thread is * using it any more, otherwise we couldn't safely free the process @@ -1151,8 +1218,10 @@ static void kfd_process_wq_release(struct work_struct *work) release_work); struct dma_fence *ef; - kfd_process_dequeue_from_all_devices(p); - pqm_uninit(&p->pqm); + /* + * If GPU in reset, user queues may still running, wait for reset complete. + */ + kfd_process_wait_gpu_reset_complete(p); /* Signal the eviction fence after user mode queues are * destroyed. This allows any BOs to be freed without @@ -1163,7 +1232,12 @@ static void kfd_process_wq_release(struct work_struct *work) if (ef) dma_fence_signal(ef); - kfd_process_remove_sysfs(p); + if (p->context_id != KFD_CONTEXT_ID_PRIMARY) + kfd_process_free_id(p); + else + ida_destroy(&p->id_table); + + kfd_debugfs_remove_process(p); kfd_process_kunmap_signal_bo(p); kfd_process_free_outstanding_kfd_bos(p); @@ -1174,11 +1248,15 @@ static void kfd_process_wq_release(struct work_struct *work) kfd_event_free_process(p); - kfd_pasid_free(p->pasid); mutex_destroy(&p->mutex); put_task_struct(p->lead_thread); + /* the last step is removing process entries under /sys + * to indicate the process has been terminated. 
+ */ + kfd_process_remove_sysfs(p); + kfree(p); } @@ -1203,13 +1281,41 @@ static void kfd_process_free_notifier(struct mmu_notifier *mn) kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier)); } -static void kfd_process_notifier_release_internal(struct kfd_process *p) +static void kfd_process_table_remove(struct kfd_process *p) +{ + mutex_lock(&kfd_processes_mutex); + /* + * Do early return if table is empty. + * + * This could potentially happen if this function is called concurrently + * by mmu_notifier and by kfd_cleanup_pocesses. + * + */ + if (hash_empty(kfd_processes_table)) { + mutex_unlock(&kfd_processes_mutex); + return; + } + hash_del_rcu(&p->kfd_processes); + mutex_unlock(&kfd_processes_mutex); + synchronize_srcu(&kfd_processes_srcu); +} + +void kfd_process_notifier_release_internal(struct kfd_process *p) { int i; + kfd_process_table_remove(p); cancel_delayed_work_sync(&p->eviction_work); cancel_delayed_work_sync(&p->restore_work); + /* + * Dequeue and destroy user queues, it is not safe for GPU to access + * system memory after mmu release notifier callback returns because + * exit_mmap free process memory afterwards. + */ + kfd_process_dequeue_from_all_devices(p); + pqm_uninit(&p->pqm); + for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; @@ -1240,7 +1346,8 @@ static void kfd_process_notifier_release_internal(struct kfd_process *p) srcu_read_unlock(&kfd_processes_srcu, idx); } - mmu_notifier_put(&p->mmu_notifier); + if (p->context_id == KFD_CONTEXT_ID_PRIMARY) + mmu_notifier_put(&p->mmu_notifier); } static void kfd_process_notifier_release(struct mmu_notifier *mn, @@ -1256,22 +1363,6 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, if (WARN_ON(p->mm != mm)) return; - mutex_lock(&kfd_processes_mutex); - /* - * Do early return if table is empty. - * - * This could potentially happen if this function is called concurrently - * by mmu_notifier and by kfd_cleanup_pocesses. 
- * - */ - if (hash_empty(kfd_processes_table)) { - mutex_unlock(&kfd_processes_mutex); - return; - } - hash_del_rcu(&p->kfd_processes); - mutex_unlock(&kfd_processes_mutex); - synchronize_srcu(&kfd_processes_srcu); - kfd_process_notifier_release_internal(p); } @@ -1472,7 +1563,8 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported) * management and memory-manager-related preemptions or * even deadlocks. */ - if (KFD_GC_VERSION(dev) >= IP_VERSION(10, 1, 1)) + if (KFD_GC_VERSION(dev) >= IP_VERSION(10, 1, 1) && + KFD_GC_VERSION(dev) < IP_VERSION(12, 1, 0)) return false; if (dev->kfd->noretry) @@ -1496,13 +1588,13 @@ void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd, * On return the kfd_process is fully operational and will be freed when the * mm is released */ -static struct kfd_process *create_process(const struct task_struct *thread) +struct kfd_process *create_process(const struct task_struct *thread, bool primary) { struct kfd_process *process; struct mmu_notifier *mn; int err = -ENOMEM; - process = kzalloc(sizeof(*process), GFP_KERNEL); + process = kzalloc_obj(*process); if (!process) goto err_alloc_process; @@ -1512,6 +1604,7 @@ static struct kfd_process *create_process(const struct task_struct *thread) process->lead_thread = thread->group_leader; process->n_pdds = 0; process->queues_paused = false; + INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker); INIT_DELAYED_WORK(&process->restore_work, restore_process_worker); process->last_restore_timestamp = get_jiffies_64(); @@ -1525,12 +1618,6 @@ static struct kfd_process *create_process(const struct task_struct *thread) atomic_set(&process->debugged_process_count, 0); sema_init(&process->runtime_enable_sema, 0); - process->pasid = kfd_pasid_alloc(); - if (process->pasid == 0) { - err = -ENOSPC; - goto err_alloc_pasid; - } - err = pqm_init(&process->pqm, process); if (err != 0) goto err_process_pqm_init; @@ -1561,12 +1648,22 @@ static struct kfd_process 
*create_process(const struct task_struct *thread) * After this point, mmu_notifier_put will trigger the cleanup by * dropping the last process reference in the free_notifier. */ - mn = mmu_notifier_get(&kfd_process_mmu_notifier_ops, process->mm); - if (IS_ERR(mn)) { - err = PTR_ERR(mn); - goto err_register_notifier; + if (primary) { + process->context_id = KFD_CONTEXT_ID_PRIMARY; + mn = mmu_notifier_get(&kfd_process_mmu_notifier_ops, process->mm); + if (IS_ERR(mn)) { + err = PTR_ERR(mn); + goto err_register_notifier; + } + BUG_ON(mn != &process->mmu_notifier); + ida_init(&process->id_table); + } + + err = kfd_process_alloc_id(process); + if (err) { + pr_err("Creating kfd process: failed to alloc an id\n"); + goto err_alloc_id; } - BUG_ON(mn != &process->mmu_notifier); kfd_unref_process(process); get_task_struct(process->lead_thread); @@ -1575,6 +1672,8 @@ static struct kfd_process *create_process(const struct task_struct *thread) return process; +err_alloc_id: + kfd_process_free_id(process); err_register_notifier: hash_del_rcu(&process->kfd_processes); svm_range_list_fini(process); @@ -1584,8 +1683,6 @@ err_init_svm_range_list: err_init_apertures: pqm_uninit(&process->pqm); err_process_pqm_init: - kfd_pasid_free(process->pasid); -err_alloc_pasid: kfd_event_free_process(process); err_event_init: mutex_destroy(&process->mutex); @@ -1613,7 +1710,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE)) return NULL; - pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); + pdd = kzalloc_obj(*pdd); if (!pdd) return NULL; @@ -1643,6 +1740,8 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, /* Init idr used for memory handle translation */ idr_init(&pdd->alloc_idr); + atomic_inc(&dev->kfd->kfd_processes_count); + return pdd; } @@ -1670,9 +1769,6 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd, struct kfd_node *dev; int ret; - if (!drm_file) - return -EINVAL; - 
if (pdd->drm_priv) return -EBUSY; @@ -1704,15 +1800,21 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd, if (ret) goto err_init_cwsr; - ret = amdgpu_amdkfd_gpuvm_set_vm_pasid(dev->adev, avm, p->pasid); - if (ret) - goto err_set_pasid; + if (unlikely(!avm->pasid)) { + dev_warn(pdd->dev->adev->dev, "WARN: vm %p has no pasid associated", + avm); + ret = -EINVAL; + goto err_get_pasid; + } + pdd->pasid = avm->pasid; pdd->drm_file = drm_file; + kfd_smi_event_process(pdd, true); + return 0; -err_set_pasid: +err_get_pasid: kfd_process_device_destroy_cwsr_dgpu(pdd); err_init_cwsr: kfd_process_device_destroy_ib_mem(pdd); @@ -1798,25 +1900,50 @@ void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, idr_remove(&pdd->alloc_idr, handle); } -/* This increments the process->ref counter. */ -struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid) +static struct kfd_process_device *kfd_lookup_process_device_by_pasid(u32 pasid) { - struct kfd_process *p, *ret_p = NULL; + struct kfd_process_device *ret_p = NULL; + struct kfd_process *p; unsigned int temp; - - int idx = srcu_read_lock(&kfd_processes_srcu); + int i; hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - if (p->pasid == pasid) { - kref_get(&p->ref); - ret_p = p; - break; + for (i = 0; i < p->n_pdds; i++) { + if (p->pdds[i]->pasid == pasid) { + ret_p = p->pdds[i]; + break; + } } + if (ret_p) + break; + } + return ret_p; +} + +/* This increments the process->ref counter. 
*/ +struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid, + struct kfd_process_device **pdd) +{ + struct kfd_process_device *ret_p; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + ret_p = kfd_lookup_process_device_by_pasid(pasid); + if (ret_p) { + if (pdd) + *pdd = ret_p; + kref_get(&ret_p->process->ref); + + srcu_read_unlock(&kfd_processes_srcu, idx); + return ret_p->process; } srcu_read_unlock(&kfd_processes_srcu, idx); - return ret_p; + if (pdd) + *pdd = NULL; + + return NULL; } /* This increments the process->ref counter. */ @@ -1835,6 +1962,27 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) return p; } +/* This increments the process->ref counter. */ +struct kfd_process *kfd_lookup_process_by_id(const struct mm_struct *mm, u16 id) +{ + struct kfd_process *p, *ret_p = NULL; + unsigned int temp; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + if (p->mm == mm && p->context_id == id) { + kref_get(&p->ref); + ret_p = p; + break; + } + } + + srcu_read_unlock(&kfd_processes_srcu, idx); + + return ret_p; +} + /* kfd_process_evict_queues - Evict all user queues of a process * * Eviction is reference-counted per process-device. 
This means multiple @@ -1942,18 +2090,18 @@ kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node, return -EINVAL; } -static int signal_eviction_fence(struct kfd_process *p) +static bool signal_eviction_fence(struct kfd_process *p) { struct dma_fence *ef; - int ret; + bool ret; rcu_read_lock(); ef = dma_fence_get_rcu_safe(&p->ef); rcu_read_unlock(); if (!ef) - return -EINVAL; + return true; - ret = dma_fence_signal(ef); + ret = dma_fence_check_and_signal(ef); dma_fence_put(ef); return ret; @@ -1972,7 +2120,7 @@ static void evict_process_worker(struct work_struct *work) */ p = container_of(dwork, struct kfd_process, eviction_work); - pr_debug("Started evicting pasid 0x%x\n", p->pasid); + pr_debug("Started evicting process pid %d\n", p->lead_thread->pid); ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM); if (!ret) { /* If another thread already signaled the eviction fence, @@ -1984,9 +2132,9 @@ static void evict_process_worker(struct work_struct *work) msecs_to_jiffies(PROCESS_RESTORE_TIME_MS))) kfd_process_restore_queues(p); - pr_debug("Finished evicting pasid 0x%x\n", p->pasid); + pr_debug("Finished evicting process pid %d\n", p->lead_thread->pid); } else - pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid); + pr_err("Failed to evict queues of process pid %d\n", p->lead_thread->pid); } static int restore_process_helper(struct kfd_process *p) @@ -2003,9 +2151,11 @@ static int restore_process_helper(struct kfd_process *p) ret = kfd_process_restore_queues(p); if (!ret) - pr_debug("Finished restoring pasid 0x%x\n", p->pasid); + pr_debug("Finished restoring process pid %d\n", + p->lead_thread->pid); else - pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid); + pr_err("Failed to restore queues of process pid %d\n", + p->lead_thread->pid); return ret; } @@ -2022,7 +2172,7 @@ static void restore_process_worker(struct work_struct *work) * lifetime of this thread, kfd_process p will be valid */ p = container_of(dwork, 
struct kfd_process, restore_work); - pr_debug("Started restoring pasid 0x%x\n", p->pasid); + pr_debug("Started restoring process pasid %d\n", (int)p->lead_thread->pid); /* Setting last_restore_timestamp before successful restoration. * Otherwise this would have to be set by KGD (restore_process_bos) @@ -2038,8 +2188,8 @@ static void restore_process_worker(struct work_struct *work) ret = restore_process_helper(p); if (ret) { - pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n", - p->pasid, PROCESS_BACK_OFF_TIME_MS); + pr_debug("Failed to restore BOs of process pid %d, retry after %d ms\n", + p->lead_thread->pid, PROCESS_BACK_OFF_TIME_MS); if (mod_delayed_work(kfd_restore_wq, &p->restore_work, msecs_to_jiffies(PROCESS_RESTORE_TIME_MS))) kfd_process_restore_queues(p); @@ -2055,7 +2205,7 @@ void kfd_suspend_all_processes(void) WARN(debug_evictions, "Evicting all processes"); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND)) - pr_err("Failed to suspend process 0x%x\n", p->pasid); + pr_err("Failed to suspend process pid %d\n", p->lead_thread->pid); signal_eviction_fence(p); } srcu_read_unlock(&kfd_processes_srcu, idx); @@ -2069,8 +2219,8 @@ int kfd_resume_all_processes(void) hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { if (restore_process_helper(p)) { - pr_err("Restore process %d failed during resume\n", - p->pasid); + pr_err("Restore process pid %d failed during resume\n", + p->lead_thread->pid); ret = -EFAULT; } } @@ -2125,14 +2275,15 @@ int kfd_process_drain_interrupts(struct kfd_process_device *pdd) memset(irq_drain_fence, 0, sizeof(irq_drain_fence)); irq_drain_fence[0] = (KFD_IRQ_FENCE_SOURCEID << 8) | KFD_IRQ_FENCE_CLIENTID; - irq_drain_fence[3] = pdd->process->pasid; + irq_drain_fence[3] = pdd->pasid; /* * For GFX 9.4.3/9.5.0, send the NodeId also in IH cookie DW[3] */ if (KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 3) || 
KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 4) || - KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 5, 0)) { + KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 5, 0) || + KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(12, 1, 0)) { node_id = ffs(pdd->dev->interrupt_bitmap) - 1; irq_drain_fence[3] |= node_id << 16; } @@ -2156,7 +2307,7 @@ void kfd_process_close_interrupt_drain(unsigned int pasid) { struct kfd_process *p; - p = kfd_lookup_process_by_pasid(pasid); + p = kfd_lookup_process_by_pasid(pasid, NULL); if (!p) return; @@ -2277,8 +2428,8 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) int idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - seq_printf(m, "Process %d PASID 0x%x:\n", - p->lead_thread->tgid, p->pasid); + seq_printf(m, "Process %d PASID %d:\n", + p->lead_thread->tgid, p->lead_thread->pid); mutex_lock(&p->mutex); r = pqm_debugfs_mqds(m, &p->pqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index bd36a75309e1..44e39ce222b7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -69,8 +69,8 @@ static int find_available_queue_slot(struct process_queue_manager *pqm, pr_debug("The new slot id %lu\n", found); if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { - pr_info("Cannot open more queues for process with pasid 0x%x\n", - pqm->process->pasid); + pr_info("Cannot open more queues for process with pid %d\n", + pqm->process->lead_thread->pid); return -ENOMEM; } @@ -94,7 +94,8 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) if (dev->kfd->shared_resources.enable_mes && !!pdd->proc_ctx_gpu_addr && down_read_trylock(&dev->adev->reset_domain->sem)) { amdgpu_mes_flush_shader_debugger(dev->adev, - pdd->proc_ctx_gpu_addr); + pdd->proc_ctx_gpu_addr, + ffs(pdd->dev->xcc_mask) - 1); 
up_read(&dev->adev->reset_domain->sem); } pdd->already_dequeued = true; @@ -209,8 +210,8 @@ static void pqm_clean_queue_resource(struct process_queue_manager *pqm, } if (dev->kfd->shared_resources.enable_mes) { - amdgpu_amdkfd_free_gtt_mem(dev->adev, &pqn->q->gang_ctx_bo); - amdgpu_amdkfd_free_gtt_mem(dev->adev, (void **)&pqn->q->wptr_bo_gart); + amdgpu_amdkfd_free_kernel_mem(dev->adev, &pqn->q->gang_ctx_bo); + amdgpu_amdkfd_free_kernel_mem(dev->adev, (void **)&pqn->q->wptr_bo_gart); } } @@ -264,8 +265,9 @@ static int init_user_queue(struct process_queue_manager *pqm, (*q)->process = pqm->process; if (dev->kfd->shared_resources.enable_mes) { - retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, + retval = amdgpu_amdkfd_alloc_kernel_mem(dev->adev, AMDGPU_MES_GANG_CTX_SIZE, + AMDGPU_GEM_DOMAIN_GTT, &(*q)->gang_ctx_bo, &(*q)->gang_ctx_gpu_addr, &(*q)->gang_ctx_cpu_ptr, @@ -279,20 +281,17 @@ static int init_user_queue(struct process_queue_manager *pqm, /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work * on unmapped queues for usermode queue oversubscription (no aggregated doorbell) */ - if (((dev->adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK) - >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) { - if (dev->adev != amdgpu_ttm_adev(q_properties->wptr_bo->tbo.bdev)) { - pr_err("Queue memory allocated to wrong device\n"); - retval = -EINVAL; - goto free_gang_ctx_bo; - } + if (dev->adev != amdgpu_ttm_adev(q_properties->wptr_bo->tbo.bdev)) { + pr_err("Queue memory allocated to wrong device\n"); + retval = -EINVAL; + goto free_gang_ctx_bo; + } - retval = amdgpu_amdkfd_map_gtt_bo_to_gart(q_properties->wptr_bo, - &(*q)->wptr_bo_gart); - if (retval) { - pr_err("Failed to map wptr bo to GART\n"); - goto free_gang_ctx_bo; - } + retval = amdgpu_amdkfd_map_gtt_bo_to_gart(q_properties->wptr_bo, + &(*q)->wptr_bo_gart); + if (retval) { + pr_err("Failed to map wptr bo to GART\n"); + goto free_gang_ctx_bo; } } @@ -300,7 +299,7 @@ static int 
init_user_queue(struct process_queue_manager *pqm, return 0; free_gang_ctx_bo: - amdgpu_amdkfd_free_gtt_mem(dev->adev, &(*q)->gang_ctx_bo); + amdgpu_amdkfd_free_kernel_mem(dev->adev, &(*q)->gang_ctx_bo); cleanup: uninit_queue(*q); *q = NULL; @@ -348,7 +347,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, * If we are just about to create DIQ, the is_debug flag is not set yet * Hence we also check the type as well */ - if ((pdd->qpd.is_debug) || (type == KFD_QUEUE_TYPE_DIQ)) + if (pdd->qpd.is_debug) max_queues = dev->kfd->device_info.max_no_of_hqd/2; if (pdd->qpd.queue_count >= max_queues) @@ -363,11 +362,28 @@ int pqm_create_queue(struct process_queue_manager *pqm, if (retval != 0) return retval; + /* Register process if this is the first queue */ if (list_empty(&pdd->qpd.queues_list) && list_empty(&pdd->qpd.priv_queue_list)) dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); - pqn = kzalloc(sizeof(*pqn), GFP_KERNEL); + /* Allocate proc_ctx_bo only if MES is enabled and this is the first queue */ + if (!pdd->proc_ctx_cpu_ptr && dev->kfd->shared_resources.enable_mes) { + retval = amdgpu_amdkfd_alloc_kernel_mem(dev->adev, + AMDGPU_MES_PROC_CTX_SIZE, + AMDGPU_GEM_DOMAIN_GTT, + &pdd->proc_ctx_bo, + &pdd->proc_ctx_gpu_addr, + &pdd->proc_ctx_cpu_ptr, + false); + if (retval) { + dev_err(dev->adev->dev, "failed to allocate process context bo\n"); + return retval; + } + memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); + } + + pqn = kzalloc_obj(*pqn); if (!pqn) { retval = -ENOMEM; goto err_allocate_pqn; @@ -413,30 +429,21 @@ int pqm_create_queue(struct process_queue_manager *pqm, restore_mqd, restore_ctl_stack); print_queue(q); break; - case KFD_QUEUE_TYPE_DIQ: - kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_DIQ); - if (!kq) { - retval = -ENOMEM; - goto err_create_queue; - } - kq->queue->properties.queue_id = *qid; - pqn->kq = kq; - pqn->q = NULL; - retval = kfd_process_drain_interrupts(pdd); - if (retval) - break; - - retval = 
dev->dqm->ops.create_kernel_queue(dev->dqm, - kq, &pdd->qpd); - break; default: WARN(1, "Invalid queue type %d", type); retval = -EINVAL; } if (retval != 0) { - pr_err("Pasid 0x%x DQM create queue type %d failed. ret %d\n", - pqm->process->pasid, type, retval); + if ((type == KFD_QUEUE_TYPE_SDMA || + type == KFD_QUEUE_TYPE_SDMA_XGMI || + type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) && + retval == -ENOMEM) + pr_warn("process pid %d DQM create queue type %d failed. ret %d\n", + pqm->process->lead_thread->pid, type, retval); + else + pr_err("process pid %d DQM create queue type %d failed. ret %d\n", + pqm->process->lead_thread->pid, type, retval); goto err_create_queue; } @@ -530,9 +537,9 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); if (retval) { pr_err("Pasid 0x%x destroy queue %d failed, ret %d\n", - pqm->process->pasid, + pdd->pasid, pqn->q->properties.queue_id, retval); - if (retval != -ETIME) + if (retval != -ETIME && retval != -EIO) goto err_destroy_queue; } kfd_procfs_del_queue(pqn->q); @@ -583,9 +590,11 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm, return err; if (kfd_queue_buffer_get(vm, (void *)p->queue_address, &p->ring_bo, - p->queue_size)) { + p->queue_size + + pqn->q->properties.metadata_queue_size)) { pr_debug("ring buf 0x%llx size 0x%llx not mapped on GPU\n", p->queue_address, p->queue_size); + amdgpu_bo_unreserve(vm->root.bo); return -EFAULT; } @@ -652,19 +661,6 @@ int pqm_update_mqd(struct process_queue_manager *pqm, return 0; } -struct kernel_queue *pqm_get_kernel_queue( - struct process_queue_manager *pqm, - unsigned int qid) -{ - struct process_queue_node *pqn; - - pqn = get_queue_by_qid(pqm, qid); - if (pqn && pqn->kq) - return pqn->kq; - - return NULL; -} - struct queue *pqm_get_user_queue(struct process_queue_manager *pqm, unsigned int qid) { @@ -907,7 +903,10 @@ static int criu_checkpoint_queues_device(struct kfd_process_device *pdd, 
q_data = (struct kfd_criu_queue_priv_data *)q_private_data; - /* data stored in this order: priv_data, mqd, ctl_stack */ + /* + * data stored in this order: + * priv_data, mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0], ctl_stack[xcc1]... + */ q_data->mqd_size = mqd_size; q_data->ctl_stack_size = ctl_stack_size; @@ -956,7 +955,7 @@ int kfd_criu_checkpoint_queues(struct kfd_process *p, } static void set_queue_properties_from_criu(struct queue_properties *qp, - struct kfd_criu_queue_priv_data *q_data) + struct kfd_criu_queue_priv_data *q_data, uint32_t num_xcc) { qp->is_interop = false; qp->queue_percent = q_data->q_percent; @@ -969,7 +968,11 @@ static void set_queue_properties_from_criu(struct queue_properties *qp, qp->eop_ring_buffer_size = q_data->eop_ring_buffer_size; qp->ctx_save_restore_area_address = q_data->ctx_save_restore_area_address; qp->ctx_save_restore_area_size = q_data->ctx_save_restore_area_size; - qp->ctl_stack_size = q_data->ctl_stack_size; + if (q_data->type == KFD_QUEUE_TYPE_COMPUTE) + qp->ctl_stack_size = q_data->ctl_stack_size / num_xcc; + else + qp->ctl_stack_size = q_data->ctl_stack_size; + qp->type = q_data->type; qp->format = q_data->format; } @@ -990,7 +993,7 @@ int kfd_criu_restore_queue(struct kfd_process *p, if (*priv_data_offset + sizeof(*q_data) > max_priv_data_size) return -EINVAL; - q_data = kmalloc(sizeof(*q_data), GFP_KERNEL); + q_data = kmalloc_obj(*q_data); if (!q_data) return -ENOMEM; @@ -1029,12 +1032,15 @@ int kfd_criu_restore_queue(struct kfd_process *p, goto exit; } - /* data stored in this order: mqd, ctl_stack */ + /* + * data stored in this order: + * mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0], ctl_stack[xcc1]... 
+ */ mqd = q_extra_data; ctl_stack = mqd + q_data->mqd_size; memset(&qp, 0, sizeof(qp)); - set_queue_properties_from_criu(&qp, q_data); + set_queue_properties_from_criu(&qp, q_data, NUM_XCC(pdd->dev->adev->gfx.xcc_mask)); print_queue_properties(&qp); @@ -1065,6 +1071,7 @@ int pqm_get_queue_checkpoint_info(struct process_queue_manager *pqm, uint32_t *ctl_stack_size) { struct process_queue_node *pqn; + int ret; pqn = get_queue_by_qid(pqm, qid); if (!pqn) { @@ -1077,9 +1084,14 @@ int pqm_get_queue_checkpoint_info(struct process_queue_manager *pqm, return -EOPNOTSUPP; } - pqn->q->device->dqm->ops.get_queue_checkpoint_info(pqn->q->device->dqm, + ret = pqn->q->device->dqm->ops.get_queue_checkpoint_info(pqn->q->device->dqm, pqn->q, mqd_size, ctl_stack_size); + if (ret) { + pr_debug("amdkfd: Overflow while computing stack size for queue %d\n", qid); + return ret; + } + return 0; } @@ -1114,32 +1126,13 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data) break; default: seq_printf(m, - " Bad user queue type %d on device %x\n", + " Queue node with bad user queue type %d on device %x\n", q->properties.type, q->device->id); continue; } mqd_mgr = q->device->dqm->mqd_mgrs[mqd_type]; size = mqd_mgr->mqd_stride(mqd_mgr, &q->properties); - } else if (pqn->kq) { - q = pqn->kq->queue; - mqd_mgr = pqn->kq->mqd_mgr; - switch (q->properties.type) { - case KFD_QUEUE_TYPE_DIQ: - seq_printf(m, " DIQ on device %x\n", - pqn->kq->dev->id); - break; - default: - seq_printf(m, - " Bad kernel queue type %d on device %x\n", - q->properties.type, - pqn->kq->dev->id); - continue; - } - } else { - seq_printf(m, - " Weird: Queue node with neither kernel nor user queue\n"); - continue; } for (xcc = 0; xcc < num_xccs; xcc++) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 24396a2c77bd..28354a4e5dd5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -70,7 +70,7 @@ int init_queue(struct queue **q, 
const struct queue_properties *properties) { struct queue *tmp_q; - tmp_q = kzalloc(sizeof(*tmp_q), GFP_KERNEL); + tmp_q = kzalloc_obj(*tmp_q); if (!tmp_q) return -ENOMEM; @@ -233,6 +233,7 @@ void kfd_queue_buffer_put(struct amdgpu_bo **bo) int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) { struct kfd_topology_device *topo_dev; + u64 expected_queue_size; struct amdgpu_vm *vm; u32 total_cwsr_size; int err; @@ -241,6 +242,18 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope if (!topo_dev) return -EINVAL; + /* AQL queues on GFX7 and GFX8 appear twice their actual size */ + if (properties->type == KFD_QUEUE_TYPE_COMPUTE && + properties->format == KFD_QUEUE_FORMAT_AQL && + topo_dev->node_props.gfx_target_version >= 70000 && + topo_dev->node_props.gfx_target_version < 90000) + /* metadata_queue_size not supported on GFX7/GFX8 */ + expected_queue_size = + PAGE_ALIGN(properties->queue_size / 2); + else + expected_queue_size = + PAGE_ALIGN(properties->queue_size + properties->metadata_queue_size); + vm = drm_priv_to_vm(pdd->drm_priv); err = amdgpu_bo_reserve(vm->root.bo, false); if (err) @@ -255,7 +268,7 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope goto out_err_unreserve; err = kfd_queue_buffer_get(vm, (void *)properties->queue_address, - &properties->ring_bo, properties->queue_size); + &properties->ring_bo, expected_queue_size); if (err) goto out_err_unreserve; @@ -265,8 +278,8 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope /* EOP buffer is not required for all ASICs */ if (properties->eop_ring_buffer_address) { - if (properties->eop_ring_buffer_size != topo_dev->node_props.eop_buffer_size) { - pr_debug("queue eop bo size 0x%x not equal to node eop buf size 0x%x\n", + if (properties->eop_ring_buffer_size < topo_dev->node_props.eop_buffer_size) { + pr_debug("queue eop bo size 0x%x is less than node eop buf size 
0x%x\n", properties->eop_ring_buffer_size, topo_dev->node_props.eop_buffer_size); err = -EINVAL; @@ -274,7 +287,7 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope } err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address, &properties->eop_buf_bo, - properties->eop_ring_buffer_size); + ALIGN(properties->eop_ring_buffer_size, PAGE_SIZE)); if (err) goto out_err_unreserve; } @@ -287,16 +300,16 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope goto out_err_unreserve; } - if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) { - pr_debug("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n", + if (properties->ctx_save_restore_area_size < topo_dev->node_props.cwsr_size) { + pr_debug("queue cwsr size 0x%x not sufficient for node cwsr size 0x%x\n", properties->ctx_save_restore_area_size, topo_dev->node_props.cwsr_size); err = -EINVAL; goto out_err_unreserve; } - total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) - * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = (properties->ctx_save_restore_area_size + + topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask); total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address, @@ -342,8 +355,8 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope topo_dev = kfd_topology_device_by_id(pdd->dev->id); if (!topo_dev) return -EINVAL; - total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) - * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = (properties->ctx_save_restore_area_size + + topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask); total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, total_cwsr_size); @@ -382,34 +395,82 @@ int 
kfd_queue_unref_bo_vas(struct kfd_process_device *pdd, return 0; } -#define SGPR_SIZE_PER_CU 0x4000 -#define LDS_SIZE_PER_CU 0x10000 -#define HWREG_SIZE_PER_CU 0x1000 #define DEBUGGER_BYTES_ALIGN 64 #define DEBUGGER_BYTES_PER_WAVE 32 +static u32 kfd_get_sgpr_size_per_cu(u32 gfxv) +{ + u32 sgpr_size = 0x4000; + + if (gfxv == 120500 || + gfxv == 120501) + sgpr_size = 0x8000; + + return sgpr_size; +} + static u32 kfd_get_vgpr_size_per_cu(u32 gfxv) { u32 vgpr_size = 0x40000; - if ((gfxv / 100 * 100) == 90400 || /* GFX_VERSION_AQUA_VANJARAM */ + if (gfxv == 90402 || /* GFX_VERSION_AQUA_VANJARAM */ gfxv == 90010 || /* GFX_VERSION_ALDEBARAN */ gfxv == 90008 || /* GFX_VERSION_ARCTURUS */ gfxv == 90500) vgpr_size = 0x80000; else if (gfxv == 110000 || /* GFX_VERSION_PLUM_BONITO */ gfxv == 110001 || /* GFX_VERSION_WHEAT_NAS */ + gfxv == 110501 || /* GFX_VERSION_GFX1151 */ gfxv == 120000 || /* GFX_VERSION_GFX1200 */ gfxv == 120001) /* GFX_VERSION_GFX1201 */ vgpr_size = 0x60000; + else if (gfxv == 120500 || /* GFX_VERSION_GFX1250 */ + gfxv == 120501) /* GFX_VERSION_GFX1251 */ + vgpr_size = 0x80000; return vgpr_size; } +static u32 kfd_get_hwreg_size_per_cu(u32 gfxv) +{ + u32 hwreg_size = 0x1000; + + if (gfxv == 120500 || gfxv == 120501) + hwreg_size = 0x8000; + + return hwreg_size; +} + +static u32 kfd_get_lds_size_per_cu(u32 gfxv, struct kfd_node_properties *props) +{ + u32 lds_size = 0x10000; + + if (gfxv == 90500 || gfxv == 120500 || gfxv == 120501) + lds_size = props->lds_size_in_kb << 10; + + return lds_size; +} + +static u32 get_num_waves(struct kfd_node_properties *props, u32 gfxv, u32 cu_num) +{ + u32 wave_num = 0; + + if (gfxv < 100100) + wave_num = min(cu_num * 40, + props->array_count / props->simd_arrays_per_engine * 512); + else if (gfxv < 120500) + wave_num = cu_num * 32; + else if (gfxv <= 120501) + wave_num = cu_num * 64; + + WARN_ON(wave_num == 0); + + return wave_num; +} + #define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props) \ - (kfd_get_vgpr_size_per_cu(gfxv) + 
SGPR_SIZE_PER_CU +\ - (((gfxv) == 90500) ? (props->lds_size_in_kb << 10) : LDS_SIZE_PER_CU) +\ - HWREG_SIZE_PER_CU) + (kfd_get_vgpr_size_per_cu(gfxv) + kfd_get_sgpr_size_per_cu(gfxv) +\ + kfd_get_lds_size_per_cu(gfxv, props) + kfd_get_hwreg_size_per_cu(gfxv)) #define CNTL_STACK_BYTES_PER_WAVE(gfxv) \ ((gfxv) >= 100100 ? 12 : 8) /* GFX_VERSION_NAVI10*/ @@ -429,14 +490,13 @@ void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev) return; cu_num = props->simd_count / props->simd_per_cu / NUM_XCC(dev->gpu->xcc_mask); - wave_num = (gfxv < 100100) ? /* GFX_VERSION_NAVI10 */ - min(cu_num * 40, props->array_count / props->simd_arrays_per_engine * 512) - : cu_num * 32; + wave_num = get_num_waves(props, gfxv, cu_num); - wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props), PAGE_SIZE); + wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props), + AMDGPU_GPU_PAGE_SIZE); ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8; ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + ctl_stack_size, - PAGE_SIZE); + AMDGPU_GPU_PAGE_SIZE); if ((gfxv / 10000 * 10000) == 100000) { /* HW design limits control stack size to 0x7000. 
@@ -448,11 +508,11 @@ void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev) props->ctl_stack_size = ctl_stack_size; props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN); - props->cwsr_size = ctl_stack_size + wg_data_size; + props->cwsr_size = ALIGN(ctl_stack_size + wg_data_size, PAGE_SIZE); if (gfxv == 80002) /* GFX_VERSION_TONGA */ props->eop_buffer_size = 0x8000; - else if ((gfxv / 100 * 100) == 90400) /* GFX_VERSION_AQUA_VANJARAM */ + else if (gfxv == 90402) /* GFX_VERSION_AQUA_VANJARAM */ props->eop_buffer_size = 4096; else if (gfxv >= 80000) props->eop_buffer_size = 4096; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 9b8169761ec5..15975c23a88e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -163,10 +163,9 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep) static bool kfd_smi_ev_enabled(pid_t pid, struct kfd_smi_client *client, unsigned int event) { - uint64_t all = KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS); uint64_t events = READ_ONCE(client->events); - if (pid && client->pid != pid && !(client->suser && (events & all))) + if (pid && client->pid != pid && !client->suser) return false; return events & KFD_SMI_EVENT_MASK_FROM_INDEX(event); @@ -254,9 +253,9 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid); if (task_info) { /* Report VM faults from user applications, not retry from kernel */ - if (task_info->pid) + if (task_info->task.pid) kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT( - task_info->pid, task_info->task_name)); + task_info->task.pid, task_info->task.comm)); amdgpu_vm_put_task_info(task_info); } } @@ -313,7 +312,7 @@ void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid) { kfd_smi_event_add(pid, node, 
KFD_SMI_EVENT_QUEUE_RESTORE, KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), pid, - node->id, 0)); + node->id, '0')); } void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm) @@ -345,12 +344,33 @@ void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, pid_t pid, pid, address, last - address + 1, node->id, trigger)); } +void kfd_smi_event_process(struct kfd_process_device *pdd, bool start) +{ + struct amdgpu_task_info *task_info; + struct amdgpu_vm *avm; + + if (!pdd->drm_priv) + return; + + avm = drm_priv_to_vm(pdd->drm_priv); + task_info = amdgpu_vm_get_task_info_vm(avm); + + if (task_info) { + kfd_smi_event_add(0, pdd->dev, + start ? KFD_SMI_EVENT_PROCESS_START : + KFD_SMI_EVENT_PROCESS_END, + KFD_EVENT_FMT_PROCESS(task_info->task.pid, + task_info->task.comm)); + amdgpu_vm_put_task_info(task_info); + } +} + int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd) { struct kfd_smi_client *client; int ret; - client = kzalloc(sizeof(struct kfd_smi_client), GFP_KERNEL); + client = kzalloc_obj(struct kfd_smi_client); if (!client) return -ENOMEM; INIT_LIST_HEAD(&client->list); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index 503bff13d815..bb4d72b57387 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -53,4 +53,5 @@ void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm); void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, pid_t pid, unsigned long address, unsigned long last, uint32_t trigger); +void kfd_smi_event_process(struct kfd_process_device *pdd, bool start); #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index bd3e20d981e0..35ec67d9739b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -33,6 +33,7 @@ #include "amdgpu_hmm.h" #include "amdgpu.h" #include "amdgpu_xgmi.h" +#include "amdgpu_reset.h" #include 
"kfd_priv.h" #include "kfd_svm.h" #include "kfd_migrate.h" @@ -167,7 +168,7 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, int i, r; if (!addr) { - addr = kvcalloc(prange->npages, sizeof(*addr), GFP_KERNEL); + addr = kvzalloc_objs(*addr, prange->npages); if (!addr) return -ENOMEM; prange->dma_addr[gpuidx] = addr; @@ -328,7 +329,7 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, struct svm_range *prange; struct kfd_process *p; - prange = kzalloc(sizeof(*prange), GFP_KERNEL); + prange = kzalloc_obj(*prange); if (!prange) return NULL; @@ -538,7 +539,7 @@ static struct svm_range_bo *svm_range_bo_new(void) { struct svm_range_bo *svm_bo; - svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL); + svm_bo = kzalloc_obj(*svm_bo); if (!svm_bo) return NULL; @@ -563,7 +564,8 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, int r; p = container_of(prange->svms, struct kfd_process, svms); - pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms, + pr_debug("process pid: %d svms 0x%p [0x%lx 0x%lx]\n", + p->lead_thread->pid, prange->svms, prange->start, prange->last); if (svm_range_validate_svm_bo(node, prange)) @@ -584,7 +586,7 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, svm_bo->eviction_fence = amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), mm, - svm_bo); + svm_bo, p->context_id); mmput(mm); INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker); svm_bo->evicting = 0; @@ -626,9 +628,8 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, } } - r = dma_resv_reserve_fences(bo->tbo.base.resv, 1); + r = dma_resv_reserve_fences(bo->tbo.base.resv, TTM_NUM_MOVE_FENCES); if (r) { - pr_debug("failed %d to reserve bo\n", r); amdgpu_bo_unreserve(bo); goto reserve_bo_failed; } @@ -1143,40 +1144,57 @@ static int svm_range_split_tail(struct svm_range *prange, uint64_t new_last, struct list_head *insert_list, struct list_head 
*remap_list) { + unsigned long last_align_down = ALIGN_DOWN(prange->last, 512); + unsigned long start_align = ALIGN(prange->start, 512); + bool huge_page_mapping = last_align_down > start_align; struct svm_range *tail = NULL; - int r = svm_range_split(prange, prange->start, new_last, &tail); + int r; - if (!r) { - list_add(&tail->list, insert_list); - if (!IS_ALIGNED(new_last + 1, 1UL << prange->granularity)) - list_add(&tail->update_list, remap_list); - } - return r; + r = svm_range_split(prange, prange->start, new_last, &tail); + + if (r) + return r; + + list_add(&tail->list, insert_list); + + if (huge_page_mapping && tail->start > start_align && + tail->start < last_align_down && (!IS_ALIGNED(tail->start, 512))) + list_add(&tail->update_list, remap_list); + + return 0; } static int svm_range_split_head(struct svm_range *prange, uint64_t new_start, struct list_head *insert_list, struct list_head *remap_list) { + unsigned long last_align_down = ALIGN_DOWN(prange->last, 512); + unsigned long start_align = ALIGN(prange->start, 512); + bool huge_page_mapping = last_align_down > start_align; struct svm_range *head = NULL; - int r = svm_range_split(prange, new_start, prange->last, &head); + int r; - if (!r) { - list_add(&head->list, insert_list); - if (!IS_ALIGNED(new_start, 1UL << prange->granularity)) - list_add(&head->update_list, remap_list); - } - return r; + r = svm_range_split(prange, new_start, prange->last, &head); + + if (r) + return r; + + list_add(&head->list, insert_list); + + if (huge_page_mapping && head->last + 1 > start_align && + head->last + 1 < last_align_down && (!IS_ALIGNED(head->last, 512))) + list_add(&head->update_list, remap_list); + + return 0; } static void -svm_range_add_child(struct svm_range *prange, struct mm_struct *mm, - struct svm_range *pchild, enum svm_work_list_ops op) +svm_range_add_child(struct svm_range *prange, struct svm_range *pchild, enum svm_work_list_ops op) { pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child 
list %d\n", pchild, pchild->start, pchild->last, prange, op); - pchild->work_item.mm = mm; + pchild->work_item.mm = NULL; pchild->work_item.op = op; list_add_tail(&pchild->child_list, &prange->child_list); } @@ -1189,7 +1207,7 @@ svm_nodes_in_same_hive(struct kfd_node *node_a, struct kfd_node *node_b) } static uint64_t -svm_range_get_pte_flags(struct kfd_node *node, +svm_range_get_pte_flags(struct kfd_node *node, struct amdgpu_vm *vm, struct svm_range *prange, int domain) { struct kfd_node *bo_node; @@ -1200,7 +1218,8 @@ svm_range_get_pte_flags(struct kfd_node *node, bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN); bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENT); bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT; - unsigned int mtype_local; + unsigned int mtype_local, mtype_remote; + bool is_aid_a1, is_local; if (domain == SVM_RANGE_VRAM_DOMAIN) bo_node = prange->svm_bo->node; @@ -1244,8 +1263,7 @@ svm_range_get_pte_flags(struct kfd_node *node, case IP_VERSION(9, 4, 4): case IP_VERSION(9, 5, 0): if (ext_coherent) - mtype_local = (gc_ip_version < IP_VERSION(9, 5, 0) && !node->adev->rev_id) ? - AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_CC; + mtype_local = AMDGPU_VM_MTYPE_CC; else mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC : amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; @@ -1278,7 +1296,7 @@ svm_range_get_pte_flags(struct kfd_node *node, mapping_flags |= ext_coherent ? 
AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; /* system memory accessed by the dGPU */ } else { - if (gc_ip_version < IP_VERSION(9, 5, 0)) + if (gc_ip_version < IP_VERSION(9, 5, 0) || ext_coherent) mapping_flags |= AMDGPU_VM_MTYPE_UC; else mapping_flags |= AMDGPU_VM_MTYPE_NC; @@ -1286,12 +1304,26 @@ svm_range_get_pte_flags(struct kfd_node *node, break; case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): - if (domain == SVM_RANGE_VRAM_DOMAIN) { - if (bo_node != node) - mapping_flags |= AMDGPU_VM_MTYPE_NC; + mapping_flags |= AMDGPU_VM_MTYPE_NC; + break; + case IP_VERSION(12, 1, 0): + is_aid_a1 = (node->adev->rev_id & 0x10); + is_local = (domain == SVM_RANGE_VRAM_DOMAIN) && + (bo_node->adev == node->adev); + + mtype_local = amdgpu_mtype_local == 0 ? AMDGPU_VM_MTYPE_RW : + amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC : + is_aid_a1 ? AMDGPU_VM_MTYPE_RW : AMDGPU_VM_MTYPE_NC; + mtype_remote = is_aid_a1 ? AMDGPU_VM_MTYPE_NC : AMDGPU_VM_MTYPE_UC; + snoop = true; + + if (is_local) /* local HBM */ { + mapping_flags |= mtype_local; + } else if (ext_coherent) { + mapping_flags |= AMDGPU_VM_MTYPE_UC; } else { - mapping_flags |= coherent ? 
- AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; + /* system memory or remote VRAM */ + mapping_flags |= mtype_remote; } break; default: @@ -1299,10 +1331,6 @@ svm_range_get_pte_flags(struct kfd_node *node, AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; } - mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE; - - if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO) - mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE; if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC) mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; @@ -1312,7 +1340,15 @@ svm_range_get_pte_flags(struct kfd_node *node, if (gc_ip_version >= IP_VERSION(12, 0, 0)) pte_flags |= AMDGPU_PTE_IS_PTE; - pte_flags |= amdgpu_gem_va_map_flags(node->adev, mapping_flags); + amdgpu_gmc_get_vm_pte(node->adev, vm, NULL, mapping_flags, &pte_flags); + pte_flags |= AMDGPU_PTE_READABLE; + if (!(flags & KFD_IOCTL_SVM_FLAG_GPU_RO)) + pte_flags |= AMDGPU_PTE_WRITEABLE; + + if ((gc_ip_version == IP_VERSION(12, 1, 0)) && + node->adev->have_atomics_support) + pte_flags |= AMDGPU_PTE_BUS_ATOMICS; + return pte_flags; } @@ -1321,12 +1357,23 @@ svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, uint64_t start, uint64_t last, struct dma_fence **fence) { - uint64_t init_pte_value = 0; + uint64_t init_pte_value = adev->gmc.init_pte_flags; + uint64_t gpu_start, gpu_end; + + /* Convert CPU page range to GPU page range */ + gpu_start = start * AMDGPU_GPU_PAGES_IN_CPU_PAGE; + gpu_end = (last + 1) * AMDGPU_GPU_PAGES_IN_CPU_PAGE - 1; + + pr_debug("CPU[0x%llx 0x%llx] -> GPU[0x%llx 0x%llx]\n", start, last, + gpu_start, gpu_end); - pr_debug("[0x%llx 0x%llx]\n", start, last); + if (!amdgpu_vm_ready(vm)) { + pr_debug("VM not ready, canceling unmap\n"); + return -EINVAL; + } - return amdgpu_vm_update_range(adev, vm, false, true, true, false, NULL, start, - last, init_pte_value, 0, 0, NULL, NULL, + return amdgpu_vm_update_range(adev, vm, false, true, true, false, NULL, gpu_start, + gpu_end, init_pte_value, 0, 0, NULL, NULL, fence); } @@ -1334,7 +1381,6 
@@ static int svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, unsigned long last, uint32_t trigger) { - DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); struct kfd_process_device *pdd; struct dma_fence *fence = NULL; struct kfd_process *p; @@ -1352,11 +1398,9 @@ svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, prange->mapped_to_gpu = false; } - bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip, - MAX_GPU_INSTANCE); p = container_of(prange->svms, struct kfd_process, svms); - for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { + for_each_or_bit(gpuidx, prange->bitmap_access, prange->bitmap_aip, MAX_GPU_INSTANCE) { pr_debug("unmap from gpu idx 0x%x\n", gpuidx); pdd = kfd_process_device_from_gpuidx(p, gpuidx); if (!pdd) { @@ -1380,7 +1424,7 @@ svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, if (r) break; } - kfd_flush_tlb(pdd, TLB_FLUSH_HEAVYWEIGHT); + kfd_flush_tlb(pdd); } return r; @@ -1405,7 +1449,15 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange, pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms, last_start, last_start + npages - 1, readonly); + if (!amdgpu_vm_ready(vm)) { + pr_debug("VM not ready, canceling map\n"); + return -EINVAL; + } + for (i = offset; i < offset + npages; i++) { + uint64_t gpu_start; + uint64_t gpu_end; + last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN; dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN; @@ -1419,21 +1471,26 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange, pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n", last_start, prange->start + i, last_domain ? 
"GPU" : "CPU"); - pte_flags = svm_range_get_pte_flags(pdd->dev, prange, last_domain); + pte_flags = svm_range_get_pte_flags(pdd->dev, vm, prange, last_domain); if (readonly) pte_flags &= ~AMDGPU_PTE_WRITEABLE; - pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n", - prange->svms, last_start, prange->start + i, - (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0, - pte_flags); /* For dGPU mode, we use same vm_manager to allocate VRAM for * different memory partition based on fpfn/lpfn, we should use * same vm_manager.vram_base_offset regardless memory partition. */ + gpu_start = last_start * AMDGPU_GPU_PAGES_IN_CPU_PAGE; + gpu_end = (prange->start + i + 1) * AMDGPU_GPU_PAGES_IN_CPU_PAGE - 1; + + pr_debug("svms 0x%p map CPU[0x%lx 0x%llx] GPU[0x%llx 0x%llx] vram %d PTE 0x%llx\n", + prange->svms, last_start, prange->start + i, + gpu_start, gpu_end, + (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0, + pte_flags); + r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, true, - NULL, last_start, prange->start + i, + NULL, gpu_start, gpu_end, pte_flags, (last_start - prange->start) << PAGE_SHIFT, bo_adev ? 
bo_adev->vm_manager.vram_base_offset : 0, @@ -1514,7 +1571,7 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, } } - kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY); + kfd_flush_tlb(pdd); } return r; @@ -1631,7 +1688,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, int32_t idx; int r = 0; - ctx = kzalloc(sizeof(struct svm_validate_context), GFP_KERNEL); + ctx = kzalloc_obj(struct svm_validate_context); if (!ctx) return -ENOMEM; ctx->process = container_of(prange->svms, struct kfd_process, svms); @@ -1706,7 +1763,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, start = map_start << PAGE_SHIFT; end = (map_last + 1) << PAGE_SHIFT; for (addr = start; !r && addr < end; ) { - struct hmm_range *hmm_range = NULL; + struct amdgpu_hmm_range *range = NULL; unsigned long map_start_vma; unsigned long map_last_vma; struct vm_area_struct *vma; @@ -1721,10 +1778,36 @@ static int svm_range_validate_and_map(struct mm_struct *mm, next = min(vma->vm_end, end); npages = (next - addr) >> PAGE_SHIFT; + /* HMM requires at least READ permissions. If provided with PROT_NONE, + * unmap the memory. 
If it's not already mapped, this is a no-op + * If PROT_WRITE is provided without READ, warn first then unmap + */ + if (!(vma->vm_flags & VM_READ)) { + unsigned long e, s; + + svm_range_lock(prange); + if (vma->vm_flags & VM_WRITE) + pr_debug("VM_WRITE without VM_READ is not supported"); + s = max(start, prange->start); + e = min(end, prange->last); + if (e >= s) + r = svm_range_unmap_from_gpus(prange, s, e, + KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU); + svm_range_unlock(prange); + /* If unmap returns non-zero, we'll bail on the next for loop + * iteration, so just leave r and continue + */ + addr = next; + continue; + } + WRITE_ONCE(p->svms.faulting_task, current); - r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages, - readonly, owner, NULL, - &hmm_range); + range = amdgpu_hmm_range_alloc(NULL); + if (likely(range)) + r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages, + readonly, owner, range); + else + r = -ENOMEM; WRITE_ONCE(p->svms.faulting_task, NULL); if (r) pr_debug("failed %d to get svm range pages\n", r); @@ -1735,7 +1818,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, if (!r) { offset = (addr >> PAGE_SHIFT) - prange->start; r = svm_range_dma_map(prange, ctx->bitmap, offset, npages, - hmm_range->hmm_pfns); + range->hmm_range.hmm_pfns); if (r) pr_debug("failed %d to dma map range\n", r); } @@ -1743,14 +1826,17 @@ static int svm_range_validate_and_map(struct mm_struct *mm, svm_range_lock(prange); /* Free backing memory of hmm_range if it was initialized - * Overrride return value to TRY AGAIN only if prior returns + * Override return value to TRY AGAIN only if prior returns * were successful */ - if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range) && !r) { + if (range && !amdgpu_hmm_range_valid(range) && !r) { pr_debug("hmm update the range, need validate again\n"); r = -EAGAIN; } + /* Free the hmm range */ + amdgpu_hmm_range_free(range); + if (!r && !list_empty(&prange->child_list)) { pr_debug("range split by 
unmap in parallel, validate again\n"); r = -EAGAIN; @@ -2310,6 +2396,9 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms) pr_debug("drain retry fault gpu %d svms %p\n", i, svms); + if (!down_read_trylock(&pdd->dev->adev->reset_domain->sem)) + continue; + amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev, pdd->dev->adev->irq.retry_cam_enabled ? &pdd->dev->adev->irq.ih : @@ -2319,6 +2408,7 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms) amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev, &pdd->dev->adev->irq.ih_soft); + up_read(&pdd->dev->adev->reset_domain->sem); pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); } @@ -2400,15 +2490,17 @@ svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, prange->work_item.op != SVM_OP_UNMAP_RANGE) prange->work_item.op = op; } else { - prange->work_item.op = op; - - /* Pairs with mmput in deferred_list_work */ - mmget(mm); - prange->work_item.mm = mm; - list_add_tail(&prange->deferred_list, - &prange->svms->deferred_range_list); - pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", - prange, prange->start, prange->last, op); + /* Pairs with mmput in deferred_list_work. + * If process is exiting and mm is gone, don't update mmu notifier. 
+ */ + if (mmget_not_zero(mm)) { + prange->work_item.mm = mm; + prange->work_item.op = op; + list_add_tail(&prange->deferred_list, + &prange->svms->deferred_range_list); + pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", + prange, prange->start, prange->last, op); + } } spin_unlock(&svms->deferred_list_lock); } @@ -2422,8 +2514,7 @@ void schedule_deferred_list_work(struct svm_range_list *svms) } static void -svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, - struct svm_range *prange, unsigned long start, +svm_range_unmap_split(struct svm_range *parent, struct svm_range *prange, unsigned long start, unsigned long last) { struct svm_range *head; @@ -2444,12 +2535,12 @@ svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, svm_range_split(tail, last + 1, tail->last, &head); if (head != prange && tail != prange) { - svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); - svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); + svm_range_add_child(parent, head, SVM_OP_UNMAP_RANGE); + svm_range_add_child(parent, tail, SVM_OP_ADD_RANGE); } else if (tail != prange) { - svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE); + svm_range_add_child(parent, tail, SVM_OP_UNMAP_RANGE); } else if (head != prange) { - svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); + svm_range_add_child(parent, head, SVM_OP_UNMAP_RANGE); } else if (parent != prange) { prange->work_item.op = SVM_OP_UNMAP_RANGE; } @@ -2501,7 +2592,7 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, adev = pdd->dev->adev; /* Check and drain ih1 ring if cam not available */ - if (adev->irq.ih1.ring_size) { + if (!adev->irq.retry_cam_enabled && adev->irq.ih1.ring_size) { ih = &adev->irq.ih1; checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih); if (ih->rptr != checkpoint_wptr) { @@ -2526,14 +2617,14 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, l = min(last, pchild->last); if (l >= s) 
svm_range_unmap_from_gpus(pchild, s, l, trigger); - svm_range_unmap_split(mm, prange, pchild, start, last); + svm_range_unmap_split(prange, pchild, start, last); mutex_unlock(&pchild->lock); } s = max(start, prange->start); l = min(last, prange->last); if (l >= s) svm_range_unmap_from_gpus(prange, s, l, trigger); - svm_range_unmap_split(mm, prange, prange, start, last); + svm_range_unmap_split(prange, prange, start, last); if (unmap_parent) svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE); @@ -2576,8 +2667,6 @@ svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, if (range->event == MMU_NOTIFY_RELEASE) return true; - if (!mmget_not_zero(mni->mm)) - return true; start = mni->interval_tree.start; last = mni->interval_tree.last; @@ -2604,7 +2693,6 @@ svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, } svm_range_unlock(prange); - mmput(mni->mm); return true; } @@ -2691,7 +2779,7 @@ svm_range_best_restore_location(struct svm_range *prange, return -1; } - if (node->adev->flags & AMD_IS_APU) + if (node->adev->apu_prefer_gtt) return 0; if (prange->preferred_loc == gpuid || @@ -2979,7 +3067,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, return -EFAULT; } - p = kfd_lookup_process_by_pasid(pasid); + p = kfd_lookup_process_by_pasid(pasid, NULL); if (!p) { pr_debug("kfd process not founded pasid 0x%x\n", pasid); return 0; @@ -3008,19 +3096,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, goto out; } - /* check if this page fault time stamp is before svms->checkpoint_ts */ - if (svms->checkpoint_ts[gpuidx] != 0) { - if (amdgpu_ih_ts_after(ts, svms->checkpoint_ts[gpuidx])) { - pr_debug("draining retry fault, drop fault 0x%llx\n", addr); - r = 0; - goto out; - } else - /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts - * to zero to avoid following ts wrap around give wrong comparing - */ - svms->checkpoint_ts[gpuidx] = 0; - } - if (!p->xnack_enabled) { 
pr_debug("XNACK not enabled for pasid 0x%x\n", pasid); r = -EFAULT; @@ -3040,6 +3115,23 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, mmap_read_lock(mm); retry_write_locked: mutex_lock(&svms->lock); + + /* check if this page fault time stamp is before svms->checkpoint_ts */ + if (svms->checkpoint_ts[gpuidx] != 0) { + if (amdgpu_ih_ts_after_or_equal(ts, svms->checkpoint_ts[gpuidx])) { + pr_debug("draining retry fault, drop fault 0x%llx\n", addr); + if (write_locked) + mmap_write_downgrade(mm); + r = -EAGAIN; + goto out_unlock_svms; + } else { + /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts + * to zero to avoid following ts wrap around give wrong comparing + */ + svms->checkpoint_ts[gpuidx] = 0; + } + } + prange = svm_range_from_addr(svms, addr, NULL); if (!prange) { pr_debug("failed to find prange svms 0x%p address [0x%llx]\n", @@ -3165,7 +3257,8 @@ out_unlock_svms: mutex_unlock(&svms->lock); mmap_read_unlock(mm); - svm_range_count_fault(node, p, gpuidx); + if (r != -EAGAIN) + svm_range_count_fault(node, p, gpuidx); mmput(mm); out: @@ -3242,7 +3335,8 @@ void svm_range_list_fini(struct kfd_process *p) struct svm_range *prange; struct svm_range *next; - pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms); + pr_debug("process pid %d svms 0x%p\n", p->lead_thread->pid, + &p->svms); cancel_delayed_work_sync(&p->svms.restore_work); @@ -3265,7 +3359,8 @@ void svm_range_list_fini(struct kfd_process *p) mutex_destroy(&p->svms.lock); - pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms); + pr_debug("process pid %d svms 0x%p done\n", + p->lead_thread->pid, &p->svms); } int svm_range_list_init(struct kfd_process *p) @@ -3438,7 +3533,7 @@ svm_range_best_prefetch_location(struct svm_range *prange) goto out; } - if (bo_node->adev->flags & AMD_IS_APU) { + if (bo_node->adev->apu_prefer_gtt) { best_loc = 0; goto out; } @@ -3628,8 +3723,8 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, bool flush_tlb; int r, 
ret = 0; - pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n", - p->pasid, &p->svms, start, start + size - 1, size); + pr_debug("process pid %d svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n", + p->lead_thread->pid, &p->svms, start, start + size - 1, size); r = svm_range_check_attr(p, nattr, attrs); if (r) @@ -3667,6 +3762,8 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping); /* TODO: unmap ranges from GPU that lost access */ } + update_mapping |= !p->xnack_enabled && !list_empty(&remap_list); + list_for_each_entry_safe(prange, next, &remove_list, update_list) { pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange, prange->start, @@ -3737,8 +3834,8 @@ out_unlock_range: out: mutex_unlock(&process_info->lock); - pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid, - &p->svms, start, start + size - 1, r); + pr_debug("process pid %d svms 0x%p [0x%llx 0x%llx] done, r=%d\n", + p->lead_thread->pid, &p->svms, start, start + size - 1, r); return ret ? 
ret : r; } @@ -4076,8 +4173,8 @@ exit: return ret; } -int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges, - uint64_t *svm_priv_data_size) +void svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges, + uint64_t *svm_priv_data_size) { uint64_t total_size, accessibility_size, common_attr_size; int nattr_common = 4, nattr_accessibility = 1; @@ -4089,8 +4186,6 @@ int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges, *svm_priv_data_size = 0; svms = &p->svms; - if (!svms) - return -EINVAL; mutex_lock(&svms->lock); list_for_each_entry(prange, &svms->list, list) { @@ -4132,7 +4227,6 @@ int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges, pr_debug("num_svm_ranges %u total_priv_size %llu\n", *num_svm_ranges, *svm_priv_data_size); - return 0; } int kfd_criu_checkpoint_svm(struct kfd_process *p, @@ -4149,8 +4243,6 @@ int kfd_criu_checkpoint_svm(struct kfd_process *p, struct mm_struct *mm; svms = &p->svms; - if (!svms) - return -EINVAL; mm = get_task_mm(p->lead_thread); if (!mm) { @@ -4248,7 +4340,7 @@ svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start, r = svm_range_get_attr(p, mm, start, size, nattrs, attrs); break; default: - r = EINVAL; + r = -EINVAL; break; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h index bddd24f04669..a63dfc95b602 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h @@ -31,7 +31,6 @@ #include <linux/list.h> #include <linux/mutex.h> #include <linux/sched/mm.h> -#include <linux/hmm.h> #include "amdgpu.h" #include "kfd_priv.h" @@ -184,8 +183,8 @@ void schedule_deferred_list_work(struct svm_range_list *svms); void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr, unsigned long offset, unsigned long npages); void svm_range_dma_unmap(struct svm_range *prange); -int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges, - uint64_t *svm_priv_data_size); 
+void svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges, + uint64_t *svm_priv_data_size); int kfd_criu_checkpoint_svm(struct kfd_process *p, uint8_t __user *user_priv_data, uint64_t *priv_offset); @@ -202,7 +201,7 @@ void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_s * is initialized to not 0 when page migration register device memory. */ #define KFD_IS_SVM_API_SUPPORTED(adev) ((adev)->kfd.pgmap.type != 0 ||\ - ((adev)->flags & AMD_IS_APU)) + ((adev)->apu_prefer_gtt)) void svm_range_bo_unref_async(struct svm_range_bo *svm_bo); @@ -237,13 +236,12 @@ static inline int svm_range_schedule_evict_svm_bo( return -EINVAL; } -static inline int svm_range_get_info(struct kfd_process *p, - uint32_t *num_svm_ranges, - uint64_t *svm_priv_data_size) +static inline void svm_range_get_info(struct kfd_process *p, + uint32_t *num_svm_ranges, + uint64_t *svm_priv_data_size) { *num_svm_ranges = 0; *svm_priv_data_size = 0; - return 0; } static inline int kfd_criu_checkpoint_svm(struct kfd_process *p, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index ceb9fb475ef1..29dee26261ab 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -108,24 +108,6 @@ struct kfd_node *kfd_device_by_id(uint32_t gpu_id) return top_dev->gpu; } -struct kfd_node *kfd_device_by_pci_dev(const struct pci_dev *pdev) -{ - struct kfd_topology_device *top_dev; - struct kfd_node *device = NULL; - - down_read(&topology_lock); - - list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->gpu && top_dev->gpu->adev->pdev == pdev) { - device = top_dev->gpu; - break; - } - - up_read(&topology_lock); - - return device; -} - /* Called with write topology_lock acquired */ static void kfd_release_topology_device(struct kfd_topology_device *dev) { @@ -509,6 +491,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, 
dev->node_props.num_sdma_queues_per_engine); sysfs_show_32bit_prop(buffer, offs, "num_cp_queues", dev->node_props.num_cp_queues); + sysfs_show_32bit_prop(buffer, offs, "cwsr_size", + dev->node_props.cwsr_size); + sysfs_show_32bit_prop(buffer, offs, "ctl_stack_size", + dev->node_props.ctl_stack_size); if (dev->gpu) { log_max_watch_addr = @@ -528,6 +514,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.capability |= HSA_CAP_AQL_QUEUE_DOUBLE_MAP; + if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0) && + (dev->gpu->adev->sdma.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)) + dev->node_props.capability2 |= HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED; + sysfs_show_32bit_prop(buffer, offs, "max_engine_clk_fcompute", dev->node_props.max_engine_clk_fcompute); @@ -537,11 +527,17 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->gpu->kfd->mec_fw_version); sysfs_show_32bit_prop(buffer, offs, "capability", dev->node_props.capability); + sysfs_show_32bit_prop(buffer, offs, "capability2", + dev->node_props.capability2); sysfs_show_64bit_prop(buffer, offs, "debug_prop", dev->node_props.debug_prop); sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version", dev->gpu->kfd->sdma_fw_version); sysfs_show_64bit_prop(buffer, offs, "unique_id", + dev->gpu->xcp && + (dev->gpu->xcp->xcp_mgr->mode != + AMDGPU_SPX_PARTITION_MODE) ? 
+ dev->gpu->xcp->unique_id : dev->gpu->adev->unique_id); sysfs_show_32bit_prop(buffer, offs, "num_xcc", NUM_XCC(dev->gpu->xcc_mask)); @@ -715,7 +711,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, i = 0; list_for_each_entry(mem, &dev->mem_props, list) { - mem->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + mem->kobj = kzalloc_obj(struct kobject); if (!mem->kobj) return -ENOMEM; ret = kobject_init_and_add(mem->kobj, &mem_type, @@ -736,7 +732,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, i = 0; list_for_each_entry(cache, &dev->cache_props, list) { - cache->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + cache->kobj = kzalloc_obj(struct kobject); if (!cache->kobj) return -ENOMEM; ret = kobject_init_and_add(cache->kobj, &cache_type, @@ -757,7 +753,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, i = 0; list_for_each_entry(iolink, &dev->io_link_props, list) { - iolink->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + iolink->kobj = kzalloc_obj(struct kobject); if (!iolink->kobj) return -ENOMEM; ret = kobject_init_and_add(iolink->kobj, &iolink_type, @@ -778,7 +774,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, i = 0; list_for_each_entry(p2plink, &dev->p2p_link_props, list) { - p2plink->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + p2plink->kobj = kzalloc_obj(struct kobject); if (!p2plink->kobj) return -ENOMEM; ret = kobject_init_and_add(p2plink->kobj, &iolink_type, @@ -935,17 +931,12 @@ static void kfd_debug_print_topology(void) dev = list_last_entry(&topology_device_list, struct kfd_topology_device, list); if (dev) { - if (dev->node_props.cpu_cores_count && - dev->node_props.simd_count) { - pr_info("Topology: Add APU node [0x%0x:0x%0x]\n", - dev->node_props.device_id, - dev->node_props.vendor_id); - } else if (dev->node_props.cpu_cores_count) + if (dev->node_props.cpu_cores_count) pr_info("Topology: Add CPU node\n"); - else if 
(dev->node_props.simd_count) - pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n", - dev->node_props.device_id, - dev->node_props.vendor_id); + else + pr_info("Topology: Add GPU node [0x%0x:0x%0x]\n", + dev->node_props.vendor_id, + dev->node_props.device_id); } up_read(&topology_lock); } @@ -968,24 +959,23 @@ static void kfd_update_system_properties(void) up_read(&topology_lock); } -static void find_system_memory(const struct dmi_header *dm, - void *private) +static void find_system_memory(const struct dmi_header *dm, void *private) { + struct dmi_mem_device *memdev = container_of(dm, struct dmi_mem_device, header); struct kfd_mem_properties *mem; - u16 mem_width, mem_clock; struct kfd_topology_device *kdev = (struct kfd_topology_device *)private; - const u8 *dmi_data = (const u8 *)(dm + 1); - - if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) { - mem_width = (u16)(*(const u16 *)(dmi_data + 0x6)); - mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11)); - list_for_each_entry(mem, &kdev->mem_props, list) { - if (mem_width != 0xFFFF && mem_width != 0) - mem->width = mem_width; - if (mem_clock != 0) - mem->mem_clk_max = mem_clock; - } + + if (memdev->header.type != DMI_ENTRY_MEM_DEVICE) + return; + if (memdev->header.length < sizeof(struct dmi_mem_device)) + return; + + list_for_each_entry(mem, &kdev->mem_props, list) { + if (memdev->total_width != 0xFFFF && memdev->total_width != 0) + mem->width = memdev->total_width; + if (memdev->speed != 0) + mem->mem_clk_max = memdev->speed; } } @@ -1284,34 +1274,41 @@ static void kfd_set_recommended_sdma_engines(struct kfd_topology_device *to_dev, { struct kfd_node *gpu = outbound_link->gpu; struct amdgpu_device *adev = gpu->adev; - int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes; + unsigned int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes; + unsigned int num_xgmi_sdma_engines = kfd_get_num_xgmi_sdma_engines(gpu); + unsigned int num_sdma_engines = kfd_get_num_sdma_engines(gpu); + uint32_t sdma_eng_id_mask = 
(1 << num_sdma_engines) - 1; + uint32_t xgmi_sdma_eng_id_mask = + ((1 << num_xgmi_sdma_engines) - 1) << num_sdma_engines; + bool support_rec_eng = !amdgpu_sriov_vf(adev) && to_dev->gpu && adev->aid_mask && num_xgmi_nodes && gpu->kfd->num_nodes == 1 && - kfd_get_num_xgmi_sdma_engines(gpu) >= 14 && - (!(adev->flags & AMD_IS_APU) && num_xgmi_nodes == 8); + num_xgmi_sdma_engines >= 6 && (!(adev->flags & AMD_IS_APU) && + num_xgmi_nodes == 8); if (support_rec_eng) { int src_socket_id = adev->gmc.xgmi.physical_node_id; int dst_socket_id = to_dev->gpu->adev->gmc.xgmi.physical_node_id; + unsigned int reshift = num_xgmi_sdma_engines == 6 ? 1 : 0; outbound_link->rec_sdma_eng_id_mask = - 1 << rec_sdma_eng_map[src_socket_id][dst_socket_id]; + 1 << (rec_sdma_eng_map[src_socket_id][dst_socket_id] >> reshift); inbound_link->rec_sdma_eng_id_mask = - 1 << rec_sdma_eng_map[dst_socket_id][src_socket_id]; - } else { - int num_sdma_eng = kfd_get_num_sdma_engines(gpu); - int i, eng_offset = 0; + 1 << (rec_sdma_eng_map[dst_socket_id][src_socket_id] >> reshift); - if (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI && - kfd_get_num_xgmi_sdma_engines(gpu) && to_dev->gpu) { - eng_offset = num_sdma_eng; - num_sdma_eng = kfd_get_num_xgmi_sdma_engines(gpu); - } + /* If recommended engine is out of range, need to reset the mask */ + if (outbound_link->rec_sdma_eng_id_mask & sdma_eng_id_mask) + outbound_link->rec_sdma_eng_id_mask = xgmi_sdma_eng_id_mask; + if (inbound_link->rec_sdma_eng_id_mask & sdma_eng_id_mask) + inbound_link->rec_sdma_eng_id_mask = xgmi_sdma_eng_id_mask; - for (i = 0; i < num_sdma_eng; i++) { - outbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset)); - inbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset)); - } + } else { + uint32_t engine_mask = (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI && + num_xgmi_sdma_engines && to_dev->gpu) ? 
xgmi_sdma_eng_id_mask : + sdma_eng_id_mask; + + outbound_link->rec_sdma_eng_id_mask = engine_mask; + inbound_link->rec_sdma_eng_id_mask = engine_mask; } } @@ -1384,7 +1381,7 @@ static int kfd_build_p2p_node_entry(struct kfd_topology_device *dev, { int ret; - p2plink->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + p2plink->kobj = kzalloc_obj(struct kobject); if (!p2plink->kobj) return -ENOMEM; @@ -1593,7 +1590,8 @@ static int kfd_dev_create_p2p_links(void) break; if (!dev->gpu || !dev->gpu->adev || (dev->gpu->kfd->hive_id && - dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id)) + dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id && + amdgpu_xgmi_get_is_sharing_enabled(dev->gpu->adev, new_dev->gpu->adev))) goto next; /* check if node(s) is/are peer accessible in one direction or bi-direction */ @@ -1683,17 +1681,32 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext, int cache_type, unsigned int cu_processor_id, struct kfd_node *knode) { - unsigned int cu_sibling_map_mask; + unsigned int cu_sibling_map_mask = 0; int first_active_cu; int i, j, k, xcc, start, end; int num_xcc = NUM_XCC(knode->xcc_mask); struct kfd_cache_properties *pcache = NULL; enum amdgpu_memory_partition mode; struct amdgpu_device *adev = knode->adev; + bool found = false; start = ffs(knode->xcc_mask) - 1; end = start + num_xcc; - cu_sibling_map_mask = cu_info->bitmap[start][0][0]; + + /* To find the bitmap in the first active cu in the first + * xcc, it is based on the assumption that evrey xcc must + * have at least one active cu. 
+ */ + for (i = 0; i < gfx_info->max_shader_engines && !found; i++) { + for (j = 0; j < gfx_info->max_sh_per_se && !found; j++) { + if (cu_info->bitmap[start][i % 4][j % 4]) { + cu_sibling_map_mask = + cu_info->bitmap[start][i % 4][j % 4]; + found = true; + } + } + } + cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1); first_active_cu = ffs(cu_sibling_map_mask); @@ -2000,18 +2013,23 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev) dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED; - dev->node_props.capability |= HSA_CAP_PER_QUEUE_RESET_SUPPORTED; + if (!amdgpu_sriov_vf(dev->gpu->adev)) + dev->node_props.capability |= HSA_CAP_PER_QUEUE_RESET_SUPPORTED; + } else { dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10 | HSA_DBG_WATCH_ADDR_MASK_HI_BIT; - if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0)) - dev->node_props.capability |= - HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED; - if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 0, 0)) dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED; + + if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 1, 0)) { + dev->node_props.capability |= + HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED; + dev->node_props.capability2 |= + HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED; + } } kfd_topology_set_dbg_firmware_support(dev); @@ -2279,6 +2297,17 @@ int kfd_topology_remove_device(struct kfd_node *gpu) return res; } +uint32_t kfd_topology_get_num_devices(void) +{ + uint32_t num_devices; + + down_read(&topology_lock); + num_devices = sys_props.num_devices; + up_read(&topology_lock); + + return num_devices; +} + /* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD * topology. 
If GPU device is found @idx, then valid kfd_dev pointer is * returned through @kdev @@ -2339,6 +2368,28 @@ int kfd_numa_node_to_apic_id(int numa_node_id) return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id)); } +/* kfd_gpu_node_num - Return kfd gpu node number at system */ +uint32_t kfd_gpu_node_num(void) +{ + struct kfd_node *dev; + u8 gpu_num = 0; + u8 id = 0; + + while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { + if (!dev || kfd_devcgroup_check_permission(dev)) { + /* Skip non GPU devices and devices to which the + * current process have no access to + */ + id++; + continue; + } + id++; + gpu_num++; + } + + return gpu_num; +} + #if defined(CONFIG_DEBUG_FS) int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data) @@ -2392,3 +2443,26 @@ int kfd_debugfs_rls_by_device(struct seq_file *m, void *data) } #endif + +void kfd_update_svm_support_properties(struct amdgpu_device *adev) +{ + struct kfd_topology_device *dev; + int ret; + + down_write(&topology_lock); + list_for_each_entry(dev, &topology_device_list, list) { + if (!dev->gpu || dev->gpu->adev != adev) + continue; + + if (KFD_IS_SVM_API_SUPPORTED(adev)) { + dev->node_props.capability |= HSA_CAP_SVMAPI_SUPPORTED; + ret = kfd_topology_update_sysfs(); + if (!ret) + sys_props.generation_count++; + else + dev_err(adev->dev, "Failed to update SVM support properties. 
ret=%d\n", ret); + } else + dev->node_props.capability &= ~HSA_CAP_SVMAPI_SUPPORTED; + } + up_write(&topology_lock); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index 155b5c410af1..ad63ba67b577 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -24,6 +24,7 @@ #ifndef __KFD_TOPOLOGY_H__ #define __KFD_TOPOLOGY_H__ +#include <linux/dmi.h> #include <linux/types.h> #include <linux/list.h> #include <linux/kfd_sysfs.h> @@ -50,6 +51,7 @@ struct kfd_node_properties { uint32_t cpu_core_id_base; uint32_t simd_id_base; uint32_t capability; + uint32_t capability2; uint64_t debug_prop; uint32_t max_waves_per_simd; uint32_t lds_size_in_kb; @@ -179,8 +181,30 @@ struct kfd_system_properties { struct attribute attr_props; }; +struct dmi_mem_device { + struct dmi_header header; + u16 physical_handle; + u16 error_handle; + u16 total_width; + u16 data_width; + u16 size; + u8 form_factor; + u8 device_set; + u8 device_locator; + u8 bank_locator; + u8 memory_type; + u16 type_detail; + u16 speed; +} __packed; + struct kfd_topology_device *kfd_create_topology_device( struct list_head *device_list); void kfd_release_topology_device_list(struct list_head *device_list); +#if IS_ENABLED(CONFIG_HSA_AMD) +void kfd_update_svm_support_properties(struct amdgpu_device *adev); +#else +static inline void kfd_update_svm_support_properties(struct amdgpu_device *adev) {} +#endif + #endif /* __KFD_TOPOLOGY_H__ */ |
