summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/Kconfig4
-rw-r--r--drivers/gpu/drm/amd/amdkfd/Makefile4
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c25
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h1212
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm940
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_chardev.c256
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_crat.c32
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_debug.c83
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_debug.h3
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c76
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device.c357
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c509
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h22
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c75
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c43
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v11.c43
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c43
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12_1.c98
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c45
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c77
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_events.c136
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_events.h3
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c26
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c151
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c103
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c405
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c157
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c72
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_migrate.c131
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_migrate.h1
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_module.c2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c27
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h9
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c6
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c6
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c28
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c22
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c724
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c164
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c9
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c92
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c107
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c12
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_pasid.c70
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h3
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_priv.h128
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process.c465
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c159
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_queue.c110
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c32
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h1
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_svm.c322
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_svm.h14
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_topology.c218
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_topology.h24
55 files changed, 5799 insertions, 2087 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig
index d3c3d3ab7225..a5d7467c2f34 100644
--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
@@ -5,7 +5,7 @@
config HSA_AMD
bool "HSA kernel driver for AMD GPU devices"
- depends on DRM_AMDGPU && (X86_64 || ARM64 || PPC64)
+ depends on DRM_AMDGPU && (X86_64 || ARM64 || PPC64 || (RISCV && 64BIT) || (LOONGARCH && 64BIT))
select HMM_MIRROR
select MMU_NOTIFIER
select DRM_AMDGPU_USERPTR
@@ -27,7 +27,7 @@ config HSA_AMD_SVM
config HSA_AMD_P2P
bool "HSA kernel driver support for peer-to-peer for AMD GPU devices"
- depends on HSA_AMD && PCI_P2PDMA && DMABUF_MOVE_NOTIFY
+ depends on HSA_AMD && PCI_P2PDMA
help
Enable peer-to-peer (P2P) communication between AMD GPUs over
the PCIe bus. This can improve performance of multi-GPU compute
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 0d3d8972240d..85fc67d521e5 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -27,7 +27,6 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_device.o \
$(AMDKFD_PATH)/kfd_chardev.o \
$(AMDKFD_PATH)/kfd_topology.o \
- $(AMDKFD_PATH)/kfd_pasid.o \
$(AMDKFD_PATH)/kfd_doorbell.o \
$(AMDKFD_PATH)/kfd_flat_memory.o \
$(AMDKFD_PATH)/kfd_process.o \
@@ -39,6 +38,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_mqd_manager_v10.o \
$(AMDKFD_PATH)/kfd_mqd_manager_v11.o \
$(AMDKFD_PATH)/kfd_mqd_manager_v12.o \
+ $(AMDKFD_PATH)/kfd_mqd_manager_v12_1.o \
$(AMDKFD_PATH)/kfd_kernel_queue.o \
$(AMDKFD_PATH)/kfd_packet_manager.o \
$(AMDKFD_PATH)/kfd_packet_manager_vi.o \
@@ -51,12 +51,14 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_v10.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_v11.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_v12.o \
+ $(AMDKFD_PATH)/kfd_device_queue_manager_v12_1.o \
$(AMDKFD_PATH)/kfd_interrupt.o \
$(AMDKFD_PATH)/kfd_events.o \
$(AMDKFD_PATH)/cik_event_interrupt.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_int_process_v10.o \
$(AMDKFD_PATH)/kfd_int_process_v11.o \
+ $(AMDKFD_PATH)/kfd_int_process_v12_1.o \
$(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o \
$(AMDKFD_PATH)/kfd_debug.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 795382b55e0a..b799c70f5742 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -91,36 +91,41 @@ static void cik_event_interrupt_wq(struct kfd_node *dev,
const struct cik_ih_ring_entry *ihre =
(const struct cik_ih_ring_entry *)ih_ring_entry;
uint32_t context_id = ihre->data & 0xfffffff;
- unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8;
u32 pasid = (ihre->ring_id & 0xffff0000) >> 16;
if (pasid == 0)
return;
if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE)
- kfd_signal_event_interrupt(pasid, context_id, 28);
+ kfd_signal_event_interrupt(pasid, context_id, 28, true);
else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP)
- kfd_signal_event_interrupt(pasid, context_id, 28);
+ kfd_signal_event_interrupt(pasid, context_id, 28, true);
else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG)
- kfd_signal_event_interrupt(pasid, context_id & 0xff, 8);
+ kfd_signal_event_interrupt(pasid, context_id & 0xff, 8, true);
else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
kfd_signal_hw_exception_event(pasid);
else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
+ struct kfd_process_device *pdd = NULL;
struct kfd_vm_fault_info info;
+ struct kfd_process *p;
kfd_smi_event_update_vmfault(dev, pasid);
- kfd_dqm_evict_pasid(dev->dqm, pasid);
+ p = kfd_lookup_process_by_pasid(pasid, &pdd);
+ if (!pdd)
+ return;
+
+ kfd_evict_process_device(pdd);
memset(&info, 0, sizeof(info));
amdgpu_amdkfd_gpuvm_get_vm_fault_info(dev->adev, &info);
- if (!info.page_addr && !info.status)
+ if (!info.page_addr && !info.status) {
+ kfd_unref_process(p);
return;
+ }
- if (info.vmid == vmid)
- kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
- else
- kfd_signal_vm_fault_event(dev, pasid, NULL, NULL);
+ kfd_signal_vm_fault_event(pdd, &info, NULL);
+ kfd_unref_process(p);
}
}
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 651660958e5b..54fa76f374c9 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -3644,14 +3644,18 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = {
};
static const uint32_t cwsr_trap_gfx12_hex[] = {
- 0xbfa00001, 0xbfa0024b,
- 0xb0804009, 0xb8f8f804,
+ 0xbfa00001, 0xbfa00239,
+ 0xb0804009, 0xb8eef81a,
+ 0xbf880000, 0xb980081a,
+ 0x00000000, 0xb8f8f804,
+ 0x9177ff77, 0x0c000000,
+ 0x846e9a6e, 0x8c776e77,
0x9178ff78, 0x00008c00,
0xb8fbf811, 0x8b6eff78,
0x00004000, 0xbfa10008,
0x8b6eff7b, 0x00000080,
0xbfa20018, 0x8b6ea07b,
- 0xbfa20042, 0xbf830010,
+ 0xbfa2004a, 0xbf830010,
0xb8fbf811, 0xbfa0fffb,
0x8b6eff7b, 0x00000bd0,
0xbfa20010, 0xb8eef812,
@@ -3662,28 +3666,32 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
0xf0000000, 0xbfa20005,
0x8b6fff6f, 0x00000200,
0xbfa20002, 0x8b6ea07b,
- 0xbfa2002c, 0xbefa4d82,
+ 0xbfa20034, 0xbefa4d82,
0xbf8a0000, 0x84fa887a,
0xbf0d8f7b, 0xbfa10002,
0x8c7bff7b, 0xffff0000,
- 0xf4601bbd, 0xf8000010,
- 0xbf8a0000, 0x846e976e,
- 0x9177ff77, 0x00800000,
- 0x8c776e77, 0xf4603bbd,
- 0xf8000000, 0xbf8a0000,
- 0xf4603ebd, 0xf8000008,
- 0xbf8a0000, 0x8bee6e6e,
- 0xbfa10001, 0xbe80486e,
- 0x8b6eff6d, 0xf0000000,
- 0xbfa20009, 0xb8eef811,
- 0x8b6eff6e, 0x00000080,
- 0xbfa20007, 0x8c78ff78,
- 0x00004000, 0x80ec886c,
- 0x82ed806d, 0xbfa00002,
- 0x806c846c, 0x826d806d,
- 0x8b6dff6d, 0x0000ffff,
- 0x8bfe7e7e, 0x8bea6a6a,
- 0x85788978, 0xb9783244,
+ 0x8b6eff77, 0x0c000000,
+ 0x916dff6d, 0x0c000000,
+ 0x8c6d6e6d, 0xf4601bbd,
+ 0xf8000010, 0xbf8a0000,
+ 0x846e976e, 0x9177ff77,
+ 0x00800000, 0x8c776e77,
+ 0xf4603bbd, 0xf8000000,
+ 0xbf8a0000, 0xf4603ebd,
+ 0xf8000008, 0xbf8a0000,
+ 0x8bee6e6e, 0xbfa10001,
+ 0xbe80486e, 0x8b6eff6d,
+ 0xf0000000, 0xbfa20009,
+ 0xb8eef811, 0x8b6eff6e,
+ 0x00000080, 0xbfa20007,
+ 0x8c78ff78, 0x00004000,
+ 0x80ec886c, 0x82ed806d,
+ 0xbfa00002, 0x806c846c,
+ 0x826d806d, 0x8b6dff6d,
+ 0x0000ffff, 0x8bfe7e7e,
+ 0x8bea6a6a, 0x85788978,
+ 0x936eff77, 0x0002001a,
+ 0xb96ef81a, 0xb9783244,
0xbe804a6c, 0xb8faf802,
0xbf0d987a, 0xbfa10001,
0xbfb00000, 0x8b6dff6d,
@@ -3703,66 +3711,57 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
0x807a817a, 0xbf0d997b,
0xbfa20002, 0x847a897a,
0xbfa00001, 0x847a8a7a,
- 0xb8fb1e06, 0x847b8a7b,
- 0x807a7b7a, 0x8b7bff7f,
- 0x0000ffff, 0x807aff7a,
- 0x00000200, 0x807a7e7a,
- 0x827b807b, 0xd7610000,
- 0x00010870, 0xd7610000,
- 0x00010a71, 0xd7610000,
- 0x00010c72, 0xd7610000,
- 0x00010e73, 0xd7610000,
- 0x00011074, 0xd7610000,
- 0x00011275, 0xd7610000,
- 0x00011476, 0xd7610000,
- 0x00011677, 0xd7610000,
- 0x00011a79, 0xd7610000,
- 0x00011c7e, 0xd7610000,
- 0x00011e7f, 0xbefe00ff,
- 0x00003fff, 0xbeff0080,
- 0xee0a407a, 0x000c0000,
- 0x00004000, 0xd760007a,
- 0x00011d00, 0xd760007b,
- 0x00011f00, 0xbefe007a,
- 0xbeff007b, 0xbef4007e,
- 0x8b75ff7f, 0x0000ffff,
- 0x8c75ff75, 0x00040000,
- 0xbef60080, 0xbef700ff,
- 0x10807fac, 0xbef1007d,
- 0xbef00080, 0xb8f30742,
- 0x84739973, 0xbefe00c1,
- 0x857d9973, 0x8b7d817d,
- 0xbf06817d, 0xbfa20002,
- 0xbeff0080, 0xbfa00002,
- 0xbeff00c1, 0xbfa0000c,
- 0xbef600ff, 0x01000000,
- 0xc4068070, 0x008ce801,
- 0x00008000, 0xc4068070,
- 0x008ce802, 0x00010000,
- 0xc4068070, 0x008ce803,
- 0x00018000, 0xbfa0000b,
- 0xbef600ff, 0x01000000,
- 0xc4068070, 0x008ce801,
- 0x00010000, 0xc4068070,
- 0x008ce802, 0x00020000,
- 0xc4068070, 0x008ce803,
- 0x00030000, 0xb8f03b05,
- 0x80708170, 0xbf0d9973,
- 0xbfa20002, 0x84708970,
- 0xbfa00001, 0x84708a70,
- 0xb8fa1e06, 0x847a8a7a,
- 0x80707a70, 0x8070ff70,
- 0x00000200, 0xbef600ff,
- 0x01000000, 0x7e000280,
+ 0x8b7bff7f, 0x0000ffff,
+ 0x807aff7a, 0x00000240,
+ 0x807a7e7a, 0x827b807b,
+ 0xd7610000, 0x00010870,
+ 0xd7610000, 0x00010a71,
+ 0xd7610000, 0x00010c72,
+ 0xd7610000, 0x00010e73,
+ 0xd7610000, 0x00011074,
+ 0xd7610000, 0x00011275,
+ 0xd7610000, 0x00011476,
+ 0xd7610000, 0x00011677,
+ 0xd7610000, 0x00011a79,
+ 0xd7610000, 0x00011c7e,
+ 0xd7610000, 0x00011e7f,
+ 0xbefe00ff, 0x00003fff,
+ 0xbeff0080, 0xee0a407a,
+ 0x000c0000, 0x00000000,
+ 0xd760007a, 0x00011d00,
+ 0xd760007b, 0x00011f00,
+ 0xbefe007a, 0xbeff007b,
+ 0xbef4007e, 0x8b75ff7f,
+ 0x0000ffff, 0xbef1007d,
+ 0xb8f30742, 0x84739973,
+ 0xbefe00c1, 0x857d9973,
+ 0x8b7d817d, 0xbf06817d,
+ 0xbfa20002, 0xbeff0080,
+ 0xbfa00002, 0xbeff00c1,
+ 0xbfa0000a, 0xee0a4074,
+ 0x008c0000, 0x00008000,
+ 0xee0a4074, 0x010c0000,
+ 0x00010000, 0xee0a4074,
+ 0x018c0000, 0x00018000,
+ 0xbfa00009, 0xee0a4074,
+ 0x008c0000, 0x00010000,
+ 0xee0a4074, 0x010c0000,
+ 0x00020000, 0xee0a4074,
+ 0x018c0000, 0x00030000,
+ 0xb8f03b05, 0x80708170,
+ 0xbf0d9973, 0xbfa20002,
+ 0x84708970, 0xbfa00001,
+ 0x84708a70, 0x8070ff70,
+ 0x00000200, 0x7e000280,
0x7e020280, 0x7e040280,
- 0xbefd0080, 0xbe804ec2,
- 0xbf94fffe, 0xb8faf804,
- 0x8b7a847a, 0x91788478,
- 0x8c787a78, 0xd7610002,
+ 0xbefd0080, 0xd7610002,
0x0000fa71, 0x807d817d,
+ 0xbe804ec2, 0xbf94fffe,
+ 0xb8faf804, 0x8b7a847a,
+ 0x91788478, 0x8c787a78,
0xd7610002, 0x0000fa6c,
- 0x807d817d, 0x917aff6d,
- 0x80000000, 0xd7610002,
+ 0x807d817d, 0x8b7aff6d,
+ 0x0000ffff, 0xd7610002,
0x0000fa7a, 0x807d817d,
0xd7610002, 0x0000fa6e,
0x807d817d, 0xd7610002,
@@ -3770,32 +3769,31 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
0xd7610002, 0x0000fa78,
0x807d817d, 0xb8faf811,
0xd7610002, 0x0000fa7a,
- 0x807d817d, 0xd7610002,
- 0x0000fa7b, 0x807d817d,
- 0xb8f1f801, 0xd7610002,
- 0x0000fa71, 0x807d817d,
- 0xb8f1f814, 0xd7610002,
- 0x0000fa71, 0x807d817d,
- 0xb8f1f815, 0xd7610002,
- 0x0000fa71, 0x807d817d,
- 0xb8f1f812, 0xd7610002,
- 0x0000fa71, 0x807d817d,
- 0xb8f1f813, 0xd7610002,
- 0x0000fa71, 0x807d817d,
- 0xb8faf802, 0xd7610002,
- 0x0000fa7a, 0x807d817d,
- 0xbefa50c1, 0xbfc70000,
+ 0x807d817d, 0xbefa0080,
0xd7610002, 0x0000fa7a,
- 0x807d817d, 0xbefe00ff,
- 0x0000ffff, 0xbeff0080,
- 0xc4068070, 0x008ce802,
- 0x00000000, 0xbefe00c1,
- 0xb8f03b05, 0x80708170,
- 0xbf0d9973, 0xbfa20002,
- 0x84708970, 0xbfa00001,
- 0x84708a70, 0xb8fa1e06,
- 0x847a8a7a, 0x80707a70,
- 0xbef600ff, 0x01000000,
+ 0x807d817d, 0xb8f1f801,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xb8f1f814,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xb8f1f815,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xb8f1f812,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xb8f1f813,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xb8faf802,
+ 0xd7610002, 0x0000fa7a,
+ 0x807d817d, 0xbefa50c1,
+ 0xbfc70000, 0xd7610002,
+ 0x0000fa7a, 0x807d817d,
+ 0xbefe00ff, 0x0000ffff,
+ 0xbeff0080, 0x80767074,
+ 0x82778075, 0xee0a4076,
+ 0x010c0000, 0x00000000,
+ 0xbefe00c1, 0xb8f03b05,
+ 0x80708170, 0xbf0d9973,
+ 0xbfa20002, 0x84708970,
+ 0xbfa00001, 0x84708a70,
0xbef90080, 0xbefd0080,
0xbf800000, 0xbe804100,
0xbe824102, 0xbe844104,
@@ -3826,12 +3824,13 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
0x0000f20e, 0x80798179,
0xd7610002, 0x0000f20f,
0x80798179, 0xbf06a079,
- 0xbfa10007, 0xc4068070,
- 0x008ce802, 0x00000000,
+ 0xbfa10009, 0x80767074,
+ 0x82778075, 0xee0a4076,
+ 0x010c0000, 0x00000000,
0x8070ff70, 0x00000080,
0xbef90080, 0x7e040280,
0x807d907d, 0xbf0aff7d,
- 0x00000060, 0xbfa2ffbb,
+ 0x00000060, 0xbfa2ffb9,
0xbe804100, 0xbe824102,
0xbe844104, 0xbe864106,
0xbe884108, 0xbe8a410a,
@@ -3853,47 +3852,47 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
0xd7610002, 0x0000f20a,
0x80798179, 0xd7610002,
0x0000f20b, 0x80798179,
- 0xc4068070, 0x008ce802,
+ 0x80767074, 0x82778075,
+ 0xee0a4076, 0x010c0000,
0x00000000, 0xbefe00c1,
0x857d9973, 0x8b7d817d,
0xbf06817d, 0xbfa20002,
0xbeff0080, 0xbfa00001,
0xbeff00c1, 0xb8fb4306,
- 0x8b7bc17b, 0xbfa10044,
+ 0x8b7bc17b, 0xbfa10042,
0x8b7aff6d, 0x80000000,
- 0xbfa10041, 0x847b897b,
- 0xbef6007b, 0xb8f03b05,
- 0x80708170, 0xbf0d9973,
- 0xbfa20002, 0x84708970,
- 0xbfa00001, 0x84708a70,
- 0xb8fa1e06, 0x847a8a7a,
- 0x80707a70, 0x8070ff70,
+ 0xbfa1003f, 0x847b897b,
+ 0xb8f03b05, 0x80708170,
+ 0xbf0d9973, 0xbfa20002,
+ 0x84708970, 0xbfa00001,
+ 0x84708a70, 0x8070ff70,
0x00000200, 0x8070ff70,
- 0x00000080, 0xbef600ff,
- 0x01000000, 0xd71f0000,
+ 0x00000080, 0xd71f0000,
0x000100c1, 0xd7200000,
0x000200c1, 0x16000084,
0x857d9973, 0x8b7d817d,
0xbf06817d, 0xbefd0080,
- 0xbfa20013, 0xbe8300ff,
+ 0xbfa20015, 0xbe8300ff,
0x00000080, 0xbf800000,
0xbf800000, 0xbf800000,
0xd8d80000, 0x01000000,
- 0xbf8a0000, 0xc4068070,
- 0x008ce801, 0x00000000,
+ 0xbf8a0000, 0x80767074,
+ 0x82778075, 0xee0a4076,
+ 0x008c0000, 0x00000000,
0x807d037d, 0x80700370,
0xd5250000, 0x0001ff00,
0x00000080, 0xbf0a7b7d,
- 0xbfa2fff3, 0xbfa00012,
+ 0xbfa2fff1, 0xbfa00014,
0xbe8300ff, 0x00000100,
0xbf800000, 0xbf800000,
0xbf800000, 0xd8d80000,
0x01000000, 0xbf8a0000,
- 0xc4068070, 0x008ce801,
+ 0x80767074, 0x82778075,
+ 0xee0a4076, 0x008c0000,
0x00000000, 0x807d037d,
0x80700370, 0xd5250000,
0x0001ff00, 0x00000100,
- 0xbf0a7b7d, 0xbfa2fff3,
+ 0xbf0a7b7d, 0xbfa2fff1,
0xbefe00c1, 0x857d9973,
0x8b7d817d, 0xbf06817d,
0xbfa20004, 0xbef000ff,
@@ -3903,78 +3902,72 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
0xb8fb3b05, 0x807b817b,
0x847b827b, 0x857d9973,
0x8b7d817d, 0xbf06817d,
- 0xbfa2001b, 0xbef600ff,
- 0x01000000, 0xbefd0084,
- 0xbf0a7b7d, 0xbfa10040,
+ 0xbfa2001b, 0xbefd0084,
+ 0xbf0a7b7d, 0xbfa10032,
0x7e008700, 0x7e028701,
0x7e048702, 0x7e068703,
- 0xc4068070, 0x008ce800,
- 0x00000000, 0xc4068070,
- 0x008ce801, 0x00008000,
- 0xc4068070, 0x008ce802,
- 0x00010000, 0xc4068070,
- 0x008ce803, 0x00018000,
+ 0x80767074, 0x82778075,
+ 0xee0a4076, 0x000c0000,
+ 0x00000000, 0xee0a4076,
+ 0x008c0000, 0x00008000,
+ 0xee0a4076, 0x010c0000,
+ 0x00010000, 0xee0a4076,
+ 0x018c0000, 0x00018000,
0x807d847d, 0x8070ff70,
0x00000200, 0xbf0a7b7d,
- 0xbfa2ffeb, 0xbfa0002a,
- 0xbef600ff, 0x01000000,
+ 0xbfa2ffe9, 0xbfa0001a,
0xbefd0084, 0xbf0a7b7d,
- 0xbfa10015, 0x7e008700,
+ 0xbfa10017, 0x7e008700,
0x7e028701, 0x7e048702,
- 0x7e068703, 0xc4068070,
- 0x008ce800, 0x00000000,
- 0xc4068070, 0x008ce801,
- 0x00010000, 0xc4068070,
- 0x008ce802, 0x00020000,
- 0xc4068070, 0x008ce803,
+ 0x7e068703, 0x80767074,
+ 0x82778075, 0xee0a4076,
+ 0x000c0000, 0x00000000,
+ 0xee0a4076, 0x008c0000,
+ 0x00010000, 0xee0a4076,
+ 0x010c0000, 0x00020000,
+ 0xee0a4076, 0x018c0000,
0x00030000, 0x807d847d,
0x8070ff70, 0x00000400,
- 0xbf0a7b7d, 0xbfa2ffeb,
- 0xb8fb1e06, 0x8b7bc17b,
- 0xbfa1000d, 0x847b837b,
- 0x807b7d7b, 0xbefe00c1,
- 0xbeff0080, 0x7e008700,
- 0xc4068070, 0x008ce800,
- 0x00000000, 0x807d817d,
- 0x8070ff70, 0x00000080,
- 0xbf0a7b7d, 0xbfa2fff7,
- 0xbfa0016e, 0xbef4007e,
+ 0xbf0a7b7d, 0xbfa2ffe9,
+ 0xbfa0014c, 0xbef4007e,
0x8b75ff7f, 0x0000ffff,
- 0x8c75ff75, 0x00040000,
- 0xbef60080, 0xbef700ff,
- 0x10807fac, 0xbef1007f,
- 0xb8f20742, 0x84729972,
- 0x8b6eff7f, 0x04000000,
- 0xbfa1003b, 0xbefe00c1,
- 0x857d9972, 0x8b7d817d,
- 0xbf06817d, 0xbfa20002,
- 0xbeff0080, 0xbfa00001,
- 0xbeff00c1, 0xb8ef4306,
- 0x8b6fc16f, 0xbfa10030,
- 0x846f896f, 0xbef6006f,
+ 0xbef1007f, 0xb8f20742,
+ 0x84729972, 0x8b6eff7f,
+ 0x04000000, 0xbfa10044,
+ 0xbefe00c1, 0x857d9972,
+ 0x8b7d817d, 0xbf06817d,
+ 0xbfa20002, 0xbeff0080,
+ 0xbfa00001, 0xbeff00c1,
+ 0xb8ef4306, 0x8b6fc16f,
+ 0xbfa10039, 0x846f896f,
0xb8f83b05, 0x80788178,
0xbf0d9972, 0xbfa20002,
0x84788978, 0xbfa00001,
- 0x84788a78, 0xb8ee1e06,
- 0x846e8a6e, 0x80786e78,
- 0x8078ff78, 0x00000200,
+ 0x84788a78, 0x8078ff78,
+ 0x00000200, 0x8078ff78,
+ 0x00000080, 0x857d9972,
+ 0x8b7d817d, 0xbf06817d,
+ 0xbefd0080, 0xd71f0001,
+ 0x000100c1, 0xd7200001,
+ 0x000202c1, 0x30020282,
+ 0xbfa20012, 0x80767874,
+ 0x82778075, 0xee0a0076,
+ 0x000c0000, 0x00000000,
+ 0xbf8a0000, 0xd8340000,
+ 0x00000001, 0xd5250001,
+ 0x0001ff01, 0x00000080,
+ 0x807dff7d, 0x00000080,
0x8078ff78, 0x00000080,
- 0xbef600ff, 0x01000000,
- 0x857d9972, 0x8b7d817d,
- 0xbf06817d, 0xbefd0080,
- 0xbfa2000d, 0xc4050078,
- 0x0080e800, 0x00000000,
- 0xbf8a0000, 0xdac00000,
- 0x00000000, 0x807dff7d,
- 0x00000080, 0x8078ff78,
- 0x00000080, 0xbf0a6f7d,
- 0xbfa2fff4, 0xbfa0000c,
- 0xc4050078, 0x0080e800,
- 0x00000000, 0xbf8a0000,
- 0xdac00000, 0x00000000,
+ 0xbf0a6f7d, 0xbfa2ffef,
+ 0xbfa00011, 0x80767874,
+ 0x82778075, 0xee0a0076,
+ 0x000c0000, 0x00000000,
+ 0xbf8a0000, 0xd8340000,
+ 0x00000001, 0xd5250001,
+ 0x0001ff01, 0x00000100,
0x807dff7d, 0x00000100,
0x8078ff78, 0x00000100,
- 0xbf0a6f7d, 0xbfa2fff4,
+ 0xbf0a6f7d, 0xbfa2ffef,
0xbef80080, 0xbefe00c1,
0x857d9972, 0x8b7d817d,
0xbf06817d, 0xbfa20002,
@@ -3983,120 +3976,102 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
0x806f816f, 0x846f826f,
0x857d9972, 0x8b7d817d,
0xbf06817d, 0xbfa2002c,
- 0xbef600ff, 0x01000000,
0xbeee0078, 0x8078ff78,
0x00000200, 0xbefd0084,
- 0xbf0a6f7d, 0xbfa10061,
- 0xc4050078, 0x008ce800,
- 0x00000000, 0xc4050078,
- 0x008ce801, 0x00008000,
- 0xc4050078, 0x008ce802,
- 0x00010000, 0xc4050078,
- 0x008ce803, 0x00018000,
+ 0x80767874, 0x82778075,
+ 0xee0a0076, 0x000c0000,
+ 0x00000000, 0xee0a0076,
+ 0x000c0001, 0x00008000,
+ 0xee0a0076, 0x000c0002,
+ 0x00010000, 0xee0a0076,
+ 0x000c0003, 0x00018000,
0xbf8a0000, 0x7e008500,
0x7e028501, 0x7e048502,
0x7e068503, 0x807d847d,
0x8078ff78, 0x00000200,
- 0xbf0a6f7d, 0xbfa2ffea,
- 0xc405006e, 0x008ce800,
- 0x00000000, 0xc405006e,
- 0x008ce801, 0x00008000,
- 0xc405006e, 0x008ce802,
- 0x00010000, 0xc405006e,
- 0x008ce803, 0x00018000,
- 0xbf8a0000, 0xbfa0003d,
- 0xbef600ff, 0x01000000,
+ 0xbf0a6f7d, 0xbfa2ffe8,
+ 0x80766e74, 0x82778075,
+ 0xee0a0076, 0x000c0000,
+ 0x00000000, 0xee0a0076,
+ 0x000c0001, 0x00008000,
+ 0xee0a0076, 0x000c0002,
+ 0x00010000, 0xee0a0076,
+ 0x000c0003, 0x00018000,
+ 0xbf8a0000, 0xbfa0002d,
0xbeee0078, 0x8078ff78,
0x00000400, 0xbefd0084,
- 0xbf0a6f7d, 0xbfa10016,
- 0xc4050078, 0x008ce800,
- 0x00000000, 0xc4050078,
- 0x008ce801, 0x00010000,
- 0xc4050078, 0x008ce802,
- 0x00020000, 0xc4050078,
- 0x008ce803, 0x00030000,
+ 0xbf0a6f7d, 0xbfa10018,
+ 0x80767874, 0x82778075,
+ 0xee0a0076, 0x000c0000,
+ 0x00000000, 0xee0a0076,
+ 0x000c0001, 0x00010000,
+ 0xee0a0076, 0x000c0002,
+ 0x00020000, 0xee0a0076,
+ 0x000c0003, 0x00030000,
0xbf8a0000, 0x7e008500,
0x7e028501, 0x7e048502,
0x7e068503, 0x807d847d,
0x8078ff78, 0x00000400,
- 0xbf0a6f7d, 0xbfa2ffea,
- 0xb8ef1e06, 0x8b6fc16f,
- 0xbfa1000f, 0x846f836f,
- 0x806f7d6f, 0xbefe00c1,
- 0xbeff0080, 0xc4050078,
- 0x008ce800, 0x00000000,
- 0xbf8a0000, 0x7e008500,
- 0x807d817d, 0x8078ff78,
- 0x00000080, 0xbf0a6f7d,
- 0xbfa2fff6, 0xbeff00c1,
- 0xc405006e, 0x008ce800,
- 0x00000000, 0xc405006e,
- 0x008ce801, 0x00010000,
- 0xc405006e, 0x008ce802,
- 0x00020000, 0xc405006e,
- 0x008ce803, 0x00030000,
+ 0xbf0a6f7d, 0xbfa2ffe8,
+ 0x80766e74, 0x82778075,
+ 0xee0a0076, 0x000c0000,
+ 0x00000000, 0xee0a0076,
+ 0x000c0001, 0x00010000,
+ 0xee0a0076, 0x000c0002,
+ 0x00020000, 0xee0a0076,
+ 0x000c0003, 0x00030000,
0xbf8a0000, 0xb8f83b05,
0x80788178, 0xbf0d9972,
0xbfa20002, 0x84788978,
0xbfa00001, 0x84788a78,
- 0xb8ee1e06, 0x846e8a6e,
- 0x80786e78, 0x8078ff78,
- 0x00000200, 0x80f8ff78,
- 0x00000050, 0xbef600ff,
- 0x01000000, 0xbefd00ff,
- 0x0000006c, 0x80f89078,
- 0xf462403a, 0xf0000000,
+ 0x8078ff78, 0x00000200,
+ 0x80f8ff78, 0x00000060,
+ 0x80767874, 0x82778075,
+ 0xbefd00ff, 0x0000006c,
+ 0xf460403b, 0xf8000000,
0xbf8a0000, 0x80fd847d,
0xbf800000, 0xbe804300,
- 0xbe824302, 0x80f8a078,
- 0xf462603a, 0xf0000000,
- 0xbf8a0000, 0x80fd887d,
- 0xbf800000, 0xbe804300,
- 0xbe824302, 0xbe844304,
- 0xbe864306, 0x80f8c078,
- 0xf462803a, 0xf0000000,
+ 0xbe824302, 0x80f6a076,
+ 0x82f78077, 0xf460603b,
+ 0xf8000000, 0xbf8a0000,
+ 0x80fd887d, 0xbf800000,
+ 0xbe804300, 0xbe824302,
+ 0xbe844304, 0xbe864306,
+ 0x80f6c076, 0x82f78077,
+ 0xf460803b, 0xf8000000,
0xbf8a0000, 0x80fd907d,
0xbf800000, 0xbe804300,
0xbe824302, 0xbe844304,
0xbe864306, 0xbe884308,
0xbe8a430a, 0xbe8c430c,
0xbe8e430e, 0xbf06807d,
- 0xbfa1fff0, 0xb980f801,
+ 0xbfa1ffef, 0xb980f801,
0x00000000, 0xb8f83b05,
0x80788178, 0xbf0d9972,
0xbfa20002, 0x84788978,
0xbfa00001, 0x84788a78,
- 0xb8ee1e06, 0x846e8a6e,
- 0x80786e78, 0x8078ff78,
- 0x00000200, 0xbef600ff,
- 0x01000000, 0xbeff0071,
- 0xf4621bfa, 0xf0000000,
- 0x80788478, 0xf4621b3a,
- 0xf0000000, 0x80788478,
- 0xf4621b7a, 0xf0000000,
- 0x80788478, 0xf4621c3a,
- 0xf0000000, 0x80788478,
- 0xf4621c7a, 0xf0000000,
- 0x80788478, 0xf4621eba,
- 0xf0000000, 0x80788478,
- 0xf4621efa, 0xf0000000,
- 0x80788478, 0xf4621e7a,
- 0xf0000000, 0x80788478,
- 0xf4621cfa, 0xf0000000,
- 0x80788478, 0xf4621bba,
- 0xf0000000, 0x80788478,
- 0xbf8a0000, 0xb96ef814,
- 0xf4621bba, 0xf0000000,
- 0x80788478, 0xbf8a0000,
- 0xb96ef815, 0xf4621bba,
- 0xf0000000, 0x80788478,
- 0xbf8a0000, 0xb96ef812,
- 0xf4621bba, 0xf0000000,
- 0x80788478, 0xbf8a0000,
+ 0x8078ff78, 0x00000200,
+ 0x80767874, 0x82778075,
+ 0xbeff0071, 0xf4601bfb,
+ 0xf8000000, 0xf4601b3b,
+ 0xf8000004, 0xf4601b7b,
+ 0xf8000008, 0xf4601c3b,
+ 0xf800000c, 0xf4601c7b,
+ 0xf8000010, 0xf4601ebb,
+ 0xf8000014, 0xf4601efb,
+ 0xf8000018, 0xf4601e7b,
+ 0xf800001c, 0xf4601cfb,
+ 0xf8000020, 0xf4601bbb,
+ 0xf8000024, 0xbf8a0000,
+ 0xb96ef814, 0xf4601bbb,
+ 0xf8000028, 0xbf8a0000,
+ 0xb96ef815, 0xf4601bbb,
+ 0xf800002c, 0xbf8a0000,
+ 0xb96ef812, 0xf4601bbb,
+ 0xf8000030, 0xbf8a0000,
0xb96ef813, 0x8b6eff7f,
- 0x04000000, 0xbfa1000d,
- 0x80788478, 0xf4621bba,
- 0xf0000000, 0x80788478,
+ 0x04000000, 0xbfa1000b,
+ 0xf4601bbb, 0xf8000038,
0xbf8a0000, 0xbf0d806e,
0xbfa10006, 0x856e906e,
0x8b6e6e6e, 0xbfa10003,
@@ -4109,17 +4084,16 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
0xb8ee3b05, 0x806e816e,
0xbf0d9972, 0xbfa20002,
0x846e896e, 0xbfa00001,
- 0x846e8a6e, 0xb8ef1e06,
- 0x846f8a6f, 0x806e6f6e,
- 0x806eff6e, 0x00000200,
- 0x806e746e, 0x826f8075,
- 0x8b6fff6f, 0x0000ffff,
- 0xf4605c37, 0xf8000050,
- 0xf4605d37, 0xf8000060,
- 0xf4601e77, 0xf8000074,
- 0xbf8a0000, 0x8b6dff6d,
- 0x0000ffff, 0x8bfe7e7e,
- 0x8bea6a6a, 0xb97af804,
+ 0x846e8a6e, 0x806eff6e,
+ 0x00000240, 0x806e746e,
+ 0x826f8075, 0xf4605c37,
+ 0xf8000010, 0xf4605d37,
+ 0xf8000020, 0xf4601e77,
+ 0xf8000034, 0xbf8a0000,
+ 0x8b6dff6d, 0x0000ffff,
+ 0x8bfe7e7e, 0x8bea6a6a,
+ 0x936eff77, 0x0002001a,
+ 0xb96ef81a, 0xb97af804,
0xbe804ec2, 0xbf94fffe,
0xbe804a6c, 0xbe804ec2,
0xbf94fffe, 0xbfb10000,
@@ -4611,3 +4585,687 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
0xbf8a0000, 0xbe801f6c,
0xbf9b0000, 0x00000000,
};
+
+static const uint32_t cwsr_trap_gfx12_1_0_hex[] = {
+ 0xbfa00001, 0xbfa003be,
+ 0xb0804009, 0xb8f8f804,
+ 0x9178ff78, 0x00008c00,
+ 0xb8fbf811, 0x8b6eff78,
+ 0x00004000, 0xbfa10008,
+ 0x8b6eff7b, 0x00000080,
+ 0xbfa20018, 0x8b6ea07b,
+ 0xbfa200da, 0xbf830010,
+ 0xb8fbf811, 0xbfa0fffb,
+ 0x8b6eff7b, 0x00000bd0,
+ 0xbfa20010, 0xb8eef812,
+ 0x8b6f8f7b, 0xbfa10002,
+ 0x8c6eff6e, 0x00000080,
+ 0xb8eff813, 0x8b6e6e6f,
+ 0xbfa20008, 0x8b6eff6d,
+ 0xf0000000, 0xbfa20005,
+ 0x8b6fff6f, 0x00000200,
+ 0xbfa20002, 0x8b6ea07b,
+ 0xbfa200c4, 0x9177ff77,
+ 0x007fc000, 0xb8fa04a1,
+ 0x847a967a, 0x8c777a77,
+ 0xb8fa0421, 0x847a957a,
+ 0x8c777a77, 0xb8fa3021,
+ 0x847a8e7a, 0x8c777a77,
+ 0xb980f821, 0x00000000,
+ 0xbefa4d82, 0xbf8a0000,
+ 0x84fa887a, 0xbf0d987b,
+ 0xbfa10002, 0x8c7bff7b,
+ 0xfe000000, 0xf4601bbd,
+ 0xf8000010, 0xbf8a0000,
+ 0x846e976e, 0x9177ff77,
+ 0x00800000, 0x8c776e77,
+ 0xf4603bbd, 0xf8000000,
+ 0xbf8a0000, 0xf4603ebd,
+ 0xf8000008, 0xbf8a0000,
+ 0x8bee6e6e, 0xbfa10001,
+ 0xbe80486e, 0x8b6eff6d,
+ 0xf0000000, 0xbfa20009,
+ 0xb8eef811, 0x8b6eff6e,
+ 0x00000080, 0xbfa20007,
+ 0x8c78ff78, 0x00004000,
+ 0x80ec886c, 0x82ed806d,
+ 0xbfa00002, 0x806c846c,
+ 0x826d806d, 0x8b6dff6d,
+ 0x01ffffff, 0xb8fbf811,
+ 0xbf0d847b, 0xbfa20081,
+ 0xf4003eb6, 0xf8000000,
+ 0xbfc70000, 0xf4003bb6,
+ 0xf8000008, 0x8b76ff7a,
+ 0x80000000, 0xbfa20027,
+ 0x9376ff7a, 0x00060019,
+ 0x81f9a376, 0xbf0b8179,
+ 0xbfa2006e, 0x81f9ac76,
+ 0xbf0b8179, 0xbfa20068,
+ 0x81f9b776, 0xbf0b8179,
+ 0xbfa20065, 0x8b76ff7a,
+ 0x000001ff, 0xbf06ff76,
+ 0x000000fe, 0xbfa20063,
+ 0xbf06ff76, 0x000000ff,
+ 0xbfa2005d, 0xbf06ff76,
+ 0x000000fa, 0xbfa2005a,
+ 0x81f9ff76, 0x000000e9,
+ 0xbf0b8179, 0xbfa20056,
+ 0x8b76ff7b, 0xffff0000,
+ 0xbf06ff76, 0xbf860000,
+ 0xbfa1005a, 0x9376ff7b,
+ 0x0002000e, 0x8b79ff7b,
+ 0x00003f00, 0x85798679,
+ 0x8c767976, 0xb9763b01,
+ 0xbfa00052, 0x8b76ff7a,
+ 0xfc000000, 0xbf06ff76,
+ 0xd4000000, 0xbfa20019,
+ 0xbf06ff76, 0xc8000000,
+ 0xbfa2002d, 0x8b76ff7a,
+ 0xff000000, 0xbf06ff76,
+ 0xcf000000, 0xbfa2003f,
+ 0x8b79ff7a, 0xffff0000,
+ 0xbf06ff79, 0xcc330000,
+ 0xbfa2003d, 0xbf06ff79,
+ 0xcc880000, 0xbfa2003a,
+ 0xbf06ff79, 0xcc350000,
+ 0xbfa2003a, 0xbf06ff79,
+ 0xcc3a0000, 0xbfa20037,
+ 0xbf06ff76, 0xcc000000,
+ 0xbfa10034, 0x8b76ff7b,
+ 0x000001ff, 0xbf06ff76,
+ 0x000000ff, 0xbfa20029,
+ 0xbf06ff76, 0x000000fa,
+ 0xbfa20026, 0x81f6ff76,
+ 0x000000e9, 0xbf0b8176,
+ 0xbfa20022, 0x8b76ff7b,
+ 0x0003fe00, 0xbf06ff76,
+ 0x0001fe00, 0xbfa2001d,
+ 0x8b76ff7b, 0x07fc0000,
+ 0xbf06ff76, 0x03fc0000,
+ 0xbfa20018, 0xbfa00014,
+ 0x9376ff7a, 0x00040016,
+ 0x81f68176, 0xbf0b8176,
+ 0xbfa20012, 0x9376ff7a,
+ 0x00050011, 0x81f68176,
+ 0xbf0b8176, 0xbfa2000d,
+ 0x8b76ff7a, 0x000001ff,
+ 0xbf06ff76, 0x000000ff,
+ 0xbfa20008, 0x8b76ff7b,
+ 0x000001ff, 0xbf06ff76,
+ 0x000000ff, 0xbfa20003,
+ 0xbfc70000, 0xbefb006e,
+ 0xbfa0ffa7, 0xbfc70000,
+ 0xbefb006f, 0xbfa0ffa4,
+ 0x80ec886c, 0x82ed806d,
+ 0xbfa0fff7, 0xbfc70000,
+ 0x857a9677, 0xb97a04a1,
+ 0x857a9577, 0xb97a0421,
+ 0x857a8e77, 0xb97a3021,
+ 0x8bfe7e7e, 0x8bea6a6a,
+ 0x85788978, 0xb9783244,
+ 0xbe804a6c, 0xb8faf802,
+ 0xbf0d987a, 0xbfa10001,
+ 0xbfb00000, 0x8b6dff6d,
+ 0x01ffffff, 0xbefa0080,
+ 0xb97a0151, 0x9177ff77,
+ 0x007fc000, 0xb8fa04a1,
+ 0x847a967a, 0x8c777a77,
+ 0xb8fa0421, 0x847a957a,
+ 0x8c777a77, 0xb8fa3021,
+ 0x847a8e7a, 0x8c777a77,
+ 0xb980f821, 0x00000000,
+ 0xbf0d847b, 0xbfa20081,
+ 0xf4003eb6, 0xf8000000,
+ 0xbfc70000, 0xf4003bb6,
+ 0xf8000008, 0x8b76ff7a,
+ 0x80000000, 0xbfa20027,
+ 0x9376ff7a, 0x00060019,
+ 0x81f9a376, 0xbf0b8179,
+ 0xbfa2006e, 0x81f9ac76,
+ 0xbf0b8179, 0xbfa20068,
+ 0x81f9b776, 0xbf0b8179,
+ 0xbfa20065, 0x8b76ff7a,
+ 0x000001ff, 0xbf06ff76,
+ 0x000000fe, 0xbfa20063,
+ 0xbf06ff76, 0x000000ff,
+ 0xbfa2005d, 0xbf06ff76,
+ 0x000000fa, 0xbfa2005a,
+ 0x81f9ff76, 0x000000e9,
+ 0xbf0b8179, 0xbfa20056,
+ 0x8b76ff7b, 0xffff0000,
+ 0xbf06ff76, 0xbf860000,
+ 0xbfa1005a, 0x9376ff7b,
+ 0x0002000e, 0x8b79ff7b,
+ 0x00003f00, 0x85798679,
+ 0x8c767976, 0xb9763b01,
+ 0xbfa00052, 0x8b76ff7a,
+ 0xfc000000, 0xbf06ff76,
+ 0xd4000000, 0xbfa20019,
+ 0xbf06ff76, 0xc8000000,
+ 0xbfa2002d, 0x8b76ff7a,
+ 0xff000000, 0xbf06ff76,
+ 0xcf000000, 0xbfa2003f,
+ 0x8b79ff7a, 0xffff0000,
+ 0xbf06ff79, 0xcc330000,
+ 0xbfa2003d, 0xbf06ff79,
+ 0xcc880000, 0xbfa2003a,
+ 0xbf06ff79, 0xcc350000,
+ 0xbfa2003a, 0xbf06ff79,
+ 0xcc3a0000, 0xbfa20037,
+ 0xbf06ff76, 0xcc000000,
+ 0xbfa10034, 0x8b76ff7b,
+ 0x000001ff, 0xbf06ff76,
+ 0x000000ff, 0xbfa20029,
+ 0xbf06ff76, 0x000000fa,
+ 0xbfa20026, 0x81f6ff76,
+ 0x000000e9, 0xbf0b8176,
+ 0xbfa20022, 0x8b76ff7b,
+ 0x0003fe00, 0xbf06ff76,
+ 0x0001fe00, 0xbfa2001d,
+ 0x8b76ff7b, 0x07fc0000,
+ 0xbf06ff76, 0x03fc0000,
+ 0xbfa20018, 0xbfa00014,
+ 0x9376ff7a, 0x00040016,
+ 0x81f68176, 0xbf0b8176,
+ 0xbfa20012, 0x9376ff7a,
+ 0x00050011, 0x81f68176,
+ 0xbf0b8176, 0xbfa2000d,
+ 0x8b76ff7a, 0x000001ff,
+ 0xbf06ff76, 0x000000ff,
+ 0xbfa20008, 0x8b76ff7b,
+ 0x000001ff, 0xbf06ff76,
+ 0x000000ff, 0xbfa20003,
+ 0xbfc70000, 0xbefb006e,
+ 0xbfa0ffa7, 0xbfc70000,
+ 0xbefb006f, 0xbfa0ffa4,
+ 0x80ec886c, 0x82ed806d,
+ 0xbfa0fff7, 0xbfc70000,
+ 0xbeee007e, 0xbeef007f,
+ 0xbefe0180, 0xbefe4d84,
+ 0xbf8a0000, 0x8b7aff7f,
+ 0x04000000, 0x847a857a,
+ 0x8c6d7a6d, 0xb8eff822,
+ 0xb980f822, 0x00000000,
+ 0xb8fa2b01, 0x847a997a,
+ 0x8c6d7a6d, 0xbefa0080,
+ 0xb97a2b01, 0xbefa007e,
+ 0x8b7bff7f, 0x01ffffff,
+ 0xbefe00c1, 0xbeff00c1,
+ 0xee0a407a, 0x000c0000,
+ 0x00000000, 0x7e000280,
+ 0xbefe007a, 0xbeff007b,
+ 0xb8fb0742, 0x847b997b,
+ 0xb8fa3b05, 0x807a817a,
+ 0xbf0d997b, 0xbfa20002,
+ 0x847a897a, 0xbfa00001,
+ 0x847a8a7a, 0x8b7bff7f,
+ 0x01ffffff, 0x807aff7a,
+ 0x000001c0, 0x807a7e7a,
+ 0x827b807b, 0xd7610000,
+ 0x00010870, 0xd7610000,
+ 0x00010a71, 0xd7610000,
+ 0x00010c72, 0xd7610000,
+ 0x00010e73, 0xd7610000,
+ 0x00011074, 0xd7610000,
+ 0x00011275, 0xd7610000,
+ 0x00011476, 0xd7610000,
+ 0x00011677, 0xd7610000,
+ 0x00011a79, 0xd7610000,
+ 0x00011c7e, 0xd7610000,
+ 0x00011e7f, 0xbefe00ff,
+ 0x00003fff, 0xbeff0080,
+ 0xee0a407a, 0x000c0000,
+ 0x00000000, 0xd760007a,
+ 0x00011d00, 0xd760007b,
+ 0x00011f00, 0xbefe007a,
+ 0xbeff007b, 0xbef4007e,
+ 0x8b75ff7f, 0x01ffffff,
+ 0xbef1007d, 0xb8f30742,
+ 0x84739973, 0xbefe00c1,
+ 0x857d9973, 0x8b7d817d,
+ 0xbf06817d, 0xbfa20002,
+ 0xbeff0080, 0xbfa00002,
+ 0xbeff00c1, 0xbfa0000a,
+ 0xee0a4074, 0x008c0000,
+ 0x00008000, 0xee0a4074,
+ 0x010c0000, 0x00010000,
+ 0xee0a4074, 0x018c0000,
+ 0x00018000, 0xbfa00009,
+ 0xee0a4074, 0x008c0000,
+ 0x00010000, 0xee0a4074,
+ 0x010c0000, 0x00020000,
+ 0xee0a4074, 0x018c0000,
+ 0x00030000, 0xb8f03b05,
+ 0x80708170, 0xbf0d9973,
+ 0xbfa20002, 0x84708970,
+ 0xbfa00001, 0x84708a70,
+ 0x8070ff70, 0x00000200,
+ 0x7e000280, 0x7e020280,
+ 0x7e040280, 0xbefd0080,
+ 0xd7610002, 0x0000fa71,
+ 0x807d817d, 0xb8faf802,
+ 0xbf0c8b7a, 0xbfa20003,
+ 0xbe804fc2, 0xbf94fffe,
+ 0xbfa10001, 0xbe804ec4,
+ 0xbf94fffc, 0xbefa4c88,
+ 0xbfc70000, 0xbf0c807a,
+ 0xbfa20006, 0x9371ff7a,
+ 0x00070004, 0x937aff7a,
+ 0x00070010, 0xbf06717a,
+ 0xbfa2fff6, 0xb8faf804,
+ 0x8b7aff7a, 0x0001000c,
+ 0x9178ff78, 0x0001000c,
+ 0x8c787a78, 0xd7610002,
+ 0x0000fa6c, 0x807d817d,
+ 0x8b7aff6d, 0x01ffffff,
+ 0xd7610002, 0x0000fa7a,
+ 0x807d817d, 0xd7610002,
+ 0x0000fa6e, 0x807d817d,
+ 0xbefa0080, 0xd7610002,
+ 0x0000fa7a, 0x807d817d,
+ 0xd7610002, 0x0000fa78,
+ 0x807d817d, 0xb8faf811,
+ 0xd7610002, 0x0000fa7a,
+ 0x807d817d, 0xd7610002,
+ 0x0000fa6f, 0x807d817d,
+ 0xb8f1f801, 0x937aff6d,
+ 0x00060019, 0x847a8c7a,
+ 0x8c717a71, 0xd7610002,
+ 0x0000fa71, 0x807d817d,
+ 0xb8f1f814, 0xd7610002,
+ 0x0000fa71, 0x807d817d,
+ 0xb8f1f815, 0xd7610002,
+ 0x0000fa71, 0x807d817d,
+ 0xb8f1f812, 0xd7610002,
+ 0x0000fa71, 0x807d817d,
+ 0xb8f1f813, 0xd7610002,
+ 0x0000fa71, 0x807d817d,
+ 0xb8faf802, 0xd7610002,
+ 0x0000fa7a, 0x807d817d,
+ 0xbefa50c1, 0xbfc70000,
+ 0xd7610002, 0x0000fa7a,
+ 0x807d817d, 0xbefa4c88,
+ 0xbfc70000, 0xd7610002,
+ 0x0000fa7a, 0x807d817d,
+ 0xb8faf81a, 0xd7610002,
+ 0x0000fa7a, 0x807d817d,
+ 0xbefe00c1, 0xbeff0080,
+ 0x80767074, 0x82778075,
+ 0xee0a4076, 0x010c0000,
+ 0x00000000, 0xbefe00c1,
+ 0x7e040280, 0xbefa5081,
+ 0xbfc70000, 0xd7610002,
+ 0x0001007a, 0xbefa5082,
+ 0xbfc70000, 0xd7610002,
+ 0x0001027a, 0xbefa5083,
+ 0xbfc70000, 0xd7610002,
+ 0x0001047a, 0xbefa5084,
+ 0xbfc70000, 0xd7610002,
+ 0x0001067a, 0xbefa5085,
+ 0xbfc70000, 0xd7610002,
+ 0x0001087a, 0xbefa5086,
+ 0xbfc70000, 0xd7610002,
+ 0x00010a7a, 0xbefa5087,
+ 0xbfc70000, 0xd7610002,
+ 0x00010c7a, 0xbefa5088,
+ 0xbfc70000, 0xd7610002,
+ 0x00010e7a, 0xbefa5089,
+ 0xbfc70000, 0xd7610002,
+ 0x0001107a, 0xbefa508a,
+ 0xbfc70000, 0xd7610002,
+ 0x0001127a, 0xbefa508b,
+ 0xbfc70000, 0xd7610002,
+ 0x0001147a, 0xbefa508c,
+ 0xbfc70000, 0xd7610002,
+ 0x0001167a, 0xbefa508d,
+ 0xbfc70000, 0xd7610002,
+ 0x0001187a, 0xbefa508e,
+ 0xbfc70000, 0xd7610002,
+ 0x00011a7a, 0xbefa508f,
+ 0xbfc70000, 0xd7610002,
+ 0x00011c7a, 0xbefa5090,
+ 0xbfc70000, 0xd7610002,
+ 0x00011e7a, 0xee0a4076,
+ 0x010c0000, 0x00008000,
+ 0xb8f03b05, 0x80708170,
+ 0xbf0d9973, 0xbfa20002,
+ 0x84708970, 0xbfa00001,
+ 0x84708a70, 0xbef90080,
+ 0xbefd0080, 0xbf800000,
+ 0xbe804100, 0xbe824102,
+ 0xbe844104, 0xbe864106,
+ 0xbe884108, 0xbe8a410a,
+ 0xbe8c410c, 0xbe8e410e,
+ 0xd7610002, 0x0000f200,
+ 0x80798179, 0xd7610002,
+ 0x0000f201, 0x80798179,
+ 0xd7610002, 0x0000f202,
+ 0x80798179, 0xd7610002,
+ 0x0000f203, 0x80798179,
+ 0xd7610002, 0x0000f204,
+ 0x80798179, 0xd7610002,
+ 0x0000f205, 0x80798179,
+ 0xd7610002, 0x0000f206,
+ 0x80798179, 0xd7610002,
+ 0x0000f207, 0x80798179,
+ 0xd7610002, 0x0000f208,
+ 0x80798179, 0xd7610002,
+ 0x0000f209, 0x80798179,
+ 0xd7610002, 0x0000f20a,
+ 0x80798179, 0xd7610002,
+ 0x0000f20b, 0x80798179,
+ 0xd7610002, 0x0000f20c,
+ 0x80798179, 0xd7610002,
+ 0x0000f20d, 0x80798179,
+ 0xd7610002, 0x0000f20e,
+ 0x80798179, 0xd7610002,
+ 0x0000f20f, 0x80798179,
+ 0xbf06a079, 0xbfa10009,
+ 0x80767074, 0x82778075,
+ 0xee0a4076, 0x010c0000,
+ 0x00000000, 0x8070ff70,
+ 0x00000080, 0xbef90080,
+ 0x7e040280, 0x807d907d,
+ 0xbf0aff7d, 0x00000060,
+ 0xbfa2ffb9, 0xbe804100,
+ 0xbe824102, 0xbe844104,
+ 0xbe864106, 0xbe884108,
+ 0xbe8a410a, 0xd7610002,
+ 0x0000f200, 0x80798179,
+ 0xd7610002, 0x0000f201,
+ 0x80798179, 0xd7610002,
+ 0x0000f202, 0x80798179,
+ 0xd7610002, 0x0000f203,
+ 0x80798179, 0xd7610002,
+ 0x0000f204, 0x80798179,
+ 0xd7610002, 0x0000f205,
+ 0x80798179, 0xd7610002,
+ 0x0000f206, 0x80798179,
+ 0xd7610002, 0x0000f207,
+ 0x80798179, 0xd7610002,
+ 0x0000f208, 0x80798179,
+ 0xd7610002, 0x0000f209,
+ 0x80798179, 0xd7610002,
+ 0x0000f20a, 0x80798179,
+ 0xd7610002, 0x0000f20b,
+ 0x80798179, 0xbefe00ff,
+ 0x0000ffff, 0x80767074,
+ 0x82778075, 0xee0a4076,
+ 0x010c0000, 0x00000000,
+ 0xbefe00c1, 0x857d9973,
+ 0x8b7d817d, 0xbf06817d,
+ 0xbfa20002, 0xbeff0080,
+ 0xbfa00001, 0xbeff00c1,
+ 0xb8fb4306, 0x8b7bc17b,
+ 0xbfa10042, 0x8b7aff6d,
+ 0x80000000, 0xbfa1003f,
+ 0x847b8a7b, 0xb8f03b05,
+ 0x80708170, 0xbf0d9973,
+ 0xbfa20002, 0x84708970,
+ 0xbfa00001, 0x84708a70,
+ 0x8070ff70, 0x00000200,
+ 0x8070ff70, 0x00000200,
+ 0xd71f0000, 0x000100c1,
+ 0xd7200000, 0x000200c1,
+ 0x16000084, 0x857d9973,
+ 0x8b7d817d, 0xbf06817d,
+ 0xbefd0080, 0xbfa20015,
+ 0xbe8300ff, 0x00000080,
+ 0xbf800000, 0xbf800000,
+ 0xbf800000, 0xd8d80000,
+ 0x01000000, 0xbf8a0000,
+ 0x80767074, 0x82778075,
+ 0xee0a4076, 0x008c0000,
+ 0x00000000, 0x807d037d,
+ 0x80700370, 0xd5250000,
+ 0x0001ff00, 0x00000080,
+ 0xbf0a7b7d, 0xbfa2fff1,
+ 0xbfa00014, 0xbe8300ff,
+ 0x00000100, 0xbf800000,
+ 0xbf800000, 0xbf800000,
+ 0xd8d80000, 0x01000000,
+ 0xbf8a0000, 0x80767074,
+ 0x82778075, 0xee0a4076,
+ 0x008c0000, 0x00000000,
+ 0x807d037d, 0x80700370,
+ 0xd5250000, 0x0001ff00,
+ 0x00000100, 0xbf0a7b7d,
+ 0xbfa2fff1, 0xbefe00c1,
+ 0x857d9973, 0x8b7d817d,
+ 0xbf06817d, 0xbfa20004,
+ 0xbef000ff, 0x00000200,
+ 0xbeff0080, 0xbfa00003,
+ 0xbef000ff, 0x00000400,
+ 0xbeff00c1, 0xb8fb3b05,
+ 0x807b817b, 0x847b827b,
+ 0x857d9973, 0x8b7d817d,
+ 0xbf06817d, 0xbfa2001b,
+ 0xbefd0084, 0xbf0a7b7d,
+ 0xbfa10032, 0x7e008700,
+ 0x7e028701, 0x7e048702,
+ 0x7e068703, 0x80767074,
+ 0x82778075, 0xee0a4076,
+ 0x000c0000, 0x00000000,
+ 0xee0a4076, 0x008c0000,
+ 0x00008000, 0xee0a4076,
+ 0x010c0000, 0x00010000,
+ 0xee0a4076, 0x018c0000,
+ 0x00018000, 0x807d847d,
+ 0x8070ff70, 0x00000200,
+ 0xbf0a7b7d, 0xbfa2ffe9,
+ 0xbfa0001a, 0xbefd0084,
+ 0xbf0a7b7d, 0xbfa10017,
+ 0x7e008700, 0x7e028701,
+ 0x7e048702, 0x7e068703,
+ 0x80767074, 0x82778075,
+ 0xee0a4076, 0x000c0000,
+ 0x00000000, 0xee0a4076,
+ 0x008c0000, 0x00010000,
+ 0xee0a4076, 0x010c0000,
+ 0x00020000, 0xee0a4076,
+ 0x018c0000, 0x00030000,
+ 0x807d847d, 0x8070ff70,
+ 0x00000400, 0xbf0a7b7d,
+ 0xbfa2ffe9, 0xbfa00184,
+ 0xbef4007e, 0x8b75ff7f,
+ 0x01ffffff, 0xbef1007f,
+ 0xb8f20742, 0x84729972,
+ 0x8b6eff7f, 0x04000000,
+ 0xbfa10044, 0xbefe00c1,
+ 0x857d9972, 0x8b7d817d,
+ 0xbf06817d, 0xbfa20002,
+ 0xbeff0080, 0xbfa00001,
+ 0xbeff00c1, 0xb8ef4306,
+ 0x8b6fc16f, 0xbfa10039,
+ 0x846f8a6f, 0xb8f83b05,
+ 0x80788178, 0xbf0d9972,
+ 0xbfa20002, 0x84788978,
+ 0xbfa00001, 0x84788a78,
+ 0x8078ff78, 0x00000200,
+ 0x8078ff78, 0x00000200,
+ 0x857d9972, 0x8b7d817d,
+ 0xbf06817d, 0xbefd0080,
+ 0xd71f0001, 0x000100c1,
+ 0xd7200001, 0x000202c1,
+ 0x30020282, 0xbfa20012,
+ 0x80767874, 0x82778075,
+ 0xee0a0076, 0x000c0000,
+ 0x00000000, 0xbf8a0000,
+ 0xd8340000, 0x00000001,
+ 0xd5250001, 0x0001ff01,
+ 0x00000080, 0x807dff7d,
+ 0x00000080, 0x8078ff78,
+ 0x00000080, 0xbf0a6f7d,
+ 0xbfa2ffef, 0xbfa00011,
+ 0x80767874, 0x82778075,
+ 0xee0a0076, 0x000c0000,
+ 0x00000000, 0xbf8a0000,
+ 0xd8340000, 0x00000001,
+ 0xd5250001, 0x0001ff01,
+ 0x00000100, 0x807dff7d,
+ 0x00000100, 0x8078ff78,
+ 0x00000100, 0xbf0a6f7d,
+ 0xbfa2ffef, 0xbef80080,
+ 0xbefe00c1, 0x857d9972,
+ 0x8b7d817d, 0xbf06817d,
+ 0xbfa20002, 0xbeff0080,
+ 0xbfa00001, 0xbeff00c1,
+ 0xb8ef3b05, 0x806f816f,
+ 0x846f826f, 0x857d9972,
+ 0x8b7d817d, 0xbf06817d,
+ 0xbfa2002c, 0xbeee0078,
+ 0x8078ff78, 0x00000200,
+ 0xbefd0084, 0x80767874,
+ 0x82778075, 0xee0a0076,
+ 0x000c0000, 0x00000000,
+ 0xee0a0076, 0x000c0001,
+ 0x00008000, 0xee0a0076,
+ 0x000c0002, 0x00010000,
+ 0xee0a0076, 0x000c0003,
+ 0x00018000, 0xbf8a0000,
+ 0x7e008500, 0x7e028501,
+ 0x7e048502, 0x7e068503,
+ 0x807d847d, 0x8078ff78,
+ 0x00000200, 0xbf0a6f7d,
+ 0xbfa2ffe8, 0x80766e74,
+ 0x82778075, 0xee0a0076,
+ 0x000c0000, 0x00000000,
+ 0xee0a0076, 0x000c0001,
+ 0x00008000, 0xee0a0076,
+ 0x000c0002, 0x00010000,
+ 0xee0a0076, 0x000c0003,
+ 0x00018000, 0xbf8a0000,
+ 0xbfa0002d, 0xbeee0078,
+ 0x8078ff78, 0x00000400,
+ 0xbefd0084, 0xbf0a6f7d,
+ 0xbfa10018, 0x80767874,
+ 0x82778075, 0xee0a0076,
+ 0x000c0000, 0x00000000,
+ 0xee0a0076, 0x000c0001,
+ 0x00010000, 0xee0a0076,
+ 0x000c0002, 0x00020000,
+ 0xee0a0076, 0x000c0003,
+ 0x00030000, 0xbf8a0000,
+ 0x7e008500, 0x7e028501,
+ 0x7e048502, 0x7e068503,
+ 0x807d847d, 0x8078ff78,
+ 0x00000400, 0xbf0a6f7d,
+ 0xbfa2ffe8, 0x80766e74,
+ 0x82778075, 0xee0a0076,
+ 0x000c0000, 0x00000000,
+ 0xee0a0076, 0x000c0001,
+ 0x00010000, 0xee0a0076,
+ 0x000c0002, 0x00020000,
+ 0xee0a0076, 0x000c0003,
+ 0x00030000, 0xbf8a0000,
+ 0xb8f83b05, 0x80788178,
+ 0xbf0d9972, 0xbfa20002,
+ 0x84788978, 0xbfa00001,
+ 0x84788a78, 0x8078ff78,
+ 0x00000200, 0x80f8ff78,
+ 0x00000060, 0x80767874,
+ 0x82778075, 0xbefd00ff,
+ 0x0000006c, 0xf460403b,
+ 0xf8000000, 0xbf8a0000,
+ 0x80fd847d, 0xbf800000,
+ 0xbe804300, 0xbe824302,
+ 0x80f6a076, 0x82f78077,
+ 0xf460603b, 0xf8000000,
+ 0xbf8a0000, 0x80fd887d,
+ 0xbf800000, 0xbe804300,
+ 0xbe824302, 0xbe844304,
+ 0xbe864306, 0x80f6c076,
+ 0x82f78077, 0xf460803b,
+ 0xf8000000, 0xbf8a0000,
+ 0x80fd907d, 0xbf800000,
+ 0xbe804300, 0xbe824302,
+ 0xbe844304, 0xbe864306,
+ 0xbe884308, 0xbe8a430a,
+ 0xbe8c430c, 0xbe8e430e,
+ 0xbf06807d, 0xbfa1ffef,
+ 0xb980f801, 0x00000000,
+ 0xb8f83b05, 0x80788178,
+ 0xbf0d9972, 0xbfa20002,
+ 0x84788978, 0xbfa00001,
+ 0x84788a78, 0x8078ff78,
+ 0x00000200, 0x80767874,
+ 0x82778075, 0xbeff0071,
+ 0xf4601bfb, 0xf8000000,
+ 0xf4601b3b, 0xf8000004,
+ 0xf4601b7b, 0xf8000008,
+ 0xf4601c3b, 0xf800000c,
+ 0xf4601c7b, 0xf8000010,
+ 0xf4601ebb, 0xf8000014,
+ 0xf4601efb, 0xf8000018,
+ 0xf4601e7b, 0xf800001c,
+ 0xf4601cfb, 0xf8000020,
+ 0xf4601bbb, 0xf8000024,
+ 0xbf8a0000, 0xb96ef814,
+ 0xf4601bbb, 0xf8000028,
+ 0xbf8a0000, 0xb96ef815,
+ 0xf4601bbb, 0xf800002c,
+ 0xbf8a0000, 0xb96ef812,
+ 0xf4601bbb, 0xf8000030,
+ 0xbf8a0000, 0xb96ef813,
+ 0x8b6eff7f, 0x04000000,
+ 0xbfa10022, 0xf4601bbb,
+ 0xf8000038, 0xbf8a0000,
+ 0xbf0d806e, 0xbfa1001d,
+ 0x856e906e, 0x8b6e6e6e,
+ 0xbfa10003, 0xbe804ec1,
+ 0x816ec16e, 0xbfa0fffb,
+ 0xbef800ff, 0x00000080,
+ 0xbefd0081, 0xf4601bbb,
+ 0xf0000000, 0xbfc70000,
+ 0x80788478, 0x937eff6e,
+ 0x00070004, 0x847e907e,
+ 0x8c7d7e7d, 0xbe80517d,
+ 0x917dff7d, 0x007f0000,
+ 0x856e906e, 0x8b6e6e6e,
+ 0xbfa10003, 0xbe804e7d,
+ 0x816ec16e, 0xbfa0fffb,
+ 0x807d817d, 0xbf08907d,
+ 0xbfa1ffec, 0xf4601bbb,
+ 0xf800003c, 0xbfc70000,
+ 0xbf0d806e, 0xbfa1000c,
+ 0xbf0d9a7f, 0xbfa10002,
+ 0xbf068180, 0xbe804fc4,
+ 0xbf94fffc, 0xbfa10006,
+ 0x856e906e, 0x8b6e6e6e,
+ 0xbfa10003, 0xbe804ec3,
+ 0x816ec16e, 0xbfa0fffb,
+ 0xf4601bbb, 0xf8000040,
+ 0xbfc70000, 0xb96ef81a,
+ 0xbefd006f, 0xbefe0070,
+ 0xbeff0071, 0xb979f822,
+ 0xb97b2011, 0x857b867b,
+ 0xb97b0191, 0x857b827b,
+ 0xb97bba11, 0xb973f801,
+ 0xb8ee3b05, 0x806e816e,
+ 0xbf0d9972, 0xbfa20002,
+ 0x846e896e, 0xbfa00001,
+ 0x846e8a6e, 0x806eff6e,
+ 0x000001c0, 0x806e746e,
+ 0x826f8075, 0xf4605c37,
+ 0xf8000010, 0xf4605d37,
+ 0xf8000020, 0xf4601e77,
+ 0xf8000034, 0xbf8a0000,
+ 0x856e9677, 0xb96e04a1,
+ 0x856e9577, 0xb96e0421,
+ 0x856e8e77, 0xb96e3021,
+ 0x8b6dff6d, 0x01ffffff,
+ 0x8bfe7e7e, 0x8bea6a6a,
+ 0xb97af804, 0xb8eef802,
+ 0xbf0c8b6e, 0xbfa20003,
+ 0xbe804fc2, 0xbf94fffe,
+ 0xbfa10001, 0xbe804ec4,
+ 0xbf94fffc, 0x857a897a,
+ 0xb97a0244, 0xbe804a6c,
+ 0xb8eef802, 0xbf0c8b6e,
+ 0xbfa20003, 0xbe804fc2,
+ 0xbf94fffe, 0xbfa10001,
+ 0xbe804ec4, 0xbf94fffc,
+ 0xbfb10000, 0xbf9f0000,
+ 0xbf9f0000, 0xbf9f0000,
+ 0xbf9f0000, 0xbf9f0000,
+};
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm
index 7b9d36e5fa43..d38ff404277b 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm
@@ -28,10 +28,30 @@
*/
#define CHIP_GFX12 37
+#define CHIP_GC_12_0_3 38
+
+#define HAVE_XNACK (ASIC_FAMILY == CHIP_GC_12_0_3)
+#define HAVE_57BIT_ADDRESS (ASIC_FAMILY == CHIP_GC_12_0_3)
+#define HAVE_BANKED_VGPRS (ASIC_FAMILY == CHIP_GC_12_0_3)
+#define NUM_NAMED_BARRIERS (ASIC_FAMILY == CHIP_GC_12_0_3 ? 0x10 : 0)
+#define HAVE_CLUSTER_BARRIER (ASIC_FAMILY == CHIP_GC_12_0_3)
+#define CLUSTER_BARRIER_SERIALIZE_WORKAROUND (ASIC_FAMILY == CHIP_GC_12_0_3)
+#define RELAXED_SCHEDULING_IN_TRAP (ASIC_FAMILY == CHIP_GFX12)
+#define HAVE_INSTRUCTION_FIXUP (ASIC_FAMILY == CHIP_GC_12_0_3)
#define SINGLE_STEP_MISSED_WORKAROUND 1 //workaround for lost TRAP_AFTER_INST exception when SAVECTX raised
+#define HAVE_VALU_SGPR_HAZARD (ASIC_FAMILY == CHIP_GFX12)
+#define WAVE32_ONLY (ASIC_FAMILY == CHIP_GC_12_0_3)
+#define SAVE_TTMPS_IN_SGPR_BLOCK (ASIC_FAMILY >= CHIP_GC_12_0_3)
+
+#if HAVE_XNACK && !WAVE32_ONLY
+# error
+#endif
-var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK = 0x4
+#define ADDRESS_HI32_NUM_BITS ((HAVE_57BIT_ADDRESS ? 57 : 48) - 32)
+#define ADDRESS_HI32_MASK ((1 << ADDRESS_HI32_NUM_BITS) - 1)
+
+var SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK = 0x4 | (NUM_NAMED_BARRIERS ? 0x8 : 0) | (HAVE_CLUSTER_BARRIER ? 0x10000 : 0)
var SQ_WAVE_STATE_PRIV_SCC_SHIFT = 9
var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK = 0xC00
var SQ_WAVE_STATE_PRIV_HALT_MASK = 0x4000
@@ -40,6 +60,7 @@ var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT = 15
var SQ_WAVE_STATUS_WAVE64_SHIFT = 29
var SQ_WAVE_STATUS_WAVE64_SIZE = 1
var SQ_WAVE_STATUS_NO_VGPRS_SHIFT = 24
+var SQ_WAVE_STATUS_IN_WG_SHIFT = 11
var SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK = SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK
var S_SAVE_PC_HI_TRAP_ID_MASK = 0xF0000000
@@ -47,11 +68,15 @@ var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 12
-var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24
-var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4
+
+#if ASIC_FAMILY < CHIP_GC_12_0_3
var SQ_WAVE_LDS_ALLOC_GRANULARITY = 9
+#else
+var SQ_WAVE_LDS_ALLOC_GRANULARITY = 10
+#endif
var SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK = 0xF
+var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_SHIFT = 4
var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK = 0x10
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT = 5
var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK = 0x20
@@ -77,11 +102,46 @@ var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT
var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE = 32 - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT
+
+var SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT = 0
+var SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE = 2
+
var BARRIER_STATE_SIGNAL_OFFSET = 16
+var BARRIER_STATE_SIGNAL_SIZE = 7
+var BARRIER_STATE_MEMBER_OFFSET = 4
+var BARRIER_STATE_MEMBER_SIZE = 7
var BARRIER_STATE_VALID_OFFSET = 0
+#if RELAXED_SCHEDULING_IN_TRAP
+var TTMP11_SCHED_MODE_SHIFT = 26
+var TTMP11_SCHED_MODE_SIZE = 2
+var TTMP11_SCHED_MODE_MASK = 0xC000000
+#endif
+
+var NAMED_BARRIERS_SR_OFFSET_FROM_HWREG = 0x80
+var S_BARRIER_INIT_MEMBERCNT_MASK = 0x7F0000
+var S_BARRIER_INIT_MEMBERCNT_SHIFT = 0x10
+
+var SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT = 18
+var SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE = 1
+var SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT = 16
+var SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE = 1
+var SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT = 0
+var SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE = 7
+
+#if HAVE_BANKED_VGPRS
+var SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT = 12
+var SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE = 6
+#endif
+
var TTMP11_DEBUG_TRAP_ENABLED_SHIFT = 23
var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000
+var TTMP11_FIRST_REPLAY_SHIFT = 22
+var TTMP11_FIRST_REPLAY_MASK = 0x400000
+var TTMP11_REPLAY_W64H_SHIFT = 21
+var TTMP11_REPLAY_W64H_MASK = 0x200000
+var TTMP11_FXPTR_SHIFT = 14
+var TTMP11_FXPTR_MASK = 0x1FC000
// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
@@ -93,6 +153,11 @@ var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_SAVE_PC_HI_FIRST_WAVE_MASK = 0x80000000
var S_SAVE_PC_HI_FIRST_WAVE_SHIFT = 31
+#if HAVE_BANKED_VGPRS
+var S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT = 25
+var S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SIZE = 6
+#endif
+
var s_sgpr_save_num = 108
var s_save_spi_init_lo = exec_lo
@@ -103,12 +168,12 @@ var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
var s_save_state_priv = ttmp12
var s_save_excp_flag_priv = ttmp15
-var s_save_xnack_mask = s_save_excp_flag_priv
+var s_save_xnack_mask = s_save_exec_hi
var s_wave_size = ttmp7
-var s_save_buf_rsrc0 = ttmp8
-var s_save_buf_rsrc1 = ttmp9
-var s_save_buf_rsrc2 = ttmp10
-var s_save_buf_rsrc3 = ttmp11
+var s_save_base_addr_lo = ttmp8
+var s_save_base_addr_hi = ttmp9
+var s_save_addr_lo = ttmp10
+var s_save_addr_hi = ttmp11
var s_save_mem_offset = ttmp4
var s_save_alloc_size = s_save_excp_flag_priv
var s_save_tmp = ttmp14
@@ -116,9 +181,6 @@ var s_save_m0 = ttmp5
var s_save_ttmps_lo = s_save_tmp
var s_save_ttmps_hi = s_save_excp_flag_priv
-var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
-var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
-
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_WAVE_SIZE = 25
@@ -139,15 +201,21 @@ var s_restore_exec_hi = ttmp5
var s_restore_state_priv = ttmp14
var s_restore_excp_flag_priv = ttmp15
var s_restore_xnack_mask = ttmp13
-var s_restore_buf_rsrc0 = ttmp8
-var s_restore_buf_rsrc1 = ttmp9
-var s_restore_buf_rsrc2 = ttmp10
-var s_restore_buf_rsrc3 = ttmp11
+var s_restore_base_addr_lo = ttmp8
+var s_restore_base_addr_hi = ttmp9
+var s_restore_addr_lo = ttmp10
+var s_restore_addr_hi = ttmp11
var s_restore_size = ttmp6
var s_restore_ttmps_lo = s_restore_tmp
var s_restore_ttmps_hi = s_restore_alloc_size
var s_restore_spi_init_hi_save = s_restore_exec_hi
+#if SAVE_TTMPS_IN_SGPR_BLOCK
+var TTMP_SR_OFFSET_FROM_HWREG = -0x40
+#else
+var TTMP_SR_OFFSET_FROM_HWREG = 0x40
+#endif
+
shader main
asic(DEFAULT)
type(CS)
@@ -159,8 +227,23 @@ L_JUMP_TO_RESTORE:
s_branch L_RESTORE
L_SKIP_RESTORE:
+#if RELAXED_SCHEDULING_IN_TRAP
+ // Assume most relaxed scheduling mode is set. Save and revert to normal mode.
+ s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_SCHED_MODE)
+ s_wait_alu 0
+ s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, \
+ SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT, SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE), 0
+#endif
+
s_getreg_b32 s_save_state_priv, hwreg(HW_REG_WAVE_STATE_PRIV) //save STATUS since we will change SCC
+#if RELAXED_SCHEDULING_IN_TRAP
+ // Save SCHED_MODE[1:0] into ttmp11[27:26].
+ s_andn2_b32 ttmp11, ttmp11, TTMP11_SCHED_MODE_MASK
+ s_lshl_b32 ttmp2, ttmp2, TTMP11_SCHED_MODE_SHIFT
+ s_or_b32 ttmp11, ttmp11, ttmp2
+#endif
+
// Clear SPI_PRIO: do not save with elevated priority.
// Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK
@@ -226,6 +309,10 @@ L_CHECK_TRAP_ID:
s_cbranch_scc1 L_SAVE
L_FETCH_2ND_TRAP:
+#if HAVE_XNACK
+ save_and_clear_xnack_state_priv(ttmp14)
+#endif
+
// Read second-level TBA/TMA from first-level TMA and jump if available.
// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
// ttmp12 holds SQ_WAVE_STATUS
@@ -233,10 +320,17 @@ L_FETCH_2ND_TRAP:
s_wait_idle
s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
- s_bitcmp1_b32 ttmp15, 0xF
+ s_bitcmp1_b32 ttmp15, (ADDRESS_HI32_NUM_BITS - 1)
s_cbranch_scc0 L_NO_SIGN_EXTEND_TMA
- s_or_b32 ttmp15, ttmp15, 0xFFFF0000
+ s_or_b32 ttmp15, ttmp15, ~ADDRESS_HI32_MASK
L_NO_SIGN_EXTEND_TMA:
+#if RELAXED_SCHEDULING_IN_TRAP
+ // Move SCHED_MODE[1:0] from ttmp11 to unused bits in ttmp1[27:26] (return PC_HI).
+ // The second-level trap will restore from ttmp1 for backwards compatibility.
+ s_and_b32 ttmp2, ttmp11, TTMP11_SCHED_MODE_MASK
+ s_andn2_b32 ttmp1, ttmp1, TTMP11_SCHED_MODE_MASK
+ s_or_b32 ttmp1, ttmp1, ttmp2
+#endif
s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 scope:SCOPE_SYS // debug trap enabled flag
s_wait_idle
@@ -277,15 +371,30 @@ L_TRAP_CASE:
s_addc_u32 ttmp1, ttmp1, 0x0
L_EXIT_TRAP:
- s_and_b32 ttmp1, ttmp1, 0xFFFF
+ s_and_b32 ttmp1, ttmp1, ADDRESS_HI32_MASK
+
+#if HAVE_INSTRUCTION_FIXUP
+ s_getreg_b32 s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
+ fixup_instruction()
+#endif
+
+#if HAVE_XNACK
+ restore_xnack_state_priv(s_save_tmp)
+#endif
// Restore SQ_WAVE_STATUS.
s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
- // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it.
+ // STATE_PRIV.*BARRIER_COMPLETE may have changed since we read it.
// Only restore fields which the trap handler changes.
s_lshr_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT
+
+#if RELAXED_SCHEDULING_IN_TRAP
+ // Assume relaxed scheduling mode after this point.
+ restore_sched_mode(ttmp2)
+#endif
+
s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \
SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_state_priv
@@ -299,11 +408,18 @@ L_SAVE:
s_cbranch_scc0 L_HAVE_VGPRS
s_endpgm
L_HAVE_VGPRS:
-
- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+ s_and_b32 s_save_pc_hi, s_save_pc_hi, ADDRESS_HI32_MASK
s_mov_b32 s_save_tmp, 0
s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT, 1), s_save_tmp //clear saveCtx bit
+#if HAVE_XNACK
+ save_and_clear_xnack_state_priv(s_save_tmp)
+#endif
+
+#if HAVE_INSTRUCTION_FIXUP
+ fixup_instruction()
+#endif
+
/* inform SPI the readiness and wait for SPI's go signal */
s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
s_mov_b32 s_save_exec_hi, exec_hi
@@ -317,11 +433,25 @@ L_HAVE_VGPRS:
s_lshl_b32 s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT)
s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+#if HAVE_XNACK
+ s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_WAVE_XNACK_MASK)
+ s_setreg_imm32_b32 hwreg(HW_REG_WAVE_XNACK_MASK), 0
+#endif
+
+#if HAVE_BANKED_VGPRS
+ // Save and clear shader's DST/SRC0/SRC1 VGPR bank selection so we can use v[0-255].
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_MODE, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE)
+ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT
+ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+ s_mov_b32 s_save_tmp, 0
+ s_setreg_b32 hwreg(HW_REG_WAVE_MODE, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SIZE), s_save_tmp
+#endif
+
// Trap temporaries must be saved via VGPR but all VGPRs are in use.
// There is no ttmp space to hold the resource constant for VGPR save.
// Save v0 by itself since it requires only two SGPRs.
s_mov_b32 s_save_ttmps_lo, exec_lo
- s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF
+ s_and_b32 s_save_ttmps_hi, exec_hi, ADDRESS_HI32_MASK
s_mov_b32 exec_lo, 0xFFFFFFFF
s_mov_b32 exec_hi, 0xFFFFFFFF
global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS
@@ -330,13 +460,13 @@ L_HAVE_VGPRS:
s_mov_b32 exec_hi, s_save_ttmps_hi
// Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
- // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
+ // ttmp SR memory offset:
+ // - gfx12: size(VGPR)+size(SGPR)+0x40
+ // - gfx12.5: size(VGPR)+size(SGPR)-0x40
get_wave_size2(s_save_ttmps_hi)
get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi)
- get_svgpr_size_bytes(s_save_ttmps_hi)
- s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
- s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF
- s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes()
+ s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, ADDRESS_HI32_MASK
+ s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, (get_sgpr_size_bytes() + TTMP_SR_OFFSET_FROM_HWREG)
s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
s_addc_u32 s_save_ttmps_hi, s_save_ttmps_hi, 0x0
@@ -354,23 +484,16 @@ L_HAVE_VGPRS:
s_mov_b32 exec_lo, 0x3FFF
s_mov_b32 exec_hi, 0x0
- global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] offset:0x40 scope:SCOPE_SYS
+ global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] scope:SCOPE_SYS
v_readlane_b32 ttmp14, v0, 0xE
v_readlane_b32 ttmp15, v0, 0xF
s_mov_b32 exec_lo, ttmp14
s_mov_b32 exec_hi, ttmp15
- /* setup Resource Contants */
- s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
- s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
- s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
- s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
-
+ s_mov_b32 s_save_base_addr_lo, s_save_spi_init_lo
+ s_and_b32 s_save_base_addr_hi, s_save_spi_init_hi, ADDRESS_HI32_MASK
s_mov_b32 s_save_m0, m0
- /* global mem offset */
- s_mov_b32 s_save_mem_offset, 0x0
get_wave_size2(s_wave_size)
/* save first 4 VGPRs, needed for SGPR save */
@@ -385,65 +508,72 @@ L_ENABLE_SAVE_4VGPR_EXEC_HI:
s_mov_b32 exec_hi, 0xFFFFFFFF
s_branch L_SAVE_4VGPR_WAVE64
L_SAVE_4VGPR_WAVE32:
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
// VGPR Allocated in 4-GPR granularity
-
- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2
- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3
+ global_store_addtid_b32 v1, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128
+ global_store_addtid_b32 v2, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128*2
+ global_store_addtid_b32 v3, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:128*3
s_branch L_SAVE_HWREG
L_SAVE_4VGPR_WAVE64:
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
// VGPR Allocated in 4-GPR granularity
-
- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2
- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3
+ global_store_addtid_b32 v1, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256
+ global_store_addtid_b32 v2, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256*2
+ global_store_addtid_b32 v3, [s_save_base_addr_lo, s_save_base_addr_hi] scope:SCOPE_SYS offset:256*3
/* save HW registers */
L_SAVE_HWREG:
- // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
+ // HWREG SR memory offset : size(VGPR)+size(SGPR)
get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
- get_svgpr_size_bytes(s_save_tmp)
- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
v_mov_b32 v0, 0x0 //Offset[31:0] from buffer resource
v_mov_b32 v1, 0x0 //Offset[63:32] from buffer resource
v_mov_b32 v2, 0x0 //Set of SGPRs for TCP store
s_mov_b32 m0, 0x0 //Next lane of v2 to write to
+ write_hwreg_to_v2(s_save_m0)
+
// Ensure no further changes to barrier or LDS state.
- // STATE_PRIV.BARRIER_COMPLETE may change up to this point.
- s_barrier_signal -2
- s_barrier_wait -2
+ // STATE_PRIV.*BARRIER_COMPLETE may change up to this point.
+ wait_trap_barriers(s_save_tmp, s_save_m0, 1)
- // Re-read final state of BARRIER_COMPLETE field for save.
+ // Re-read final state of *BARRIER_COMPLETE fields for save.
s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATE_PRIV)
- s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
- s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
+ s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK
+ s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK
s_or_b32 s_save_state_priv, s_save_state_priv, s_save_tmp
- write_hwreg_to_v2(s_save_m0)
write_hwreg_to_v2(s_save_pc_lo)
- s_andn2_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
+ s_and_b32 s_save_tmp, s_save_pc_hi, ADDRESS_HI32_MASK
write_hwreg_to_v2(s_save_tmp)
write_hwreg_to_v2(s_save_exec_lo)
+#if WAVE32_ONLY
+ s_mov_b32 s_save_tmp, 0
+ write_hwreg_to_v2(s_save_tmp)
+#else
write_hwreg_to_v2(s_save_exec_hi)
+#endif
write_hwreg_to_v2(s_save_state_priv)
s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV)
write_hwreg_to_v2(s_save_tmp)
+#if HAVE_XNACK
write_hwreg_to_v2(s_save_xnack_mask)
+#else
+ s_mov_b32 s_save_tmp, 0
+ write_hwreg_to_v2(s_save_tmp)
+#endif
s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_MODE)
+
+#if HAVE_BANKED_VGPRS
+ s_bfe_u32 s_save_tmp, s_save_pc_hi, (S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SHIFT | (S_SAVE_PC_HI_DST_SRC0_SRC1_VGPR_MSB_SIZE << 0x10))
+ s_lshl_b32 s_save_tmp, s_save_tmp, SQ_WAVE_MODE_DST_SRC0_SRC1_VGPR_MSB_SHIFT
+ s_or_b32 s_save_m0, s_save_m0, s_save_tmp
+#endif
+
write_hwreg_to_v2(s_save_m0)
s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO)
@@ -465,22 +595,49 @@ L_SAVE_HWREG:
s_wait_kmcnt (0)
write_hwreg_to_v2(s_save_tmp)
+#if HAVE_CLUSTER_BARRIER
+ s_sendmsg_rtn_b32 s_save_tmp, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE)
+ s_wait_kmcnt 0
+ write_hwreg_to_v2(s_save_tmp)
+#endif
+
+#if ASIC_FAMILY >= CHIP_GC_12_0_3
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_SCHED_MODE)
+ write_hwreg_to_v2(s_save_tmp)
+#endif
+
+#if ! SAVE_TTMPS_IN_SGPR_BLOCK
// Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
s_mov_b32 exec_lo, 0xFFFF
+#else
+ // All 128 bytes are available for HWREGs.
+ s_mov_b32 exec_lo, 0xFFFFFFFF
+#endif
s_mov_b32 exec_hi, 0x0
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
// Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
s_mov_b32 exec_lo, 0xFFFFFFFF
+#if NUM_NAMED_BARRIERS
+ v_mov_b32 v2, 0
+
+ for var bar_idx = 0; bar_idx < NUM_NAMED_BARRIERS; bar_idx ++
+ s_get_barrier_state s_save_tmp, (bar_idx + 1)
+ s_wait_kmcnt 0
+ v_writelane_b32 v2, s_save_tmp, bar_idx
+ end
+
+ global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:NAMED_BARRIERS_SR_OFFSET_FROM_HWREG
+#endif
+
/* save SGPRs */
// Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
- // SGPR SR memory offset : size(VGPR)+size(SVGPR)
+ // SGPR SR memory offset : size(VGPR)
get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
- get_svgpr_size_bytes(s_save_tmp)
- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
s_mov_b32 ttmp13, 0x0 //next VGPR lane to copy SGPR into
@@ -502,7 +659,9 @@ L_SAVE_SGPR_LOOP:
s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled?
s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80
s_mov_b32 ttmp13, 0x0
v_mov_b32 v2, 0x0
@@ -521,7 +680,14 @@ L_SAVE_SGPR_SKIP_TCP_STORE:
s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
write_12sgpr_to_v2(s0)
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+#if SAVE_TTMPS_IN_SGPR_BLOCK
+ // Last 16 dwords of the SGPR block already contain the TTMPS. Make
+ // sure to not override them.
+ s_mov_b32 exec_lo, 0xFFFF
+#endif
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
/* save LDS */
@@ -547,18 +713,13 @@ L_SAVE_LDS_NORMAL:
// first wave do LDS save;
s_lshl_b32 s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
- s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
- // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
+ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
//
get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
- get_svgpr_size_bytes(s_save_tmp)
- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
//load 0~63*4(byte address) to vgpr v0
v_mbcnt_lo_u32_b32 v0, -1, 0
v_mbcnt_hi_u32_b32 v0, -1, v0
@@ -578,7 +739,9 @@ L_SAVE_LDS_W32:
L_SAVE_LDS_LOOP_W32:
ds_read_b32 v1, v0
s_wait_idle
- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
s_add_u32 m0, m0, s3 //every buffer_store_lds does 128 bytes
s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
@@ -596,7 +759,9 @@ L_SAVE_LDS_W64:
L_SAVE_LDS_LOOP_W64:
ds_read_b32 v1, v0
s_wait_idle
- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
@@ -629,8 +794,6 @@ L_SAVE_VGPR_NORMAL:
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_SAVE_VGPR_WAVE64
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
// VGPR Allocated in 4-GPR granularity
// VGPR store using dw burst
@@ -644,10 +807,12 @@ L_SAVE_VGPR_W32_LOOP:
v_movrels_b32 v2, v2 //v2 = v[2+m0]
v_movrels_b32 v3, v3 //v3 = v[3+m0]
- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*2
- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:128*3
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v0, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
+ global_store_addtid_b32 v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128
+ global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128*2
+ global_store_addtid_b32 v3, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:128*3
s_add_u32 m0, m0, 4 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes
@@ -657,12 +822,10 @@ L_SAVE_VGPR_W32_LOOP:
s_branch L_SAVE_VGPR_END
L_SAVE_VGPR_WAVE64:
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
// VGPR store using dw burst
s_mov_b32 m0, 0x4 //VGPR initial index value =4
s_cmp_lt_u32 m0, s_save_alloc_size
- s_cbranch_scc0 L_SAVE_SHARED_VGPR
+ s_cbranch_scc0 L_SAVE_VGPR_END
L_SAVE_VGPR_W64_LOOP:
v_movrels_b32 v0, v0 //v0 = v[0+m0]
@@ -670,45 +833,24 @@ L_SAVE_VGPR_W64_LOOP:
v_movrels_b32 v2, v2 //v2 = v[2+m0]
v_movrels_b32 v3, v3 //v3 = v[3+m0]
- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256
- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*2
- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS offset:256*3
+ s_add_u32 s_save_addr_lo, s_save_base_addr_lo, s_save_mem_offset
+ s_addc_u32 s_save_addr_hi, s_save_base_addr_hi, 0x0
+ global_store_addtid_b32 v0, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS
+ global_store_addtid_b32 v1, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256
+ global_store_addtid_b32 v2, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256*2
+ global_store_addtid_b32 v3, [s_save_addr_lo, s_save_addr_hi] scope:SCOPE_SYS offset:256*3
s_add_u32 m0, m0, 4 //next vgpr index
s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP //VGPR save is complete?
-L_SAVE_SHARED_VGPR:
- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
- s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
- s_cbranch_scc0 L_SAVE_VGPR_END //no shared_vgpr used? jump to L_SAVE_LDS
- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
- //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
- //save shared_vgpr will start from the index of m0
- s_add_u32 s_save_alloc_size, s_save_alloc_size, m0
- s_mov_b32 exec_lo, 0xFFFFFFFF
- s_mov_b32 exec_hi, 0x00000000
-
-L_SAVE_SHARED_VGPR_WAVE64_LOOP:
- v_movrels_b32 v0, v0 //v0 = v[0+m0]
- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS
- s_add_u32 m0, m0, 1 //next vgpr index
- s_add_u32 s_save_mem_offset, s_save_mem_offset, 128
- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
- s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete?
-
L_SAVE_VGPR_END:
s_branch L_END_PGM
L_RESTORE:
- /* Setup Resource Contants */
- s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
- s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
- s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
- s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
+ s_mov_b32 s_restore_base_addr_lo, s_restore_spi_init_lo
+ s_and_b32 s_restore_base_addr_hi, s_restore_spi_init_hi, ADDRESS_HI32_MASK
// Save s_restore_spi_init_hi for later use.
s_mov_b32 s_restore_spi_init_hi_save, s_restore_spi_init_hi
@@ -735,28 +877,31 @@ L_RESTORE_LDS_NORMAL:
s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
- s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
- // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
+ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
//
get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
- get_svgpr_size_bytes(s_restore_tmp)
- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()
- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
s_and_b32 m0, m0, 1
s_cmp_eq_u32 m0, 1
s_mov_b32 m0, 0x0
+
+ v_mbcnt_lo_u32_b32 v1, -1, 0
+ v_mbcnt_hi_u32_b32 v1, -1, v1
+ v_lshlrev_b32 v1, 2, v1 // 0, 4, 8, ... 124 (W32) or 252 (W64)
+
s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64
L_RESTORE_LDS_LOOP_W32:
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
+ global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
s_wait_idle
- ds_store_addtid_b32 v0
+ ds_store_b32 v1, v0
+ v_add_nc_u32 v1, v1, 128
s_add_u32 m0, m0, 128 // 128 DW
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 128DW
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
@@ -764,9 +909,12 @@ L_RESTORE_LDS_LOOP_W32:
s_branch L_RESTORE_VGPR
L_RESTORE_LDS_LOOP_W64:
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
+ global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
s_wait_idle
- ds_store_addtid_b32 v0
+ ds_store_b32 v1, v0
+ v_add_nc_u32 v1, v1, 256
s_add_u32 m0, m0, 256 // 256 DW
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256DW
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
@@ -795,20 +943,18 @@ L_RESTORE_VGPR_NORMAL:
s_cmp_eq_u32 m0, 1
s_cbranch_scc1 L_RESTORE_VGPR_WAVE64
- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
// VGPR load using dw burst
s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4
s_mov_b32 m0, 4 //VGPR initial index value = 4
- s_cmp_lt_u32 m0, s_restore_alloc_size
- s_cbranch_scc0 L_RESTORE_SGPR
L_RESTORE_VGPR_WAVE32_LOOP:
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128
- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*2
- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:128*3
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
+ global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
+ global_load_addtid_b32 v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128
+ global_load_addtid_b32 v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*2
+ global_load_addtid_b32 v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*3
s_wait_idle
v_movreld_b32 v0, v0 //v[0+m0] = v0
v_movreld_b32 v1, v1
@@ -820,29 +966,31 @@ L_RESTORE_VGPR_WAVE32_LOOP:
s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete?
/* VGPR restore on v0 */
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS
- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128
- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*2
- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:128*3
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset_save
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
+ global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
+ global_load_addtid_b32 v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128
+ global_load_addtid_b32 v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*2
+ global_load_addtid_b32 v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:128*3
s_wait_idle
s_branch L_RESTORE_SGPR
L_RESTORE_VGPR_WAVE64:
- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
-
// VGPR load using dw burst
s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v4, v0 will be the last
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
s_mov_b32 m0, 4 //VGPR initial index value = 4
s_cmp_lt_u32 m0, s_restore_alloc_size
- s_cbranch_scc0 L_RESTORE_SHARED_VGPR
+ s_cbranch_scc0 L_RESTORE_V0
L_RESTORE_VGPR_WAVE64_LOOP:
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256
- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256*2
- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS offset:256*3
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
+ global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
+ global_load_addtid_b32 v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256
+ global_load_addtid_b32 v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*2
+ global_load_addtid_b32 v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*3
s_wait_idle
v_movreld_b32 v0, v0 //v[0+m0] = v0
v_movreld_b32 v1, v1
@@ -853,50 +1001,29 @@ L_RESTORE_VGPR_WAVE64_LOOP:
s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete?
-L_RESTORE_SHARED_VGPR:
- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
- s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
- s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used?
- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
- //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
- //restore shared_vgpr will start from the index of m0
- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0
- s_mov_b32 exec_lo, 0xFFFFFFFF
- s_mov_b32 exec_hi, 0x00000000
-L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset scope:SCOPE_SYS
- s_wait_idle
- v_movreld_b32 v0, v0 //v[0+m0] = v0
- s_add_u32 m0, m0, 1 //next vgpr index
- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128
- s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
- s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete?
-
- s_mov_b32 exec_hi, 0xFFFFFFFF //restore back exec_hi before restoring V0!!
-
/* VGPR restore on v0 */
L_RESTORE_V0:
- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS
- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256
- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*2
- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save scope:SCOPE_SYS offset:256*3
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset_save
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
+ global_load_addtid_b32 v0, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS
+ global_load_addtid_b32 v1, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256
+ global_load_addtid_b32 v2, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*2
+ global_load_addtid_b32 v3, [s_restore_addr_lo, s_restore_addr_hi] scope:SCOPE_SYS offset:256*3
s_wait_idle
/* restore SGPRs */
//will be 2+8+16*6
- // SGPR SR memory offset : size(VGPR)+size(SVGPR)
+ // SGPR SR memory offset : size(VGPR)
L_RESTORE_SGPR:
get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
- get_svgpr_size_bytes(s_restore_tmp)
- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 20*4 //s108~s127 is not saved
-
- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 24*4 // s[104:107]
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
s_mov_b32 m0, s_sgpr_save_num
- read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_load_b128 s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS
s_wait_idle
s_sub_u32 m0, m0, 4 // Restore from S[0] to S[104]
@@ -905,7 +1032,9 @@ L_RESTORE_SGPR:
s_movreld_b64 s0, s0 //s[0+m0] = s0
s_movreld_b64 s2, s2
- read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_sub_co_u32 s_restore_addr_lo, s_restore_addr_lo, 8*4 // s[96:103]
+ s_sub_co_ci_u32 s_restore_addr_hi, s_restore_addr_hi, 0
+ s_load_b256 s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS
s_wait_idle
s_sub_u32 m0, m0, 8 // Restore from S[0] to S[96]
@@ -917,7 +1046,9 @@ L_RESTORE_SGPR:
s_movreld_b64 s6, s6
L_RESTORE_SGPR_LOOP:
- read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_sub_co_u32 s_restore_addr_lo, s_restore_addr_lo, 16*4 // s[0,16,32,48,64,80]
+ s_sub_co_ci_u32 s_restore_addr_hi, s_restore_addr_hi, 0
+ s_load_b512 s0, [s_restore_addr_lo, s_restore_addr_hi], 0x0 scope:SCOPE_SYS
s_wait_idle
s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0]
@@ -941,76 +1072,123 @@ L_RESTORE_SGPR:
/* restore HW registers */
L_RESTORE_HWREG:
- // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
+ // HWREG SR memory offset : size(VGPR)+size(SGPR)
get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
- get_svgpr_size_bytes(s_restore_tmp)
- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
-
- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ s_add_u32 s_restore_addr_lo, s_restore_base_addr_lo, s_restore_mem_offset
+ s_addc_u32 s_restore_addr_hi, s_restore_base_addr_hi, 0x0
// Restore s_restore_spi_init_hi before the saved value gets clobbered.
s_mov_b32 s_restore_spi_init_hi, s_restore_spi_init_hi_save
- read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_state_priv, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_excp_flag_priv, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
- read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_load_b32 s_restore_m0, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS
+ s_load_b32 s_restore_pc_lo, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x4
+ s_load_b32 s_restore_pc_hi, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x8
+ s_load_b32 s_restore_exec_lo, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0xC
+ s_load_b32 s_restore_exec_hi, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x10
+ s_load_b32 s_restore_state_priv, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x14
+ s_load_b32 s_restore_excp_flag_priv, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x18
+ s_load_b32 s_restore_xnack_mask, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x1C
+ s_load_b32 s_restore_mode, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x20
+ s_load_b32 s_restore_flat_scratch, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x24
s_wait_idle
s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_LO), s_restore_flat_scratch
- read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_load_b32 s_restore_flat_scratch, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x28
s_wait_idle
s_setreg_b32 hwreg(HW_REG_WAVE_SCRATCH_BASE_HI), s_restore_flat_scratch
- read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x2C
s_wait_idle
s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp
- read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x30
s_wait_idle
s_setreg_b32 hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp
- // Only the first wave needs to restore the workgroup barrier.
+ // Only the first wave needs to restore group barriers.
s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
- s_cbranch_scc0 L_SKIP_BARRIER_RESTORE
+ s_cbranch_scc0 L_SKIP_GROUP_BARRIER_RESTORE
// Skip over WAVE_STATUS, since there is no state to restore from it
- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 4
- read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
+ s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x38
s_wait_idle
+ // Skip group barriers if wave is not part of a group.
s_bitcmp1_b32 s_restore_tmp, BARRIER_STATE_VALID_OFFSET
- s_cbranch_scc0 L_SKIP_BARRIER_RESTORE
+ s_cbranch_scc0 L_SKIP_GROUP_BARRIER_RESTORE
- // extract the saved signal count from s_restore_tmp
- s_lshr_b32 s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET
+ // Restore workgroup barrier signal count.
+ restore_barrier_signal_count(-1)
- // We need to call s_barrier_signal repeatedly to restore the signal
- // count of the work group barrier. The member count is already
- // initialized with the number of waves in the work group.
-L_BARRIER_RESTORE_LOOP:
- s_and_b32 s_restore_tmp, s_restore_tmp, s_restore_tmp
- s_cbranch_scc0 L_SKIP_BARRIER_RESTORE
- s_barrier_signal -1
- s_add_i32 s_restore_tmp, s_restore_tmp, -1
- s_branch L_BARRIER_RESTORE_LOOP
+#if NUM_NAMED_BARRIERS
+ s_mov_b32 s_restore_mem_offset, NAMED_BARRIERS_SR_OFFSET_FROM_HWREG
+ s_mov_b32 m0, 1
+
+L_RESTORE_NAMED_BARRIER_LOOP:
+ s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], s_restore_mem_offset scope:SCOPE_SYS
+ s_wait_kmcnt 0
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 0x4
-L_SKIP_BARRIER_RESTORE:
+ // Restore named barrier member count.
+ s_bfe_u32 exec_lo, s_restore_tmp, (BARRIER_STATE_MEMBER_OFFSET | (BARRIER_STATE_MEMBER_SIZE << 16))
+ s_lshl_b32 exec_lo, exec_lo, S_BARRIER_INIT_MEMBERCNT_SHIFT
+ s_or_b32 m0, m0, exec_lo
+ s_barrier_init m0
+ s_andn2_b32 m0, m0, S_BARRIER_INIT_MEMBERCNT_MASK
+
+ // Restore named barrier signal count.
+ restore_barrier_signal_count(m0)
+
+ s_add_u32 m0, m0, 1
+ s_cmp_gt_u32 m0, NUM_NAMED_BARRIERS
+ s_cbranch_scc0 L_RESTORE_NAMED_BARRIER_LOOP
+#endif
+
+L_SKIP_GROUP_BARRIER_RESTORE:
+#if HAVE_CLUSTER_BARRIER
+ s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x3C
+ s_wait_kmcnt 0
+
+ // Skip cluster barrier restore if wave is not part of a cluster.
+ s_bitcmp1_b32 s_restore_tmp, BARRIER_STATE_VALID_OFFSET
+ s_cbranch_scc0 L_SKIP_CLUSTER_BARRIER_RESTORE
+
+ // Only the first wave in the group signals the trap cluster barrier.
+ s_bitcmp1_b32 s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT
+ s_cbranch_scc0 L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL
+
+ // Clear SCC: s_barrier_signal_isfirst -4 writes SCC=>1 but not SCC=>0.
+ s_cmp_eq_u32 0, 1
+ s_barrier_signal_isfirst -4
+L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL:
+ s_barrier_wait -4
+
+ // Only the first wave in the cluster restores the barrier.
+ s_cbranch_scc0 L_SKIP_CLUSTER_BARRIER_RESTORE
+
+ // Restore cluster barrier signal count.
+ restore_barrier_signal_count(-3)
+L_SKIP_CLUSTER_BARRIER_RESTORE:
+#endif
+
+#if ASIC_FAMILY >= CHIP_GC_12_0_3
+ s_load_b32 s_restore_tmp, [s_restore_addr_lo, s_restore_addr_hi], null scope:SCOPE_SYS offset:0x40
+ s_wait_kmcnt 0
+ s_setreg_b32 hwreg(HW_REG_WAVE_SCHED_MODE), s_restore_tmp
+#endif
s_mov_b32 m0, s_restore_m0
s_mov_b32 exec_lo, s_restore_exec_lo
s_mov_b32 exec_hi, s_restore_exec_hi
+#if HAVE_XNACK
+ s_setreg_b32 hwreg(HW_REG_WAVE_XNACK_MASK), s_restore_xnack_mask
+#endif
+
// EXCP_FLAG_PRIV.SAVE_CONTEXT and HOST_TRAP may have changed.
// Only restore the other fields to avoid clobbering them.
s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, 0, SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_1_SIZE), s_restore_excp_flag_priv
@@ -1022,37 +1200,50 @@ L_SKIP_BARRIER_RESTORE:
s_setreg_b32 hwreg(HW_REG_WAVE_MODE), s_restore_mode
// Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
- // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
+ // ttmp SR memory offset :
+ // - gfx12: size(VGPR)+size(SGPR)+0x40
+ // - gfx12.5: size(VGPR)+size(SGPR)-0x40
get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size)
- get_svgpr_size_bytes(s_restore_ttmps_hi)
- s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
- s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes()
- s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
- s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
- s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
- s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 scope:SCOPE_SYS
- s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 scope:SCOPE_SYS
- s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 scope:SCOPE_SYS
+ s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, (get_sgpr_size_bytes() + TTMP_SR_OFFSET_FROM_HWREG)
+ s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_base_addr_lo
+ s_addc_u32 s_restore_ttmps_hi, s_restore_base_addr_hi, 0x0
+ s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x10 scope:SCOPE_SYS
+ s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x20 scope:SCOPE_SYS
+ s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x34 scope:SCOPE_SYS
s_wait_idle
- s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
+#if HAVE_XNACK
+ restore_xnack_state_priv(s_restore_tmp)
+#endif
+
+ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, ADDRESS_HI32_MASK //Do it here in order not to affect STATUS
s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
+#if RELAXED_SCHEDULING_IN_TRAP
+ // Assume relaxed scheduling mode after this point.
+ restore_sched_mode(s_restore_tmp)
+#endif
+
s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV), s_restore_state_priv // SCC is included, which is changed by previous salu
- // Make barrier and LDS state visible to all waves in the group.
- // STATE_PRIV.BARRIER_COMPLETE may change after this point.
- s_barrier_signal -2
- s_barrier_wait -2
+ // Make barrier and LDS state visible to all waves in the group/cluster.
+ // STATE_PRIV.*BARRIER_COMPLETE may change after this point.
+ wait_trap_barriers(s_restore_tmp, 0, 0)
+
+#if HAVE_CLUSTER_BARRIER
+ // SCC is changed by wait_trap_barriers, restore it separately.
+ s_lshr_b32 s_restore_state_priv, s_restore_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT
+ s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, 1), s_restore_state_priv
+#endif
s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
L_END_PGM:
- // Make sure that no wave of the workgroup can exit the trap handler
- // before the workgroup barrier state is saved.
- s_barrier_signal -2
- s_barrier_wait -2
+ // Make sure that no wave of the group/cluster can exit the trap handler
+ // before the group/cluster barrier state is saved.
+ wait_trap_barriers(s_restore_tmp, 0, 0)
+
s_endpgm_saved
end
@@ -1079,26 +1270,6 @@ function write_12sgpr_to_v2(s)
end
end
-function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
- s_buffer_load_dword s, s_rsrc, s_mem_offset scope:SCOPE_SYS
- s_add_u32 s_mem_offset, s_mem_offset, 4
-end
-
-function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
- s_sub_u32 s_mem_offset, s_mem_offset, 4*16
- s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset scope:SCOPE_SYS
-end
-
-function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
- s_sub_u32 s_mem_offset, s_mem_offset, 4*8
- s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset scope:SCOPE_SYS
-end
-
-function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
- s_sub_u32 s_mem_offset, s_mem_offset, 4*4
- s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset scope:SCOPE_SYS
-end
-
function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_WAVE_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
@@ -1111,20 +1282,275 @@ L_ENABLE_SHIFT_W64:
L_SHIFT_DONE:
end
-function get_svgpr_size_bytes(s_svgpr_size_byte)
- s_getreg_b32 s_svgpr_size_byte, hwreg(HW_REG_WAVE_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
- s_lshl_b32 s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
-end
-
function get_sgpr_size_bytes
return 512
end
function get_hwreg_size_bytes
+#if ASIC_FAMILY >= CHIP_GC_12_0_3
+ return 512
+#else
return 128
+#endif
end
function get_wave_size2(s_reg)
s_getreg_b32 s_reg, hwreg(HW_REG_WAVE_STATUS,SQ_WAVE_STATUS_WAVE64_SHIFT,SQ_WAVE_STATUS_WAVE64_SIZE)
s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE
end
+
+#if HAVE_XNACK
+function save_and_clear_xnack_state_priv(s_tmp)
+ // Preserve and clear XNACK state before issuing further translations.
+ // Save XNACK_STATE_PRIV.{FIRST_REPLAY, REPLAY_W64H, FXPTR} into ttmp11[22:14].
+ s_andn2_b32 ttmp11, ttmp11, (TTMP11_FIRST_REPLAY_MASK | TTMP11_REPLAY_W64H_MASK | TTMP11_FXPTR_MASK)
+
+ s_getreg_b32 s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE)
+ s_lshl_b32 s_tmp, s_tmp, TTMP11_FIRST_REPLAY_SHIFT
+ s_or_b32 ttmp11, ttmp11, s_tmp
+
+ s_getreg_b32 s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE)
+ s_lshl_b32 s_tmp, s_tmp, TTMP11_REPLAY_W64H_SHIFT
+ s_or_b32 ttmp11, ttmp11, s_tmp
+
+ s_getreg_b32 s_tmp, hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE)
+ s_lshl_b32 s_tmp, s_tmp, TTMP11_FXPTR_SHIFT
+ s_or_b32 ttmp11, ttmp11, s_tmp
+
+ s_setreg_imm32_b32 hwreg(HW_REG_WAVE_XNACK_STATE_PRIV), 0
+end
+
+function restore_xnack_state_priv(s_tmp)
+ s_lshr_b32 s_tmp, ttmp11, TTMP11_FIRST_REPLAY_SHIFT
+ s_setreg_b32 hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FIRST_REPLAY_SIZE), s_tmp
+
+ s_lshr_b32 s_tmp, ttmp11, TTMP11_REPLAY_W64H_SHIFT
+ s_setreg_b32 hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_REPLAY_W64H_SIZE), s_tmp
+
+ s_lshr_b32 s_tmp, ttmp11, TTMP11_FXPTR_SHIFT
+ s_setreg_b32 hwreg(HW_REG_WAVE_XNACK_STATE_PRIV, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SHIFT, SQ_WAVE_XNACK_STATE_PRIV_FXPTR_SIZE), s_tmp
+end
+#endif
+
+function wait_trap_barriers(s_tmp1, s_tmp2, serialize_wa)
+#if HAVE_CLUSTER_BARRIER
+ // If not in a WG then wave cannot use s_barrier_signal_isfirst.
+ s_getreg_b32 s_tmp1, hwreg(HW_REG_WAVE_STATUS)
+ s_bitcmp0_b32 s_tmp1, SQ_WAVE_STATUS_IN_WG_SHIFT
+ s_cbranch_scc1 L_TRAP_CLUSTER_BARRIER_SIGNAL
+
+ s_barrier_signal_isfirst -2
+ s_barrier_wait -2
+
+ // Only the first wave in the group signals the trap cluster barrier.
+ s_cbranch_scc0 L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL
+
+L_TRAP_CLUSTER_BARRIER_SIGNAL:
+ s_barrier_signal -4
+
+L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL:
+ s_barrier_wait -4
+
+#if CLUSTER_BARRIER_SERIALIZE_WORKAROUND
+if serialize_wa
+ // Trap cluster barrier may complete with a user cluster barrier in-flight.
+ // This is indicated if user cluster member count and signal count are equal.
+L_WAIT_USER_CLUSTER_BARRIER_COMPLETE:
+ s_sendmsg_rtn_b32 s_tmp1, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE)
+ s_wait_kmcnt 0
+ s_bitcmp0_b32 s_tmp1, BARRIER_STATE_VALID_OFFSET
+ s_cbranch_scc1 L_NOT_IN_CLUSTER
+
+ s_bfe_u32 s_tmp2, s_tmp1, (BARRIER_STATE_MEMBER_OFFSET | (BARRIER_STATE_MEMBER_SIZE << 0x10))
+ s_bfe_u32 s_tmp1, s_tmp1, (BARRIER_STATE_SIGNAL_OFFSET | (BARRIER_STATE_SIGNAL_SIZE << 0x10))
+ s_cmp_eq_u32 s_tmp1, s_tmp2
+ s_cbranch_scc1 L_WAIT_USER_CLUSTER_BARRIER_COMPLETE
+end
+L_NOT_IN_CLUSTER:
+#endif
+
+#else
+ s_barrier_signal -2
+ s_barrier_wait -2
+#endif
+end
+
+#if RELAXED_SCHEDULING_IN_TRAP
+function restore_sched_mode(s_tmp)
+ s_bfe_u32 s_tmp, ttmp11, (TTMP11_SCHED_MODE_SHIFT | (TTMP11_SCHED_MODE_SIZE << 0x10))
+ s_setreg_b32 hwreg(HW_REG_WAVE_SCHED_MODE), s_tmp
+end
+#endif
+
+function restore_barrier_signal_count(barrier_id)
+ // extract the saved signal count from s_restore_tmp
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET
+
+ // We need to call s_barrier_signal repeatedly to restore the signal count
+ // of the group/cluster barrier. The member count is already initialized.
+L_BARRIER_RESTORE_LOOP:
+ s_and_b32 s_restore_tmp, s_restore_tmp, s_restore_tmp
+ s_cbranch_scc0 L_BARRIER_RESTORE_DONE
+ s_barrier_signal barrier_id
+ s_add_i32 s_restore_tmp, s_restore_tmp, -1
+ s_branch L_BARRIER_RESTORE_LOOP
+
+L_BARRIER_RESTORE_DONE:
+end
+
+#if HAVE_INSTRUCTION_FIXUP
+function fixup_instruction
+ // PC read may fault if memory violation has been asserted.
+ // In this case no further progress is expected so fixup is not needed.
+ s_bitcmp1_b32 s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_SHIFT
+ s_cbranch_scc1 L_FIXUP_DONE
+
+ // ttmp[0:1]: {7b'0} PC[56:0]
+ // ttmp2, 3, 10, 13, 14, 15: free
+ s_load_b64 [ttmp14, ttmp15], [ttmp0, ttmp1], 0 scope:SCOPE_CU // Load the 2 instruction DW we are returning to
+ s_wait_kmcnt 0
+ s_load_b64 [ttmp2, ttmp3], [ttmp0, ttmp1], 8 scope:SCOPE_CU // Load the next 2 instruction DW, just in case
+ s_and_b32 ttmp10, ttmp14, 0x80000000 // Check bit 31 in the first DWORD
+ // SCC set if ttmp10 is != 0, i.e. if bit 31 == 1
+ s_cbranch_scc1 L_FIXUP_NOT_VOP12C // If bit 31 is 1, we are not VOP1, VOP2, or VOP3C
+ // Fall through here means bit 31 == 0, meaning we are VOP1, VOP2, or VOPC
+ // Size of instruction depends on Opcode or SRC0_9
+ // Check for VOP2 opcode
+ s_bfe_u32 ttmp10, ttmp14, (25 | (6 << 0x10)) // Check bits 30:25 for VOP2 Opcode
+ // VOP2 V_FMAMK_F64 of V_FMAAK_F64 has implied 64-bit literature, 3 DW
+ s_sub_co_i32 ttmp13, ttmp10, 0x23 // V_FMAMK_F64 is 0x23, V_FMAAK_F64 is 0x24
+ s_cmp_le_u32 ttmp13, 0x1 // 0==0x23, 1==0x24
+ s_cbranch_scc1 L_FIXUP_THREE_DWORD // If either, this is 3 DWORD inst
+ // VOP2 V_FMAMK_F32, V_FMAAK_F32, V_FMAMK_F16, V_FMAAK_F16, 2 DW
+ s_sub_co_i32 ttmp13, ttmp10, 0x2c // V_FMAMK_F32 is 0x2c, V_FMAAK_F32 is 0x2d
+ s_cmp_le_u32 ttmp13, 0x1 // 0==0x2c, 1==0x2d
+ s_cbranch_scc1 L_FIXUP_TWO_DWORD // If either, this is 2 DWORD inst
+ s_sub_co_i32 ttmp13, ttmp10, 0x37 // V_FMAMK_F16 is 0x37, V_FMAAK_F16 is 0x38
+ s_cmp_le_u32 ttmp13, 0x1 // 0==0x37, 1==0x38
+ s_cbranch_scc1 L_FIXUP_TWO_DWORD // If either, this is 2 DWORD inst
+ // Check SRC0_9 for VOP1, VOP2, and VOPC
+ s_and_b32 ttmp10, ttmp14, 0x1ff // Check bits 8:0 for SRC0_9
+ // Literal constant 64 is 3 DWORDs
+ s_cmp_eq_u32 ttmp10, 0xfe // 0xfe == 254 == Literal constant64
+ s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst
+ // Literal constant 32, DPP16, DPP8, and DPP8FI are 2 DWORDs
+ s_cmp_eq_u32 ttmp10, 0xff // 0xff == 255 = Literal constant32
+ s_cbranch_scc1 L_FIXUP_TWO_DWORD // 2 DWORD inst
+ s_cmp_eq_u32 ttmp10, 0xfa // 0xfa == 250 = DPP16
+ s_cbranch_scc1 L_FIXUP_TWO_DWORD // 2 DWORD inst
+ s_sub_co_i32 ttmp13, ttmp10, 0xe9 // DPP8 is 0xe9, DPP8FI is 0xea
+ s_cmp_le_u32 ttmp13, 0x1 // 0==0xe9, 1==0xea
+ s_cbranch_scc1 L_FIXUP_TWO_DWORD // If either, this is 2 DWORD inst
+ // Instruction is 1 DWORD otherwise
+
+L_FIXUP_ONE_DWORD:
+ // Check if TTMP15 contains the value for S_SET_VGPR_MSB instruction
+ s_and_b32 ttmp10, ttmp15, 0xffff0000 // Check encoding in upper 16 bits
+ s_cmp_eq_u32 ttmp10, 0xbf860000 // Check if SOPP (9b'10_1111111) and S_SET_VGPR_MSB (7b'0000110)
+ s_cbranch_scc0 L_FIXUP_DONE // No problem, no fixup needed
+ // VALU op followed by a S_SET_VGPR_MSB. Need to pull SIMM[15:8] to fix up MODE.*_VGPR_MSB
+ s_bfe_u32 ttmp10, ttmp15, (14 | (2 << 0x10)) // Shift SIMM[15:14] over to 1:0, Dst
+ s_and_b32 ttmp13, ttmp15, 0x3f00 // Mask to get SIMM[13:8] only
+ s_lshr_b32 ttmp13, ttmp13, 6 // Shift SIMM[13:8] into 7:2, Src2, Src1, Src0
+ s_or_b32 ttmp10, ttmp10, ttmp13 // Src2, Src1, Src0, Dst --> format in MODE register
+ s_setreg_b32 hwreg(HW_REG_WAVE_MODE, 12, 8), ttmp10 // Write value into MODE[19:12]
+ s_branch L_FIXUP_DONE
+
+L_FIXUP_NOT_VOP12C:
+ // ttmp[0:1]: {7b'0} PC[56:0]
+ // ttmp2: PC+2 value (not waitcnt'ed yet)
+ // ttmp3: PC+3 value (not waitcnt'ed yet)
+ // ttmp10, ttmp13: free
+ // ttmp14: PC+O value
+ // ttmp15: PC+1 value
+ // Not VOP1, VOP2, or VOPC.
+ // Check if we are VOP3 or VOP3SD
+ s_and_b32 ttmp10, ttmp14, 0xfc000000 // Bits 31:26
+ s_cmp_eq_u32 ttmp10, 0xd4000000 // If 31:26 = 0x35, this is VOP3 or VOP3SD
+ s_cbranch_scc1 L_FIXUP_CHECK_VOP3 // If VOP3 or VOP3SD, need to check SRC2_9, SRC1_9, SRC0_9
+ // Not VOP1, VOP2, VOPC, VOP3, or VOP3SD.
+ // Check for VOPD
+ s_cmp_eq_u32 ttmp10, 0xc8000000 // If 31:26 = 0x32, this is VOPD
+ s_cbranch_scc1 L_FIXUP_CHECK_VOPD // If VOPD, need to check OpX, OpY, SRCX0 and SRCY0
+ // Not VOP1, VOP2, VOPC, VOP3, VOP3SD, VOPD.
+ // Check if we are VOPD3
+ s_and_b32 ttmp10, ttmp14, 0xff000000 // Bits 31:24
+ s_cmp_eq_u32 ttmp10, 0xcf000000 // If 31:24 = 0xcf, this is VOPD3
+ s_cbranch_scc1 L_FIXUP_THREE_DWORD // If VOPD3, 3 DWORD inst
+ // Not VOP1, VOP2, VOPC, VOP3, VOP3SD, VOPD, or VOPD3.
+ // Check if we are in the middle of VOP3PX.
+ s_and_b32 ttmp13, ttmp14, 0xffff0000 // Bits 31:16
+ s_cmp_eq_u32 ttmp13, 0xcc330000 // If 31:16 = 0xcc33, this is 8 bytes past VOP3PX
+ s_cbranch_scc1 L_FIXUP_VOP3PX_MIDDLE
+ s_cmp_eq_u32 ttmp13, 0xcc880000 // If 31:16 = 0xcc88, this is 8 bytes past VOP3PX
+ s_cbranch_scc1 L_FIXUP_VOP3PX_MIDDLE
+ // Might be in VOP3P, but we must ensure we are not VOP3PX2
+ s_cmp_eq_u32 ttmp13, 0xcc350000 // If 31:16 = 0xcc35, this is VOP3PX2
+ s_cbranch_scc1 L_FIXUP_DONE // If VOP3PX2, no fixup needed
+ s_cmp_eq_u32 ttmp13, 0xcc3a0000 // If 31:16 = 0xcc3a, this is VOP3PX2
+ s_cbranch_scc1 L_FIXUP_DONE // If VOP3PX2, no fixup needed
+ // Check if we are VOP3P
+ s_cmp_eq_u32 ttmp10, 0xcc000000 // If 31:24 = 0xcc, this is VOP3P
+ s_cbranch_scc0 L_FIXUP_DONE // Not in VOP3P, so instruction is not VOP1, VOP2,
+ // VOPC, VOP3, VOP3SD, VOP3P, VOPD, or VOPD3
+ // No fixup needed.
+ // Fall-through if we are in VOP3P to check SRC2_9, SRC1_9, and SRC0_9
+L_FIXUP_CHECK_VOP3:
+ // Start with Src0, which is in bits 8:0 of second instruction DW, ttmp15
+ s_and_b32 ttmp10, ttmp15, 0x1ff // Mask out unused bits
+ // Src0_9 == Literal constant 32, DPP16, DPP8, and DPP8FI means 3 DWORDs
+ s_cmp_eq_u32 ttmp10, 0xff // 0xff == 255 = Literal constant32
+ s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst
+ s_cmp_eq_u32 ttmp10, 0xfa // 0xfa == 250 = DPP16
+ s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst
+ s_sub_co_i32 ttmp10, ttmp10, 0xe9 // DPP8 is 0xe9, DPP8FI is 0xea
+ s_cmp_le_u32 ttmp10, 0x1 // 0==0xe9, 1==0xea
+ s_cbranch_scc1 L_FIXUP_THREE_DWORD // If either, this is 3 DWORD inst
+ s_and_b32 ttmp10, ttmp15, 0x3fe00 // Next is Src1, which is in 17:9
+ s_cmp_eq_u32 ttmp10, 0x1fe00 // 0xff == 255 = Literal constant32
+ s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst
+ s_and_b32 ttmp10, ttmp15, 0x7fc0000 // Next is Src2, which is in 26:18
+ s_cmp_eq_u32 ttmp10, 0x3fc0000 // 0xff == 255 = Literal constant32
+ s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst
+ s_branch L_FIXUP_TWO_DWORD // No special encodings, VOP3* is 2 Dword
+
+L_FIXUP_CHECK_VOPD:
+ // OpX being V_DUAL_FMA*K_F32 means 3 DWORDs
+ s_bfe_u32 ttmp10, ttmp14, (22 | (4 << 0x10)) // OPX is bits 25:22
+ s_sub_co_i32 ttmp10, ttmp10, 0x1 // V_DUAL_FMAAK_F32 is 0x1, V_DUAL_FMAMK_F32 is 0x2
+ s_cmp_le_u32 ttmp10, 0x1 // 0==0x1, 1==0x2
+ s_cbranch_scc1 L_FIXUP_THREE_DWORD // If either, this is 3 DWORD inst
+ // OpY being V_DUAL_FMA*K_F32 means 3 DWORDs
+ s_bfe_u32 ttmp10, ttmp14, (17 | (5 << 0x10)) // OPX is bits 21:17
+ s_sub_co_i32 ttmp10, ttmp10, 0x1 // V_DUAL_FMAAK_F32 is 0x1, V_DUAL_FMAMK_F32 is 0x2
+ s_cmp_le_u32 ttmp10, 0x1 // 0==0x1, 1==0x2
+ s_cbranch_scc1 L_FIXUP_THREE_DWORD // If either, this is 3 DWORD inst
+ // SRCX0 == Literal constant 32 means 3 DWORDs
+ s_and_b32 ttmp10, ttmp14, 0x1ff // SRCX0 is in bits 8:0 of 1st DWORD
+ s_cmp_eq_u32 ttmp10, 0xff // 0xff == 255 = Literal constant32
+ s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst
+ // SRCY0 == Literal constant 32 means 3 DWORDs
+ s_and_b32 ttmp10, ttmp15, 0x1ff // SRCY0 is in bits 8:0 of 2nd DWORD
+ s_cmp_eq_u32 ttmp10, 0xff // 0xff == 255 = Literal constant32
+ s_cbranch_scc1 L_FIXUP_THREE_DWORD // 3 DWORD inst
+ // If otherwise, no special encodings. Default VOPD is 2 Dword
+ // Fall-thru if true, because this is a 2 DWORD inst
+L_FIXUP_TWO_DWORD:
+ s_wait_kmcnt 0 // Wait for PC+2 and PC+3 to arrive in ttmp2 and ttmp3
+ s_mov_b32 ttmp15, ttmp2 // Move possible S_SET_VGPR_MSB into ttmp15
+ s_branch L_FIXUP_ONE_DWORD // Go to common logic that checks if it is S_SET_VGPR_MSB
+
+L_FIXUP_THREE_DWORD:
+ s_wait_kmcnt 0 // Wait for PC+2 and PC+3 to arrive in ttmp2 and ttmp3
+ s_mov_b32 ttmp15, ttmp3 // Move possible S_SET_VGPR_MSB into ttmp15
+ s_branch L_FIXUP_ONE_DWORD // Go to common logic that checks if it is S_SET_VGPR_MSB
+
+L_FIXUP_VOP3PX_MIDDLE:
+ s_sub_co_u32 ttmp0, ttmp0, 8 // Rewind PC 8 bytes to beginning of instruction
+ s_sub_co_ci_u32 ttmp1, ttmp1, 0
+ s_branch L_FIXUP_TWO_DWORD // 2 DWORD inst (2nd half of a 4 DWORD inst)
+
+L_FIXUP_DONE:
+ s_wait_kmcnt 0 // Ensure load of ttmp2 and ttmp3 is done
+end
+#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 065d87841459..03b266b26738 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -22,10 +22,10 @@
*/
#include <linux/device.h>
-#include <linux/export.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/file.h>
+#include <linux/overflow.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
@@ -67,6 +67,21 @@ static const struct class kfd_class = {
.name = kfd_dev_name,
};
+/*
+ * Cache the address space of the chardev on first open so that the reset
+ * path can drop all userspace mappings of doorbell and MMIO ranges via
+ * unmap_mapping_range().
+ */
+static struct address_space *kfd_dev_mapping;
+
+void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen)
+{
+ struct address_space *mapping = READ_ONCE(kfd_dev_mapping);
+
+ if (mapping)
+ unmap_mapping_range(mapping, holebegin, holelen, 1);
+}
+
static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct kfd_process *p, __u32 gpu_id)
{
struct kfd_process_device *pdd;
@@ -133,6 +148,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
if (iminor(inode) != 0)
return -ENODEV;
+ /*
+ * /dev/kfd is a single chardev so all opens share one inode. Cache
+ * its address_space on the first open for use by the reset path.
+ */
+ if (!READ_ONCE(kfd_dev_mapping))
+ cmpxchg(&kfd_dev_mapping, NULL, inode->i_mapping);
+
is_32bit_user_mode = in_compat_syscall();
if (is_32bit_user_mode) {
@@ -155,8 +177,8 @@ static int kfd_open(struct inode *inode, struct file *filep)
/* filep now owns the reference returned by kfd_create_process */
filep->private_data = process;
- dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
- process->pasid, process->is_32bit_user_mode);
+ dev_dbg(kfd_device, "process pid %d opened kfd node, compat mode (32 bit) - %d\n",
+ process->lead_thread->pid, process->is_32bit_user_mode);
return 0;
}
@@ -165,8 +187,13 @@ static int kfd_release(struct inode *inode, struct file *filep)
{
struct kfd_process *process = filep->private_data;
- if (process)
- kfd_unref_process(process);
+ if (!process)
+ return 0;
+
+ if (process->context_id != KFD_CONTEXT_ID_PRIMARY)
+ kfd_process_notifier_release_internal(process);
+
+ kfd_unref_process(process);
return 0;
}
@@ -212,6 +239,16 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
return -EINVAL;
}
+ if (args->ring_size < KFD_MIN_QUEUE_RING_SIZE) {
+ args->ring_size = KFD_MIN_QUEUE_RING_SIZE;
+ pr_debug("Size lower. clamped to KFD_MIN_QUEUE_RING_SIZE");
+ }
+
+ if ((args->metadata_ring_size != 0) && !is_power_of_2(args->metadata_ring_size)) {
+ pr_err("Metadata ring size must be a power of 2 or 0\n");
+ return -EINVAL;
+ }
+
if (!access_ok((const void __user *) args->read_pointer_address,
sizeof(uint32_t))) {
pr_err("Can't access read pointer\n");
@@ -246,6 +283,9 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
q_properties->priority = args->queue_priority;
q_properties->queue_address = args->ring_base_address;
q_properties->queue_size = args->ring_size;
+ if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
+ q_properties->metadata_queue_size = args->metadata_ring_size;
+
q_properties->read_ptr = (void __user *)args->read_pointer_address;
q_properties->write_ptr = (void __user *)args->write_pointer_address;
q_properties->eop_ring_buffer_address = args->eop_buffer_address;
@@ -361,8 +401,8 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
goto err_acquire_queue_buf;
}
- pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
- p->pasid,
+ pr_debug("Creating queue for process pid %d on gpu 0x%x\n",
+ p->lead_thread->pid,
dev->id);
err = pqm_create_queue(&p->pqm, dev, &q_properties, &queue_id,
@@ -415,9 +455,9 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p,
int retval;
struct kfd_ioctl_destroy_queue_args *args = data;
- pr_debug("Destroying queue id %d for pasid 0x%x\n",
+ pr_debug("Destroying queue id %d for process pid %d\n",
args->queue_id,
- p->pasid);
+ p->lead_thread->pid);
mutex_lock(&p->mutex);
@@ -461,6 +501,11 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
return -EINVAL;
}
+ if (args->ring_size < KFD_MIN_QUEUE_RING_SIZE) {
+ args->ring_size = KFD_MIN_QUEUE_RING_SIZE;
+ pr_debug("Size lower. clamped to KFD_MIN_QUEUE_RING_SIZE");
+ }
+
properties.queue_address = args->ring_base_address;
properties.queue_size = args->ring_size;
properties.queue_percent = args->queue_percentage & 0xFF;
@@ -468,8 +513,8 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
properties.pm4_target_xcc = (args->queue_percentage >> 8) & 0xFF;
properties.priority = args->queue_priority;
- pr_debug("Updating queue id %d for pasid 0x%x\n",
- args->queue_id, p->pasid);
+ pr_debug("Updating queue id %d for process pid %d\n",
+ args->queue_id, p->lead_thread->pid);
mutex_lock(&p->mutex);
@@ -512,15 +557,10 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
cu_mask_size = sizeof(uint32_t) * (max_num_cus/32);
}
- minfo.cu_mask.ptr = kzalloc(cu_mask_size, GFP_KERNEL);
- if (!minfo.cu_mask.ptr)
- return -ENOMEM;
-
- retval = copy_from_user(minfo.cu_mask.ptr, cu_mask_ptr, cu_mask_size);
- if (retval) {
+ minfo.cu_mask.ptr = memdup_user(cu_mask_ptr, cu_mask_size);
+ if (IS_ERR(minfo.cu_mask.ptr)) {
pr_debug("Could not copy CU mask from userspace");
- retval = -EFAULT;
- goto out;
+ return PTR_ERR(minfo.cu_mask.ptr);
}
mutex_lock(&p->mutex);
@@ -529,7 +569,6 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
mutex_unlock(&p->mutex);
-out:
kfree(minfo.cu_mask.ptr);
return retval;
}
@@ -596,7 +635,8 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
default_policy,
alternate_policy,
(void __user *)args->alternate_aperture_base,
- args->alternate_aperture_size))
+ args->alternate_aperture_size,
+ args->misc_process_flag))
err = -EINVAL;
out:
@@ -695,7 +735,7 @@ static int kfd_ioctl_get_process_apertures(struct file *filp,
struct kfd_process_device_apertures *pAperture;
int i;
- dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid);
+ dev_dbg(kfd_device, "get apertures for process pid %d", p->lead_thread->pid);
args->num_of_nodes = 0;
@@ -747,7 +787,8 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp,
int ret;
int i;
- dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid);
+ dev_dbg(kfd_device, "get apertures for process pid %d",
+ p->lead_thread->pid);
if (args->num_of_nodes == 0) {
/* Return number of nodes, so that user space can alloacate
@@ -758,12 +799,15 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp,
goto out_unlock;
}
+ if (args->num_of_nodes > kfd_topology_get_num_devices())
+ return -EINVAL;
+
/* Fill in process-aperture information for all available
* nodes, but not more than args->num_of_nodes as that is
* the amount of memory allocated by user
*/
- pa = kcalloc(args->num_of_nodes, sizeof(struct kfd_process_device_apertures),
- GFP_KERNEL);
+ pa = kzalloc_objs(struct kfd_process_device_apertures,
+ args->num_of_nodes);
if (!pa)
return -ENOMEM;
@@ -1052,6 +1096,12 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
if (args->size == 0)
return -EINVAL;
+ if (p->context_id != KFD_CONTEXT_ID_PRIMARY && (flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR)) {
+ pr_debug("USERPTR is not supported on non-primary kfd_process\n");
+
+ return -EOPNOTSUPP;
+ }
+
#if IS_ENABLED(CONFIG_HSA_AMD_SVM)
/* Flush pending deferred work to avoid racing with deferred actions
* from previous memory map changes (e.g. munmap).
@@ -1059,7 +1109,12 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
svm_range_list_lock_and_flush_work(&p->svms, current->mm);
mutex_lock(&p->svms.lock);
mmap_write_unlock(current->mm);
- if (interval_tree_iter_first(&p->svms.objects,
+
+ /* Skip a special case that allocates VRAM without VA,
+ * VA will be invalid of 0.
+ */
+ if (!(!args->va_addr && (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)) &&
+ interval_tree_iter_first(&p->svms.objects,
args->va_addr >> PAGE_SHIFT,
(args->va_addr + args->size - 1) >> PAGE_SHIFT)) {
pr_err("Address: 0x%llx already allocated by SVM\n",
@@ -1327,7 +1382,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
if (WARN_ON_ONCE(!peer_pdd))
continue;
- kfd_flush_tlb(peer_pdd, TLB_FLUSH_LEGACY);
+ kfd_flush_tlb(peer_pdd);
}
kfree(devices_arr);
@@ -1422,7 +1477,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
if (WARN_ON_ONCE(!peer_pdd))
continue;
if (flush_tlb)
- kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
+ kfd_flush_tlb(peer_pdd);
/* Remove dma mapping after tlb flush to avoid IO_PAGE_FAULT */
err = amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv);
@@ -1663,6 +1718,16 @@ static int kfd_ioctl_smi_events(struct file *filep,
return kfd_smi_event_open(pdd->dev, &args->anon_fd);
}
+static int kfd_ioctl_svm_validate(void *kdata, unsigned int usize)
+{
+ struct kfd_ioctl_svm_args *args = kdata;
+ size_t expected = struct_size(args, attrs, args->nattr);
+
+ if (expected == SIZE_MAX || usize < expected)
+ return -EINVAL;
+ return 0;
+}
+
#if IS_ENABLED(CONFIG_HSA_AMD_SVM)
static int kfd_ioctl_set_xnack_mode(struct file *filep,
@@ -1703,6 +1768,12 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
struct kfd_ioctl_svm_args *args = data;
int r = 0;
+ if (p->context_id != KFD_CONTEXT_ID_PRIMARY) {
+ pr_debug("SVM ioctl not supported on non-primary kfd process\n");
+
+ return -EOPNOTSUPP;
+ }
+
pr_debug("start 0x%llx size 0x%llx op 0x%x nattr 0x%x\n",
args->start_addr, args->size, args->op, args->nattr);
@@ -2027,9 +2098,7 @@ static int criu_get_process_object_info(struct kfd_process *p,
num_events = kfd_get_num_events(p);
- ret = svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size);
- if (ret)
- return ret;
+ svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size);
*num_objects = num_queues + num_events + num_svm_ranges;
@@ -2191,7 +2260,7 @@ static int criu_restore_devices(struct kfd_process *p,
if (*priv_offset + (args->num_devices * sizeof(*device_privs)) > max_priv_data_size)
return -EINVAL;
- device_buckets = kmalloc_array(args->num_devices, sizeof(*device_buckets), GFP_KERNEL);
+ device_buckets = kmalloc_objs(*device_buckets, args->num_devices);
if (!device_buckets)
return -ENOMEM;
@@ -2434,7 +2503,7 @@ static int criu_restore_bos(struct kfd_process *p,
/* Prevent MMU notifications until stage-4 IOCTL (CRIU_RESUME) is received */
amdgpu_amdkfd_block_mmu_notifications(p->kgd_process_info);
- bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL);
+ bo_buckets = kvmalloc_objs(*bo_buckets, args->num_bos);
if (!bo_buckets)
return -ENOMEM;
@@ -2452,7 +2521,7 @@ static int criu_restore_bos(struct kfd_process *p,
goto exit;
}
- bo_privs = kvmalloc_array(args->num_bos, sizeof(*bo_privs), GFP_KERNEL);
+ bo_privs = kvmalloc_objs(*bo_privs, args->num_bos);
if (!bo_privs) {
ret = -ENOMEM;
goto exit;
@@ -2557,8 +2626,8 @@ static int criu_restore(struct file *filep,
pr_debug("CRIU restore (num_devices:%u num_bos:%u num_objects:%u priv_data_size:%llu)\n",
args->num_devices, args->num_bos, args->num_objects, args->priv_data_size);
- if (!args->bos || !args->devices || !args->priv_data || !args->priv_data_size ||
- !args->num_devices || !args->num_bos)
+ if ((args->num_bos > 0 && !args->bos) || !args->devices || !args->priv_data ||
+ !args->priv_data_size || !args->num_devices)
return -EINVAL;
mutex_lock(&p->mutex);
@@ -2771,8 +2840,12 @@ static int runtime_enable(struct kfd_process *p, uint64_t r_debug,
* SET_SHADER_DEBUGGER clears any stale process context data
* saved in MES.
*/
- if (pdd->dev->kfd->shared_resources.enable_mes)
- kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
+ if (pdd->dev->kfd->shared_resources.enable_mes) {
+ ret = kfd_dbg_set_mes_debug_mode(
+ pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
+ if (ret)
+ return ret;
+ }
}
p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
@@ -2818,7 +2891,7 @@ retry:
static int runtime_disable(struct kfd_process *p)
{
- int i = 0, ret;
+ int i = 0, ret = 0;
bool was_enabled = p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED;
p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_DISABLED;
@@ -2855,6 +2928,7 @@ static int runtime_disable(struct kfd_process *p)
/* disable ttmp setup */
for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i];
+ int last_err = 0;
if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {
pdd->spi_dbg_override =
@@ -2864,14 +2938,17 @@ static int runtime_disable(struct kfd_process *p)
pdd->dev->vm_info.last_vmid_kfd);
if (!pdd->dev->kfd->shared_resources.enable_mes)
- debug_refresh_runlist(pdd->dev->dqm);
+ last_err = debug_refresh_runlist(pdd->dev->dqm);
else
- kfd_dbg_set_mes_debug_mode(pdd,
+ last_err = kfd_dbg_set_mes_debug_mode(pdd,
!kfd_dbg_has_cwsr_workaround(pdd->dev));
+
+ if (last_err)
+ ret = last_err;
}
}
- return 0;
+ return ret;
}
static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
@@ -2902,6 +2979,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
struct kfd_process_device *pdd = NULL;
int r = 0;
+ if (p->context_id != KFD_CONTEXT_ID_PRIMARY) {
+ pr_debug("Set debug trap ioctl can not be invoked on non-primary kfd process\n");
+
+ return -EOPNOTSUPP;
+ }
+
if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
pr_err("Debugging does not support sched_policy %i", sched_policy);
return -EINVAL;
@@ -2946,6 +3029,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
goto out;
}
+ if (target->context_id != KFD_CONTEXT_ID_PRIMARY) {
+ pr_debug("Set debug trap ioctl not supported on non-primary kfd process\n");
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+
/* Check if target is still PTRACED. */
rcu_read_lock();
if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE
@@ -3109,9 +3198,55 @@ out:
return r;
}
+/* userspace programs need to invoke this ioctl explicitly on a FD to
+ * create a secondary kfd_process which replacing its primary kfd_process
+ */
+static int kfd_ioctl_create_process(struct file *filep, struct kfd_process *p, void *data)
+{
+ struct kfd_process *process;
+ int ret;
+
+ if (!filep->private_data || !p)
+ return -EINVAL;
+
+ /* Each FD owns only one kfd_process */
+ if (p->context_id != KFD_CONTEXT_ID_PRIMARY)
+ return -EINVAL;
+
+ mutex_lock(&kfd_processes_mutex);
+ if (p != filep->private_data) {
+ mutex_unlock(&kfd_processes_mutex);
+ return -EINVAL;
+ }
+
+ process = create_process(current, false);
+ if (IS_ERR(process)) {
+ mutex_unlock(&kfd_processes_mutex);
+ return PTR_ERR(process);
+ }
+
+ filep->private_data = process;
+ mutex_unlock(&kfd_processes_mutex);
+
+ ret = kfd_create_process_sysfs(process);
+ if (ret)
+ pr_warn("Failed to create sysfs entry for the kfd_process");
+
+ /* Each open() increases kref of the primary kfd_process,
+ * so we need to reduce it here when we create a new secondary process replacing it
+ */
+ kfd_unref_process(p);
+
+ return 0;
+}
+
#define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
- .cmd_drv = 0, .name = #ioctl}
+ .validate = NULL, .cmd_drv = 0, .name = #ioctl}
+
+#define AMDKFD_IOCTL_DEF_V(ioctl, _func, _validate, _flags) \
+ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
+ .validate = _validate, .cmd_drv = 0, .name = #ioctl}
/** Ioctl table */
static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
@@ -3208,7 +3343,8 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
kfd_ioctl_smi_events, 0),
- AMDKFD_IOCTL_DEF(AMDKFD_IOC_SVM, kfd_ioctl_svm, 0),
+ AMDKFD_IOCTL_DEF_V(AMDKFD_IOC_SVM, kfd_ioctl_svm,
+ kfd_ioctl_svm_validate, 0),
AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE,
kfd_ioctl_set_xnack_mode, 0),
@@ -3227,6 +3363,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP,
kfd_ioctl_set_debug_trap, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_CREATE_PROCESS,
+ kfd_ioctl_create_process, 0),
};
#define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
@@ -3243,8 +3382,10 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
int retcode = -EINVAL;
bool ptrace_attached = false;
- if (nr >= AMDKFD_CORE_IOCTL_COUNT)
+ if (nr >= AMDKFD_CORE_IOCTL_COUNT) {
+ retcode = -ENOTTY;
goto err_i1;
+ }
if ((nr >= AMDKFD_COMMAND_START) && (nr < AMDKFD_COMMAND_END)) {
u32 amdkfd_size;
@@ -3257,8 +3398,10 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
asize = amdkfd_size;
cmd = ioctl->cmd;
- } else
+ } else {
+ retcode = -ENOTTY;
goto err_i1;
+ }
dev_dbg(kfd_device, "ioctl cmd 0x%x (#0x%x), arg 0x%lx\n", cmd, nr, arg);
@@ -3326,6 +3469,12 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
memset(kdata, 0, usize);
}
+ if (ioctl->validate) {
+ retcode = ioctl->validate(kdata, usize);
+ if (retcode)
+ goto err_i1;
+ }
+
retcode = func(filep, process, kdata);
if (cmd & IOC_OUT)
@@ -3365,12 +3514,12 @@ static int kfd_mmio_mmap(struct kfd_node *dev, struct kfd_process *process,
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
- pr_debug("pasid 0x%x mapping mmio page\n"
+ pr_debug("process pid %d mapping mmio page\n"
" target user address == 0x%08llX\n"
" physical address == 0x%08llX\n"
" vm_flags == 0x%04lX\n"
" size == 0x%04lX\n",
- process->pasid, (unsigned long long) vma->vm_start,
+ process->lead_thread->pid, (unsigned long long) vma->vm_start,
address, vma->vm_flags, PAGE_SIZE);
return io_remap_pfn_range(vma,
@@ -3381,16 +3530,19 @@ static int kfd_mmio_mmap(struct kfd_node *dev, struct kfd_process *process,
}
-static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
+static int kfd_mmap(struct file *filep, struct vm_area_struct *vma)
{
struct kfd_process *process;
struct kfd_node *dev = NULL;
unsigned long mmap_offset;
unsigned int gpu_id;
- process = kfd_get_process(current);
- if (IS_ERR(process))
- return PTR_ERR(process);
+ process = filep->private_data;
+ if (!process)
+ return -ESRCH;
+
+ if (process->lead_thread != current->group_leader)
+ return -EBADF;
mmap_offset = vma->vm_pgoff << PAGE_SHIFT;
gpu_id = KFD_MMAP_GET_GPU_ID(mmap_offset);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 693469c18c60..a1087c13f241 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -1704,6 +1704,8 @@ int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pc
case IP_VERSION(11, 5, 0):
case IP_VERSION(11, 5, 1):
case IP_VERSION(11, 5, 2):
+ case IP_VERSION(11, 5, 3):
+ case IP_VERSION(11, 5, 4):
/* Cacheline size not available in IP discovery for gc11.
* kfd_fill_gpu_cache_info_from_gfx_config to hard code it
*/
@@ -1711,6 +1713,7 @@ int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pc
fallthrough;
case IP_VERSION(12, 0, 0):
case IP_VERSION(12, 0, 1):
+ case IP_VERSION(12, 1, 0):
num_of_cache_types =
kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd,
cache_line_size_missing,
@@ -2132,9 +2135,6 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
bool ext_cpu = KFD_GC_VERSION(kdev) != IP_VERSION(9, 4, 3);
int mem_bw = 819200, weight = ext_cpu ? KFD_CRAT_XGMI_WEIGHT :
KFD_CRAT_INTRA_SOCKET_WEIGHT;
- uint32_t bandwidth = ext_cpu ? amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(
- kdev->adev, NULL, true) : mem_bw;
-
/*
* with host gpu xgmi link, host can access gpu memory whether
* or not pcie bar type is large, so always create bidirectional
@@ -2143,8 +2143,16 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
sub_type_hdr->weight_xgmi = weight;
- sub_type_hdr->minimum_bandwidth_mbs = bandwidth;
- sub_type_hdr->maximum_bandwidth_mbs = bandwidth;
+ if (ext_cpu) {
+ amdgpu_xgmi_get_bandwidth(kdev->adev, NULL,
+ AMDGPU_XGMI_BW_MODE_PER_LINK,
+ AMDGPU_XGMI_BW_UNIT_MBYTES,
+ &sub_type_hdr->minimum_bandwidth_mbs,
+ &sub_type_hdr->maximum_bandwidth_mbs);
+ } else {
+ sub_type_hdr->minimum_bandwidth_mbs = mem_bw;
+ sub_type_hdr->maximum_bandwidth_mbs = mem_bw;
+ }
} else {
sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
sub_type_hdr->minimum_bandwidth_mbs =
@@ -2197,12 +2205,12 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
if (use_ta_info) {
sub_type_hdr->weight_xgmi = KFD_CRAT_XGMI_WEIGHT *
- amdgpu_amdkfd_get_xgmi_hops_count(kdev->adev, peer_kdev->adev);
- sub_type_hdr->maximum_bandwidth_mbs =
- amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev,
- peer_kdev->adev, false);
- sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ?
- amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev, NULL, true) : 0;
+ amdgpu_xgmi_get_hops_count(kdev->adev, peer_kdev->adev);
+ amdgpu_xgmi_get_bandwidth(kdev->adev, peer_kdev->adev,
+ AMDGPU_XGMI_BW_MODE_PER_PEER,
+ AMDGPU_XGMI_BW_UNIT_MBYTES,
+ &sub_type_hdr->minimum_bandwidth_mbs,
+ &sub_type_hdr->maximum_bandwidth_mbs);
} else {
bool is_single_hop = kdev->kfd == peer_kdev->kfd;
int weight = is_single_hop ? KFD_CRAT_INTRA_SOCKET_WEIGHT :
@@ -2351,7 +2359,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
if (kdev->kfd->hive_id) {
for (nid = 0; nid < proximity_domain; ++nid) {
peer_dev = kfd_topology_device_by_proximity_domain_no_lock(nid);
- if (!peer_dev->gpu)
+ if (!peer_dev || !peer_dev->gpu)
continue;
if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id)
continue;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index a8abc3091801..0f7aa51b629e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -204,11 +204,12 @@ bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
size_t exception_data_size)
{
struct kfd_process *p;
+ struct kfd_process_device *pdd = NULL;
bool signaled_to_debugger_or_runtime = false;
- p = kfd_lookup_process_by_pasid(pasid);
+ p = kfd_lookup_process_by_pasid(pasid, &pdd);
- if (!p)
+ if (!pdd)
return false;
if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
@@ -238,9 +239,8 @@ bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
mutex_unlock(&p->mutex);
} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
- kfd_dqm_evict_pasid(dev->dqm, p->pasid);
- kfd_signal_vm_fault_event(dev, p->pasid, NULL,
- exception_data);
+ kfd_evict_process_device(pdd);
+ kfd_signal_vm_fault_event(pdd, NULL, exception_data);
signaled_to_debugger_or_runtime = true;
}
@@ -276,8 +276,8 @@ int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
data = (struct kfd_hsa_memory_exception_data *)
pdd->vm_fault_exc_data;
- kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
- kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
+ kfd_evict_process_device(pdd);
+ kfd_signal_vm_fault_event(pdd, NULL, data);
error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
}
@@ -357,12 +357,13 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
return 0;
if (!pdd->proc_ctx_cpu_ptr) {
- r = amdgpu_amdkfd_alloc_gtt_mem(adev,
- AMDGPU_MES_PROC_CTX_SIZE,
- &pdd->proc_ctx_bo,
- &pdd->proc_ctx_gpu_addr,
- &pdd->proc_ctx_cpu_ptr,
- false);
+ r = amdgpu_amdkfd_alloc_kernel_mem(adev,
+ AMDGPU_MES_PROC_CTX_SIZE,
+ AMDGPU_GEM_DOMAIN_GTT,
+ &pdd->proc_ctx_bo,
+ &pdd->proc_ctx_gpu_addr,
+ &pdd->proc_ctx_cpu_ptr,
+ false);
if (r) {
dev_err(adev->dev,
"failed to allocate process context bo\n");
@@ -371,8 +372,10 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
}
- return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
- pdd->watch_points, flags, sq_trap_en);
+ return amdgpu_mes_set_shader_debugger(pdd->dev->adev,
+ pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
+ pdd->watch_points, flags, sq_trap_en,
+ ffs(pdd->dev->xcc_mask) - 1);
}
#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
@@ -401,27 +404,25 @@ static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_i
return -ENOMEM;
}
-static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
+static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, u32 watch_id)
{
spin_lock(&pdd->dev->watch_points_lock);
/* process owns device watch point so safe to clear */
- if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
- pdd->alloc_watch_ids &= ~(0x1 << watch_id);
- pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id);
+ if (pdd->alloc_watch_ids & BIT(watch_id)) {
+ pdd->alloc_watch_ids &= ~BIT(watch_id);
+ pdd->dev->alloc_watch_ids &= ~BIT(watch_id);
}
spin_unlock(&pdd->dev->watch_points_lock);
}
-static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
+static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, u32 watch_id)
{
bool owns_watch_id = false;
spin_lock(&pdd->dev->watch_points_lock);
- owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
- ((pdd->alloc_watch_ids >> watch_id) & 0x1);
-
+ owns_watch_id = pdd->alloc_watch_ids & BIT(watch_id);
spin_unlock(&pdd->dev->watch_points_lock);
return owns_watch_id;
@@ -432,6 +433,9 @@ int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
{
int r;
+ if (watch_id >= MAX_WATCH_ADDRESSES)
+ return -EINVAL;
+
if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
return -EINVAL;
@@ -469,6 +473,9 @@ int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
if (r)
return r;
+ if (*watch_id >= MAX_WATCH_ADDRESSES)
+ return -EINVAL;
+
if (!pdd->dev->kfd->shared_resources.enable_mes) {
r = debug_lock_and_unmap(pdd->dev->dqm);
if (r) {
@@ -516,9 +523,15 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
int i, r = 0, rewind_count = 0;
for (i = 0; i < target->n_pdds; i++) {
+ uint32_t caps;
+ uint32_t caps2;
struct kfd_topology_device *topo_dev =
- kfd_topology_device_by_id(target->pdds[i]->dev->id);
- uint32_t caps = topo_dev->node_props.capability;
+ kfd_topology_device_by_id(target->pdds[i]->dev->id);
+ if (!topo_dev)
+ return -EINVAL;
+
+ caps = topo_dev->node_props.capability;
+ caps2 = topo_dev->node_props.capability2;
if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
@@ -531,6 +544,12 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
*flags = prev_flags;
return -EACCES;
}
+
+ if (!(caps2 & HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED) &&
+ (*flags & KFD_DBG_TRAP_FLAG_LDS_OUT_OF_ADDR_RANGE)) {
+ *flags = prev_flags;
+ return -EACCES;
+ }
}
target->dbg_flags = *flags;
@@ -565,9 +584,9 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
continue;
if (!pdd->dev->kfd->shared_resources.enable_mes)
- debug_refresh_runlist(pdd->dev->dqm);
+ (void)debug_refresh_runlist(pdd->dev->dqm);
else
- kfd_dbg_set_mes_debug_mode(pdd, true);
+ (void)kfd_dbg_set_mes_debug_mode(pdd, true);
}
}
@@ -627,9 +646,10 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
if (!pdd->dev->kfd->shared_resources.enable_mes)
- debug_refresh_runlist(pdd->dev->dqm);
+ (void)debug_refresh_runlist(pdd->dev->dqm);
else
- kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
+ (void)kfd_dbg_set_mes_debug_mode(pdd,
+ !kfd_dbg_has_cwsr_workaround(pdd->dev));
}
kfd_dbg_set_workaround(target, false);
@@ -1071,6 +1091,10 @@ int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
for (i = 0; i < tmp_num_devices; i++) {
struct kfd_process_device *pdd = target->pdds[i];
struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);
+ if (!topo_dev) {
+ r = -EINVAL;
+ break;
+ }
device_info.gpu_id = pdd->dev->id;
device_info.exception_status = pdd->exception_status;
@@ -1098,6 +1122,7 @@ int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
device_info.capability = topo_dev->node_props.capability;
device_info.debug_prop = topo_dev->node_props.debug_prop;
+ device_info.capability2 = topo_dev->node_props.capability2;
if (exception_clear_mask)
pdd->exception_status &= ~exception_clear_mask;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 27aa1a5b120f..fbb751821c69 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -120,8 +120,7 @@ static inline bool kfd_dbg_has_gws_support(struct kfd_node *dev)
&& dev->kfd->mec2_fw_version < 0x1b6) ||
(KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1)
&& dev->kfd->mec2_fw_version < 0x30) ||
- (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) &&
- KFD_GC_VERSION(dev) < IP_VERSION(12, 0, 0)))
+ kfd_dbg_has_cwsr_workaround(dev))
return false;
/* Assume debugging and cooperative launch supported otherwise. */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
index 4a5a0a4e00f2..7d4e07452cdb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
@@ -27,6 +27,16 @@
#include "kfd_priv.h"
static struct dentry *debugfs_root;
+static struct dentry *debugfs_proc;
+static struct list_head procs;
+
+struct debugfs_proc_entry {
+ struct list_head list;
+ struct dentry *proc_dentry;
+ pid_t pid;
+};
+
+#define MAX_DEBUGFS_FILENAME_LEN 32
static int kfd_debugfs_open(struct inode *inode, struct file *file)
{
@@ -92,6 +102,8 @@ static const struct file_operations kfd_debugfs_hang_hws_fops = {
void kfd_debugfs_init(void)
{
debugfs_root = debugfs_create_dir("kfd", NULL);
+ debugfs_proc = debugfs_create_dir("proc", debugfs_root);
+ INIT_LIST_HEAD(&procs);
debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root,
kfd_debugfs_mqds_by_process, &kfd_debugfs_fops);
@@ -107,5 +119,69 @@ void kfd_debugfs_init(void)
void kfd_debugfs_fini(void)
{
+ debugfs_remove_recursive(debugfs_proc);
debugfs_remove_recursive(debugfs_root);
}
+
+static ssize_t kfd_debugfs_pasid_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct kfd_process_device *pdd = file_inode(file)->i_private;
+ char tmp[32];
+ int len;
+
+ len = snprintf(tmp, sizeof(tmp), "%u\n", pdd->pasid);
+
+ return simple_read_from_buffer(buf, count, ppos, tmp, len);
+}
+
+static const struct file_operations kfd_debugfs_pasid_fops = {
+ .owner = THIS_MODULE,
+ .read = kfd_debugfs_pasid_read,
+};
+
+void kfd_debugfs_add_process(struct kfd_process *p)
+{
+ int i;
+ char name[MAX_DEBUGFS_FILENAME_LEN];
+ struct debugfs_proc_entry *entry;
+
+ entry = kzalloc_obj(*entry);
+ if (!entry)
+ return;
+
+ list_add(&entry->list, &procs);
+ entry->pid = p->lead_thread->pid;
+ snprintf(name, MAX_DEBUGFS_FILENAME_LEN, "%d",
+ (int)entry->pid);
+ entry->proc_dentry = debugfs_create_dir(name, debugfs_proc);
+
+ /* Create debugfs files for each GPU:
+ * - proc/<pid>/pasid_<gpuid>
+ */
+ for (i = 0; i < p->n_pdds; i++) {
+ struct kfd_process_device *pdd = p->pdds[i];
+
+ snprintf(name, MAX_DEBUGFS_FILENAME_LEN, "pasid_%u",
+ pdd->dev->id);
+ debugfs_create_file((const char *)name, S_IFREG | 0444,
+ entry->proc_dentry, pdd,
+ &kfd_debugfs_pasid_fops);
+ }
+}
+
+void kfd_debugfs_remove_process(struct kfd_process *p)
+{
+ struct debugfs_proc_entry *entry, *next;
+
+ mutex_lock(&kfd_processes_mutex);
+ list_for_each_entry_safe(entry, next, &procs, list) {
+ if (entry->pid != p->lead_thread->pid)
+ continue;
+
+ debugfs_remove_recursive(entry->proc_dentry);
+ list_del(&entry->list);
+ kfree(entry);
+ }
+ mutex_unlock(&kfd_processes_mutex);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index a29374c86405..b7f8f7ff8198 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -57,6 +57,7 @@ extern const struct kfd2kgd_calls gfx_v10_kfd2kgd;
extern const struct kfd2kgd_calls gfx_v10_3_kfd2kgd;
extern const struct kfd2kgd_calls gfx_v11_kfd2kgd;
extern const struct kfd2kgd_calls gfx_v12_kfd2kgd;
+extern const struct kfd2kgd_calls gfx_v12_1_kfd2kgd;
static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
unsigned int chunk_size);
@@ -94,6 +95,8 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd)
case IP_VERSION(5, 2, 2):/* NAVY_FLOUNDER */
case IP_VERSION(5, 2, 4):/* DIMGREY_CAVEFISH */
case IP_VERSION(5, 2, 5):/* BEIGE_GOBY */
+ kfd->device_info.num_sdma_queues_per_engine = 8;
+ break;
case IP_VERSION(6, 0, 0):
case IP_VERSION(6, 0, 1):
case IP_VERSION(6, 0, 2):
@@ -101,9 +104,14 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd)
case IP_VERSION(6, 1, 0):
case IP_VERSION(6, 1, 1):
case IP_VERSION(6, 1, 2):
+ case IP_VERSION(6, 1, 3):
+ case IP_VERSION(6, 1, 4):
case IP_VERSION(7, 0, 0):
case IP_VERSION(7, 0, 1):
+ case IP_VERSION(7, 1, 0):
kfd->device_info.num_sdma_queues_per_engine = 8;
+ /* Reserve 1 for paging and 1 for gfx */
+ kfd->device_info.num_reserved_sdma_queues_per_engine = 2;
break;
default:
dev_warn(kfd_device,
@@ -111,29 +119,6 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd)
sdma_version);
kfd->device_info.num_sdma_queues_per_engine = 8;
}
-
- bitmap_zero(kfd->device_info.reserved_sdma_queues_bitmap, KFD_MAX_SDMA_QUEUES);
-
- switch (sdma_version) {
- case IP_VERSION(6, 0, 0):
- case IP_VERSION(6, 0, 1):
- case IP_VERSION(6, 0, 2):
- case IP_VERSION(6, 0, 3):
- case IP_VERSION(6, 1, 0):
- case IP_VERSION(6, 1, 1):
- case IP_VERSION(6, 1, 2):
- case IP_VERSION(7, 0, 0):
- case IP_VERSION(7, 0, 1):
- /* Reserve 1 for paging and 1 for gfx */
- kfd->device_info.num_reserved_sdma_queues_per_engine = 2;
- /* BIT(0)=engine-0 queue-0; BIT(1)=engine-1 queue-0; BIT(2)=engine-0 queue-1; ... */
- bitmap_set(kfd->device_info.reserved_sdma_queues_bitmap, 0,
- kfd->adev->sdma.num_instances *
- kfd->device_info.num_reserved_sdma_queues_per_engine);
- break;
- default:
- break;
- }
}
static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd)
@@ -180,6 +165,8 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd)
case IP_VERSION(11, 5, 0):
case IP_VERSION(11, 5, 1):
case IP_VERSION(11, 5, 2):
+ case IP_VERSION(11, 5, 3):
+ case IP_VERSION(11, 5, 4):
kfd->device_info.event_interrupt_class = &event_interrupt_class_v11;
break;
case IP_VERSION(12, 0, 0):
@@ -187,6 +174,10 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd)
/* GFX12_TODO: Change to v12 version. */
kfd->device_info.event_interrupt_class = &event_interrupt_class_v11;
break;
+ case IP_VERSION(12, 1, 0):
+ kfd->device_info.event_interrupt_class =
+ &event_interrupt_class_v12_1;
+ break;
default:
dev_warn(kfd_device, "v9 event interrupt handler is set due to "
"mismatch of gc ip block(GC_HWIP:0x%x).\n", gc_version);
@@ -349,11 +340,6 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf)
f2g = &aldebaran_kfd2kgd;
break;
case IP_VERSION(9, 4, 3):
- gfx_target_version = adev->rev_id >= 1 ? 90402
- : adev->flags & AMD_IS_APU ? 90400
- : 90401;
- f2g = &gc_9_4_3_kfd2kgd;
- break;
case IP_VERSION(9, 4, 4):
gfx_target_version = 90402;
f2g = &gc_9_4_3_kfd2kgd;
@@ -454,6 +440,14 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf)
gfx_target_version = 110502;
f2g = &gfx_v11_kfd2kgd;
break;
+ case IP_VERSION(11, 5, 3):
+ gfx_target_version = 110503;
+ f2g = &gfx_v11_kfd2kgd;
+ break;
+ case IP_VERSION(11, 5, 4):
+ gfx_target_version = 110504;
+ f2g = &gfx_v11_kfd2kgd;
+ break;
case IP_VERSION(12, 0, 0):
gfx_target_version = 120000;
f2g = &gfx_v12_kfd2kgd;
@@ -462,6 +456,10 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf)
gfx_target_version = 120001;
f2g = &gfx_v12_kfd2kgd;
break;
+ case IP_VERSION(12, 1, 0):
+ gfx_target_version = 120500;
+ f2g = &gfx_v12_1_kfd2kgd;
+ break;
default:
break;
}
@@ -480,7 +478,7 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf)
return NULL;
}
- kfd = kzalloc(sizeof(*kfd), GFP_KERNEL);
+ kfd = kzalloc_obj(*kfd);
if (!kfd)
return NULL;
@@ -493,6 +491,7 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf)
mutex_init(&kfd->doorbell_mutex);
ida_init(&kfd->doorbell_ida);
+ atomic_set(&kfd->kfd_processes_count, 0);
return kfd;
}
@@ -546,11 +545,16 @@ static void kfd_cwsr_init(struct kfd_dev *kfd)
BUILD_BUG_ON(sizeof(cwsr_trap_gfx11_hex) > PAGE_SIZE);
kfd->cwsr_isa = cwsr_trap_gfx11_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx11_hex);
- } else {
+ } else if (KFD_GC_VERSION(kfd) < IP_VERSION(12, 1, 0)) {
BUILD_BUG_ON(sizeof(cwsr_trap_gfx12_hex)
> KFD_CWSR_TMA_OFFSET);
kfd->cwsr_isa = cwsr_trap_gfx12_hex;
kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx12_hex);
+ } else {
+ BUILD_BUG_ON(sizeof(cwsr_trap_gfx12_1_0_hex)
+ > KFD_CWSR_TMA_OFFSET);
+ kfd->cwsr_isa = cwsr_trap_gfx12_1_0_hex;
+ kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx12_1_0_hex);
}
kfd->cwsr_enabled = true;
@@ -583,9 +587,13 @@ static int kfd_gws_init(struct kfd_node *node)
&& kfd->mec2_fw_version >= 0x6b) ||
(KFD_GC_VERSION(node) >= IP_VERSION(11, 0, 0)
&& KFD_GC_VERSION(node) < IP_VERSION(12, 0, 0)
- && mes_rev >= 68))))
+ && mes_rev >= 68) ||
+ (KFD_GC_VERSION(node) >= IP_VERSION(12, 0, 0))))) {
+ if (KFD_GC_VERSION(node) >= IP_VERSION(12, 0, 0))
+ node->adev->gds.gws_size = 64;
ret = amdgpu_amdkfd_alloc_gws(node->adev,
node->adev->gds.gws_size, &node->gws);
+ }
return ret;
}
@@ -675,6 +683,7 @@ static void kfd_setup_interrupt_bitmap(struct kfd_node *node,
struct amdgpu_device *adev = node->adev;
uint32_t xcc_mask = node->xcc_mask;
uint32_t xcc, mapped_xcc;
+ uint32_t bitmap;
/*
* Interrupt bitmap is setup for processing interrupts from
* different XCDs and AIDs.
@@ -696,9 +705,22 @@ static void kfd_setup_interrupt_bitmap(struct kfd_node *node,
* - AND VMID reported in the interrupt lies within the
* VMID range of the node.
*/
- for_each_inst(xcc, xcc_mask) {
- mapped_xcc = GET_INST(GC, xcc);
- node->interrupt_bitmap |= (mapped_xcc % 2 ? 5 : 3) << (4 * (mapped_xcc / 2));
+ switch (KFD_GC_VERSION(node)) {
+ case IP_VERSION(12, 1, 0):
+ for_each_inst(xcc, xcc_mask) {
+ mapped_xcc = GET_INST(GC, xcc);
+ bitmap = 0x2 | (0x4 << (mapped_xcc % 4));
+ if (mapped_xcc/4)
+ bitmap = bitmap << 8;
+ node->interrupt_bitmap |= bitmap;
+ }
+ break;
+ default:
+ for_each_inst(xcc, xcc_mask) {
+ mapped_xcc = GET_INST(GC, xcc);
+ node->interrupt_bitmap |= (mapped_xcc % 2 ? 5 : 3) << (4 * (mapped_xcc / 2));
+ }
+ break;
}
dev_info(kfd_device, "Node: %d, interrupt_bitmap: %x\n", kfd_node_idx,
node->interrupt_bitmap);
@@ -761,7 +783,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
* If the VMID range changes for multi-partition capable GPUs, then
* this code MUST be revisited.
*/
- if (kfd->adev->xcp_mgr) {
+ if (kfd->adev->xcp_mgr && (KFD_GC_VERSION(kfd) != IP_VERSION(12, 1, 0))) {
partition_mode = amdgpu_xcp_query_partition_mode(kfd->adev->xcp_mgr,
AMDGPU_XCP_FL_LOCKED);
if (partition_mode == AMDGPU_CPX_PARTITION_MODE &&
@@ -798,12 +820,13 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
/* add another 512KB for all other allocations on gart (HPD, fences) */
size += 512 * 1024;
- if (amdgpu_amdkfd_alloc_gtt_mem(
- kfd->adev, size, &kfd->gtt_mem,
+ if (amdgpu_amdkfd_alloc_kernel_mem(
+ kfd->adev, size, AMDGPU_GEM_DOMAIN_GTT,
+ &kfd->gtt_mem,
&kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr,
false)) {
dev_err(kfd_device, "Could not allocate %d bytes\n", size);
- goto alloc_gtt_mem_failure;
+ goto alloc_kernel_mem_failure;
}
dev_info(kfd_device, "Allocated %d bytes on gart\n", size);
@@ -841,7 +864,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
/* Allocate the KFD nodes */
for (i = 0, xcp_idx = 0; i < kfd->num_nodes; i++) {
- node = kzalloc(sizeof(struct kfd_node), GFP_KERNEL);
+ node = kzalloc_obj(struct kfd_node);
if (!node)
goto node_alloc_error;
@@ -868,7 +891,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
}
if (partition_mode == AMDGPU_CPX_PARTITION_MODE &&
- kfd->num_nodes != 1) {
+ kfd->num_nodes != 1 &&
+ (KFD_GC_VERSION(kfd) != IP_VERSION(12, 1, 0))) {
/* For multi-partition capable GPUs and CPX mode, first
* XCD gets VMID range 4-9 and second XCD gets VMID
* range 10-15.
@@ -889,6 +913,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
node->compute_vmid_bitmap =
gpu_resources->compute_vmid_bitmap;
}
+
node->max_proc_per_quantum = max_proc_per_quantum;
atomic_set(&node->sram_ecc_flag, 0);
@@ -927,8 +952,8 @@ node_alloc_error:
kfd_doorbell_error:
kfd_gtt_sa_fini(kfd);
kfd_gtt_sa_init_error:
- amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem);
-alloc_gtt_mem_failure:
+ amdgpu_amdkfd_free_kernel_mem(kfd->adev, &kfd->gtt_mem);
+alloc_kernel_mem_failure:
dev_err(kfd_device,
"device %x:%x NOT added due to errors\n",
kfd->adev->pdev->vendor, kfd->adev->pdev->device);
@@ -945,10 +970,13 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
kfd_doorbell_fini(kfd);
ida_destroy(&kfd->doorbell_ida);
kfd_gtt_sa_fini(kfd);
- amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem);
+ amdgpu_amdkfd_free_kernel_mem(kfd->adev, &kfd->gtt_mem);
}
kfree(kfd);
+
+ /* after remove a kfd device unlock kfd driver */
+ kgd2kfd_unlock_kfd(NULL);
}
int kgd2kfd_pre_reset(struct kfd_dev *kfd,
@@ -965,7 +993,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd,
kfd_smi_event_update_gpu_reset(node, false, reset_context);
}
- kgd2kfd_suspend(kfd, false);
+ kgd2kfd_suspend(kfd, true);
for (i = 0; i < kfd->num_nodes; i++)
kfd_signal_reset_event(kfd->nodes[i]);
@@ -1007,13 +1035,33 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
return 0;
}
-bool kfd_is_locked(void)
+bool kfd_is_locked(struct kfd_dev *kfd)
{
+ uint8_t id = 0;
+ struct kfd_node *dev;
+
lockdep_assert_held(&kfd_processes_mutex);
- return (kfd_locked > 0);
+
+ /* check reset/suspend lock */
+ if (kfd_locked > 0)
+ return true;
+
+ if (kfd)
+ return kfd->kfd_dev_lock > 0;
+
+ /* check lock on all cgroup accessible devices */
+ while (kfd_topology_enum_kfd_devices(id++, &dev) == 0) {
+ if (!dev || kfd_devcgroup_check_permission(dev))
+ continue;
+
+ if (dev->kfd->kfd_dev_lock > 0)
+ return true;
+ }
+
+ return false;
}
-void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
+void kgd2kfd_suspend(struct kfd_dev *kfd, bool suspend_proc)
{
struct kfd_node *node;
int i;
@@ -1021,14 +1069,8 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
if (!kfd->init_complete)
return;
- /* for runtime suspend, skip locking kfd */
- if (!run_pm) {
- mutex_lock(&kfd_processes_mutex);
- /* For first KFD device suspend all the KFD processes */
- if (++kfd_locked == 1)
- kfd_suspend_all_processes();
- mutex_unlock(&kfd_processes_mutex);
- }
+ if (suspend_proc)
+ kgd2kfd_suspend_process(kfd);
for (i = 0; i < kfd->num_nodes; i++) {
node = kfd->nodes[i];
@@ -1036,9 +1078,9 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
}
}
-int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
+int kgd2kfd_resume(struct kfd_dev *kfd, bool resume_proc)
{
- int ret, i;
+ int ret = 0, i;
if (!kfd->init_complete)
return 0;
@@ -1049,14 +1091,36 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
return ret;
}
- /* for runtime resume, skip unlocking kfd */
- if (!run_pm) {
- mutex_lock(&kfd_processes_mutex);
- if (--kfd_locked == 0)
- ret = kfd_resume_all_processes();
- WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error");
- mutex_unlock(&kfd_processes_mutex);
- }
+ if (resume_proc)
+ ret = kgd2kfd_resume_process(kfd);
+
+ return ret;
+}
+
+void kgd2kfd_suspend_process(struct kfd_dev *kfd)
+{
+ if (!kfd->init_complete)
+ return;
+
+ mutex_lock(&kfd_processes_mutex);
+ /* For first KFD device suspend all the KFD processes */
+ if (++kfd_locked == 1)
+ kfd_suspend_all_processes();
+ mutex_unlock(&kfd_processes_mutex);
+}
+
+int kgd2kfd_resume_process(struct kfd_dev *kfd)
+{
+ int ret = 0;
+
+ if (!kfd->init_complete)
+ return 0;
+
+ mutex_lock(&kfd_processes_mutex);
+ if (--kfd_locked == 0)
+ ret = kfd_resume_all_processes();
+ WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error");
+ mutex_unlock(&kfd_processes_mutex);
return ret;
}
@@ -1091,7 +1155,15 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
}
for (i = 0; i < kfd->num_nodes; i++) {
- node = kfd->nodes[i];
+ /* Race if another thread in b/w
+ * kfd_cleanup_nodes and kfree(kfd),
+ * when kfd->nodes[i] = NULL
+ */
+ if (kfd->nodes[i])
+ node = kfd->nodes[i];
+ else
+ return;
+
spin_lock_irqsave(&node->interrupt_lock, flags);
if (node->interrupts_active
@@ -1151,12 +1223,13 @@ int kgd2kfd_resume_mm(struct mm_struct *mm)
* prepare for safe eviction of KFD BOs that belong to the specified
* process.
*
- * @mm: mm_struct that identifies the specified KFD process
+ * @mm: mm_struct that identifies a group of KFD processes
+ * @context_id: an id that identifies a specific KFD context in the above kfd process group
* @fence: eviction fence attached to KFD process BOs
*
*/
int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
- struct dma_fence *fence)
+ u16 context_id, struct dma_fence *fence)
{
struct kfd_process *p;
unsigned long active_time;
@@ -1168,7 +1241,7 @@ int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
if (dma_fence_is_signaled(fence))
return 0;
- p = kfd_lookup_process_by_mm(mm);
+ p = kfd_lookup_process_by_id(mm, context_id);
if (!p)
return -ENODEV;
@@ -1255,7 +1328,7 @@ int kfd_gtt_sa_allocate(struct kfd_node *node, unsigned int size,
if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size)
return -ENOMEM;
- *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
+ *mem_obj = kzalloc_obj(struct kfd_mem_obj);
if (!(*mem_obj))
return -ENOMEM;
@@ -1436,24 +1509,66 @@ unsigned int kfd_get_num_xgmi_sdma_engines(struct kfd_node *node)
kfd_get_num_sdma_engines(node);
}
-int kgd2kfd_check_and_lock_kfd(void)
+int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd)
{
+ struct kfd_process *p;
+ int r = 0, temp, idx;
+
mutex_lock(&kfd_processes_mutex);
- if (!hash_empty(kfd_processes_table) || kfd_is_locked()) {
- mutex_unlock(&kfd_processes_mutex);
- return -EBUSY;
+
+ /* kfd_processes_count is per kfd_dev, return -EBUSY without
+ * further check
+ */
+ if (!!atomic_read(&kfd->kfd_processes_count)) {
+ pr_debug("process_wq_release not finished\n");
+ r = -EBUSY;
+ goto out;
}
- ++kfd_locked;
+ if (hash_empty(kfd_processes_table) && !kfd_is_locked(kfd))
+ goto out;
+
+ /* fail under system reset/resume or kfd device is partition switching. */
+ if (kfd_is_locked(kfd)) {
+ r = -EBUSY;
+ goto out;
+ }
+
+ /*
+ * ensure all running processes are cgroup excluded from device before mode switch.
+ * i.e. no pdd was created on the process socket.
+ */
+ idx = srcu_read_lock(&kfd_processes_srcu);
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+ int i;
+
+ for (i = 0; i < p->n_pdds; i++) {
+ if (p->pdds[i]->dev->kfd != kfd)
+ continue;
+
+ r = -EBUSY;
+ goto proc_check_unlock;
+ }
+ }
+
+proc_check_unlock:
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+out:
+ if (!r)
+ ++kfd->kfd_dev_lock;
mutex_unlock(&kfd_processes_mutex);
- return 0;
+ return r;
}
-void kgd2kfd_unlock_kfd(void)
+/* unlock a kfd dev or kfd driver */
+void kgd2kfd_unlock_kfd(struct kfd_dev *kfd)
{
mutex_lock(&kfd_processes_mutex);
- --kfd_locked;
+ if (kfd)
+ --kfd->kfd_dev_lock;
+ else
+ --kfd_locked;
mutex_unlock(&kfd_processes_mutex);
}
@@ -1479,6 +1594,25 @@ int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id)
return ret;
}
+int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd)
+{
+ struct kfd_node *node;
+ int i, r;
+
+ if (!kfd->init_complete)
+ return 0;
+
+ for (i = 0; i < kfd->num_nodes; i++) {
+ node = kfd->nodes[i];
+ r = node->dqm->ops.unhalt(node->dqm);
+ if (r) {
+ dev_err(kfd_device, "Error in starting scheduler\n");
+ return r;
+ }
+ }
+ return 0;
+}
+
int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
{
struct kfd_node *node;
@@ -1496,6 +1630,23 @@ int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
return node->dqm->ops.halt(node->dqm);
}
+int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd)
+{
+ struct kfd_node *node;
+ int i, r;
+
+ if (!kfd->init_complete)
+ return 0;
+
+ for (i = 0; i < kfd->num_nodes; i++) {
+ node = kfd->nodes[i];
+ r = node->dqm->ops.halt(node->dqm);
+ if (r)
+ return r;
+ }
+ return 0;
+}
+
bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id)
{
struct kfd_node *node;
@@ -1556,15 +1707,20 @@ bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entr
{
struct kfd_process *p;
u32 cam_index;
+ u32 src_data_idx;
+
+ src_data_idx = (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0)) ?
+ 3 : 2;
if (entry->ih == &adev->irq.ih_soft || entry->ih == &adev->irq.ih1) {
- p = kfd_lookup_process_by_pasid(entry->pasid);
+ p = kfd_lookup_process_by_pasid(entry->pasid, NULL);
if (!p)
return true;
if (p->gpu_page_fault && !p->debug_trap_enabled) {
if (retry_fault && adev->irq.retry_cam_enabled) {
- cam_index = entry->src_data[2] & 0x3ff;
+ cam_index = entry->src_data[src_data_idx] & 0x3ff;
+
WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
}
@@ -1581,6 +1737,42 @@ bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entr
return false;
}
+/** kgd2kfd_teardown_processes - gracefully tear down existing
+ * kfd processes that use adev
+ *
+ * @adev: amdgpu_device where kfd processes run on and will be
+ * teardown
+ *
+ */
+void kgd2kfd_teardown_processes(struct amdgpu_device *adev)
+{
+ struct hlist_node *p_temp;
+ struct kfd_process *p;
+ struct kfd_node *dev;
+ unsigned int temp;
+
+ mutex_lock(&kfd_processes_mutex);
+
+ if (hash_empty(kfd_processes_table)) {
+ mutex_unlock(&kfd_processes_mutex);
+ return;
+ }
+
+ hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) {
+ for (int i = 0; i < p->n_pdds; i++) {
+ dev = p->pdds[i]->dev;
+ if (dev->adev == adev)
+ kfd_signal_process_terminate_event(p);
+ }
+ }
+
+ mutex_unlock(&kfd_processes_mutex);
+
+ /* wait all kfd processes use adev terminate */
+ while (!!atomic_read(&adev->kfd.dev->kfd_processes_count))
+ cond_resched();
+}
+
#if defined(CONFIG_DEBUG_FS)
/* This function will send a package to HIQ to hang the HWS
@@ -1593,6 +1785,11 @@ int kfd_debugfs_hang_hws(struct kfd_node *dev)
return -EINVAL;
}
+ if (dev->kfd->shared_resources.enable_mes) {
+ dev_err(dev->adev->dev, "Inducing MES hang is not supported\n");
+ return -EINVAL;
+ }
+
return dqm_debugfs_hang_hws(dev->dqm);
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 34c2c42c0f95..e0a31e11f0ff 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -36,12 +36,15 @@
#include "kfd_kernel_queue.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_reset.h"
+#include "amdgpu_sdma.h"
#include "mes_v11_api_def.h"
#include "kfd_debug.h"
/* Size of the per-pipe EOP queue */
#define CIK_HPD_EOP_BYTES_LOG2 11
#define CIK_HPD_EOP_BYTES (1U << CIK_HPD_EOP_BYTES_LOG2)
+/* See unmap_queues_cpsch() */
+#define USE_DEFAULT_GRACE_PERIOD 0xffffffff
static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
u32 pasid, unsigned int vmid);
@@ -66,7 +69,8 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm,
static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
static int allocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q, const uint32_t *restore_sdma_id);
-static void kfd_process_hw_exception(struct work_struct *work);
+
+static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
static inline
enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
@@ -133,9 +137,10 @@ static void init_sdma_bitmaps(struct device_queue_manager *dqm)
bitmap_set(dqm->xgmi_sdma_bitmap, 0, get_num_xgmi_sdma_queues(dqm));
/* Mask out the reserved queues */
- bitmap_andnot(dqm->sdma_bitmap, dqm->sdma_bitmap,
- dqm->dev->kfd->device_info.reserved_sdma_queues_bitmap,
- KFD_MAX_SDMA_QUEUES);
+ bitmap_clear(dqm->sdma_bitmap, 0, kfd_get_num_sdma_engines(dqm->dev) *
+ dqm->dev->kfd->device_info.num_reserved_sdma_queues_per_engine);
+ bitmap_clear(dqm->xgmi_sdma_bitmap, 0, kfd_get_num_xgmi_sdma_engines(dqm->dev) *
+ dqm->dev->kfd->device_info.num_reserved_sdma_queues_per_engine);
}
void program_sh_mem_settings(struct device_queue_manager *dqm,
@@ -170,7 +175,7 @@ static void kfd_hws_hang(struct device_queue_manager *dqm)
/*
* Issue a GPU reset if HWS is unresponsive
*/
- schedule_work(&dqm->hw_exception_work);
+ amdgpu_amdkfd_gpu_reset(dqm->dev->adev);
}
static int convert_to_mes_queue_type(int queue_type)
@@ -207,23 +212,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
if (!down_read_trylock(&adev->reset_domain->sem))
return -EIO;
- if (!pdd->proc_ctx_cpu_ptr) {
- r = amdgpu_amdkfd_alloc_gtt_mem(adev,
- AMDGPU_MES_PROC_CTX_SIZE,
- &pdd->proc_ctx_bo,
- &pdd->proc_ctx_gpu_addr,
- &pdd->proc_ctx_cpu_ptr,
- false);
- if (r) {
- dev_err(adev->dev,
- "failed to allocate process context bo\n");
- return r;
- }
- memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
- }
-
memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input));
- queue_input.process_id = qpd->pqm->process->pasid;
+ queue_input.process_id = pdd->pasid;
queue_input.page_table_base_addr = qpd->page_table_base;
queue_input.process_va_start = 0;
queue_input.process_va_end = adev->vm_manager.max_pfn - 1;
@@ -265,6 +255,9 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
queue_input.queue_type = (uint32_t)queue_type;
queue_input.exclusively_scheduled = q->properties.is_gws;
+ queue_input.sh_mem_config_data = qpd->sh_mem_config;
+ queue_input.vm_cntx_cntl = qpd->vm_cntx_cntl;
+ queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
amdgpu_mes_lock(&adev->mes);
r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
@@ -295,6 +288,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
queue_input.doorbell_offset = q->properties.doorbell_off;
queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
+ queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
amdgpu_mes_lock(&adev->mes);
r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input);
@@ -410,8 +404,7 @@ static void increment_queue_count(struct device_queue_manager *dqm,
struct queue *q)
{
dqm->active_queue_count++;
- if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
- q->properties.type == KFD_QUEUE_TYPE_DIQ)
+ if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
dqm->active_cp_queue_count++;
if (q->properties.is_gws) {
@@ -425,8 +418,7 @@ static void decrement_queue_count(struct device_queue_manager *dqm,
struct queue *q)
{
dqm->active_queue_count--;
- if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
- q->properties.type == KFD_QUEUE_TYPE_DIQ)
+ if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
dqm->active_cp_queue_count--;
if (q->properties.is_gws) {
@@ -483,6 +475,9 @@ static int allocate_doorbell(struct qcm_process_device *qpd,
} else {
/* For CP queues on SOC15 */
if (restore_id) {
+ if (*restore_id >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)
+ return -EINVAL;
+
/* make sure that ID is free */
if (__test_and_set_bit(*restore_id, qpd->doorbell_bitmap))
return -EINVAL;
@@ -542,6 +537,7 @@ static int allocate_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
struct queue *q)
{
+ struct kfd_process_device *pdd = qpd_to_pdd(qpd);
struct device *dev = dqm->dev->adev->dev;
int allocated_vmid = -1, i;
@@ -560,9 +556,9 @@ static int allocate_vmid(struct device_queue_manager *dqm,
pr_debug("vmid allocated: %d\n", allocated_vmid);
- dqm->vmid_pasid[allocated_vmid] = q->process->pasid;
+ dqm->vmid_pasid[allocated_vmid] = pdd->pasid;
- set_pasid_vmid_mapping(dqm, q->process->pasid, allocated_vmid);
+ set_pasid_vmid_mapping(dqm, pdd->pasid, allocated_vmid);
qpd->vmid = allocated_vmid;
q->properties.vmid = allocated_vmid;
@@ -579,7 +575,7 @@ static int allocate_vmid(struct device_queue_manager *dqm,
qpd->vmid,
qpd->page_table_base);
/* invalidate the VM context after pasid and vmid mapping is set up */
- kfd_flush_tlb(qpd_to_pdd(qpd), TLB_FLUSH_LEGACY);
+ kfd_flush_tlb(qpd_to_pdd(qpd));
if (dqm->dev->kfd2kgd->set_scratch_backing_va)
dqm->dev->kfd2kgd->set_scratch_backing_va(dqm->dev->adev,
@@ -617,7 +613,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
if (flush_texture_cache_nocpsch(q->device, qpd))
dev_err(dev, "Failed to flush TC\n");
- kfd_flush_tlb(qpd_to_pdd(qpd), TLB_FLUSH_LEGACY);
+ kfd_flush_tlb(qpd_to_pdd(qpd));
/* Release the vmid mapping */
set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
@@ -683,7 +679,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
/* Temporarily release dqm lock to avoid a circular lock dependency */
dqm_unlock(dqm);
- q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties);
+ q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr, &q->properties);
dqm_lock(dqm);
if (!q->mqd_mem_obj) {
@@ -814,6 +810,11 @@ static int dbgdev_wave_reset_wavefronts(struct kfd_node *dev, struct kfd_process
return -EOPNOTSUPP;
}
+ /* taking the VMID for that process on the safe way using PDD */
+ pdd = kfd_get_process_device_data(dev, p);
+ if (!pdd)
+ return -EFAULT;
+
/* Scan all registers in the range ATC_VMID8_PASID_MAPPING ..
* ATC_VMID15_PASID_MAPPING
* to check which VMID the current process is mapped to.
@@ -823,23 +824,19 @@ static int dbgdev_wave_reset_wavefronts(struct kfd_node *dev, struct kfd_process
status = dev->kfd2kgd->get_atc_vmid_pasid_mapping_info
(dev->adev, vmid, &queried_pasid);
- if (status && queried_pasid == p->pasid) {
- pr_debug("Killing wave fronts of vmid %d and pasid 0x%x\n",
- vmid, p->pasid);
+ if (status && queried_pasid == pdd->pasid) {
+ pr_debug("Killing wave fronts of vmid %d and process pid %d\n",
+ vmid, p->lead_thread->pid);
break;
}
}
if (vmid > last_vmid_to_scan) {
- dev_err(dev->adev->dev, "Didn't find vmid for pasid 0x%x\n", p->pasid);
+ dev_err(dev->adev->dev, "Didn't find vmid for process pid %d\n",
+ p->lead_thread->pid);
return -EFAULT;
}
- /* taking the VMID for that process on the safe way using PDD */
- pdd = kfd_get_process_device_data(dev, p);
- if (!pdd)
- return -EFAULT;
-
reg_gfx_index.bits.sh_broadcast_writes = 1;
reg_gfx_index.bits.se_broadcast_writes = 1;
reg_gfx_index.bits.instance_broadcast_writes = 1;
@@ -865,8 +862,7 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
int retval;
struct mqd_manager *mqd_mgr;
- mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
- q->properties.type)];
+ mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(q->properties.type)];
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
deallocate_hqd(dqm, q);
@@ -1075,8 +1071,8 @@ static int suspend_single_queue(struct device_queue_manager *dqm,
if (q->properties.is_suspended)
return 0;
- pr_debug("Suspending PASID %u queue [%i]\n",
- pdd->process->pasid,
+ pr_debug("Suspending process pid %d queue [%i]\n",
+ pdd->process->lead_thread->pid,
q->properties.queue_id);
is_new = q->properties.exception_status & KFD_EC_MASK(EC_QUEUE_NEW);
@@ -1123,8 +1119,8 @@ static int resume_single_queue(struct device_queue_manager *dqm,
pdd = qpd_to_pdd(qpd);
- pr_debug("Restoring from suspend PASID %u queue [%i]\n",
- pdd->process->pasid,
+ pr_debug("Restoring from suspend process pid %d queue [%i]\n",
+ pdd->process->lead_thread->pid,
q->properties.queue_id);
q->properties.is_suspended = false;
@@ -1157,8 +1153,8 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
goto out;
pdd = qpd_to_pdd(qpd);
- pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
- pdd->process->pasid);
+ pr_debug_ratelimited("Evicting process pid %d queues\n",
+ pdd->process->lead_thread->pid);
pdd->last_evict_timestamp = get_jiffies_64();
/* Mark all queues as evicted. Deactivate all active queues on
@@ -1215,8 +1211,11 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
if (!pdd->drm_priv)
goto out;
- pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
- pdd->process->pasid);
+ pr_debug_ratelimited("Evicting process pid %d queues\n",
+ pdd->process->lead_thread->pid);
+
+ if (dqm->dev->kfd->shared_resources.enable_mes)
+ pdd->last_evict_timestamp = get_jiffies_64();
/* Mark all queues as evicted. Deactivate all active queues on
* the qpd.
@@ -1230,23 +1229,23 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
decrement_queue_count(dqm, qpd, q);
if (dqm->dev->kfd->shared_resources.enable_mes) {
- int err;
-
- err = remove_queue_mes(dqm, q, qpd);
- if (err) {
+ retval = remove_queue_mes(dqm, q, qpd);
+ if (retval) {
dev_err(dev, "Failed to evict queue %d\n",
q->properties.queue_id);
- retval = err;
+ goto out;
}
}
}
- pdd->last_evict_timestamp = get_jiffies_64();
- if (!dqm->dev->kfd->shared_resources.enable_mes)
+
+ if (!dqm->dev->kfd->shared_resources.enable_mes) {
+ pdd->last_evict_timestamp = get_jiffies_64();
retval = execute_queues_cpsch(dqm,
qpd->is_debug ?
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
USE_DEFAULT_GRACE_PERIOD);
+ }
out:
dqm_unlock(dqm);
@@ -1276,8 +1275,8 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
goto out;
}
- pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
- pdd->process->pasid);
+ pr_debug_ratelimited("Restoring process pid %d queues\n",
+ pdd->process->lead_thread->pid);
/* Update PD Base in QPD */
qpd->page_table_base = pd_base;
@@ -1288,7 +1287,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
dqm->dev->adev,
qpd->vmid,
qpd->page_table_base);
- kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
+ kfd_flush_tlb(pdd);
}
/* Take a safe reference to the mm_struct, which may otherwise
@@ -1360,8 +1359,8 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
if (!pdd->drm_priv)
goto vm_not_acquired;
- pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
- pdd->process->pasid);
+ pr_debug_ratelimited("Restoring process pid %d queues\n",
+ pdd->process->lead_thread->pid);
/* Update PD Base in QPD */
qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
@@ -1405,7 +1404,7 @@ static int register_process(struct device_queue_manager *dqm,
uint64_t pd_base;
int retval;
- n = kzalloc(sizeof(*n), GFP_KERNEL);
+ n = kzalloc_obj(*n);
if (!n)
return -ENOMEM;
@@ -1439,13 +1438,12 @@ static int register_process(struct device_queue_manager *dqm,
static int unregister_process(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
- int retval;
+ int retval = 0;
struct device_process_node *cur, *next;
pr_debug("qpd->queues_list is %s\n",
list_empty(&qpd->queues_list) ? "empty" : "not empty");
- retval = 0;
dqm_lock(dqm);
list_for_each_entry_safe(cur, next, &dqm->queues, list) {
@@ -1475,7 +1473,7 @@ set_pasid_vmid_mapping(struct device_queue_manager *dqm, u32 pasid,
unsigned int vmid)
{
uint32_t xcc_mask = dqm->dev->xcc_mask;
- int xcc_id, ret;
+ int xcc_id, ret = 0;
for_each_inst(xcc_id, xcc_mask) {
ret = dqm->dev->kfd2kgd->set_pasid_vmid_mapping(
@@ -1585,12 +1583,16 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
int bit;
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
- if (bitmap_empty(dqm->sdma_bitmap, KFD_MAX_SDMA_QUEUES)) {
- dev_err(dev, "No more SDMA queue to allocate\n");
+ if (bitmap_empty(dqm->sdma_bitmap, get_num_sdma_queues(dqm))) {
+ dev_warn(dev, "No more SDMA queue to allocate (%d total queues)\n",
+ get_num_sdma_queues(dqm));
return -ENOMEM;
}
if (restore_sdma_id) {
+ if (*restore_sdma_id >= get_num_sdma_queues(dqm))
+ return -EINVAL;
+
/* Re-use existing sdma_id */
if (!test_bit(*restore_sdma_id, dqm->sdma_bitmap)) {
dev_err(dev, "SDMA queue already in use\n");
@@ -1611,11 +1613,15 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
q->properties.sdma_queue_id = q->sdma_id /
kfd_get_num_sdma_engines(dqm->dev);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
- if (bitmap_empty(dqm->xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES)) {
- dev_err(dev, "No more XGMI SDMA queue to allocate\n");
+ if (bitmap_empty(dqm->xgmi_sdma_bitmap, get_num_xgmi_sdma_queues(dqm))) {
+ dev_warn(dev, "No more XGMI SDMA queue to allocate (%d total queues)\n",
+ get_num_xgmi_sdma_queues(dqm));
return -ENOMEM;
}
if (restore_sdma_id) {
+ if (*restore_sdma_id >= get_num_xgmi_sdma_queues(dqm))
+ return -EINVAL;
+
/* Re-use existing sdma_id */
if (!test_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap)) {
dev_err(dev, "SDMA queue already in use\n");
@@ -1671,8 +1677,8 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
}
if (!free_bit_found) {
- dev_err(dev, "No more SDMA queue to allocate for target ID %i\n",
- q->properties.sdma_engine_id);
+ dev_warn(dev, "No more SDMA queue to allocate for target ID %i (%d total queues)\n",
+ q->properties.sdma_engine_id, num_queues);
return -ENOMEM;
}
}
@@ -1755,15 +1761,11 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
dqm->active_cp_queue_count = 0;
dqm->gws_queue_count = 0;
dqm->active_runlist = false;
- INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
dqm->trap_debug_vmid = 0;
init_sdma_bitmaps(dqm);
- if (dqm->dev->kfd2kgd->get_iq_wait_times)
- dqm->dev->kfd2kgd->get_iq_wait_times(dqm->dev->adev,
- &dqm->wait_times,
- ffs(dqm->dev->xcc_mask) - 1);
+ update_dqm_wait_times(dqm);
return 0;
}
@@ -1829,8 +1831,6 @@ static int start_cpsch(struct device_queue_manager *dqm)
struct device *dev = dqm->dev->adev->dev;
int retval, num_hw_queue_slots;
- retval = 0;
-
dqm_lock(dqm);
if (!dqm->dev->kfd->shared_resources.enable_mes) {
@@ -1859,25 +1859,11 @@ static int start_cpsch(struct device_queue_manager *dqm)
/* clear hang status when driver try to start the hw scheduler */
dqm->sched_running = true;
- if (!dqm->dev->kfd->shared_resources.enable_mes)
+ if (!dqm->dev->kfd->shared_resources.enable_mes) {
+ if (pm_config_dequeue_wait_counts(&dqm->packet_mgr,
+ KFD_DEQUEUE_WAIT_INIT, 0 /* unused */))
+ dev_err(dev, "Setting optimized dequeue wait failed. Using default values\n");
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
-
- /* Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU */
- if (amdgpu_emu_mode == 0 && dqm->dev->adev->gmc.is_app_apu &&
- (KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 3))) {
- uint32_t reg_offset = 0;
- uint32_t grace_period = 1;
-
- retval = pm_update_grace_period(&dqm->packet_mgr,
- grace_period);
- if (retval)
- dev_err(dev, "Setting grace timeout failed\n");
- else if (dqm->dev->kfd2kgd->build_grace_period_packet_info)
- /* Update dqm->wait_times maintained in software */
- dqm->dev->kfd2kgd->build_grace_period_packet_info(
- dqm->dev->adev, dqm->wait_times,
- grace_period, &reg_offset,
- &dqm->wait_times);
}
/* setup per-queue reset detection buffer */
@@ -1909,6 +1895,8 @@ fail_packet_manager_init:
static int stop_cpsch(struct device_queue_manager *dqm)
{
+ int ret = 0;
+
dqm_lock(dqm);
if (!dqm->sched_running) {
dqm_unlock(dqm);
@@ -1916,9 +1904,10 @@ static int stop_cpsch(struct device_queue_manager *dqm)
}
if (!dqm->dev->kfd->shared_resources.enable_mes)
- unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
+ ret = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
+ 0, USE_DEFAULT_GRACE_PERIOD, false);
else
- remove_all_kfd_queues_mes(dqm);
+ ret = remove_all_kfd_queues_mes(dqm);
dqm->sched_running = false;
@@ -1932,7 +1921,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
dqm->detect_hang_info = NULL;
dqm_unlock(dqm);
- return 0;
+ return ret;
}
static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
@@ -2022,7 +2011,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
q->properties.tba_addr = qpd->tba_addr;
q->properties.tma_addr = qpd->tma_addr;
- q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties);
+ q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr, &q->properties);
if (!q->mqd_mem_obj) {
retval = -ENOMEM;
goto out_deallocate_doorbell;
@@ -2103,7 +2092,8 @@ int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm,
while (*fence_addr != fence_value) {
/* Fatal err detected, this response won't come */
- if (amdgpu_amdkfd_is_fed(dqm->dev->adev))
+ if (amdgpu_amdkfd_is_fed(dqm->dev->adev) ||
+ amdgpu_in_reset(dqm->dev->adev))
return -EIO;
if (time_after(jiffies, end_jiffies)) {
@@ -2152,8 +2142,8 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
{
struct kfd_process_device *pdd = qpd_to_pdd(qpd);
- dev_err(dqm->dev->adev->dev, "queue id 0x%0x at pasid 0x%0x is reset\n",
- q->properties.queue_id, q->process->pasid);
+ dev_err(dqm->dev->adev->dev, "queue id 0x%0x at pasid %d is reset\n",
+ q->properties.queue_id, pdd->process->lead_thread->pid);
pdd->has_reset_queue = true;
if (q->properties.is_active) {
@@ -2222,8 +2212,7 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
return NULL;
}
-/* only for compute queue */
-static int reset_queues_on_hws_hang(struct device_queue_manager *dqm)
+static int reset_hung_queues(struct device_queue_manager *dqm)
{
int r = 0, reset_count = 0, i;
@@ -2276,7 +2265,121 @@ reset_fail:
return r;
}
-/* dqm->lock mutex has to be locked before calling this function */
+static bool sdma_has_hang(struct device_queue_manager *dqm)
+{
+ int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm);
+ int engine_end = engine_start + get_num_all_sdma_engines(dqm);
+ int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine;
+ int i, j;
+
+ for (i = engine_start; i < engine_end; i++) {
+ for (j = 0; j < num_queues_per_eng; j++) {
+ if (!dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j))
+ continue;
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool set_sdma_queue_as_reset(struct device_queue_manager *dqm,
+ uint32_t doorbell_off)
+{
+ struct device_process_node *cur;
+ struct qcm_process_device *qpd;
+ struct queue *q;
+
+ list_for_each_entry(cur, &dqm->queues, list) {
+ qpd = cur->qpd;
+ list_for_each_entry(q, &qpd->queues_list, list) {
+ if ((q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) &&
+ q->properties.doorbell_off == doorbell_off) {
+ set_queue_as_reset(dqm, q, qpd);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+static int reset_hung_queues_sdma(struct device_queue_manager *dqm)
+{
+ int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm);
+ int engine_end = engine_start + get_num_all_sdma_engines(dqm);
+ int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine;
+ int r = 0, i, j;
+
+ if (dqm->is_hws_hang)
+ return -EIO;
+
+ /* Scan for hung HW queues and reset engine. */
+ dqm->detect_hang_count = 0;
+ for (i = engine_start; i < engine_end; i++) {
+ for (j = 0; j < num_queues_per_eng; j++) {
+ uint32_t doorbell_off =
+ dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j);
+
+ if (!doorbell_off)
+ continue;
+
+ /* Reset engine and check. */
+ if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) ||
+ dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) ||
+ !set_sdma_queue_as_reset(dqm, doorbell_off)) {
+ r = -ENOTRECOVERABLE;
+ goto reset_fail;
+ }
+
+ /* Should only expect one queue active per engine */
+ dqm->detect_hang_count++;
+ break;
+ }
+ }
+
+ /* Signal process reset */
+ if (dqm->detect_hang_count)
+ kfd_signal_reset_event(dqm->dev);
+ else
+ r = -ENOTRECOVERABLE;
+
+reset_fail:
+ dqm->detect_hang_count = 0;
+
+ return r;
+}
+
+static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma)
+{
+ struct amdgpu_device *adev = dqm->dev->adev;
+
+ while (halt_if_hws_hang)
+ schedule();
+
+ if (adev->debug_disable_gpu_ring_reset) {
+ dev_info_once(adev->dev,
+ "%s queue hung, but ring reset disabled",
+ is_sdma ? "sdma" : "compute");
+
+ return -EPERM;
+ }
+ if (!amdgpu_gpu_recovery)
+ return -ENOTRECOVERABLE;
+
+ return is_sdma ? reset_hung_queues_sdma(dqm) : reset_hung_queues(dqm);
+}
+
+/* dqm->lock mutex has to be locked before calling this function
+ *
+ * @grace_period: If USE_DEFAULT_GRACE_PERIOD then default wait time
+ * for context switch latency. Lower values are used by debugger
+ * since context switching are triggered at high frequency.
+ * This is configured by setting CP_IQ_WAIT_TIME2.SCH_WAVE
+ *
+ */
static int unmap_queues_cpsch(struct device_queue_manager *dqm,
enum kfd_unmap_queues_filter filter,
uint32_t filter_param,
@@ -2295,7 +2398,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
return -EIO;
if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
- retval = pm_update_grace_period(&dqm->packet_mgr, grace_period);
+ retval = pm_config_dequeue_wait_counts(&dqm->packet_mgr,
+ KFD_DEQUEUE_WAIT_SET_SCH_WAVE, grace_period);
if (retval)
goto out;
}
@@ -2326,30 +2430,32 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
* check those fields
*/
mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
- if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) {
- while (halt_if_hws_hang)
- schedule();
- if (reset_queues_on_hws_hang(dqm)) {
- dqm->is_hws_hang = true;
- kfd_hws_hang(dqm);
- retval = -ETIME;
- goto out;
- }
- }
+ if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd) &&
+ reset_queues_on_hws_hang(dqm, false))
+ goto reset_fail;
+
+ /* Check for SDMA hang and attempt SDMA reset */
+ if (sdma_has_hang(dqm) && reset_queues_on_hws_hang(dqm, true))
+ goto reset_fail;
/* We need to reset the grace period value for this device */
if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
- if (pm_update_grace_period(&dqm->packet_mgr,
- USE_DEFAULT_GRACE_PERIOD))
+ if (pm_config_dequeue_wait_counts(&dqm->packet_mgr,
+ KFD_DEQUEUE_WAIT_RESET, 0 /* unused */))
dev_err(dev, "Failed to reset grace period\n");
}
pm_release_ib(&dqm->packet_mgr);
dqm->active_runlist = false;
-
out:
up_read(&dqm->dev->adev->reset_domain->sem);
return retval;
+
+reset_fail:
+ dqm->is_hws_hang = true;
+ kfd_hws_hang(dqm);
+ up_read(&dqm->dev->adev->reset_domain->sem);
+ return -ETIME;
}
/* only for compute queue */
@@ -2506,20 +2612,13 @@ failed_try_destroy_debugged_queue:
return retval;
}
-/*
- * Low bits must be 0000/FFFF as required by HW, high bits must be 0 to
- * stay in user mode.
- */
-#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL
-/* APE1 limit is inclusive and 64K aligned. */
-#define APE1_LIMIT_ALIGNMENT 0xFFFF
-
static bool set_cache_memory_policy(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
enum cache_policy default_policy,
enum cache_policy alternate_policy,
void __user *alternate_aperture_base,
- uint64_t alternate_aperture_size)
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties)
{
bool retval = true;
@@ -2528,41 +2627,17 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
dqm_lock(dqm);
- if (alternate_aperture_size == 0) {
- /* base > limit disables APE1 */
- qpd->sh_mem_ape1_base = 1;
- qpd->sh_mem_ape1_limit = 0;
- } else {
- /*
- * In FSA64, APE1_Base[63:0] = { 16{SH_MEM_APE1_BASE[31]},
- * SH_MEM_APE1_BASE[31:0], 0x0000 }
- * APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]},
- * SH_MEM_APE1_LIMIT[31:0], 0xFFFF }
- * Verify that the base and size parameters can be
- * represented in this format and convert them.
- * Additionally restrict APE1 to user-mode addresses.
- */
-
- uint64_t base = (uintptr_t)alternate_aperture_base;
- uint64_t limit = base + alternate_aperture_size - 1;
-
- if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 ||
- (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) {
- retval = false;
- goto out;
- }
-
- qpd->sh_mem_ape1_base = base >> 16;
- qpd->sh_mem_ape1_limit = limit >> 16;
- }
-
retval = dqm->asic_ops.set_cache_memory_policy(
dqm,
qpd,
default_policy,
alternate_policy,
alternate_aperture_base,
- alternate_aperture_size);
+ alternate_aperture_size,
+ misc_process_properties);
+
+ if (retval)
+ goto out;
if ((dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0))
program_sh_mem_settings(dqm, qpd);
@@ -2654,7 +2729,7 @@ static int get_wave_state(struct device_queue_manager *dqm,
ctl_stack, ctl_stack_used_size, save_area_used_size);
}
-static void get_queue_checkpoint_info(struct device_queue_manager *dqm,
+static int get_queue_checkpoint_info(struct device_queue_manager *dqm,
const struct queue *q,
u32 *mqd_size,
u32 *ctl_stack_size)
@@ -2662,16 +2737,19 @@ static void get_queue_checkpoint_info(struct device_queue_manager *dqm,
struct mqd_manager *mqd_mgr;
enum KFD_MQD_TYPE mqd_type =
get_mqd_type_from_queue_type(q->properties.type);
+ int ret = 0;
dqm_lock(dqm);
mqd_mgr = dqm->mqd_mgrs[mqd_type];
- *mqd_size = mqd_mgr->mqd_size;
+ *mqd_size = mqd_mgr->mqd_size * NUM_XCC(mqd_mgr->dev->xcc_mask);
*ctl_stack_size = 0;
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE && mqd_mgr->get_checkpoint_info)
- mqd_mgr->get_checkpoint_info(mqd_mgr, q->mqd, ctl_stack_size);
+ ret = mqd_mgr->get_checkpoint_info(mqd_mgr, q->mqd, ctl_stack_size);
dqm_unlock(dqm);
+
+ return ret;
}
static int checkpoint_mqd(struct device_queue_manager *dqm,
@@ -2707,7 +2785,7 @@ dqm_unlock:
static int process_termination_cpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
- int retval;
+ int retval = 0;
struct queue *q;
struct device *dev = dqm->dev->adev->dev;
struct kernel_queue *kq, *kq_next;
@@ -2717,8 +2795,6 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
bool found = false;
- retval = 0;
-
dqm_lock(dqm);
/* Clean all kernel queues */
@@ -2835,20 +2911,29 @@ static int allocate_hiq_sdma_mqd(struct device_queue_manager *dqm)
(dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size *
NUM_XCC(dqm->dev->xcc_mask));
- retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, size,
- &(mem_obj->gtt_mem), &(mem_obj->gpu_addr),
+ retval = amdgpu_amdkfd_alloc_kernel_mem(dev->adev, size,
+ AMDGPU_GEM_DOMAIN_GTT,
+ &(mem_obj->mem), &(mem_obj->gpu_addr),
(void *)&(mem_obj->cpu_ptr), false);
return retval;
}
+static void deallocate_hiq_sdma_mqd(struct kfd_node *dev,
+ struct kfd_mem_obj *mqd)
+{
+ WARN(!mqd, "No hiq sdma mqd trunk to free");
+
+ amdgpu_amdkfd_free_kernel_mem(dev->adev, &mqd->mem);
+}
+
struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev)
{
struct device_queue_manager *dqm;
pr_debug("Loading device queue manager\n");
- dqm = kzalloc(sizeof(*dqm), GFP_KERNEL);
+ dqm = kzalloc_obj(*dqm);
if (!dqm)
return NULL;
@@ -2937,7 +3022,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev)
break;
default:
- if (KFD_GC_VERSION(dev) >= IP_VERSION(12, 0, 0))
+ if (KFD_GC_VERSION(dev) >= IP_VERSION(12, 1, 0))
+ device_queue_manager_init_v12_1(&dqm->asic_ops);
+ else if (KFD_GC_VERSION(dev) >= IP_VERSION(12, 0, 0))
device_queue_manager_init_v12(&dqm->asic_ops);
else if (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0))
device_queue_manager_init_v11(&dqm->asic_ops);
@@ -2965,19 +3052,14 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev)
return dqm;
}
+ if (!dev->kfd->shared_resources.enable_mes)
+ deallocate_hiq_sdma_mqd(dev, &dqm->hiq_sdma_mqd);
+
out_free:
kfree(dqm);
return NULL;
}
-static void deallocate_hiq_sdma_mqd(struct kfd_node *dev,
- struct kfd_mem_obj *mqd)
-{
- WARN(!mqd, "No hiq sdma mqd trunk to free");
-
- amdgpu_amdkfd_free_gtt_mem(dev->adev, &mqd->gtt_mem);
-}
-
void device_queue_manager_uninit(struct device_queue_manager *dqm)
{
dqm->ops.stop(dqm);
@@ -2989,20 +3071,19 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id)
{
- struct kfd_process_device *pdd;
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process_device *pdd = NULL;
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, &pdd);
struct device_queue_manager *dqm = knode->dqm;
struct device *dev = dqm->dev->adev->dev;
struct qcm_process_device *qpd;
struct queue *q = NULL;
int ret = 0;
- if (!p)
+ if (!pdd)
return -EINVAL;
dqm_lock(dqm);
- pdd = kfd_get_process_device_data(dqm->dev, p);
if (pdd) {
qpd = &pdd->qpd;
@@ -3035,74 +3116,21 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbel
out:
dqm_unlock(dqm);
+ kfd_unref_process(p);
return ret;
}
-static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
- struct qcm_process_device *qpd)
+int kfd_evict_process_device(struct kfd_process_device *pdd)
{
- struct device *dev = dqm->dev->adev->dev;
- int ret = 0;
-
- /* Check if process is already evicted */
- dqm_lock(dqm);
- if (qpd->evicted) {
- /* Increment the evicted count to make sure the
- * process stays evicted before its terminated.
- */
- qpd->evicted++;
- dqm_unlock(dqm);
- goto out;
- }
- dqm_unlock(dqm);
-
- ret = suspend_all_queues_mes(dqm);
- if (ret) {
- dev_err(dev, "Suspending all queues failed");
- goto out;
- }
-
- ret = dqm->ops.evict_process_queues(dqm, qpd);
- if (ret) {
- dev_err(dev, "Evicting process queues failed");
- goto out;
- }
+ struct device_queue_manager *dqm;
+ struct kfd_process *p;
- ret = resume_all_queues_mes(dqm);
- if (ret)
- dev_err(dev, "Resuming all queues failed");
+ p = pdd->process;
+ dqm = pdd->dev->dqm;
-out:
- return ret;
-}
-
-int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid)
-{
- struct kfd_process_device *pdd;
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
- int ret = 0;
-
- if (!p)
- return -EINVAL;
WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
- pdd = kfd_get_process_device_data(dqm->dev, p);
- if (pdd) {
- if (dqm->dev->kfd->shared_resources.enable_mes)
- ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd);
- else
- ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
- }
-
- kfd_unref_process(p);
-
- return ret;
-}
-static void kfd_process_hw_exception(struct work_struct *work)
-{
- struct device_queue_manager *dqm = container_of(work,
- struct device_queue_manager, hw_exception_work);
- amdgpu_amdkfd_gpu_reset(dqm->dev->adev);
+ return dqm->ops.evict_process_queues(dqm, &pdd->qpd);
}
int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
@@ -3441,7 +3469,6 @@ int suspend_queues(struct kfd_process *p,
else
per_device_suspended++;
} else if (err != -EBUSY) {
- r = err;
queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK;
break;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 09ab36f8e8c6..3272328da11f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -37,7 +37,6 @@
#define KFD_MES_PROCESS_QUANTUM 100000
#define KFD_MES_GANG_QUANTUM 10000
-#define USE_DEFAULT_GRACE_PERIOD 0xffffffff
struct device_process_node {
struct qcm_process_device *qpd;
@@ -174,7 +173,8 @@ struct device_queue_manager_ops {
enum cache_policy default_policy,
enum cache_policy alternate_policy,
void __user *alternate_aperture_base,
- uint64_t alternate_aperture_size);
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties);
int (*process_termination)(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
@@ -192,7 +192,7 @@ struct device_queue_manager_ops {
int (*reset_queues)(struct device_queue_manager *dqm,
uint16_t pasid);
- void (*get_queue_checkpoint_info)(struct device_queue_manager *dqm,
+ int (*get_queue_checkpoint_info)(struct device_queue_manager *dqm,
const struct queue *q, u32 *mqd_size,
u32 *ctl_stack_size);
@@ -210,7 +210,8 @@ struct device_queue_manager_asic_ops {
enum cache_policy default_policy,
enum cache_policy alternate_policy,
void __user *alternate_aperture_base,
- uint64_t alternate_aperture_size);
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties);
void (*init_sdma_vm)(struct device_queue_manager *dqm,
struct queue *q,
struct qcm_process_device *qpd);
@@ -269,7 +270,6 @@ struct device_queue_manager {
/* hw exception */
bool is_hws_hang;
bool is_resetting;
- struct work_struct hw_exception_work;
struct kfd_mem_obj hiq_sdma_mqd;
bool sched_running;
bool sched_halt;
@@ -299,6 +299,8 @@ void device_queue_manager_init_v11(
struct device_queue_manager_asic_ops *asic_ops);
void device_queue_manager_init_v12(
struct device_queue_manager_asic_ops *asic_ops);
+void device_queue_manager_init_v12_1(
+ struct device_queue_manager_asic_ops *asic_ops);
void program_sh_mem_settings(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
unsigned int get_cp_queues_num(struct device_queue_manager *dqm);
@@ -359,4 +361,14 @@ static inline int read_sdma_queue_counter(uint64_t __user *q_rptr, uint64_t *val
/* SDMA activity counter is stored at queue's RPTR + 0x8 location. */
return get_user(*val, q_rptr + 1);
}
+
+static inline void update_dqm_wait_times(struct device_queue_manager *dqm)
+{
+ if (dqm->dev->kfd2kgd->get_iq_wait_times)
+ dqm->dev->kfd2kgd->get_iq_wait_times(dqm->dev->adev,
+ &dqm->wait_times,
+ ffs(dqm->dev->xcc_mask) - 1);
+}
+
+
#endif /* KFD_DEVICE_QUEUE_MANAGER_H_ */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
index d4d95c7f2e5d..0508ef5a41d7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
@@ -27,12 +27,21 @@
#include "oss/oss_2_4_sh_mask.h"
#include "gca/gfx_7_2_sh_mask.h"
+/*
+ * Low bits must be 0000/FFFF as required by HW, high bits must be 0 to
+ * stay in user mode.
+ */
+#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL
+/* APE1 limit is inclusive and 64K aligned. */
+#define APE1_LIMIT_ALIGNMENT 0xFFFF
+
static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
enum cache_policy default_policy,
enum cache_policy alternate_policy,
void __user *alternate_aperture_base,
- uint64_t alternate_aperture_size);
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties);
static int update_qpd_cik(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
static void init_sdma_vm(struct device_queue_manager *dqm,
@@ -80,10 +89,41 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm,
enum cache_policy default_policy,
enum cache_policy alternate_policy,
void __user *alternate_aperture_base,
- uint64_t alternate_aperture_size)
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties)
{
uint32_t default_mtype;
uint32_t ape1_mtype;
+ unsigned int temp;
+ bool retval = true;
+
+ if (alternate_aperture_size == 0) {
+ /* base > limit disables APE1 */
+ qpd->sh_mem_ape1_base = 1;
+ qpd->sh_mem_ape1_limit = 0;
+ } else {
+ /*
+ * In FSA64, APE1_Base[63:0] = { 16{SH_MEM_APE1_BASE[31]},
+ * SH_MEM_APE1_BASE[31:0], 0x0000 }
+ * APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]},
+ * SH_MEM_APE1_LIMIT[31:0], 0xFFFF }
+ * Verify that the base and size parameters can be
+ * represented in this format and convert them.
+ * Additionally restrict APE1 to user-mode addresses.
+ */
+
+ uint64_t base = (uintptr_t)alternate_aperture_base;
+ uint64_t limit = base + alternate_aperture_size - 1;
+
+ if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 ||
+ (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) {
+ retval = false;
+ goto out;
+ }
+
+ qpd->sh_mem_ape1_base = base >> 16;
+ qpd->sh_mem_ape1_limit = limit >> 16;
+ }
default_mtype = (default_policy == cache_policy_coherent) ?
MTYPE_NONCACHED :
@@ -97,37 +137,22 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm,
| ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED)
| DEFAULT_MTYPE(default_mtype)
| APE1_MTYPE(ape1_mtype);
-
- return true;
-}
-
-static int update_qpd_cik(struct device_queue_manager *dqm,
- struct qcm_process_device *qpd)
-{
- struct kfd_process_device *pdd;
- unsigned int temp;
-
- pdd = qpd_to_pdd(qpd);
-
- /* check if sh_mem_config register already configured */
- if (qpd->sh_mem_config == 0) {
- qpd->sh_mem_config =
- ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) |
- DEFAULT_MTYPE(MTYPE_NONCACHED) |
- APE1_MTYPE(MTYPE_NONCACHED);
- qpd->sh_mem_ape1_limit = 0;
- qpd->sh_mem_ape1_base = 0;
- }
-
/* On dGPU we're always in GPUVM64 addressing mode with 64-bit
* aperture addresses.
*/
- temp = get_sh_mem_bases_nybble_64(pdd);
+ temp = get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd));
qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp);
pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n",
qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases);
+out:
+ return retval;
+}
+
+static int update_qpd_cik(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+{
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c
index 245a90dfc2f6..ba6e3d747ccd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c
@@ -31,10 +31,18 @@ static int update_qpd_v10(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd);
+static bool set_cache_memory_policy_v10(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ enum cache_policy default_policy,
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties);
void device_queue_manager_init_v10(
struct device_queue_manager_asic_ops *asic_ops)
{
+ asic_ops->set_cache_memory_policy = set_cache_memory_policy_v10;
asic_ops->update_qpd = update_qpd_v10;
asic_ops->init_sdma_vm = init_sdma_vm_v10;
asic_ops->mqd_manager_init = mqd_manager_init_v10;
@@ -49,27 +57,28 @@ static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
private_base;
}
-static int update_qpd_v10(struct device_queue_manager *dqm,
- struct qcm_process_device *qpd)
+static bool set_cache_memory_policy_v10(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ enum cache_policy default_policy,
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties)
{
- struct kfd_process_device *pdd;
-
- pdd = qpd_to_pdd(qpd);
-
- /* check if sh_mem_config register already configured */
- if (qpd->sh_mem_config == 0) {
- qpd->sh_mem_config =
- (SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
- SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) |
- (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT);
- qpd->sh_mem_ape1_limit = 0;
- qpd->sh_mem_ape1_base = 0;
- }
-
- qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd);
+ qpd->sh_mem_config = (SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) |
+ (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT);
+ qpd->sh_mem_ape1_limit = 0;
+ qpd->sh_mem_ape1_base = 0;
+ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(qpd_to_pdd(qpd));
pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases);
+ return true;
+}
+static int update_qpd_v10(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+{
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v11.c
index 2e129da7acb4..8b447d04558f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v11.c
@@ -30,10 +30,18 @@ static int update_qpd_v11(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
static void init_sdma_vm_v11(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd);
+static bool set_cache_memory_policy_v11(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ enum cache_policy default_policy,
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties);
void device_queue_manager_init_v11(
struct device_queue_manager_asic_ops *asic_ops)
{
+ asic_ops->set_cache_memory_policy = set_cache_memory_policy_v11;
asic_ops->update_qpd = update_qpd_v11;
asic_ops->init_sdma_vm = init_sdma_vm_v11;
asic_ops->mqd_manager_init = mqd_manager_init_v11;
@@ -48,28 +56,29 @@ static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
private_base;
}
-static int update_qpd_v11(struct device_queue_manager *dqm,
- struct qcm_process_device *qpd)
+static bool set_cache_memory_policy_v11(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ enum cache_policy default_policy,
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties)
{
- struct kfd_process_device *pdd;
-
- pdd = qpd_to_pdd(qpd);
-
- /* check if sh_mem_config register already configured */
- if (qpd->sh_mem_config == 0) {
- qpd->sh_mem_config =
- (SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
- SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) |
- (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT);
-
- qpd->sh_mem_ape1_limit = 0;
- qpd->sh_mem_ape1_base = 0;
- }
+ qpd->sh_mem_config = (SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) |
+ (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT);
- qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd);
+ qpd->sh_mem_ape1_limit = 0;
+ qpd->sh_mem_ape1_base = 0;
+ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(qpd_to_pdd(qpd));
pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases);
+ return true;
+}
+static int update_qpd_v11(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+{
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c
index 4f3295b29dfb..3550da3a46f9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c
@@ -30,10 +30,18 @@ static int update_qpd_v12(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
static void init_sdma_vm_v12(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd);
+static bool set_cache_memory_policy_v12(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ enum cache_policy default_policy,
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties);
void device_queue_manager_init_v12(
struct device_queue_manager_asic_ops *asic_ops)
{
+ asic_ops->set_cache_memory_policy = set_cache_memory_policy_v12;
asic_ops->update_qpd = update_qpd_v12;
asic_ops->init_sdma_vm = init_sdma_vm_v12;
asic_ops->mqd_manager_init = mqd_manager_init_v12;
@@ -48,28 +56,29 @@ static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
private_base;
}
-static int update_qpd_v12(struct device_queue_manager *dqm,
- struct qcm_process_device *qpd)
+static bool set_cache_memory_policy_v12(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ enum cache_policy default_policy,
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties)
{
- struct kfd_process_device *pdd;
-
- pdd = qpd_to_pdd(qpd);
-
- /* check if sh_mem_config register already configured */
- if (qpd->sh_mem_config == 0) {
- qpd->sh_mem_config =
- (SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
- SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) |
- (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT);
-
- qpd->sh_mem_ape1_limit = 0;
- qpd->sh_mem_ape1_base = 0;
- }
+ qpd->sh_mem_config = (SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) |
+ (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT);
- qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd);
+ qpd->sh_mem_ape1_limit = 0;
+ qpd->sh_mem_ape1_base = 0;
+ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(qpd_to_pdd(qpd));
pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases);
+ return true;
+}
+static int update_qpd_v12(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+{
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12_1.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12_1.c
new file mode 100644
index 000000000000..9e70a5f8a50b
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12_1.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "kfd_device_queue_manager.h"
+#include "gc/gc_12_1_0_sh_mask.h"
+#include "soc_v1_0_enum.h"
+
+static int update_qpd_v12_1(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
+static void init_sdma_vm_v12_1(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd);
+
+void device_queue_manager_init_v12_1(
+ struct device_queue_manager_asic_ops *asic_ops)
+{
+ asic_ops->update_qpd = update_qpd_v12_1;
+ asic_ops->init_sdma_vm = init_sdma_vm_v12_1;
+ asic_ops->mqd_manager_init = mqd_manager_init_v12_1;
+}
+
+static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
+{
+ uint32_t shared_base = pdd->lds_base >> 48;
+ uint32_t private_base = pdd->scratch_base >> 58;
+
+ return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) |
+ (private_base << SH_MEM_BASES__PRIVATE_BASE__SHIFT);
+}
+
+static int update_qpd_v12_1(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+{
+ struct kfd_process_device *pdd;
+ struct amdgpu_device *adev = dqm->dev->adev;
+ struct amdgpu_vmhub *hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
+
+ pdd = qpd_to_pdd(qpd);
+ qpd->vm_cntx_cntl = hub->vm_cntx_cntl;
+
+ /* check if sh_mem_config register already configured */
+ if (qpd->sh_mem_config == 0) {
+ qpd->sh_mem_config =
+ (SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) |
+ (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT);
+
+ qpd->sh_mem_config |=
+ (1 << SH_MEM_CONFIG__F8_MODE__SHIFT);
+ qpd->sh_mem_ape1_limit = 0;
+ qpd->sh_mem_ape1_base = 0;
+ }
+
+ if (KFD_SUPPORT_XNACK_PER_PROCESS(dqm->dev)) {
+ if (!pdd->process->xnack_enabled) {
+ qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
+ qpd->vm_cntx_cntl &=
+ ~(1 << GCVM_CONTEXT0_CNTL__RETRY_PERMISSION_OR_INVALID_PAGE_FAULT__SHIFT);
+ } else {
+ qpd->sh_mem_config &= ~(1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT);
+ qpd->vm_cntx_cntl |=
+ (1 << GCVM_CONTEXT0_CNTL__RETRY_PERMISSION_OR_INVALID_PAGE_FAULT__SHIFT);
+ }
+ }
+
+ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd);
+
+ pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases);
+
+ return 0;
+}
+
+static void init_sdma_vm_v12_1(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd)
+{
+ /* Not needed on SDMAv4 onwards any more */
+ q->properties.sdma_vm_addr = 0;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
index 67137e674f1d..9fcc8c6e57b7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
@@ -30,10 +30,18 @@ static int update_qpd_v9(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd);
+static bool set_cache_memory_policy_v9(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ enum cache_policy default_policy,
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties);
void device_queue_manager_init_v9(
struct device_queue_manager_asic_ops *asic_ops)
{
+ asic_ops->set_cache_memory_policy = set_cache_memory_policy_v9;
asic_ops->update_qpd = update_qpd_v9;
asic_ops->init_sdma_vm = init_sdma_vm_v9;
asic_ops->mqd_manager_init = mqd_manager_init_v9;
@@ -48,10 +56,42 @@ static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
private_base;
}
+static bool set_cache_memory_policy_v9(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ enum cache_policy default_policy,
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties)
+{
+ qpd->sh_mem_config = SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
+
+ if (dqm->dev->kfd->noretry)
+ qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
+
+ if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 3) ||
+ KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 4))
+ qpd->sh_mem_config |= (1 << SH_MEM_CONFIG__F8_MODE__SHIFT);
+
+ if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 5, 0)) {
+ if (misc_process_properties & KFD_PROC_FLAG_MFMA_HIGH_PRECISION)
+ qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__PRECISION_MODE__SHIFT;
+ }
+
+ qpd->sh_mem_ape1_limit = 0;
+ qpd->sh_mem_ape1_base = 0;
+ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(qpd_to_pdd(qpd));
+
+ pr_debug("sh_mem_bases 0x%X sh_mem_config 0x%X\n", qpd->sh_mem_bases,
+ qpd->sh_mem_config);
+ return true;
+}
+
static int update_qpd_v9(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
- struct kfd_process_device *pdd;
+ struct kfd_process_device *pdd = qpd_to_pdd(qpd);
pdd = qpd_to_pdd(qpd);
@@ -64,8 +104,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm,
qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 3) ||
- KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 4) ||
- KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 5, 0))
+ KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 4))
qpd->sh_mem_config |=
(1 << SH_MEM_CONFIG__F8_MODE__SHIFT);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
index b291ee0fab94..dad83356e976 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
@@ -27,12 +27,21 @@
#include "gca/gfx_8_0_sh_mask.h"
#include "oss/oss_3_0_sh_mask.h"
+/*
+ * Low bits must be 0000/FFFF as required by HW, high bits must be 0 to
+ * stay in user mode.
+ */
+#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL
+/* APE1 limit is inclusive and 64K aligned. */
+#define APE1_LIMIT_ALIGNMENT 0xFFFF
+
static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
enum cache_policy default_policy,
enum cache_policy alternate_policy,
void __user *alternate_aperture_base,
- uint64_t alternate_aperture_size);
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties);
static int update_qpd_vi(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
static void init_sdma_vm(struct device_queue_manager *dqm,
@@ -81,10 +90,41 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm,
enum cache_policy default_policy,
enum cache_policy alternate_policy,
void __user *alternate_aperture_base,
- uint64_t alternate_aperture_size)
+ uint64_t alternate_aperture_size,
+ u32 misc_process_properties)
{
uint32_t default_mtype;
uint32_t ape1_mtype;
+ unsigned int temp;
+ bool retval = true;
+
+ if (alternate_aperture_size == 0) {
+ /* base > limit disables APE1 */
+ qpd->sh_mem_ape1_base = 1;
+ qpd->sh_mem_ape1_limit = 0;
+ } else {
+ /*
+ * In FSA64, APE1_Base[63:0] = { 16{SH_MEM_APE1_BASE[31]},
+ * SH_MEM_APE1_BASE[31:0], 0x0000 }
+ * APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]},
+ * SH_MEM_APE1_LIMIT[31:0], 0xFFFF }
+ * Verify that the base and size parameters can be
+ * represented in this format and convert them.
+ * Additionally restrict APE1 to user-mode addresses.
+ */
+
+ uint64_t base = (uintptr_t)alternate_aperture_base;
+ uint64_t limit = base + alternate_aperture_size - 1;
+
+ if (limit <= base || (base & APE1_FIXED_BITS_MASK) != 0 ||
+ (limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) {
+ retval = false;
+ goto out;
+ }
+
+ qpd->sh_mem_ape1_base = base >> 16;
+ qpd->sh_mem_ape1_limit = limit >> 16;
+ }
default_mtype = (default_policy == cache_policy_coherent) ?
MTYPE_UC :
@@ -100,40 +140,21 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm,
default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT |
ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT;
- return true;
-}
-
-static int update_qpd_vi(struct device_queue_manager *dqm,
- struct qcm_process_device *qpd)
-{
- struct kfd_process_device *pdd;
- unsigned int temp;
-
- pdd = qpd_to_pdd(qpd);
-
- /* check if sh_mem_config register already configured */
- if (qpd->sh_mem_config == 0) {
- qpd->sh_mem_config =
- SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
- SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT |
- MTYPE_UC <<
- SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT |
- MTYPE_UC <<
- SH_MEM_CONFIG__APE1_MTYPE__SHIFT;
-
- qpd->sh_mem_ape1_limit = 0;
- qpd->sh_mem_ape1_base = 0;
- }
-
/* On dGPU we're always in GPUVM64 addressing mode with 64-bit
* aperture addresses.
*/
- temp = get_sh_mem_bases_nybble_64(pdd);
+ temp = get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd));
qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp);
pr_debug("sh_mem_bases nybble: 0x%X and register 0x%X\n",
temp, qpd->sh_mem_bases);
+out:
+ return retval;
+}
+static int update_qpd_vi(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+{
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index d075f24e5f9f..44150a71ffd5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -67,7 +67,7 @@ static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p)
void *backing_store;
struct kfd_signal_page *page;
- page = kzalloc(sizeof(*page), GFP_KERNEL);
+ page = kzalloc_obj(*page);
if (!page)
return NULL;
@@ -142,6 +142,7 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
* @p: Pointer to struct kfd_process
* @id: ID to look up
* @bits: Number of valid bits in @id
+ * @signal_mailbox_updated: flag indicates if FW updates signal mailbox entry
*
* Finds the first signaled event with a matching partial ID. If no
* matching signaled event is found, returns NULL. In that case the
@@ -155,7 +156,8 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
* driver.
*/
static struct kfd_event *lookup_signaled_event_by_partial_id(
- struct kfd_process *p, uint32_t id, uint32_t bits)
+ struct kfd_process *p, uint32_t id, uint32_t bits,
+ bool signal_mailbox_updated)
{
struct kfd_event *ev;
@@ -166,7 +168,8 @@ static struct kfd_event *lookup_signaled_event_by_partial_id(
* and we only need a single lookup.
*/
if (bits > 31 || (1U << bits) >= KFD_SIGNAL_EVENT_LIMIT) {
- if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT)
+ if (signal_mailbox_updated &&
+ page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT)
return NULL;
return idr_find(&p->event_idr, id);
@@ -331,7 +334,13 @@ static int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
if (p->signal_page)
return -EBUSY;
- page = kzalloc(sizeof(*page), GFP_KERNEL);
+ if (size < KFD_SIGNAL_EVENT_LIMIT * 8) {
+ pr_err("Event page size %llu is too small, need at least %lu bytes\n",
+ size, (unsigned long)(KFD_SIGNAL_EVENT_LIMIT * 8));
+ return -EINVAL;
+ }
+
+ page = kzalloc_obj(*page);
if (!page)
return -ENOMEM;
@@ -399,7 +408,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
uint64_t *event_page_offset, uint32_t *event_slot_index)
{
int ret = 0;
- struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+ struct kfd_event *ev = kzalloc_obj(*ev);
if (!ev)
return -ENOMEM;
@@ -452,11 +461,11 @@ int kfd_criu_restore_event(struct file *devkfd,
struct kfd_event *ev = NULL;
int ret = 0;
- ev_priv = kmalloc(sizeof(*ev_priv), GFP_KERNEL);
+ ev_priv = kmalloc_obj(*ev_priv);
if (!ev_priv)
return -ENOMEM;
- ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+ ev = kzalloc_obj(*ev);
if (!ev) {
ret = -ENOMEM;
goto exit;
@@ -718,7 +727,7 @@ static void set_event_from_interrupt(struct kfd_process *p,
}
void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
- uint32_t valid_id_bits)
+ uint32_t valid_id_bits, bool signal_mailbox_updated)
{
struct kfd_event *ev = NULL;
@@ -727,7 +736,7 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
* to process context, kfd_process could attempt to exit while we are
* running so the lookup function increments the process ref count.
*/
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
if (!p)
return; /* Presumably process exited. */
@@ -736,7 +745,8 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
if (valid_id_bits)
ev = lookup_signaled_event_by_partial_id(p, partial_id,
- valid_id_bits);
+ valid_id_bits,
+ signal_mailbox_updated);
if (ev) {
set_event_from_interrupt(p, ev);
} else if (p->signal_page) {
@@ -748,16 +758,6 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
uint64_t *slots = page_slots(p->signal_page);
uint32_t id;
- /*
- * If id is valid but slot is not signaled, GPU may signal the same event twice
- * before driver have chance to process the first interrupt, then signal slot is
- * auto-reset after set_event wakeup the user space, just drop the second event as
- * the application only need wakeup once.
- */
- if ((valid_id_bits > 31 || (1U << valid_id_bits) >= KFD_SIGNAL_EVENT_LIMIT) &&
- partial_id < KFD_SIGNAL_EVENT_LIMIT && slots[partial_id] == UNSIGNALED_EVENT_SLOT)
- goto out_unlock;
-
if (valid_id_bits)
pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n",
partial_id, valid_id_bits);
@@ -786,7 +786,6 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
}
}
-out_unlock:
rcu_read_unlock();
kfd_unref_process(p);
}
@@ -796,8 +795,7 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
struct kfd_event_waiter *event_waiters;
uint32_t i;
- event_waiters = kcalloc(num_events, sizeof(struct kfd_event_waiter),
- GFP_KERNEL);
+ event_waiters = kzalloc_objs(struct kfd_event_waiter, num_events);
if (!event_waiters)
return NULL;
@@ -1139,8 +1137,8 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
if (type == KFD_EVENT_TYPE_MEMORY) {
dev_warn(kfd_device,
- "Sending SIGSEGV to process %d (pasid 0x%x)",
- p->lead_thread->pid, p->pasid);
+ "Sending SIGSEGV to process pid %d",
+ p->lead_thread->pid);
send_sig(SIGSEGV, p->lead_thread, 0);
}
@@ -1148,13 +1146,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
if (send_signal) {
if (send_sigterm) {
dev_warn(kfd_device,
- "Sending SIGTERM to process %d (pasid 0x%x)",
- p->lead_thread->pid, p->pasid);
+ "Sending SIGTERM to process pid %d",
+ p->lead_thread->pid);
send_sig(SIGTERM, p->lead_thread, 0);
} else {
dev_err(kfd_device,
- "Process %d (pasid 0x%x) got unhandled exception",
- p->lead_thread->pid, p->pasid);
+ "Process pid %d got unhandled exception",
+ p->lead_thread->pid);
}
}
@@ -1168,7 +1166,7 @@ void kfd_signal_hw_exception_event(u32 pasid)
* to process context, kfd_process could attempt to exit while we are
* running so the lookup function increments the process ref count.
*/
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
if (!p)
return; /* Presumably process exited. */
@@ -1177,22 +1175,39 @@ void kfd_signal_hw_exception_event(u32 pasid)
kfd_unref_process(p);
}
-void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
+void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va)
+{
+ struct kfd_process_device *pdd;
+ struct kfd_hsa_memory_exception_data exception_data;
+ int i;
+
+ memset(&exception_data, 0, sizeof(exception_data));
+ exception_data.va = gpu_va;
+ exception_data.failure.NotPresent = 1;
+
+ // Send VM seg fault to all kfd process device
+ for (i = 0; i < p->n_pdds; i++) {
+ pdd = p->pdds[i];
+ exception_data.gpu_id = pdd->user_gpu_id;
+ kfd_evict_process_device(pdd);
+ kfd_signal_vm_fault_event(pdd, NULL, &exception_data);
+ }
+}
+
+void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
struct kfd_vm_fault_info *info,
struct kfd_hsa_memory_exception_data *data)
{
struct kfd_event *ev;
uint32_t id;
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = pdd->process;
struct kfd_hsa_memory_exception_data memory_exception_data;
int user_gpu_id;
- if (!p)
- return; /* Presumably process exited. */
-
- user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+ user_gpu_id = kfd_process_get_user_gpu_id(p, pdd->dev->id);
if (unlikely(user_gpu_id == -EINVAL)) {
- WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+ WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n",
+ pdd->dev->id);
return;
}
@@ -1229,7 +1244,6 @@ void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
}
rcu_read_unlock();
- kfd_unref_process(p);
}
void kfd_signal_reset_event(struct kfd_node *dev)
@@ -1264,7 +1278,8 @@ void kfd_signal_reset_event(struct kfd_node *dev)
}
if (unlikely(!pdd)) {
- WARN_ONCE(1, "Could not get device data from pasid:0x%x\n", p->pasid);
+ WARN_ONCE(1, "Could not get device data from process pid:%d\n",
+ p->lead_thread->pid);
continue;
}
@@ -1273,12 +1288,19 @@ void kfd_signal_reset_event(struct kfd_node *dev)
if (dev->dqm->detect_hang_count) {
struct amdgpu_task_info *ti;
+ struct amdgpu_fpriv *drv_priv;
- ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid);
+ if (unlikely(amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv))) {
+ WARN_ONCE(1, "Could not get vm for device %x from pid:%d\n",
+ dev->id, p->lead_thread->pid);
+ continue;
+ }
+
+ ti = amdgpu_vm_get_task_info_vm(&drv_priv->vm);
if (ti) {
dev_err(dev->adev->dev,
"Queues reset on process %s tid %d thread %s pid %d\n",
- ti->process_name, ti->tgid, ti->task_name, ti->pid);
+ ti->process_name, ti->tgid, ti->task.comm, ti->task.pid);
amdgpu_vm_put_task_info(ti);
}
}
@@ -1311,7 +1333,7 @@ void kfd_signal_reset_event(struct kfd_node *dev)
void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
{
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
struct kfd_hsa_memory_exception_data memory_exception_data;
struct kfd_hsa_hw_exception_data hw_exception_data;
struct kfd_event *ev;
@@ -1326,6 +1348,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) {
WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+ kfd_unref_process(p);
return;
}
@@ -1366,3 +1389,32 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
kfd_unref_process(p);
}
+
+/* signal KFD_EVENT_TYPE_SIGNAL events from process p
+ * send signal SIGBUS to correspondent user space process
+ */
+void kfd_signal_process_terminate_event(struct kfd_process *p)
+{
+ struct kfd_event *ev;
+ u32 id;
+
+ rcu_read_lock();
+
+ /* iterate from id 1 for KFD_EVENT_TYPE_SIGNAL events */
+ id = 1;
+ idr_for_each_entry_continue(&p->event_idr, ev, id)
+ if (ev->type == KFD_EVENT_TYPE_SIGNAL) {
+ spin_lock(&ev->lock);
+ set_event(ev);
+ spin_unlock(&ev->lock);
+ }
+
+ /* Send SIGBUS to p->lead_thread */
+ dev_notice(kfd_device,
+ "Sending SIGBUS to process %d",
+ p->lead_thread->pid);
+
+ send_sig(SIGBUS, p->lead_thread, 0);
+
+ rcu_read_unlock();
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h
index 52ccfd397c2b..1dc21c13833b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h
@@ -85,6 +85,7 @@ struct kfd_event {
#define KFD_EVENT_TYPE_MEMORY 8
extern void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
- uint32_t valid_id_bits);
+ uint32_t valid_id_bits,
+ bool signal_mailbox_updated);
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index dbcb60eb54b2..04c5e26f01ed 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -23,7 +23,6 @@
*/
#include <linux/device.h>
-#include <linux/export.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/sched.h>
@@ -360,6 +359,25 @@ static void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id)
pdd->qpd.cwsr_base = AMDGPU_VA_RESERVED_TRAP_START(pdd->dev->adev);
}
+static void kfd_init_apertures_v12(struct kfd_process_device *pdd, uint8_t id)
+{
+ pdd->lds_base = pdd->dev->adev->gmc.shared_aperture_start;
+ pdd->lds_limit = pdd->dev->adev->gmc.shared_aperture_end;
+
+ pdd->gpuvm_base = AMDGPU_VA_RESERVED_BOTTOM;
+ pdd->gpuvm_limit =
+ pdd->dev->kfd->shared_resources.gpuvm_size - 1;
+
+ pdd->scratch_base = pdd->dev->adev->gmc.private_aperture_start;
+ pdd->scratch_limit = pdd->dev->adev->gmc.private_aperture_end;
+
+ /*
+ * Place TBA/TMA on opposite side of VM hole to prevent
+ * stray faults from triggering SVM on these pages.
+ */
+ pdd->qpd.cwsr_base = AMDGPU_VA_RESERVED_TRAP_START(pdd->dev->adev);
+}
+
int kfd_init_apertures(struct kfd_process *process)
{
uint8_t id = 0;
@@ -407,9 +425,11 @@ int kfd_init_apertures(struct kfd_process *process)
kfd_init_apertures_vi(pdd, id);
break;
default:
- if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 0, 1))
+ if (KFD_GC_VERSION(dev) >= IP_VERSION(12, 1, 0)) {
+ kfd_init_apertures_v12(pdd, id);
+ } else if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 0, 1)) {
kfd_init_apertures_v9(pdd, id);
- else {
+ } else {
WARN(1, "Unexpected ASIC family %u",
dev->adev->asic_type);
return -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index 37b69fe0ede3..19406ab92c5b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -168,14 +168,14 @@ static bool event_interrupt_isr_v10(struct kfd_node *dev,
client_id != SOC15_IH_CLIENTID_SE3SH)
return false;
- pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
- client_id, source_id, vmid, pasid);
- pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
- data[0], data[1], data[2], data[3],
- data[4], data[5], data[6], data[7]);
+ dev_dbg(dev->adev->dev,
+ "client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
+ client_id, source_id, vmid, pasid);
+ dev_dbg(dev->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
+ data[0], data[1], data[2], data[3], data[4], data[5], data[6],
+ data[7]);
- /* If there is no valid PASID, it's likely a bug */
- if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt"))
+ if (pasid == 0)
return 0;
/* Interrupt types we care about: various signals and faults.
@@ -211,43 +211,72 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
client_id == SOC15_IH_CLIENTID_SE2SH ||
client_id == SOC15_IH_CLIENTID_SE3SH) {
if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
- kfd_signal_event_interrupt(pasid, context_id0, 32);
+ kfd_signal_event_interrupt(pasid, context_id0, 32, true);
else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) {
encoding = REG_GET_FIELD(context_id1,
SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING);
switch (encoding) {
case SQ_INTERRUPT_WORD_ENCODING_AUTO:
- pr_debug_ratelimited(
+ dev_dbg_ratelimited(
+ dev->adev->dev,
"sq_intr: auto, se %d, ttrace %d, wlt %d, ttrac_buf0_full %d, ttrac_buf1_full %d, ttrace_utc_err %d\n",
- REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_AUTO_CTXID1,
- SE_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
- THREAD_TRACE),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
- WLT),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
- THREAD_TRACE_BUF0_FULL),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
- THREAD_TRACE_BUF1_FULL),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
- THREAD_TRACE_UTC_ERROR));
+ REG_GET_FIELD(
+ context_id1,
+ SQ_INTERRUPT_WORD_AUTO_CTXID1,
+ SE_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ THREAD_TRACE),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ WLT),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ THREAD_TRACE_BUF0_FULL),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ THREAD_TRACE_BUF1_FULL),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ THREAD_TRACE_UTC_ERROR));
break;
case SQ_INTERRUPT_WORD_ENCODING_INST:
- pr_debug_ratelimited("sq_intr: inst, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
- REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
- SE_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
- DATA),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
- SA_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
- PRIV),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
- WAVE_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
- SIMD_ID),
- REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
- WGP_ID));
+ dev_dbg_ratelimited(
+ dev->adev->dev,
+ "sq_intr: inst, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
+ REG_GET_FIELD(
+ context_id1,
+ SQ_INTERRUPT_WORD_WAVE_CTXID1,
+ SE_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ DATA),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ SA_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ PRIV),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ WAVE_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ SIMD_ID),
+ REG_GET_FIELD(
+ context_id1,
+ SQ_INTERRUPT_WORD_WAVE_CTXID1,
+ WGP_ID));
if (context_id0 & SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV_MASK) {
if (kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_DEBUG_DOORBELL_ID(context_id0),
@@ -259,27 +288,43 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
case SQ_INTERRUPT_WORD_ENCODING_ERROR:
sq_intr_err_type = REG_GET_FIELD(context_id0, KFD_CTXID0,
ERR_TYPE);
- pr_warn_ratelimited("sq_intr: error, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d, err_type %d\n",
- REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
- SE_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
- DATA),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
- SA_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
- PRIV),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
- WAVE_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
- SIMD_ID),
- REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
- WGP_ID),
+ dev_warn_ratelimited(
+ dev->adev->dev,
+ "sq_intr: error, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d, err_type %d\n",
+ REG_GET_FIELD(
+ context_id1,
+ SQ_INTERRUPT_WORD_WAVE_CTXID1,
+ SE_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ DATA),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ SA_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ PRIV),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ WAVE_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ SIMD_ID),
+ REG_GET_FIELD(
+ context_id1,
+ SQ_INTERRUPT_WORD_WAVE_CTXID1,
+ WGP_ID),
sq_intr_err_type);
break;
default:
break;
}
- kfd_signal_event_interrupt(pasid, context_id0 & 0x7fffff, 23);
+ kfd_signal_event_interrupt(pasid, context_id0 & 0x7fffff, 23, true);
} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) {
kfd_set_dbg_ev_from_interrupt(dev, pasid,
@@ -299,7 +344,7 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
client_id == SOC15_IH_CLIENTID_SDMA6 ||
client_id == SOC15_IH_CLIENTID_SDMA7) {
if (source_id == SOC15_INTSRC_SDMA_TRAP) {
- kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
+ kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28, true);
}
} else if (client_id == SOC15_IH_CLIENTID_VMC ||
client_id == SOC15_IH_CLIENTID_VMC1 ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index b3f988b275a8..12d81abed748 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -148,44 +148,69 @@ enum SQ_INTERRUPT_ERROR_TYPE {
#define KFD_CTXID0_DOORBELL_ID(ctxid0) ((ctxid0) & \
KFD_CTXID0_DOORBELL_ID_MASK)
-static void print_sq_intr_info_auto(uint32_t context_id0, uint32_t context_id1)
+static void print_sq_intr_info_auto(struct kfd_node *dev, uint32_t context_id0,
+ uint32_t context_id1)
{
- pr_debug_ratelimited(
+ dev_dbg_ratelimited(
+ dev->adev->dev,
"sq_intr: auto, ttrace %d, wlt %d, ttrace_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n",
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ THREAD_TRACE),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, WLT),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_BUF_FULL),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, REG_TIMESTAMP),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, CMD_TIMESTAMP),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, HOST_CMD_OVERFLOW),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, HOST_REG_OVERFLOW),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, IMMED_OVERFLOW),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_UTC_ERROR));
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ THREAD_TRACE_BUF_FULL),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ REG_TIMESTAMP),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ CMD_TIMESTAMP),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ HOST_CMD_OVERFLOW),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ HOST_REG_OVERFLOW),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ IMMED_OVERFLOW),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ THREAD_TRACE_UTC_ERROR));
}
-static void print_sq_intr_info_inst(uint32_t context_id0, uint32_t context_id1)
+static void print_sq_intr_info_inst(struct kfd_node *dev, uint32_t context_id0,
+ uint32_t context_id1)
{
- pr_debug_ratelimited(
+ dev_dbg_ratelimited(
+ dev->adev->dev,
"sq_intr: inst, data 0x%08x, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, DATA),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, SH_ID),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ SH_ID),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, WAVE_ID),
- REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, SIMD_ID),
- REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, WGP_ID));
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ WAVE_ID),
+ REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
+ SIMD_ID),
+ REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
+ WGP_ID));
}
-static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1)
+static void print_sq_intr_info_error(struct kfd_node *dev, uint32_t context_id0,
+ uint32_t context_id1)
{
- pr_warn_ratelimited(
+ dev_warn_ratelimited(
+ dev->adev->dev,
"sq_intr: error, detail 0x%08x, type %d, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, DETAIL),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, SH_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, PRIV),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, WAVE_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, SIMD_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, WGP_ID));
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0,
+ DETAIL),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0,
+ TYPE),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0,
+ SH_ID),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0,
+ PRIV),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0,
+ WAVE_ID),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1,
+ SIMD_ID),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1,
+ WGP_ID));
}
static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
@@ -194,7 +219,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
enum amdgpu_ras_block block = 0;
int ret = -EINVAL;
uint32_t reset = 0;
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
if (!p)
return;
@@ -255,14 +280,14 @@ static bool event_interrupt_isr_v11(struct kfd_node *dev,
(context_id0 & AMDGPU_FENCE_MES_QUEUE_FLAG))
return false;
- pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
- client_id, source_id, vmid, pasid);
- pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
- data[0], data[1], data[2], data[3],
- data[4], data[5], data[6], data[7]);
+ dev_dbg(dev->adev->dev,
+ "client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
+ client_id, source_id, vmid, pasid);
+ dev_dbg(dev->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
+ data[0], data[1], data[2], data[3], data[4], data[5], data[6],
+ data[7]);
- /* If there is no valid PASID, it's likely a bug */
- if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt"))
+ if (pasid == 0)
return false;
/* Interrupt types we care about: various signals and faults.
@@ -328,7 +353,7 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
/* CP */
if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
- kfd_signal_event_interrupt(pasid, context_id0, 32);
+ kfd_signal_event_interrupt(pasid, context_id0, 32, true);
else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) {
u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0);
@@ -341,7 +366,7 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
/* SDMA */
else if (source_id == SOC21_INTSRC_SDMA_TRAP)
- kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
+ kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28, true);
else if (source_id == SOC21_INTSRC_SDMA_ECC) {
event_interrupt_poison_consumption_v11(dev, pasid, source_id);
return;
@@ -353,10 +378,10 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING);
switch (sq_int_enc) {
case SQ_INTERRUPT_WORD_ENCODING_AUTO:
- print_sq_intr_info_auto(context_id0, context_id1);
+ print_sq_intr_info_auto(dev, context_id0, context_id1);
break;
case SQ_INTERRUPT_WORD_ENCODING_INST:
- print_sq_intr_info_inst(context_id0, context_id1);
+ print_sq_intr_info_inst(dev, context_id0, context_id1);
sq_int_priv = REG_GET_FIELD(context_id0,
SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV);
if (sq_int_priv && (kfd_set_dbg_ev_from_interrupt(dev, pasid,
@@ -366,7 +391,7 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
return;
break;
case SQ_INTERRUPT_WORD_ENCODING_ERROR:
- print_sq_intr_info_error(context_id0, context_id1);
+ print_sq_intr_info_error(dev, context_id0, context_id1);
sq_int_errtype = REG_GET_FIELD(context_id0,
SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE);
if (sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
@@ -379,7 +404,7 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
default:
break;
}
- kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24);
+ kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24, true);
}
} else if (KFD_IRQ_IS_FENCE(client_id, source_id)) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c
new file mode 100644
index 000000000000..0da7e1db55c9
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c
@@ -0,0 +1,405 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "kfd_priv.h"
+#include "kfd_events.h"
+#include "soc15_int.h"
+#include "kfd_device_queue_manager.h"
+#include "ivsrcid/vmc/irqsrcs_vmc_1_0.h"
+#include "kfd_smi_events.h"
+#include "kfd_debug.h"
+#include "amdgpu_ras_mgr.h"
+
+/*
+ * GFX12.1 SQ Interrupts
+ *
+ * There are 3 encoding types of interrupts sourced from SQ sent as a 44-bit
+ * packet to the Interrupt Handler:
+ * Auto - Generated by the SQG (various cmd overflows, timestamps etc)
+ * Wave - Generated by S_SENDMSG through a shader program
+ * Error - HW generated errors (Illegal instructions, Memviols, EDC etc)
+ *
+ * The 44-bit packet is mapped as {context_id1[7:0],context_id0[31:0]} plus
+ * 4-bits for VMID (SOC15_VMID_FROM_IH_ENTRY) as such:
+ *
+ * - context_id1[7:6]
+ * Encoding type (0 = Auto, 1 = Wave, 2 = Error)
+ *
+ * - context_id0[26]
+ * PRIV bit indicates that Wave S_SEND or error occurred within trap
+ *
+ * - context_id0[24:0]
+ * 25-bit data with the following layout per encoding type:
+ * Auto - only context_id0[8:0] is used, which reports various interrupts
+ * generated by SQG. The rest is 0.
+ * Wave - user data sent from m0 via S_SENDMSG (context_id0[23:0])
+ * Error - Error Type (context_id0[24:21]), Error Details (context_id0[20:0])
+ *
+ * The other context_id bits show coordinates (SE/SH/CU/SIMD/WGP) for wave
+ * S_SENDMSG and Errors. These are 0 for Auto.
+ */
+
+enum SQ_INTERRUPT_WORD_ENCODING {
+ SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0,
+ SQ_INTERRUPT_WORD_ENCODING_INST,
+ SQ_INTERRUPT_WORD_ENCODING_ERROR,
+};
+
+enum SQ_INTERRUPT_ERROR_TYPE {
+ SQ_INTERRUPT_ERROR_TYPE_EDC_FUE = 0x0,
+ SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST,
+ SQ_INTERRUPT_ERROR_TYPE_MEMVIOL,
+ SQ_INTERRUPT_ERROR_TYPE_EDC_FED,
+};
+
+/* SQ_INTERRUPT_WORD_AUTO_CTXID */
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE__SHIFT 0
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT__SHIFT 1
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF0_FULL__SHIFT 2
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF1_FULL__SHIFT 3
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR__SHIFT 8
+#define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING__SHIFT 6
+
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_MASK 0x00000001
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT_MASK 0x00000002
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF0_FULL_MASK 0x00000004
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF1_FULL_MASK 0x00000008
+#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR_MASK 0x00000100
+#define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING_MASK 0x000000c0
+
+/* SQ_INTERRUPT_WORD_WAVE_CTXID */
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA__SHIFT 0
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SA_ID__SHIFT 25
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV__SHIFT 26
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID__SHIFT 27
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__SIMD_ID__SHIFT 0
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID__SHIFT 2
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING__SHIFT 6
+
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA_MASK 0x00ffffff /* [23:0] */
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SA_ID_MASK 0x02000000 /* [25] */
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV_MASK 0x04000000 /* [26] */
+#define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID_MASK 0xf8000000 /* [31:27] */
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__SIMD_ID_MASK 0x00000003 /* [33:32] */
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID_MASK 0x0000003c /* [37:34] */
+#define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING_MASK 0x000000c0 /* [39:38] */
+
+/* SQ_INTERRUPT_WORD_ERROR_CTXID */
+#define SQ_INTERRUPT_WORD_ERROR_CTXID0__DETAIL__SHIFT 0
+#define SQ_INTERRUPT_WORD_ERROR_CTXID0__MEM_VIOL__SHIFT 19
+#define SQ_INTERRUPT_WORD_ERROR_CTXID0__TYPE__SHIFT 21
+#define SQ_INTERRUPT_WORD_ERROR_CTXID0__SA_ID__SHIFT 25
+#define SQ_INTERRUPT_WORD_ERROR_CTXID0__PRIV__SHIFT 26
+#define SQ_INTERRUPT_WORD_ERROR_CTXID0__WAVE_ID__SHIFT 27
+#define SQ_INTERRUPT_WORD_ERROR_CTXID1__SIMD_ID__SHIFT 0
+#define SQ_INTERRUPT_WORD_ERROR_CTXID1__WGP_ID__SHIFT 2
+#define SQ_INTERRUPT_WORD_ERROR_CTXID1__ENCODING__SHIFT 6
+
+#define SQ_INTERRUPT_WORD_ERROR_CTXID0__DETAIL_MASK 0x0007ffff /* [18:0] */
+#define SQ_INTERRUPT_WORD_ERROR_CTXID0__MEM_VIOL_MASK 0x00180000 /* [20:19] */
+#define SQ_INTERRUPT_WORD_ERROR_CTXID0__TYPE_MASK 0x01e00000 /* [24:21] */
+#define SQ_INTERRUPT_WORD_ERROR_CTXID0__SA_ID_MASK 0x02000000 /* [25] */
+#define SQ_INTERRUPT_WORD_ERROR_CTXID0__PRIV_MASK 0x04000000 /* [26] */
+#define SQ_INTERRUPT_WORD_ERROR_CTXID0__WAVE_ID_MASK 0xf8000000 /* [31:27] */
+#define SQ_INTERRUPT_WORD_ERROR_CTXID1__SIMD_ID_MASK 0x00000003 /* [33:32] */
+#define SQ_INTERRUPT_WORD_ERROR_CTXID1__WGP_ID_MASK 0x0000003c /* [37:34] */
+#define SQ_INTERRUPT_WORD_ERROR_CTXID1__ENCODING_MASK 0x000000c0 /* [39:38] */
+
+/*
+ * The debugger will send user data(m0) with PRIV=1 to indicate it requires
+ * notification from the KFD with the following queue id (DOORBELL_ID) and
+ * trap code (TRAP_CODE).
+ */
+#define KFD_CTXID0_TRAP_CODE_SHIFT 10
+#define KFD_CTXID0_TRAP_CODE_MASK 0xfffc00
+#define KFD_CTXID0_CP_BAD_OP_ECODE_MASK 0x3ffffff
+#define KFD_CTXID0_DOORBELL_ID_MASK 0x0003ff
+
+#define KFD_CTXID0_TRAP_CODE(ctxid0) (((ctxid0) & \
+ KFD_CTXID0_TRAP_CODE_MASK) >> \
+ KFD_CTXID0_TRAP_CODE_SHIFT)
+#define KFD_CTXID0_CP_BAD_OP_ECODE(ctxid0) (((ctxid0) & \
+ KFD_CTXID0_CP_BAD_OP_ECODE_MASK) >> \
+ KFD_CTXID0_TRAP_CODE_SHIFT)
+#define KFD_CTXID0_DOORBELL_ID(ctxid0) ((ctxid0) & \
+ KFD_CTXID0_DOORBELL_ID_MASK)
+
+static void print_sq_intr_info_auto(struct kfd_node *dev, uint32_t context_id0, uint32_t context_id1)
+{
+ dev_dbg_ratelimited(
+ dev->adev->dev,
+ "sq_intr: auto, ttrace %d, wlt %d, ttrace_buf0_full %d, ttrace_buf1_full %d ttrace_utc_err %d\n",
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, WLT),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_BUF0_FULL),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_BUF1_FULL),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_UTC_ERROR));
+}
+
+static void print_sq_intr_info_inst(struct kfd_node *dev, uint32_t context_id0, uint32_t context_id1)
+{
+ dev_dbg_ratelimited(
+ dev->adev->dev,
+ "sq_intr: inst, data 0x%08x, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, DATA),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, SA_ID),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, WAVE_ID),
+ REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, SIMD_ID),
+ REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, WGP_ID));
+}
+
+static void print_sq_intr_info_error(struct kfd_node *dev, uint32_t context_id0, uint32_t context_id1)
+{
+ dev_warn_ratelimited(
+ dev->adev->dev,
+ "sq_intr: error, detail 0x%08x, type %d, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, DETAIL),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, SA_ID),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, PRIV),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, WAVE_ID),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, SIMD_ID),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, WGP_ID));
+}
+
+static void event_interrupt_poison_consumption_v12_1(struct kfd_node *node,
+ uint16_t pasid, uint16_t source_id)
+{
+ enum amdgpu_ras_block block = 0;
+ int ret = -EINVAL;
+ uint32_t reset = 0;
+ u64 event_id = RAS_EVENT_INVALID_ID;
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
+
+ if (!p)
+ return;
+
+ /* all queues of a process will be unmapped in one time */
+ if (atomic_read(&p->poison)) {
+ kfd_unref_process(p);
+ return;
+ }
+
+ atomic_set(&p->poison, 1);
+ kfd_unref_process(p);
+
+ switch (source_id) {
+ case SOC15_INTSRC_SQ_INTERRUPT_MSG:
+ if (node->dqm->ops.reset_queues)
+ ret = node->dqm->ops.reset_queues(node->dqm, pasid);
+ block = AMDGPU_RAS_BLOCK__GFX;
+ if (ret)
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ break;
+ case SOC21_INTSRC_SDMA_ECC:
+ default:
+ block = AMDGPU_RAS_BLOCK__GFX;
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ break;
+ }
+
+ kfd_signal_poison_consumed_event(node, pasid);
+
+ /*
+ * resetting queue passes, do page retirement without gpu reset
+ * resetting queue fails, fallback to gpu reset solution
+ */
+ if (amdgpu_uniras_enabled(node->adev))
+ event_id = amdgpu_ras_mgr_gen_ras_event_seqno(node->adev,
+ RAS_SEQNO_TYPE_POISON_CONSUMPTION);
+
+ RAS_EVENT_LOG(node->adev, event_id,
+ "poison is consumed by source %d, kick off gpu reset flow\n", source_id);
+
+ amdgpu_amdkfd_ras_pasid_poison_consumption_handler(node->adev,
+ block, pasid, NULL, NULL, reset);
+}
+
+static bool event_interrupt_isr_v12_1(struct kfd_node *node,
+ const uint32_t *ih_ring_entry,
+ uint32_t *patched_ihre,
+ bool *patched_flag)
+{
+ uint16_t source_id, client_id, pasid, vmid, node_id;
+ const uint32_t *data = ih_ring_entry;
+ uint32_t context_id0;
+
+ node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
+ vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
+
+ if (!kfd_irq_is_from_node(node, node_id, vmid)) {
+ dev_dbg_ratelimited(node->adev->dev,
+ "Interrupt not for Node, node_id: %d, vmid: %d\n", node_id, vmid);
+ return false;
+ }
+
+ source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+ client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+
+ /* Only handle interrupts from KFD VMIDs */
+ if (!KFD_IRQ_IS_FENCE(client_id, source_id) &&
+ (vmid < node->vm_info.first_vmid_kfd ||
+ vmid > node->vm_info.last_vmid_kfd))
+ return false;
+
+ pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+ context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
+
+ if ((source_id == SOC15_INTSRC_CP_END_OF_PIPE) &&
+ (context_id0 & AMDGPU_FENCE_MES_QUEUE_FLAG))
+ return false;
+
+ dev_dbg(node->adev->dev, "client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
+ client_id, source_id, vmid, pasid);
+ dev_dbg(node->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
+ data[0], data[1], data[2], data[3],
+ data[4], data[5], data[6], data[7]);
+
+ /* If there is no valid PASID, it's likely a bug */
+ if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt"))
+ return false;
+
+ /* Interrupt types we care about: various signals and faults.
+ * They will be forwarded to a work queue (see below).
+ */
+ return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
+ source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
+ source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
+ source_id == SOC21_INTSRC_SDMA_TRAP ||
+ KFD_IRQ_IS_FENCE(client_id, source_id) ||
+ ((client_id == SOC21_IH_CLIENTID_VMC ||
+ client_id == SOC21_IH_CLIENTID_UTCL2) &&
+ !amdgpu_no_queue_eviction_on_vm_fault);
+}
+
+static void event_interrupt_wq_v12_1(struct kfd_node *node,
+ const uint32_t *ih_ring_entry)
+{
+ uint16_t source_id, client_id, ring_id, pasid, vmid;
+ uint32_t context_id0, context_id1;
+ uint8_t sq_int_enc, sq_int_priv, sq_int_errtype;
+ struct kfd_vm_fault_info info = {0};
+ struct kfd_hsa_memory_exception_data exception_data;
+
+ source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+ client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+ ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+ pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+ vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
+ context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
+ context_id1 = SOC15_CONTEXT_ID1_FROM_IH_ENTRY(ih_ring_entry);
+
+ /* VMC, UTCL2 */
+ if (client_id == SOC21_IH_CLIENTID_VMC ||
+ client_id == SOC21_IH_CLIENTID_UTCL2) {
+ info.vmid = vmid;
+ info.mc_id = client_id;
+ info.page_addr = ih_ring_entry[4] |
+ (uint64_t)(ih_ring_entry[5] & 0xf) << 32;
+ info.prot_valid = ring_id & 0x08;
+ info.prot_read = ring_id & 0x10;
+ info.prot_write = ring_id & 0x20;
+
+ memset(&exception_data, 0, sizeof(exception_data));
+ exception_data.gpu_id = node->id;
+ exception_data.va = (info.page_addr) << PAGE_SHIFT;
+ exception_data.failure.NotPresent = info.prot_valid ? 1 : 0;
+ exception_data.failure.NoExecute = info.prot_exec ? 1 : 0;
+ exception_data.failure.ReadOnly = info.prot_write ? 1 : 0;
+ exception_data.failure.imprecise = 0;
+
+ kfd_set_dbg_ev_from_interrupt(node, pasid, -1,
+ KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION),
+ &exception_data, sizeof(exception_data));
+ kfd_smi_event_update_vmfault(node, pasid);
+
+ /* GRBM, SDMA, SE, PMM */
+ } else if (client_id == SOC21_IH_CLIENTID_GRBM_CP ||
+ client_id == SOC21_IH_CLIENTID_GFX) {
+
+ /* CP */
+ if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
+ kfd_signal_event_interrupt(pasid, context_id0, 32, false);
+ else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
+ KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) {
+ u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0);
+
+ kfd_set_dbg_ev_from_interrupt(node, pasid, doorbell_id,
+ KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)),
+ NULL, 0);
+ kfd_dqm_suspend_bad_queue_mes(node, pasid, doorbell_id);
+ }
+
+ /* SDMA */
+ else if (source_id == SOC21_INTSRC_SDMA_TRAP)
+ kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28, true);
+ else if (source_id == SOC21_INTSRC_SDMA_ECC) {
+ event_interrupt_poison_consumption_v12_1(node, pasid, source_id);
+ return;
+ }
+
+ /* SQ */
+ else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) {
+ sq_int_enc = REG_GET_FIELD(context_id1,
+ SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING);
+ switch (sq_int_enc) {
+ case SQ_INTERRUPT_WORD_ENCODING_AUTO:
+ print_sq_intr_info_auto(node, context_id0, context_id1);
+ break;
+ case SQ_INTERRUPT_WORD_ENCODING_INST:
+ print_sq_intr_info_inst(node, context_id0, context_id1);
+ sq_int_priv = REG_GET_FIELD(context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV);
+ if (sq_int_priv && (kfd_set_dbg_ev_from_interrupt(node, pasid,
+ KFD_CTXID0_DOORBELL_ID(context_id0),
+ KFD_CTXID0_TRAP_CODE(context_id0),
+ NULL, 0)))
+ return;
+ break;
+ case SQ_INTERRUPT_WORD_ENCODING_ERROR:
+ print_sq_intr_info_error(node, context_id0, context_id1);
+ sq_int_errtype = REG_GET_FIELD(context_id0,
+ SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE);
+ if (sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
+ sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
+ event_interrupt_poison_consumption_v12_1(
+ node, pasid, source_id);
+ return;
+ }
+ break;
+ default:
+ break;
+ }
+ kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24, true);
+ }
+
+ } else if (KFD_IRQ_IS_FENCE(client_id, source_id)) {
+ kfd_process_close_interrupt_drain(pasid);
+ }
+}
+
+const struct kfd_event_interrupt_class event_interrupt_class_v12_1 = {
+ .interrupt_isr = event_interrupt_isr_v12_1,
+ .interrupt_wq = event_interrupt_wq_v12_1,
+};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 0cb5c582ce7d..1688d8e595f2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -28,6 +28,7 @@
#include "kfd_device_queue_manager.h"
#include "kfd_smi_events.h"
#include "amdgpu_ras.h"
+#include "amdgpu_ras_mgr.h"
/*
* GFX9 SQ Interrupts
@@ -146,7 +147,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
{
enum amdgpu_ras_block block = 0;
uint32_t reset = 0;
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
u64 event_id;
int old_poison, ret;
@@ -228,7 +229,11 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
kfd_signal_poison_consumed_event(dev, pasid);
- event_id = amdgpu_ras_acquire_event_id(dev->adev, type);
+ if (amdgpu_uniras_enabled(dev->adev))
+ event_id = amdgpu_ras_mgr_gen_ras_event_seqno(dev->adev,
+ RAS_SEQNO_TYPE_POISON_CONSUMPTION);
+ else
+ event_id = amdgpu_ras_acquire_event_id(dev->adev, type);
RAS_EVENT_LOG(dev->adev, event_id,
"poison is consumed by client %d, kick off gpu reset flow\n", client_id);
@@ -314,11 +319,12 @@ static bool event_interrupt_isr_v9(struct kfd_node *dev,
& ~pasid_mask) | pasid);
}
- pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
- client_id, source_id, vmid, pasid);
- pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
- data[0], data[1], data[2], data[3],
- data[4], data[5], data[6], data[7]);
+ dev_dbg(dev->adev->dev,
+ "client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
+ client_id, source_id, vmid, pasid);
+ dev_dbg(dev->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
+ data[0], data[1], data[2], data[3], data[4], data[5], data[6],
+ data[7]);
/* If there is no valid PASID, it's likely a bug */
if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt"))
@@ -373,34 +379,88 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
client_id == SOC15_IH_CLIENTID_SE2SH ||
client_id == SOC15_IH_CLIENTID_SE3SH) {
if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
- kfd_signal_event_interrupt(pasid, context_id0, 32);
+ kfd_signal_event_interrupt(pasid, context_id0, 32, true);
else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) {
sq_int_data = KFD_CONTEXT_ID_GET_SQ_INT_DATA(context_id0, context_id1);
encoding = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, ENCODING);
switch (encoding) {
case SQ_INTERRUPT_WORD_ENCODING_AUTO:
- pr_debug_ratelimited(
+ dev_dbg_ratelimited(
+ dev->adev->dev,
"sq_intr: auto, se %d, ttrace %d, wlt %d, ttrac_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n",
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, SE_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, WLT),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE_BUF_FULL),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, REG_TIMESTAMP),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, CMD_TIMESTAMP),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, HOST_CMD_OVERFLOW),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, HOST_REG_OVERFLOW),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, IMMED_OVERFLOW),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE_UTC_ERROR));
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID,
+ SE_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID,
+ THREAD_TRACE),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID,
+ WLT),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID,
+ THREAD_TRACE_BUF_FULL),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID,
+ REG_TIMESTAMP),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID,
+ CMD_TIMESTAMP),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID,
+ HOST_CMD_OVERFLOW),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID,
+ HOST_REG_OVERFLOW),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID,
+ IMMED_OVERFLOW),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_AUTO_CTXID,
+ THREAD_TRACE_UTC_ERROR));
break;
case SQ_INTERRUPT_WORD_ENCODING_INST:
- pr_debug_ratelimited("sq_intr: inst, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, intr_data 0x%x\n",
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, PRIV),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, WAVE_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID),
+ dev_dbg_ratelimited(
+ dev->adev->dev,
+ "sq_intr: inst, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, intr_data 0x%x\n",
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ SE_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ DATA),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ SH_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ PRIV),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ WAVE_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ SIMD_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ CU_ID),
sq_int_data);
if (context_id0 & SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK) {
if (kfd_set_dbg_ev_from_interrupt(dev, pasid,
@@ -412,14 +472,37 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
break;
case SQ_INTERRUPT_WORD_ENCODING_ERROR:
sq_intr_err = REG_GET_FIELD(sq_int_data, KFD_SQ_INT_DATA, ERR_TYPE);
- pr_warn_ratelimited("sq_intr: error, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, err_type %d\n",
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, PRIV),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, WAVE_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID),
+ dev_warn_ratelimited(
+ dev->adev->dev,
+ "sq_intr: error, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, err_type %d\n",
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ SE_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ DATA),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ SH_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ PRIV),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ WAVE_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ SIMD_ID),
+ REG_GET_FIELD(
+ context_id0,
+ SQ_INTERRUPT_WORD_WAVE_CTXID,
+ CU_ID),
sq_intr_err);
if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
@@ -430,7 +513,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
default:
break;
}
- kfd_signal_event_interrupt(pasid, sq_int_data, 24);
+ kfd_signal_event_interrupt(pasid, sq_int_data, 24, true);
} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) {
kfd_set_dbg_ev_from_interrupt(dev, pasid,
@@ -447,7 +530,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
client_id == SOC15_IH_CLIENTID_SDMA6 ||
client_id == SOC15_IH_CLIENTID_SDMA7) {
if (source_id == SOC15_INTSRC_SDMA_TRAP) {
- kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
+ kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28, true);
} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
event_interrupt_poison_consumption_v9(dev, pasid, client_id);
return;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 2b0a830f5b29..3ffa081daaec 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -46,11 +46,7 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev,
int retval;
union PM4_MES_TYPE_3_HEADER nop;
- if (WARN_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ))
- return false;
-
- pr_debug("Initializing queue type %d size %d\n", KFD_QUEUE_TYPE_HIQ,
- queue_size);
+ pr_debug("Initializing queue type %d size %d\n", type, queue_size);
memset(&prop, 0, sizeof(prop));
memset(&nop, 0, sizeof(nop));
@@ -61,18 +57,7 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev,
kq->dev = dev;
kq->nop_packet = nop.u32all;
- switch (type) {
- case KFD_QUEUE_TYPE_DIQ:
- kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_DIQ];
- break;
- case KFD_QUEUE_TYPE_HIQ:
- kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
- break;
- default:
- dev_err(dev->adev->dev, "Invalid queue type %d\n", type);
- return false;
- }
-
+ kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
if (!kq->mqd_mgr)
return false;
@@ -144,9 +129,8 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev,
goto err_init_queue;
kq->queue->device = dev;
- kq->queue->process = kfd_get_process(current);
- kq->queue->mqd_mem_obj = kq->mqd_mgr->allocate_mqd(kq->mqd_mgr->dev,
+ kq->queue->mqd_mem_obj = kq->mqd_mgr->allocate_mqd(kq->mqd_mgr,
&kq->queue->properties);
if (!kq->queue->mqd_mem_obj)
goto err_allocate_mqd;
@@ -162,24 +146,11 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev,
kq->mqd_mgr->load_mqd(kq->mqd_mgr, kq->queue->mqd,
kq->queue->pipe, kq->queue->queue,
&kq->queue->properties, NULL);
- } else {
- /* allocate fence for DIQ */
-
- retval = kfd_gtt_sa_allocate(dev, sizeof(uint32_t),
- &kq->fence_mem_obj);
-
- if (retval != 0)
- goto err_alloc_fence;
-
- kq->fence_kernel_address = kq->fence_mem_obj->cpu_ptr;
- kq->fence_gpu_addr = kq->fence_mem_obj->gpu_addr;
}
print_queue(kq->queue);
return true;
-err_alloc_fence:
- kq->mqd_mgr->free_mqd(kq->mqd_mgr, kq->queue->mqd, kq->queue->mqd_mem_obj);
err_allocate_mqd:
uninit_queue(kq->queue);
err_init_queue:
@@ -209,8 +180,6 @@ static void kq_uninitialize(struct kernel_queue *kq)
kq->queue->queue);
up_read(&kq->dev->adev->reset_domain->sem);
}
- else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ)
- kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj);
kq->mqd_mgr->free_mqd(kq->mqd_mgr, kq->queue->mqd,
kq->queue->mqd_mem_obj);
@@ -259,7 +228,7 @@ int kq_acquire_packet_buffer(struct kernel_queue *kq,
if (packet_size_in_dwords > available_size) {
/*
* make sure calling functions know
- * acquire_packet_buffer() failed
+ * kq_acquire_packet_buffer() failed
*/
goto err_no_space;
}
@@ -340,7 +309,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_node *dev,
{
struct kernel_queue *kq;
- kq = kzalloc(sizeof(*kq), GFP_KERNEL);
+ kq = kzalloc_obj(*kq);
if (!kq)
return NULL;
@@ -358,34 +327,3 @@ void kernel_queue_uninit(struct kernel_queue *kq)
kq_uninitialize(kq);
kfree(kq);
}
-
-/* FIXME: Can this test be removed? */
-static __attribute__((unused)) void test_kq(struct kfd_node *dev)
-{
- struct kernel_queue *kq;
- uint32_t *buffer, i;
- int retval;
-
- dev_err(dev->adev->dev, "Starting kernel queue test\n");
-
- kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ);
- if (unlikely(!kq)) {
- dev_err(dev->adev->dev, " Failed to initialize HIQ\n");
- dev_err(dev->adev->dev, "Kernel queue test failed\n");
- return;
- }
-
- retval = kq_acquire_packet_buffer(kq, 5, &buffer);
- if (unlikely(retval != 0)) {
- dev_err(dev->adev->dev, " Failed to acquire packet buffer\n");
- dev_err(dev->adev->dev, "Kernel queue test failed\n");
- return;
- }
- for (i = 0; i < 5; i++)
- buffer[i] = kq->nop_packet;
- kq_submit_packet(kq);
-
- dev_err(dev->adev->dev, "Ending kernel queue test\n");
-}
-
-
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index d05d199b5e44..964efa325908 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -21,7 +21,6 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <linux/types.h>
-#include <linux/hmm.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/migrate.h>
@@ -39,36 +38,38 @@
#endif
#define dev_fmt(fmt) "kfd_migrate: " fmt
-static uint64_t
-svm_migrate_direct_mapping_addr(struct amdgpu_device *adev, uint64_t addr)
+static u64
+svm_migrate_direct_mapping_addr(struct amdgpu_device *adev, u64 addr)
{
return addr + amdgpu_ttm_domain_start(adev, TTM_PL_VRAM);
}
static int
-svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages,
- dma_addr_t *addr, uint64_t *gart_addr, uint64_t flags)
+svm_migrate_gart_map(struct amdgpu_ring *ring,
+ struct amdgpu_ttm_buffer_entity *entity,
+ u64 npages,
+ dma_addr_t *addr, u64 *gart_addr, u64 flags)
{
struct amdgpu_device *adev = ring->adev;
struct amdgpu_job *job;
unsigned int num_dw, num_bytes;
struct dma_fence *fence;
- uint64_t src_addr, dst_addr;
- uint64_t pte_flags;
+ u64 src_addr, dst_addr;
+ u64 pte_flags;
void *cpu_addr;
int r;
- /* use gart window 0 */
- *gart_addr = adev->gmc.gart_start;
+ *gart_addr = amdgpu_compute_gart_address(&adev->gmc, entity, 0);
num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8);
- num_bytes = npages * 8;
+ num_bytes = npages * 8 * AMDGPU_GPU_PAGES_IN_CPU_PAGE;
- r = amdgpu_job_alloc_with_ib(adev, &adev->mman.high_pr,
+ r = amdgpu_job_alloc_with_ib(adev, &entity->base,
AMDGPU_FENCE_OWNER_UNDEFINED,
num_dw * 4 + num_bytes,
AMDGPU_IB_POOL_DELAYED,
- &job);
+ &job,
+ AMDGPU_KERNEL_JOB_ID_KFD_GART_MAP);
if (r)
return r;
@@ -76,6 +77,7 @@ svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages,
src_addr += job->ibs[0].gpu_addr;
dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
+ dst_addr += (entity->gart_window_offs[0] >> AMDGPU_GPU_PAGE_SHIFT) * 8;
amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr,
dst_addr, num_bytes, 0);
@@ -114,7 +116,7 @@ svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages,
* multiple GTT_MAX_PAGES transfer, all sdma operations are serialized, wait for
* the last sdma finish fence which is returned to check copy memory is done.
*
- * Context: Process context, takes and releases gtt_window_lock
+ * Context: Process context
*
* Return:
* 0 - OK, otherwise error code
@@ -122,28 +124,31 @@ svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages,
static int
svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys,
- uint64_t *vram, uint64_t npages,
+ u64 *vram, u64 npages,
enum MIGRATION_COPY_DIR direction,
struct dma_fence **mfence)
{
- const uint64_t GTT_MAX_PAGES = AMDGPU_GTT_MAX_TRANSFER_SIZE;
+ const u64 GTT_MAX_PAGES = AMDGPU_GTT_MAX_TRANSFER_SIZE;
struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
- uint64_t gart_s, gart_d;
+ struct amdgpu_ttm_buffer_entity *entity;
+ u64 gart_s, gart_d;
struct dma_fence *next;
- uint64_t size;
+ u64 size;
int r;
- mutex_lock(&adev->mman.gtt_window_lock);
+ entity = &adev->mman.move_entities[0];
+
+ mutex_lock(&entity->lock);
while (npages) {
size = min(GTT_MAX_PAGES, npages);
if (direction == FROM_VRAM_TO_RAM) {
gart_s = svm_migrate_direct_mapping_addr(adev, *vram);
- r = svm_migrate_gart_map(ring, size, sys, &gart_d, 0);
+ r = svm_migrate_gart_map(ring, entity, size, sys, &gart_d, 0);
} else if (direction == FROM_RAM_TO_VRAM) {
- r = svm_migrate_gart_map(ring, size, sys, &gart_s,
+ r = svm_migrate_gart_map(ring, entity, size, sys, &gart_s,
KFD_IOCTL_SVM_FLAG_GPU_RO);
gart_d = svm_migrate_direct_mapping_addr(adev, *vram);
}
@@ -152,8 +157,9 @@ svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys,
goto out_unlock;
}
- r = amdgpu_copy_buffer(ring, gart_s, gart_d, size * PAGE_SIZE,
- NULL, &next, false, true, 0);
+ r = amdgpu_copy_buffer(adev, entity,
+ gart_s, gart_d, size * PAGE_SIZE,
+ NULL, &next, true, 0);
if (r) {
dev_err(adev->dev, "fail %d to copy memory\n", r);
goto out_unlock;
@@ -169,7 +175,7 @@ svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys,
}
out_unlock:
- mutex_unlock(&adev->mman.gtt_window_lock);
+ mutex_unlock(&entity->lock);
return r;
}
@@ -217,7 +223,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
page = pfn_to_page(pfn);
svm_range_bo_ref(prange->svm_bo);
page->zone_device_data = prange->svm_bo;
- zone_device_page_init(page);
+ zone_device_page_init(page, page_pgmap(page), 0);
}
static void
@@ -260,39 +266,39 @@ static void svm_migrate_put_sys_page(unsigned long addr)
put_page(page);
}
-static unsigned long svm_migrate_unsuccessful_pages(struct migrate_vma *migrate)
+static unsigned long svm_migrate_successful_pages(struct migrate_vma *migrate)
{
- unsigned long upages = 0;
+ unsigned long mpages = 0;
unsigned long i;
for (i = 0; i < migrate->npages; i++) {
- if (migrate->src[i] & MIGRATE_PFN_VALID &&
- !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
- upages++;
+ if (migrate->dst[i] & MIGRATE_PFN_VALID &&
+ migrate->src[i] & MIGRATE_PFN_MIGRATE)
+ mpages++;
}
- return upages;
+ return mpages;
}
static int
svm_migrate_copy_to_vram(struct kfd_node *node, struct svm_range *prange,
struct migrate_vma *migrate, struct dma_fence **mfence,
- dma_addr_t *scratch, uint64_t ttm_res_offset)
+ dma_addr_t *scratch, u64 ttm_res_offset)
{
- uint64_t npages = migrate->npages;
+ u64 npages = migrate->npages;
struct amdgpu_device *adev = node->adev;
struct device *dev = adev->dev;
struct amdgpu_res_cursor cursor;
- uint64_t mpages = 0;
+ u64 mpages = 0;
dma_addr_t *src;
- uint64_t *dst;
- uint64_t i, j;
+ u64 *dst;
+ u64 i, j;
int r;
pr_debug("svms 0x%p [0x%lx 0x%lx 0x%llx]\n", prange->svms, prange->start,
prange->last, ttm_res_offset);
src = scratch;
- dst = (uint64_t *)(scratch + npages);
+ dst = (u64 *)(scratch + npages);
amdgpu_res_first(prange->ttm_res, ttm_res_offset,
npages << PAGE_SHIFT, &cursor);
@@ -385,11 +391,11 @@ out_free_vram_pages:
static long
svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange,
- struct vm_area_struct *vma, uint64_t start,
- uint64_t end, uint32_t trigger, uint64_t ttm_res_offset)
+ struct vm_area_struct *vma, u64 start,
+ u64 end, uint32_t trigger, u64 ttm_res_offset)
{
struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
- uint64_t npages = (end - start) >> PAGE_SHIFT;
+ u64 npages = (end - start) >> PAGE_SHIFT;
struct amdgpu_device *adev = node->adev;
struct kfd_process_device *pdd;
struct dma_fence *mfence = NULL;
@@ -408,7 +414,7 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange,
migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
buf = kvcalloc(npages,
- 2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t),
+ 2 * sizeof(*migrate.src) + sizeof(u64) + sizeof(dma_addr_t),
GFP_KERNEL);
if (!buf)
goto out;
@@ -447,9 +453,9 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange,
svm_migrate_copy_done(adev, mfence);
migrate_vma_finalize(&migrate);
- mpages = cpages - svm_migrate_unsuccessful_pages(&migrate);
- pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
- mpages, cpages, migrate.npages);
+ mpages = svm_migrate_successful_pages(&migrate);
+ pr_debug("migrated/collected/requested 0x%lx/0x%lx/0x%lx\n",
+ mpages, cpages, migrate.npages);
svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages);
@@ -490,7 +496,7 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
{
unsigned long addr, start, end;
struct vm_area_struct *vma;
- uint64_t ttm_res_offset;
+ u64 ttm_res_offset;
struct kfd_node *node;
unsigned long mpages = 0;
long r = 0;
@@ -567,8 +573,9 @@ out:
return r < 0 ? r : 0;
}
-static void svm_migrate_page_free(struct page *page)
+static void svm_migrate_folio_free(struct folio *folio)
{
+ struct page *page = &folio->page;
struct svm_range_bo *svm_bo = page->zone_device_data;
if (svm_bo) {
@@ -580,14 +587,14 @@ static void svm_migrate_page_free(struct page *page)
static int
svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
struct migrate_vma *migrate, struct dma_fence **mfence,
- dma_addr_t *scratch, uint64_t npages)
+ dma_addr_t *scratch, u64 npages)
{
struct device *dev = adev->dev;
- uint64_t *src;
+ u64 *src;
dma_addr_t *dst;
struct page *dpage;
- uint64_t i = 0, j;
- uint64_t addr;
+ u64 i = 0, j;
+ u64 addr;
int r = 0;
pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start,
@@ -595,7 +602,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
addr = migrate->start;
- src = (uint64_t *)(scratch + npages);
+ src = (u64 *)(scratch + npages);
dst = scratch;
for (i = 0, j = 0; i < npages; i++, addr += PAGE_SIZE) {
@@ -683,12 +690,11 @@ out_oom:
*/
static long
svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange,
- struct vm_area_struct *vma, uint64_t start, uint64_t end,
+ struct vm_area_struct *vma, u64 start, u64 end,
uint32_t trigger, struct page *fault_page)
{
struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
- uint64_t npages = (end - start) >> PAGE_SHIFT;
- unsigned long upages = npages;
+ u64 npages = (end - start) >> PAGE_SHIFT;
unsigned long cpages = 0;
unsigned long mpages = 0;
struct amdgpu_device *adev = node->adev;
@@ -710,7 +716,7 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange,
migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
buf = kvcalloc(npages,
- 2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t),
+ 2 * sizeof(*migrate.src) + sizeof(u64) + sizeof(dma_addr_t),
GFP_KERNEL);
if (!buf)
goto out;
@@ -736,7 +742,6 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange,
if (!cpages) {
pr_debug("failed collect migrate device pages [0x%lx 0x%lx]\n",
prange->start, prange->last);
- upages = svm_migrate_unsuccessful_pages(&migrate);
goto out_free;
}
if (cpages != npages)
@@ -749,9 +754,9 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange,
scratch, npages);
migrate_vma_pages(&migrate);
- upages = svm_migrate_unsuccessful_pages(&migrate);
- pr_debug("unsuccessful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
- upages, cpages, migrate.npages);
+ mpages = svm_migrate_successful_pages(&migrate);
+ pr_debug("migrated/collected/requested 0x%lx/0x%lx/0x%lx\n",
+ mpages, cpages, migrate.npages);
svm_migrate_copy_done(adev, mfence);
migrate_vma_finalize(&migrate);
@@ -764,8 +769,7 @@ out_free:
start >> PAGE_SHIFT, end >> PAGE_SHIFT,
node->id, 0, trigger, r);
out:
- if (!r && cpages) {
- mpages = cpages - upages;
+ if (!r && mpages) {
pdd = svm_range_get_pdd_by_node(prange, node);
if (pdd)
WRITE_ONCE(pdd->page_out, pdd->page_out + mpages);
@@ -848,6 +852,9 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
}
if (r >= 0) {
+ WARN_ONCE(prange->vram_pages < mpages,
+ "Recorded vram pages(0x%llx) should not be less than migration pages(0x%lx).",
+ prange->vram_pages, mpages);
prange->vram_pages -= mpages;
/* prange does not have vram page set its actual_loc to system
@@ -1008,7 +1015,7 @@ out_mmput:
}
static const struct dev_pagemap_ops svm_migrate_pgmap_ops = {
- .page_free = svm_migrate_page_free,
+ .folio_free = svm_migrate_folio_free,
.migrate_to_ram = svm_migrate_to_ram,
};
@@ -1027,7 +1034,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1))
return -EINVAL;
- if (adev->flags & AMD_IS_APU)
+ if (adev->apu_prefer_gtt)
return 0;
pgmap = &kfddev->pgmap;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
index 2eebf67f9c2c..2b7fd442d29c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
@@ -31,7 +31,6 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>
-#include <linux/hmm.h>
#include "kfd_priv.h"
#include "kfd_svm.h"
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index aee2212e52f6..33aa23450b3f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -78,8 +78,8 @@ err_ioctl:
static void kfd_exit(void)
{
kfd_cleanup_processes();
- kfd_debugfs_fini();
kfd_process_destroy_wq();
+ kfd_debugfs_fini();
kfd_procfs_shutdown();
kfd_topology_shutdown();
kfd_chardev_exit();
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index d9ae854b6908..723b725d20b8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -46,28 +46,30 @@ int pipe_priority_map[] = {
KFD_PIPE_PRIORITY_CS_HIGH
};
-struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_node *dev, struct queue_properties *q)
+struct kfd_mem_obj *allocate_hiq_mqd(struct mqd_manager *mm, struct queue_properties *q)
{
struct kfd_mem_obj *mqd_mem_obj;
+ struct kfd_node *dev = mm->dev;
- mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
+ mqd_mem_obj = kzalloc_obj(struct kfd_mem_obj);
if (!mqd_mem_obj)
return NULL;
- mqd_mem_obj->gtt_mem = dev->dqm->hiq_sdma_mqd.gtt_mem;
+ mqd_mem_obj->mem = dev->dqm->hiq_sdma_mqd.mem;
mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr;
mqd_mem_obj->cpu_ptr = dev->dqm->hiq_sdma_mqd.cpu_ptr;
return mqd_mem_obj;
}
-struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_node *dev,
+struct kfd_mem_obj *allocate_sdma_mqd(struct mqd_manager *mm,
struct queue_properties *q)
{
struct kfd_mem_obj *mqd_mem_obj;
+ struct kfd_node *dev = mm->dev;
uint64_t offset;
- mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
+ mqd_mem_obj = kzalloc_obj(struct kfd_mem_obj);
if (!mqd_mem_obj)
return NULL;
@@ -79,7 +81,7 @@ struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_node *dev,
offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size *
NUM_XCC(dev->xcc_mask);
- mqd_mem_obj->gtt_mem = (void *)((uint64_t)dev->dqm->hiq_sdma_mqd.gtt_mem
+ mqd_mem_obj->mem = (void *)((uint64_t)dev->dqm->hiq_sdma_mqd.mem
+ offset);
mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr + offset;
mqd_mem_obj->cpu_ptr = (uint32_t *)((uint64_t)
@@ -91,7 +93,7 @@ struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_node *dev,
void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd,
struct kfd_mem_obj *mqd_mem_obj)
{
- WARN_ON(!mqd_mem_obj->gtt_mem);
+ WARN_ON(!mqd_mem_obj->mem);
kfree(mqd_mem_obj);
}
@@ -224,8 +226,8 @@ int kfd_destroy_mqd_cp(struct mqd_manager *mm, void *mqd,
void kfd_free_mqd_cp(struct mqd_manager *mm, void *mqd,
struct kfd_mem_obj *mqd_mem_obj)
{
- if (mqd_mem_obj->gtt_mem) {
- amdgpu_amdkfd_free_gtt_mem(mm->dev->adev, &mqd_mem_obj->gtt_mem);
+ if (mqd_mem_obj->mem) {
+ amdgpu_amdkfd_free_kernel_mem(mm->dev->adev, &mqd_mem_obj->mem);
kfree(mqd_mem_obj);
} else {
kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
@@ -280,8 +282,8 @@ void kfd_get_hiq_xcc_mqd(struct kfd_node *dev, struct kfd_mem_obj *mqd_mem_obj,
offset = kfd_hiq_mqd_stride(dev) * virtual_xcc_id;
- mqd_mem_obj->gtt_mem = (virtual_xcc_id == 0) ?
- dev->dqm->hiq_sdma_mqd.gtt_mem : NULL;
+ mqd_mem_obj->mem = (virtual_xcc_id == 0) ?
+ dev->dqm->hiq_sdma_mqd.mem : NULL;
mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr + offset;
mqd_mem_obj->cpu_ptr = (uint32_t *)((uintptr_t)
dev->dqm->hiq_sdma_mqd.cpu_ptr + offset);
@@ -290,6 +292,9 @@ void kfd_get_hiq_xcc_mqd(struct kfd_node *dev, struct kfd_mem_obj *mqd_mem_obj,
uint64_t kfd_mqd_stride(struct mqd_manager *mm,
struct queue_properties *q)
{
+ if (KFD_GC_VERSION(mm->dev) >= IP_VERSION(11, 0, 0))
+ return AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size);
+
return mm->mqd_size;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
index 17cc1f25c8d0..06ca6235ff1b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
@@ -68,7 +68,7 @@
*/
extern int pipe_priority_map[];
struct mqd_manager {
- struct kfd_mem_obj* (*allocate_mqd)(struct kfd_node *kfd,
+ struct kfd_mem_obj* (*allocate_mqd)(struct mqd_manager *mm,
struct queue_properties *q);
void (*init_mqd)(struct mqd_manager *mm, void **mqd,
@@ -102,7 +102,8 @@ struct mqd_manager {
u32 *ctl_stack_used_size,
u32 *save_area_used_size);
- void (*get_checkpoint_info)(struct mqd_manager *mm, void *mqd, uint32_t *ctl_stack_size);
+ int (*get_checkpoint_info)(struct mqd_manager *mm, void *mqd,
+ uint32_t *ctl_stack_size);
void (*checkpoint_mqd)(struct mqd_manager *mm,
void *mqd,
@@ -153,10 +154,10 @@ struct mqd_user_context_save_area_header {
uint32_t wave_state_size;
};
-struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_node *dev,
+struct kfd_mem_obj *allocate_hiq_mqd(struct mqd_manager *mm,
struct queue_properties *q);
-struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_node *dev,
+struct kfd_mem_obj *allocate_sdma_mqd(struct mqd_manager *mm,
struct queue_properties *q);
void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd,
struct kfd_mem_obj *mqd_mem_obj);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index 05f3ac2eaef9..bb70e57ae4d5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -70,12 +70,12 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
static void set_priority(struct cik_mqd *m, struct queue_properties *q)
{
m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
- m->cp_hqd_queue_priority = q->priority;
}
-static struct kfd_mem_obj *allocate_mqd(struct kfd_node *kfd,
+static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm,
struct queue_properties *q)
{
+ struct kfd_node *kfd = mm->dev;
struct kfd_mem_obj *mqd_mem_obj;
if (kfd_gtt_sa_allocate(kfd, sizeof(struct cik_mqd),
@@ -388,7 +388,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
return NULL;
- mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
+ mqd = kzalloc_obj(*mqd);
if (!mqd)
return NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
index 1695dd78ede8..77fb41e2486a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
@@ -70,12 +70,12 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
static void set_priority(struct v10_compute_mqd *m, struct queue_properties *q)
{
m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
- m->cp_hqd_queue_priority = q->priority;
}
-static struct kfd_mem_obj *allocate_mqd(struct kfd_node *kfd,
+static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm,
struct queue_properties *q)
{
+ struct kfd_node *kfd = mm->dev;
struct kfd_mem_obj *mqd_mem_obj;
if (kfd_gtt_sa_allocate(kfd, sizeof(struct v10_compute_mqd),
@@ -450,7 +450,7 @@ struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type,
if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
return NULL;
- mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
+ mqd = kzalloc_obj(*mqd);
if (!mqd)
return NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
index 3c0ae28c5923..a1e3cf2384dd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
@@ -96,25 +96,16 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
static void set_priority(struct v11_compute_mqd *m, struct queue_properties *q)
{
m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
- m->cp_hqd_queue_priority = q->priority;
}
-static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node,
+static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm,
struct queue_properties *q)
{
+ u32 mqd_size = AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size);
+ struct kfd_node *node = mm->dev;
struct kfd_mem_obj *mqd_mem_obj;
- int size;
-
- /*
- * MES write to areas beyond MQD size. So allocate
- * 1 PAGE_SIZE memory for MQD is MES is enabled.
- */
- if (node->kfd->shared_resources.enable_mes)
- size = PAGE_SIZE;
- else
- size = sizeof(struct v11_compute_mqd);
- if (kfd_gtt_sa_allocate(node, size, &mqd_mem_obj))
+ if (kfd_gtt_sa_allocate(node, mqd_size, &mqd_mem_obj))
return NULL;
return mqd_mem_obj;
@@ -126,18 +117,13 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
{
uint64_t addr;
struct v11_compute_mqd *m;
- int size;
+ u32 mqd_size = AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size);
uint32_t wa_mask = q->is_dbg_wa ? 0xffff : 0xffffffff;
m = (struct v11_compute_mqd *) mqd_mem_obj->cpu_ptr;
addr = mqd_mem_obj->gpu_addr;
- if (mm->dev->kfd->shared_resources.enable_mes)
- size = PAGE_SIZE;
- else
- size = sizeof(struct v11_compute_mqd);
-
- memset(m, 0, size);
+ memset(m, 0, mqd_size);
m->header = 0xC0310800;
m->compute_pipelinestat_enable = 1;
@@ -478,7 +464,7 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type,
if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
return NULL;
- mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
+ mqd = kzalloc_obj(*mqd);
if (!mqd)
return NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c
index 565858b9044d..b3e122d7876e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c
@@ -77,19 +77,16 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
static void set_priority(struct v12_compute_mqd *m, struct queue_properties *q)
{
m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
- m->cp_hqd_queue_priority = q->priority;
}
-static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node,
+static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm,
struct queue_properties *q)
{
+ u32 mqd_size = AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size);
+ struct kfd_node *node = mm->dev;
struct kfd_mem_obj *mqd_mem_obj;
- /*
- * Allocate one PAGE_SIZE memory for MQD as MES writes to areas beyond
- * struct MQD size.
- */
- if (kfd_gtt_sa_allocate(node, PAGE_SIZE, &mqd_mem_obj))
+ if (kfd_gtt_sa_allocate(node, mqd_size, &mqd_mem_obj))
return NULL;
return mqd_mem_obj;
@@ -101,11 +98,12 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
{
uint64_t addr;
struct v12_compute_mqd *m;
+ u32 mqd_size = AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size);
m = (struct v12_compute_mqd *) mqd_mem_obj->cpu_ptr;
addr = mqd_mem_obj->gpu_addr;
- memset(m, 0, PAGE_SIZE);
+ memset(m, 0, mqd_size);
m->header = 0xC0310800;
m->compute_pipelinestat_enable = 1;
@@ -351,6 +349,12 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
+ /* Allow context switch so we don't cross-process starve with a massive
+ * command buffer of long-running SDMA commands
+ * sdmax_rlcx_ib_cntl represent SDMA_QUEUE0_IB_CNTL register
+ */
+ m->sdmax_rlcx_ib_cntl |= SDMA0_QUEUE0_IB_CNTL__SWITCH_INSIDE_IB_MASK;
+
q->is_active = QUEUE_IS_ACTIVE(*q);
}
@@ -380,7 +384,7 @@ struct mqd_manager *mqd_manager_init_v12(enum KFD_MQD_TYPE type,
if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
return NULL;
- mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
+ mqd = kzalloc_obj(*mqd);
if (!mqd)
return NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c
new file mode 100644
index 000000000000..c90c0d99b1e3
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c
@@ -0,0 +1,724 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include "kfd_priv.h"
+#include "kfd_mqd_manager.h"
+#include "v12_structs.h"
+#include "gc/gc_12_1_0_sh_mask.h"
+#include "amdgpu_amdkfd.h"
+#include "kfd_device_queue_manager.h"
+
+static inline struct v12_1_compute_mqd *get_mqd(void *mqd)
+{
+ return (struct v12_1_compute_mqd *)mqd;
+}
+
+static inline struct v12_sdma_mqd *get_sdma_mqd(void *mqd)
+{
+ return (struct v12_sdma_mqd *)mqd;
+}
+
+static void mqd_symmetrically_map_cu_mask_v12_1(struct mqd_manager *mm,
+ const uint32_t *cu_mask, uint32_t cu_mask_count,
+ uint32_t *se_mask, uint32_t inst)
+{
+ struct amdgpu_cu_info *cu_info = &mm->dev->adev->gfx.cu_info;
+ struct amdgpu_gfx_config *gfx_info = &mm->dev->adev->gfx.config;
+ uint32_t cu_per_sh[2][2] = {0};
+ uint32_t en_mask = 0x3;
+ int i, se, sh, cu, cu_inc = 0;
+ uint32_t cu_active_per_node;
+ int inc = NUM_XCC(mm->dev->xcc_mask);
+ int xcc_inst = inst + ffs(mm->dev->xcc_mask) - 1;
+
+ cu_active_per_node = cu_info->number / mm->dev->kfd->num_nodes;
+ if (cu_mask_count > cu_active_per_node)
+ cu_mask_count = cu_active_per_node;
+
+ /*
+ * Count active CUs per SE/SH.
+ */
+ for (se = 0; se < gfx_info->max_shader_engines; se++)
+ for (sh = 0; sh < gfx_info->max_sh_per_se; sh++)
+ cu_per_sh[se][sh] = hweight32(
+ cu_info->bitmap[xcc_inst][se][sh]);
+
+ /* Symmetrically map cu_mask to all SEs & SHs:
+ * For GFX 12.1.0, the following code only looks at a
+ * subset of the cu_mask corresponding to the inst parameter.
+ * If we have n XCCs under one GPU node
+ * cu_mask[0] bit0 -> XCC0 se_mask[0] bit0 (XCC0,SE0,SH0,CU0)
+ * cu_mask[0] bit1 -> XCC1 se_mask[0] bit0 (XCC1,SE0,SH0,CU0)
+ * ..
+ * cu_mask[0] bitn -> XCCn se_mask[0] bit0 (XCCn,SE0,SH0,CU0)
+ * cu_mask[0] bit n+1 -> XCC0 se_mask[1] bit0 (XCC0,SE1,SH0,CU0)
+ *
+ * For example, if there are 6 XCCs under 1 KFD node, this code
+ * running for each inst, will look at the bits as:
+ * inst, inst + 6, inst + 12...
+ *
+ * First ensure all CUs are disabled, then enable user specified CUs.
+ */
+ for (i = 0; i < gfx_info->max_shader_engines; i++)
+ se_mask[i] = 0;
+
+ i = inst;
+ for (cu = 0; cu < 16; cu++) {
+ for (sh = 0; sh < gfx_info->max_sh_per_se; sh++) {
+ for (se = 0; se < gfx_info->max_shader_engines; se++) {
+ if (cu_per_sh[se][sh] > cu) {
+ if (cu_mask[i / 32] & (1U << (i % 32))) {
+ if (cu == 8 && sh == 0)
+ se_mask[se] |= en_mask << 30;
+ else
+ se_mask[se] |= en_mask << (cu_inc + sh * 16);
+ }
+ i += inc;
+ if (i >= cu_mask_count)
+ return;
+ }
+ }
+ }
+ cu_inc += 2;
+ }
+}
+
+static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+ struct mqd_update_info *minfo, uint32_t inst)
+{
+ struct v12_1_compute_mqd *m;
+ uint32_t se_mask[2] = {0};
+
+ if (!minfo || !minfo->cu_mask.ptr)
+ return;
+
+ mqd_symmetrically_map_cu_mask_v12_1(mm,
+ minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask, inst);
+
+ m = get_mqd(mqd);
+ m->compute_static_thread_mgmt_se0 = se_mask[0];
+ m->compute_static_thread_mgmt_se1 = se_mask[1];
+
+ pr_debug("update cu mask to %#x %#x\n",
+ m->compute_static_thread_mgmt_se0,
+ m->compute_static_thread_mgmt_se1);
+}
+
+static void set_priority(struct v12_1_compute_mqd *m, struct queue_properties *q)
+{
+ m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
+}
+
+static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm,
+ struct queue_properties *q)
+{
+ u32 mqd_size = AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size);
+ struct kfd_node *node = mm->dev;
+ struct kfd_mem_obj *mqd_mem_obj;
+
+ if (q->type == KFD_QUEUE_TYPE_COMPUTE)
+ mqd_size *= NUM_XCC(node->xcc_mask);
+
+ if (kfd_gtt_sa_allocate(node, mqd_size, &mqd_mem_obj))
+ return NULL;
+
+ return mqd_mem_obj;
+}
+
+static void init_mqd(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+{
+ uint64_t addr;
+ struct v12_1_compute_mqd *m;
+ u32 mqd_size = AMDGPU_MQD_SIZE_ALIGN(mm->mqd_size);
+
+ m = (struct v12_1_compute_mqd *) mqd_mem_obj->cpu_ptr;
+ addr = mqd_mem_obj->gpu_addr;
+
+ memset(m, 0, mqd_size);
+
+ m->header = 0xC0310800;
+ m->compute_pipelinestat_enable = 1;
+ m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se8 = 0xFFFFFFFF;
+
+ m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
+ 0x63 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
+
+ m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT;
+
+ m->cp_mqd_base_addr_lo = lower_32_bits(addr);
+ m->cp_mqd_base_addr_hi = upper_32_bits(addr);
+
+ m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT |
+ 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
+ 1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
+
+ /* Set cp_hqd_hq_status0.c_queue_debug_en to 1 to have the CP set up the
+ * DISPATCH_PTR. This is required for the kfd debugger
+ */
+ m->cp_hqd_hq_status0 = 1 << 14;
+
+ if (amdgpu_amdkfd_have_atomics_support(mm->dev->adev))
+ m->cp_hqd_hq_status0 |= 1 << 29;
+
+ if (q->format == KFD_QUEUE_FORMAT_AQL) {
+ m->cp_hqd_aql_control =
+ 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
+ }
+
+ if (mm->dev->kfd->cwsr_enabled) {
+ m->cp_hqd_persistent_state |=
+ (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
+ m->cp_hqd_ctx_save_base_addr_lo =
+ lower_32_bits(q->ctx_save_restore_area_address);
+ m->cp_hqd_ctx_save_base_addr_hi =
+ upper_32_bits(q->ctx_save_restore_area_address);
+ m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
+ m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
+ m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
+ m->cp_hqd_wg_state_offset = q->ctl_stack_size;
+ }
+
+ *mqd = m;
+ if (gart_addr)
+ *gart_addr = addr;
+ mm->update_mqd(mm, m, q, NULL);
+}
+
+static int load_mqd(struct mqd_manager *mm, void *mqd,
+ uint32_t pipe_id, uint32_t queue_id,
+ struct queue_properties *p, struct mm_struct *mms)
+{
+ int r = 0;
+ /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
+ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
+
+ r = mm->dev->kfd2kgd->hqd_load(mm->dev->adev, mqd, pipe_id, queue_id,
+ (uint32_t __user *)p->write_ptr,
+ wptr_shift, 0, mms, 0);
+ return r;
+}
+
+static void update_mqd(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q,
+ struct mqd_update_info *minfo)
+{
+ struct v12_1_compute_mqd *m;
+
+ m = get_mqd(mqd);
+
+ m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT;
+ m->cp_hqd_pq_control |=
+ ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1;
+ m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK;
+ pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control);
+
+ m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
+ m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
+
+ if (q->metadata_queue_size) {
+ /* On GC 12.1 is 64 DWs which is 4 times size of AQL packet */
+ if (q->metadata_queue_size == q->queue_size * 4) {
+ /*
+ * User application allocates main queue ring and metadata queue ring
+ * with a single allocation. metadata queue ring starts after main
+ * queue ring.
+ */
+ m->cp_hqd_kd_base =
+ lower_32_bits((q->queue_address + q->queue_size) >> 8);
+ m->cp_hqd_kd_base_hi =
+ upper_32_bits((q->queue_address + q->queue_size) >> 8);
+
+ m->cp_hqd_kd_cntl |= CP_HQD_KD_CNTL__KD_FETCHER_ENABLE_MASK;
+ /* KD_SIZE = 2 for metadata packet = 64 DWs */
+ m->cp_hqd_kd_cntl |= 2 << CP_HQD_KD_CNTL__KD_SIZE__SHIFT;
+ } else {
+ pr_warn("Invalid metadata ring size, metadata queue will be ignored\n");
+ }
+ }
+
+ m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+ m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+ m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
+ m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
+
+ m->cp_hqd_pq_doorbell_control =
+ q->doorbell_off <<
+ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
+ pr_debug("cp_hqd_pq_doorbell_control 0x%x\n",
+ m->cp_hqd_pq_doorbell_control);
+
+ m->cp_hqd_ib_control = 1 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT;
+
+ /*
+ * HW does not clamp this field correctly. Maximum EOP queue size
+ * is constrained by per-SE EOP done signal count, which is 8-bit.
+ * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit
+ * more than (EOP entry count - 1) so a queue size of 0x800 dwords
+ * is safe, giving a maximum field value of 0xA.
+ */
+ m->cp_hqd_eop_control = min(0xA,
+ ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1);
+ m->cp_hqd_eop_base_addr_lo =
+ lower_32_bits(q->eop_ring_buffer_address >> 8);
+ m->cp_hqd_eop_base_addr_hi =
+ upper_32_bits(q->eop_ring_buffer_address >> 8);
+
+ m->cp_hqd_iq_timer = 0;
+
+ m->cp_hqd_vmid = q->vmid;
+
+ if (q->format == KFD_QUEUE_FORMAT_AQL) {
+ /* GC 10 removed WPP_CLAMP from PQ Control */
+ m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
+ 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT |
+ 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT;
+ m->cp_hqd_pq_doorbell_control |=
+ 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT;
+ }
+ if (mm->dev->kfd->cwsr_enabled)
+ m->cp_hqd_ctx_save_control = 0;
+
+ set_priority(m, q);
+
+ q->is_active = QUEUE_IS_ACTIVE(*q);
+}
+
+static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
+{
+ return false;
+}
+
+static int get_wave_state(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q,
+ void __user *ctl_stack,
+ u32 *ctl_stack_used_size,
+ u32 *save_area_used_size)
+{
+ struct v12_1_compute_mqd *m;
+ struct mqd_user_context_save_area_header header;
+
+ m = get_mqd(mqd);
+
+ /* Control stack is written backwards, while workgroup context data
+ * is written forwards. Both starts from m->cp_hqd_cntl_stack_size.
+ * Current position is at m->cp_hqd_cntl_stack_offset and
+ * m->cp_hqd_wg_state_offset, respectively.
+ */
+ *ctl_stack_used_size = m->cp_hqd_cntl_stack_size -
+ m->cp_hqd_cntl_stack_offset;
+ *save_area_used_size = m->cp_hqd_wg_state_offset -
+ m->cp_hqd_cntl_stack_size;
+
+ /* Control stack is not copied to user mode for GFXv12 because
+ * it's part of the context save area that is already
+ * accessible to user mode
+ */
+ header.control_stack_size = *ctl_stack_used_size;
+ header.wave_state_size = *save_area_used_size;
+
+ header.wave_state_offset = m->cp_hqd_wg_state_offset;
+ header.control_stack_offset = m->cp_hqd_cntl_stack_offset;
+
+ if (copy_to_user(ctl_stack, &header, sizeof(header)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+{
+ struct v12_1_compute_mqd *m;
+
+ init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
+
+ m = get_mqd(*mqd);
+
+ m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
+ 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
+}
+
+static void init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+{
+ struct v12_sdma_mqd *m;
+
+ m = (struct v12_sdma_mqd *) mqd_mem_obj->cpu_ptr;
+
+ memset(m, 0, PAGE_SIZE);
+
+ *mqd = m;
+ if (gart_addr)
+ *gart_addr = mqd_mem_obj->gpu_addr;
+
+ mm->update_mqd(mm, m, q, NULL);
+}
+
+#define SDMA_RLC_DUMMY_DEFAULT 0xf
+
+static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q,
+ struct mqd_update_info *minfo)
+{
+ struct v12_sdma_mqd *m;
+
+ m = get_sdma_mqd(mqd);
+ m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1)
+ << SDMA0_SDMA_QUEUE0_RB_CNTL__RB_SIZE__SHIFT |
+ q->vmid << SDMA0_SDMA_QUEUE0_RB_CNTL__RB_VMID__SHIFT |
+ 1 << SDMA0_SDMA_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
+ 6 << SDMA0_SDMA_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT |
+ 1 << SDMA0_SDMA_QUEUE0_RB_CNTL__MCU_WPTR_POLL_ENABLE__SHIFT;
+
+ m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
+ m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
+ m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+ m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+ m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
+ m->sdmax_rlcx_rb_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
+ m->sdmax_rlcx_doorbell_offset =
+ q->doorbell_off << SDMA0_SDMA_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
+
+ m->sdmax_rlcx_sched_cntl = (amdgpu_sdma_phase_quantum
+ << SDMA0_SDMA_QUEUE0_SCHEDULE_CNTL__CONTEXT_QUANTUM__SHIFT)
+ & SDMA0_SDMA_QUEUE0_SCHEDULE_CNTL__CONTEXT_QUANTUM_MASK;
+
+ m->sdma_engine_id = q->sdma_engine_id;
+ m->sdma_queue_id = q->sdma_queue_id;
+
+ m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
+
+ /* Allow context switch so we don't cross-process starve with a massive
+ * command buffer of long-running SDMA commands
+ * sdmax_rlcx_ib_cntl represent SDMA_QUEUE0_IB_CNTL register
+ */
+ m->sdmax_rlcx_ib_cntl |= SDMA0_SDMA_QUEUE0_IB_CNTL__SWITCH_INSIDE_IB_MASK;
+
+ q->is_active = QUEUE_IS_ACTIVE(*q);
+}
+
+static void get_xcc_mqd(struct kfd_mem_obj *mqd_mem_obj,
+ struct kfd_mem_obj *xcc_mqd_mem_obj,
+ uint64_t offset)
+{
+ xcc_mqd_mem_obj->mem = (offset == 0) ?
+ mqd_mem_obj->mem : NULL;
+ xcc_mqd_mem_obj->gpu_addr = mqd_mem_obj->gpu_addr + offset;
+ xcc_mqd_mem_obj->cpu_ptr = (uint32_t *)((uintptr_t)mqd_mem_obj->cpu_ptr
+ + offset);
+}
+
+static void init_mqd_v12_1(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+{
+ struct v12_1_compute_mqd *m;
+ int xcc = 0;
+ struct kfd_mem_obj xcc_mqd_mem_obj;
+ uint64_t xcc_gart_addr = 0;
+ uint64_t xcc_ctx_save_restore_area_address;
+ uint64_t offset = mm->mqd_stride(mm, q);
+ uint32_t local_xcc_start = mm->dev->dqm->current_logical_xcc_start++;
+
+ memset(&xcc_mqd_mem_obj, 0x0, sizeof(struct kfd_mem_obj));
+ for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) {
+ get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset*xcc);
+
+ init_mqd(mm, (void **)&m, &xcc_mqd_mem_obj, &xcc_gart_addr, q);
+
+ m->cp_mqd_stride_size = offset;
+
+ /*
+ * Update the CWSR address for each XCC if CWSR is enabled
+ * and CWSR area is allocated in thunk
+ */
+ if (mm->dev->kfd->cwsr_enabled &&
+ q->ctx_save_restore_area_address) {
+ xcc_ctx_save_restore_area_address =
+ q->ctx_save_restore_area_address +
+ (xcc * q->ctx_save_restore_area_size);
+
+ m->cp_hqd_ctx_save_base_addr_lo =
+ lower_32_bits(xcc_ctx_save_restore_area_address);
+ m->cp_hqd_ctx_save_base_addr_hi =
+ upper_32_bits(xcc_ctx_save_restore_area_address);
+ }
+
+ if (q->format == KFD_QUEUE_FORMAT_AQL) {
+ m->compute_tg_chunk_size = 1;
+ m->compute_current_logical_xcc_id =
+ (local_xcc_start + xcc) %
+ NUM_XCC(mm->dev->xcc_mask);
+ } else {
+ /* PM4 Queue */
+ m->compute_current_logical_xcc_id = 0;
+ m->compute_tg_chunk_size = 0;
+ m->pm4_target_xcc_in_xcp = q->pm4_target_xcc;
+ }
+
+ if (xcc == 0) {
+ /* Set the MQD pointer and gart address to XCC0 MQD */
+ *mqd = m;
+ *gart_addr = xcc_gart_addr;
+ }
+ }
+}
+
+static void update_mqd_v12_1(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q, struct mqd_update_info *minfo)
+{
+ struct v12_1_compute_mqd *m;
+ int xcc = 0;
+ uint64_t size = mm->mqd_stride(mm, q);
+
+ for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) {
+ m = get_mqd(mqd + size * xcc);
+ update_mqd(mm, m, q, minfo);
+
+ update_cu_mask(mm, m, minfo, xcc);
+
+ if (q->format == KFD_QUEUE_FORMAT_AQL) {
+ m->compute_tg_chunk_size = 1;
+ } else {
+ /* PM4 Queue */
+ m->compute_current_logical_xcc_id = 0;
+ m->compute_tg_chunk_size = 0;
+ m->pm4_target_xcc_in_xcp = q->pm4_target_xcc;
+ }
+ }
+}
+
+static int destroy_mqd_v12_1(struct mqd_manager *mm, void *mqd,
+ enum kfd_preempt_type type, unsigned int timeout,
+ uint32_t pipe_id, uint32_t queue_id)
+{
+ uint32_t xcc_mask = mm->dev->xcc_mask;
+ int xcc_id, err, inst = 0;
+ void *xcc_mqd;
+ struct v12_1_compute_mqd *m;
+ uint64_t mqd_offset;
+
+ m = get_mqd(mqd);
+ mqd_offset = m->cp_mqd_stride_size;
+
+ for_each_inst(xcc_id, xcc_mask) {
+ xcc_mqd = mqd + mqd_offset * inst;
+ err = mm->dev->kfd2kgd->hqd_destroy(mm->dev->adev, xcc_mqd,
+ type, timeout, pipe_id,
+ queue_id, xcc_id);
+ if (err) {
+ pr_debug("Destroy MQD failed for xcc: %d\n", inst);
+ break;
+ }
+ ++inst;
+ }
+
+ return err;
+}
+
+static int load_mqd_v12_1(struct mqd_manager *mm, void *mqd,
+ uint32_t pipe_id, uint32_t queue_id,
+ struct queue_properties *p, struct mm_struct *mms)
+{
+ /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
+ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
+ uint32_t xcc_mask = mm->dev->xcc_mask;
+ int xcc_id, err, inst = 0;
+ void *xcc_mqd;
+ uint64_t mqd_stride_size = mm->mqd_stride(mm, p);
+
+ for_each_inst(xcc_id, xcc_mask) {
+ xcc_mqd = mqd + mqd_stride_size * inst;
+ err = mm->dev->kfd2kgd->hqd_load(
+ mm->dev->adev, xcc_mqd, pipe_id, queue_id,
+ (uint32_t __user *)p->write_ptr, wptr_shift, 0, mms,
+ xcc_id);
+ if (err) {
+ pr_debug("Load MQD failed for xcc: %d\n", inst);
+ break;
+ }
+ ++inst;
+ }
+
+ return err;
+}
+
+static int get_wave_state_v12_1(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q,
+ void __user *ctl_stack,
+ u32 *ctl_stack_used_size,
+ u32 *save_area_used_size)
+{
+ int xcc, err = 0;
+ void *xcc_mqd;
+ void __user *xcc_ctl_stack;
+ uint64_t mqd_stride_size = mm->mqd_stride(mm, q);
+ u32 tmp_ctl_stack_used_size = 0, tmp_save_area_used_size = 0;
+
+ for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) {
+ xcc_mqd = mqd + mqd_stride_size * xcc;
+ xcc_ctl_stack = (void __user *)((uintptr_t)ctl_stack +
+ q->ctx_save_restore_area_size * xcc);
+
+ err = get_wave_state(mm, xcc_mqd, q, xcc_ctl_stack,
+ &tmp_ctl_stack_used_size,
+ &tmp_save_area_used_size);
+ if (err)
+ break;
+
+ /*
+ * Set the ctl_stack_used_size and save_area_used_size to
+ * ctl_stack_used_size and save_area_used_size of XCC 0 when
+ * passing the info to user-space.
+ * For multi XCC, user-space would have to look at the header
+ * info of each Control stack area to determine the control
+ * stack size and save area used.
+ */
+ if (xcc == 0) {
+ *ctl_stack_used_size = tmp_ctl_stack_used_size;
+ *save_area_used_size = tmp_save_area_used_size;
+ }
+ }
+
+ return err;
+}
+
+#if defined(CONFIG_DEBUG_FS)
+
+static int debugfs_show_mqd(struct seq_file *m, void *data)
+{
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ data, sizeof(struct v12_1_compute_mqd), false);
+ return 0;
+}
+
+static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
+{
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ data, sizeof(struct v12_sdma_mqd), false);
+ return 0;
+}
+
+#endif
+
+struct mqd_manager *mqd_manager_init_v12_1(enum KFD_MQD_TYPE type,
+ struct kfd_node *dev)
+{
+ struct mqd_manager *mqd;
+
+ if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
+ return NULL;
+
+ mqd = kzalloc_obj(*mqd);
+ if (!mqd)
+ return NULL;
+
+ mqd->dev = dev;
+
+ switch (type) {
+ case KFD_MQD_TYPE_CP:
+ pr_debug("%s@%i\n", __func__, __LINE__);
+ mqd->allocate_mqd = allocate_mqd;
+ mqd->init_mqd = init_mqd_v12_1;
+ mqd->free_mqd = kfd_free_mqd_cp;
+ mqd->load_mqd = load_mqd_v12_1;
+ mqd->update_mqd = update_mqd_v12_1;
+ mqd->destroy_mqd = destroy_mqd_v12_1;
+ mqd->is_occupied = kfd_is_occupied_cp;
+ mqd->mqd_size = sizeof(struct v12_1_compute_mqd);
+ mqd->get_wave_state = get_wave_state_v12_1;
+ mqd->mqd_stride = kfd_mqd_stride;
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+ pr_debug("%s@%i\n", __func__, __LINE__);
+ break;
+ case KFD_MQD_TYPE_HIQ:
+ pr_debug("%s@%i\n", __func__, __LINE__);
+ mqd->allocate_mqd = allocate_hiq_mqd;
+ mqd->init_mqd = init_mqd_hiq;
+ mqd->free_mqd = free_mqd_hiq_sdma;
+ mqd->load_mqd = kfd_hiq_load_mqd_kiq;
+ mqd->update_mqd = update_mqd;
+ mqd->destroy_mqd = kfd_destroy_mqd_cp;
+ mqd->is_occupied = kfd_is_occupied_cp;
+ mqd->mqd_size = sizeof(struct v12_1_compute_mqd);
+ mqd->mqd_stride = kfd_mqd_stride;
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+ mqd->check_preemption_failed = check_preemption_failed;
+ pr_debug("%s@%i\n", __func__, __LINE__);
+ break;
+ case KFD_MQD_TYPE_DIQ:
+ mqd->allocate_mqd = allocate_mqd;
+ mqd->init_mqd = init_mqd_hiq;
+ mqd->free_mqd = kfd_free_mqd_cp;
+ mqd->load_mqd = load_mqd;
+ mqd->update_mqd = update_mqd;
+ mqd->destroy_mqd = kfd_destroy_mqd_cp;
+ mqd->is_occupied = kfd_is_occupied_cp;
+ mqd->mqd_size = sizeof(struct v12_1_compute_mqd);
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+ break;
+ case KFD_MQD_TYPE_SDMA:
+ pr_debug("%s@%i\n", __func__, __LINE__);
+ mqd->allocate_mqd = allocate_mqd;
+ mqd->init_mqd = init_mqd_sdma;
+ mqd->free_mqd = kfd_free_mqd_cp;
+ mqd->load_mqd = kfd_load_mqd_sdma;
+ mqd->update_mqd = update_mqd_sdma;
+ mqd->destroy_mqd = kfd_destroy_mqd_sdma;
+ mqd->is_occupied = kfd_is_occupied_sdma;
+ mqd->mqd_size = sizeof(struct v12_sdma_mqd);
+ mqd->mqd_stride = kfd_mqd_stride;
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
+#endif
+ pr_debug("%s@%i\n", __func__, __LINE__);
+ break;
+ default:
+ kfree(mqd);
+ return NULL;
+ }
+
+ return mqd;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 3014925d95ff..f6d9d81003dc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -42,9 +42,16 @@ static uint64_t mqd_stride_v9(struct mqd_manager *mm,
struct queue_properties *q)
{
if (mm->dev->kfd->cwsr_enabled &&
- q->type == KFD_QUEUE_TYPE_COMPUTE)
- return ALIGN(q->ctl_stack_size, PAGE_SIZE) +
- ALIGN(sizeof(struct v9_mqd), PAGE_SIZE);
+ q->type == KFD_QUEUE_TYPE_COMPUTE) {
+
+ /* On gfxv9, the MQD resides in the first 4K page,
+ * followed by the control stack. Align both to
+ * AMDGPU_GPU_PAGE_SIZE to maintain the required 4K boundary.
+ */
+
+ return ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) +
+ ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), PAGE_SIZE);
+ }
return mm->mqd_size;
}
@@ -106,13 +113,27 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
static void set_priority(struct v9_mqd *m, struct queue_properties *q)
{
m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
- m->cp_hqd_queue_priority = q->priority;
}
-static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node,
+static bool mqd_on_vram(struct amdgpu_device *adev)
+{
+ if (adev->apu_prefer_gtt)
+ return false;
+
+ switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+ case IP_VERSION(9, 4, 3):
+ case IP_VERSION(9, 5, 0):
+ return true;
+ default:
+ return false;
+ }
+}
+
+static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm,
struct queue_properties *q)
{
int retval;
+ struct kfd_node *node = mm->dev;
struct kfd_mem_obj *mqd_mem_obj = NULL;
/* For V9 only, due to a HW bug, the control stack of a user mode
@@ -132,14 +153,16 @@ static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node,
* amdgpu memory functions to do so.
*/
if (node->kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) {
- mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
+ mqd_mem_obj = kzalloc_obj(struct kfd_mem_obj);
if (!mqd_mem_obj)
return NULL;
- retval = amdgpu_amdkfd_alloc_gtt_mem(node->adev,
- (ALIGN(q->ctl_stack_size, PAGE_SIZE) +
- ALIGN(sizeof(struct v9_mqd), PAGE_SIZE)) *
+ retval = amdgpu_amdkfd_alloc_kernel_mem(node->adev,
+ (ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) +
+ ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), PAGE_SIZE)) *
NUM_XCC(node->xcc_mask),
- &(mqd_mem_obj->gtt_mem),
+ mqd_on_vram(node->adev) ? AMDGPU_GEM_DOMAIN_VRAM :
+ AMDGPU_GEM_DOMAIN_GTT,
+ &(mqd_mem_obj->mem),
&(mqd_mem_obj->gpu_addr),
(void *)&(mqd_mem_obj->cpu_ptr), true);
@@ -341,11 +364,15 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
{
struct v9_mqd *m;
struct kfd_context_save_area_header header;
+ u32 cntl_stack_size;
+ u32 cntl_stack_offset;
/* Control stack is located one page after MQD. */
- void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
+ void *mqd_ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE);
m = get_mqd(mqd);
+ cntl_stack_size = min_t(u32, m->cp_hqd_cntl_stack_size, q->ctl_stack_size);
+ cntl_stack_offset = min_t(u32, m->cp_hqd_cntl_stack_offset, cntl_stack_size);
*ctl_stack_used_size = m->cp_hqd_cntl_stack_size -
m->cp_hqd_cntl_stack_offset;
@@ -361,26 +388,30 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
if (copy_to_user(ctl_stack, &header, sizeof(header.wave_state)))
return -EFAULT;
- if (copy_to_user(ctl_stack + m->cp_hqd_cntl_stack_offset,
- mqd_ctl_stack + m->cp_hqd_cntl_stack_offset,
- *ctl_stack_used_size))
+ *ctl_stack_used_size = cntl_stack_size - cntl_stack_offset;
+
+ if (copy_to_user(ctl_stack + cntl_stack_offset, mqd_ctl_stack + cntl_stack_offset,
+ *ctl_stack_used_size))
return -EFAULT;
return 0;
}
-static void get_checkpoint_info(struct mqd_manager *mm, void *mqd, u32 *ctl_stack_size)
+static int get_checkpoint_info(struct mqd_manager *mm, void *mqd, u32 *ctl_stack_size)
{
struct v9_mqd *m = get_mqd(mqd);
- *ctl_stack_size = m->cp_hqd_cntl_stack_size;
+ if (check_mul_overflow(m->cp_hqd_cntl_stack_size, NUM_XCC(mm->dev->xcc_mask), ctl_stack_size))
+ return -EINVAL;
+
+ return 0;
}
static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, void *ctl_stack_dst)
{
struct v9_mqd *m;
/* Control stack is located one page after MQD. */
- void *ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
+ void *ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE);
m = get_mqd(mqd);
@@ -388,6 +419,24 @@ static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, voi
memcpy(ctl_stack_dst, ctl_stack, m->cp_hqd_cntl_stack_size);
}
+static void checkpoint_mqd_v9_4_3(struct mqd_manager *mm,
+ void *mqd,
+ void *mqd_dst,
+ void *ctl_stack_dst)
+{
+ struct v9_mqd *m;
+ int xcc;
+ uint64_t size = get_mqd(mqd)->cp_mqd_stride_size;
+
+ for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) {
+ m = get_mqd(mqd + size * xcc);
+
+ checkpoint_mqd(mm, m,
+ (uint8_t *)mqd_dst + sizeof(*m) * xcc,
+ (uint8_t *)ctl_stack_dst + m->cp_hqd_cntl_stack_size * xcc);
+ }
+}
+
static void restore_mqd(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *qp,
@@ -408,7 +457,7 @@ static void restore_mqd(struct mqd_manager *mm, void **mqd,
*gart_addr = addr;
/* Control stack is located one page after MQD. */
- ctl_stack = (void *)((uintptr_t)*mqd + PAGE_SIZE);
+ ctl_stack = (void *)((uintptr_t)*mqd + AMDGPU_GPU_PAGE_SIZE);
memcpy(ctl_stack, ctl_stack_src, ctl_stack_size);
m->cp_hqd_pq_doorbell_control =
@@ -495,6 +544,10 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
m->sdma_engine_id = q->sdma_engine_id;
m->sdma_queue_id = q->sdma_queue_id;
m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
+ /* Allow context switch so we don't cross-process starve with a massive
+ * command buffer of long-running SDMA commands
+ */
+ m->sdmax_rlcx_ib_cntl |= SDMA0_GFX_IB_CNTL__SWITCH_INSIDE_IB_MASK;
q->is_active = QUEUE_IS_ACTIVE(*q);
}
@@ -554,7 +607,7 @@ static void init_mqd_hiq_v9_4_3(struct mqd_manager *mm, void **mqd,
m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
- if (amdgpu_sriov_vf(mm->dev->adev))
+ if (amdgpu_sriov_multi_vf_mode(mm->dev->adev))
m->cp_hqd_pq_doorbell_control |= 1 <<
CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT;
m->cp_mqd_stride_size = kfd_hiq_mqd_stride(mm->dev);
@@ -574,7 +627,7 @@ static int hiq_load_mqd_kiq_v9_4_3(struct mqd_manager *mm, void *mqd,
struct queue_properties *p, struct mm_struct *mms)
{
uint32_t xcc_mask = mm->dev->xcc_mask;
- int xcc_id, err, inst = 0;
+ int xcc_id, err = 0, inst = 0;
void *xcc_mqd;
uint64_t hiq_mqd_size = kfd_hiq_mqd_stride(mm->dev);
@@ -598,7 +651,7 @@ static int destroy_hiq_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
uint32_t pipe_id, uint32_t queue_id)
{
uint32_t xcc_mask = mm->dev->xcc_mask;
- int xcc_id, err, inst = 0;
+ int xcc_id, err = 0, inst = 0;
uint64_t hiq_mqd_size = kfd_hiq_mqd_stride(mm->dev);
struct v9_mqd *m;
u32 doorbell_off;
@@ -643,8 +696,8 @@ static void get_xcc_mqd(struct kfd_mem_obj *mqd_mem_obj,
struct kfd_mem_obj *xcc_mqd_mem_obj,
uint64_t offset)
{
- xcc_mqd_mem_obj->gtt_mem = (offset == 0) ?
- mqd_mem_obj->gtt_mem : NULL;
+ xcc_mqd_mem_obj->mem = (offset == 0) ?
+ mqd_mem_obj->mem : NULL;
xcc_mqd_mem_obj->gpu_addr = mqd_mem_obj->gpu_addr + offset;
xcc_mqd_mem_obj->cpu_ptr = (uint32_t *)((uintptr_t)mqd_mem_obj->cpu_ptr
+ offset);
@@ -667,7 +720,9 @@ static void init_mqd_v9_4_3(struct mqd_manager *mm, void **mqd,
get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset*xcc);
init_mqd(mm, (void **)&m, &xcc_mqd_mem_obj, &xcc_gart_addr, q);
-
+ if (amdgpu_sriov_multi_vf_mode(mm->dev->adev))
+ m->cp_hqd_pq_doorbell_control |= 1 <<
+ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT;
m->cp_mqd_stride_size = offset;
/*
@@ -714,6 +769,9 @@ static void init_mqd_v9_4_3(struct mqd_manager *mm, void **mqd,
*gart_addr = xcc_gart_addr;
}
}
+
+ if (mqd_on_vram(mm->dev->adev))
+ amdgpu_device_flush_hdp(mm->dev->adev, NULL);
}
static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
@@ -727,6 +785,9 @@ static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
m = get_mqd(mqd + size * xcc);
update_mqd(mm, m, q, minfo);
+ if (amdgpu_sriov_multi_vf_mode(mm->dev->adev))
+ m->cp_hqd_pq_doorbell_control |= 1 <<
+ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT;
update_cu_mask(mm, m, minfo, xcc);
if (q->format == KFD_QUEUE_FORMAT_AQL) {
@@ -747,14 +808,57 @@ static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
m->pm4_target_xcc_in_xcp = q->pm4_target_xcc;
}
}
+
+ if (mqd_on_vram(mm->dev->adev))
+ amdgpu_device_flush_hdp(mm->dev->adev, NULL);
}
+static void restore_mqd_v9_4_3(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *qp,
+ const void *mqd_src,
+ const void *ctl_stack_src, u32 ctl_stack_size)
+{
+ struct kfd_mem_obj xcc_mqd_mem_obj;
+ u32 mqd_ctl_stack_size;
+ struct v9_mqd *m;
+ u32 num_xcc;
+ int xcc;
+
+ uint64_t offset = mm->mqd_stride(mm, qp);
+
+ mm->dev->dqm->current_logical_xcc_start++;
+
+ num_xcc = NUM_XCC(mm->dev->xcc_mask);
+ mqd_ctl_stack_size = ctl_stack_size / num_xcc;
+
+ memset(&xcc_mqd_mem_obj, 0x0, sizeof(struct kfd_mem_obj));
+
+ /* Set the MQD pointer and gart address to XCC0 MQD */
+ *mqd = mqd_mem_obj->cpu_ptr;
+ if (gart_addr)
+ *gart_addr = mqd_mem_obj->gpu_addr;
+
+ for (xcc = 0; xcc < num_xcc; xcc++) {
+ get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset * xcc);
+ restore_mqd(mm, (void **)&m,
+ &xcc_mqd_mem_obj,
+ NULL,
+ qp,
+ (uint8_t *)mqd_src + xcc * sizeof(*m),
+ (uint8_t *)ctl_stack_src + xcc * mqd_ctl_stack_size,
+ mqd_ctl_stack_size);
+ }
+
+ if (mqd_on_vram(mm->dev->adev))
+ amdgpu_device_flush_hdp(mm->dev->adev, NULL);
+}
static int destroy_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
enum kfd_preempt_type type, unsigned int timeout,
uint32_t pipe_id, uint32_t queue_id)
{
uint32_t xcc_mask = mm->dev->xcc_mask;
- int xcc_id, err, inst = 0;
+ int xcc_id, err = 0, inst = 0;
void *xcc_mqd;
struct v9_mqd *m;
uint64_t mqd_offset;
@@ -784,7 +888,7 @@ static int load_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
/* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
uint32_t xcc_mask = mm->dev->xcc_mask;
- int xcc_id, err, inst = 0;
+ int xcc_id, err = 0, inst = 0;
void *xcc_mqd;
uint64_t mqd_stride_size = mm->mqd_stride(mm, p);
@@ -870,7 +974,7 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
return NULL;
- mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
+ mqd = kzalloc_obj(*mqd);
if (!mqd)
return NULL;
@@ -882,8 +986,6 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
mqd->free_mqd = kfd_free_mqd_cp;
mqd->is_occupied = kfd_is_occupied_cp;
mqd->get_checkpoint_info = get_checkpoint_info;
- mqd->checkpoint_mqd = checkpoint_mqd;
- mqd->restore_mqd = restore_mqd;
mqd->mqd_size = sizeof(struct v9_mqd);
mqd->mqd_stride = mqd_stride_v9;
#if defined(CONFIG_DEBUG_FS)
@@ -897,12 +999,16 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
mqd->update_mqd = update_mqd_v9_4_3;
mqd->destroy_mqd = destroy_mqd_v9_4_3;
mqd->get_wave_state = get_wave_state_v9_4_3;
+ mqd->checkpoint_mqd = checkpoint_mqd_v9_4_3;
+ mqd->restore_mqd = restore_mqd_v9_4_3;
} else {
mqd->init_mqd = init_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd;
mqd->destroy_mqd = kfd_destroy_mqd_cp;
mqd->get_wave_state = get_wave_state;
+ mqd->checkpoint_mqd = checkpoint_mqd;
+ mqd->restore_mqd = restore_mqd;
}
break;
case KFD_MQD_TYPE_HIQ:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index c1fafc502515..431a20323146 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -73,12 +73,12 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
static void set_priority(struct vi_mqd *m, struct queue_properties *q)
{
m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
- m->cp_hqd_queue_priority = q->priority;
}
-static struct kfd_mem_obj *allocate_mqd(struct kfd_node *kfd,
+static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm,
struct queue_properties *q)
{
+ struct kfd_node *kfd = mm->dev;
struct kfd_mem_obj *mqd_mem_obj;
if (kfd_gtt_sa_allocate(kfd, sizeof(struct vi_mqd),
@@ -274,10 +274,11 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
return 0;
}
-static void get_checkpoint_info(struct mqd_manager *mm, void *mqd, u32 *ctl_stack_size)
+static int get_checkpoint_info(struct mqd_manager *mm, void *mqd, u32 *ctl_stack_size)
{
/* Control stack is stored in user mode */
*ctl_stack_size = 0;
+ return 0;
}
static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, void *ctl_stack_dst)
@@ -445,7 +446,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
return NULL;
- mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
+ mqd = kzalloc_obj(*mqd);
if (!mqd)
return NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 4984b41cd372..b1a6eb349bb3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -31,6 +31,7 @@
#define OVER_SUBSCRIPTION_PROCESS_COUNT (1 << 0)
#define OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT (1 << 1)
#define OVER_SUBSCRIPTION_GWS_QUEUE_COUNT (1 << 2)
+#define OVER_SUBSCRIPTION_XNACK_CONFLICT (1 << 3)
static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
unsigned int buffer_size_bytes)
@@ -44,7 +45,8 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
static void pm_calc_rlib_size(struct packet_manager *pm,
unsigned int *rlib_size,
- int *over_subscription)
+ int *over_subscription,
+ int xnack_conflict)
{
unsigned int process_count, queue_count, compute_queue_count, gws_queue_count;
unsigned int map_queue_size;
@@ -73,6 +75,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
*over_subscription |= OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT;
if (gws_queue_count > 1)
*over_subscription |= OVER_SUBSCRIPTION_GWS_QUEUE_COUNT;
+ if (xnack_conflict && (node->adev->gmc.xnack_flags & AMDGPU_GMC_XNACK_FLAG_CHAIN))
+ *over_subscription |= OVER_SUBSCRIPTION_XNACK_CONFLICT;
if (*over_subscription)
dev_dbg(dev, "Over subscribed runlist\n");
@@ -96,7 +100,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
unsigned int **rl_buffer,
uint64_t *rl_gpu_buffer,
unsigned int *rl_buffer_size,
- int *is_over_subscription)
+ int *is_over_subscription,
+ int xnack_conflict)
{
struct kfd_node *node = pm->dqm->dev;
struct device *dev = node->adev->dev;
@@ -105,7 +110,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
if (WARN_ON(pm->allocated))
return -EINVAL;
- pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription);
+ pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription,
+ xnack_conflict);
mutex_lock(&pm->lock);
@@ -142,11 +148,27 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
struct queue *q;
struct kernel_queue *kq;
int is_over_subscription;
+ int xnack_enabled = -1;
+ bool xnack_conflict = 0;
rl_wptr = retval = processes_mapped = 0;
+ /* Check if processes set different xnack modes */
+ list_for_each_entry(cur, queues, list) {
+ qpd = cur->qpd;
+ if (xnack_enabled < 0)
+ /* First process */
+ xnack_enabled = qpd->pqm->process->xnack_enabled;
+ else if (qpd->pqm->process->xnack_enabled != xnack_enabled) {
+ /* Found a process with a different xnack mode */
+ xnack_conflict = 1;
+ break;
+ }
+ }
+
retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr,
- &alloc_size_bytes, &is_over_subscription);
+ &alloc_size_bytes, &is_over_subscription,
+ xnack_conflict);
if (retval)
return retval;
@@ -156,9 +178,13 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
dev_dbg(dev, "Building runlist ib process count: %d queues count %d\n",
pm->dqm->processes_count, pm->dqm->active_queue_count);
+build_runlist_ib:
/* build the run list ib packet */
list_for_each_entry(cur, queues, list) {
qpd = cur->qpd;
+ /* group processes with the same xnack mode together */
+ if (qpd->pqm->process->xnack_enabled != xnack_enabled)
+ continue;
/* build map process packet */
if (processes_mapped >= pm->dqm->processes_count) {
dev_dbg(dev, "Not enough space left in runlist IB\n");
@@ -215,18 +241,26 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
alloc_size_bytes);
}
}
+ if (xnack_conflict) {
+ /* pick up processes with the other xnack mode */
+ xnack_enabled = !xnack_enabled;
+ xnack_conflict = 0;
+ goto build_runlist_ib;
+ }
dev_dbg(dev, "Finished map process and queues to runlist\n");
if (is_over_subscription) {
if (!pm->is_over_subscription)
- dev_warn(dev, "Runlist is getting oversubscribed due to%s%s%s. Expect reduced ROCm performance.\n",
- is_over_subscription & OVER_SUBSCRIPTION_PROCESS_COUNT ?
- " too many processes." : "",
- is_over_subscription & OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT ?
- " too many queues." : "",
- is_over_subscription & OVER_SUBSCRIPTION_GWS_QUEUE_COUNT ?
- " multiple processes using cooperative launch." : "");
+ dev_warn(dev, "Runlist is getting oversubscribed due to%s%s%s%s. Expect reduced ROCm performance.\n",
+ is_over_subscription & OVER_SUBSCRIPTION_PROCESS_COUNT ?
+ " too many processes" : "",
+ is_over_subscription & OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT ?
+ " too many queues" : "",
+ is_over_subscription & OVER_SUBSCRIPTION_GWS_QUEUE_COUNT ?
+ " multiple processes using cooperative launch" : "",
+ is_over_subscription & OVER_SUBSCRIPTION_XNACK_CONFLICT ?
+ " xnack on/off processes mixed on gfx9" : "");
retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr],
*rl_gpu_addr,
@@ -396,14 +430,33 @@ out:
return retval;
}
-int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period)
+/* pm_config_dequeue_wait_counts: Configure dequeue timer Wait Counts
+ * by writing to CP_IQ_WAIT_TIME2 registers.
+ *
+ * @cmd: See emum kfd_config_dequeue_wait_counts_cmd definition
+ * @value: Depends on the cmd. This parameter is unused for
+ * KFD_DEQUEUE_WAIT_INIT and KFD_DEQUEUE_WAIT_RESET. For
+ * KFD_DEQUEUE_WAIT_SET_SCH_WAVE it holds value to be set
+ *
+ */
+int pm_config_dequeue_wait_counts(struct packet_manager *pm,
+ enum kfd_config_dequeue_wait_counts_cmd cmd,
+ uint32_t value)
{
struct kfd_node *node = pm->dqm->dev;
struct device *dev = node->adev->dev;
int retval = 0;
uint32_t *buffer, size;
- size = pm->pmf->set_grace_period_size;
+ if (!pm->pmf->config_dequeue_wait_counts ||
+ !pm->pmf->config_dequeue_wait_counts_size)
+ return 0;
+
+ if (cmd == KFD_DEQUEUE_WAIT_INIT && (KFD_GC_VERSION(pm->dqm->dev) < IP_VERSION(9, 4, 1) ||
+ KFD_GC_VERSION(pm->dqm->dev) >= IP_VERSION(10, 0, 0)))
+ return 0;
+
+ size = pm->pmf->config_dequeue_wait_counts_size;
mutex_lock(&pm->lock);
@@ -419,13 +472,18 @@ int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period)
goto out;
}
- retval = pm->pmf->set_grace_period(pm, buffer, grace_period);
- if (!retval)
+ retval = pm->pmf->config_dequeue_wait_counts(pm, buffer,
+ cmd, value);
+ if (!retval) {
retval = kq_submit_packet(pm->priv_queue);
- else
+
+ /* If default value is modified, cache that in dqm->wait_times */
+ if (!retval && cmd == KFD_DEQUEUE_WAIT_INIT)
+ update_dqm_wait_times(pm->dqm);
+ } else {
kq_rollback_packet(pm->priv_queue);
+ }
}
-
out:
mutex_unlock(&pm->lock);
return retval;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
index 1f9f5bfeaf86..3d2375817c3e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
@@ -43,11 +43,11 @@ static int pm_map_process_v9(struct packet_manager *pm,
memset(buffer, 0, sizeof(struct pm4_mes_map_process));
packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
sizeof(struct pm4_mes_map_process));
- if (adev->enforce_isolation[kfd->node_id])
+ if (adev->enforce_isolation[kfd->node_id] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
packet->bitfields2.exec_cleaner_shader = 1;
packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
packet->bitfields2.process_quantum = 10;
- packet->bitfields2.pasid = qpd->pqm->process->pasid;
+ packet->bitfields2.pasid = pdd->pasid;
packet->bitfields14.gds_size = qpd->gds_size & 0x3F;
packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF;
packet->bitfields14.num_gws = (qpd->mapped_gws_queue) ? qpd->num_gws : 0;
@@ -102,11 +102,12 @@ static int pm_map_process_aldebaran(struct packet_manager *pm,
memset(buffer, 0, sizeof(struct pm4_mes_map_process_aldebaran));
packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
sizeof(struct pm4_mes_map_process_aldebaran));
- if (adev->enforce_isolation[knode->node_id])
+ if (adev->enforce_isolation[knode->node_id] ==
+ AMDGPU_ENFORCE_ISOLATION_ENABLE)
packet->bitfields2.exec_cleaner_shader = 1;
packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
packet->bitfields2.process_quantum = 10;
- packet->bitfields2.pasid = qpd->pqm->process->pasid;
+ packet->bitfields2.pasid = pdd->pasid;
packet->bitfields14.gds_size = qpd->gds_size & 0x3F;
packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF;
packet->bitfields14.num_gws = (qpd->mapped_gws_queue) ? qpd->num_gws : 0;
@@ -165,9 +166,9 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer,
* hws_max_conc_proc has been done in
* kgd2kfd_device_init().
*/
- concurrent_proc_cnt = adev->enforce_isolation[kfd->node_id] ?
- 1 : min(pm->dqm->processes_count,
- kfd->max_proc_per_quantum);
+ concurrent_proc_cnt = (adev->enforce_isolation[kfd->node_id] ==
+ AMDGPU_ENFORCE_ISOLATION_ENABLE) ?
+ 1 : min(pm->dqm->processes_count, kfd->max_proc_per_quantum);
packet = (struct pm4_mes_runlist *)buffer;
@@ -202,6 +203,8 @@ static int pm_set_resources_v9(struct packet_manager *pm, uint32_t *buffer,
queue_type__mes_set_resources__hsa_interface_queue_hiq;
packet->bitfields2.vmid_mask = res->vmid_mask;
packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100;
+ if (pm->dqm->dev->adev->gmc.xnack_flags & AMDGPU_GMC_XNACK_FLAG_CHAIN)
+ packet->bitfields2.enb_xnack_retry_disable_check = 1;
packet->bitfields7.oac_mask = res->oac_mask;
packet->bitfields8.gds_heap_base = res->gds_heap_base;
packet->bitfields8.gds_heap_size = res->gds_heap_size;
@@ -237,7 +240,7 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
packet->bitfields2.engine_sel =
engine_sel__mes_map_queues__compute_vi;
- packet->bitfields2.gws_control_queue = q->gws ? 1 : 0;
+ packet->bitfields2.gws_control_queue = q->properties.is_gws ? 1 : 0;
packet->bitfields2.extended_engine_sel =
extended_engine_sel__mes_map_queues__legacy_engine_sel;
packet->bitfields2.queue_type =
@@ -249,10 +252,6 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
packet->bitfields2.queue_type =
queue_type__mes_map_queues__normal_latency_static_queue_vi;
break;
- case KFD_QUEUE_TYPE_DIQ:
- packet->bitfields2.queue_type =
- queue_type__mes_map_queues__debug_interface_queue_vi;
- break;
case KFD_QUEUE_TYPE_SDMA:
case KFD_QUEUE_TYPE_SDMA_XGMI:
if (q->properties.sdma_engine_id < 2 &&
@@ -297,23 +296,79 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
return 0;
}
-static int pm_set_grace_period_v9(struct packet_manager *pm,
+static inline void pm_build_dequeue_wait_counts_packet_info(struct packet_manager *pm,
+ uint32_t sch_value, uint32_t que_sleep, uint32_t *reg_offset,
+ uint32_t *reg_data)
+{
+ pm->dqm->dev->kfd2kgd->build_dequeue_wait_counts_packet_info(
+ pm->dqm->dev->adev,
+ pm->dqm->wait_times,
+ sch_value,
+ que_sleep,
+ reg_offset,
+ reg_data);
+}
+
+/* pm_config_dequeue_wait_counts_v9: Builds WRITE_DATA packet with
+ * register/value for configuring dequeue wait counts
+ *
+ * @return: -ve for failure and 0 for success and buffer is
+ * filled in with packet
+ *
+ **/
+static int pm_config_dequeue_wait_counts_v9(struct packet_manager *pm,
uint32_t *buffer,
- uint32_t grace_period)
+ enum kfd_config_dequeue_wait_counts_cmd cmd,
+ uint32_t value)
{
struct pm4_mec_write_data_mmio *packet;
uint32_t reg_offset = 0;
uint32_t reg_data = 0;
- pm->dqm->dev->kfd2kgd->build_grace_period_packet_info(
- pm->dqm->dev->adev,
- pm->dqm->wait_times,
- grace_period,
- &reg_offset,
- &reg_data);
+ switch (cmd) {
+ case KFD_DEQUEUE_WAIT_INIT: {
+ uint32_t sch_wave = 0, que_sleep = 1;
+
+ /* For all gfx9 ASICs > gfx941,
+ * Reduce CP_IQ_WAIT_TIME2.QUE_SLEEP to 0x1 from default 0x40.
+ * On a 1GHz machine this is roughly 1 microsecond, which is
+ * about how long it takes to load data out of memory during
+ * queue connect
+ * QUE_SLEEP: Wait Count for Dequeue Retry.
+ *
+ * Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU
+ */
+ if (KFD_GC_VERSION(pm->dqm->dev) < IP_VERSION(9, 4, 1) ||
+ KFD_GC_VERSION(pm->dqm->dev) >= IP_VERSION(10, 0, 0))
+ return -EPERM;
+
+ if (amdgpu_emu_mode == 0 && pm->dqm->dev->adev->gmc.is_app_apu &&
+ (KFD_GC_VERSION(pm->dqm->dev) == IP_VERSION(9, 4, 3)))
+ sch_wave = 1;
+
+ pm_build_dequeue_wait_counts_packet_info(pm, sch_wave, que_sleep,
+ &reg_offset, &reg_data);
- if (grace_period == USE_DEFAULT_GRACE_PERIOD)
- reg_data = pm->dqm->wait_times;
+ break;
+ }
+ case KFD_DEQUEUE_WAIT_RESET:
+ /* reg_data would be set to dqm->wait_times */
+ pm_build_dequeue_wait_counts_packet_info(pm, 0, 0, &reg_offset, &reg_data);
+ break;
+
+ case KFD_DEQUEUE_WAIT_SET_SCH_WAVE:
+ /* The CP cannot handle value 0 and it will result in
+ * an infinite grace period being set so set to 1 to prevent this. Also
+ * avoid debugger API breakage as it sets 0 and expects a low value.
+ */
+ if (!value)
+ value = 1;
+ pm_build_dequeue_wait_counts_packet_info(pm, value, 0, &reg_offset, &reg_data);
+ break;
+ default:
+ pr_err("Invalid dequeue wait cmd\n");
+ return -EINVAL;
+ }
packet = (struct pm4_mec_write_data_mmio *)buffer;
memset(buffer, 0, sizeof(struct pm4_mec_write_data_mmio));
@@ -415,7 +470,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = {
.set_resources = pm_set_resources_v9,
.map_queues = pm_map_queues_v9,
.unmap_queues = pm_unmap_queues_v9,
- .set_grace_period = pm_set_grace_period_v9,
+ .config_dequeue_wait_counts = pm_config_dequeue_wait_counts_v9,
.query_status = pm_query_status_v9,
.release_mem = NULL,
.map_process_size = sizeof(struct pm4_mes_map_process),
@@ -423,7 +478,7 @@ const struct packet_manager_funcs kfd_v9_pm_funcs = {
.set_resources_size = sizeof(struct pm4_mes_set_resources),
.map_queues_size = sizeof(struct pm4_mes_map_queues),
.unmap_queues_size = sizeof(struct pm4_mes_unmap_queues),
- .set_grace_period_size = sizeof(struct pm4_mec_write_data_mmio),
+ .config_dequeue_wait_counts_size = sizeof(struct pm4_mec_write_data_mmio),
.query_status_size = sizeof(struct pm4_mes_query_status),
.release_mem_size = 0,
};
@@ -434,7 +489,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = {
.set_resources = pm_set_resources_v9,
.map_queues = pm_map_queues_v9,
.unmap_queues = pm_unmap_queues_v9,
- .set_grace_period = pm_set_grace_period_v9,
+ .config_dequeue_wait_counts = pm_config_dequeue_wait_counts_v9,
.query_status = pm_query_status_v9,
.release_mem = NULL,
.map_process_size = sizeof(struct pm4_mes_map_process_aldebaran),
@@ -442,7 +497,7 @@ const struct packet_manager_funcs kfd_aldebaran_pm_funcs = {
.set_resources_size = sizeof(struct pm4_mes_set_resources),
.map_queues_size = sizeof(struct pm4_mes_map_queues),
.unmap_queues_size = sizeof(struct pm4_mes_unmap_queues),
- .set_grace_period_size = sizeof(struct pm4_mec_write_data_mmio),
+ .config_dequeue_wait_counts_size = sizeof(struct pm4_mec_write_data_mmio),
.query_status_size = sizeof(struct pm4_mes_query_status),
.release_mem_size = 0,
};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c
index c1199d06d131..1f0b8ef7c966 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c
@@ -42,6 +42,7 @@ unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size)
static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer,
struct qcm_process_device *qpd)
{
+ struct kfd_process_device *pdd = qpd_to_pdd(qpd);
struct pm4_mes_map_process *packet;
packet = (struct pm4_mes_map_process *)buffer;
@@ -52,7 +53,7 @@ static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer,
sizeof(struct pm4_mes_map_process));
packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
packet->bitfields2.process_quantum = 10;
- packet->bitfields2.pasid = qpd->pqm->process->pasid;
+ packet->bitfields2.pasid = pdd->pasid;
packet->bitfields3.page_table_base = qpd->page_table_base;
packet->bitfields10.gds_size = qpd->gds_size;
packet->bitfields10.num_gws = qpd->num_gws;
@@ -165,15 +166,10 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
packet->bitfields2.queue_type =
queue_type__mes_map_queues__normal_latency_static_queue_vi;
break;
- case KFD_QUEUE_TYPE_DIQ:
- packet->bitfields2.queue_type =
- queue_type__mes_map_queues__debug_interface_queue_vi;
- break;
case KFD_QUEUE_TYPE_SDMA:
case KFD_QUEUE_TYPE_SDMA_XGMI:
packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
engine_sel__mes_map_queues__sdma0_vi;
- use_static = false; /* no static queues under SDMA */
break;
default:
WARN(1, "queue type %d", q->properties.type);
@@ -303,7 +299,7 @@ const struct packet_manager_funcs kfd_vi_pm_funcs = {
.set_resources = pm_set_resources_vi,
.map_queues = pm_map_queues_vi,
.unmap_queues = pm_unmap_queues_vi,
- .set_grace_period = NULL,
+ .config_dequeue_wait_counts = NULL,
.query_status = pm_query_status_vi,
.release_mem = pm_release_mem_vi,
.map_process_size = sizeof(struct pm4_mes_map_process),
@@ -311,7 +307,7 @@ const struct packet_manager_funcs kfd_vi_pm_funcs = {
.set_resources_size = sizeof(struct pm4_mes_set_resources),
.map_queues_size = sizeof(struct pm4_mes_map_queues),
.unmap_queues_size = sizeof(struct pm4_mes_unmap_queues),
- .set_grace_period_size = 0,
+ .config_dequeue_wait_counts_size = 0,
.query_status_size = sizeof(struct pm4_mes_query_status),
.release_mem_size = sizeof(struct pm4_mec_release_mem)
};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
deleted file mode 100644
index e3b250918f39..000000000000
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright 2014-2022 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include <linux/types.h>
-#include "kfd_priv.h"
-#include "amdgpu_ids.h"
-
-static unsigned int pasid_bits = 16;
-static bool pasids_allocated; /* = false */
-
-bool kfd_set_pasid_limit(unsigned int new_limit)
-{
- if (new_limit < 2)
- return false;
-
- if (new_limit < (1U << pasid_bits)) {
- if (pasids_allocated)
- /* We've already allocated user PASIDs, too late to
- * change the limit
- */
- return false;
-
- while (new_limit < (1U << pasid_bits))
- pasid_bits--;
- }
-
- return true;
-}
-
-unsigned int kfd_get_pasid_limit(void)
-{
- return 1U << pasid_bits;
-}
-
-u32 kfd_pasid_alloc(void)
-{
- int r = amdgpu_pasid_alloc(pasid_bits);
-
- if (r > 0) {
- pasids_allocated = true;
- return r;
- }
-
- return 0;
-}
-
-void kfd_pasid_free(u32 pasid)
-{
- amdgpu_pasid_free(pasid);
-}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
index cd8611401a66..e356a207d03c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
@@ -63,7 +63,8 @@ struct pm4_mes_set_resources {
struct {
uint32_t vmid_mask:16;
uint32_t unmap_latency:8;
- uint32_t reserved1:5;
+ uint32_t reserved1:4;
+ uint32_t enb_xnack_retry_disable_check:1;
enum mes_set_resources_queue_type_enum queue_type:3;
} bitfields2;
uint32_t ordinal2;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d8cd913aa772..d5b07789eda4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -91,7 +91,7 @@
/* Macro for allocating structures */
#define kfd_alloc_struct(ptr_to_struct) \
- ((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL))
+ ((typeof(ptr_to_struct)) kzalloc_obj(*ptr_to_struct))
#define KFD_MAX_NUM_OF_PROCESSES 512
#define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024
@@ -102,8 +102,8 @@
* The first chunk is the TBA used for the CWSR ISA code. The second
* chunk is used as TMA for user-mode trap handler setup in daisy-chain mode.
*/
-#define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2)
-#define KFD_CWSR_TMA_OFFSET (PAGE_SIZE + 2048)
+#define KFD_CWSR_TBA_TMA_SIZE (AMDGPU_GPU_PAGE_SIZE * 2)
+#define KFD_CWSR_TMA_OFFSET (AMDGPU_GPU_PAGE_SIZE + 2048)
#define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE \
(KFD_MAX_NUM_OF_PROCESSES * \
@@ -111,7 +111,14 @@
#define KFD_KERNEL_QUEUE_SIZE 2048
-#define KFD_UNMAP_LATENCY_MS (4000)
+/* KFD_UNMAP_LATENCY_MS is the timeout CP waiting for SDMA preemption. One XCC
+ * can be associated to 2 SDMA engines. queue_preemption_timeout_ms is the time
+ * driver waiting for CP returning the UNMAP_QUEUE fence. Thus the math is
+ * queue_preemption_timeout_ms = sdma_preemption_time * 2 + cp workload
+ * The format here makes CP workload 10% of total timeout
+ */
+#define KFD_UNMAP_LATENCY_MS \
+ ((queue_preemption_timeout_ms - queue_preemption_timeout_ms / 10) >> 1)
#define KFD_MAX_SDMA_QUEUES 128
@@ -208,7 +215,8 @@ enum cache_policy {
((KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2)) || \
(KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) || \
(KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4)) || \
- (KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0)))
+ (KFD_GC_VERSION(dev) == IP_VERSION(9, 5, 0)) || \
+ (KFD_GC_VERSION(dev) == IP_VERSION(12, 1, 0)))
struct kfd_node;
@@ -234,7 +242,6 @@ struct kfd_device_info {
uint32_t no_atomic_fw_version;
unsigned int num_sdma_queues_per_engine;
unsigned int num_reserved_sdma_queues_per_engine;
- DECLARE_BITMAP(reserved_sdma_queues_bitmap, KFD_MAX_SDMA_QUEUES);
};
unsigned int kfd_get_num_sdma_engines(struct kfd_node *kdev);
@@ -245,7 +252,7 @@ struct kfd_mem_obj {
uint32_t range_end;
uint64_t gpu_addr;
uint32_t *cpu_ptr;
- void *gtt_mem;
+ void *mem;
};
struct kfd_vmid_info {
@@ -289,7 +296,6 @@ struct kfd_node {
/* Global GWS resource shared between processes */
void *gws;
- bool gws_debug_workaround;
/* Clients watching SMI events */
struct list_head smi_clients;
@@ -373,6 +379,11 @@ struct kfd_dev {
/* bitmap for dynamic doorbell allocation from doorbell object */
unsigned long *doorbell_bitmap;
+
+ /* for dynamic partitioning */
+ int kfd_dev_lock;
+
+ atomic_t kfd_processes_count;
};
enum kfd_mempool {
@@ -384,6 +395,7 @@ enum kfd_mempool {
/* Character device interface */
int kfd_chardev_init(void);
void kfd_chardev_exit(void);
+void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen);
/**
* enum kfd_unmap_queues_filter - Enum for queue filters.
@@ -423,7 +435,6 @@ enum kfd_queue_type {
KFD_QUEUE_TYPE_COMPUTE,
KFD_QUEUE_TYPE_SDMA,
KFD_QUEUE_TYPE_HIQ,
- KFD_QUEUE_TYPE_DIQ,
KFD_QUEUE_TYPE_SDMA_XGMI,
KFD_QUEUE_TYPE_SDMA_BY_ENG_ID
};
@@ -496,7 +507,8 @@ struct queue_properties {
enum kfd_queue_format format;
unsigned int queue_id;
uint64_t queue_address;
- uint64_t queue_size;
+ uint64_t queue_size;
+ uint64_t metadata_queue_size;
uint32_t priority;
uint32_t queue_percent;
void __user *read_ptr;
@@ -687,6 +699,7 @@ struct qcm_process_device {
uint32_t num_gws;
uint32_t num_oac;
uint32_t sh_hidden_private_base;
+ uint32_t vm_cntx_cntl;
/* CWSR memory */
struct kgd_mem *cwsr_mem;
@@ -851,6 +864,8 @@ struct kfd_process_device {
/* Tracks queue reset status */
bool has_reset_queue;
+
+ u32 pasid;
};
#define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd)
@@ -910,8 +925,6 @@ struct kfd_process {
/* We want to receive a notification when the mm_struct is destroyed */
struct mmu_notifier mmu_notifier;
- u32 pasid;
-
/*
* Array of kfd_process_device pointers,
* one for each device the process is using.
@@ -1007,9 +1020,19 @@ struct kfd_process {
/* if gpu page fault sent to KFD */
bool gpu_page_fault;
+
+ /*kfd context id */
+ u16 context_id;
+
+ /* The primary kfd_process allocating IDs for its secondary kfd_process, 0 for primary kfd_process */
+ struct ida id_table;
+
};
-#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
+#define KFD_PROCESS_TABLE_SIZE 8 /* bits: 256 entries */
+#define KFD_CONTEXT_ID_PRIMARY 0xFFFF
+#define KFD_CONTEXT_ID_MIN 0
+
extern DECLARE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE);
extern struct srcu_struct kfd_processes_srcu;
@@ -1025,22 +1048,28 @@ extern struct srcu_struct kfd_processes_srcu;
typedef int amdkfd_ioctl_t(struct file *filep, struct kfd_process *p,
void *data);
+typedef int amdkfd_ioctl_validate_t(void *kdata, unsigned int usize);
+
struct amdkfd_ioctl_desc {
unsigned int cmd;
int flags;
amdkfd_ioctl_t *func;
+ amdkfd_ioctl_validate_t *validate;
unsigned int cmd_drv;
const char *name;
};
bool kfd_dev_is_large_bar(struct kfd_node *dev);
+struct kfd_process *create_process(const struct task_struct *thread, bool primary);
int kfd_process_create_wq(void);
void kfd_process_destroy_wq(void);
void kfd_cleanup_processes(void);
struct kfd_process *kfd_create_process(struct task_struct *thread);
-struct kfd_process *kfd_get_process(const struct task_struct *task);
-struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
+int kfd_create_process_sysfs(struct kfd_process *process);
+struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid,
+ struct kfd_process_device **pdd);
struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
+struct kfd_process *kfd_lookup_process_by_id(const struct mm_struct *mm, u16 id);
int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, uint32_t gpu_id);
int kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node,
@@ -1078,6 +1107,7 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported);
int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process,
struct vm_area_struct *vma);
+void kfd_process_notifier_release_internal(struct kfd_process *p);
/* KFD process API for creating and translating handles */
int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd,
@@ -1091,8 +1121,6 @@ struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid);
/* PASIDs */
int kfd_pasid_init(void);
void kfd_pasid_exit(void);
-bool kfd_set_pasid_limit(unsigned int new_limit);
-unsigned int kfd_get_pasid_limit(void);
u32 kfd_pasid_alloc(void);
void kfd_pasid_free(u32 pasid);
@@ -1142,7 +1170,6 @@ struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
uint32_t proximity_domain);
struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
struct kfd_node *kfd_device_by_id(uint32_t gpu_id);
-struct kfd_node *kfd_device_by_pci_dev(const struct pci_dev *pdev);
static inline bool kfd_irq_is_from_node(struct kfd_node *node, uint32_t node_id,
uint32_t vmid)
{
@@ -1154,9 +1181,11 @@ static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev,
struct kfd_dev *dev = adev->kfd.dev;
uint32_t i;
- if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) &&
- KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 4) &&
- KFD_GC_VERSION(dev) != IP_VERSION(9, 5, 0))
+ /*
+ * On multi-aid system, attempt per-node matching. Otherwise,
+ * fall back to the first node.
+ */
+ if (!amdgpu_is_multi_aid(adev))
return dev->nodes[0];
for (i = 0; i < dev->num_nodes; i++)
@@ -1166,7 +1195,9 @@ static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev,
return NULL;
}
int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_node **kdev);
+uint32_t kfd_topology_get_num_devices(void);
int kfd_numa_node_to_apic_id(int numa_node_id);
+uint32_t kfd_gpu_node_num(void);
/* Interrupts */
#define KFD_IRQ_FENCE_CLIENTID 0xff
@@ -1332,12 +1363,14 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type,
struct kfd_node *dev);
struct mqd_manager *mqd_manager_init_v12(enum KFD_MQD_TYPE type,
struct kfd_node *dev);
+struct mqd_manager *mqd_manager_init_v12_1(enum KFD_MQD_TYPE type,
+ struct kfd_node *dev);
struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev);
void device_queue_manager_uninit(struct device_queue_manager *dqm);
struct kernel_queue *kernel_queue_init(struct kfd_node *dev,
enum kfd_queue_type type);
void kernel_queue_uninit(struct kernel_queue *kq);
-int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid);
+int kfd_evict_process_device(struct kfd_process_device *pdd);
int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id);
/* Process Queue Manager */
@@ -1366,8 +1399,6 @@ int pqm_update_mqd(struct process_queue_manager *pqm, unsigned int qid,
struct mqd_update_info *minfo);
int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
void *gws);
-struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
- unsigned int qid);
struct queue *pqm_get_user_queue(struct process_queue_manager *pqm,
unsigned int qid);
int pqm_get_wave_state(struct process_queue_manager *pqm,
@@ -1394,6 +1425,24 @@ int pqm_get_queue_checkpoint_info(struct process_queue_manager *pqm,
#define KFD_FENCE_COMPLETED (100)
#define KFD_FENCE_INIT (10)
+/**
+ * enum kfd_config_dequeue_wait_counts_cmd - Command for configuring
+ * dequeue wait counts.
+ *
+ * @KFD_DEQUEUE_WAIT_INIT: Set optimized dequeue wait counts for a
+ * certain ASICs. For these ASICs, this is default value used by RESET
+ * @KFD_DEQUEUE_WAIT_RESET: Reset dequeue wait counts to the optimized value
+ * for certain ASICs. For others set it to default hardware reset value
+ * @KFD_DEQUEUE_WAIT_SET_SCH_WAVE: Set context switch latency wait
+ *
+ */
+enum kfd_config_dequeue_wait_counts_cmd {
+ KFD_DEQUEUE_WAIT_INIT = 1,
+ KFD_DEQUEUE_WAIT_RESET = 2,
+ KFD_DEQUEUE_WAIT_SET_SCH_WAVE = 3
+};
+
+
struct packet_manager {
struct device_queue_manager *dqm;
struct kernel_queue *priv_queue;
@@ -1419,8 +1468,8 @@ struct packet_manager_funcs {
int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer,
enum kfd_unmap_queues_filter mode,
uint32_t filter_param, bool reset);
- int (*set_grace_period)(struct packet_manager *pm, uint32_t *buffer,
- uint32_t grace_period);
+ int (*config_dequeue_wait_counts)(struct packet_manager *pm, uint32_t *buffer,
+ enum kfd_config_dequeue_wait_counts_cmd cmd, uint32_t value);
int (*query_status)(struct packet_manager *pm, uint32_t *buffer,
uint64_t fence_address, uint64_t fence_value);
int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer);
@@ -1431,7 +1480,7 @@ struct packet_manager_funcs {
int set_resources_size;
int map_queues_size;
int unmap_queues_size;
- int set_grace_period_size;
+ int config_dequeue_wait_counts_size;
int query_status_size;
int release_mem_size;
};
@@ -1454,7 +1503,9 @@ int pm_send_unmap_queue(struct packet_manager *pm,
void pm_release_ib(struct packet_manager *pm);
-int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period);
+int pm_config_dequeue_wait_counts(struct packet_manager *pm,
+ enum kfd_config_dequeue_wait_counts_cmd cmd,
+ uint32_t wait_counts_config);
/* Following PM funcs can be shared among VI and AI */
unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size);
@@ -1467,6 +1518,7 @@ extern const struct kfd_event_interrupt_class event_interrupt_class_v9;
extern const struct kfd_event_interrupt_class event_interrupt_class_v9_4_3;
extern const struct kfd_event_interrupt_class event_interrupt_class_v10;
extern const struct kfd_event_interrupt_class event_interrupt_class_v11;
+extern const struct kfd_event_interrupt_class event_interrupt_class_v12_1;
extern const struct kfd_device_global_init_class device_global_init_class_cik;
@@ -1478,7 +1530,7 @@ int kfd_wait_on_events(struct kfd_process *p,
bool all, uint32_t *user_timeout_ms,
uint32_t *wait_result);
void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
- uint32_t valid_id_bits);
+ uint32_t valid_id_bits, bool signal_mailbox_updated);
void kfd_signal_hw_exception_event(u32 pasid);
int kfd_set_event(struct kfd_process *p, uint32_t event_id);
int kfd_reset_event(struct kfd_process *p, uint32_t event_id);
@@ -1492,21 +1544,24 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
int kfd_get_num_events(struct kfd_process *p);
int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
-void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
+void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va);
+
+void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
struct kfd_vm_fault_info *info,
struct kfd_hsa_memory_exception_data *data);
void kfd_signal_reset_event(struct kfd_node *dev);
void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid);
+void kfd_signal_process_terminate_event(struct kfd_process *p);
-static inline void kfd_flush_tlb(struct kfd_process_device *pdd,
- enum TLB_FLUSH_TYPE type)
+static inline void kfd_flush_tlb(struct kfd_process_device *pdd)
{
struct amdgpu_device *adev = pdd->dev->adev;
struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv);
- amdgpu_vm_flush_compute_tlb(adev, vm, type, pdd->dev->xcc_mask);
+ amdgpu_vm_flush_compute_tlb(adev, vm, TLB_FLUSH_HEAVYWEIGHT,
+ pdd->dev->xcc_mask);
}
static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev)
@@ -1519,7 +1574,7 @@ static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev)
int kfd_send_exception_to_runtime(struct kfd_process *p,
unsigned int queue_id,
uint64_t error_reason);
-bool kfd_is_locked(void);
+bool kfd_is_locked(struct kfd_dev *kfd);
/* Compute profile */
void kfd_inc_compute_active(struct kfd_node *dev);
@@ -1566,10 +1621,15 @@ int kfd_debugfs_hang_hws(struct kfd_node *dev);
int pm_debugfs_hang_hws(struct packet_manager *pm);
int dqm_debugfs_hang_hws(struct device_queue_manager *dqm);
+void kfd_debugfs_add_process(struct kfd_process *p);
+void kfd_debugfs_remove_process(struct kfd_process *p);
+
#else
static inline void kfd_debugfs_init(void) {}
static inline void kfd_debugfs_fini(void) {}
+static inline void kfd_debugfs_add_process(struct kfd_process *p) {}
+static inline void kfd_debugfs_remove_process(struct kfd_process *p) {}
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 083f83c94531..d28ca581cad0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -35,6 +35,7 @@
#include <linux/pm_runtime.h>
#include "amdgpu_amdkfd.h"
#include "amdgpu.h"
+#include "amdgpu_reset.h"
struct mm_struct;
@@ -67,7 +68,6 @@ static struct workqueue_struct *kfd_restore_wq;
static struct kfd_process *find_process(const struct task_struct *thread,
bool ref);
static void kfd_process_ref_release(struct kref *ref);
-static struct kfd_process *create_process(const struct task_struct *thread);
static void evict_process_worker(struct work_struct *work);
static void restore_process_worker(struct work_struct *work);
@@ -153,7 +153,7 @@ static void kfd_sdma_activity_worker(struct work_struct *work)
(q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI))
continue;
- sdma_q = kzalloc(sizeof(struct temp_sdma_queue_list), GFP_KERNEL);
+ sdma_q = kzalloc_obj(struct temp_sdma_queue_list);
if (!sdma_q) {
dqm_unlock(dqm);
goto cleanup;
@@ -282,8 +282,8 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
cu_cnt = 0;
proc = pdd->process;
if (pdd->qpd.queue_count == 0) {
- pr_debug("Gpu-Id: %d has no active queues for process %d\n",
- dev->id, proc->pasid);
+ pr_debug("Gpu-Id: %d has no active queues for process pid %d\n",
+ dev->id, (int)proc->lead_thread->pid);
return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt);
}
@@ -291,7 +291,7 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
wave_cnt = 0;
max_waves_per_cu = 0;
- cu_occupancy = kcalloc(AMDGPU_MAX_QUEUES, sizeof(*cu_occupancy), GFP_KERNEL);
+ cu_occupancy = kzalloc_objs(*cu_occupancy, AMDGPU_MAX_QUEUES);
if (!cu_occupancy)
return -ENOMEM;
@@ -327,12 +327,9 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr,
char *buffer)
{
- if (strcmp(attr->name, "pasid") == 0) {
- struct kfd_process *p = container_of(attr, struct kfd_process,
- attr_pasid);
-
- return snprintf(buffer, PAGE_SIZE, "%d\n", p->pasid);
- } else if (strncmp(attr->name, "vram_", 5) == 0) {
+ if (strcmp(attr->name, "pasid") == 0)
+ return snprintf(buffer, PAGE_SIZE, "%d\n", 0);
+ else if (strncmp(attr->name, "vram_", 5) == 0) {
struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device,
attr_vram);
return snprintf(buffer, PAGE_SIZE, "%llu\n", atomic64_read(&pdd->vram_usage));
@@ -588,7 +585,7 @@ static void kfd_procfs_add_sysfs_stats(struct kfd_process *p)
ret = kobject_init_and_add(pdd->kobj_stats,
&procfs_stats_type,
p->kobj,
- stats_dir_filename);
+ "%s", stats_dir_filename);
if (ret) {
pr_warn("Creating KFD proc/stats_%s folder failed",
@@ -635,7 +632,7 @@ static void kfd_procfs_add_sysfs_counters(struct kfd_process *p)
return;
ret = kobject_init_and_add(kobj_counters, &sysfs_counters_type,
- p->kobj, counters_dir_filename);
+ p->kobj, "%s", counters_dir_filename);
if (ret) {
pr_warn("Creating KFD proc/%s folder failed",
counters_dir_filename);
@@ -682,7 +679,7 @@ static void kfd_procfs_add_sysfs_files(struct kfd_process *p)
void kfd_procfs_del_queue(struct queue *q)
{
- if (!q)
+ if (!q || !q->process->kobj)
return;
kobject_del(&q->kobj);
@@ -692,7 +689,8 @@ void kfd_procfs_del_queue(struct queue *q)
int kfd_process_create_wq(void)
{
if (!kfd_process_wq)
- kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
+ kfd_process_wq = alloc_workqueue("kfd_process_wq", WQ_UNBOUND,
+ 0);
if (!kfd_restore_wq)
kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq",
WQ_FREEZABLE);
@@ -827,6 +825,103 @@ static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
kfd_process_free_gpuvm(qpd->ib_mem, pdd, &qpd->ib_kaddr);
}
+int kfd_create_process_sysfs(struct kfd_process *process)
+{
+ struct kfd_process *primary_process;
+ int ret;
+
+ if (process->kobj) {
+ pr_warn("kobject already exists for the kfd_process\n");
+ return -EINVAL;
+ }
+
+ process->kobj = kfd_alloc_struct(process->kobj);
+ if (!process->kobj) {
+ pr_warn("Creating procfs kobject failed");
+ return -ENOMEM;
+ }
+
+ if (process->context_id == KFD_CONTEXT_ID_PRIMARY)
+ ret = kobject_init_and_add(process->kobj, &procfs_type,
+ procfs.kobj, "%d",
+ (int)process->lead_thread->pid);
+ else {
+ primary_process = kfd_lookup_process_by_mm(process->lead_thread->mm);
+ if (!primary_process)
+ return -ESRCH;
+
+ ret = kobject_init_and_add(process->kobj, &procfs_type,
+ primary_process->kobj, "context_%u",
+ process->context_id);
+ kfd_unref_process(primary_process);
+ }
+
+ if (ret) {
+ pr_warn("Creating procfs pid directory failed");
+ kobject_put(process->kobj);
+ process->kobj = NULL;
+ return ret;
+ }
+
+ kfd_sysfs_create_file(process->kobj, &process->attr_pasid,
+ "pasid");
+
+ process->kobj_queues = kobject_create_and_add("queues",
+ process->kobj);
+ if (!process->kobj_queues)
+ pr_warn("Creating KFD proc/queues folder failed");
+
+ kfd_procfs_add_sysfs_stats(process);
+ kfd_procfs_add_sysfs_files(process);
+ kfd_procfs_add_sysfs_counters(process);
+
+ return 0;
+}
+
+static int kfd_process_alloc_id(struct kfd_process *process)
+{
+ int ret;
+ struct kfd_process *primary_process;
+
+ /* already assign 0xFFFF when create */
+ if (process->context_id == KFD_CONTEXT_ID_PRIMARY)
+ return 0;
+
+ primary_process = kfd_lookup_process_by_mm(process->lead_thread->mm);
+ if (!primary_process)
+ return -ESRCH;
+
+ /* id range: KFD_CONTEXT_ID_MIN to 0xFFFE */
+ ret = ida_alloc_range(&primary_process->id_table, KFD_CONTEXT_ID_MIN,
+ KFD_CONTEXT_ID_PRIMARY - 1, GFP_KERNEL);
+ if (ret < 0)
+ goto out;
+
+ process->context_id = ret;
+ ret = 0;
+
+out:
+ kfd_unref_process(primary_process);
+
+ return ret;
+}
+
+static void kfd_process_free_id(struct kfd_process *process)
+{
+ struct kfd_process *primary_process;
+
+ if (process->context_id != KFD_CONTEXT_ID_PRIMARY)
+ return;
+
+ primary_process = kfd_lookup_process_by_mm(process->lead_thread->mm);
+ if (!primary_process)
+ return;
+
+ ida_free(&primary_process->id_table, process->context_id);
+
+ kfd_unref_process(primary_process);
+}
+
struct kfd_process *kfd_create_process(struct task_struct *thread)
{
struct kfd_process *process;
@@ -835,11 +930,13 @@ struct kfd_process *kfd_create_process(struct task_struct *thread)
if (!(thread->mm && mmget_not_zero(thread->mm)))
return ERR_PTR(-EINVAL);
- /* Only the pthreads threading model is supported. */
- if (thread->group_leader->mm != thread->mm) {
- mmput(thread->mm);
- return ERR_PTR(-EINVAL);
- }
+ /* If the process just called exec(3), it is possible that the
+ * cleanup of the kfd_process (following the release of the mm
+ * of the old process image) is still in the cleanup work queue.
+ * Make sure to drain any job before trying to recreate any
+ * resource for this process.
+ */
+ flush_workqueue(kfd_process_wq);
/*
* take kfd processes mutex before starting of process creation
@@ -848,7 +945,13 @@ struct kfd_process *kfd_create_process(struct task_struct *thread)
*/
mutex_lock(&kfd_processes_mutex);
- if (kfd_is_locked()) {
+ if (kfd_gpu_node_num() <= 0) {
+ pr_warn("no gpu node! Cannot create KFD process");
+ process = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ if (kfd_is_locked(NULL)) {
pr_debug("KFD is locked! Cannot create process");
process = ERR_PTR(-EINVAL);
goto out;
@@ -861,46 +964,18 @@ struct kfd_process *kfd_create_process(struct task_struct *thread)
if (process) {
pr_debug("Process already found\n");
} else {
- /* If the process just called exec(3), it is possible that the
- * cleanup of the kfd_process (following the release of the mm
- * of the old process image) is still in the cleanup work queue.
- * Make sure to drain any job before trying to recreate any
- * resource for this process.
- */
- flush_workqueue(kfd_process_wq);
-
- process = create_process(thread);
+ process = create_process(thread, true);
if (IS_ERR(process))
goto out;
if (!procfs.kobj)
goto out;
- process->kobj = kfd_alloc_struct(process->kobj);
- if (!process->kobj) {
- pr_warn("Creating procfs kobject failed");
- goto out;
- }
- ret = kobject_init_and_add(process->kobj, &procfs_type,
- procfs.kobj, "%d",
- (int)process->lead_thread->pid);
- if (ret) {
- pr_warn("Creating procfs pid directory failed");
- kobject_put(process->kobj);
- goto out;
- }
-
- kfd_sysfs_create_file(process->kobj, &process->attr_pasid,
- "pasid");
-
- process->kobj_queues = kobject_create_and_add("queues",
- process->kobj);
- if (!process->kobj_queues)
- pr_warn("Creating KFD proc/queues folder failed");
+ ret = kfd_create_process_sysfs(process);
+ if (ret)
+ pr_warn("Failed to create sysfs entry for the kfd_process");
- kfd_procfs_add_sysfs_stats(process);
- kfd_procfs_add_sysfs_files(process);
- kfd_procfs_add_sysfs_counters(process);
+ kfd_debugfs_add_process(process);
init_waitqueue_head(&process->wait_irq_drain);
}
@@ -911,31 +986,13 @@ out:
return process;
}
-struct kfd_process *kfd_get_process(const struct task_struct *thread)
-{
- struct kfd_process *process;
-
- if (!thread->mm)
- return ERR_PTR(-EINVAL);
-
- /* Only the pthreads threading model is supported. */
- if (thread->group_leader->mm != thread->mm)
- return ERR_PTR(-EINVAL);
-
- process = find_process(thread, false);
- if (!process)
- return ERR_PTR(-EINVAL);
-
- return process;
-}
-
static struct kfd_process *find_process_by_mm(const struct mm_struct *mm)
{
struct kfd_process *process;
hash_for_each_possible_rcu(kfd_processes_table, process,
kfd_processes, (uintptr_t)mm)
- if (process->mm == mm)
+ if (process->mm == mm && process->context_id == KFD_CONTEXT_ID_PRIMARY)
return process;
return NULL;
@@ -1056,17 +1113,15 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i];
- pr_debug("Releasing pdd (topology id %d) for process (pasid 0x%x)\n",
- pdd->dev->id, p->pasid);
+ kfd_smi_event_process(pdd, false);
+ pr_debug("Releasing pdd (topology id %d, for pid %d)\n",
+ pdd->dev->id, p->lead_thread->pid);
kfd_process_device_destroy_cwsr_dgpu(pdd);
kfd_process_device_destroy_ib_mem(pdd);
- if (pdd->drm_file) {
- amdgpu_amdkfd_gpuvm_release_process_vm(
- pdd->dev->adev, pdd->drm_priv);
+ if (pdd->drm_file)
fput(pdd->drm_file);
- }
if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
@@ -1078,18 +1133,19 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
if (pdd->dev->kfd->shared_resources.enable_mes &&
pdd->proc_ctx_cpu_ptr)
- amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev,
+ amdgpu_amdkfd_free_kernel_mem(pdd->dev->adev,
&pdd->proc_ctx_bo);
/*
* before destroying pdd, make sure to report availability
* for auto suspend
*/
if (pdd->runtime_inuse) {
- pm_runtime_mark_last_busy(adev_to_drm(pdd->dev->adev)->dev);
pm_runtime_put_autosuspend(adev_to_drm(pdd->dev->adev)->dev);
pdd->runtime_inuse = false;
}
+ atomic_dec(&pdd->dev->kfd->kfd_processes_count);
+
kfree(pdd);
p->pdds[i] = NULL;
}
@@ -1140,6 +1196,17 @@ static void kfd_process_remove_sysfs(struct kfd_process *p)
p->kobj = NULL;
}
+/*
+ * If any GPU is ongoing reset, wait for reset complete.
+ */
+static void kfd_process_wait_gpu_reset_complete(struct kfd_process *p)
+{
+ int i;
+
+ for (i = 0; i < p->n_pdds; i++)
+ flush_workqueue(p->pdds[i]->dev->adev->reset_domain->wq);
+}
+
/* No process locking is needed in this function, because the process
* is not findable any more. We must assume that no other thread is
* using it any more, otherwise we couldn't safely free the process
@@ -1151,8 +1218,10 @@ static void kfd_process_wq_release(struct work_struct *work)
release_work);
struct dma_fence *ef;
- kfd_process_dequeue_from_all_devices(p);
- pqm_uninit(&p->pqm);
+ /*
+ * If GPU in reset, user queues may still running, wait for reset complete.
+ */
+ kfd_process_wait_gpu_reset_complete(p);
/* Signal the eviction fence after user mode queues are
* destroyed. This allows any BOs to be freed without
@@ -1163,7 +1232,12 @@ static void kfd_process_wq_release(struct work_struct *work)
if (ef)
dma_fence_signal(ef);
- kfd_process_remove_sysfs(p);
+ if (p->context_id != KFD_CONTEXT_ID_PRIMARY)
+ kfd_process_free_id(p);
+ else
+ ida_destroy(&p->id_table);
+
+ kfd_debugfs_remove_process(p);
kfd_process_kunmap_signal_bo(p);
kfd_process_free_outstanding_kfd_bos(p);
@@ -1174,11 +1248,15 @@ static void kfd_process_wq_release(struct work_struct *work)
kfd_event_free_process(p);
- kfd_pasid_free(p->pasid);
mutex_destroy(&p->mutex);
put_task_struct(p->lead_thread);
+ /* the last step is removing process entries under /sys
+ * to indicate the process has been terminated.
+ */
+ kfd_process_remove_sysfs(p);
+
kfree(p);
}
@@ -1203,13 +1281,41 @@ static void kfd_process_free_notifier(struct mmu_notifier *mn)
kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier));
}
-static void kfd_process_notifier_release_internal(struct kfd_process *p)
+static void kfd_process_table_remove(struct kfd_process *p)
+{
+ mutex_lock(&kfd_processes_mutex);
+ /*
+ * Do early return if table is empty.
+ *
+ * This could potentially happen if this function is called concurrently
+ * by mmu_notifier and by kfd_cleanup_pocesses.
+ *
+ */
+ if (hash_empty(kfd_processes_table)) {
+ mutex_unlock(&kfd_processes_mutex);
+ return;
+ }
+ hash_del_rcu(&p->kfd_processes);
+ mutex_unlock(&kfd_processes_mutex);
+ synchronize_srcu(&kfd_processes_srcu);
+}
+
+void kfd_process_notifier_release_internal(struct kfd_process *p)
{
int i;
+ kfd_process_table_remove(p);
cancel_delayed_work_sync(&p->eviction_work);
cancel_delayed_work_sync(&p->restore_work);
+ /*
+ * Dequeue and destroy user queues, it is not safe for GPU to access
+ * system memory after mmu release notifier callback returns because
+ * exit_mmap free process memory afterwards.
+ */
+ kfd_process_dequeue_from_all_devices(p);
+ pqm_uninit(&p->pqm);
+
for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i];
@@ -1240,7 +1346,8 @@ static void kfd_process_notifier_release_internal(struct kfd_process *p)
srcu_read_unlock(&kfd_processes_srcu, idx);
}
- mmu_notifier_put(&p->mmu_notifier);
+ if (p->context_id == KFD_CONTEXT_ID_PRIMARY)
+ mmu_notifier_put(&p->mmu_notifier);
}
static void kfd_process_notifier_release(struct mmu_notifier *mn,
@@ -1256,22 +1363,6 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
if (WARN_ON(p->mm != mm))
return;
- mutex_lock(&kfd_processes_mutex);
- /*
- * Do early return if table is empty.
- *
- * This could potentially happen if this function is called concurrently
- * by mmu_notifier and by kfd_cleanup_pocesses.
- *
- */
- if (hash_empty(kfd_processes_table)) {
- mutex_unlock(&kfd_processes_mutex);
- return;
- }
- hash_del_rcu(&p->kfd_processes);
- mutex_unlock(&kfd_processes_mutex);
- synchronize_srcu(&kfd_processes_srcu);
-
kfd_process_notifier_release_internal(p);
}
@@ -1472,7 +1563,8 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
* management and memory-manager-related preemptions or
* even deadlocks.
*/
- if (KFD_GC_VERSION(dev) >= IP_VERSION(10, 1, 1))
+ if (KFD_GC_VERSION(dev) >= IP_VERSION(10, 1, 1) &&
+ KFD_GC_VERSION(dev) < IP_VERSION(12, 1, 0))
return false;
if (dev->kfd->noretry)
@@ -1496,13 +1588,13 @@ void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
* On return the kfd_process is fully operational and will be freed when the
* mm is released
*/
-static struct kfd_process *create_process(const struct task_struct *thread)
+struct kfd_process *create_process(const struct task_struct *thread, bool primary)
{
struct kfd_process *process;
struct mmu_notifier *mn;
int err = -ENOMEM;
- process = kzalloc(sizeof(*process), GFP_KERNEL);
+ process = kzalloc_obj(*process);
if (!process)
goto err_alloc_process;
@@ -1512,6 +1604,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
process->lead_thread = thread->group_leader;
process->n_pdds = 0;
process->queues_paused = false;
+
INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
process->last_restore_timestamp = get_jiffies_64();
@@ -1525,12 +1618,6 @@ static struct kfd_process *create_process(const struct task_struct *thread)
atomic_set(&process->debugged_process_count, 0);
sema_init(&process->runtime_enable_sema, 0);
- process->pasid = kfd_pasid_alloc();
- if (process->pasid == 0) {
- err = -ENOSPC;
- goto err_alloc_pasid;
- }
-
err = pqm_init(&process->pqm, process);
if (err != 0)
goto err_process_pqm_init;
@@ -1561,12 +1648,22 @@ static struct kfd_process *create_process(const struct task_struct *thread)
* After this point, mmu_notifier_put will trigger the cleanup by
* dropping the last process reference in the free_notifier.
*/
- mn = mmu_notifier_get(&kfd_process_mmu_notifier_ops, process->mm);
- if (IS_ERR(mn)) {
- err = PTR_ERR(mn);
- goto err_register_notifier;
+ if (primary) {
+ process->context_id = KFD_CONTEXT_ID_PRIMARY;
+ mn = mmu_notifier_get(&kfd_process_mmu_notifier_ops, process->mm);
+ if (IS_ERR(mn)) {
+ err = PTR_ERR(mn);
+ goto err_register_notifier;
+ }
+ BUG_ON(mn != &process->mmu_notifier);
+ ida_init(&process->id_table);
+ }
+
+ err = kfd_process_alloc_id(process);
+ if (err) {
+ pr_err("Creating kfd process: failed to alloc an id\n");
+ goto err_alloc_id;
}
- BUG_ON(mn != &process->mmu_notifier);
kfd_unref_process(process);
get_task_struct(process->lead_thread);
@@ -1575,6 +1672,8 @@ static struct kfd_process *create_process(const struct task_struct *thread)
return process;
+err_alloc_id:
+ kfd_process_free_id(process);
err_register_notifier:
hash_del_rcu(&process->kfd_processes);
svm_range_list_fini(process);
@@ -1584,8 +1683,6 @@ err_init_svm_range_list:
err_init_apertures:
pqm_uninit(&process->pqm);
err_process_pqm_init:
- kfd_pasid_free(process->pasid);
-err_alloc_pasid:
kfd_event_free_process(process);
err_event_init:
mutex_destroy(&process->mutex);
@@ -1613,7 +1710,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
return NULL;
- pdd = kzalloc(sizeof(*pdd), GFP_KERNEL);
+ pdd = kzalloc_obj(*pdd);
if (!pdd)
return NULL;
@@ -1643,6 +1740,8 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
/* Init idr used for memory handle translation */
idr_init(&pdd->alloc_idr);
+ atomic_inc(&dev->kfd->kfd_processes_count);
+
return pdd;
}
@@ -1670,9 +1769,6 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
struct kfd_node *dev;
int ret;
- if (!drm_file)
- return -EINVAL;
-
if (pdd->drm_priv)
return -EBUSY;
@@ -1704,15 +1800,21 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
if (ret)
goto err_init_cwsr;
- ret = amdgpu_amdkfd_gpuvm_set_vm_pasid(dev->adev, avm, p->pasid);
- if (ret)
- goto err_set_pasid;
+ if (unlikely(!avm->pasid)) {
+ dev_warn(pdd->dev->adev->dev, "WARN: vm %p has no pasid associated",
+ avm);
+ ret = -EINVAL;
+ goto err_get_pasid;
+ }
+ pdd->pasid = avm->pasid;
pdd->drm_file = drm_file;
+ kfd_smi_event_process(pdd, true);
+
return 0;
-err_set_pasid:
+err_get_pasid:
kfd_process_device_destroy_cwsr_dgpu(pdd);
err_init_cwsr:
kfd_process_device_destroy_ib_mem(pdd);
@@ -1798,25 +1900,50 @@ void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd,
idr_remove(&pdd->alloc_idr, handle);
}
-/* This increments the process->ref counter. */
-struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid)
+static struct kfd_process_device *kfd_lookup_process_device_by_pasid(u32 pasid)
{
- struct kfd_process *p, *ret_p = NULL;
+ struct kfd_process_device *ret_p = NULL;
+ struct kfd_process *p;
unsigned int temp;
-
- int idx = srcu_read_lock(&kfd_processes_srcu);
+ int i;
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
- if (p->pasid == pasid) {
- kref_get(&p->ref);
- ret_p = p;
- break;
+ for (i = 0; i < p->n_pdds; i++) {
+ if (p->pdds[i]->pasid == pasid) {
+ ret_p = p->pdds[i];
+ break;
+ }
}
+ if (ret_p)
+ break;
+ }
+ return ret_p;
+}
+
+/* This increments the process->ref counter. */
+struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid,
+ struct kfd_process_device **pdd)
+{
+ struct kfd_process_device *ret_p;
+
+ int idx = srcu_read_lock(&kfd_processes_srcu);
+
+ ret_p = kfd_lookup_process_device_by_pasid(pasid);
+ if (ret_p) {
+ if (pdd)
+ *pdd = ret_p;
+ kref_get(&ret_p->process->ref);
+
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+ return ret_p->process;
}
srcu_read_unlock(&kfd_processes_srcu, idx);
- return ret_p;
+ if (pdd)
+ *pdd = NULL;
+
+ return NULL;
}
/* This increments the process->ref counter. */
@@ -1835,6 +1962,27 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
return p;
}
+/* This increments the process->ref counter. */
+struct kfd_process *kfd_lookup_process_by_id(const struct mm_struct *mm, u16 id)
+{
+ struct kfd_process *p, *ret_p = NULL;
+ unsigned int temp;
+
+ int idx = srcu_read_lock(&kfd_processes_srcu);
+
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+ if (p->mm == mm && p->context_id == id) {
+ kref_get(&p->ref);
+ ret_p = p;
+ break;
+ }
+ }
+
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+
+ return ret_p;
+}
+
/* kfd_process_evict_queues - Evict all user queues of a process
*
* Eviction is reference-counted per process-device. This means multiple
@@ -1942,18 +2090,18 @@ kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node,
return -EINVAL;
}
-static int signal_eviction_fence(struct kfd_process *p)
+static bool signal_eviction_fence(struct kfd_process *p)
{
struct dma_fence *ef;
- int ret;
+ bool ret;
rcu_read_lock();
ef = dma_fence_get_rcu_safe(&p->ef);
rcu_read_unlock();
if (!ef)
- return -EINVAL;
+ return true;
- ret = dma_fence_signal(ef);
+ ret = dma_fence_check_and_signal(ef);
dma_fence_put(ef);
return ret;
@@ -1972,7 +2120,7 @@ static void evict_process_worker(struct work_struct *work)
*/
p = container_of(dwork, struct kfd_process, eviction_work);
- pr_debug("Started evicting pasid 0x%x\n", p->pasid);
+ pr_debug("Started evicting process pid %d\n", p->lead_thread->pid);
ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
if (!ret) {
/* If another thread already signaled the eviction fence,
@@ -1984,9 +2132,9 @@ static void evict_process_worker(struct work_struct *work)
msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
kfd_process_restore_queues(p);
- pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
+ pr_debug("Finished evicting process pid %d\n", p->lead_thread->pid);
} else
- pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
+ pr_err("Failed to evict queues of process pid %d\n", p->lead_thread->pid);
}
static int restore_process_helper(struct kfd_process *p)
@@ -2003,9 +2151,11 @@ static int restore_process_helper(struct kfd_process *p)
ret = kfd_process_restore_queues(p);
if (!ret)
- pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
+ pr_debug("Finished restoring process pid %d\n",
+ p->lead_thread->pid);
else
- pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
+ pr_err("Failed to restore queues of process pid %d\n",
+ p->lead_thread->pid);
return ret;
}
@@ -2022,7 +2172,7 @@ static void restore_process_worker(struct work_struct *work)
* lifetime of this thread, kfd_process p will be valid
*/
p = container_of(dwork, struct kfd_process, restore_work);
- pr_debug("Started restoring pasid 0x%x\n", p->pasid);
+ pr_debug("Started restoring process pasid %d\n", (int)p->lead_thread->pid);
/* Setting last_restore_timestamp before successful restoration.
* Otherwise this would have to be set by KGD (restore_process_bos)
@@ -2038,8 +2188,8 @@ static void restore_process_worker(struct work_struct *work)
ret = restore_process_helper(p);
if (ret) {
- pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
- p->pasid, PROCESS_BACK_OFF_TIME_MS);
+ pr_debug("Failed to restore BOs of process pid %d, retry after %d ms\n",
+ p->lead_thread->pid, PROCESS_BACK_OFF_TIME_MS);
if (mod_delayed_work(kfd_restore_wq, &p->restore_work,
msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
kfd_process_restore_queues(p);
@@ -2055,7 +2205,7 @@ void kfd_suspend_all_processes(void)
WARN(debug_evictions, "Evicting all processes");
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
- pr_err("Failed to suspend process 0x%x\n", p->pasid);
+ pr_err("Failed to suspend process pid %d\n", p->lead_thread->pid);
signal_eviction_fence(p);
}
srcu_read_unlock(&kfd_processes_srcu, idx);
@@ -2069,8 +2219,8 @@ int kfd_resume_all_processes(void)
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
if (restore_process_helper(p)) {
- pr_err("Restore process %d failed during resume\n",
- p->pasid);
+ pr_err("Restore process pid %d failed during resume\n",
+ p->lead_thread->pid);
ret = -EFAULT;
}
}
@@ -2125,14 +2275,15 @@ int kfd_process_drain_interrupts(struct kfd_process_device *pdd)
memset(irq_drain_fence, 0, sizeof(irq_drain_fence));
irq_drain_fence[0] = (KFD_IRQ_FENCE_SOURCEID << 8) |
KFD_IRQ_FENCE_CLIENTID;
- irq_drain_fence[3] = pdd->process->pasid;
+ irq_drain_fence[3] = pdd->pasid;
/*
* For GFX 9.4.3/9.5.0, send the NodeId also in IH cookie DW[3]
*/
if (KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 3) ||
KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 4) ||
- KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 5, 0)) {
+ KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 5, 0) ||
+ KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(12, 1, 0)) {
node_id = ffs(pdd->dev->interrupt_bitmap) - 1;
irq_drain_fence[3] |= node_id << 16;
}
@@ -2156,7 +2307,7 @@ void kfd_process_close_interrupt_drain(unsigned int pasid)
{
struct kfd_process *p;
- p = kfd_lookup_process_by_pasid(pasid);
+ p = kfd_lookup_process_by_pasid(pasid, NULL);
if (!p)
return;
@@ -2277,8 +2428,8 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
int idx = srcu_read_lock(&kfd_processes_srcu);
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
- seq_printf(m, "Process %d PASID 0x%x:\n",
- p->lead_thread->tgid, p->pasid);
+ seq_printf(m, "Process %d PASID %d:\n",
+ p->lead_thread->tgid, p->lead_thread->pid);
mutex_lock(&p->mutex);
r = pqm_debugfs_mqds(m, &p->pqm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index bd36a75309e1..44e39ce222b7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -69,8 +69,8 @@ static int find_available_queue_slot(struct process_queue_manager *pqm,
pr_debug("The new slot id %lu\n", found);
if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) {
- pr_info("Cannot open more queues for process with pasid 0x%x\n",
- pqm->process->pasid);
+ pr_info("Cannot open more queues for process with pid %d\n",
+ pqm->process->lead_thread->pid);
return -ENOMEM;
}
@@ -94,7 +94,8 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd)
if (dev->kfd->shared_resources.enable_mes && !!pdd->proc_ctx_gpu_addr &&
down_read_trylock(&dev->adev->reset_domain->sem)) {
amdgpu_mes_flush_shader_debugger(dev->adev,
- pdd->proc_ctx_gpu_addr);
+ pdd->proc_ctx_gpu_addr,
+ ffs(pdd->dev->xcc_mask) - 1);
up_read(&dev->adev->reset_domain->sem);
}
pdd->already_dequeued = true;
@@ -209,8 +210,8 @@ static void pqm_clean_queue_resource(struct process_queue_manager *pqm,
}
if (dev->kfd->shared_resources.enable_mes) {
- amdgpu_amdkfd_free_gtt_mem(dev->adev, &pqn->q->gang_ctx_bo);
- amdgpu_amdkfd_free_gtt_mem(dev->adev, (void **)&pqn->q->wptr_bo_gart);
+ amdgpu_amdkfd_free_kernel_mem(dev->adev, &pqn->q->gang_ctx_bo);
+ amdgpu_amdkfd_free_kernel_mem(dev->adev, (void **)&pqn->q->wptr_bo_gart);
}
}
@@ -264,8 +265,9 @@ static int init_user_queue(struct process_queue_manager *pqm,
(*q)->process = pqm->process;
if (dev->kfd->shared_resources.enable_mes) {
- retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev,
+ retval = amdgpu_amdkfd_alloc_kernel_mem(dev->adev,
AMDGPU_MES_GANG_CTX_SIZE,
+ AMDGPU_GEM_DOMAIN_GTT,
&(*q)->gang_ctx_bo,
&(*q)->gang_ctx_gpu_addr,
&(*q)->gang_ctx_cpu_ptr,
@@ -279,20 +281,17 @@ static int init_user_queue(struct process_queue_manager *pqm,
/* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work
* on unmapped queues for usermode queue oversubscription (no aggregated doorbell)
*/
- if (((dev->adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK)
- >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) {
- if (dev->adev != amdgpu_ttm_adev(q_properties->wptr_bo->tbo.bdev)) {
- pr_err("Queue memory allocated to wrong device\n");
- retval = -EINVAL;
- goto free_gang_ctx_bo;
- }
+ if (dev->adev != amdgpu_ttm_adev(q_properties->wptr_bo->tbo.bdev)) {
+ pr_err("Queue memory allocated to wrong device\n");
+ retval = -EINVAL;
+ goto free_gang_ctx_bo;
+ }
- retval = amdgpu_amdkfd_map_gtt_bo_to_gart(q_properties->wptr_bo,
- &(*q)->wptr_bo_gart);
- if (retval) {
- pr_err("Failed to map wptr bo to GART\n");
- goto free_gang_ctx_bo;
- }
+ retval = amdgpu_amdkfd_map_gtt_bo_to_gart(q_properties->wptr_bo,
+ &(*q)->wptr_bo_gart);
+ if (retval) {
+ pr_err("Failed to map wptr bo to GART\n");
+ goto free_gang_ctx_bo;
}
}
@@ -300,7 +299,7 @@ static int init_user_queue(struct process_queue_manager *pqm,
return 0;
free_gang_ctx_bo:
- amdgpu_amdkfd_free_gtt_mem(dev->adev, &(*q)->gang_ctx_bo);
+ amdgpu_amdkfd_free_kernel_mem(dev->adev, &(*q)->gang_ctx_bo);
cleanup:
uninit_queue(*q);
*q = NULL;
@@ -348,7 +347,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
* If we are just about to create DIQ, the is_debug flag is not set yet
* Hence we also check the type as well
*/
- if ((pdd->qpd.is_debug) || (type == KFD_QUEUE_TYPE_DIQ))
+ if (pdd->qpd.is_debug)
max_queues = dev->kfd->device_info.max_no_of_hqd/2;
if (pdd->qpd.queue_count >= max_queues)
@@ -363,11 +362,28 @@ int pqm_create_queue(struct process_queue_manager *pqm,
if (retval != 0)
return retval;
+ /* Register process if this is the first queue */
if (list_empty(&pdd->qpd.queues_list) &&
list_empty(&pdd->qpd.priv_queue_list))
dev->dqm->ops.register_process(dev->dqm, &pdd->qpd);
- pqn = kzalloc(sizeof(*pqn), GFP_KERNEL);
+ /* Allocate proc_ctx_bo only if MES is enabled and this is the first queue */
+ if (!pdd->proc_ctx_cpu_ptr && dev->kfd->shared_resources.enable_mes) {
+ retval = amdgpu_amdkfd_alloc_kernel_mem(dev->adev,
+ AMDGPU_MES_PROC_CTX_SIZE,
+ AMDGPU_GEM_DOMAIN_GTT,
+ &pdd->proc_ctx_bo,
+ &pdd->proc_ctx_gpu_addr,
+ &pdd->proc_ctx_cpu_ptr,
+ false);
+ if (retval) {
+ dev_err(dev->adev->dev, "failed to allocate process context bo\n");
+ return retval;
+ }
+ memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
+ }
+
+ pqn = kzalloc_obj(*pqn);
if (!pqn) {
retval = -ENOMEM;
goto err_allocate_pqn;
@@ -413,30 +429,21 @@ int pqm_create_queue(struct process_queue_manager *pqm,
restore_mqd, restore_ctl_stack);
print_queue(q);
break;
- case KFD_QUEUE_TYPE_DIQ:
- kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_DIQ);
- if (!kq) {
- retval = -ENOMEM;
- goto err_create_queue;
- }
- kq->queue->properties.queue_id = *qid;
- pqn->kq = kq;
- pqn->q = NULL;
- retval = kfd_process_drain_interrupts(pdd);
- if (retval)
- break;
-
- retval = dev->dqm->ops.create_kernel_queue(dev->dqm,
- kq, &pdd->qpd);
- break;
default:
WARN(1, "Invalid queue type %d", type);
retval = -EINVAL;
}
if (retval != 0) {
- pr_err("Pasid 0x%x DQM create queue type %d failed. ret %d\n",
- pqm->process->pasid, type, retval);
+ if ((type == KFD_QUEUE_TYPE_SDMA ||
+ type == KFD_QUEUE_TYPE_SDMA_XGMI ||
+ type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) &&
+ retval == -ENOMEM)
+ pr_warn("process pid %d DQM create queue type %d failed. ret %d\n",
+ pqm->process->lead_thread->pid, type, retval);
+ else
+ pr_err("process pid %d DQM create queue type %d failed. ret %d\n",
+ pqm->process->lead_thread->pid, type, retval);
goto err_create_queue;
}
@@ -530,9 +537,9 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
if (retval) {
pr_err("Pasid 0x%x destroy queue %d failed, ret %d\n",
- pqm->process->pasid,
+ pdd->pasid,
pqn->q->properties.queue_id, retval);
- if (retval != -ETIME)
+ if (retval != -ETIME && retval != -EIO)
goto err_destroy_queue;
}
kfd_procfs_del_queue(pqn->q);
@@ -583,9 +590,11 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm,
return err;
if (kfd_queue_buffer_get(vm, (void *)p->queue_address, &p->ring_bo,
- p->queue_size)) {
+ p->queue_size +
+ pqn->q->properties.metadata_queue_size)) {
pr_debug("ring buf 0x%llx size 0x%llx not mapped on GPU\n",
p->queue_address, p->queue_size);
+ amdgpu_bo_unreserve(vm->root.bo);
return -EFAULT;
}
@@ -652,19 +661,6 @@ int pqm_update_mqd(struct process_queue_manager *pqm,
return 0;
}
-struct kernel_queue *pqm_get_kernel_queue(
- struct process_queue_manager *pqm,
- unsigned int qid)
-{
- struct process_queue_node *pqn;
-
- pqn = get_queue_by_qid(pqm, qid);
- if (pqn && pqn->kq)
- return pqn->kq;
-
- return NULL;
-}
-
struct queue *pqm_get_user_queue(struct process_queue_manager *pqm,
unsigned int qid)
{
@@ -907,7 +903,10 @@ static int criu_checkpoint_queues_device(struct kfd_process_device *pdd,
q_data = (struct kfd_criu_queue_priv_data *)q_private_data;
- /* data stored in this order: priv_data, mqd, ctl_stack */
+ /*
+ * data stored in this order:
+ * priv_data, mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0], ctl_stack[xcc1]...
+ */
q_data->mqd_size = mqd_size;
q_data->ctl_stack_size = ctl_stack_size;
@@ -956,7 +955,7 @@ int kfd_criu_checkpoint_queues(struct kfd_process *p,
}
static void set_queue_properties_from_criu(struct queue_properties *qp,
- struct kfd_criu_queue_priv_data *q_data)
+ struct kfd_criu_queue_priv_data *q_data, uint32_t num_xcc)
{
qp->is_interop = false;
qp->queue_percent = q_data->q_percent;
@@ -969,7 +968,11 @@ static void set_queue_properties_from_criu(struct queue_properties *qp,
qp->eop_ring_buffer_size = q_data->eop_ring_buffer_size;
qp->ctx_save_restore_area_address = q_data->ctx_save_restore_area_address;
qp->ctx_save_restore_area_size = q_data->ctx_save_restore_area_size;
- qp->ctl_stack_size = q_data->ctl_stack_size;
+ if (q_data->type == KFD_QUEUE_TYPE_COMPUTE)
+ qp->ctl_stack_size = q_data->ctl_stack_size / num_xcc;
+ else
+ qp->ctl_stack_size = q_data->ctl_stack_size;
+
qp->type = q_data->type;
qp->format = q_data->format;
}
@@ -990,7 +993,7 @@ int kfd_criu_restore_queue(struct kfd_process *p,
if (*priv_data_offset + sizeof(*q_data) > max_priv_data_size)
return -EINVAL;
- q_data = kmalloc(sizeof(*q_data), GFP_KERNEL);
+ q_data = kmalloc_obj(*q_data);
if (!q_data)
return -ENOMEM;
@@ -1029,12 +1032,15 @@ int kfd_criu_restore_queue(struct kfd_process *p,
goto exit;
}
- /* data stored in this order: mqd, ctl_stack */
+ /*
+ * data stored in this order:
+ * mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0], ctl_stack[xcc1]...
+ */
mqd = q_extra_data;
ctl_stack = mqd + q_data->mqd_size;
memset(&qp, 0, sizeof(qp));
- set_queue_properties_from_criu(&qp, q_data);
+ set_queue_properties_from_criu(&qp, q_data, NUM_XCC(pdd->dev->adev->gfx.xcc_mask));
print_queue_properties(&qp);
@@ -1065,6 +1071,7 @@ int pqm_get_queue_checkpoint_info(struct process_queue_manager *pqm,
uint32_t *ctl_stack_size)
{
struct process_queue_node *pqn;
+ int ret;
pqn = get_queue_by_qid(pqm, qid);
if (!pqn) {
@@ -1077,9 +1084,14 @@ int pqm_get_queue_checkpoint_info(struct process_queue_manager *pqm,
return -EOPNOTSUPP;
}
- pqn->q->device->dqm->ops.get_queue_checkpoint_info(pqn->q->device->dqm,
+ ret = pqn->q->device->dqm->ops.get_queue_checkpoint_info(pqn->q->device->dqm,
pqn->q, mqd_size,
ctl_stack_size);
+ if (ret) {
+ pr_debug("amdkfd: Overflow while computing stack size for queue %d\n", qid);
+ return ret;
+ }
+
return 0;
}
@@ -1114,32 +1126,13 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data)
break;
default:
seq_printf(m,
- " Bad user queue type %d on device %x\n",
+ " Queue node with bad user queue type %d on device %x\n",
q->properties.type, q->device->id);
continue;
}
mqd_mgr = q->device->dqm->mqd_mgrs[mqd_type];
size = mqd_mgr->mqd_stride(mqd_mgr,
&q->properties);
- } else if (pqn->kq) {
- q = pqn->kq->queue;
- mqd_mgr = pqn->kq->mqd_mgr;
- switch (q->properties.type) {
- case KFD_QUEUE_TYPE_DIQ:
- seq_printf(m, " DIQ on device %x\n",
- pqn->kq->dev->id);
- break;
- default:
- seq_printf(m,
- " Bad kernel queue type %d on device %x\n",
- q->properties.type,
- pqn->kq->dev->id);
- continue;
- }
- } else {
- seq_printf(m,
- " Weird: Queue node with neither kernel nor user queue\n");
- continue;
}
for (xcc = 0; xcc < num_xccs; xcc++) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index 24396a2c77bd..28354a4e5dd5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -70,7 +70,7 @@ int init_queue(struct queue **q, const struct queue_properties *properties)
{
struct queue *tmp_q;
- tmp_q = kzalloc(sizeof(*tmp_q), GFP_KERNEL);
+ tmp_q = kzalloc_obj(*tmp_q);
if (!tmp_q)
return -ENOMEM;
@@ -233,6 +233,7 @@ void kfd_queue_buffer_put(struct amdgpu_bo **bo)
int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
{
struct kfd_topology_device *topo_dev;
+ u64 expected_queue_size;
struct amdgpu_vm *vm;
u32 total_cwsr_size;
int err;
@@ -241,6 +242,18 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope
if (!topo_dev)
return -EINVAL;
+ /* AQL queues on GFX7 and GFX8 appear twice their actual size */
+ if (properties->type == KFD_QUEUE_TYPE_COMPUTE &&
+ properties->format == KFD_QUEUE_FORMAT_AQL &&
+ topo_dev->node_props.gfx_target_version >= 70000 &&
+ topo_dev->node_props.gfx_target_version < 90000)
+ /* metadata_queue_size not supported on GFX7/GFX8 */
+ expected_queue_size =
+ PAGE_ALIGN(properties->queue_size / 2);
+ else
+ expected_queue_size =
+ PAGE_ALIGN(properties->queue_size + properties->metadata_queue_size);
+
vm = drm_priv_to_vm(pdd->drm_priv);
err = amdgpu_bo_reserve(vm->root.bo, false);
if (err)
@@ -255,7 +268,7 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope
goto out_err_unreserve;
err = kfd_queue_buffer_get(vm, (void *)properties->queue_address,
- &properties->ring_bo, properties->queue_size);
+ &properties->ring_bo, expected_queue_size);
if (err)
goto out_err_unreserve;
@@ -265,8 +278,8 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope
/* EOP buffer is not required for all ASICs */
if (properties->eop_ring_buffer_address) {
- if (properties->eop_ring_buffer_size != topo_dev->node_props.eop_buffer_size) {
- pr_debug("queue eop bo size 0x%x not equal to node eop buf size 0x%x\n",
+ if (properties->eop_ring_buffer_size < topo_dev->node_props.eop_buffer_size) {
+ pr_debug("queue eop bo size 0x%x is less than node eop buf size 0x%x\n",
properties->eop_ring_buffer_size,
topo_dev->node_props.eop_buffer_size);
err = -EINVAL;
@@ -274,7 +287,7 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope
}
err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address,
&properties->eop_buf_bo,
- properties->eop_ring_buffer_size);
+ ALIGN(properties->eop_ring_buffer_size, PAGE_SIZE));
if (err)
goto out_err_unreserve;
}
@@ -287,16 +300,16 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope
goto out_err_unreserve;
}
- if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) {
- pr_debug("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n",
+ if (properties->ctx_save_restore_area_size < topo_dev->node_props.cwsr_size) {
+ pr_debug("queue cwsr size 0x%x not sufficient for node cwsr size 0x%x\n",
properties->ctx_save_restore_area_size,
topo_dev->node_props.cwsr_size);
err = -EINVAL;
goto out_err_unreserve;
}
- total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size)
- * NUM_XCC(pdd->dev->xcc_mask);
+ total_cwsr_size = (properties->ctx_save_restore_area_size +
+ topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask);
total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);
err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address,
@@ -342,8 +355,8 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope
topo_dev = kfd_topology_device_by_id(pdd->dev->id);
if (!topo_dev)
return -EINVAL;
- total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size)
- * NUM_XCC(pdd->dev->xcc_mask);
+ total_cwsr_size = (properties->ctx_save_restore_area_size +
+ topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask);
total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);
kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, total_cwsr_size);
@@ -382,34 +395,82 @@ int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd,
return 0;
}
-#define SGPR_SIZE_PER_CU 0x4000
-#define LDS_SIZE_PER_CU 0x10000
-#define HWREG_SIZE_PER_CU 0x1000
#define DEBUGGER_BYTES_ALIGN 64
#define DEBUGGER_BYTES_PER_WAVE 32
+static u32 kfd_get_sgpr_size_per_cu(u32 gfxv)
+{
+ u32 sgpr_size = 0x4000;
+
+ if (gfxv == 120500 ||
+ gfxv == 120501)
+ sgpr_size = 0x8000;
+
+ return sgpr_size;
+}
+
static u32 kfd_get_vgpr_size_per_cu(u32 gfxv)
{
u32 vgpr_size = 0x40000;
- if ((gfxv / 100 * 100) == 90400 || /* GFX_VERSION_AQUA_VANJARAM */
+ if (gfxv == 90402 || /* GFX_VERSION_AQUA_VANJARAM */
gfxv == 90010 || /* GFX_VERSION_ALDEBARAN */
gfxv == 90008 || /* GFX_VERSION_ARCTURUS */
gfxv == 90500)
vgpr_size = 0x80000;
else if (gfxv == 110000 || /* GFX_VERSION_PLUM_BONITO */
gfxv == 110001 || /* GFX_VERSION_WHEAT_NAS */
+ gfxv == 110501 || /* GFX_VERSION_GFX1151 */
gfxv == 120000 || /* GFX_VERSION_GFX1200 */
gfxv == 120001) /* GFX_VERSION_GFX1201 */
vgpr_size = 0x60000;
+ else if (gfxv == 120500 || /* GFX_VERSION_GFX1250 */
+ gfxv == 120501) /* GFX_VERSION_GFX1251 */
+ vgpr_size = 0x80000;
return vgpr_size;
}
+static u32 kfd_get_hwreg_size_per_cu(u32 gfxv)
+{
+ u32 hwreg_size = 0x1000;
+
+ if (gfxv == 120500 || gfxv == 120501)
+ hwreg_size = 0x8000;
+
+ return hwreg_size;
+}
+
+static u32 kfd_get_lds_size_per_cu(u32 gfxv, struct kfd_node_properties *props)
+{
+ u32 lds_size = 0x10000;
+
+ if (gfxv == 90500 || gfxv == 120500 || gfxv == 120501)
+ lds_size = props->lds_size_in_kb << 10;
+
+ return lds_size;
+}
+
+static u32 get_num_waves(struct kfd_node_properties *props, u32 gfxv, u32 cu_num)
+{
+ u32 wave_num = 0;
+
+ if (gfxv < 100100)
+ wave_num = min(cu_num * 40,
+ props->array_count / props->simd_arrays_per_engine * 512);
+ else if (gfxv < 120500)
+ wave_num = cu_num * 32;
+ else if (gfxv <= 120501)
+ wave_num = cu_num * 64;
+
+ WARN_ON(wave_num == 0);
+
+ return wave_num;
+}
+
#define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props) \
- (kfd_get_vgpr_size_per_cu(gfxv) + SGPR_SIZE_PER_CU +\
- (((gfxv) == 90500) ? (props->lds_size_in_kb << 10) : LDS_SIZE_PER_CU) +\
- HWREG_SIZE_PER_CU)
+ (kfd_get_vgpr_size_per_cu(gfxv) + kfd_get_sgpr_size_per_cu(gfxv) +\
+ kfd_get_lds_size_per_cu(gfxv, props) + kfd_get_hwreg_size_per_cu(gfxv))
#define CNTL_STACK_BYTES_PER_WAVE(gfxv) \
((gfxv) >= 100100 ? 12 : 8) /* GFX_VERSION_NAVI10*/
@@ -429,14 +490,13 @@ void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev)
return;
cu_num = props->simd_count / props->simd_per_cu / NUM_XCC(dev->gpu->xcc_mask);
- wave_num = (gfxv < 100100) ? /* GFX_VERSION_NAVI10 */
- min(cu_num * 40, props->array_count / props->simd_arrays_per_engine * 512)
- : cu_num * 32;
+ wave_num = get_num_waves(props, gfxv, cu_num);
- wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props), PAGE_SIZE);
+ wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props),
+ AMDGPU_GPU_PAGE_SIZE);
ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8;
ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + ctl_stack_size,
- PAGE_SIZE);
+ AMDGPU_GPU_PAGE_SIZE);
if ((gfxv / 10000 * 10000) == 100000) {
/* HW design limits control stack size to 0x7000.
@@ -448,11 +508,11 @@ void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev)
props->ctl_stack_size = ctl_stack_size;
props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN);
- props->cwsr_size = ctl_stack_size + wg_data_size;
+ props->cwsr_size = ALIGN(ctl_stack_size + wg_data_size, PAGE_SIZE);
if (gfxv == 80002) /* GFX_VERSION_TONGA */
props->eop_buffer_size = 0x8000;
- else if ((gfxv / 100 * 100) == 90400) /* GFX_VERSION_AQUA_VANJARAM */
+ else if (gfxv == 90402) /* GFX_VERSION_AQUA_VANJARAM */
props->eop_buffer_size = 4096;
else if (gfxv >= 80000)
props->eop_buffer_size = 4096;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 9b8169761ec5..15975c23a88e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -163,10 +163,9 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
static bool kfd_smi_ev_enabled(pid_t pid, struct kfd_smi_client *client,
unsigned int event)
{
- uint64_t all = KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS);
uint64_t events = READ_ONCE(client->events);
- if (pid && client->pid != pid && !(client->suser && (events & all)))
+ if (pid && client->pid != pid && !client->suser)
return false;
return events & KFD_SMI_EVENT_MASK_FROM_INDEX(event);
@@ -254,9 +253,9 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid);
if (task_info) {
/* Report VM faults from user applications, not retry from kernel */
- if (task_info->pid)
+ if (task_info->task.pid)
kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT(
- task_info->pid, task_info->task_name));
+ task_info->task.pid, task_info->task.comm));
amdgpu_vm_put_task_info(task_info);
}
}
@@ -313,7 +312,7 @@ void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid)
{
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_RESTORE,
KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), pid,
- node->id, 0));
+ node->id, '0'));
}
void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
@@ -345,12 +344,33 @@ void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, pid_t pid,
pid, address, last - address + 1, node->id, trigger));
}
+void kfd_smi_event_process(struct kfd_process_device *pdd, bool start)
+{
+ struct amdgpu_task_info *task_info;
+ struct amdgpu_vm *avm;
+
+ if (!pdd->drm_priv)
+ return;
+
+ avm = drm_priv_to_vm(pdd->drm_priv);
+ task_info = amdgpu_vm_get_task_info_vm(avm);
+
+ if (task_info) {
+ kfd_smi_event_add(0, pdd->dev,
+ start ? KFD_SMI_EVENT_PROCESS_START :
+ KFD_SMI_EVENT_PROCESS_END,
+ KFD_EVENT_FMT_PROCESS(task_info->task.pid,
+ task_info->task.comm));
+ amdgpu_vm_put_task_info(task_info);
+ }
+}
+
int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd)
{
struct kfd_smi_client *client;
int ret;
- client = kzalloc(sizeof(struct kfd_smi_client), GFP_KERNEL);
+ client = kzalloc_obj(struct kfd_smi_client);
if (!client)
return -ENOMEM;
INIT_LIST_HEAD(&client->list);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index 503bff13d815..bb4d72b57387 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -53,4 +53,5 @@ void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm);
void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, pid_t pid,
unsigned long address, unsigned long last,
uint32_t trigger);
+void kfd_smi_event_process(struct kfd_process_device *pdd, bool start);
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index bd3e20d981e0..35ec67d9739b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -33,6 +33,7 @@
#include "amdgpu_hmm.h"
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
+#include "amdgpu_reset.h"
#include "kfd_priv.h"
#include "kfd_svm.h"
#include "kfd_migrate.h"
@@ -167,7 +168,7 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
int i, r;
if (!addr) {
- addr = kvcalloc(prange->npages, sizeof(*addr), GFP_KERNEL);
+ addr = kvzalloc_objs(*addr, prange->npages);
if (!addr)
return -ENOMEM;
prange->dma_addr[gpuidx] = addr;
@@ -328,7 +329,7 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
struct svm_range *prange;
struct kfd_process *p;
- prange = kzalloc(sizeof(*prange), GFP_KERNEL);
+ prange = kzalloc_obj(*prange);
if (!prange)
return NULL;
@@ -538,7 +539,7 @@ static struct svm_range_bo *svm_range_bo_new(void)
{
struct svm_range_bo *svm_bo;
- svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL);
+ svm_bo = kzalloc_obj(*svm_bo);
if (!svm_bo)
return NULL;
@@ -563,7 +564,8 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
int r;
p = container_of(prange->svms, struct kfd_process, svms);
- pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms,
+ pr_debug("process pid: %d svms 0x%p [0x%lx 0x%lx]\n",
+ p->lead_thread->pid, prange->svms,
prange->start, prange->last);
if (svm_range_validate_svm_bo(node, prange))
@@ -584,7 +586,7 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
svm_bo->eviction_fence =
amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
mm,
- svm_bo);
+ svm_bo, p->context_id);
mmput(mm);
INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker);
svm_bo->evicting = 0;
@@ -626,9 +628,8 @@ svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
}
}
- r = dma_resv_reserve_fences(bo->tbo.base.resv, 1);
+ r = dma_resv_reserve_fences(bo->tbo.base.resv, TTM_NUM_MOVE_FENCES);
if (r) {
- pr_debug("failed %d to reserve bo\n", r);
amdgpu_bo_unreserve(bo);
goto reserve_bo_failed;
}
@@ -1143,40 +1144,57 @@ static int
svm_range_split_tail(struct svm_range *prange, uint64_t new_last,
struct list_head *insert_list, struct list_head *remap_list)
{
+ unsigned long last_align_down = ALIGN_DOWN(prange->last, 512);
+ unsigned long start_align = ALIGN(prange->start, 512);
+ bool huge_page_mapping = last_align_down > start_align;
struct svm_range *tail = NULL;
- int r = svm_range_split(prange, prange->start, new_last, &tail);
+ int r;
- if (!r) {
- list_add(&tail->list, insert_list);
- if (!IS_ALIGNED(new_last + 1, 1UL << prange->granularity))
- list_add(&tail->update_list, remap_list);
- }
- return r;
+ r = svm_range_split(prange, prange->start, new_last, &tail);
+
+ if (r)
+ return r;
+
+ list_add(&tail->list, insert_list);
+
+ if (huge_page_mapping && tail->start > start_align &&
+ tail->start < last_align_down && (!IS_ALIGNED(tail->start, 512)))
+ list_add(&tail->update_list, remap_list);
+
+ return 0;
}
static int
svm_range_split_head(struct svm_range *prange, uint64_t new_start,
struct list_head *insert_list, struct list_head *remap_list)
{
+ unsigned long last_align_down = ALIGN_DOWN(prange->last, 512);
+ unsigned long start_align = ALIGN(prange->start, 512);
+ bool huge_page_mapping = last_align_down > start_align;
struct svm_range *head = NULL;
- int r = svm_range_split(prange, new_start, prange->last, &head);
+ int r;
- if (!r) {
- list_add(&head->list, insert_list);
- if (!IS_ALIGNED(new_start, 1UL << prange->granularity))
- list_add(&head->update_list, remap_list);
- }
- return r;
+ r = svm_range_split(prange, new_start, prange->last, &head);
+
+ if (r)
+ return r;
+
+ list_add(&head->list, insert_list);
+
+ if (huge_page_mapping && head->last + 1 > start_align &&
+ head->last + 1 < last_align_down && (!IS_ALIGNED(head->last, 512)))
+ list_add(&head->update_list, remap_list);
+
+ return 0;
}
static void
-svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
- struct svm_range *pchild, enum svm_work_list_ops op)
+svm_range_add_child(struct svm_range *prange, struct svm_range *pchild, enum svm_work_list_ops op)
{
pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
pchild, pchild->start, pchild->last, prange, op);
- pchild->work_item.mm = mm;
+ pchild->work_item.mm = NULL;
pchild->work_item.op = op;
list_add_tail(&pchild->child_list, &prange->child_list);
}
@@ -1189,7 +1207,7 @@ svm_nodes_in_same_hive(struct kfd_node *node_a, struct kfd_node *node_b)
}
static uint64_t
-svm_range_get_pte_flags(struct kfd_node *node,
+svm_range_get_pte_flags(struct kfd_node *node, struct amdgpu_vm *vm,
struct svm_range *prange, int domain)
{
struct kfd_node *bo_node;
@@ -1200,7 +1218,8 @@ svm_range_get_pte_flags(struct kfd_node *node,
bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENT);
bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT;
- unsigned int mtype_local;
+ unsigned int mtype_local, mtype_remote;
+ bool is_aid_a1, is_local;
if (domain == SVM_RANGE_VRAM_DOMAIN)
bo_node = prange->svm_bo->node;
@@ -1244,8 +1263,7 @@ svm_range_get_pte_flags(struct kfd_node *node,
case IP_VERSION(9, 4, 4):
case IP_VERSION(9, 5, 0):
if (ext_coherent)
- mtype_local = (gc_ip_version < IP_VERSION(9, 5, 0) && !node->adev->rev_id) ?
- AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_CC;
+ mtype_local = AMDGPU_VM_MTYPE_CC;
else
mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC :
amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
@@ -1278,7 +1296,7 @@ svm_range_get_pte_flags(struct kfd_node *node,
mapping_flags |= ext_coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
/* system memory accessed by the dGPU */
} else {
- if (gc_ip_version < IP_VERSION(9, 5, 0))
+ if (gc_ip_version < IP_VERSION(9, 5, 0) || ext_coherent)
mapping_flags |= AMDGPU_VM_MTYPE_UC;
else
mapping_flags |= AMDGPU_VM_MTYPE_NC;
@@ -1286,12 +1304,26 @@ svm_range_get_pte_flags(struct kfd_node *node,
break;
case IP_VERSION(12, 0, 0):
case IP_VERSION(12, 0, 1):
- if (domain == SVM_RANGE_VRAM_DOMAIN) {
- if (bo_node != node)
- mapping_flags |= AMDGPU_VM_MTYPE_NC;
+ mapping_flags |= AMDGPU_VM_MTYPE_NC;
+ break;
+ case IP_VERSION(12, 1, 0):
+ is_aid_a1 = (node->adev->rev_id & 0x10);
+ is_local = (domain == SVM_RANGE_VRAM_DOMAIN) &&
+ (bo_node->adev == node->adev);
+
+ mtype_local = amdgpu_mtype_local == 0 ? AMDGPU_VM_MTYPE_RW :
+ amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC :
+ is_aid_a1 ? AMDGPU_VM_MTYPE_RW : AMDGPU_VM_MTYPE_NC;
+ mtype_remote = is_aid_a1 ? AMDGPU_VM_MTYPE_NC : AMDGPU_VM_MTYPE_UC;
+ snoop = true;
+
+ if (is_local) /* local HBM */ {
+ mapping_flags |= mtype_local;
+ } else if (ext_coherent) {
+ mapping_flags |= AMDGPU_VM_MTYPE_UC;
} else {
- mapping_flags |= coherent ?
- AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
+ /* system memory or remote VRAM */
+ mapping_flags |= mtype_remote;
}
break;
default:
@@ -1299,10 +1331,6 @@ svm_range_get_pte_flags(struct kfd_node *node,
AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
}
- mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;
-
- if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO)
- mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE;
if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC)
mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
@@ -1312,7 +1340,15 @@ svm_range_get_pte_flags(struct kfd_node *node,
if (gc_ip_version >= IP_VERSION(12, 0, 0))
pte_flags |= AMDGPU_PTE_IS_PTE;
- pte_flags |= amdgpu_gem_va_map_flags(node->adev, mapping_flags);
+ amdgpu_gmc_get_vm_pte(node->adev, vm, NULL, mapping_flags, &pte_flags);
+ pte_flags |= AMDGPU_PTE_READABLE;
+ if (!(flags & KFD_IOCTL_SVM_FLAG_GPU_RO))
+ pte_flags |= AMDGPU_PTE_WRITEABLE;
+
+ if ((gc_ip_version == IP_VERSION(12, 1, 0)) &&
+ node->adev->have_atomics_support)
+ pte_flags |= AMDGPU_PTE_BUS_ATOMICS;
+
return pte_flags;
}
@@ -1321,12 +1357,23 @@ svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
uint64_t start, uint64_t last,
struct dma_fence **fence)
{
- uint64_t init_pte_value = 0;
+ uint64_t init_pte_value = adev->gmc.init_pte_flags;
+ uint64_t gpu_start, gpu_end;
+
+ /* Convert CPU page range to GPU page range */
+ gpu_start = start * AMDGPU_GPU_PAGES_IN_CPU_PAGE;
+ gpu_end = (last + 1) * AMDGPU_GPU_PAGES_IN_CPU_PAGE - 1;
+
+ pr_debug("CPU[0x%llx 0x%llx] -> GPU[0x%llx 0x%llx]\n", start, last,
+ gpu_start, gpu_end);
- pr_debug("[0x%llx 0x%llx]\n", start, last);
+ if (!amdgpu_vm_ready(vm)) {
+ pr_debug("VM not ready, canceling unmap\n");
+ return -EINVAL;
+ }
- return amdgpu_vm_update_range(adev, vm, false, true, true, false, NULL, start,
- last, init_pte_value, 0, 0, NULL, NULL,
+ return amdgpu_vm_update_range(adev, vm, false, true, true, false, NULL, gpu_start,
+ gpu_end, init_pte_value, 0, 0, NULL, NULL,
fence);
}
@@ -1334,7 +1381,6 @@ static int
svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
unsigned long last, uint32_t trigger)
{
- DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
struct kfd_process_device *pdd;
struct dma_fence *fence = NULL;
struct kfd_process *p;
@@ -1352,11 +1398,9 @@ svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
prange->mapped_to_gpu = false;
}
- bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
- MAX_GPU_INSTANCE);
p = container_of(prange->svms, struct kfd_process, svms);
- for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
+ for_each_or_bit(gpuidx, prange->bitmap_access, prange->bitmap_aip, MAX_GPU_INSTANCE) {
pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
pdd = kfd_process_device_from_gpuidx(p, gpuidx);
if (!pdd) {
@@ -1380,7 +1424,7 @@ svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
if (r)
break;
}
- kfd_flush_tlb(pdd, TLB_FLUSH_HEAVYWEIGHT);
+ kfd_flush_tlb(pdd);
}
return r;
@@ -1405,7 +1449,15 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange,
pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms,
last_start, last_start + npages - 1, readonly);
+ if (!amdgpu_vm_ready(vm)) {
+ pr_debug("VM not ready, canceling map\n");
+ return -EINVAL;
+ }
+
for (i = offset; i < offset + npages; i++) {
+ uint64_t gpu_start;
+ uint64_t gpu_end;
+
last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN;
dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN;
@@ -1419,21 +1471,26 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange,
pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n",
last_start, prange->start + i, last_domain ? "GPU" : "CPU");
- pte_flags = svm_range_get_pte_flags(pdd->dev, prange, last_domain);
+ pte_flags = svm_range_get_pte_flags(pdd->dev, vm, prange, last_domain);
if (readonly)
pte_flags &= ~AMDGPU_PTE_WRITEABLE;
- pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n",
- prange->svms, last_start, prange->start + i,
- (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0,
- pte_flags);
/* For dGPU mode, we use same vm_manager to allocate VRAM for
* different memory partition based on fpfn/lpfn, we should use
* same vm_manager.vram_base_offset regardless memory partition.
*/
+ gpu_start = last_start * AMDGPU_GPU_PAGES_IN_CPU_PAGE;
+ gpu_end = (prange->start + i + 1) * AMDGPU_GPU_PAGES_IN_CPU_PAGE - 1;
+
+ pr_debug("svms 0x%p map CPU[0x%lx 0x%llx] GPU[0x%llx 0x%llx] vram %d PTE 0x%llx\n",
+ prange->svms, last_start, prange->start + i,
+ gpu_start, gpu_end,
+ (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0,
+ pte_flags);
+
r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, true,
- NULL, last_start, prange->start + i,
+ NULL, gpu_start, gpu_end,
pte_flags,
(last_start - prange->start) << PAGE_SHIFT,
bo_adev ? bo_adev->vm_manager.vram_base_offset : 0,
@@ -1514,7 +1571,7 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
}
}
- kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
+ kfd_flush_tlb(pdd);
}
return r;
@@ -1631,7 +1688,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
int32_t idx;
int r = 0;
- ctx = kzalloc(sizeof(struct svm_validate_context), GFP_KERNEL);
+ ctx = kzalloc_obj(struct svm_validate_context);
if (!ctx)
return -ENOMEM;
ctx->process = container_of(prange->svms, struct kfd_process, svms);
@@ -1706,7 +1763,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
start = map_start << PAGE_SHIFT;
end = (map_last + 1) << PAGE_SHIFT;
for (addr = start; !r && addr < end; ) {
- struct hmm_range *hmm_range = NULL;
+ struct amdgpu_hmm_range *range = NULL;
unsigned long map_start_vma;
unsigned long map_last_vma;
struct vm_area_struct *vma;
@@ -1721,10 +1778,36 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
next = min(vma->vm_end, end);
npages = (next - addr) >> PAGE_SHIFT;
+ /* HMM requires at least READ permissions. If provided with PROT_NONE,
+ * unmap the memory. If it's not already mapped, this is a no-op
+ * If PROT_WRITE is provided without READ, warn first then unmap
+ */
+ if (!(vma->vm_flags & VM_READ)) {
+ unsigned long e, s;
+
+ svm_range_lock(prange);
+ if (vma->vm_flags & VM_WRITE)
+ pr_debug("VM_WRITE without VM_READ is not supported");
+ s = max(start, prange->start);
+ e = min(end, prange->last);
+ if (e >= s)
+ r = svm_range_unmap_from_gpus(prange, s, e,
+ KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU);
+ svm_range_unlock(prange);
+ /* If unmap returns non-zero, we'll bail on the next for loop
+ * iteration, so just leave r and continue
+ */
+ addr = next;
+ continue;
+ }
+
WRITE_ONCE(p->svms.faulting_task, current);
- r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
- readonly, owner, NULL,
- &hmm_range);
+ range = amdgpu_hmm_range_alloc(NULL);
+ if (likely(range))
+ r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
+ readonly, owner, range);
+ else
+ r = -ENOMEM;
WRITE_ONCE(p->svms.faulting_task, NULL);
if (r)
pr_debug("failed %d to get svm range pages\n", r);
@@ -1735,7 +1818,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
if (!r) {
offset = (addr >> PAGE_SHIFT) - prange->start;
r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
- hmm_range->hmm_pfns);
+ range->hmm_range.hmm_pfns);
if (r)
pr_debug("failed %d to dma map range\n", r);
}
@@ -1743,14 +1826,17 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
svm_range_lock(prange);
/* Free backing memory of hmm_range if it was initialized
- * Overrride return value to TRY AGAIN only if prior returns
+ * Override return value to TRY AGAIN only if prior returns
* were successful
*/
- if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range) && !r) {
+ if (range && !amdgpu_hmm_range_valid(range) && !r) {
pr_debug("hmm update the range, need validate again\n");
r = -EAGAIN;
}
+ /* Free the hmm range */
+ amdgpu_hmm_range_free(range);
+
if (!r && !list_empty(&prange->child_list)) {
pr_debug("range split by unmap in parallel, validate again\n");
r = -EAGAIN;
@@ -2310,6 +2396,9 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
+ if (!down_read_trylock(&pdd->dev->adev->reset_domain->sem))
+ continue;
+
amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
pdd->dev->adev->irq.retry_cam_enabled ?
&pdd->dev->adev->irq.ih :
@@ -2319,6 +2408,7 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
&pdd->dev->adev->irq.ih_soft);
+ up_read(&pdd->dev->adev->reset_domain->sem);
pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
}
@@ -2400,15 +2490,17 @@ svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange,
prange->work_item.op != SVM_OP_UNMAP_RANGE)
prange->work_item.op = op;
} else {
- prange->work_item.op = op;
-
- /* Pairs with mmput in deferred_list_work */
- mmget(mm);
- prange->work_item.mm = mm;
- list_add_tail(&prange->deferred_list,
- &prange->svms->deferred_range_list);
- pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n",
- prange, prange->start, prange->last, op);
+ /* Pairs with mmput in deferred_list_work.
+ * If process is exiting and mm is gone, don't update mmu notifier.
+ */
+ if (mmget_not_zero(mm)) {
+ prange->work_item.mm = mm;
+ prange->work_item.op = op;
+ list_add_tail(&prange->deferred_list,
+ &prange->svms->deferred_range_list);
+ pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n",
+ prange, prange->start, prange->last, op);
+ }
}
spin_unlock(&svms->deferred_list_lock);
}
@@ -2422,8 +2514,7 @@ void schedule_deferred_list_work(struct svm_range_list *svms)
}
static void
-svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent,
- struct svm_range *prange, unsigned long start,
+svm_range_unmap_split(struct svm_range *parent, struct svm_range *prange, unsigned long start,
unsigned long last)
{
struct svm_range *head;
@@ -2444,12 +2535,12 @@ svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent,
svm_range_split(tail, last + 1, tail->last, &head);
if (head != prange && tail != prange) {
- svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
- svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
+ svm_range_add_child(parent, head, SVM_OP_UNMAP_RANGE);
+ svm_range_add_child(parent, tail, SVM_OP_ADD_RANGE);
} else if (tail != prange) {
- svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE);
+ svm_range_add_child(parent, tail, SVM_OP_UNMAP_RANGE);
} else if (head != prange) {
- svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
+ svm_range_add_child(parent, head, SVM_OP_UNMAP_RANGE);
} else if (parent != prange) {
prange->work_item.op = SVM_OP_UNMAP_RANGE;
}
@@ -2501,7 +2592,7 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
adev = pdd->dev->adev;
/* Check and drain ih1 ring if cam not available */
- if (adev->irq.ih1.ring_size) {
+ if (!adev->irq.retry_cam_enabled && adev->irq.ih1.ring_size) {
ih = &adev->irq.ih1;
checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
if (ih->rptr != checkpoint_wptr) {
@@ -2526,14 +2617,14 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
l = min(last, pchild->last);
if (l >= s)
svm_range_unmap_from_gpus(pchild, s, l, trigger);
- svm_range_unmap_split(mm, prange, pchild, start, last);
+ svm_range_unmap_split(prange, pchild, start, last);
mutex_unlock(&pchild->lock);
}
s = max(start, prange->start);
l = min(last, prange->last);
if (l >= s)
svm_range_unmap_from_gpus(prange, s, l, trigger);
- svm_range_unmap_split(mm, prange, prange, start, last);
+ svm_range_unmap_split(prange, prange, start, last);
if (unmap_parent)
svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
@@ -2576,8 +2667,6 @@ svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
if (range->event == MMU_NOTIFY_RELEASE)
return true;
- if (!mmget_not_zero(mni->mm))
- return true;
start = mni->interval_tree.start;
last = mni->interval_tree.last;
@@ -2604,7 +2693,6 @@ svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
}
svm_range_unlock(prange);
- mmput(mni->mm);
return true;
}
@@ -2691,7 +2779,7 @@ svm_range_best_restore_location(struct svm_range *prange,
return -1;
}
- if (node->adev->flags & AMD_IS_APU)
+ if (node->adev->apu_prefer_gtt)
return 0;
if (prange->preferred_loc == gpuid ||
@@ -2979,7 +3067,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
return -EFAULT;
}
- p = kfd_lookup_process_by_pasid(pasid);
+ p = kfd_lookup_process_by_pasid(pasid, NULL);
if (!p) {
pr_debug("kfd process not founded pasid 0x%x\n", pasid);
return 0;
@@ -3008,19 +3096,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
goto out;
}
- /* check if this page fault time stamp is before svms->checkpoint_ts */
- if (svms->checkpoint_ts[gpuidx] != 0) {
- if (amdgpu_ih_ts_after(ts, svms->checkpoint_ts[gpuidx])) {
- pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
- r = 0;
- goto out;
- } else
- /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts
- * to zero to avoid following ts wrap around give wrong comparing
- */
- svms->checkpoint_ts[gpuidx] = 0;
- }
-
if (!p->xnack_enabled) {
pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
r = -EFAULT;
@@ -3040,6 +3115,23 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
mmap_read_lock(mm);
retry_write_locked:
mutex_lock(&svms->lock);
+
+ /* check if this page fault time stamp is before svms->checkpoint_ts */
+ if (svms->checkpoint_ts[gpuidx] != 0) {
+ if (amdgpu_ih_ts_after_or_equal(ts, svms->checkpoint_ts[gpuidx])) {
+ pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
+ if (write_locked)
+ mmap_write_downgrade(mm);
+ r = -EAGAIN;
+ goto out_unlock_svms;
+ } else {
+ /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts
+ * to zero to avoid following ts wrap around give wrong comparing
+ */
+ svms->checkpoint_ts[gpuidx] = 0;
+ }
+ }
+
prange = svm_range_from_addr(svms, addr, NULL);
if (!prange) {
pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
@@ -3165,7 +3257,8 @@ out_unlock_svms:
mutex_unlock(&svms->lock);
mmap_read_unlock(mm);
- svm_range_count_fault(node, p, gpuidx);
+ if (r != -EAGAIN)
+ svm_range_count_fault(node, p, gpuidx);
mmput(mm);
out:
@@ -3242,7 +3335,8 @@ void svm_range_list_fini(struct kfd_process *p)
struct svm_range *prange;
struct svm_range *next;
- pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);
+ pr_debug("process pid %d svms 0x%p\n", p->lead_thread->pid,
+ &p->svms);
cancel_delayed_work_sync(&p->svms.restore_work);
@@ -3265,7 +3359,8 @@ void svm_range_list_fini(struct kfd_process *p)
mutex_destroy(&p->svms.lock);
- pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms);
+ pr_debug("process pid %d svms 0x%p done\n",
+ p->lead_thread->pid, &p->svms);
}
int svm_range_list_init(struct kfd_process *p)
@@ -3438,7 +3533,7 @@ svm_range_best_prefetch_location(struct svm_range *prange)
goto out;
}
- if (bo_node->adev->flags & AMD_IS_APU) {
+ if (bo_node->adev->apu_prefer_gtt) {
best_loc = 0;
goto out;
}
@@ -3628,8 +3723,8 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
bool flush_tlb;
int r, ret = 0;
- pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
- p->pasid, &p->svms, start, start + size - 1, size);
+ pr_debug("process pid %d svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
+ p->lead_thread->pid, &p->svms, start, start + size - 1, size);
r = svm_range_check_attr(p, nattr, attrs);
if (r)
@@ -3667,6 +3762,8 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping);
/* TODO: unmap ranges from GPU that lost access */
}
+ update_mapping |= !p->xnack_enabled && !list_empty(&remap_list);
+
list_for_each_entry_safe(prange, next, &remove_list, update_list) {
pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n",
prange->svms, prange, prange->start,
@@ -3737,8 +3834,8 @@ out_unlock_range:
out:
mutex_unlock(&process_info->lock);
- pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
- &p->svms, start, start + size - 1, r);
+ pr_debug("process pid %d svms 0x%p [0x%llx 0x%llx] done, r=%d\n",
+ p->lead_thread->pid, &p->svms, start, start + size - 1, r);
return ret ? ret : r;
}
@@ -4076,8 +4173,8 @@ exit:
return ret;
}
-int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
- uint64_t *svm_priv_data_size)
+void svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
+ uint64_t *svm_priv_data_size)
{
uint64_t total_size, accessibility_size, common_attr_size;
int nattr_common = 4, nattr_accessibility = 1;
@@ -4089,8 +4186,6 @@ int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
*svm_priv_data_size = 0;
svms = &p->svms;
- if (!svms)
- return -EINVAL;
mutex_lock(&svms->lock);
list_for_each_entry(prange, &svms->list, list) {
@@ -4132,7 +4227,6 @@ int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
pr_debug("num_svm_ranges %u total_priv_size %llu\n", *num_svm_ranges,
*svm_priv_data_size);
- return 0;
}
int kfd_criu_checkpoint_svm(struct kfd_process *p,
@@ -4149,8 +4243,6 @@ int kfd_criu_checkpoint_svm(struct kfd_process *p,
struct mm_struct *mm;
svms = &p->svms;
- if (!svms)
- return -EINVAL;
mm = get_task_mm(p->lead_thread);
if (!mm) {
@@ -4248,7 +4340,7 @@ svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
r = svm_range_get_attr(p, mm, start, size, nattrs, attrs);
break;
default:
- r = EINVAL;
+ r = -EINVAL;
break;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index bddd24f04669..a63dfc95b602 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -31,7 +31,6 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>
-#include <linux/hmm.h>
#include "amdgpu.h"
#include "kfd_priv.h"
@@ -184,8 +183,8 @@ void schedule_deferred_list_work(struct svm_range_list *svms);
void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr,
unsigned long offset, unsigned long npages);
void svm_range_dma_unmap(struct svm_range *prange);
-int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
- uint64_t *svm_priv_data_size);
+void svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
+ uint64_t *svm_priv_data_size);
int kfd_criu_checkpoint_svm(struct kfd_process *p,
uint8_t __user *user_priv_data,
uint64_t *priv_offset);
@@ -202,7 +201,7 @@ void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_s
* is initialized to not 0 when page migration register device memory.
*/
#define KFD_IS_SVM_API_SUPPORTED(adev) ((adev)->kfd.pgmap.type != 0 ||\
- ((adev)->flags & AMD_IS_APU))
+ ((adev)->apu_prefer_gtt))
void svm_range_bo_unref_async(struct svm_range_bo *svm_bo);
@@ -237,13 +236,12 @@ static inline int svm_range_schedule_evict_svm_bo(
return -EINVAL;
}
-static inline int svm_range_get_info(struct kfd_process *p,
- uint32_t *num_svm_ranges,
- uint64_t *svm_priv_data_size)
+static inline void svm_range_get_info(struct kfd_process *p,
+ uint32_t *num_svm_ranges,
+ uint64_t *svm_priv_data_size)
{
*num_svm_ranges = 0;
*svm_priv_data_size = 0;
- return 0;
}
static inline int kfd_criu_checkpoint_svm(struct kfd_process *p,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index ceb9fb475ef1..29dee26261ab 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -108,24 +108,6 @@ struct kfd_node *kfd_device_by_id(uint32_t gpu_id)
return top_dev->gpu;
}
-struct kfd_node *kfd_device_by_pci_dev(const struct pci_dev *pdev)
-{
- struct kfd_topology_device *top_dev;
- struct kfd_node *device = NULL;
-
- down_read(&topology_lock);
-
- list_for_each_entry(top_dev, &topology_device_list, list)
- if (top_dev->gpu && top_dev->gpu->adev->pdev == pdev) {
- device = top_dev->gpu;
- break;
- }
-
- up_read(&topology_lock);
-
- return device;
-}
-
/* Called with write topology_lock acquired */
static void kfd_release_topology_device(struct kfd_topology_device *dev)
{
@@ -509,6 +491,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
dev->node_props.num_sdma_queues_per_engine);
sysfs_show_32bit_prop(buffer, offs, "num_cp_queues",
dev->node_props.num_cp_queues);
+ sysfs_show_32bit_prop(buffer, offs, "cwsr_size",
+ dev->node_props.cwsr_size);
+ sysfs_show_32bit_prop(buffer, offs, "ctl_stack_size",
+ dev->node_props.ctl_stack_size);
if (dev->gpu) {
log_max_watch_addr =
@@ -528,6 +514,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
dev->node_props.capability |=
HSA_CAP_AQL_QUEUE_DOUBLE_MAP;
+ if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0) &&
+ (dev->gpu->adev->sdma.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE))
+ dev->node_props.capability2 |= HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED;
+
sysfs_show_32bit_prop(buffer, offs, "max_engine_clk_fcompute",
dev->node_props.max_engine_clk_fcompute);
@@ -537,11 +527,17 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
dev->gpu->kfd->mec_fw_version);
sysfs_show_32bit_prop(buffer, offs, "capability",
dev->node_props.capability);
+ sysfs_show_32bit_prop(buffer, offs, "capability2",
+ dev->node_props.capability2);
sysfs_show_64bit_prop(buffer, offs, "debug_prop",
dev->node_props.debug_prop);
sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version",
dev->gpu->kfd->sdma_fw_version);
sysfs_show_64bit_prop(buffer, offs, "unique_id",
+ dev->gpu->xcp &&
+ (dev->gpu->xcp->xcp_mgr->mode !=
+ AMDGPU_SPX_PARTITION_MODE) ?
+ dev->gpu->xcp->unique_id :
dev->gpu->adev->unique_id);
sysfs_show_32bit_prop(buffer, offs, "num_xcc",
NUM_XCC(dev->gpu->xcc_mask));
@@ -715,7 +711,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
i = 0;
list_for_each_entry(mem, &dev->mem_props, list) {
- mem->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ mem->kobj = kzalloc_obj(struct kobject);
if (!mem->kobj)
return -ENOMEM;
ret = kobject_init_and_add(mem->kobj, &mem_type,
@@ -736,7 +732,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
i = 0;
list_for_each_entry(cache, &dev->cache_props, list) {
- cache->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ cache->kobj = kzalloc_obj(struct kobject);
if (!cache->kobj)
return -ENOMEM;
ret = kobject_init_and_add(cache->kobj, &cache_type,
@@ -757,7 +753,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
i = 0;
list_for_each_entry(iolink, &dev->io_link_props, list) {
- iolink->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ iolink->kobj = kzalloc_obj(struct kobject);
if (!iolink->kobj)
return -ENOMEM;
ret = kobject_init_and_add(iolink->kobj, &iolink_type,
@@ -778,7 +774,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
i = 0;
list_for_each_entry(p2plink, &dev->p2p_link_props, list) {
- p2plink->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ p2plink->kobj = kzalloc_obj(struct kobject);
if (!p2plink->kobj)
return -ENOMEM;
ret = kobject_init_and_add(p2plink->kobj, &iolink_type,
@@ -935,17 +931,12 @@ static void kfd_debug_print_topology(void)
dev = list_last_entry(&topology_device_list,
struct kfd_topology_device, list);
if (dev) {
- if (dev->node_props.cpu_cores_count &&
- dev->node_props.simd_count) {
- pr_info("Topology: Add APU node [0x%0x:0x%0x]\n",
- dev->node_props.device_id,
- dev->node_props.vendor_id);
- } else if (dev->node_props.cpu_cores_count)
+ if (dev->node_props.cpu_cores_count)
pr_info("Topology: Add CPU node\n");
- else if (dev->node_props.simd_count)
- pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n",
- dev->node_props.device_id,
- dev->node_props.vendor_id);
+ else
+ pr_info("Topology: Add GPU node [0x%0x:0x%0x]\n",
+ dev->node_props.vendor_id,
+ dev->node_props.device_id);
}
up_read(&topology_lock);
}
@@ -968,24 +959,23 @@ static void kfd_update_system_properties(void)
up_read(&topology_lock);
}
-static void find_system_memory(const struct dmi_header *dm,
- void *private)
+static void find_system_memory(const struct dmi_header *dm, void *private)
{
+ struct dmi_mem_device *memdev = container_of(dm, struct dmi_mem_device, header);
struct kfd_mem_properties *mem;
- u16 mem_width, mem_clock;
struct kfd_topology_device *kdev =
(struct kfd_topology_device *)private;
- const u8 *dmi_data = (const u8 *)(dm + 1);
-
- if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) {
- mem_width = (u16)(*(const u16 *)(dmi_data + 0x6));
- mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11));
- list_for_each_entry(mem, &kdev->mem_props, list) {
- if (mem_width != 0xFFFF && mem_width != 0)
- mem->width = mem_width;
- if (mem_clock != 0)
- mem->mem_clk_max = mem_clock;
- }
+
+ if (memdev->header.type != DMI_ENTRY_MEM_DEVICE)
+ return;
+ if (memdev->header.length < sizeof(struct dmi_mem_device))
+ return;
+
+ list_for_each_entry(mem, &kdev->mem_props, list) {
+ if (memdev->total_width != 0xFFFF && memdev->total_width != 0)
+ mem->width = memdev->total_width;
+ if (memdev->speed != 0)
+ mem->mem_clk_max = memdev->speed;
}
}
@@ -1284,34 +1274,41 @@ static void kfd_set_recommended_sdma_engines(struct kfd_topology_device *to_dev,
{
struct kfd_node *gpu = outbound_link->gpu;
struct amdgpu_device *adev = gpu->adev;
- int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes;
+ unsigned int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes;
+ unsigned int num_xgmi_sdma_engines = kfd_get_num_xgmi_sdma_engines(gpu);
+ unsigned int num_sdma_engines = kfd_get_num_sdma_engines(gpu);
+ uint32_t sdma_eng_id_mask = (1 << num_sdma_engines) - 1;
+ uint32_t xgmi_sdma_eng_id_mask =
+ ((1 << num_xgmi_sdma_engines) - 1) << num_sdma_engines;
+
bool support_rec_eng = !amdgpu_sriov_vf(adev) && to_dev->gpu &&
adev->aid_mask && num_xgmi_nodes && gpu->kfd->num_nodes == 1 &&
- kfd_get_num_xgmi_sdma_engines(gpu) >= 14 &&
- (!(adev->flags & AMD_IS_APU) && num_xgmi_nodes == 8);
+ num_xgmi_sdma_engines >= 6 && (!(adev->flags & AMD_IS_APU) &&
+ num_xgmi_nodes == 8);
if (support_rec_eng) {
int src_socket_id = adev->gmc.xgmi.physical_node_id;
int dst_socket_id = to_dev->gpu->adev->gmc.xgmi.physical_node_id;
+ unsigned int reshift = num_xgmi_sdma_engines == 6 ? 1 : 0;
outbound_link->rec_sdma_eng_id_mask =
- 1 << rec_sdma_eng_map[src_socket_id][dst_socket_id];
+ 1 << (rec_sdma_eng_map[src_socket_id][dst_socket_id] >> reshift);
inbound_link->rec_sdma_eng_id_mask =
- 1 << rec_sdma_eng_map[dst_socket_id][src_socket_id];
- } else {
- int num_sdma_eng = kfd_get_num_sdma_engines(gpu);
- int i, eng_offset = 0;
+ 1 << (rec_sdma_eng_map[dst_socket_id][src_socket_id] >> reshift);
- if (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI &&
- kfd_get_num_xgmi_sdma_engines(gpu) && to_dev->gpu) {
- eng_offset = num_sdma_eng;
- num_sdma_eng = kfd_get_num_xgmi_sdma_engines(gpu);
- }
+ /* If recommended engine is out of range, need to reset the mask */
+ if (outbound_link->rec_sdma_eng_id_mask & sdma_eng_id_mask)
+ outbound_link->rec_sdma_eng_id_mask = xgmi_sdma_eng_id_mask;
+ if (inbound_link->rec_sdma_eng_id_mask & sdma_eng_id_mask)
+ inbound_link->rec_sdma_eng_id_mask = xgmi_sdma_eng_id_mask;
- for (i = 0; i < num_sdma_eng; i++) {
- outbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
- inbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
- }
+ } else {
+ uint32_t engine_mask = (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI &&
+ num_xgmi_sdma_engines && to_dev->gpu) ? xgmi_sdma_eng_id_mask :
+ sdma_eng_id_mask;
+
+ outbound_link->rec_sdma_eng_id_mask = engine_mask;
+ inbound_link->rec_sdma_eng_id_mask = engine_mask;
}
}
@@ -1384,7 +1381,7 @@ static int kfd_build_p2p_node_entry(struct kfd_topology_device *dev,
{
int ret;
- p2plink->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ p2plink->kobj = kzalloc_obj(struct kobject);
if (!p2plink->kobj)
return -ENOMEM;
@@ -1593,7 +1590,8 @@ static int kfd_dev_create_p2p_links(void)
break;
if (!dev->gpu || !dev->gpu->adev ||
(dev->gpu->kfd->hive_id &&
- dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id))
+ dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id &&
+ amdgpu_xgmi_get_is_sharing_enabled(dev->gpu->adev, new_dev->gpu->adev)))
goto next;
/* check if node(s) is/are peer accessible in one direction or bi-direction */
@@ -1683,17 +1681,32 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
int cache_type, unsigned int cu_processor_id,
struct kfd_node *knode)
{
- unsigned int cu_sibling_map_mask;
+ unsigned int cu_sibling_map_mask = 0;
int first_active_cu;
int i, j, k, xcc, start, end;
int num_xcc = NUM_XCC(knode->xcc_mask);
struct kfd_cache_properties *pcache = NULL;
enum amdgpu_memory_partition mode;
struct amdgpu_device *adev = knode->adev;
+ bool found = false;
start = ffs(knode->xcc_mask) - 1;
end = start + num_xcc;
- cu_sibling_map_mask = cu_info->bitmap[start][0][0];
+
+ /* To find the bitmap in the first active cu in the first
+ * xcc, it is based on the assumption that evrey xcc must
+ * have at least one active cu.
+ */
+ for (i = 0; i < gfx_info->max_shader_engines && !found; i++) {
+ for (j = 0; j < gfx_info->max_sh_per_se && !found; j++) {
+ if (cu_info->bitmap[start][i % 4][j % 4]) {
+ cu_sibling_map_mask =
+ cu_info->bitmap[start][i % 4][j % 4];
+ found = true;
+ }
+ }
+ }
+
cu_sibling_map_mask &=
((1 << pcache_info[cache_type].num_cu_shared) - 1);
first_active_cu = ffs(cu_sibling_map_mask);
@@ -2000,18 +2013,23 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
dev->node_props.capability |=
HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
- dev->node_props.capability |= HSA_CAP_PER_QUEUE_RESET_SUPPORTED;
+ if (!amdgpu_sriov_vf(dev->gpu->adev))
+ dev->node_props.capability |= HSA_CAP_PER_QUEUE_RESET_SUPPORTED;
+
} else {
dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10 |
HSA_DBG_WATCH_ADDR_MASK_HI_BIT;
- if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0))
- dev->node_props.capability |=
- HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
-
if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 0, 0))
dev->node_props.capability |=
HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED;
+
+ if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 1, 0)) {
+ dev->node_props.capability |=
+ HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
+ dev->node_props.capability2 |=
+ HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED;
+ }
}
kfd_topology_set_dbg_firmware_support(dev);
@@ -2279,6 +2297,17 @@ int kfd_topology_remove_device(struct kfd_node *gpu)
return res;
}
+uint32_t kfd_topology_get_num_devices(void)
+{
+ uint32_t num_devices;
+
+ down_read(&topology_lock);
+ num_devices = sys_props.num_devices;
+ up_read(&topology_lock);
+
+ return num_devices;
+}
+
/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD
* topology. If GPU device is found @idx, then valid kfd_dev pointer is
* returned through @kdev
@@ -2339,6 +2368,28 @@ int kfd_numa_node_to_apic_id(int numa_node_id)
return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id));
}
+/* kfd_gpu_node_num - Return kfd gpu node number at system */
+uint32_t kfd_gpu_node_num(void)
+{
+ struct kfd_node *dev;
+ u8 gpu_num = 0;
+ u8 id = 0;
+
+ while (kfd_topology_enum_kfd_devices(id, &dev) == 0) {
+ if (!dev || kfd_devcgroup_check_permission(dev)) {
+ /* Skip non GPU devices and devices to which the
+ * current process have no access to
+ */
+ id++;
+ continue;
+ }
+ id++;
+ gpu_num++;
+ }
+
+ return gpu_num;
+}
+
#if defined(CONFIG_DEBUG_FS)
int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data)
@@ -2392,3 +2443,26 @@ int kfd_debugfs_rls_by_device(struct seq_file *m, void *data)
}
#endif
+
+void kfd_update_svm_support_properties(struct amdgpu_device *adev)
+{
+ struct kfd_topology_device *dev;
+ int ret;
+
+ down_write(&topology_lock);
+ list_for_each_entry(dev, &topology_device_list, list) {
+ if (!dev->gpu || dev->gpu->adev != adev)
+ continue;
+
+ if (KFD_IS_SVM_API_SUPPORTED(adev)) {
+ dev->node_props.capability |= HSA_CAP_SVMAPI_SUPPORTED;
+ ret = kfd_topology_update_sysfs();
+ if (!ret)
+ sys_props.generation_count++;
+ else
+ dev_err(adev->dev, "Failed to update SVM support properties. ret=%d\n", ret);
+ } else
+ dev->node_props.capability &= ~HSA_CAP_SVMAPI_SUPPORTED;
+ }
+ up_write(&topology_lock);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index 155b5c410af1..ad63ba67b577 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -24,6 +24,7 @@
#ifndef __KFD_TOPOLOGY_H__
#define __KFD_TOPOLOGY_H__
+#include <linux/dmi.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/kfd_sysfs.h>
@@ -50,6 +51,7 @@ struct kfd_node_properties {
uint32_t cpu_core_id_base;
uint32_t simd_id_base;
uint32_t capability;
+ uint32_t capability2;
uint64_t debug_prop;
uint32_t max_waves_per_simd;
uint32_t lds_size_in_kb;
@@ -179,8 +181,30 @@ struct kfd_system_properties {
struct attribute attr_props;
};
+struct dmi_mem_device {
+ struct dmi_header header;
+ u16 physical_handle;
+ u16 error_handle;
+ u16 total_width;
+ u16 data_width;
+ u16 size;
+ u8 form_factor;
+ u8 device_set;
+ u8 device_locator;
+ u8 bank_locator;
+ u8 memory_type;
+ u16 type_detail;
+ u16 speed;
+} __packed;
+
struct kfd_topology_device *kfd_create_topology_device(
struct list_head *device_list);
void kfd_release_topology_device_list(struct list_head *device_list);
+#if IS_ENABLED(CONFIG_HSA_AMD)
+void kfd_update_svm_support_properties(struct amdgpu_device *adev);
+#else
+static inline void kfd_update_svm_support_properties(struct amdgpu_device *adev) {}
+#endif
+
#endif /* __KFD_TOPOLOGY_H__ */