diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
29 files changed, 2290 insertions, 880 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index a5ae7bcf44eb..0d3d8972240d 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -38,6 +38,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_mqd_manager_v9.o \ $(AMDKFD_PATH)/kfd_mqd_manager_v10.o \ $(AMDKFD_PATH)/kfd_mqd_manager_v11.o \ + $(AMDKFD_PATH)/kfd_mqd_manager_v12.o \ $(AMDKFD_PATH)/kfd_kernel_queue.o \ $(AMDKFD_PATH)/kfd_packet_manager.o \ $(AMDKFD_PATH)/kfd_packet_manager_vi.o \ @@ -49,6 +50,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_v10.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_v11.o \ + $(AMDKFD_PATH)/kfd_device_queue_manager_v12.o \ $(AMDKFD_PATH)/kfd_interrupt.o \ $(AMDKFD_PATH)/kfd_events.o \ $(AMDKFD_PATH)/cik_event_interrupt.o \ diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 5a0308d26b53..02f7ba8c93cd 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -678,7 +678,7 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { }; static const uint32_t cwsr_trap_nv1x_hex[] = { - 0xbf820001, 0xbf820394, + 0xbf820001, 0xbf820393, 0xb0804004, 0xb978f802, 0x8a78ff78, 0x00020006, 0xb97bf803, 0x876eff78, @@ -711,19 +711,19 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xbf0d8f7b, 0xbf840002, 0x887bff7b, 0xffff0000, 0xf4011bbd, 0xfa000010, - 0xbf8cc07f, 0x8f6e976e, + 0xbf8c0000, 0x8f6e976e, 0x8a77ff77, 0x00800000, 0x88776e77, 0xf4051bbd, - 0xfa000000, 0xbf8cc07f, + 0xfa000000, 0xbf8c0000, 0xf4051ebd, 0xfa000008, - 0xbf8cc07f, 0x87ee6e6e, + 0xbf8c0000, 0x87ee6e6e, 0xbf840001, 0xbe80206e, - 0x876eff6d, 0x01ff0000, - 0xbf850005, 0x8878ff78, - 0x00002000, 0x80ec886c, - 0x82ed806d, 0xbf820005, - 0x876eff6d, 0x01000000, - 0xbf850002, 0x806c846c, + 0x876eff6d, 0x00ff0000, + 0xbf850008, 0x876eff6d, + 0x01000000, 0xbf850007, + 0x8878ff78, 0x00002000, + 0x80ec886c, 0x82ed806d, + 0xbf820002, 0x806c846c, 0x826d806d, 0x876dff6d, 0x0000ffff, 0x907a8977, 0x877bff7a, 0x003f8000, @@ -932,23 +932,48 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, 0xb97b4306, 0x877bc17b, - 0xbf840086, 0xbf8a0000, + 0xbf840085, 0xbf8a0000, 0x877aff6d, 0x80000000, - 0xbf840082, 0x8f7b867b, - 0x8f7b827b, 0xbef6037b, - 0xb9703a05, 0x80708170, - 0xbf0d9973, 0xbf850002, - 0x8f708970, 0xbf820001, - 0x8f708a70, 0xb97a1e06, - 0x8f7a8a7a, 0x80707a70, - 0x8070ff70, 0x00000200, - 0x8070ff70, 0x00000080, - 0xbef603ff, 0x01000000, - 0xd7650000, 0x000100c1, - 0xd7660000, 0x000200c1, - 0x16000084, 0x907c9973, - 0x877c817c, 0xbf06817c, - 0xbefc0380, 0xbf850033, + 0xbf840081, 0x8f7b887b, + 0xbef6037b, 0xb9703a05, + 0x80708170, 0xbf0d9973, + 0xbf850002, 0x8f708970, + 0xbf820001, 0x8f708a70, + 0xb97a1e06, 0x8f7a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0x8070ff70, + 0x00000080, 0xbef603ff, + 0x01000000, 0xd7650000, + 0x000100c1, 0xd7660000, + 0x000200c1, 0x16000084, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbefc0380, + 0xbf850033, 0xb97af803, + 0x8a7a7aff, 0x10000000, + 0xbf85001d, 0xd8d80000, + 0x01000000, 0xbf8c0000, + 0xbe840380, 0xd7600000, + 0x00000901, 0x80048104, + 0xd7600001, 0x00000901, + 0x80048104, 0xd7600002, + 0x00000901, 0x80048104, + 0xd7600003, 0x00000901, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06a004, 0xbf84ffef, + 0x807cff7c, 0x00000080, + 0xd5250000, 0x0001ff00, + 0x00000080, 0xbf0a7b7c, + 0xbf85ffe4, 0xbf820044, + 0xbe8303ff, 0x00000080, + 0xbf800000, 0xbf800000, + 0xbf800000, 0xd8d80000, + 0x01000000, 0xbf8c0000, + 0xe0704000, 0x705d0100, + 0x807c037c, 0x80700370, + 0xd5250000, 0x0001ff00, + 0x00000080, 0xbf0a7b7c, + 0xbf85fff4, 0xbf820032, 0xb97af803, 0x8a7a7aff, 0x10000000, 0xbf85001d, 0xd8d80000, 0x01000000, @@ -960,24 +985,45 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80048104, 0xd7600003, 0x00000901, 0x80048104, 0xf469003a, 0xe0000000, - 0x80709070, 0xbf06a004, + 0x80709070, 0xbf06c004, 0xbf84ffef, 0x807cff7c, - 0x00000080, 0xd5250000, - 0x0001ff00, 0x00000080, + 0x00000100, 0xd5250000, + 0x0001ff00, 0x00000100, 0xbf0a7b7c, 0xbf85ffe4, - 0xbf820044, 0xbe8303ff, - 0x00000080, 0xbf800000, + 0xbf820011, 0xbe8303ff, + 0x00000100, 0xbf800000, 0xbf800000, 0xbf800000, 0xd8d80000, 0x01000000, 0xbf8c0000, 0xe0704000, 0x705d0100, 0x807c037c, 0x80700370, 0xd5250000, - 0x0001ff00, 0x00000080, + 0x0001ff00, 0x00000100, 0xbf0a7b7c, 0xbf85fff4, - 0xbf820032, 0xb97af803, - 0x8a7a7aff, 0x10000000, - 0xbf85001d, 0xd8d80000, - 0x01000000, 0xbf8c0000, + 0xbefe03c1, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850004, 0xbef003ff, + 0x00000200, 0xbeff0380, + 0xbf820003, 0xbef003ff, + 0x00000400, 0xbeff03c1, + 0xb97b3a05, 0x807b817b, + 0x8f7b827b, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf85006b, 0xbef603ff, + 0x01000000, 0xbefc0384, + 0xbf0a7b7c, 0xbf8400fa, + 0xb97af803, 0x8a7a7aff, + 0x10000000, 0xbf850050, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xbe840380, 0xd7600000, + 0x00000900, 0x80048104, + 0xd7600001, 0x00000900, + 0x80048104, 0xd7600002, + 0x00000900, 0x80048104, + 0xd7600003, 0x00000900, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06a004, 0xbf84ffef, 0xbe840380, 0xd7600000, 0x00000901, 0x80048104, 0xd7600001, 0x00000901, @@ -986,32 +1032,39 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xd7600003, 0x00000901, 0x80048104, 0xf469003a, 0xe0000000, 0x80709070, - 0xbf06c004, 0xbf84ffef, - 0x807cff7c, 0x00000100, - 0xd5250000, 0x0001ff00, - 0x00000100, 0xbf0a7b7c, - 0xbf85ffe4, 0xbf820011, - 0xbe8303ff, 0x00000100, - 0xbf800000, 0xbf800000, - 0xbf800000, 0xd8d80000, - 0x01000000, 0xbf8c0000, - 0xe0704000, 0x705d0100, - 0x807c037c, 0x80700370, - 0xd5250000, 0x0001ff00, - 0x00000100, 0xbf0a7b7c, - 0xbf85fff4, 0xbefe03c1, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850004, - 0xbef003ff, 0x00000200, - 0xbeff0380, 0xbf820003, - 0xbef003ff, 0x00000400, - 0xbeff03c1, 0xb97b3a05, - 0x807b817b, 0x8f7b827b, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf85006b, + 0xbf06a004, 0xbf84ffef, + 0xbe840380, 0xd7600000, + 0x00000902, 0x80048104, + 0xd7600001, 0x00000902, + 0x80048104, 0xd7600002, + 0x00000902, 0x80048104, + 0xd7600003, 0x00000902, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06a004, 0xbf84ffef, + 0xbe840380, 0xd7600000, + 0x00000903, 0x80048104, + 0xd7600001, 0x00000903, + 0x80048104, 0xd7600002, + 0x00000903, 0x80048104, + 0xd7600003, 0x00000903, + 0x80048104, 0xf469003a, + 0xe0000000, 0x80709070, + 0xbf06a004, 0xbf84ffef, + 0x807c847c, 0xbf0a7b7c, + 0xbf85ffb1, 0xbf8200a6, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xe0704000, 0x705d0000, + 0xe0704080, 0x705d0100, + 0xe0704100, 0x705d0200, + 0xe0704180, 0x705d0300, + 0x807c847c, 0x8070ff70, + 0x00000200, 0xbf0a7b7c, + 0xbf85ffef, 0xbf820094, 0xbef603ff, 0x01000000, 0xbefc0384, 0xbf0a7b7c, - 0xbf8400fa, 0xb97af803, + 0xbf840065, 0xb97af803, 0x8a7a7aff, 0x10000000, 0xbf850050, 0x7e008700, 0x7e028701, 0x7e048702, @@ -1023,7 +1076,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80048104, 0xd7600003, 0x00000900, 0x80048104, 0xf469003a, 0xe0000000, - 0x80709070, 0xbf06a004, + 0x80709070, 0xbf06c004, 0xbf84ffef, 0xbe840380, 0xd7600000, 0x00000901, 0x80048104, 0xd7600001, @@ -1032,7 +1085,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80048104, 0xd7600003, 0x00000901, 0x80048104, 0xf469003a, 0xe0000000, - 0x80709070, 0xbf06a004, + 0x80709070, 0xbf06c004, 0xbf84ffef, 0xbe840380, 0xd7600000, 0x00000902, 0x80048104, 0xd7600001, @@ -1041,7 +1094,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80048104, 0xd7600003, 0x00000902, 0x80048104, 0xf469003a, 0xe0000000, - 0x80709070, 0xbf06a004, + 0x80709070, 0xbf06c004, 0xbf84ffef, 0xbe840380, 0xd7600000, 0x00000903, 0x80048104, 0xd7600001, @@ -1050,25 +1103,24 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80048104, 0xd7600003, 0x00000903, 0x80048104, 0xf469003a, 0xe0000000, - 0x80709070, 0xbf06a004, + 0x80709070, 0xbf06c004, 0xbf84ffef, 0x807c847c, 0xbf0a7b7c, 0xbf85ffb1, - 0xbf8200a6, 0x7e008700, + 0xbf82003b, 0x7e008700, 0x7e028701, 0x7e048702, 0x7e068703, 0xe0704000, - 0x705d0000, 0xe0704080, - 0x705d0100, 0xe0704100, - 0x705d0200, 0xe0704180, + 0x705d0000, 0xe0704100, + 0x705d0100, 0xe0704200, + 0x705d0200, 0xe0704300, 0x705d0300, 0x807c847c, - 0x8070ff70, 0x00000200, + 0x8070ff70, 0x00000400, 0xbf0a7b7c, 0xbf85ffef, - 0xbf820094, 0xbef603ff, - 0x01000000, 0xbefc0384, - 0xbf0a7b7c, 0xbf840065, - 0xb97af803, 0x8a7a7aff, - 0x10000000, 0xbf850050, - 0x7e008700, 0x7e028701, - 0x7e048702, 0x7e068703, + 0xb97b1e06, 0x877bc17b, + 0xbf840027, 0x8f7b837b, + 0x807b7c7b, 0xbefe03c1, + 0xbeff0380, 0xb97af803, + 0x8a7a7aff, 0x10000000, + 0xbf850017, 0x7e008700, 0xbe840380, 0xd7600000, 0x00000900, 0x80048104, 0xd7600001, 0x00000900, @@ -1078,78 +1130,25 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80048104, 0xf469003a, 0xe0000000, 0x80709070, 0xbf06c004, 0xbf84ffef, - 0xbe840380, 0xd7600000, - 0x00000901, 0x80048104, - 0xd7600001, 0x00000901, - 0x80048104, 0xd7600002, - 0x00000901, 0x80048104, - 0xd7600003, 0x00000901, - 0x80048104, 0xf469003a, - 0xe0000000, 0x80709070, - 0xbf06c004, 0xbf84ffef, - 0xbe840380, 0xd7600000, - 0x00000902, 0x80048104, - 0xd7600001, 0x00000902, - 0x80048104, 0xd7600002, - 0x00000902, 0x80048104, - 0xd7600003, 0x00000902, - 0x80048104, 0xf469003a, - 0xe0000000, 0x80709070, - 0xbf06c004, 0xbf84ffef, - 0xbe840380, 0xd7600000, - 0x00000903, 0x80048104, - 0xd7600001, 0x00000903, - 0x80048104, 0xd7600002, - 0x00000903, 0x80048104, - 0xd7600003, 0x00000903, - 0x80048104, 0xf469003a, - 0xe0000000, 0x80709070, - 0xbf06c004, 0xbf84ffef, - 0x807c847c, 0xbf0a7b7c, - 0xbf85ffb1, 0xbf82003b, - 0x7e008700, 0x7e028701, - 0x7e048702, 0x7e068703, - 0xe0704000, 0x705d0000, - 0xe0704100, 0x705d0100, - 0xe0704200, 0x705d0200, - 0xe0704300, 0x705d0300, - 0x807c847c, 0x8070ff70, - 0x00000400, 0xbf0a7b7c, - 0xbf85ffef, 0xb97b1e06, - 0x877bc17b, 0xbf840027, - 0x8f7b837b, 0x807b7c7b, - 0xbefe03c1, 0xbeff0380, - 0xb97af803, 0x8a7a7aff, - 0x10000000, 0xbf850017, - 0x7e008700, 0xbe840380, - 0xd7600000, 0x00000900, - 0x80048104, 0xd7600001, - 0x00000900, 0x80048104, - 0xd7600002, 0x00000900, - 0x80048104, 0xd7600003, - 0x00000900, 0x80048104, - 0xf469003a, 0xe0000000, - 0x80709070, 0xbf06c004, - 0xbf84ffef, 0x807c817c, - 0xbf0a7b7c, 0xbf85ffea, - 0xbf820008, 0x7e008700, - 0xe0704000, 0x705d0000, - 0x807c817c, 0x8070ff70, - 0x00000080, 0xbf0a7b7c, - 0xbf85fff8, 0xbf820144, - 0xbef4037e, 0x8775ff7f, - 0x0000ffff, 0x8875ff75, - 0x00040000, 0xbef60380, - 0xbef703ff, 0x10807fac, - 0xb97202dc, 0x8f729972, - 0x876eff7f, 0x04000000, - 0xbf840034, 0xbefe03c1, - 0x907c9972, 0x877c817c, - 0xbf06817c, 0xbf850002, - 0xbeff0380, 0xbf820001, - 0xbeff03c1, 0xb96f4306, - 0x876fc16f, 0xbf840029, - 0x8f6f866f, 0x8f6f826f, + 0x807c817c, 0xbf0a7b7c, + 0xbf85ffea, 0xbf820008, + 0x7e008700, 0xe0704000, + 0x705d0000, 0x807c817c, + 0x8070ff70, 0x00000080, + 0xbf0a7b7c, 0xbf85fff8, + 0xbf82013f, 0xbef4037e, + 0x8775ff7f, 0x0000ffff, + 0x8875ff75, 0x00040000, + 0xbef60380, 0xbef703ff, + 0x10807fac, 0xb97202dc, + 0x8f729972, 0x876eff7f, + 0x04000000, 0xbf840033, + 0xbefe03c1, 0x907c9972, + 0x877c817c, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb96f4306, 0x876fc16f, + 0xbf840028, 0x8f6f886f, 0xbef6036f, 0xb9783a05, 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, @@ -1185,7 +1184,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x785d0000, 0xe0304080, 0x785d0100, 0xe0304100, 0x785d0200, 0xe0304180, - 0x785d0300, 0xbf8c3f70, + 0x785d0300, 0xbf8c0000, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807c847c, 0x8078ff78, @@ -1194,7 +1193,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x6e5d0000, 0xe0304080, 0x6e5d0100, 0xe0304100, 0x6e5d0200, 0xe0304180, - 0x6e5d0300, 0xbf8c3f70, + 0x6e5d0300, 0xbf8c0000, 0xbf820034, 0xbef603ff, 0x01000000, 0xbeee0378, 0x8078ff78, 0x00000400, @@ -1203,7 +1202,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x785d0000, 0xe0304100, 0x785d0100, 0xe0304200, 0x785d0200, 0xe0304300, - 0x785d0300, 0xbf8c3f70, + 0x785d0300, 0xbf8c0000, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807c847c, 0x8078ff78, @@ -1213,7 +1212,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x8f6f836f, 0x806f7c6f, 0xbefe03c1, 0xbeff0380, 0xe0304000, 0x785d0000, - 0xbf8c3f70, 0x7e008500, + 0xbf8c0000, 0x7e008500, 0x807c817c, 0x8078ff78, 0x00000080, 0xbf0a6f7c, 0xbf85fff7, 0xbeff03c1, @@ -1221,7 +1220,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xe0304100, 0x6e5d0100, 0xe0304200, 0x6e5d0200, 0xe0304300, 0x6e5d0300, - 0xbf8c3f70, 0xb9783a05, + 0xbf8c0000, 0xb9783a05, 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, 0x8f788a78, @@ -1232,16 +1231,16 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x01000000, 0xbefc03ff, 0x0000006c, 0x80f89078, 0xf429003a, 0xf0000000, - 0xbf8cc07f, 0x80fc847c, + 0xbf8c0000, 0x80fc847c, 0xbf800000, 0xbe803100, 0xbe823102, 0x80f8a078, 0xf42d003a, 0xf0000000, - 0xbf8cc07f, 0x80fc887c, + 0xbf8c0000, 0x80fc887c, 0xbf800000, 0xbe803100, 0xbe823102, 0xbe843104, 0xbe863106, 0x80f8c078, 0xf431003a, 0xf0000000, - 0xbf8cc07f, 0x80fc907c, + 0xbf8c0000, 0x80fc907c, 0xbf800000, 0xbe803100, 0xbe823102, 0xbe843104, 0xbe863106, 0xbe883108, @@ -1271,15 +1270,13 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xf4211cfa, 0xf0000000, 0x80788478, 0xf4211bba, 0xf0000000, 0x80788478, - 0xbf8cc07f, 0xb9eef814, + 0xbf8c0000, 0xb9eef814, 0xf4211bba, 0xf0000000, - 0x80788478, 0xbf8cc07f, + 0x80788478, 0xbf8c0000, 0xb9eef815, 0xbefc036f, 0xbefe0370, 0xbeff0371, - 0x876f7bff, 0x000003ff, - 0xb9ef4803, 0xb9f9f816, - 0x876f7bff, 0xfffff800, - 0x906f8b6f, 0xb9efa2c3, + 0xb9f9f816, 0xb9fb4803, + 0x907b8b7b, 0xb9fba2c3, 0xb9f3f801, 0xb96e3a05, 0x806e816e, 0xbf0d9972, 0xbf850002, 0x8f6e896e, @@ -1291,7 +1288,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x0000ffff, 0xf4091c37, 0xfa000050, 0xf4091d37, 0xfa000060, 0xf4011e77, - 0xfa000074, 0xbf8cc07f, + 0xfa000074, 0xbf8c0000, 0x906e8977, 0x876fff6e, 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, @@ -2276,7 +2273,7 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = { }; static const uint32_t cwsr_trap_gfx10_hex[] = { - 0xbf820001, 0xbf820221, + 0xbf820001, 0xbf820220, 0xb0804004, 0xb978f802, 0x8a78ff78, 0x00020006, 0xb97bf803, 0x876eff78, @@ -2302,19 +2299,19 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf0d8f7b, 0xbf840002, 0x887bff7b, 0xffff0000, 0xf4011bbd, 0xfa000010, - 0xbf8cc07f, 0x8f6e976e, + 0xbf8c0000, 0x8f6e976e, 0x8a77ff77, 0x00800000, 0x88776e77, 0xf4051bbd, - 0xfa000000, 0xbf8cc07f, + 0xfa000000, 0xbf8c0000, 0xf4051ebd, 0xfa000008, - 0xbf8cc07f, 0x87ee6e6e, + 0xbf8c0000, 0x87ee6e6e, 0xbf840001, 0xbe80206e, - 0x876eff6d, 0x01ff0000, - 0xbf850005, 0x8878ff78, - 0x00002000, 0x80ec886c, - 0x82ed806d, 0xbf820005, - 0x876eff6d, 0x01000000, - 0xbf850002, 0x806c846c, + 0x876eff6d, 0x00ff0000, + 0xbf850008, 0x876eff6d, + 0x01000000, 0xbf850007, + 0x8878ff78, 0x00002000, + 0x80ec886c, 0x82ed806d, + 0xbf820002, 0x806c846c, 0x826d806d, 0x876dff6d, 0x0000ffff, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, @@ -2322,7 +2319,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x0000ffff, 0xbefa0380, 0xb9fa0283, 0xbeee037e, 0xbeef037f, 0xbefe0480, - 0xbf900004, 0xbf8cc07f, + 0xbf900004, 0xbf8c0000, 0x877aff7f, 0x04000000, 0x8f7a857a, 0x886d7a6d, 0x7e008200, 0xbefa037e, @@ -2475,94 +2472,93 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, 0xb97b4306, 0x877bc17b, - 0xbf840044, 0xbf8a0000, + 0xbf840043, 0xbf8a0000, 0x877aff6d, 0x80000000, - 0xbf840040, 0x8f7b867b, - 0x8f7b827b, 0xbef6037b, - 0xb9703a05, 0x80708170, - 0xbf0d9973, 0xbf850002, - 0x8f708970, 0xbf820001, - 0x8f708a70, 0xb97a1e06, - 0x8f7a8a7a, 0x80707a70, - 0x8070ff70, 0x00000200, - 0x8070ff70, 0x00000080, - 0xbef603ff, 0x01000000, - 0xd7650000, 0x000100c1, - 0xd7660000, 0x000200c1, - 0x16000084, 0x907c9973, - 0x877c817c, 0xbf06817c, - 0xbefc0380, 0xbf850012, - 0xbe8303ff, 0x00000080, + 0xbf84003f, 0x8f7b887b, + 0xbef6037b, 0xb9703a05, + 0x80708170, 0xbf0d9973, + 0xbf850002, 0x8f708970, + 0xbf820001, 0x8f708a70, + 0xb97a1e06, 0x8f7a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0x8070ff70, + 0x00000080, 0xbef603ff, + 0x01000000, 0xd7650000, + 0x000100c1, 0xd7660000, + 0x000200c1, 0x16000084, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbefc0380, + 0xbf850012, 0xbe8303ff, + 0x00000080, 0xbf800000, 0xbf800000, 0xbf800000, - 0xbf800000, 0xd8d80000, - 0x01000000, 0xbf8c0000, - 0xe0704000, 0x705d0100, - 0x807c037c, 0x80700370, - 0xd5250000, 0x0001ff00, - 0x00000080, 0xbf0a7b7c, - 0xbf85fff4, 0xbf820011, - 0xbe8303ff, 0x00000100, + 0xd8d80000, 0x01000000, + 0xbf8c0000, 0xe0704000, + 0x705d0100, 0x807c037c, + 0x80700370, 0xd5250000, + 0x0001ff00, 0x00000080, + 0xbf0a7b7c, 0xbf85fff4, + 0xbf820011, 0xbe8303ff, + 0x00000100, 0xbf800000, 0xbf800000, 0xbf800000, - 0xbf800000, 0xd8d80000, - 0x01000000, 0xbf8c0000, - 0xe0704000, 0x705d0100, - 0x807c037c, 0x80700370, - 0xd5250000, 0x0001ff00, - 0x00000100, 0xbf0a7b7c, - 0xbf85fff4, 0xbefe03c1, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850004, - 0xbef003ff, 0x00000200, - 0xbeff0380, 0xbf820003, - 0xbef003ff, 0x00000400, - 0xbeff03c1, 0xb97b3a05, - 0x807b817b, 0x8f7b827b, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850017, + 0xd8d80000, 0x01000000, + 0xbf8c0000, 0xe0704000, + 0x705d0100, 0x807c037c, + 0x80700370, 0xd5250000, + 0x0001ff00, 0x00000100, + 0xbf0a7b7c, 0xbf85fff4, + 0xbefe03c1, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850004, 0xbef003ff, + 0x00000200, 0xbeff0380, + 0xbf820003, 0xbef003ff, + 0x00000400, 0xbeff03c1, + 0xb97b3a05, 0x807b817b, + 0x8f7b827b, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850017, 0xbef603ff, + 0x01000000, 0xbefc0384, + 0xbf0a7b7c, 0xbf840037, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xe0704000, 0x705d0000, + 0xe0704080, 0x705d0100, + 0xe0704100, 0x705d0200, + 0xe0704180, 0x705d0300, + 0x807c847c, 0x8070ff70, + 0x00000200, 0xbf0a7b7c, + 0xbf85ffef, 0xbf820025, 0xbef603ff, 0x01000000, 0xbefc0384, 0xbf0a7b7c, - 0xbf840037, 0x7e008700, + 0xbf840011, 0x7e008700, 0x7e028701, 0x7e048702, 0x7e068703, 0xe0704000, - 0x705d0000, 0xe0704080, - 0x705d0100, 0xe0704100, - 0x705d0200, 0xe0704180, + 0x705d0000, 0xe0704100, + 0x705d0100, 0xe0704200, + 0x705d0200, 0xe0704300, 0x705d0300, 0x807c847c, - 0x8070ff70, 0x00000200, + 0x8070ff70, 0x00000400, 0xbf0a7b7c, 0xbf85ffef, - 0xbf820025, 0xbef603ff, - 0x01000000, 0xbefc0384, - 0xbf0a7b7c, 0xbf840011, - 0x7e008700, 0x7e028701, - 0x7e048702, 0x7e068703, + 0xb97b1e06, 0x877bc17b, + 0xbf84000c, 0x8f7b837b, + 0x807b7c7b, 0xbefe03c1, + 0xbeff0380, 0x7e008700, 0xe0704000, 0x705d0000, - 0xe0704100, 0x705d0100, - 0xe0704200, 0x705d0200, - 0xe0704300, 0x705d0300, - 0x807c847c, 0x8070ff70, - 0x00000400, 0xbf0a7b7c, - 0xbf85ffef, 0xb97b1e06, - 0x877bc17b, 0xbf84000c, - 0x8f7b837b, 0x807b7c7b, - 0xbefe03c1, 0xbeff0380, - 0x7e008700, 0xe0704000, - 0x705d0000, 0x807c817c, - 0x8070ff70, 0x00000080, - 0xbf0a7b7c, 0xbf85fff8, - 0xbf82013b, 0xbef4037e, - 0x8775ff7f, 0x0000ffff, - 0x8875ff75, 0x00040000, - 0xbef60380, 0xbef703ff, - 0x10807fac, 0xb97202dc, - 0x8f729972, 0x876eff7f, - 0x04000000, 0xbf840034, - 0xbefe03c1, 0x907c9972, - 0x877c817c, 0xbf06817c, - 0xbf850002, 0xbeff0380, - 0xbf820001, 0xbeff03c1, - 0xb96f4306, 0x876fc16f, - 0xbf840029, 0x8f6f866f, - 0x8f6f826f, 0xbef6036f, + 0x807c817c, 0x8070ff70, + 0x00000080, 0xbf0a7b7c, + 0xbf85fff8, 0xbf820136, + 0xbef4037e, 0x8775ff7f, + 0x0000ffff, 0x8875ff75, + 0x00040000, 0xbef60380, + 0xbef703ff, 0x10807fac, + 0xb97202dc, 0x8f729972, + 0x876eff7f, 0x04000000, + 0xbf840033, 0xbefe03c1, + 0x907c9972, 0x877c817c, + 0xbf06817c, 0xbf850002, + 0xbeff0380, 0xbf820001, + 0xbeff03c1, 0xb96f4306, + 0x876fc16f, 0xbf840028, + 0x8f6f886f, 0xbef6036f, 0xb9783a05, 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, @@ -2598,7 +2594,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xe0304080, 0x785d0100, 0xe0304100, 0x785d0200, 0xe0304180, 0x785d0300, - 0xbf8c3f70, 0x7e008500, + 0xbf8c0000, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807c847c, 0x8078ff78, 0x00000200, @@ -2607,7 +2603,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xe0304080, 0x6e5d0100, 0xe0304100, 0x6e5d0200, 0xe0304180, 0x6e5d0300, - 0xbf8c3f70, 0xbf820034, + 0xbf8c0000, 0xbf820034, 0xbef603ff, 0x01000000, 0xbeee0378, 0x8078ff78, 0x00000400, 0xbefc0384, @@ -2616,7 +2612,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xe0304100, 0x785d0100, 0xe0304200, 0x785d0200, 0xe0304300, 0x785d0300, - 0xbf8c3f70, 0x7e008500, + 0xbf8c0000, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807c847c, 0x8078ff78, 0x00000400, @@ -2625,7 +2621,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf84000e, 0x8f6f836f, 0x806f7c6f, 0xbefe03c1, 0xbeff0380, 0xe0304000, - 0x785d0000, 0xbf8c3f70, + 0x785d0000, 0xbf8c0000, 0x7e008500, 0x807c817c, 0x8078ff78, 0x00000080, 0xbf0a6f7c, 0xbf85fff7, @@ -2633,7 +2629,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x6e5d0000, 0xe0304100, 0x6e5d0100, 0xe0304200, 0x6e5d0200, 0xe0304300, - 0x6e5d0300, 0xbf8c3f70, + 0x6e5d0300, 0xbf8c0000, 0xb9783a05, 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, @@ -2644,16 +2640,16 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbef603ff, 0x01000000, 0xbefc03ff, 0x0000006c, 0x80f89078, 0xf429003a, - 0xf0000000, 0xbf8cc07f, + 0xf0000000, 0xbf8c0000, 0x80fc847c, 0xbf800000, 0xbe803100, 0xbe823102, 0x80f8a078, 0xf42d003a, - 0xf0000000, 0xbf8cc07f, + 0xf0000000, 0xbf8c0000, 0x80fc887c, 0xbf800000, 0xbe803100, 0xbe823102, 0xbe843104, 0xbe863106, 0x80f8c078, 0xf431003a, - 0xf0000000, 0xbf8cc07f, + 0xf0000000, 0xbf8c0000, 0x80fc907c, 0xbf800000, 0xbe803100, 0xbe823102, 0xbe843104, 0xbe863106, @@ -2683,15 +2679,13 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x80788478, 0xf4211cfa, 0xf0000000, 0x80788478, 0xf4211bba, 0xf0000000, - 0x80788478, 0xbf8cc07f, + 0x80788478, 0xbf8c0000, 0xb9eef814, 0xf4211bba, 0xf0000000, 0x80788478, - 0xbf8cc07f, 0xb9eef815, + 0xbf8c0000, 0xb9eef815, 0xbefc036f, 0xbefe0370, - 0xbeff0371, 0x876f7bff, - 0x000003ff, 0xb9ef4803, - 0x876f7bff, 0xfffff800, - 0x906f8b6f, 0xb9efa2c3, + 0xbeff0371, 0xb9fb4803, + 0x907b8b7b, 0xb9fba2c3, 0xb9f3f801, 0xb96e3a05, 0x806e816e, 0xbf0d9972, 0xbf850002, 0x8f6e896e, @@ -2703,7 +2697,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x0000ffff, 0xf4091c37, 0xfa000050, 0xf4091d37, 0xfa000060, 0xf4011e77, - 0xfa000074, 0xbf8cc07f, + 0xfa000074, 0xbf8c0000, 0x876dff6d, 0x0000ffff, 0x87fe7e7e, 0x87ea6a6a, 0xb9faf802, 0xbe80226c, @@ -2713,7 +2707,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { }; static const uint32_t cwsr_trap_gfx11_hex[] = { - 0xbfa00001, 0xbfa00225, + 0xbfa00001, 0xbfa00227, 0xb0804006, 0xb8f8f802, 0x9178ff78, 0x00020006, 0xb8fbf803, 0xbf0d9e6d, @@ -2737,187 +2731,188 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0x8b6eff6e, 0x00000800, 0xbfa20003, 0x8b6eff7b, 0x00000400, 0xbfa2002a, - 0xbefa4d82, 0xbf89fc07, + 0xbefa4d82, 0xbf890000, 0x84fa887a, 0xbf0d8f7b, 0xbfa10002, 0x8c7bff7b, 0xffff0000, 0xf4005bbd, - 0xf8000010, 0xbf89fc07, + 0xf8000010, 0xbf890000, 0x846e976e, 0x9177ff77, 0x00800000, 0x8c776e77, 0xf4045bbd, 0xf8000000, - 0xbf89fc07, 0xf4045ebd, - 0xf8000008, 0xbf89fc07, + 0xbf890000, 0xf4045ebd, + 0xf8000008, 0xbf890000, 0x8bee6e6e, 0xbfa10001, 0xbe80486e, 0x8b6eff6d, - 0x01ff0000, 0xbfa20005, - 0x8c78ff78, 0x00002000, - 0x80ec886c, 0x82ed806d, - 0xbfa00005, 0x8b6eff6d, - 0x01000000, 0xbfa20002, + 0x00ff0000, 0xbfa20008, + 0x8b6eff6d, 0x01000000, + 0xbfa20007, 0x8c78ff78, + 0x00002000, 0x80ec886c, + 0x82ed806d, 0xbfa00002, 0x806c846c, 0x826d806d, 0x8b6dff6d, 0x0000ffff, 0x8bfe7e7e, 0x8bea6a6a, 0xb978f802, 0xbe804a6c, - 0x8b6dff6d, 0x0000ffff, - 0xbefa0080, 0xb97a0283, - 0xbeee007e, 0xbeef007f, - 0xbefe0180, 0xbefe4d84, - 0xbf89fc07, 0x8b7aff7f, - 0x04000000, 0x847a857a, - 0x8c6d7a6d, 0xbefa007e, - 0x8b7bff7f, 0x0000ffff, - 0xbefe00c1, 0xbeff00c1, - 0xdca6c000, 0x007a0000, - 0x7e000280, 0xbefe007a, - 0xbeff007b, 0xb8fb02dc, - 0x847b997b, 0xb8fa3b05, - 0x807a817a, 0xbf0d997b, - 0xbfa20002, 0x847a897a, - 0xbfa00001, 0x847a8a7a, - 0xb8fb1e06, 0x847b8a7b, - 0x807a7b7a, 0x8b7bff7f, - 0x0000ffff, 0x807aff7a, - 0x00000200, 0x807a7e7a, - 0x827b807b, 0xd7610000, - 0x00010870, 0xd7610000, - 0x00010a71, 0xd7610000, - 0x00010c72, 0xd7610000, - 0x00010e73, 0xd7610000, - 0x00011074, 0xd7610000, - 0x00011275, 0xd7610000, - 0x00011476, 0xd7610000, - 0x00011677, 0xd7610000, - 0x00011a79, 0xd7610000, - 0x00011c7e, 0xd7610000, - 0x00011e7f, 0xbefe00ff, - 0x00003fff, 0xbeff0080, - 0xdca6c040, 0x007a0000, - 0xd760007a, 0x00011d00, - 0xd760007b, 0x00011f00, + 0xbf0d9878, 0xbfa10001, + 0xbfb00000, 0x8b6dff6d, + 0x0000ffff, 0xbefa0080, + 0xb97a0283, 0xbeee007e, + 0xbeef007f, 0xbefe0180, + 0xbefe4d84, 0xbf890000, + 0x8b7aff7f, 0x04000000, + 0x847a857a, 0x8c6d7a6d, + 0xbefa007e, 0x8b7bff7f, + 0x0000ffff, 0xbefe00c1, + 0xbeff00c1, 0xdca6c000, + 0x007a0000, 0x7e000280, 0xbefe007a, 0xbeff007b, - 0xbef4007e, 0x8b75ff7f, - 0x0000ffff, 0x8c75ff75, - 0x00040000, 0xbef60080, - 0xbef700ff, 0x10807fac, - 0xbef1007d, 0xbef00080, - 0xb8f302dc, 0x84739973, - 0xbefe00c1, 0x857d9973, - 0x8b7d817d, 0xbf06817d, - 0xbfa20002, 0xbeff0080, - 0xbfa00002, 0xbeff00c1, - 0xbfa00009, 0xbef600ff, - 0x01000000, 0xe0685080, - 0x701d0100, 0xe0685100, - 0x701d0200, 0xe0685180, - 0x701d0300, 0xbfa00008, + 0xb8fb02dc, 0x847b997b, + 0xb8fa3b05, 0x807a817a, + 0xbf0d997b, 0xbfa20002, + 0x847a897a, 0xbfa00001, + 0x847a8a7a, 0xb8fb1e06, + 0x847b8a7b, 0x807a7b7a, + 0x8b7bff7f, 0x0000ffff, + 0x807aff7a, 0x00000200, + 0x807a7e7a, 0x827b807b, + 0xd7610000, 0x00010870, + 0xd7610000, 0x00010a71, + 0xd7610000, 0x00010c72, + 0xd7610000, 0x00010e73, + 0xd7610000, 0x00011074, + 0xd7610000, 0x00011275, + 0xd7610000, 0x00011476, + 0xd7610000, 0x00011677, + 0xd7610000, 0x00011a79, + 0xd7610000, 0x00011c7e, + 0xd7610000, 0x00011e7f, + 0xbefe00ff, 0x00003fff, + 0xbeff0080, 0xdca6c040, + 0x007a0000, 0xd760007a, + 0x00011d00, 0xd760007b, + 0x00011f00, 0xbefe007a, + 0xbeff007b, 0xbef4007e, + 0x8b75ff7f, 0x0000ffff, + 0x8c75ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x10807fac, 0xbef1007d, + 0xbef00080, 0xb8f302dc, + 0x84739973, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00002, + 0xbeff00c1, 0xbfa00009, 0xbef600ff, 0x01000000, - 0xe0685100, 0x701d0100, - 0xe0685200, 0x701d0200, - 0xe0685300, 0x701d0300, + 0xe0685080, 0x701d0100, + 0xe0685100, 0x701d0200, + 0xe0685180, 0x701d0300, + 0xbfa00008, 0xbef600ff, + 0x01000000, 0xe0685100, + 0x701d0100, 0xe0685200, + 0x701d0200, 0xe0685300, + 0x701d0300, 0xb8f03b05, + 0x80708170, 0xbf0d9973, + 0xbfa20002, 0x84708970, + 0xbfa00001, 0x84708a70, + 0xb8fa1e06, 0x847a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0xbef600ff, + 0x01000000, 0x7e000280, + 0x7e020280, 0x7e040280, + 0xbefd0080, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xd7610002, 0x0000fa6c, + 0x807d817d, 0x917aff6d, + 0x80000000, 0xd7610002, + 0x0000fa7a, 0x807d817d, + 0xd7610002, 0x0000fa6e, + 0x807d817d, 0xd7610002, + 0x0000fa6f, 0x807d817d, + 0xd7610002, 0x0000fa78, + 0x807d817d, 0xb8faf803, + 0xd7610002, 0x0000fa7a, + 0x807d817d, 0xd7610002, + 0x0000fa7b, 0x807d817d, + 0xb8f1f801, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8f1f814, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8f1f815, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xbefe00ff, 0x0000ffff, + 0xbeff0080, 0xe0685000, + 0x701d0200, 0xbefe00c1, 0xb8f03b05, 0x80708170, 0xbf0d9973, 0xbfa20002, 0x84708970, 0xbfa00001, 0x84708a70, 0xb8fa1e06, 0x847a8a7a, 0x80707a70, - 0x8070ff70, 0x00000200, 0xbef600ff, 0x01000000, - 0x7e000280, 0x7e020280, - 0x7e040280, 0xbefd0080, - 0xd7610002, 0x0000fa71, - 0x807d817d, 0xd7610002, - 0x0000fa6c, 0x807d817d, - 0x917aff6d, 0x80000000, - 0xd7610002, 0x0000fa7a, - 0x807d817d, 0xd7610002, - 0x0000fa6e, 0x807d817d, - 0xd7610002, 0x0000fa6f, - 0x807d817d, 0xd7610002, - 0x0000fa78, 0x807d817d, - 0xb8faf803, 0xd7610002, - 0x0000fa7a, 0x807d817d, - 0xd7610002, 0x0000fa7b, - 0x807d817d, 0xb8f1f801, - 0xd7610002, 0x0000fa71, - 0x807d817d, 0xb8f1f814, - 0xd7610002, 0x0000fa71, - 0x807d817d, 0xb8f1f815, - 0xd7610002, 0x0000fa71, - 0x807d817d, 0xbefe00ff, - 0x0000ffff, 0xbeff0080, - 0xe0685000, 0x701d0200, - 0xbefe00c1, 0xb8f03b05, - 0x80708170, 0xbf0d9973, - 0xbfa20002, 0x84708970, - 0xbfa00001, 0x84708a70, - 0xb8fa1e06, 0x847a8a7a, - 0x80707a70, 0xbef600ff, - 0x01000000, 0xbef90080, - 0xbefd0080, 0xbf800000, - 0xbe804100, 0xbe824102, - 0xbe844104, 0xbe864106, - 0xbe884108, 0xbe8a410a, - 0xbe8c410c, 0xbe8e410e, - 0xd7610002, 0x0000f200, + 0xbef90080, 0xbefd0080, + 0xbf800000, 0xbe804100, + 0xbe824102, 0xbe844104, + 0xbe864106, 0xbe884108, + 0xbe8a410a, 0xbe8c410c, + 0xbe8e410e, 0xd7610002, + 0x0000f200, 0x80798179, + 0xd7610002, 0x0000f201, 0x80798179, 0xd7610002, - 0x0000f201, 0x80798179, - 0xd7610002, 0x0000f202, + 0x0000f202, 0x80798179, + 0xd7610002, 0x0000f203, 0x80798179, 0xd7610002, - 0x0000f203, 0x80798179, - 0xd7610002, 0x0000f204, + 0x0000f204, 0x80798179, + 0xd7610002, 0x0000f205, 0x80798179, 0xd7610002, - 0x0000f205, 0x80798179, - 0xd7610002, 0x0000f206, + 0x0000f206, 0x80798179, + 0xd7610002, 0x0000f207, 0x80798179, 0xd7610002, - 0x0000f207, 0x80798179, - 0xd7610002, 0x0000f208, + 0x0000f208, 0x80798179, + 0xd7610002, 0x0000f209, 0x80798179, 0xd7610002, - 0x0000f209, 0x80798179, - 0xd7610002, 0x0000f20a, - 0x80798179, 0xd7610002, - 0x0000f20b, 0x80798179, - 0xd7610002, 0x0000f20c, - 0x80798179, 0xd7610002, - 0x0000f20d, 0x80798179, - 0xd7610002, 0x0000f20e, + 0x0000f20a, 0x80798179, + 0xd7610002, 0x0000f20b, 0x80798179, 0xd7610002, - 0x0000f20f, 0x80798179, - 0xbf06a079, 0xbfa10006, - 0xe0685000, 0x701d0200, - 0x8070ff70, 0x00000080, - 0xbef90080, 0x7e040280, - 0x807d907d, 0xbf0aff7d, - 0x00000060, 0xbfa2ffbc, - 0xbe804100, 0xbe824102, - 0xbe844104, 0xbe864106, - 0xbe884108, 0xbe8a410a, - 0xd7610002, 0x0000f200, + 0x0000f20c, 0x80798179, + 0xd7610002, 0x0000f20d, 0x80798179, 0xd7610002, - 0x0000f201, 0x80798179, - 0xd7610002, 0x0000f202, + 0x0000f20e, 0x80798179, + 0xd7610002, 0x0000f20f, + 0x80798179, 0xbf06a079, + 0xbfa10006, 0xe0685000, + 0x701d0200, 0x8070ff70, + 0x00000080, 0xbef90080, + 0x7e040280, 0x807d907d, + 0xbf0aff7d, 0x00000060, + 0xbfa2ffbc, 0xbe804100, + 0xbe824102, 0xbe844104, + 0xbe864106, 0xbe884108, + 0xbe8a410a, 0xd7610002, + 0x0000f200, 0x80798179, + 0xd7610002, 0x0000f201, 0x80798179, 0xd7610002, - 0x0000f203, 0x80798179, - 0xd7610002, 0x0000f204, + 0x0000f202, 0x80798179, + 0xd7610002, 0x0000f203, 0x80798179, 0xd7610002, - 0x0000f205, 0x80798179, - 0xd7610002, 0x0000f206, + 0x0000f204, 0x80798179, + 0xd7610002, 0x0000f205, 0x80798179, 0xd7610002, - 0x0000f207, 0x80798179, - 0xd7610002, 0x0000f208, + 0x0000f206, 0x80798179, + 0xd7610002, 0x0000f207, 0x80798179, 0xd7610002, - 0x0000f209, 0x80798179, - 0xd7610002, 0x0000f20a, + 0x0000f208, 0x80798179, + 0xd7610002, 0x0000f209, 0x80798179, 0xd7610002, - 0x0000f20b, 0x80798179, - 0xe0685000, 0x701d0200, - 0xbefe00c1, 0x857d9973, - 0x8b7d817d, 0xbf06817d, - 0xbfa20002, 0xbeff0080, - 0xbfa00001, 0xbeff00c1, - 0xb8fb4306, 0x8b7bc17b, - 0xbfa10044, 0xbfbd0000, - 0x8b7aff6d, 0x80000000, - 0xbfa10040, 0x847b867b, - 0x847b827b, 0xbef6007b, + 0x0000f20a, 0x80798179, + 0xd7610002, 0x0000f20b, + 0x80798179, 0xe0685000, + 0x701d0200, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00001, + 0xbeff00c1, 0xb8fb4306, + 0x8b7bc17b, 0xbfa10043, + 0xbfbd0000, 0x8b7aff6d, + 0x80000000, 0xbfa1003f, + 0x847b887b, 0xbef6007b, 0xb8f03b05, 0x80708170, 0xbf0d9973, 0xbfa20002, 0x84708970, 0xbfa00001, @@ -2988,173 +2983,171 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0x701d0000, 0x807d817d, 0x8070ff70, 0x00000080, 0xbf0a7b7d, 0xbfa2fff8, - 0xbfa00146, 0xbef4007e, + 0xbfa00143, 0xbef4007e, 0x8b75ff7f, 0x0000ffff, 0x8c75ff75, 0x00040000, 0xbef60080, 0xbef700ff, 0x10807fac, 0xb8f202dc, 0x84729972, 0x8b6eff7f, - 0x04000000, 0xbfa1003a, + 0x04000000, 0xbfa10039, 0xbefe00c1, 0x857d9972, 0x8b7d817d, 0xbf06817d, 0xbfa20002, 0xbeff0080, 0xbfa00001, 0xbeff00c1, 0xb8ef4306, 0x8b6fc16f, - 0xbfa1002f, 0x846f866f, - 0x846f826f, 0xbef6006f, - 0xb8f83b05, 0x80788178, - 0xbf0d9972, 0xbfa20002, - 0x84788978, 0xbfa00001, - 0x84788a78, 0xb8ee1e06, - 0x846e8a6e, 0x80786e78, + 0xbfa1002e, 0x846f886f, + 0xbef6006f, 0xb8f83b05, + 0x80788178, 0xbf0d9972, + 0xbfa20002, 0x84788978, + 0xbfa00001, 0x84788a78, + 0xb8ee1e06, 0x846e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbefd0080, 0xbfa2000c, + 0xe0500000, 0x781d0000, + 0xbf890000, 0xdac00000, + 0x00000000, 0x807dff7d, + 0x00000080, 0x8078ff78, + 0x00000080, 0xbf0a6f7d, + 0xbfa2fff5, 0xbfa0000b, + 0xe0500000, 0x781d0000, + 0xbf890000, 0xdac00000, + 0x00000000, 0x807dff7d, + 0x00000100, 0x8078ff78, + 0x00000100, 0xbf0a6f7d, + 0xbfa2fff5, 0xbef80080, + 0xbefe00c1, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8ef3b05, 0x806f816f, + 0x846f826f, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20024, 0xbef600ff, + 0x01000000, 0xbeee0078, 0x8078ff78, 0x00000200, - 0x8078ff78, 0x00000080, - 0xbef600ff, 0x01000000, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbefd0080, - 0xbfa2000c, 0xe0500000, - 0x781d0000, 0xbf8903f7, - 0xdac00000, 0x00000000, - 0x807dff7d, 0x00000080, - 0x8078ff78, 0x00000080, - 0xbf0a6f7d, 0xbfa2fff5, - 0xbfa0000b, 0xe0500000, - 0x781d0000, 0xbf8903f7, - 0xdac00000, 0x00000000, - 0x807dff7d, 0x00000100, - 0x8078ff78, 0x00000100, - 0xbf0a6f7d, 0xbfa2fff5, - 0xbef80080, 0xbefe00c1, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbfa20002, - 0xbeff0080, 0xbfa00001, - 0xbeff00c1, 0xb8ef3b05, - 0x806f816f, 0x846f826f, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbfa20024, - 0xbef600ff, 0x01000000, - 0xbeee0078, 0x8078ff78, - 0x00000200, 0xbefd0084, - 0xbf0a6f7d, 0xbfa10050, + 0xbefd0084, 0xbf0a6f7d, + 0xbfa10050, 0xe0505000, + 0x781d0000, 0xe0505080, + 0x781d0100, 0xe0505100, + 0x781d0200, 0xe0505180, + 0x781d0300, 0xbf890000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000200, 0xbf0a6f7d, + 0xbfa2ffee, 0xe0505000, + 0x6e1d0000, 0xe0505080, + 0x6e1d0100, 0xe0505100, + 0x6e1d0200, 0xe0505180, + 0x6e1d0300, 0xbf890000, + 0xbfa00034, 0xbef600ff, + 0x01000000, 0xbeee0078, + 0x8078ff78, 0x00000400, + 0xbefd0084, 0xbf0a6f7d, + 0xbfa10012, 0xe0505000, + 0x781d0000, 0xe0505100, + 0x781d0100, 0xe0505200, + 0x781d0200, 0xe0505300, + 0x781d0300, 0xbf890000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000400, 0xbf0a6f7d, + 0xbfa2ffee, 0xb8ef1e06, + 0x8b6fc16f, 0xbfa1000e, + 0x846f836f, 0x806f7d6f, + 0xbefe00c1, 0xbeff0080, 0xe0505000, 0x781d0000, - 0xe0505080, 0x781d0100, - 0xe0505100, 0x781d0200, - 0xe0505180, 0x781d0300, - 0xbf8903f7, 0x7e008500, - 0x7e028501, 0x7e048502, - 0x7e068503, 0x807d847d, - 0x8078ff78, 0x00000200, - 0xbf0a6f7d, 0xbfa2ffee, + 0xbf890000, 0x7e008500, + 0x807d817d, 0x8078ff78, + 0x00000080, 0xbf0a6f7d, + 0xbfa2fff7, 0xbeff00c1, 0xe0505000, 0x6e1d0000, - 0xe0505080, 0x6e1d0100, - 0xe0505100, 0x6e1d0200, - 0xe0505180, 0x6e1d0300, - 0xbf8903f7, 0xbfa00034, - 0xbef600ff, 0x01000000, - 0xbeee0078, 0x8078ff78, - 0x00000400, 0xbefd0084, - 0xbf0a6f7d, 0xbfa10012, - 0xe0505000, 0x781d0000, - 0xe0505100, 0x781d0100, - 0xe0505200, 0x781d0200, - 0xe0505300, 0x781d0300, - 0xbf8903f7, 0x7e008500, - 0x7e028501, 0x7e048502, - 0x7e068503, 0x807d847d, - 0x8078ff78, 0x00000400, - 0xbf0a6f7d, 0xbfa2ffee, - 0xb8ef1e06, 0x8b6fc16f, - 0xbfa1000e, 0x846f836f, - 0x806f7d6f, 0xbefe00c1, - 0xbeff0080, 0xe0505000, - 0x781d0000, 0xbf8903f7, - 0x7e008500, 0x807d817d, - 0x8078ff78, 0x00000080, - 0xbf0a6f7d, 0xbfa2fff7, - 0xbeff00c1, 0xe0505000, - 0x6e1d0000, 0xe0505100, - 0x6e1d0100, 0xe0505200, - 0x6e1d0200, 0xe0505300, - 0x6e1d0300, 0xbf8903f7, + 0xe0505100, 0x6e1d0100, + 0xe0505200, 0x6e1d0200, + 0xe0505300, 0x6e1d0300, + 0xbf890000, 0xb8f83b05, + 0x80788178, 0xbf0d9972, + 0xbfa20002, 0x84788978, + 0xbfa00001, 0x84788a78, + 0xb8ee1e06, 0x846e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x80f8ff78, + 0x00000050, 0xbef600ff, + 0x01000000, 0xbefd00ff, + 0x0000006c, 0x80f89078, + 0xf428403a, 0xf0000000, + 0xbf890000, 0x80fd847d, + 0xbf800000, 0xbe804300, + 0xbe824302, 0x80f8a078, + 0xf42c403a, 0xf0000000, + 0xbf890000, 0x80fd887d, + 0xbf800000, 0xbe804300, + 0xbe824302, 0xbe844304, + 0xbe864306, 0x80f8c078, + 0xf430403a, 0xf0000000, + 0xbf890000, 0x80fd907d, + 0xbf800000, 0xbe804300, + 0xbe824302, 0xbe844304, + 0xbe864306, 0xbe884308, + 0xbe8a430a, 0xbe8c430c, + 0xbe8e430e, 0xbf06807d, + 0xbfa1fff0, 0xb980f801, + 0x00000000, 0xbfbd0000, 0xb8f83b05, 0x80788178, 0xbf0d9972, 0xbfa20002, 0x84788978, 0xbfa00001, 0x84788a78, 0xb8ee1e06, 0x846e8a6e, 0x80786e78, 0x8078ff78, 0x00000200, - 0x80f8ff78, 0x00000050, 0xbef600ff, 0x01000000, - 0xbefd00ff, 0x0000006c, - 0x80f89078, 0xf428403a, - 0xf0000000, 0xbf89fc07, - 0x80fd847d, 0xbf800000, - 0xbe804300, 0xbe824302, - 0x80f8a078, 0xf42c403a, - 0xf0000000, 0xbf89fc07, - 0x80fd887d, 0xbf800000, - 0xbe804300, 0xbe824302, - 0xbe844304, 0xbe864306, - 0x80f8c078, 0xf430403a, - 0xf0000000, 0xbf89fc07, - 0x80fd907d, 0xbf800000, - 0xbe804300, 0xbe824302, - 0xbe844304, 0xbe864306, - 0xbe884308, 0xbe8a430a, - 0xbe8c430c, 0xbe8e430e, - 0xbf06807d, 0xbfa1fff0, - 0xb980f801, 0x00000000, - 0xbfbd0000, 0xb8f83b05, - 0x80788178, 0xbf0d9972, - 0xbfa20002, 0x84788978, - 0xbfa00001, 0x84788a78, - 0xb8ee1e06, 0x846e8a6e, - 0x80786e78, 0x8078ff78, - 0x00000200, 0xbef600ff, - 0x01000000, 0xf4205bfa, + 0xf4205bfa, 0xf0000000, + 0x80788478, 0xf4205b3a, 0xf0000000, 0x80788478, - 0xf4205b3a, 0xf0000000, - 0x80788478, 0xf4205b7a, + 0xf4205b7a, 0xf0000000, + 0x80788478, 0xf4205c3a, 0xf0000000, 0x80788478, - 0xf4205c3a, 0xf0000000, - 0x80788478, 0xf4205c7a, + 0xf4205c7a, 0xf0000000, + 0x80788478, 0xf4205eba, 0xf0000000, 0x80788478, - 0xf4205eba, 0xf0000000, - 0x80788478, 0xf4205efa, + 0xf4205efa, 0xf0000000, + 0x80788478, 0xf4205e7a, 0xf0000000, 0x80788478, - 0xf4205e7a, 0xf0000000, - 0x80788478, 0xf4205cfa, + 0xf4205cfa, 0xf0000000, + 0x80788478, 0xf4205bba, 0xf0000000, 0x80788478, + 0xbf890000, 0xb96ef814, 0xf4205bba, 0xf0000000, - 0x80788478, 0xbf89fc07, - 0xb96ef814, 0xf4205bba, - 0xf0000000, 0x80788478, - 0xbf89fc07, 0xb96ef815, - 0xbefd006f, 0xbefe0070, - 0xbeff0071, 0x8b6f7bff, - 0x000003ff, 0xb96f4803, - 0x8b6f7bff, 0xfffff800, - 0x856f8b6f, 0xb96fa2c3, - 0xb973f801, 0xb8ee3b05, - 0x806e816e, 0xbf0d9972, - 0xbfa20002, 0x846e896e, - 0xbfa00001, 0x846e8a6e, - 0xb8ef1e06, 0x846f8a6f, - 0x806e6f6e, 0x806eff6e, - 0x00000200, 0x806e746e, - 0x826f8075, 0x8b6fff6f, - 0x0000ffff, 0xf4085c37, - 0xf8000050, 0xf4085d37, - 0xf8000060, 0xf4005e77, - 0xf8000074, 0xbf89fc07, - 0x8b6dff6d, 0x0000ffff, - 0x8bfe7e7e, 0x8bea6a6a, - 0xb8eef802, 0xbf0d866e, - 0xbfa20002, 0xb97af802, - 0xbe80486c, 0xb97af802, - 0xbe804a6c, 0xbfb10000, + 0x80788478, 0xbf890000, + 0xb96ef815, 0xbefd006f, + 0xbefe0070, 0xbeff0071, + 0xb97b4803, 0x857b8b7b, + 0xb97b22c3, 0x857b867b, + 0xb97b7443, 0xb973f801, + 0xb8ee3b05, 0x806e816e, + 0xbf0d9972, 0xbfa20002, + 0x846e896e, 0xbfa00001, + 0x846e8a6e, 0xb8ef1e06, + 0x846f8a6f, 0x806e6f6e, + 0x806eff6e, 0x00000200, + 0x806e746e, 0x826f8075, + 0x8b6fff6f, 0x0000ffff, + 0xf4085c37, 0xf8000050, + 0xf4085d37, 0xf8000060, + 0xf4005e77, 0xf8000074, + 0xbf890000, 0x8b6dff6d, + 0x0000ffff, 0x8bfe7e7e, + 0x8bea6a6a, 0xb8eef802, + 0xbf0d866e, 0xbfa20002, + 0xb97af802, 0xbe80486c, + 0xb97af802, 0xbe804a6c, + 0xbfb10000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, - 0xbf9f0000, 0x00000000, }; static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { @@ -3645,3 +3638,487 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { 0xb97a0002, 0xbf8a0000, 0xbe801f6c, 0xbf9b0000, }; + +static const uint32_t cwsr_trap_gfx12_hex[] = { + 0xbfa00001, 0xbfa0024b, + 0xb0804009, 0xb8f8f804, + 0x9178ff78, 0x00008c00, + 0xb8fbf811, 0x8b6eff78, + 0x00004000, 0xbfa10008, + 0x8b6eff7b, 0x00000080, + 0xbfa20018, 0x8b6ea07b, + 0xbfa20042, 0xbf830010, + 0xb8fbf811, 0xbfa0fffb, + 0x8b6eff7b, 0x00000bd0, + 0xbfa20010, 0xb8eef812, + 0x8b6f8f7b, 0xbfa10002, + 0x8c6eff6e, 0x00000080, + 0xb8eff813, 0x8b6e6e6f, + 0xbfa20008, 0x8b6eff6d, + 0xf0000000, 0xbfa20005, + 0x8b6fff6f, 0x00000200, + 0xbfa20002, 0x8b6ea07b, + 0xbfa2002c, 0xbefa4d82, + 0xbf8a0000, 0x84fa887a, + 0xbf0d8f7b, 0xbfa10002, + 0x8c7bff7b, 0xffff0000, + 0xf4601bbd, 0xf8000010, + 0xbf8a0000, 0x846e976e, + 0x9177ff77, 0x00800000, + 0x8c776e77, 0xf4603bbd, + 0xf8000000, 0xbf8a0000, + 0xf4603ebd, 0xf8000008, + 0xbf8a0000, 0x8bee6e6e, + 0xbfa10001, 0xbe80486e, + 0x8b6eff6d, 0xf0000000, + 0xbfa20009, 0xb8eef811, + 0x8b6eff6e, 0x00000080, + 0xbfa20007, 0x8c78ff78, + 0x00004000, 0x80ec886c, + 0x82ed806d, 0xbfa00002, + 0x806c846c, 0x826d806d, + 0x8b6dff6d, 0x0000ffff, + 0x8bfe7e7e, 0x8bea6a6a, + 0x85788978, 0xb9783244, + 0xbe804a6c, 0xb8faf802, + 0xbf0d987a, 0xbfa10001, + 0xbfb00000, 0x8b6dff6d, + 0x0000ffff, 0xbefa0080, + 0xb97a0151, 0xbeee007e, + 0xbeef007f, 0xbefe0180, + 0xbefe4d84, 0xbf8a0000, + 0x8b7aff7f, 0x04000000, + 0x847a857a, 0x8c6d7a6d, + 0xbefa007e, 0x8b7bff7f, + 0x0000ffff, 0xbefe00c1, + 0xbeff00c1, 0xee0a407a, + 0x000c0000, 0x00000000, + 0x7e000280, 0xbefe007a, + 0xbeff007b, 0xb8fb0742, + 0x847b997b, 0xb8fa3b05, + 0x807a817a, 0xbf0d997b, + 0xbfa20002, 0x847a897a, + 0xbfa00001, 0x847a8a7a, + 0xb8fb1e06, 0x847b8a7b, + 0x807a7b7a, 0x8b7bff7f, + 0x0000ffff, 0x807aff7a, + 0x00000200, 0x807a7e7a, + 0x827b807b, 0xd7610000, + 0x00010870, 0xd7610000, + 0x00010a71, 0xd7610000, + 0x00010c72, 0xd7610000, + 0x00010e73, 0xd7610000, + 0x00011074, 0xd7610000, + 0x00011275, 0xd7610000, + 0x00011476, 0xd7610000, + 0x00011677, 0xd7610000, + 0x00011a79, 0xd7610000, + 0x00011c7e, 0xd7610000, + 0x00011e7f, 0xbefe00ff, + 0x00003fff, 0xbeff0080, + 0xee0a407a, 0x000c0000, + 0x00004000, 0xd760007a, + 0x00011d00, 0xd760007b, + 0x00011f00, 0xbefe007a, + 0xbeff007b, 0xbef4007e, + 0x8b75ff7f, 0x0000ffff, + 0x8c75ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x10807fac, 0xbef1007d, + 0xbef00080, 0xb8f30742, + 0x84739973, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00002, + 0xbeff00c1, 0xbfa0000c, + 0xbef600ff, 0x01000000, + 0xc4068070, 0x008ce801, + 0x00008000, 0xc4068070, + 0x008ce802, 0x00010000, + 0xc4068070, 0x008ce803, + 0x00018000, 0xbfa0000b, + 0xbef600ff, 0x01000000, + 0xc4068070, 0x008ce801, + 0x00010000, 0xc4068070, + 0x008ce802, 0x00020000, + 0xc4068070, 0x008ce803, + 0x00030000, 0xb8f03b05, + 0x80708170, 0xbf0d9973, + 0xbfa20002, 0x84708970, + 0xbfa00001, 0x84708a70, + 0xb8fa1e06, 0x847a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0xbef600ff, + 0x01000000, 0x7e000280, + 0x7e020280, 0x7e040280, + 0xbefd0080, 0xbe804ec2, + 0xbf94fffe, 0xb8faf804, + 0x8b7a847a, 0x91788478, + 0x8c787a78, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xd7610002, 0x0000fa6c, + 0x807d817d, 0x917aff6d, + 0x80000000, 0xd7610002, + 0x0000fa7a, 0x807d817d, + 0xd7610002, 0x0000fa6e, + 0x807d817d, 0xd7610002, + 0x0000fa6f, 0x807d817d, + 0xd7610002, 0x0000fa78, + 0x807d817d, 0xb8faf811, + 0xd7610002, 0x0000fa7a, + 0x807d817d, 0xd7610002, + 0x0000fa7b, 0x807d817d, + 0xb8f1f801, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8f1f814, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8f1f815, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8f1f812, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8f1f813, 0xd7610002, + 0x0000fa71, 0x807d817d, + 0xb8faf802, 0xd7610002, + 0x0000fa7a, 0x807d817d, + 0xbefa50c1, 0xbfc70000, + 0xd7610002, 0x0000fa7a, + 0x807d817d, 0xbefe00ff, + 0x0000ffff, 0xbeff0080, + 0xc4068070, 0x008ce802, + 0x00000000, 0xbefe00c1, + 0xb8f03b05, 0x80708170, + 0xbf0d9973, 0xbfa20002, + 0x84708970, 0xbfa00001, + 0x84708a70, 0xb8fa1e06, + 0x847a8a7a, 0x80707a70, + 0xbef600ff, 0x01000000, + 0xbef90080, 0xbefd0080, + 0xbf800000, 0xbe804100, + 0xbe824102, 0xbe844104, + 0xbe864106, 0xbe884108, + 0xbe8a410a, 0xbe8c410c, + 0xbe8e410e, 0xd7610002, + 0x0000f200, 0x80798179, + 0xd7610002, 0x0000f201, + 0x80798179, 0xd7610002, + 0x0000f202, 0x80798179, + 0xd7610002, 0x0000f203, + 0x80798179, 0xd7610002, + 0x0000f204, 0x80798179, + 0xd7610002, 0x0000f205, + 0x80798179, 0xd7610002, + 0x0000f206, 0x80798179, + 0xd7610002, 0x0000f207, + 0x80798179, 0xd7610002, + 0x0000f208, 0x80798179, + 0xd7610002, 0x0000f209, + 0x80798179, 0xd7610002, + 0x0000f20a, 0x80798179, + 0xd7610002, 0x0000f20b, + 0x80798179, 0xd7610002, + 0x0000f20c, 0x80798179, + 0xd7610002, 0x0000f20d, + 0x80798179, 0xd7610002, + 0x0000f20e, 0x80798179, + 0xd7610002, 0x0000f20f, + 0x80798179, 0xbf06a079, + 0xbfa10007, 0xc4068070, + 0x008ce802, 0x00000000, + 0x8070ff70, 0x00000080, + 0xbef90080, 0x7e040280, + 0x807d907d, 0xbf0aff7d, + 0x00000060, 0xbfa2ffbb, + 0xbe804100, 0xbe824102, + 0xbe844104, 0xbe864106, + 0xbe884108, 0xbe8a410a, + 0xd7610002, 0x0000f200, + 0x80798179, 0xd7610002, + 0x0000f201, 0x80798179, + 0xd7610002, 0x0000f202, + 0x80798179, 0xd7610002, + 0x0000f203, 0x80798179, + 0xd7610002, 0x0000f204, + 0x80798179, 0xd7610002, + 0x0000f205, 0x80798179, + 0xd7610002, 0x0000f206, + 0x80798179, 0xd7610002, + 0x0000f207, 0x80798179, + 0xd7610002, 0x0000f208, + 0x80798179, 0xd7610002, + 0x0000f209, 0x80798179, + 0xd7610002, 0x0000f20a, + 0x80798179, 0xd7610002, + 0x0000f20b, 0x80798179, + 0xc4068070, 0x008ce802, + 0x00000000, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00001, + 0xbeff00c1, 0xb8fb4306, + 0x8b7bc17b, 0xbfa10044, + 0x8b7aff6d, 0x80000000, + 0xbfa10041, 0x847b897b, + 0xbef6007b, 0xb8f03b05, + 0x80708170, 0xbf0d9973, + 0xbfa20002, 0x84708970, + 0xbfa00001, 0x84708a70, + 0xb8fa1e06, 0x847a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0x8070ff70, + 0x00000080, 0xbef600ff, + 0x01000000, 0xd71f0000, + 0x000100c1, 0xd7200000, + 0x000200c1, 0x16000084, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbefd0080, + 0xbfa20013, 0xbe8300ff, + 0x00000080, 0xbf800000, + 0xbf800000, 0xbf800000, + 0xd8d80000, 0x01000000, + 0xbf8a0000, 0xc4068070, + 0x008ce801, 0x00000000, + 0x807d037d, 0x80700370, + 0xd5250000, 0x0001ff00, + 0x00000080, 0xbf0a7b7d, + 0xbfa2fff3, 0xbfa00012, + 0xbe8300ff, 0x00000100, + 0xbf800000, 0xbf800000, + 0xbf800000, 0xd8d80000, + 0x01000000, 0xbf8a0000, + 0xc4068070, 0x008ce801, + 0x00000000, 0x807d037d, + 0x80700370, 0xd5250000, + 0x0001ff00, 0x00000100, + 0xbf0a7b7d, 0xbfa2fff3, + 0xbefe00c1, 0x857d9973, + 0x8b7d817d, 0xbf06817d, + 0xbfa20004, 0xbef000ff, + 0x00000200, 0xbeff0080, + 0xbfa00003, 0xbef000ff, + 0x00000400, 0xbeff00c1, + 0xb8fb3b05, 0x807b817b, + 0x847b827b, 0x857d9973, + 0x8b7d817d, 0xbf06817d, + 0xbfa2001b, 0xbef600ff, + 0x01000000, 0xbefd0084, + 0xbf0a7b7d, 0xbfa10040, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xc4068070, 0x008ce800, + 0x00000000, 0xc4068070, + 0x008ce801, 0x00008000, + 0xc4068070, 0x008ce802, + 0x00010000, 0xc4068070, + 0x008ce803, 0x00018000, + 0x807d847d, 0x8070ff70, + 0x00000200, 0xbf0a7b7d, + 0xbfa2ffeb, 0xbfa0002a, + 0xbef600ff, 0x01000000, + 0xbefd0084, 0xbf0a7b7d, + 0xbfa10015, 0x7e008700, + 0x7e028701, 0x7e048702, + 0x7e068703, 0xc4068070, + 0x008ce800, 0x00000000, + 0xc4068070, 0x008ce801, + 0x00010000, 0xc4068070, + 0x008ce802, 0x00020000, + 0xc4068070, 0x008ce803, + 0x00030000, 0x807d847d, + 0x8070ff70, 0x00000400, + 0xbf0a7b7d, 0xbfa2ffeb, + 0xb8fb1e06, 0x8b7bc17b, + 0xbfa1000d, 0x847b837b, + 0x807b7d7b, 0xbefe00c1, + 0xbeff0080, 0x7e008700, + 0xc4068070, 0x008ce800, + 0x00000000, 0x807d817d, + 0x8070ff70, 0x00000080, + 0xbf0a7b7d, 0xbfa2fff7, + 0xbfa0016e, 0xbef4007e, + 0x8b75ff7f, 0x0000ffff, + 0x8c75ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x10807fac, 0xbef1007f, + 0xb8f20742, 0x84729972, + 0x8b6eff7f, 0x04000000, + 0xbfa1003b, 0xbefe00c1, + 0x857d9972, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00001, + 0xbeff00c1, 0xb8ef4306, + 0x8b6fc16f, 0xbfa10030, + 0x846f896f, 0xbef6006f, + 0xb8f83b05, 0x80788178, + 0xbf0d9972, 0xbfa20002, + 0x84788978, 0xbfa00001, + 0x84788a78, 0xb8ee1e06, + 0x846e8a6e, 0x80786e78, + 0x8078ff78, 0x00000200, + 0x8078ff78, 0x00000080, + 0xbef600ff, 0x01000000, + 0x857d9972, 0x8b7d817d, + 0xbf06817d, 0xbefd0080, + 0xbfa2000d, 0xc4050078, + 0x0080e800, 0x00000000, + 0xbf8a0000, 0xdac00000, + 0x00000000, 0x807dff7d, + 0x00000080, 0x8078ff78, + 0x00000080, 0xbf0a6f7d, + 0xbfa2fff4, 0xbfa0000c, + 0xc4050078, 0x0080e800, + 0x00000000, 0xbf8a0000, + 0xdac00000, 0x00000000, + 0x807dff7d, 0x00000100, + 0x8078ff78, 0x00000100, + 0xbf0a6f7d, 0xbfa2fff4, + 0xbef80080, 0xbefe00c1, + 0x857d9972, 0x8b7d817d, + 0xbf06817d, 0xbfa20002, + 0xbeff0080, 0xbfa00001, + 0xbeff00c1, 0xb8ef3b05, + 0x806f816f, 0x846f826f, + 0x857d9972, 0x8b7d817d, + 0xbf06817d, 0xbfa2002c, + 0xbef600ff, 0x01000000, + 0xbeee0078, 0x8078ff78, + 0x00000200, 0xbefd0084, + 0xbf0a6f7d, 0xbfa10061, + 0xc4050078, 0x008ce800, + 0x00000000, 0xc4050078, + 0x008ce801, 0x00008000, + 0xc4050078, 0x008ce802, + 0x00010000, 0xc4050078, + 0x008ce803, 0x00018000, + 0xbf8a0000, 0x7e008500, + 0x7e028501, 0x7e048502, + 0x7e068503, 0x807d847d, + 0x8078ff78, 0x00000200, + 0xbf0a6f7d, 0xbfa2ffea, + 0xc405006e, 0x008ce800, + 0x00000000, 0xc405006e, + 0x008ce801, 0x00008000, + 0xc405006e, 0x008ce802, + 0x00010000, 0xc405006e, + 0x008ce803, 0x00018000, + 0xbf8a0000, 0xbfa0003d, + 0xbef600ff, 0x01000000, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbefd0084, + 0xbf0a6f7d, 0xbfa10016, + 0xc4050078, 0x008ce800, + 0x00000000, 0xc4050078, + 0x008ce801, 0x00010000, + 0xc4050078, 0x008ce802, + 0x00020000, 0xc4050078, + 0x008ce803, 0x00030000, + 0xbf8a0000, 0x7e008500, + 0x7e028501, 0x7e048502, + 0x7e068503, 0x807d847d, + 0x8078ff78, 0x00000400, + 0xbf0a6f7d, 0xbfa2ffea, + 0xb8ef1e06, 0x8b6fc16f, + 0xbfa1000f, 0x846f836f, + 0x806f7d6f, 0xbefe00c1, + 0xbeff0080, 0xc4050078, + 0x008ce800, 0x00000000, + 0xbf8a0000, 0x7e008500, + 0x807d817d, 0x8078ff78, + 0x00000080, 0xbf0a6f7d, + 0xbfa2fff6, 0xbeff00c1, + 0xc405006e, 0x008ce800, + 0x00000000, 0xc405006e, + 0x008ce801, 0x00010000, + 0xc405006e, 0x008ce802, + 0x00020000, 0xc405006e, + 0x008ce803, 0x00030000, + 0xbf8a0000, 0xb8f83b05, + 0x80788178, 0xbf0d9972, + 0xbfa20002, 0x84788978, + 0xbfa00001, 0x84788a78, + 0xb8ee1e06, 0x846e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x80f8ff78, + 0x00000050, 0xbef600ff, + 0x01000000, 0xbefd00ff, + 0x0000006c, 0x80f89078, + 0xf462403a, 0xf0000000, + 0xbf8a0000, 0x80fd847d, + 0xbf800000, 0xbe804300, + 0xbe824302, 0x80f8a078, + 0xf462603a, 0xf0000000, + 0xbf8a0000, 0x80fd887d, + 0xbf800000, 0xbe804300, + 0xbe824302, 0xbe844304, + 0xbe864306, 0x80f8c078, + 0xf462803a, 0xf0000000, + 0xbf8a0000, 0x80fd907d, + 0xbf800000, 0xbe804300, + 0xbe824302, 0xbe844304, + 0xbe864306, 0xbe884308, + 0xbe8a430a, 0xbe8c430c, + 0xbe8e430e, 0xbf06807d, + 0xbfa1fff0, 0xb980f801, + 0x00000000, 0xb8f83b05, + 0x80788178, 0xbf0d9972, + 0xbfa20002, 0x84788978, + 0xbfa00001, 0x84788a78, + 0xb8ee1e06, 0x846e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0xbef600ff, + 0x01000000, 0xbeff0071, + 0xf4621bfa, 0xf0000000, + 0x80788478, 0xf4621b3a, + 0xf0000000, 0x80788478, + 0xf4621b7a, 0xf0000000, + 0x80788478, 0xf4621c3a, + 0xf0000000, 0x80788478, + 0xf4621c7a, 0xf0000000, + 0x80788478, 0xf4621eba, + 0xf0000000, 0x80788478, + 0xf4621efa, 0xf0000000, + 0x80788478, 0xf4621e7a, + 0xf0000000, 0x80788478, + 0xf4621cfa, 0xf0000000, + 0x80788478, 0xf4621bba, + 0xf0000000, 0x80788478, + 0xbf8a0000, 0xb96ef814, + 0xf4621bba, 0xf0000000, + 0x80788478, 0xbf8a0000, + 0xb96ef815, 0xf4621bba, + 0xf0000000, 0x80788478, + 0xbf8a0000, 0xb96ef812, + 0xf4621bba, 0xf0000000, + 0x80788478, 0xbf8a0000, + 0xb96ef813, 0x8b6eff7f, + 0x04000000, 0xbfa1000d, + 0x80788478, 0xf4621bba, + 0xf0000000, 0x80788478, + 0xbf8a0000, 0xbf0d806e, + 0xbfa10006, 0x856e906e, + 0x8b6e6e6e, 0xbfa10003, + 0xbe804ec1, 0x816ec16e, + 0xbfa0fffb, 0xbefd006f, + 0xbefe0070, 0xbeff0071, + 0xb97b2011, 0x857b867b, + 0xb97b0191, 0x857b827b, + 0xb97bba11, 0xb973f801, + 0xb8ee3b05, 0x806e816e, + 0xbf0d9972, 0xbfa20002, + 0x846e896e, 0xbfa00001, + 0x846e8a6e, 0xb8ef1e06, + 0x846f8a6f, 0x806e6f6e, + 0x806eff6e, 0x00000200, + 0x806e746e, 0x826f8075, + 0x8b6fff6f, 0x0000ffff, + 0xf4605c37, 0xf8000050, + 0xf4605d37, 0xf8000060, + 0xf4601e77, 0xf8000074, + 0xbf8a0000, 0x8b6dff6d, + 0x0000ffff, 0x8bfe7e7e, + 0x8bea6a6a, 0xb97af804, + 0xbe804ec2, 0xbf94fffe, + 0xbe804a6c, 0xbfb10000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0x00000000, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm index e1aaa5ce0784..44772eec9ef4 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -33,33 +33,76 @@ * gfx11: * cpp -DASIC_FAMILY=CHIP_PLUM_BONITO cwsr_trap_handler_gfx10.asm -P -o gfx11.sp3 * sp3 gfx11.sp3 -hex gfx11.hex + * + * gfx12: + * cpp -DASIC_FAMILY=CHIP_GFX12 cwsr_trap_handler_gfx10.asm -P -o gfx12.sp3 + * sp3 gfx12.sp3 -hex gfx12.hex */ #define CHIP_NAVI10 26 #define CHIP_SIENNA_CICHLID 30 #define CHIP_PLUM_BONITO 36 +#define CHIP_GFX12 37 #define NO_SQC_STORE (ASIC_FAMILY >= CHIP_SIENNA_CICHLID) #define HAVE_XNACK (ASIC_FAMILY < CHIP_SIENNA_CICHLID) #define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO) #define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO) -#define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO) +#define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO && ASIC_FAMILY < CHIP_GFX12) #define SAVE_AFTER_XNACK_ERROR (HAVE_XNACK && !NO_SQC_STORE) // workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger +#define SINGLE_STEP_MISSED_WORKAROUND 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised -var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised +#if ASIC_FAMILY < CHIP_GFX12 +#define S_COHERENCE glc:1 +#define V_COHERENCE slc:1 glc:1 +#define S_WAITCNT_0 s_waitcnt 0 +#else +#define S_COHERENCE scope:SCOPE_SYS +#define V_COHERENCE scope:SCOPE_SYS +#define S_WAITCNT_0 s_wait_idle + +#define HW_REG_SHADER_FLAT_SCRATCH_LO HW_REG_WAVE_SCRATCH_BASE_LO +#define HW_REG_SHADER_FLAT_SCRATCH_HI HW_REG_WAVE_SCRATCH_BASE_HI +#define HW_REG_GPR_ALLOC HW_REG_WAVE_GPR_ALLOC +#define HW_REG_LDS_ALLOC HW_REG_WAVE_LDS_ALLOC +#define HW_REG_MODE HW_REG_WAVE_MODE +#endif +#if ASIC_FAMILY < CHIP_GFX12 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 var SQ_WAVE_STATUS_HALT_MASK = 0x2000 var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x20000 var SQ_WAVE_STATUS_TRAP_EN_SHIFT = 6 +var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11 +var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1 +var SQ_WAVE_LDS_ALLOC_GRANULARITY = 8 +var S_STATUS_HWREG = HW_REG_STATUS +var S_STATUS_ALWAYS_CLEAR_MASK = SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK +var S_STATUS_HALT_MASK = SQ_WAVE_STATUS_HALT_MASK +var S_SAVE_PC_HI_TRAP_ID_MASK = 0x00FF0000 +var S_SAVE_PC_HI_HT_MASK = 0x01000000 +#else +var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK = 0x4 +var SQ_WAVE_STATE_PRIV_SCC_SHIFT = 9 +var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK = 0xC00 +var SQ_WAVE_STATE_PRIV_HALT_MASK = 0x4000 +var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK = 0x8000 +var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT = 15 +var SQ_WAVE_STATUS_WAVE64_SHIFT = 29 +var SQ_WAVE_STATUS_WAVE64_SIZE = 1 +var SQ_WAVE_LDS_ALLOC_GRANULARITY = 9 +var S_STATUS_HWREG = HW_REG_WAVE_STATE_PRIV +var S_STATUS_ALWAYS_CLEAR_MASK = SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK +var S_STATUS_HALT_MASK = SQ_WAVE_STATE_PRIV_HALT_MASK +var S_SAVE_PC_HI_TRAP_ID_MASK = 0xF0000000 +#endif +var SQ_WAVE_STATUS_NO_VGPRS_SHIFT = 24 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8 var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24 var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4 -var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11 -var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1 #if ASIC_FAMILY < CHIP_PLUM_BONITO var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 @@ -67,6 +110,7 @@ var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 12 #endif +#if ASIC_FAMILY < CHIP_GFX12 var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 var SQ_WAVE_TRAPSTS_EXCP_MASK = 0x1FF var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 @@ -74,16 +118,13 @@ var SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK = 0x80 var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT = 7 var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 -var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 -var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 +var SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT = 11 var SQ_WAVE_TRAPSTS_EXCP_HI_MASK = 0x7000 #if ASIC_FAMILY >= CHIP_PLUM_BONITO +var SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT = 16 var SQ_WAVE_TRAPSTS_WAVE_START_MASK = 0x20000 +var SQ_WAVE_TRAPSTS_WAVE_START_SHIFT = 17 var SQ_WAVE_TRAPSTS_WAVE_END_MASK = 0x40000 var SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK = 0x100000 #endif @@ -99,14 +140,59 @@ var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000 var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800 +var S_TRAPSTS_RESTORE_PART_1_SIZE = SQ_WAVE_TRAPSTS_SAVECTX_SHIFT +var S_TRAPSTS_RESTORE_PART_2_SHIFT = SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT + #if ASIC_FAMILY < CHIP_PLUM_BONITO var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK +var S_TRAPSTS_RESTORE_PART_2_SIZE = 32 - S_TRAPSTS_RESTORE_PART_2_SHIFT +var S_TRAPSTS_RESTORE_PART_3_SHIFT = 0 +var S_TRAPSTS_RESTORE_PART_3_SIZE = 0 #else var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK |\ SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK |\ SQ_WAVE_TRAPSTS_WAVE_START_MASK |\ SQ_WAVE_TRAPSTS_WAVE_END_MASK |\ SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK +var S_TRAPSTS_RESTORE_PART_2_SIZE = SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT - SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT +var S_TRAPSTS_RESTORE_PART_3_SHIFT = SQ_WAVE_TRAPSTS_WAVE_START_SHIFT +var S_TRAPSTS_RESTORE_PART_3_SIZE = 32 - S_TRAPSTS_RESTORE_PART_3_SHIFT +#endif +var S_TRAPSTS_HWREG = HW_REG_TRAPSTS +var S_TRAPSTS_SAVE_CONTEXT_MASK = SQ_WAVE_TRAPSTS_SAVECTX_MASK +var S_TRAPSTS_SAVE_CONTEXT_SHIFT = SQ_WAVE_TRAPSTS_SAVECTX_SHIFT +#else +var SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK = 0xF +var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK = 0x10 +var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT = 5 +var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK = 0x20 +var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK = 0x40 +var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT = 6 +var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK = 0x80 +var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT = 7 +var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK = 0x100 +var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT = 8 +var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK = 0x200 +var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK = 0x800 +var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK = 0x80 +var SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK = 0x200 + +var S_TRAPSTS_HWREG = HW_REG_WAVE_EXCP_FLAG_PRIV +var S_TRAPSTS_SAVE_CONTEXT_MASK = SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK +var S_TRAPSTS_SAVE_CONTEXT_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT +var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK |\ + SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK +var S_TRAPSTS_RESTORE_PART_1_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT +var S_TRAPSTS_RESTORE_PART_2_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT +var S_TRAPSTS_RESTORE_PART_2_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT +var S_TRAPSTS_RESTORE_PART_3_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT +var S_TRAPSTS_RESTORE_PART_3_SIZE = 32 - S_TRAPSTS_RESTORE_PART_3_SHIFT +var BARRIER_STATE_SIGNAL_OFFSET = 16 +var BARRIER_STATE_VALID_OFFSET = 0 #endif // bits [31:24] unused by SPI debug data @@ -121,8 +207,6 @@ var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000 // when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC -var S_SAVE_PC_HI_TRAP_ID_MASK = 0x00FF0000 -var S_SAVE_PC_HI_HT_MASK = 0x01000000 var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 @@ -182,6 +266,7 @@ var s_restore_buf_rsrc3 = ttmp11 var s_restore_size = ttmp6 var s_restore_ttmps_lo = s_restore_tmp var s_restore_ttmps_hi = s_restore_alloc_size +var s_restore_spi_init_hi_save = s_restore_exec_hi shader main asic(DEFAULT) @@ -194,13 +279,13 @@ L_JUMP_TO_RESTORE: s_branch L_RESTORE L_SKIP_RESTORE: - s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_getreg_b32 s_save_status, hwreg(S_STATUS_HWREG) //save STATUS since we will change SCC // Clear SPI_PRIO: do not save with elevated priority. // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd. - s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK + s_andn2_b32 s_save_status, s_save_status, S_STATUS_ALWAYS_CLEAR_MASK - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_getreg_b32 s_save_trapsts, hwreg(S_TRAPSTS_HWREG) #if SW_SA_TRAP // If ttmp1[30] is set then issue s_barrier to unblock dependent waves. @@ -215,23 +300,27 @@ L_TRAP_NO_BARRIER: s_cbranch_scc1 L_CHECK_SAVE #endif - s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK + s_and_b32 ttmp2, s_save_status, S_STATUS_HALT_MASK s_cbranch_scc0 L_NOT_HALTED L_HALTED: // Host trap may occur while wave is halted. +#if ASIC_FAMILY < CHIP_GFX12 s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK +#else + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK +#endif s_cbranch_scc1 L_FETCH_2ND_TRAP L_CHECK_SAVE: - s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK + s_and_b32 ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK s_cbranch_scc1 L_SAVE // Wave is halted but neither host trap nor SAVECTX is raised. // Caused by instruction fetch memory violation. // Spin wait until context saved to prevent interrupt storm. s_sleep 0x10 - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_getreg_b32 s_save_trapsts, hwreg(S_TRAPSTS_HWREG) s_branch L_CHECK_SAVE L_NOT_HALTED: @@ -247,6 +336,7 @@ L_NOT_HALTED: // Check for maskable exceptions in trapsts.excp and trapsts.excp_hi. // Maskable exceptions only cause the wave to enter the trap handler if // their respective bit in mode.excp_en is set. +#if ASIC_FAMILY < CHIP_GFX12 s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCP_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK s_cbranch_scc0 L_CHECK_TRAP_ID @@ -259,21 +349,37 @@ L_NOT_ADDR_WATCH: s_lshl_b32 ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT s_and_b32 ttmp2, ttmp2, ttmp3 s_cbranch_scc1 L_FETCH_2ND_TRAP +#else + s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_USER) + s_and_b32 ttmp3, s_save_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_ADDR_WATCH_MASK + s_cbranch_scc0 L_NOT_ADDR_WATCH + s_or_b32 ttmp2, ttmp2, SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK + +L_NOT_ADDR_WATCH: + s_getreg_b32 ttmp3, hwreg(HW_REG_WAVE_TRAP_CTRL) + s_and_b32 ttmp2, ttmp3, ttmp2 + s_cbranch_scc1 L_FETCH_2ND_TRAP +#endif L_CHECK_TRAP_ID: // Check trap_id != 0 s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK s_cbranch_scc1 L_FETCH_2ND_TRAP -if SINGLE_STEP_MISSED_WORKAROUND +#if SINGLE_STEP_MISSED_WORKAROUND // Prioritize single step exception over context save. // Second-level trap will halt wave and RFE, re-entering for SAVECTX. +#if ASIC_FAMILY < CHIP_GFX12 s_getreg_b32 ttmp2, hwreg(HW_REG_MODE) s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK +#else + // WAVE_TRAP_CTRL is already in ttmp3. + s_and_b32 ttmp3, ttmp3, SQ_WAVE_TRAP_CTRL_TRAP_AFTER_INST_MASK +#endif s_cbranch_scc1 L_FETCH_2ND_TRAP -end +#endif - s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK + s_and_b32 ttmp2, s_save_trapsts, S_TRAPSTS_SAVE_CONTEXT_MASK s_cbranch_scc1 L_SAVE L_FETCH_2ND_TRAP: @@ -286,7 +392,7 @@ L_FETCH_2ND_TRAP: // ttmp12 holds SQ_WAVE_STATUS #if HAVE_SENDMSG_RTN s_sendmsg_rtn_b64 [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 #else s_getreg_b32 ttmp14, hwreg(HW_REG_SHADER_TMA_LO) s_getreg_b32 ttmp15, hwreg(HW_REG_SHADER_TMA_HI) @@ -298,16 +404,16 @@ L_FETCH_2ND_TRAP: s_or_b32 ttmp15, ttmp15, 0xFFFF0000 L_NO_SIGN_EXTEND_TMA: - s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 glc:1 // debug trap enabled flag - s_waitcnt lgkmcnt(0) + s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 S_COHERENCE // debug trap enabled flag + S_WAITCNT_0 s_lshl_b32 ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT s_andn2_b32 ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK s_or_b32 ttmp11, ttmp11, ttmp2 - s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA - s_waitcnt lgkmcnt(0) - s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA - s_waitcnt lgkmcnt(0) + s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 S_COHERENCE // second-level TBA + S_WAITCNT_0 + s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 S_COHERENCE // second-level TMA + S_WAITCNT_0 s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set @@ -315,9 +421,18 @@ L_NO_SIGN_EXTEND_TMA: L_NO_NEXT_TRAP: // If not caused by trap then halt wave to prevent re-entry. - s_and_b32 ttmp2, s_save_pc_hi, (S_SAVE_PC_HI_TRAP_ID_MASK|S_SAVE_PC_HI_HT_MASK) + s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK s_cbranch_scc1 L_TRAP_CASE - s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK + + // Host trap will not cause trap re-entry. +#if ASIC_FAMILY < CHIP_GFX12 + s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK +#else + s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK +#endif + s_cbranch_scc1 L_EXIT_TRAP + s_or_b32 s_save_status, s_save_status, S_STATUS_HALT_MASK // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set. // Rewind the PC to prevent this from occurring. @@ -327,10 +442,6 @@ L_NO_NEXT_TRAP: s_branch L_EXIT_TRAP L_TRAP_CASE: - // Host trap will not cause trap re-entry. - s_and_b32 ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK - s_cbranch_scc1 L_EXIT_TRAP - // Advance past trap instruction to prevent re-entry. s_add_u32 ttmp0, ttmp0, 0x4 s_addc_u32 ttmp1, ttmp1, 0x0 @@ -345,14 +456,39 @@ L_EXIT_TRAP: // Restore SQ_WAVE_STATUS. s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status + +#if ASIC_FAMILY < CHIP_GFX12 + s_setreg_b32 hwreg(S_STATUS_HWREG), s_save_status +#else + // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it. + // Only restore fields which the trap handler changes. + s_lshr_b32 s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_SCC_SHIFT + s_setreg_b32 hwreg(S_STATUS_HWREG, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \ + SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_status +#endif s_rfe_b64 [ttmp0, ttmp1] L_SAVE: + // If VGPRs have been deallocated then terminate the wavefront. + // It has no remaining program to run and cannot save without VGPRs. +#if ASIC_FAMILY == CHIP_PLUM_BONITO + s_bitcmp1_b32 s_save_status, SQ_WAVE_STATUS_NO_VGPRS_SHIFT + s_cbranch_scc0 L_HAVE_VGPRS + s_endpgm +L_HAVE_VGPRS: +#endif +#if ASIC_FAMILY >= CHIP_GFX12 + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATUS) + s_bitcmp1_b32 s_save_tmp, SQ_WAVE_STATUS_NO_VGPRS_SHIFT + s_cbranch_scc0 L_HAVE_VGPRS + s_endpgm +L_HAVE_VGPRS: +#endif + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] s_mov_b32 s_save_tmp, 0 - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + s_setreg_b32 hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_SAVE_CONTEXT_SHIFT, 1), s_save_tmp //clear saveCtx bit #if HAVE_XNACK save_and_clear_ib_sts(s_save_tmp, s_save_trapsts) @@ -377,7 +513,7 @@ L_SLEEP: s_sleep 0x2 s_cbranch_execz L_SLEEP #else - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 #endif // Save first_wave flag so we can clear high bits of save address. @@ -399,7 +535,7 @@ L_SLEEP: s_and_b32 s_save_ttmps_hi, exec_hi, 0xFFFF s_mov_b32 exec_lo, 0xFFFFFFFF s_mov_b32 exec_hi, 0xFFFFFFFF - global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] slc:1 glc:1 + global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] V_COHERENCE v_mov_b32 v0, 0x0 s_mov_b32 exec_lo, s_save_ttmps_lo s_mov_b32 exec_hi, s_save_ttmps_hi @@ -407,7 +543,7 @@ L_SLEEP: // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic // ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40 - get_wave_size(s_save_ttmps_hi) + get_wave_size2(s_save_ttmps_hi) get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi) get_svgpr_size_bytes(s_save_ttmps_hi) s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi @@ -431,15 +567,15 @@ L_SLEEP: s_mov_b32 exec_lo, 0x3FFF s_mov_b32 exec_hi, 0x0 - global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 slc:1 glc:1 + global_store_dword_addtid v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 V_COHERENCE v_readlane_b32 ttmp14, v0, 0xE v_readlane_b32 ttmp15, v0, 0xF s_mov_b32 exec_lo, ttmp14 s_mov_b32 exec_hi, ttmp15 #else - s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1 - s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1 - s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1 + s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 S_COHERENCE + s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 S_COHERENCE + s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 S_COHERENCE #endif /* setup Resource Contants */ @@ -453,7 +589,7 @@ L_SLEEP: /* global mem offset */ s_mov_b32 s_save_mem_offset, 0x0 - get_wave_size(s_wave_size) + get_wave_size2(s_wave_size) #if HAVE_XNACK // Save and clear vector XNACK state late to free up SGPRs. @@ -488,11 +624,11 @@ L_SAVE_FIRST_VGPRS32_WITH_TCP: #endif #if !NO_SQC_STORE - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE #endif - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3 s_branch L_SAVE_HWREG L_SAVE_4VGPR_WAVE64: @@ -511,11 +647,11 @@ L_SAVE_FIRST_VGPRS64_WITH_TCP: #endif #if !NO_SQC_STORE - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE #endif - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3 /* save HW registers */ @@ -535,6 +671,19 @@ L_SAVE_HWREG: s_mov_b32 m0, 0x0 //Next lane of v2 to write to #endif +#if ASIC_FAMILY >= CHIP_GFX12 + // Ensure no further changes to barrier or LDS state. + // STATE_PRIV.BARRIER_COMPLETE may change up to this point. + s_barrier_signal -2 + s_barrier_wait -2 + + // Re-read final state of BARRIER_COMPLETE field for save. + s_getreg_b32 s_save_tmp, hwreg(S_STATUS_HWREG) + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK + s_or_b32 s_save_status, s_save_status, s_save_tmp +#endif + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) s_andn2_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK @@ -543,7 +692,7 @@ L_SAVE_HWREG: write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) - s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS) + s_getreg_b32 s_save_tmp, hwreg(S_TRAPSTS_HWREG) write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset) // Not used on Sienna_Cichlid but keep layout same for debugger. @@ -558,11 +707,26 @@ L_SAVE_HWREG: s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI) write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) +#if ASIC_FAMILY >= CHIP_GFX12 + s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_EXCP_FLAG_USER) + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + + s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_TRAP_CTRL) + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATUS) + write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset) + + s_get_barrier_state s_save_tmp, -1 + s_wait_kmcnt (0) + write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset) +#endif + #if NO_SQC_STORE // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this. s_mov_b32 exec_lo, 0xFFFF s_mov_b32 exec_hi, 0x0 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE // Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode. s_mov_b32 exec_lo, 0xFFFFFFFF @@ -605,7 +769,7 @@ L_SAVE_SGPR_LOOP: s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled? s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80 s_mov_b32 ttmp13, 0x0 v_mov_b32 v2, 0x0 @@ -626,7 +790,7 @@ L_SAVE_SGPR_SKIP_TCP_STORE: write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) #if NO_SQC_STORE - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE #else // restore s_save_buf_rsrc0,1 s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask @@ -650,14 +814,15 @@ L_SAVE_LDS_NORMAL: s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE +#if ASIC_FAMILY < CHIP_GFX12 s_barrier //LDS is used? wait for other waves in the same TG +#endif s_and_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK s_cbranch_scc0 L_SAVE_LDS_DONE // first wave do LDS save; - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) @@ -688,7 +853,7 @@ L_SAVE_LDS_W32: L_SAVE_LDS_LOOP_SQC_W32: ds_read_b32 v1, v0 - s_waitcnt 0 + S_WAITCNT_0 write_vgprs_to_mem_with_sqc_w32(v1, 1, s_save_buf_rsrc0, s_save_mem_offset) @@ -708,8 +873,8 @@ L_SAVE_LDS_WITH_TCP_W32: s_nop 0 L_SAVE_LDS_LOOP_W32: ds_read_b32 v1, v0 - s_waitcnt 0 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + S_WAITCNT_0 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE s_add_u32 m0, m0, s3 //every buffer_store_lds does 128 bytes s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 @@ -726,7 +891,7 @@ L_SAVE_LDS_W64: L_SAVE_LDS_LOOP_SQC_W64: ds_read_b32 v1, v0 - s_waitcnt 0 + S_WAITCNT_0 write_vgprs_to_mem_with_sqc_w64(v1, 1, s_save_buf_rsrc0, s_save_mem_offset) @@ -746,8 +911,8 @@ L_SAVE_LDS_WITH_TCP_W64: s_nop 0 L_SAVE_LDS_LOOP_W64: ds_read_b32 v1, v0 - s_waitcnt 0 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + S_WAITCNT_0 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 @@ -814,10 +979,10 @@ L_SAVE_VGPR_W32_LOOP: v_movrels_b32 v2, v2 //v2 = v[2+m0] v_movrels_b32 v3, v3 //v3 = v[3+m0] - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:128*3 s_add_u32 m0, m0, 4 //next vgpr index s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes @@ -859,10 +1024,10 @@ L_SAVE_VGPR_W64_LOOP: v_movrels_b32 v2, v2 //v2 = v[2+m0] v_movrels_b32 v3, v3 //v3 = v[3+m0] - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE offset:256*3 s_add_u32 m0, m0, 4 //next vgpr index s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes @@ -899,7 +1064,7 @@ L_SAVE_SHARED_VGPR_WAVE64_LOOP_SQC: L_SAVE_SHARED_VGPR_WAVE64_LOOP: v_movrels_b32 v0, v0 //v0 = v[0+m0] - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE s_add_u32 m0, m0, 1 //next vgpr index s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 @@ -916,8 +1081,13 @@ L_RESTORE: s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC +#if ASIC_FAMILY >= CHIP_GFX12 + // Save s_restore_spi_init_hi for later use. + s_mov_b32 s_restore_spi_init_hi_save, s_restore_spi_init_hi +#endif + //determine it is wave32 or wave64 - get_wave_size(s_restore_size) + get_wave_size2(s_restore_size) s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK s_cbranch_scc0 L_RESTORE_VGPR @@ -937,8 +1107,7 @@ L_RESTORE_LDS_NORMAL: s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw - s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) @@ -962,7 +1131,7 @@ L_RESTORE_LDS_LOOP_W32: buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW #else buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset - s_waitcnt vmcnt(0) + S_WAITCNT_0 ds_store_addtid_b32 v0 #endif s_add_u32 m0, m0, 128 // 128 DW @@ -976,7 +1145,7 @@ L_RESTORE_LDS_LOOP_W64: buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW #else buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset - s_waitcnt vmcnt(0) + S_WAITCNT_0 ds_store_addtid_b32 v0 #endif s_add_u32 m0, m0, 256 // 256 DW @@ -1017,11 +1186,11 @@ L_RESTORE_VGPR_NORMAL: s_cbranch_scc0 L_RESTORE_SGPR L_RESTORE_VGPR_WAVE32_LOOP: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*3 + S_WAITCNT_0 v_movreld_b32 v0, v0 //v[0+m0] = v0 v_movreld_b32 v1, v1 v_movreld_b32 v2, v2 @@ -1032,11 +1201,11 @@ L_RESTORE_VGPR_WAVE32_LOOP: s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete? /* VGPR restore on v0 */ - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*3 + S_WAITCNT_0 s_branch L_RESTORE_SGPR @@ -1051,11 +1220,11 @@ L_RESTORE_VGPR_WAVE64: s_cbranch_scc0 L_RESTORE_SHARED_VGPR L_RESTORE_VGPR_WAVE64_LOOP: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*3 + S_WAITCNT_0 v_movreld_b32 v0, v0 //v[0+m0] = v0 v_movreld_b32 v1, v1 v_movreld_b32 v2, v2 @@ -1077,8 +1246,8 @@ L_RESTORE_SHARED_VGPR: s_mov_b32 exec_lo, 0xFFFFFFFF s_mov_b32 exec_hi, 0x00000000 L_RESTORE_SHARED_VGPR_WAVE64_LOOP: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE + S_WAITCNT_0 v_movreld_b32 v0, v0 //v[0+m0] = v0 s_add_u32 m0, m0, 1 //next vgpr index s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 @@ -1089,11 +1258,11 @@ L_RESTORE_SHARED_VGPR_WAVE64_LOOP: /* VGPR restore on v0 */ L_RESTORE_V0: - buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 - buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 - s_waitcnt vmcnt(0) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*3 + S_WAITCNT_0 /* restore SGPRs */ //will be 2+8+16*6 @@ -1110,7 +1279,7 @@ L_RESTORE_SGPR: s_mov_b32 m0, s_sgpr_save_num read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 s_sub_u32 m0, m0, 4 // Restore from S[0] to S[104] s_nop 0 // hazard SALU M0=> S_MOVREL @@ -1119,7 +1288,7 @@ L_RESTORE_SGPR: s_movreld_b64 s2, s2 read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 s_sub_u32 m0, m0, 8 // Restore from S[0] to S[96] s_nop 0 // hazard SALU M0=> S_MOVREL @@ -1131,7 +1300,7 @@ L_RESTORE_SGPR: L_RESTORE_SGPR_LOOP: read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] s_nop 0 // hazard SALU M0=> S_MOVREL @@ -1151,7 +1320,9 @@ L_RESTORE_SGPR: // s_barrier with MODE.DEBUG_EN=1, STATUS.PRIV=1 incorrectly asserts debug exception. // Clear DEBUG_EN before and restore MODE after the barrier. s_setreg_imm32_b32 hwreg(HW_REG_MODE), 0 +#if ASIC_FAMILY < CHIP_GFX12 s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG +#endif /* restore HW registers */ L_RESTORE_HWREG: @@ -1163,6 +1334,11 @@ L_RESTORE_HWREG: s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +#if ASIC_FAMILY >= CHIP_GFX12 + // Restore s_restore_spi_init_hi before the saved value gets clobbered. + s_mov_b32 s_restore_spi_init_hi, s_restore_spi_init_hi_save +#endif + read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) @@ -1173,29 +1349,72 @@ L_RESTORE_HWREG: read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset) read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) + S_WAITCNT_0 s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) - s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + S_WAITCNT_0 s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch +#if ASIC_FAMILY >= CHIP_GFX12 + read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) + S_WAITCNT_0 + s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp + + read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) + S_WAITCNT_0 + s_setreg_b32 hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp + + // Only the first wave needs to restore the workgroup barrier. + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_SKIP_BARRIER_RESTORE + + // Skip over WAVE_STATUS, since there is no state to restore from it + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 4 + + read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset) + S_WAITCNT_0 + + s_bitcmp1_b32 s_restore_tmp, BARRIER_STATE_VALID_OFFSET + s_cbranch_scc0 L_SKIP_BARRIER_RESTORE + + // extract the saved signal count from s_restore_tmp + s_lshr_b32 s_restore_tmp, s_restore_tmp, BARRIER_STATE_SIGNAL_OFFSET + + // We need to call s_barrier_signal repeatedly to restore the signal + // count of the work group barrier. The member count is already + // initialized with the number of waves in the work group. +L_BARRIER_RESTORE_LOOP: + s_and_b32 s_restore_tmp, s_restore_tmp, s_restore_tmp + s_cbranch_scc0 L_SKIP_BARRIER_RESTORE + s_barrier_signal -1 + s_add_i32 s_restore_tmp, s_restore_tmp, -1 + s_branch L_BARRIER_RESTORE_LOOP + +L_SKIP_BARRIER_RESTORE: +#endif + s_mov_b32 m0, s_restore_m0 s_mov_b32 exec_lo, s_restore_exec_lo s_mov_b32 exec_hi, s_restore_exec_hi - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 - #if HAVE_XNACK s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask #endif - s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts - s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + // {TRAPSTS/EXCP_FLAG_PRIV}.SAVE_CONTEXT and HOST_TRAP may have changed. + // Only restore the other fields to avoid clobbering them. + s_setreg_b32 hwreg(S_TRAPSTS_HWREG, 0, S_TRAPSTS_RESTORE_PART_1_SIZE), s_restore_trapsts + s_lshr_b32 s_restore_trapsts, s_restore_trapsts, S_TRAPSTS_RESTORE_PART_2_SHIFT + s_setreg_b32 hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_RESTORE_PART_2_SHIFT, S_TRAPSTS_RESTORE_PART_2_SIZE), s_restore_trapsts + +if S_TRAPSTS_RESTORE_PART_3_SIZE > 0 + s_lshr_b32 s_restore_trapsts, s_restore_trapsts, S_TRAPSTS_RESTORE_PART_3_SHIFT - S_TRAPSTS_RESTORE_PART_2_SHIFT + s_setreg_b32 hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_RESTORE_PART_3_SHIFT, S_TRAPSTS_RESTORE_PART_3_SIZE), s_restore_trapsts +end + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic @@ -1207,10 +1426,10 @@ L_RESTORE_HWREG: s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF - s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1 - s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1 - s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1 - s_waitcnt lgkmcnt(0) + s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 S_COHERENCE + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 S_COHERENCE + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 S_COHERENCE + S_WAITCNT_0 #if HAVE_XNACK restore_ib_sts(s_restore_tmp, s_restore_m0) @@ -1232,7 +1451,15 @@ L_RESTORE_HWREG: L_RETURN_WITHOUT_PRIV: #endif - s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu + s_setreg_b32 hwreg(S_STATUS_HWREG), s_restore_status // SCC is included, which is changed by previous salu + +#if ASIC_FAMILY >= CHIP_GFX12 + // Make barrier and LDS state visible to all waves in the group. + // STATE_PRIV.BARRIER_COMPLETE may change after this point. + s_barrier_signal -2 + s_barrier_wait -2 +#endif + s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution L_END_PGM: @@ -1247,7 +1474,7 @@ function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) #else s_mov_b32 exec_lo, m0 s_mov_b32 m0, s_mem_offset - s_buffer_store_dword s, s_rsrc, m0 glc:1 + s_buffer_store_dword s, s_rsrc, m0 S_COHERENCE s_add_u32 s_mem_offset, s_mem_offset, 4 s_mov_b32 m0, exec_lo #endif @@ -1262,10 +1489,10 @@ function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) s_add_u32 ttmp13, ttmp13, 0x1 end #else - s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 - s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 - s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 - s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 + s_buffer_store_dwordx4 s[0], s_rsrc, 0 S_COHERENCE + s_buffer_store_dwordx4 s[4], s_rsrc, 16 S_COHERENCE + s_buffer_store_dwordx4 s[8], s_rsrc, 32 S_COHERENCE + s_buffer_store_dwordx4 s[12], s_rsrc, 48 S_COHERENCE s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 #endif @@ -1279,32 +1506,32 @@ function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset) s_add_u32 ttmp13, ttmp13, 0x1 end #else - s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 - s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 - s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 + s_buffer_store_dwordx4 s[0], s_rsrc, 0 S_COHERENCE + s_buffer_store_dwordx4 s[4], s_rsrc, 16 S_COHERENCE + s_buffer_store_dwordx4 s[8], s_rsrc, 32 S_COHERENCE s_add_u32 s_rsrc[0], s_rsrc[0], 4*12 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 #endif end function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) - s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + s_buffer_load_dword s, s_rsrc, s_mem_offset S_COHERENCE s_add_u32 s_mem_offset, s_mem_offset, 4 end function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) s_sub_u32 s_mem_offset, s_mem_offset, 4*16 - s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset S_COHERENCE end function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset) s_sub_u32 s_mem_offset, s_mem_offset, 4*8 - s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset glc:1 + s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset S_COHERENCE end function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset) s_sub_u32 s_mem_offset, s_mem_offset, 4*4 - s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset glc:1 + s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset S_COHERENCE end #if SAVE_AFTER_XNACK_ERROR @@ -1345,11 +1572,6 @@ function write_vgprs_to_mem_with_sqc_w64(vgpr0, n_vgprs, s_rsrc, s_mem_offset) end #endif -function get_lds_size_bytes(s_lds_size_byte) - s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) - s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW -end - function get_vgpr_size_bytes(s_vgpr_size_byte, s_size) s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 @@ -1375,11 +1597,16 @@ function get_hwreg_size_bytes return 128 end -function get_wave_size(s_reg) +function get_wave_size2(s_reg) +#if ASIC_FAMILY < CHIP_GFX12 s_getreg_b32 s_reg, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) +#else + s_getreg_b32 s_reg, hwreg(HW_REG_WAVE_STATUS,SQ_WAVE_STATUS_WAVE64_SHIFT,SQ_WAVE_STATUS_WAVE64_SIZE) +#endif s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE end +#if HAVE_XNACK function save_and_clear_ib_sts(tmp1, tmp2) // Preserve and clear scalar XNACK state before issuing scalar loads. // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into @@ -1404,3 +1631,4 @@ function restore_ib_sts(tmp1, tmp2) s_or_b32 tmp1, tmp1, tmp2 s_setreg_b32 hwreg(HW_REG_IB_STS), tmp1 end +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index fdf171ad4a3c..32e5db509560 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1913,11 +1913,6 @@ static int criu_checkpoint_bos(struct kfd_process *p, struct kfd_criu_bo_priv_data *bo_priv; int i, dev_idx = 0; - if (!mem) { - ret = -ENOMEM; - goto exit; - } - kgd_mem = (struct kgd_mem *)mem; dumper_bo = kgd_mem->bo; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 7f2ae0d15d4a..cd7b81b7b939 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -1614,6 +1614,7 @@ int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pc num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info); break; case IP_VERSION(9, 4, 3): + case IP_VERSION(9, 4, 4): num_of_cache_types = kfd_fill_gpu_cache_info_from_gfx_config_v2(kdev->kfd, *pcache_info); @@ -1677,6 +1678,9 @@ int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pc case IP_VERSION(11, 0, 4): case IP_VERSION(11, 5, 0): case IP_VERSION(11, 5, 1): + case IP_VERSION(11, 5, 2): + case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 0, 1): num_of_cache_types = kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd, *pcache_info); break; @@ -2210,9 +2214,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, * Modify length and total_entries as subunits are added. */ avail_size -= sizeof(struct crat_header); - if (avail_size < 0) - return -ENOMEM; - memset(crat_table, 0, sizeof(struct crat_header)); memcpy(&crat_table->signature, CRAT_SIGNATURE, @@ -2226,9 +2227,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, * First fill in the sub type header and then sub type data */ avail_size -= sizeof(struct crat_subtype_computeunit); - if (avail_size < 0) - return -ENOMEM; - sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h index 300634b9f668..a8ca7ecb6d27 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -42,8 +42,6 @@ #define CRAT_OEMTABLEID_LENGTH 8 #define CRAT_RESERVED_LENGTH 6 -#define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) - /* Compute Unit flags */ #define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ #define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c index d889e3545120..34a282540c7e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c @@ -25,6 +25,7 @@ #include "kfd_topology.h" #include <linux/file.h> #include <uapi/linux/kfd_ioctl.h> +#include <uapi/linux/kfd_sysfs.h> #define MAX_WATCH_ADDRESSES 4 @@ -103,7 +104,8 @@ void debug_event_write_work_handler(struct work_struct *work) struct kfd_process, debug_event_workarea); - kernel_write(process->dbg_ev_file, &write_data, 1, &pos); + if (process->debug_trap_enabled && process->dbg_ev_file) + kernel_write(process->dbg_ev_file, &write_data, 1, &pos); } /* update process/device/queue exception status, write to descriptor @@ -497,14 +499,24 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags) int i, r = 0, rewind_count = 0; for (i = 0; i < target->n_pdds; i++) { - if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) && + struct kfd_topology_device *topo_dev = + kfd_topology_device_by_id(target->pdds[i]->dev->id); + uint32_t caps = topo_dev->node_props.capability; + + if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) && (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) { *flags = prev_flags; return -EACCES; } + + if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) && + (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) { + *flags = prev_flags; + return -EACCES; + } } - target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP; + target->dbg_flags = *flags; *flags = prev_flags; for (i = 0; i < target->n_pdds; i++) { struct kfd_process_device *pdd = target->pdds[i]; @@ -645,6 +657,7 @@ int kfd_dbg_trap_disable(struct kfd_process *target) else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED) target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED; + cancel_work_sync(&target->debug_event_workarea); fput(target->dbg_ev_file); target->dbg_ev_file = NULL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h index fd0ff64d4184..924d0fd85dfb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h @@ -78,6 +78,7 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_node *dev) { return (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) || KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4) || KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0)); } @@ -134,6 +135,7 @@ static inline bool kfd_dbg_has_ttmps_always_setup(struct kfd_node *dev) KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 2)) || (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) && KFD_GC_VERSION(dev) < IP_VERSION(12, 0, 0) && - (dev->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 70); + (dev->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 70) || + (KFD_GC_VERSION(dev) >= IP_VERSION(12, 0, 0)); } #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index afc57df421cd..f4d20adaa068 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -56,6 +56,7 @@ extern const struct kfd2kgd_calls gc_9_4_3_kfd2kgd; extern const struct kfd2kgd_calls gfx_v10_kfd2kgd; extern const struct kfd2kgd_calls gfx_v10_3_kfd2kgd; extern const struct kfd2kgd_calls gfx_v11_kfd2kgd; +extern const struct kfd2kgd_calls gfx_v12_kfd2kgd; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, unsigned int chunk_size); @@ -83,6 +84,7 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) case IP_VERSION(4, 2, 2):/* ARCTURUS */ case IP_VERSION(4, 4, 0):/* ALDEBARAN */ case IP_VERSION(4, 4, 2): + case IP_VERSION(4, 4, 5): case IP_VERSION(5, 0, 0):/* NAVI10 */ case IP_VERSION(5, 0, 1):/* CYAN_SKILLFISH */ case IP_VERSION(5, 0, 2):/* NAVI14 */ @@ -97,6 +99,9 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) case IP_VERSION(6, 0, 3): case IP_VERSION(6, 1, 0): case IP_VERSION(6, 1, 1): + case IP_VERSION(6, 1, 2): + case IP_VERSION(7, 0, 0): + case IP_VERSION(7, 0, 1): kfd->device_info.num_sdma_queues_per_engine = 8; break; default: @@ -115,6 +120,9 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) case IP_VERSION(6, 0, 3): case IP_VERSION(6, 1, 0): case IP_VERSION(6, 1, 1): + case IP_VERSION(6, 1, 2): + case IP_VERSION(7, 0, 0): + case IP_VERSION(7, 0, 1): /* Reserve 1 for paging and 1 for gfx */ kfd->device_info.num_reserved_sdma_queues_per_engine = 2; /* BIT(0)=engine-0 queue-0; BIT(1)=engine-1 queue-0; BIT(2)=engine-0 queue-1; ... */ @@ -143,6 +151,7 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd) kfd->device_info.event_interrupt_class = &event_interrupt_class_v9; break; case IP_VERSION(9, 4, 3): /* GC 9.4.3 */ + case IP_VERSION(9, 4, 4): /* GC 9.4.4 */ kfd->device_info.event_interrupt_class = &event_interrupt_class_v9_4_3; break; @@ -168,6 +177,12 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd) case IP_VERSION(11, 0, 4): case IP_VERSION(11, 5, 0): case IP_VERSION(11, 5, 1): + case IP_VERSION(11, 5, 2): + kfd->device_info.event_interrupt_class = &event_interrupt_class_v11; + break; + case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 0, 1): + /* GFX12_TODO: Change to v12 version. */ kfd->device_info.event_interrupt_class = &event_interrupt_class_v11; break; default: @@ -220,6 +235,8 @@ static void kfd_device_info_init(struct kfd_dev *kfd, */ kfd->device_info.needs_pci_atomics = true; kfd->device_info.no_atomic_fw_version = kfd->adev->gfx.rs64_enable ? 509 : 0; + } else { + kfd->device_info.needs_pci_atomics = true; } } else { kfd->device_info.doorbell_size = 4; @@ -332,6 +349,10 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) : 90401; f2g = &gc_9_4_3_kfd2kgd; break; + case IP_VERSION(9, 4, 4): + gfx_target_version = 90402; + f2g = &gc_9_4_3_kfd2kgd; + break; /* Navi10 */ case IP_VERSION(10, 1, 10): gfx_target_version = 100100; @@ -420,6 +441,18 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) gfx_target_version = 110501; f2g = &gfx_v11_kfd2kgd; break; + case IP_VERSION(11, 5, 2): + gfx_target_version = 110502; + f2g = &gfx_v11_kfd2kgd; + break; + case IP_VERSION(12, 0, 0): + gfx_target_version = 120000; + f2g = &gfx_v12_kfd2kgd; + break; + case IP_VERSION(12, 0, 1): + gfx_target_version = 120001; + f2g = &gfx_v12_kfd2kgd; + break; default: break; } @@ -473,7 +506,8 @@ static void kfd_cwsr_init(struct kfd_dev *kfd) > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_aldebaran_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_aldebaran_hex); - } else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3)) { + } else if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 4)) { BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_4_3_hex) > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_gfx9_4_3_hex; @@ -493,12 +527,16 @@ static void kfd_cwsr_init(struct kfd_dev *kfd) > KFD_CWSR_TMA_OFFSET); kfd->cwsr_isa = cwsr_trap_gfx10_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx10_hex); - } else { + } else if (KFD_GC_VERSION(kfd) < IP_VERSION(12, 0, 0)) { /* The gfx11 cwsr trap handler must fit inside a single page. */ BUILD_BUG_ON(sizeof(cwsr_trap_gfx11_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx11_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx11_hex); + } else { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx12_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_gfx12_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx12_hex); } kfd->cwsr_enabled = true; @@ -523,7 +561,8 @@ static int kfd_gws_init(struct kfd_node *node) && kfd->mec2_fw_version >= 0x30) || (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 2) && kfd->mec2_fw_version >= 0x28) || - (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3)) || + (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(node) == IP_VERSION(9, 4, 4)) || (KFD_GC_VERSION(node) >= IP_VERSION(10, 3, 0) && KFD_GC_VERSION(node) < IP_VERSION(11, 0, 0) && kfd->mec2_fw_version >= 0x6b) || @@ -766,7 +805,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, * xGMI connected in the topology so assign a unique hive id per * device based on the pci device location if device is in PCIe mode. */ - if (!kfd->hive_id && (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3)) && kfd->num_nodes > 1) + if (!kfd->hive_id && + (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 4)) && + kfd->num_nodes > 1) kfd->hive_id = pci_dev_id(kfd->adev->pdev); kfd->noretry = kfd->adev->gmc.noretry; @@ -804,7 +846,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, KFD_XCP_MEMORY_SIZE(node->adev, node->node_id) >> 20); } - if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3) && + if ((KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 4)) && partition_mode == AMDGPU_CPX_PARTITION_MODE && kfd->num_nodes != 1) { /* For GFX9.4.3 and CPX mode, first XCD gets VMID range @@ -832,7 +875,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, amdgpu_amdkfd_get_local_mem_info(kfd->adev, &node->local_mem_info, node->xcp); - if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(kfd) == IP_VERSION(9, 4, 4)) kfd_setup_interrupt_bitmap(node, i); /* Initialize the KFD node */ @@ -887,7 +931,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) kfree(kfd); } -int kgd2kfd_pre_reset(struct kfd_dev *kfd) +int kgd2kfd_pre_reset(struct kfd_dev *kfd, + struct amdgpu_reset_context *reset_context) { struct kfd_node *node; int i; @@ -897,8 +942,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) for (i = 0; i < kfd->num_nodes; i++) { node = kfd->nodes[i]; - kfd_smi_event_update_gpu_reset(node, false); - node->dqm->ops.pre_reset(node->dqm); + kfd_smi_event_update_gpu_reset(node, false, reset_context); } kgd2kfd_suspend(kfd, false); @@ -937,7 +981,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) for (i = 0; i < kfd->num_nodes; i++) { node = kfd->nodes[i]; atomic_set(&node->sram_ecc_flag, 0); - kfd_smi_event_update_gpu_reset(node, true); + kfd_smi_event_update_gpu_reset(node, true, NULL); } return 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index c08b6ee25289..4f48507418d2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -35,7 +35,8 @@ #include "cik_regs.h" #include "kfd_kernel_queue.h" #include "amdgpu_amdkfd.h" -#include "mes_api_def.h" +#include "amdgpu_reset.h" +#include "mes_v11_api_def.h" #include "kfd_debug.h" /* Size of the per-pipe EOP queue */ @@ -155,14 +156,7 @@ static void kfd_hws_hang(struct device_queue_manager *dqm) /* * Issue a GPU reset if HWS is unresponsive */ - dqm->is_hws_hang = true; - - /* It's possible we're detecting a HWS hang in the - * middle of a GPU reset. No need to schedule another - * reset in this case. - */ - if (!dqm->is_resetting) - schedule_work(&dqm->hw_exception_work); + schedule_work(&dqm->hw_exception_work); } static int convert_to_mes_queue_type(int queue_type) @@ -194,7 +188,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, int r, queue_type; uint64_t wptr_addr_off; - if (dqm->is_hws_hang) + if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input)); @@ -236,6 +230,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, if (queue_type < 0) { dev_err(adev->dev, "Queue type not supported with MES, queue:%d\n", q->properties.type); + up_read(&adev->reset_domain->sem); return -EINVAL; } queue_input.queue_type = (uint32_t)queue_type; @@ -245,6 +240,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input); amdgpu_mes_unlock(&adev->mes); + up_read(&adev->reset_domain->sem); if (r) { dev_err(adev->dev, "failed to add hardware queue to MES, doorbell=0x%x\n", q->properties.doorbell_off); @@ -262,7 +258,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, int r; struct mes_remove_queue_input queue_input; - if (dqm->is_hws_hang) + if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); @@ -272,6 +268,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input); amdgpu_mes_unlock(&adev->mes); + up_read(&adev->reset_domain->sem); if (r) { dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n", @@ -1468,20 +1465,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm) } if (dqm->dev->adev->asic_type == CHIP_HAWAII) - pm_uninit(&dqm->packet_mgr, false); + pm_uninit(&dqm->packet_mgr); dqm->sched_running = false; dqm_unlock(dqm); return 0; } -static void pre_reset(struct device_queue_manager *dqm) -{ - dqm_lock(dqm); - dqm->is_resetting = true; - dqm_unlock(dqm); -} - static int allocate_sdma_queue(struct device_queue_manager *dqm, struct queue *q, const uint32_t *restore_sdma_id) { @@ -1669,8 +1659,6 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); /* clear hang status when driver try to start the hw scheduler */ - dqm->is_hws_hang = false; - dqm->is_resetting = false; dqm->sched_running = true; if (!dqm->dev->kfd->shared_resources.enable_mes) @@ -1700,7 +1688,7 @@ static int start_cpsch(struct device_queue_manager *dqm) fail_allocate_vidmem: fail_set_sched_resources: if (!dqm->dev->kfd->shared_resources.enable_mes) - pm_uninit(&dqm->packet_mgr, false); + pm_uninit(&dqm->packet_mgr); fail_packet_manager_init: dqm_unlock(dqm); return retval; @@ -1708,22 +1696,17 @@ fail_packet_manager_init: static int stop_cpsch(struct device_queue_manager *dqm) { - bool hanging; - dqm_lock(dqm); if (!dqm->sched_running) { dqm_unlock(dqm); return 0; } - if (!dqm->is_hws_hang) { - if (!dqm->dev->kfd->shared_resources.enable_mes) - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); - else - remove_all_queues_mes(dqm); - } + if (!dqm->dev->kfd->shared_resources.enable_mes) + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); + else + remove_all_queues_mes(dqm); - hanging = dqm->is_hws_hang || dqm->is_resetting; dqm->sched_running = false; if (!dqm->dev->kfd->shared_resources.enable_mes) @@ -1731,7 +1714,7 @@ static int stop_cpsch(struct device_queue_manager *dqm) kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); if (!dqm->dev->kfd->shared_resources.enable_mes) - pm_uninit(&dqm->packet_mgr, hanging); + pm_uninit(&dqm->packet_mgr); dqm_unlock(dqm); return 0; @@ -1957,24 +1940,24 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, { struct device *dev = dqm->dev->adev->dev; struct mqd_manager *mqd_mgr; - int retval = 0; + int retval; if (!dqm->sched_running) return 0; - if (dqm->is_hws_hang || dqm->is_resetting) - return -EIO; if (!dqm->active_runlist) - return retval; + return 0; + if (!down_read_trylock(&dqm->dev->adev->reset_domain->sem)) + return -EIO; if (grace_period != USE_DEFAULT_GRACE_PERIOD) { retval = pm_update_grace_period(&dqm->packet_mgr, grace_period); if (retval) - return retval; + goto out; } retval = pm_send_unmap_queue(&dqm->packet_mgr, filter, filter_param, reset); if (retval) - return retval; + goto out; *dqm->fence_addr = KFD_FENCE_INIT; pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr, @@ -1985,7 +1968,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, if (retval) { dev_err(dev, "The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n"); kfd_hws_hang(dqm); - return retval; + goto out; } /* In the current MEC firmware implementation, if compute queue @@ -2001,7 +1984,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, while (halt_if_hws_hang) schedule(); kfd_hws_hang(dqm); - return -ETIME; + retval = -ETIME; + goto out; } /* We need to reset the grace period value for this device */ @@ -2014,6 +1998,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, pm_release_ib(&dqm->packet_mgr); dqm->active_runlist = false; +out: + up_read(&dqm->dev->adev->reset_domain->sem); return retval; } @@ -2040,13 +2026,13 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm, { int retval; - if (dqm->is_hws_hang) + if (!down_read_trylock(&dqm->dev->adev->reset_domain->sem)) return -EIO; retval = unmap_queues_cpsch(dqm, filter, filter_param, grace_period, false); - if (retval) - return retval; - - return map_queues_cpsch(dqm); + if (!retval) + retval = map_queues_cpsch(dqm); + up_read(&dqm->dev->adev->reset_domain->sem); + return retval; } static int wait_on_destroy_queue(struct device_queue_manager *dqm, @@ -2427,10 +2413,12 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, if (!dqm->dev->kfd->shared_resources.enable_mes) retval = execute_queues_cpsch(dqm, filter, 0, USE_DEFAULT_GRACE_PERIOD); - if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) { + if ((retval || qpd->reset_wavefronts) && + down_read_trylock(&dqm->dev->adev->reset_domain->sem)) { pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev); dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process); qpd->reset_wavefronts = false; + up_read(&dqm->dev->adev->reset_domain->sem); } /* Lastly, free mqd resources. @@ -2537,7 +2525,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) dqm->ops.initialize = initialize_cpsch; dqm->ops.start = start_cpsch; dqm->ops.stop = stop_cpsch; - dqm->ops.pre_reset = pre_reset; dqm->ops.destroy_queue = destroy_queue_cpsch; dqm->ops.update_queue = update_queue; dqm->ops.register_process = register_process; @@ -2558,7 +2545,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) /* initialize dqm for no cp scheduling */ dqm->ops.start = start_nocpsch; dqm->ops.stop = stop_nocpsch; - dqm->ops.pre_reset = pre_reset; dqm->ops.create_queue = create_queue_nocpsch; dqm->ops.destroy_queue = destroy_queue_nocpsch; dqm->ops.update_queue = update_queue; @@ -2597,7 +2583,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) break; default: - if (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0)) + if (KFD_GC_VERSION(dev) >= IP_VERSION(12, 0, 0)) + device_queue_manager_init_v12(&dqm->asic_ops); + else if (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0)) device_queue_manager_init_v11(&dqm->asic_ops); else if (KFD_GC_VERSION(dev) >= IP_VERSION(10, 1, 1)) device_queue_manager_init_v10(&dqm->asic_ops); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index cf7e182588f8..3b9b8eabaacc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -152,7 +152,6 @@ struct device_queue_manager_ops { int (*initialize)(struct device_queue_manager *dqm); int (*start)(struct device_queue_manager *dqm); int (*stop)(struct device_queue_manager *dqm); - void (*pre_reset)(struct device_queue_manager *dqm); void (*uninitialize)(struct device_queue_manager *dqm); int (*create_kernel_queue)(struct device_queue_manager *dqm, struct kernel_queue *kq, @@ -277,6 +276,8 @@ void device_queue_manager_init_v10( struct device_queue_manager_asic_ops *asic_ops); void device_queue_manager_init_v11( struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_v12( + struct device_queue_manager_asic_ops *asic_ops); void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd); unsigned int get_cp_queues_num(struct device_queue_manager *dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c new file mode 100644 index 000000000000..4f3295b29dfb --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v12.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" +#include "gc/gc_12_0_0_sh_mask.h" +#include "soc24_enum.h" + +static int update_qpd_v12(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm_v12(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); + +void device_queue_manager_init_v12( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->update_qpd = update_qpd_v12; + asic_ops->init_sdma_vm = init_sdma_vm_v12; + asic_ops->mqd_manager_init = mqd_manager_init_v12; +} + +static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) +{ + uint32_t shared_base = pdd->lds_base >> 48; + uint32_t private_base = pdd->scratch_base >> 48; + + return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | + private_base; +} + +static int update_qpd_v12(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + (SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | + (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); + + pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + + return 0; +} + +static void init_sdma_vm_v12(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + /* Not needed on SDMAv4 onwards any more */ + q->properties.sdma_vm_addr = 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c index 54eb1bff903c..210bcc048f4c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c @@ -63,7 +63,8 @@ static int update_qpd_v9(struct device_queue_manager *dqm, if (dqm->dev->kfd->noretry) qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; - if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dqm->dev->kfd) == IP_VERSION(9, 4, 4)) qpd->sh_mem_config |= (1 << SH_MEM_CONFIG__F8_MODE__SHIFT); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 4a64307bc438..dbcb60eb54b2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -380,7 +380,8 @@ int kfd_init_apertures(struct kfd_process *process) pdd = kfd_create_process_device_data(dev, process); if (!pdd) { - pr_err("Failed to create process device data\n"); + dev_err(dev->adev->dev, + "Failed to create process device data\n"); return -ENOMEM; } /* diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index e1c21d250611..a9c3580be8c9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -27,6 +27,7 @@ #include "soc15_int.h" #include "kfd_device_queue_manager.h" #include "kfd_smi_events.h" +#include "amdgpu_ras.h" /* * GFX9 SQ Interrupts @@ -144,9 +145,11 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, uint16_t pasid, uint16_t client_id) { enum amdgpu_ras_block block = 0; - int old_poison; uint32_t reset = 0; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION; + u64 event_id; + int old_poison, ret; if (!p) return; @@ -164,7 +167,11 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, case SOC15_IH_CLIENTID_SE3SH: case SOC15_IH_CLIENTID_UTCL2: block = AMDGPU_RAS_BLOCK__GFX; - reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || + amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; case SOC15_IH_CLIENTID_VMC: case SOC15_IH_CLIENTID_VMC1: @@ -177,7 +184,11 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, case SOC15_IH_CLIENTID_SDMA3: case SOC15_IH_CLIENTID_SDMA4: block = AMDGPU_RAS_BLOCK__SDMA; - reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || + amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; default: dev_warn(dev->adev->dev, @@ -185,10 +196,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, return; } + ret = amdgpu_ras_mark_ras_event(dev->adev, type); + if (ret) + return; + kfd_signal_poison_consumed_event(dev, pasid); - dev_warn(dev->adev->dev, - "poison is consumed by client %d, kick off gpu reset flow\n", client_id); + event_id = amdgpu_ras_acquire_event_id(dev->adev, type); + + RAS_EVENT_LOG(dev->adev, event_id, + "poison is consumed by client %d, kick off gpu reset flow\n", client_id); amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev, block, pasid, NULL, NULL, reset); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 32c926986dbb..4843dcb9a5f7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -32,6 +32,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_pm4_headers.h" #include "kfd_pm4_opcodes.h" +#include "amdgpu_reset.h" #define PM4_COUNT_ZERO (((1 << 15) - 1) << 16) @@ -68,7 +69,7 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; break; default: - pr_err("Invalid queue type %d\n", type); + dev_err(dev->adev->dev, "Invalid queue type %d\n", type); return false; } @@ -78,13 +79,14 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev, prop.doorbell_ptr = kfd_get_kernel_doorbell(dev->kfd, &prop.doorbell_off); if (!prop.doorbell_ptr) { - pr_err("Failed to initialize doorbell"); + dev_err(dev->adev->dev, "Failed to initialize doorbell"); goto err_get_kernel_doorbell; } retval = kfd_gtt_sa_allocate(dev, queue_size, &kq->pq); if (retval != 0) { - pr_err("Failed to init pq queues size %d\n", queue_size); + dev_err(dev->adev->dev, "Failed to init pq queues size %d\n", + queue_size); goto err_pq_allocate_vidmem; } @@ -196,15 +198,17 @@ err_get_kernel_doorbell: } /* Uninitialize a kernel queue and free all its memory usages. */ -static void kq_uninitialize(struct kernel_queue *kq, bool hanging) +static void kq_uninitialize(struct kernel_queue *kq) { - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging) + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && down_read_trylock(&kq->dev->adev->reset_domain->sem)) { kq->mqd_mgr->destroy_mqd(kq->mqd_mgr, kq->queue->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_RESET, KFD_UNMAP_LATENCY_MS, kq->queue->pipe, kq->queue->queue); + up_read(&kq->dev->adev->reset_domain->sem); + } else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ) kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj); @@ -338,15 +342,15 @@ struct kernel_queue *kernel_queue_init(struct kfd_node *dev, if (kq_initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) return kq; - pr_err("Failed to init kernel queue\n"); + dev_err(dev->adev->dev, "Failed to init kernel queue\n"); kfree(kq); return NULL; } -void kernel_queue_uninit(struct kernel_queue *kq, bool hanging) +void kernel_queue_uninit(struct kernel_queue *kq) { - kq_uninitialize(kq, hanging); + kq_uninitialize(kq); kfree(kq); } @@ -357,26 +361,26 @@ static __attribute__((unused)) void test_kq(struct kfd_node *dev) uint32_t *buffer, i; int retval; - pr_err("Starting kernel queue test\n"); + dev_err(dev->adev->dev, "Starting kernel queue test\n"); kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ); if (unlikely(!kq)) { - pr_err(" Failed to initialize HIQ\n"); - pr_err("Kernel queue test failed\n"); + dev_err(dev->adev->dev, " Failed to initialize HIQ\n"); + dev_err(dev->adev->dev, "Kernel queue test failed\n"); return; } retval = kq_acquire_packet_buffer(kq, 5, &buffer); if (unlikely(retval != 0)) { - pr_err(" Failed to acquire packet buffer\n"); - pr_err("Kernel queue test failed\n"); + dev_err(dev->adev->dev, " Failed to acquire packet buffer\n"); + dev_err(dev->adev->dev, "Kernel queue test failed\n"); return; } for (i = 0; i < 5; i++) buffer[i] = kq->nop_packet; kq_submit_packet(kq); - pr_err("Ending kernel queue test\n"); + dev_err(dev->adev->dev, "Ending kernel queue test\n"); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index 8746a61a852d..50a81da43ce1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -118,18 +118,20 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, * attention grabbing. */ if (gfx_info->max_shader_engines > KFD_MAX_NUM_SE) { - pr_err("Exceeded KFD_MAX_NUM_SE, chip reports %d\n", - gfx_info->max_shader_engines); + dev_err(mm->dev->adev->dev, + "Exceeded KFD_MAX_NUM_SE, chip reports %d\n", + gfx_info->max_shader_engines); return; } if (gfx_info->max_sh_per_se > KFD_MAX_NUM_SH_PER_SE) { - pr_err("Exceeded KFD_MAX_NUM_SH, chip reports %d\n", + dev_err(mm->dev->adev->dev, + "Exceeded KFD_MAX_NUM_SH, chip reports %d\n", gfx_info->max_sh_per_se * gfx_info->max_shader_engines); return; } cu_bitmap_sh_mul = (KFD_GC_VERSION(mm->dev) >= IP_VERSION(11, 0, 0) && - KFD_GC_VERSION(mm->dev) < IP_VERSION(12, 0, 0)) ? 2 : 1; + KFD_GC_VERSION(mm->dev) < IP_VERSION(13, 0, 0)) ? 2 : 1; /* Count active CUs per SH. * diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c new file mode 100644 index 000000000000..b7a08e7a4423 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c @@ -0,0 +1,453 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "v12_structs.h" +#include "gc/gc_12_0_0_sh_mask.h" +#include "amdgpu_amdkfd.h" + +static inline struct v12_compute_mqd *get_mqd(void *mqd) +{ + return (struct v12_compute_mqd *)mqd; +} + +static inline struct v12_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct v12_sdma_mqd *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct mqd_update_info *minfo) +{ + struct v12_compute_mqd *m; + uint32_t se_mask[KFD_MAX_NUM_SE] = {0}; + + if (!minfo || !minfo->cu_mask.ptr) + return; + + mqd_symmetrically_map_cu_mask(mm, + minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask, 0); + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; + m->compute_static_thread_mgmt_se1 = se_mask[1]; + m->compute_static_thread_mgmt_se2 = se_mask[2]; + m->compute_static_thread_mgmt_se3 = se_mask[3]; + m->compute_static_thread_mgmt_se4 = se_mask[4]; + m->compute_static_thread_mgmt_se5 = se_mask[5]; + m->compute_static_thread_mgmt_se6 = se_mask[6]; + m->compute_static_thread_mgmt_se7 = se_mask[7]; + + pr_debug("update cu mask to %#x %#x %#x %#x %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3, + m->compute_static_thread_mgmt_se4, + m->compute_static_thread_mgmt_se5, + m->compute_static_thread_mgmt_se6, + m->compute_static_thread_mgmt_se7); +} + +static void set_priority(struct v12_compute_mqd *m, struct queue_properties *q) +{ + m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; + m->cp_hqd_queue_priority = q->priority; +} + +static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node, + struct queue_properties *q) +{ + struct kfd_mem_obj *mqd_mem_obj; + + /* + * Allocate one PAGE_SIZE memory for MQD as MES writes to areas beyond + * struct MQD size. + */ + if (kfd_gtt_sa_allocate(node, PAGE_SIZE, &mqd_mem_obj)) + return NULL; + + return mqd_mem_obj; +} + +static void init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct v12_compute_mqd *m; + + m = (struct v12_compute_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; + + memset(m, 0, PAGE_SIZE); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | + 0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; + + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + + /* Set cp_hqd_hq_status0.c_queue_debug_en to 1 to have the CP set up the + * DISPATCH_PTR. This is required for the kfd debugger + */ + m->cp_hqd_hq_status0 = 1 << 14; + + if (amdgpu_amdkfd_have_atomics_support(mm->dev->adev)) + m->cp_hqd_hq_status0 |= 1 << 29; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_aql_control = + 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; + } + + if (mm->dev->kfd->cwsr_enabled) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + mm->update_mqd(mm, m, q, NULL); +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + int r = 0; + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); + + r = mm->dev->kfd2kgd->hqd_load(mm->dev->adev, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, 0, mms, 0); + return r; +} + +static void update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + struct mqd_update_info *minfo) +{ + struct v12_compute_mqd *m; + + m = get_mqd(mqd); + + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= + ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = + q->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT; + + /* + * HW does not clamp this field correctly. Maximum EOP queue size + * is constrained by per-SE EOP done signal count, which is 8-bit. + * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit + * more than (EOP entry count - 1) so a queue size of 0x800 dwords + * is safe, giving a maximum field value of 0xA. + */ + m->cp_hqd_eop_control = min(0xA, + ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); + m->cp_hqd_eop_base_addr_lo = + lower_32_bits(q->eop_ring_buffer_address >> 8); + m->cp_hqd_eop_base_addr_hi = + upper_32_bits(q->eop_ring_buffer_address >> 8); + + m->cp_hqd_iq_timer = 0; + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + /* GC 10 removed WPP_CLAMP from PQ Control */ + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | + 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT; + m->cp_hqd_pq_doorbell_control |= + 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; + } + if (mm->dev->kfd->cwsr_enabled) + m->cp_hqd_ctx_save_control = 0; + + update_cu_mask(mm, mqd, minfo); + set_priority(m, q); + + q->is_active = QUEUE_IS_ACTIVE(*q); +} + +static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) +{ + struct v12_compute_mqd *m = (struct v12_compute_mqd *)mqd; + + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); +} + +static int get_wave_state(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + void __user *ctl_stack, + u32 *ctl_stack_used_size, + u32 *save_area_used_size) +{ + struct v12_compute_mqd *m; + struct mqd_user_context_save_area_header header; + + m = get_mqd(mqd); + + /* Control stack is written backwards, while workgroup context data + * is written forwards. Both starts from m->cp_hqd_cntl_stack_size. + * Current position is at m->cp_hqd_cntl_stack_offset and + * m->cp_hqd_wg_state_offset, respectively. + */ + *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - + m->cp_hqd_cntl_stack_offset; + *save_area_used_size = m->cp_hqd_wg_state_offset - + m->cp_hqd_cntl_stack_size; + + /* Control stack is not copied to user mode for GFXv12 because + * it's part of the context save area that is already + * accessible to user mode + */ + header.control_stack_size = *ctl_stack_used_size; + header.wave_state_size = *save_area_used_size; + + header.wave_state_offset = m->cp_hqd_wg_state_offset; + header.control_stack_offset = m->cp_hqd_cntl_stack_offset; + + if (copy_to_user(ctl_stack, &header, sizeof(header))) + return -EFAULT; + + return 0; +} + +static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v12_compute_mqd *m; + + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + + m = get_mqd(*mqd); + + m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | + 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; +} + +static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v12_sdma_mqd *m; + + m = (struct v12_sdma_mqd *) mqd_mem_obj->cpu_ptr; + + memset(m, 0, sizeof(struct v12_sdma_mqd)); + + *mqd = m; + if (gart_addr) + *gart_addr = mqd_mem_obj->gpu_addr; + + mm->update_mqd(mm, m, q, NULL); +} + +#define SDMA_RLC_DUMMY_DEFAULT 0xf + +static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q, + struct mqd_update_info *minfo) +{ + struct v12_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) + << SDMA0_QUEUE0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_QUEUE0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT | + 1 << SDMA0_QUEUE0_RB_CNTL__MCU_WPTR_POLL_ENABLE__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->sdmax_rlcx_rb_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + m->sdmax_rlcx_doorbell_offset = + q->doorbell_off << SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + + m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; + + q->is_active = QUEUE_IS_ACTIVE(*q); +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v12_compute_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v12_sdma_mqd), false); + return 0; +} + +#endif + +struct mqd_manager *mqd_manager_init_v12(enum KFD_MQD_TYPE type, + struct kfd_node *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd; + mqd->free_mqd = kfd_free_mqd_cp; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; + mqd->mqd_size = sizeof(struct v12_compute_mqd); + mqd->get_wave_state = get_wave_state; + mqd->mqd_stride = kfd_mqd_stride; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_HIQ: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = kfd_hiq_load_mqd_kiq; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; + mqd->mqd_size = sizeof(struct v12_compute_mqd); + mqd->mqd_stride = kfd_mqd_stride; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + mqd->check_preemption_failed = check_preemption_failed; + pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_DIQ: + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = kfd_free_mqd_cp; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; + mqd->mqd_size = sizeof(struct v12_compute_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd_sdma; + mqd->free_mqd = kfd_free_mqd_cp; + mqd->load_mqd = kfd_load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = kfd_destroy_mqd_sdma; + mqd->is_occupied = kfd_is_occupied_sdma; + mqd->mqd_size = sizeof(struct v12_sdma_mqd); + mqd->mqd_stride = kfd_mqd_stride; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 6bddc16808d7..66c73825c0a0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -77,7 +77,8 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, m->compute_static_thread_mgmt_se1 = se_mask[1]; m->compute_static_thread_mgmt_se2 = se_mask[2]; m->compute_static_thread_mgmt_se3 = se_mask[3]; - if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3)) { + if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 4)) { m->compute_static_thread_mgmt_se4 = se_mask[4]; m->compute_static_thread_mgmt_se5 = se_mask[5]; m->compute_static_thread_mgmt_se6 = se_mask[6]; @@ -299,7 +300,8 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, if (mm->dev->kfd->cwsr_enabled && q->ctx_save_restore_area_address) m->cp_hqd_ctx_save_control = 0; - if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(mm->dev) != IP_VERSION(9, 4, 4)) update_cu_mask(mm, mqd, minfo, 0); set_priority(m, q); @@ -544,6 +546,9 @@ static void init_mqd_hiq_v9_4_3(struct mqd_manager *mm, void **mqd, m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; + if (amdgpu_sriov_vf(mm->dev->adev)) + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT; m->cp_mqd_stride_size = kfd_hiq_mqd_stride(mm->dev); if (xcc == 0) { /* Set no_update_rptr = 0 in Master XCC */ @@ -713,7 +718,7 @@ static void update_mqd_v9_4_3(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd + size * xcc); update_mqd(mm, m, q, minfo); - update_cu_mask(mm, mqd, minfo, xcc); + update_cu_mask(mm, m, minfo, xcc); if (q->format == KFD_QUEUE_FORMAT_AQL) { switch (xcc) { @@ -875,7 +880,8 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) { + if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4)) { mqd->init_mqd = init_mqd_v9_4_3; mqd->load_mqd = load_mqd_v9_4_3; mqd->update_mqd = update_mqd_v9_4_3; @@ -899,7 +905,8 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif - if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) { + if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4)) { mqd->init_mqd = init_mqd_hiq_v9_4_3; mqd->load_mqd = hiq_load_mqd_kiq_v9_4_3; mqd->destroy_mqd = destroy_hiq_mqd_v9_4_3; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index d6f65f39072b..37930629edc5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -45,7 +45,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm, unsigned int process_count, queue_count, compute_queue_count, gws_queue_count; unsigned int map_queue_size; unsigned int max_proc_per_quantum = 1; - struct kfd_node *dev = pm->dqm->dev; + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; process_count = pm->dqm->processes_count; queue_count = pm->dqm->active_queue_count; @@ -59,14 +60,14 @@ static void pm_calc_rlib_size(struct packet_manager *pm, */ *over_subscription = false; - if (dev->max_proc_per_quantum > 1) - max_proc_per_quantum = dev->max_proc_per_quantum; + if (node->max_proc_per_quantum > 1) + max_proc_per_quantum = node->max_proc_per_quantum; if ((process_count > max_proc_per_quantum) || compute_queue_count > get_cp_queues_num(pm->dqm) || gws_queue_count > 1) { *over_subscription = true; - pr_debug("Over subscribed runlist\n"); + dev_dbg(dev, "Over subscribed runlist\n"); } map_queue_size = pm->pmf->map_queues_size; @@ -81,7 +82,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm, if (*over_subscription) *rlib_size += pm->pmf->runlist_size; - pr_debug("runlist ib size %d\n", *rlib_size); + dev_dbg(dev, "runlist ib size %d\n", *rlib_size); } static int pm_allocate_runlist_ib(struct packet_manager *pm, @@ -90,6 +91,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, unsigned int *rl_buffer_size, bool *is_over_subscription) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; int retval; if (WARN_ON(pm->allocated)) @@ -99,11 +102,10 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, mutex_lock(&pm->lock); - retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, - &pm->ib_buffer_obj); + retval = kfd_gtt_sa_allocate(node, *rl_buffer_size, &pm->ib_buffer_obj); if (retval) { - pr_err("Failed to allocate runlist IB\n"); + dev_err(dev, "Failed to allocate runlist IB\n"); goto out; } @@ -125,6 +127,8 @@ static int pm_create_runlist_ib(struct packet_manager *pm, { unsigned int alloc_size_bytes; unsigned int *rl_buffer, rl_wptr, i; + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; int retval, processes_mapped; struct device_process_node *cur; struct qcm_process_device *qpd; @@ -142,7 +146,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, *rl_size_bytes = alloc_size_bytes; pm->ib_size_bytes = alloc_size_bytes; - pr_debug("Building runlist ib process count: %d queues count %d\n", + dev_dbg(dev, "Building runlist ib process count: %d queues count %d\n", pm->dqm->processes_count, pm->dqm->active_queue_count); /* build the run list ib packet */ @@ -150,7 +154,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, qpd = cur->qpd; /* build map process packet */ if (processes_mapped >= pm->dqm->processes_count) { - pr_debug("Not enough space left in runlist IB\n"); + dev_dbg(dev, "Not enough space left in runlist IB\n"); pm_release_ib(pm); return -ENOMEM; } @@ -167,7 +171,8 @@ static int pm_create_runlist_ib(struct packet_manager *pm, if (!kq->queue->properties.is_active) continue; - pr_debug("static_queue, mapping kernel q %d, is debug status %d\n", + dev_dbg(dev, + "static_queue, mapping kernel q %d, is debug status %d\n", kq->queue->queue, qpd->is_debug); retval = pm->pmf->map_queues(pm, @@ -186,7 +191,8 @@ static int pm_create_runlist_ib(struct packet_manager *pm, if (!q->properties.is_active) continue; - pr_debug("static_queue, mapping user queue %d, is debug status %d\n", + dev_dbg(dev, + "static_queue, mapping user queue %d, is debug status %d\n", q->queue, qpd->is_debug); retval = pm->pmf->map_queues(pm, @@ -203,11 +209,13 @@ static int pm_create_runlist_ib(struct packet_manager *pm, } } - pr_debug("Finished map process and queues to runlist\n"); + dev_dbg(dev, "Finished map process and queues to runlist\n"); if (is_over_subscription) { if (!pm->is_over_subscription) - pr_warn("Runlist is getting oversubscribed. Expect reduced ROCm performance.\n"); + dev_warn( + dev, + "Runlist is getting oversubscribed. Expect reduced ROCm performance.\n"); retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, alloc_size_bytes / sizeof(uint32_t), @@ -239,7 +247,8 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) break; default: if (KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 2) || - KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 3)) + KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 4)) pm->pmf = &kfd_aldebaran_pm_funcs; else if (KFD_GC_VERSION(dqm->dev) >= IP_VERSION(9, 0, 1)) pm->pmf = &kfd_v9_pm_funcs; @@ -262,16 +271,18 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) return 0; } -void pm_uninit(struct packet_manager *pm, bool hanging) +void pm_uninit(struct packet_manager *pm) { mutex_destroy(&pm->lock); - kernel_queue_uninit(pm->priv_queue, hanging); + kernel_queue_uninit(pm->priv_queue); pm->priv_queue = NULL; } int pm_send_set_resources(struct packet_manager *pm, struct scheduling_resources *res) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; uint32_t *buffer, size; int retval = 0; @@ -281,7 +292,7 @@ int pm_send_set_resources(struct packet_manager *pm, size / sizeof(uint32_t), (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, "Failed to allocate buffer on kernel queue\n"); retval = -ENOMEM; goto out; } @@ -343,6 +354,8 @@ fail_create_runlist_ib: int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, uint64_t fence_value) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; uint32_t *buffer, size; int retval = 0; @@ -354,7 +367,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, kq_acquire_packet_buffer(pm->priv_queue, size / sizeof(uint32_t), (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, "Failed to allocate buffer on kernel queue\n"); retval = -ENOMEM; goto out; } @@ -372,6 +385,8 @@ out: int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; int retval = 0; uint32_t *buffer, size; @@ -385,7 +400,8 @@ int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period) (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, + "Failed to allocate buffer on kernel queue\n"); retval = -ENOMEM; goto out; } @@ -406,6 +422,8 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_unmap_queues_filter filter, uint32_t filter_param, bool reset) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; uint32_t *buffer, size; int retval = 0; @@ -414,7 +432,7 @@ int pm_send_unmap_queue(struct packet_manager *pm, kq_acquire_packet_buffer(pm->priv_queue, size / sizeof(uint32_t), (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, "Failed to allocate buffer on kernel queue\n"); retval = -ENOMEM; goto out; } @@ -463,6 +481,8 @@ out: int pm_debugfs_hang_hws(struct packet_manager *pm) { + struct kfd_node *node = pm->dqm->dev; + struct device *dev = node->adev->dev; uint32_t *buffer, size; int r = 0; @@ -474,16 +494,16 @@ int pm_debugfs_hang_hws(struct packet_manager *pm) kq_acquire_packet_buffer(pm->priv_queue, size / sizeof(uint32_t), (unsigned int **)&buffer); if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); + dev_err(dev, "Failed to allocate buffer on kernel queue\n"); r = -ENOMEM; goto out; } memset(buffer, 0x55, size); kq_submit_packet(pm->priv_queue); - pr_info("Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.", - buffer[0], buffer[1], buffer[2], buffer[3], - buffer[4], buffer[5], buffer[6]); + dev_info(dev, "Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.", + buffer[0], buffer[1], buffer[2], buffer[3], buffer[4], + buffer[5], buffer[6]); out: mutex_unlock(&pm->lock); return r; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c index 8ee2bedd301a..00776f08351c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c @@ -213,7 +213,6 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, struct queue *q, bool is_static) { struct pm4_mes_map_queues *packet; - bool use_static = is_static; packet = (struct pm4_mes_map_queues *)buffer; memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); @@ -234,7 +233,7 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, switch (q->properties.type) { case KFD_QUEUE_TYPE_COMPUTE: - if (use_static) + if (is_static) packet->bitfields2.queue_type = queue_type__mes_map_queues__normal_latency_static_queue_vi; break; @@ -244,7 +243,6 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, break; case KFD_QUEUE_TYPE_SDMA: case KFD_QUEUE_TYPE_SDMA_XGMI: - use_static = false; /* no static queues under SDMA */ if (q->properties.sdma_engine_id < 2 && !pm_use_ext_eng(q->device->kfd)) packet->bitfields2.engine_sel = q->properties.sdma_engine_id + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index a81ef232fdef..2b3ec92981e8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -206,7 +206,8 @@ enum cache_policy { #define KFD_IS_SOC15(dev) ((KFD_GC_VERSION(dev)) >= (IP_VERSION(9, 0, 1))) #define KFD_SUPPORT_XNACK_PER_PROCESS(dev)\ ((KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2)) || \ - (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3))) + (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) || \ + (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4))) struct kfd_node; @@ -1128,7 +1129,8 @@ static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev, struct kfd_dev *dev = adev->kfd.dev; uint32_t i; - if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 4)) return dev->nodes[0]; for (i = 0; i < dev->num_nodes; i++) @@ -1293,11 +1295,13 @@ struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type, struct kfd_node *dev); struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, struct kfd_node *dev); +struct mqd_manager *mqd_manager_init_v12(enum KFD_MQD_TYPE type, + struct kfd_node *dev); struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_node *dev, enum kfd_queue_type type); -void kernel_queue_uninit(struct kernel_queue *kq, bool hanging); +void kernel_queue_uninit(struct kernel_queue *kq); int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid); /* Process Queue Manager */ @@ -1403,7 +1407,7 @@ extern const struct packet_manager_funcs kfd_v9_pm_funcs; extern const struct packet_manager_funcs kfd_aldebaran_pm_funcs; int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); -void pm_uninit(struct packet_manager *pm, bool hanging); +void pm_uninit(struct packet_manager *pm); int pm_send_set_resources(struct packet_manager *pm, struct scheduling_resources *res); int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 451bb058cc62..17e42161b015 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1311,7 +1311,8 @@ int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep) if (IS_ERR_VALUE(qpd->tba_addr)) { int err = qpd->tba_addr; - pr_err("Failure to set tba address. error %d.\n", err); + dev_err(dev->adev->dev, + "Failure to set tba address. error %d.\n", err); qpd->tba_addr = 0; qpd->cwsr_kaddr = NULL; return err; @@ -1611,7 +1612,8 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev, &pdd->proc_ctx_cpu_ptr, false); if (retval) { - pr_err("failed to allocate process context bo\n"); + dev_err(dev->adev->dev, + "failed to allocate process context bo\n"); goto err_free_pdd; } memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); @@ -1676,7 +1678,7 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd, &p->kgd_process_info, &ef); if (ret) { - pr_err("Failed to create process VM object\n"); + dev_err(dev->adev->dev, "Failed to create process VM object\n"); return ret; } RCU_INIT_POINTER(p->ef, ef); @@ -1723,7 +1725,7 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_node *dev, pdd = kfd_get_process_device_data(dev, p); if (!pdd) { - pr_err("Process device data doesn't exist\n"); + dev_err(dev->adev->dev, "Process device data doesn't exist\n"); return ERR_PTR(-ENOMEM); } @@ -1833,6 +1835,7 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger) for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; + struct device *dev = pdd->dev->adev->dev; kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread->pid, trigger); @@ -1844,7 +1847,7 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger) * them been add back since they actually not be saved right now. */ if (r && r != -EIO) { - pr_err("Failed to evict process queues\n"); + dev_err(dev, "Failed to evict process queues\n"); goto fail; } n_evicted++; @@ -1866,7 +1869,8 @@ fail: if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, &pdd->qpd)) - pr_err("Failed to restore queues\n"); + dev_err(pdd->dev->adev->dev, + "Failed to restore queues\n"); n_evicted--; } @@ -1882,13 +1886,14 @@ int kfd_process_restore_queues(struct kfd_process *p) for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; + struct device *dev = pdd->dev->adev->dev; kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid); r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, &pdd->qpd); if (r) { - pr_err("Failed to restore process queues\n"); + dev_err(dev, "Failed to restore process queues\n"); if (!ret) ret = r; } @@ -2065,7 +2070,7 @@ int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process, struct qcm_process_device *qpd; if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { - pr_err("Incorrect CWSR mapping size.\n"); + dev_err(dev->adev->dev, "Incorrect CWSR mapping size.\n"); return -EINVAL; } @@ -2077,7 +2082,8 @@ int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process, qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(KFD_CWSR_TBA_TMA_SIZE)); if (!qpd->cwsr_kaddr) { - pr_err("Error allocating per process CWSR buffer.\n"); + dev_err(dev->adev->dev, + "Error allocating per process CWSR buffer.\n"); return -ENOMEM; } @@ -2109,7 +2115,8 @@ int kfd_process_drain_interrupts(struct kfd_process_device *pdd) /* * For GFX 9.4.3, send the NodeId also in IH cookie DW[3] */ - if (KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 3)) { + if (KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(pdd->dev->kfd) == IP_VERSION(9, 4, 4)) { node_id = ffs(pdd->dev->interrupt_bitmap) - 1; irq_drain_fence[3] |= node_id << 16; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 4858112f9a53..21f5a1fb3bf8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -28,6 +28,7 @@ #include "kfd_priv.h" #include "kfd_kernel_queue.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_reset.h" static inline struct process_queue_node *get_queue_by_qid( struct process_queue_manager *pqm, unsigned int qid) @@ -87,8 +88,12 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) return; dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd); - if (dev->kfd->shared_resources.enable_mes) - amdgpu_mes_flush_shader_debugger(dev->adev, pdd->proc_ctx_gpu_addr); + if (dev->kfd->shared_resources.enable_mes && + down_read_trylock(&dev->adev->reset_domain->sem)) { + amdgpu_mes_flush_shader_debugger(dev->adev, + pdd->proc_ctx_gpu_addr); + up_read(&dev->adev->reset_domain->sem); + } pdd->already_dequeued = true; } @@ -126,7 +131,9 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, if (!gws && pdd->qpd.num_gws == 0) return -EINVAL; - if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && !dev->kfd->shared_resources.enable_mes) { + if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 4) && + !dev->kfd->shared_resources.enable_mes) { if (gws) ret = amdgpu_amdkfd_add_gws_to_process(pdd->process->kgd_process_info, gws, &mem); @@ -189,6 +196,7 @@ static void pqm_clean_queue_resource(struct process_queue_manager *pqm, if (pqn->q->gws) { if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) && + KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 4) && !dev->kfd->shared_resources.enable_mes) amdgpu_amdkfd_remove_gws_from_process( pqm->process->kgd_process_info, pqn->q->gws); @@ -290,7 +298,8 @@ int pqm_create_queue(struct process_queue_manager *pqm, * On GFX 9.4.3, increase the number of queues that * can be created to 255. No HWS limit on GFX 9.4.3. */ - if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 4)) max_queues = 255; q = NULL; @@ -430,7 +439,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, err_create_queue: uninit_queue(q); if (kq) - kernel_queue_uninit(kq, false); + kernel_queue_uninit(kq); kfree(pqn); err_allocate_pqn: /* check if queues list is empty unregister process from device */ @@ -477,7 +486,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) /* destroy kernel queue (DIQ) */ dqm = pqn->kq->dev->dqm; dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd); - kernel_queue_uninit(pqn->kq, false); + kernel_queue_uninit(pqn->kq); } if (pqn->q) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 06ac835190f9..ea6a8e43bd5b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -29,6 +29,7 @@ #include "amdgpu_vm.h" #include "kfd_priv.h" #include "kfd_smi_events.h" +#include "amdgpu_reset.h" struct kfd_smi_client { struct list_head list; @@ -215,9 +216,11 @@ static void kfd_smi_event_add(pid_t pid, struct kfd_node *dev, add_event_to_kfifo(pid, dev, event, fifo_in, len); } -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, + struct amdgpu_reset_context *reset_context) { unsigned int event; + char reset_cause[64]; if (post_reset) { event = KFD_SMI_EVENT_GPU_POST_RESET; @@ -225,7 +228,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) event = KFD_SMI_EVENT_GPU_PRE_RESET; ++(dev->reset_seq_num); } - kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num); + + memset(reset_cause, 0, sizeof(reset_cause)); + + if (reset_context) + amdgpu_reset_get_desc(reset_context, reset_cause, + sizeof(reset_cause)); + + kfd_smi_event_add(0, dev, event, "%x %s\n", + dev->reset_seq_num, + reset_cause); } void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index fa95c2dfd587..85010b8307f8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -24,11 +24,14 @@ #ifndef KFD_SMI_EVENTS_H_INCLUDED #define KFD_SMI_EVENTS_H_INCLUDED +struct amdgpu_reset_context; + int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd); void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid); void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask); -void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset); +void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, + struct amdgpu_reset_context *reset_context); void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, unsigned long address, bool write_fault, ktime_t ts); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 31e500859ab0..bd9c2921e0dc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1171,7 +1171,6 @@ svm_range_get_pte_flags(struct kfd_node *node, bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN); bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENT); bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT; - bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/ unsigned int mtype_local; if (domain == SVM_RANGE_VRAM_DOMAIN) @@ -1213,15 +1212,14 @@ svm_range_get_pte_flags(struct kfd_node *node, } break; case IP_VERSION(9, 4, 3): + case IP_VERSION(9, 4, 4): if (ext_coherent) mtype_local = node->adev->rev_id ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_UC; else mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC : amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; snoop = true; - if (uncached) { - mapping_flags |= AMDGPU_VM_MTYPE_UC; - } else if (domain == SVM_RANGE_VRAM_DOMAIN) { + if (domain == SVM_RANGE_VRAM_DOMAIN) { /* local HBM region close to partition */ if (bo_node->adev == node->adev && (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id)) @@ -1248,6 +1246,16 @@ svm_range_get_pte_flags(struct kfd_node *node, mapping_flags |= AMDGPU_VM_MTYPE_UC; } break; + case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 0, 1): + if (domain == SVM_RANGE_VRAM_DOMAIN) { + if (bo_node != node) + mapping_flags |= AMDGPU_VM_MTYPE_NC; + } else { + mapping_flags |= coherent ? + AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; + } + break; default: mapping_flags |= coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; @@ -1263,6 +1271,8 @@ svm_range_get_pte_flags(struct kfd_node *node, pte_flags = AMDGPU_PTE_VALID; pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM; pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0; + if (KFD_GC_VERSION(node) >= IP_VERSION(12, 0, 0)) + pte_flags |= AMDGPU_PTE_IS_PTE; pte_flags |= amdgpu_gem_va_map_flags(node->adev, mapping_flags); return pte_flags; @@ -1658,7 +1668,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, start = map_start << PAGE_SHIFT; end = (map_last + 1) << PAGE_SHIFT; for (addr = start; !r && addr < end; ) { - struct hmm_range *hmm_range; + struct hmm_range *hmm_range = NULL; unsigned long map_start_vma; unsigned long map_last_vma; struct vm_area_struct *vma; @@ -1678,11 +1688,8 @@ static int svm_range_validate_and_map(struct mm_struct *mm, readonly, owner, NULL, &hmm_range); WRITE_ONCE(p->svms.faulting_task, NULL); - if (r) { + if (r) pr_debug("failed %d to get svm range pages\n", r); - if (r == -EBUSY) - r = -EAGAIN; - } } else { r = -EFAULT; } @@ -1696,7 +1703,12 @@ static int svm_range_validate_and_map(struct mm_struct *mm, } svm_range_lock(prange); - if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) { + + /* Free backing memory of hmm_range if it was initialized + * Overrride return value to TRY AGAIN only if prior returns + * were successful + */ + if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range) && !r) { pr_debug("hmm update the range, need validate again\n"); r = -EAGAIN; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index bc9eb847ecfe..6f89b06f89d3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -31,6 +31,7 @@ #include <linux/log2.h> #include <linux/dmi.h> #include <linux/atomic.h> +#include <linux/crc16.h> #include "kfd_priv.h" #include "kfd_crat.h" @@ -958,8 +959,7 @@ static void kfd_update_system_properties(void) dev = list_last_entry(&topology_device_list, struct kfd_topology_device, list); if (dev) { - sys_props.platform_id = - (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; + sys_props.platform_id = dev->oem_id64; sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); sys_props.platform_rev = dev->oem_revision; } @@ -1092,14 +1092,17 @@ void kfd_topology_shutdown(void) static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) { - uint32_t hashout; + uint32_t gpu_id; uint32_t buf[8]; uint64_t local_mem_size; - int i; + struct kfd_topology_device *dev; + bool is_unique; + uint8_t *crc_buf; if (!gpu) return 0; + crc_buf = (uint8_t *)&buf; local_mem_size = gpu->local_mem_info.local_mem_size_private + gpu->local_mem_info.local_mem_size_public; buf[0] = gpu->adev->pdev->devfn; @@ -1112,10 +1115,34 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) buf[6] = upper_32_bits(local_mem_size); buf[7] = (ffs(gpu->xcc_mask) - 1) | (NUM_XCC(gpu->xcc_mask) << 16); - for (i = 0, hashout = 0; i < 8; i++) - hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH); + gpu_id = crc16(0, crc_buf, sizeof(buf)) & + ((1 << KFD_GPU_ID_HASH_WIDTH) - 1); + + /* There is a very small possibility when generating a + * 16 (KFD_GPU_ID_HASH_WIDTH) bit value from 8 word buffer + * that the value could be 0 or non-unique. So, check if + * it is unique and non-zero. If not unique increment till + * unique one is found. In case of overflow, restart from 1 + */ - return hashout; + down_read(&topology_lock); + do { + is_unique = true; + if (!gpu_id) + gpu_id = 1; + list_for_each_entry(dev, &topology_device_list, list) { + if (dev->gpu && dev->gpu_id == gpu_id) { + is_unique = false; + break; + } + } + if (unlikely(!is_unique)) + gpu_id = (gpu_id + 1) & + ((1 << KFD_GPU_ID_HASH_WIDTH) - 1); + } while (!is_unique); + up_read(&topology_lock); + + return gpu_id; } /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If * the GPU device is not already present in the topology device @@ -1635,7 +1662,8 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext, pcache->cache_level = pcache_info[cache_type].cache_level; pcache->cacheline_size = pcache_info[cache_type].cache_line_size; - if (KFD_GC_VERSION(knode) == IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(knode) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(knode) == IP_VERSION(9, 4, 4)) mode = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); else mode = UNKNOWN_MEMORY_PARTITION_MODE; @@ -1772,7 +1800,7 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct pr_debug("Added [%d] GPU cache entries\n", num_of_entries); } -static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id, +static int kfd_topology_add_device_locked(struct kfd_node *gpu, struct kfd_topology_device **dev) { int proximity_domain = ++topology_crat_proximity_domain; @@ -1785,8 +1813,7 @@ static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id, COMPUTE_UNIT_GPU, gpu, proximity_domain); if (res) { - pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", - gpu_id); + dev_err(gpu->adev->dev, "Error creating VCRAT\n"); topology_crat_proximity_domain--; goto err; } @@ -1797,8 +1824,7 @@ static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id, &temp_topology_device_list, proximity_domain); if (res) { - pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", - gpu_id); + dev_err(gpu->adev->dev, "Error parsing VCRAT\n"); topology_crat_proximity_domain--; goto err; } @@ -1824,8 +1850,8 @@ static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t gpu_id, if (!res) sys_props.generation_count++; else - pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n", - gpu_id, res); + dev_err(gpu->adev->dev, "Failed to update GPU to sysfs topology. res=%d\n", + res); err: kfd_destroy_crat_image(crat_image); @@ -1908,7 +1934,8 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev) dev->node_props.debug_prop |= HSA_DBG_DISPATCH_INFO_ALWAYS_VALID; if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) { - if (KFD_GC_VERSION(dev->gpu) == IP_VERSION(9, 4, 3)) + if (KFD_GC_VERSION(dev->gpu) == IP_VERSION(9, 4, 3) || + KFD_GC_VERSION(dev->gpu) == IP_VERSION(9, 4, 4)) dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9_4_3 | HSA_DBG_WATCH_ADDR_MASK_HI_BIT_GFX9_4_3; @@ -1927,6 +1954,10 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev) if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0)) dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED; + + if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 0, 0)) + dev->node_props.capability |= + HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED; } kfd_topology_set_dbg_firmware_support(dev); @@ -1942,14 +1973,12 @@ int kfd_topology_add_device(struct kfd_node *gpu) struct amdgpu_gfx_config *gfx_info = &gpu->adev->gfx.config; struct amdgpu_cu_info *cu_info = &gpu->adev->gfx.cu_info; - gpu_id = kfd_generate_gpu_id(gpu); if (gpu->xcp && !gpu->xcp->ddev) { dev_warn(gpu->adev->dev, - "Won't add GPU (ID: 0x%x) to topology since it has no drm node assigned.", - gpu_id); + "Won't add GPU to topology since it has no drm node assigned."); return 0; } else { - pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); + dev_dbg(gpu->adev->dev, "Adding new GPU to topology\n"); } /* Check to see if this gpu device exists in the topology_device_list. @@ -1961,11 +1990,12 @@ int kfd_topology_add_device(struct kfd_node *gpu) down_write(&topology_lock); dev = kfd_assign_gpu(gpu); if (!dev) - res = kfd_topology_add_device_locked(gpu, gpu_id, &dev); + res = kfd_topology_add_device_locked(gpu, &dev); up_write(&topology_lock); if (res) return res; + gpu_id = kfd_generate_gpu_id(gpu); dev->gpu_id = gpu_id; gpu->id = gpu_id; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index 27386ce9a021..2d1c9d771bef 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -154,7 +154,10 @@ struct kfd_topology_device { struct attribute attr_gpuid; struct attribute attr_name; struct attribute attr_props; - uint8_t oem_id[CRAT_OEMID_LENGTH]; + union { + uint8_t oem_id[CRAT_OEMID_LENGTH]; + uint64_t oem_id64; + }; uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; uint32_t oem_revision; }; |