summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/Makefile2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu.h52
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c985
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h232
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c68
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c14
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h28
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c63
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c20
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c33
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c10
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c170
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h22
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c116
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h8
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c67
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c211
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h14
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c188
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c310
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c27
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c53
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c284
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h24
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c9
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ip.c143
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ip.h8
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_job.c13
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c9
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c38
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c486
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h107
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c70
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h15
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h23
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_object.h40
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c14
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c862
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h40
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c368
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c188
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h8
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c39
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h10
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_sa.h77
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c8
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c19
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h28
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h150
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c269
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h26
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c165
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h16
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c10
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c76
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c42
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c299
-rw-r--r--drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c63
-rw-r--r--drivers/gpu/drm/amd/amdgpu/atom.c66
-rw-r--r--drivers/gpu/drm/amd/amdgpu/atom.h3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/cik.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v10_0.c66
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v6_0.c57
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v8_0.c57
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c334
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c251
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c224
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c1097
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c125
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c19
-rw-r--r--drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c88
-rw-r--r--drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c92
-rw-r--r--drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_2.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mes_userqueue.c147
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mes_userqueue.h9
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mes_v11_0.c240
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mes_v12_0.c267
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mes_v12_1.c25
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c231
-rw-r--r--drivers/gpu/drm/amd/amdgpu/nv.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c62
-rw-r--r--drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c196
-rw-r--r--drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c18
-rw-r--r--drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c18
-rw-r--r--drivers/gpu/drm/amd/amdgpu/sdma_v7_1.c18
-rw-r--r--drivers/gpu/drm/amd/amdgpu/si.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/soc15.c9
-rw-r--r--drivers/gpu/drm/amd/amdgpu/soc15_common.h65
-rw-r--r--drivers/gpu/drm/amd/amdgpu/soc21.c12
-rw-r--r--drivers/gpu/drm/amd/amdgpu/soc24.c11
-rw-r--r--drivers/gpu/drm/amd/amdgpu/soc_v1_0.c14
-rw-r--r--drivers/gpu/drm/amd/amdgpu/tonga_ih.c40
-rw-r--r--drivers/gpu/drm/amd/amdgpu/umc_v12_0.c585
-rw-r--r--drivers/gpu/drm/amd/amdgpu/umc_v12_0.h28
-rw-r--r--drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c45
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vce_v3_0.c69
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c12
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c45
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c87
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c90
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vcn_v5_0_2.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vi.c22
117 files changed, 3397 insertions, 8260 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index ba80542ead9d..5100e35027ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -70,7 +70,7 @@ amdgpu-y += amdgpu_device.o amdgpu_reg_access.o amdgpu_doorbell_mgr.o amdgpu_kms
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o amdgpu_lockdep.o \
- amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o amdgpu_dev_coredump.o \
+ amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_dev_coredump.o \
amdgpu_cper.o amdgpu_userq_fence.o amdgpu_eviction_fence.o amdgpu_ip.o
amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 7b09410d6d8f..dd8ea71077af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -44,6 +44,7 @@
#include <linux/hashtable.h>
#include <linux/dma-fence.h>
#include <linux/pci.h>
+#include <linux/xarray.h>
#include <drm/ttm/ttm_bo.h>
#include <drm/ttm/ttm_placement.h>
@@ -103,7 +104,6 @@
#include "amdgpu_smuio.h"
#include "amdgpu_fdinfo.h"
#include "amdgpu_mca.h"
-#include "amdgpu_aca.h"
#include "amdgpu_ras.h"
#include "amdgpu_lockdep.h"
#include "amdgpu_cper.h"
@@ -113,6 +113,7 @@
#include "amdgpu_userq.h"
#include "amdgpu_eviction_fence.h"
#include "amdgpu_ip.h"
+#include "amdgpu_sa.h"
#if defined(CONFIG_DRM_AMD_ISP)
#include "amdgpu_isp.h"
#endif
@@ -272,7 +273,6 @@ extern int amdgpu_ptl;
extern uint amdgpu_hdmi_hpd_debounce_delay_ms;
-#define AMDGPU_VM_MAX_NUM_CTX 4096
#define AMDGPU_SG_THRESHOLD (256*1024*1024)
#define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS 3000
#define AMDGPU_MAX_USEC_TIMEOUT 100000 /* 100 ms */
@@ -305,9 +305,10 @@ extern uint amdgpu_hdmi_hpd_debounce_delay_ms;
/* reset mask */
#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, mode1/mode2/BACO/etc. */
-#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */
+#define AMDGPU_RESET_TYPE_SOFT_RECOVERY (1 << 1) /* soft recovery, eg. kill shaders */
#define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */
#define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */
+#define AMDGPU_RESET_TYPE_IP_BLOCK_SOFT_RESET (1 << 4) /* soft-resets an IP block */
/* max cursor sizes (in pixels) */
#define CIK_CURSOR_WIDTH 128
@@ -387,37 +388,6 @@ struct amdgpu_clock {
uint32_t max_pixel_clock;
};
-/* sub-allocation manager, it has to be protected by another lock.
- * By conception this is an helper for other part of the driver
- * like the indirect buffer or semaphore, which both have their
- * locking.
- *
- * Principe is simple, we keep a list of sub allocation in offset
- * order (first entry has offset == 0, last entry has the highest
- * offset).
- *
- * When allocating new object we first check if there is room at
- * the end total_size - (last_object_offset + last_object_size) >=
- * alloc_size. If so we allocate new object there.
- *
- * When there is not enough room at the end, we start waiting for
- * each sub object until we reach object_offset+object_size >=
- * alloc_size, this object then become the sub object we return.
- *
- * Alignment can't be bigger than page size.
- *
- * Hole are not considered for allocation to keep things simple.
- * Assumption is that there won't be hole (all object on same
- * alignment).
- */
-
-struct amdgpu_sa_manager {
- struct drm_suballoc_manager base;
- struct amdgpu_bo *bo;
- uint64_t gpu_addr;
- void *cpu_ptr;
-};
-
/*
* IRQS.
*/
@@ -446,8 +416,7 @@ struct amdgpu_fpriv {
struct amdgpu_bo_va *prt_va;
struct amdgpu_bo_va *csa_va;
struct amdgpu_bo_va *seq64_va;
- struct mutex bo_list_lock;
- struct idr bo_list_handles;
+ struct xarray bo_list_handles;
struct amdgpu_ctx_mgr ctx_mgr;
struct amdgpu_userq_mgr userq_mgr;
@@ -587,8 +556,6 @@ struct amdgpu_asic_funcs {
/* invalidate hdp read cache */
void (*invalidate_hdp)(struct amdgpu_device *adev,
struct amdgpu_ring *ring);
- /* check if the asic needs a full reset of if soft reset will work */
- bool (*need_full_reset)(struct amdgpu_device *adev);
/* initialize doorbell layout for specific asic*/
void (*init_doorbell_index)(struct amdgpu_device *adev);
/* PCIe bandwidth usage */
@@ -851,6 +818,7 @@ struct amdgpu_device {
struct dev_pm_domain vga_pm_domain;
bool have_disp_power_ref;
bool have_atomics_support;
+ bool is_sw_smu;
/* BIOS */
bool is_atom_fw;
@@ -1022,9 +990,6 @@ struct amdgpu_device {
/* MCA */
struct amdgpu_mca mca;
- /* ACA */
- struct amdgpu_aca aca;
-
/* CPER */
struct amdgpu_cper cper;
@@ -1136,6 +1101,8 @@ struct amdgpu_device {
bool debug_vm_userptr;
bool debug_disable_ce_logs;
bool debug_enable_ce_cs;
+ bool debug_hibernation_thaw_resume_gpu;
+ bool debug_disable_ip_block_soft_reset;
/* Protection for the following isolation structure */
struct mutex enforce_isolation_mutex;
@@ -1356,7 +1323,6 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
#define amdgpu_asic_read_bios_from_rom(adev, b, l) (adev)->asic_funcs->read_bios_from_rom((adev), (b), (l))
#define amdgpu_asic_read_register(adev, se, sh, offset, v)((adev)->asic_funcs->read_register((adev), (se), (sh), (offset), (v)))
#define amdgpu_asic_get_config_memsize(adev) (adev)->asic_funcs->get_config_memsize((adev))
-#define amdgpu_asic_need_full_reset(adev) (adev)->asic_funcs->need_full_reset((adev))
#define amdgpu_asic_init_doorbell_index(adev) (adev)->asic_funcs->init_doorbell_index((adev))
#define amdgpu_asic_get_pcie_usage(adev, cnt0, cnt1) ((adev)->asic_funcs->get_pcie_usage((adev), (cnt0), (cnt1)))
#define amdgpu_asic_need_reset_on_init(adev) (adev)->asic_funcs->need_reset_on_init((adev))
@@ -1468,6 +1434,8 @@ int amdgpu_enable_vblank_kms(struct drm_crtc *crtc);
void amdgpu_disable_vblank_kms(struct drm_crtc *crtc);
int amdgpu_info_ioctl(struct drm_device *dev, void *data,
struct drm_file *filp);
+int amdgpu_proc_options_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *filp);
/*
* functions used by amdgpu_encoder.c
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
deleted file mode 100644
index db7858fe0c3d..000000000000
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ /dev/null
@@ -1,985 +0,0 @@
-/*
- * Copyright 2023 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#include <linux/list.h>
-#include "amdgpu.h"
-#include "amdgpu_aca.h"
-#include "amdgpu_ras.h"
-
-#define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype}
-
-typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);
-
-static struct aca_hwip aca_hwid_mcatypes[ACA_HWIP_TYPE_COUNT] = {
- ACA_BANK_HWID(SMU, 0x01, 0x01),
- ACA_BANK_HWID(PCS_XGMI, 0x50, 0x00),
- ACA_BANK_HWID(UMC, 0x96, 0x00),
-};
-
-static void aca_banks_init(struct aca_banks *banks)
-{
- if (!banks)
- return;
-
- memset(banks, 0, sizeof(*banks));
- INIT_LIST_HEAD(&banks->list);
-}
-
-static int aca_banks_add_bank(struct aca_banks *banks, struct aca_bank *bank)
-{
- struct aca_bank_node *node;
-
- if (!bank)
- return -EINVAL;
-
- node = kvzalloc_obj(*node);
- if (!node)
- return -ENOMEM;
-
- memcpy(&node->bank, bank, sizeof(*bank));
-
- INIT_LIST_HEAD(&node->node);
- list_add_tail(&node->node, &banks->list);
-
- banks->nr_banks++;
-
- return 0;
-}
-
-static void aca_banks_release(struct aca_banks *banks)
-{
- struct aca_bank_node *node, *tmp;
-
- if (list_empty(&banks->list))
- return;
-
- list_for_each_entry_safe(node, tmp, &banks->list, node) {
- list_del(&node->node);
- kvfree(node);
- banks->nr_banks--;
- }
-}
-
-static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count)
-{
- struct amdgpu_aca *aca = &adev->aca;
- const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
-
- if (!count)
- return -EINVAL;
-
- if (!smu_funcs || !smu_funcs->get_valid_aca_count)
- return -EOPNOTSUPP;
-
- return smu_funcs->get_valid_aca_count(adev, type, count);
-}
-
-static struct aca_regs_dump {
- const char *name;
- int reg_idx;
-} aca_regs[] = {
- {"CONTROL", ACA_REG_IDX_CTL},
- {"STATUS", ACA_REG_IDX_STATUS},
- {"ADDR", ACA_REG_IDX_ADDR},
- {"MISC", ACA_REG_IDX_MISC0},
- {"CONFIG", ACA_REG_IDX_CONFIG},
- {"IPID", ACA_REG_IDX_IPID},
- {"SYND", ACA_REG_IDX_SYND},
- {"DESTAT", ACA_REG_IDX_DESTAT},
- {"DEADDR", ACA_REG_IDX_DEADDR},
- {"CONTROL_MASK", ACA_REG_IDX_CTL_MASK},
-};
-
-static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank,
- struct ras_query_context *qctx)
-{
- u64 event_id = qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID;
- int i;
-
- if (adev->debug_disable_ce_logs &&
- bank->smu_err_type == ACA_SMU_TYPE_CE &&
- !ACA_BANK_ERR_IS_DEFFERED(bank))
- return;
-
- RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
- /* plus 1 for output format, e.g: ACA[08/08]: xxxx */
- for (i = 0; i < ARRAY_SIZE(aca_regs); i++)
- RAS_EVENT_LOG(adev, event_id, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n",
- idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);
-
- if (ACA_REG__STATUS__SCRUB(bank->regs[ACA_REG_IDX_STATUS]))
- RAS_EVENT_LOG(adev, event_id, HW_ERR "hardware error logged by the scrubber\n");
-}
-
-static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type type)
-{
-
- struct aca_hwip *hwip;
- int hwid, mcatype;
- u64 ipid;
-
- if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
- return false;
-
- hwip = &aca_hwid_mcatypes[type];
- if (!hwip->hwid)
- return false;
-
- ipid = bank->regs[ACA_REG_IDX_IPID];
- hwid = ACA_REG__IPID__HARDWAREID(ipid);
- mcatype = ACA_REG__IPID__MCATYPE(ipid);
-
- return hwip->hwid == hwid && hwip->mcatype == mcatype;
-}
-
-static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,
- int start, int count,
- struct aca_banks *banks, struct ras_query_context *qctx)
-{
- struct amdgpu_aca *aca = &adev->aca;
- const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
- struct aca_bank bank;
- int i, max_count, ret;
-
- if (!count)
- return 0;
-
- if (!smu_funcs || !smu_funcs->get_valid_aca_bank)
- return -EOPNOTSUPP;
-
- switch (type) {
- case ACA_SMU_TYPE_UE:
- max_count = smu_funcs->max_ue_bank_count;
- break;
- case ACA_SMU_TYPE_CE:
- max_count = smu_funcs->max_ce_bank_count;
- break;
- default:
- return -EINVAL;
- }
-
- if (start + count > max_count)
- return -EINVAL;
-
- count = min_t(int, count, max_count);
- for (i = 0; i < count; i++) {
- memset(&bank, 0, sizeof(bank));
- ret = smu_funcs->get_valid_aca_bank(adev, type, start + i, &bank);
- if (ret)
- return ret;
-
- bank.smu_err_type = type;
-
- /*
- * Poison being consumed when injecting a UE while running background workloads,
- * which are unexpected.
- */
- if (type == ACA_SMU_TYPE_UE &&
- ACA_REG__STATUS__POISON(bank.regs[ACA_REG_IDX_STATUS]) &&
- !aca_bank_hwip_is_matched(&bank, ACA_HWIP_TYPE_UMC))
- continue;
-
- aca_smu_bank_dump(adev, i, count, &bank, qctx);
-
- ret = aca_banks_add_bank(banks, &bank);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
-{
- const struct aca_bank_ops *bank_ops = handle->bank_ops;
-
- /* Parse all deferred errors with UMC aca handle */
- if (ACA_BANK_ERR_IS_DEFFERED(bank))
- return handle->hwip == ACA_HWIP_TYPE_UMC;
-
- if (!aca_bank_hwip_is_matched(bank, handle->hwip))
- return false;
-
- if (!bank_ops->aca_bank_is_valid)
- return true;
-
- return bank_ops->aca_bank_is_valid(handle, bank, type, handle->data);
-}
-
-static struct aca_bank_error *new_bank_error(struct aca_error *aerr, struct aca_bank_info *info)
-{
- struct aca_bank_error *bank_error;
-
- bank_error = kvzalloc_obj(*bank_error);
- if (!bank_error)
- return NULL;
-
- INIT_LIST_HEAD(&bank_error->node);
- memcpy(&bank_error->info, info, sizeof(*info));
-
- mutex_lock(&aerr->lock);
- list_add_tail(&bank_error->node, &aerr->list);
- aerr->nr_errors++;
- mutex_unlock(&aerr->lock);
-
- return bank_error;
-}
-
-static struct aca_bank_error *find_bank_error(struct aca_error *aerr, struct aca_bank_info *info)
-{
- struct aca_bank_error *bank_error = NULL;
- struct aca_bank_info *tmp_info;
- bool found = false;
-
- mutex_lock(&aerr->lock);
- list_for_each_entry(bank_error, &aerr->list, node) {
- tmp_info = &bank_error->info;
- if (tmp_info->socket_id == info->socket_id &&
- tmp_info->die_id == info->die_id) {
- found = true;
- goto out_unlock;
- }
- }
-
-out_unlock:
- mutex_unlock(&aerr->lock);
-
- return found ? bank_error : NULL;
-}
-
-static void aca_bank_error_remove(struct aca_error *aerr, struct aca_bank_error *bank_error)
-{
- if (!aerr || !bank_error)
- return;
-
- list_del(&bank_error->node);
- aerr->nr_errors--;
-
- kvfree(bank_error);
-}
-
-static struct aca_bank_error *get_bank_error(struct aca_error *aerr, struct aca_bank_info *info)
-{
- struct aca_bank_error *bank_error;
-
- if (!aerr || !info)
- return NULL;
-
- bank_error = find_bank_error(aerr, info);
- if (bank_error)
- return bank_error;
-
- return new_bank_error(aerr, info);
-}
-
-int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info,
- enum aca_error_type type, u64 count)
-{
- struct aca_error_cache *error_cache = &handle->error_cache;
- struct aca_bank_error *bank_error;
- struct aca_error *aerr;
-
- if (!handle || !info || type >= ACA_ERROR_TYPE_COUNT)
- return -EINVAL;
-
- if (!count)
- return 0;
-
- aerr = &error_cache->errors[type];
- bank_error = get_bank_error(aerr, info);
- if (!bank_error)
- return -ENOMEM;
-
- bank_error->count += count;
-
- return 0;
-}
-
-static int aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
-{
- const struct aca_bank_ops *bank_ops = handle->bank_ops;
-
- if (!bank)
- return -EINVAL;
-
- if (!bank_ops->aca_bank_parser)
- return -EOPNOTSUPP;
-
- return bank_ops->aca_bank_parser(handle, bank, type,
- handle->data);
-}
-
-static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- int ret;
-
- ret = aca_bank_parser(handle, bank, type);
- if (ret)
- return ret;
-
- return 0;
-}
-
-static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank,
- enum aca_smu_type type, bank_handler_t handler, void *data)
-{
- struct aca_handle *handle;
- int ret;
-
- if (list_empty(&mgr->list))
- return 0;
-
- list_for_each_entry(handle, &mgr->list, node) {
- if (!aca_bank_is_valid(handle, bank, type))
- continue;
-
- ret = handler(handle, bank, type, data);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks,
- enum aca_smu_type type, bank_handler_t handler, void *data)
-{
- struct aca_bank_node *node;
- struct aca_bank *bank;
- int ret;
-
- if (!mgr || !banks)
- return -EINVAL;
-
- /* pre check to avoid unnecessary operations */
- if (list_empty(&mgr->list) || list_empty(&banks->list))
- return 0;
-
- list_for_each_entry(node, &banks->list, node) {
- bank = &node->bank;
-
- ret = aca_dispatch_bank(mgr, bank, type, handler, data);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type)
-{
- struct amdgpu_aca *aca = &adev->aca;
- bool ret = true;
-
- /*
- * Because the UE Valid MCA count will only be cleared after reset,
- * in order to avoid repeated counting of the error count,
- * the aca bank is only updated once during the gpu recovery stage.
- */
- if (type == ACA_SMU_TYPE_UE) {
- if (amdgpu_ras_intr_triggered())
- ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0;
- else
- atomic_set(&aca->ue_update_flag, 0);
- }
-
- return ret;
-}
-
-static void aca_banks_generate_cper(struct amdgpu_device *adev,
- enum aca_smu_type type,
- struct aca_banks *banks,
- int count)
-{
- struct aca_bank_node *node;
- struct aca_bank *bank;
- int r;
-
- if (!adev->cper.enabled)
- return;
-
- if (!banks || !count) {
- dev_warn(adev->dev, "fail to generate cper records\n");
- return;
- }
-
- /* UEs must be encoded into separate CPER entries */
- if (type == ACA_SMU_TYPE_UE) {
- struct aca_banks de_banks;
-
- aca_banks_init(&de_banks);
- list_for_each_entry(node, &banks->list, node) {
- bank = &node->bank;
- if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
- r = aca_banks_add_bank(&de_banks, bank);
- if (r)
- dev_warn(adev->dev, "fail to add de banks, ret = %d\n", r);
- } else {
- if (amdgpu_cper_generate_ue_record(adev, bank))
- dev_warn(adev->dev, "fail to generate ue cper records\n");
- }
- }
-
- if (!list_empty(&de_banks.list)) {
- if (amdgpu_cper_generate_ce_records(adev, &de_banks, de_banks.nr_banks))
- dev_warn(adev->dev, "fail to generate de cper records\n");
- }
-
- aca_banks_release(&de_banks);
- } else {
- /*
- * SMU_TYPE_CE banks are combined into 1 CPER entries,
- * they could be CEs or DEs or both
- */
- if (amdgpu_cper_generate_ce_records(adev, banks, count))
- dev_warn(adev->dev, "fail to generate ce cper records\n");
- }
-}
-
-static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
- bank_handler_t handler, struct ras_query_context *qctx, void *data)
-{
- struct amdgpu_aca *aca = &adev->aca;
- struct aca_banks banks;
- u32 count = 0;
- int ret;
-
- if (list_empty(&aca->mgr.list))
- return 0;
-
- if (!aca_bank_should_update(adev, type))
- return 0;
-
- ret = aca_smu_get_valid_aca_count(adev, type, &count);
- if (ret)
- return ret;
-
- if (!count)
- return 0;
-
- aca_banks_init(&banks);
-
- ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx);
- if (ret)
- goto err_release_banks;
-
- if (list_empty(&banks.list)) {
- ret = 0;
- goto err_release_banks;
- }
-
- ret = aca_dispatch_banks(&aca->mgr, &banks, type,
- handler, data);
- if (ret)
- goto err_release_banks;
-
- aca_banks_generate_cper(adev, type, &banks, count);
-
-err_release_banks:
- aca_banks_release(&banks);
-
- return ret;
-}
-
-static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_error_type type, struct ras_err_data *err_data)
-{
- struct aca_bank_info *info;
- struct amdgpu_smuio_mcm_config_info mcm_info;
- u64 count;
-
- if (type >= ACA_ERROR_TYPE_COUNT)
- return -EINVAL;
-
- count = bank_error->count;
- if (!count)
- return 0;
-
- info = &bank_error->info;
- mcm_info.die_id = info->die_id;
- mcm_info.socket_id = info->socket_id;
-
- switch (type) {
- case ACA_ERROR_TYPE_UE:
- amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, count);
- break;
- case ACA_ERROR_TYPE_CE:
- amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, count);
- break;
- case ACA_ERROR_TYPE_DEFERRED:
- amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, count);
- break;
- default:
- break;
- }
-
- return 0;
-}
-
-static int aca_log_aca_error(struct aca_handle *handle, enum aca_error_type type, struct ras_err_data *err_data)
-{
- struct aca_error_cache *error_cache = &handle->error_cache;
- struct aca_error *aerr = &error_cache->errors[type];
- struct aca_bank_error *bank_error, *tmp;
-
- mutex_lock(&aerr->lock);
-
- if (list_empty(&aerr->list))
- goto out_unlock;
-
- list_for_each_entry_safe(bank_error, tmp, &aerr->list, node) {
- aca_log_aca_error_data(bank_error, type, err_data);
- aca_bank_error_remove(aerr, bank_error);
- }
-
-out_unlock:
- mutex_unlock(&aerr->lock);
-
- return 0;
-}
-
-static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type,
- struct ras_err_data *err_data, struct ras_query_context *qctx)
-{
- enum aca_smu_type smu_type;
- int ret;
-
- switch (type) {
- case ACA_ERROR_TYPE_UE:
- smu_type = ACA_SMU_TYPE_UE;
- break;
- case ACA_ERROR_TYPE_CE:
- case ACA_ERROR_TYPE_DEFERRED:
- smu_type = ACA_SMU_TYPE_CE;
- break;
- default:
- return -EINVAL;
- }
-
- /* update aca bank to aca source error_cache first */
- ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL);
- if (ret)
- return ret;
-
- /* DEs may contain in CEs or UEs */
- if (type != ACA_ERROR_TYPE_DEFERRED)
- aca_log_aca_error(handle, ACA_ERROR_TYPE_DEFERRED, err_data);
-
- return aca_log_aca_error(handle, type, err_data);
-}
-
-static bool aca_handle_is_valid(struct aca_handle *handle)
-{
- if (!handle->mask || !list_empty(&handle->node))
- return false;
-
- return true;
-}
-
-int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle,
- enum aca_error_type type, struct ras_err_data *err_data,
- struct ras_query_context *qctx)
-{
- if (!handle || !err_data)
- return -EINVAL;
-
- if (aca_handle_is_valid(handle))
- return -EOPNOTSUPP;
-
- if ((type < 0) || (!(BIT(type) & handle->mask)))
- return 0;
-
- return __aca_get_error_data(adev, handle, type, err_data, qctx);
-}
-
-static void aca_error_init(struct aca_error *aerr, enum aca_error_type type)
-{
- mutex_init(&aerr->lock);
- INIT_LIST_HEAD(&aerr->list);
- aerr->type = type;
- aerr->nr_errors = 0;
-}
-
-static void aca_init_error_cache(struct aca_handle *handle)
-{
- struct aca_error_cache *error_cache = &handle->error_cache;
- int type;
-
- for (type = ACA_ERROR_TYPE_UE; type < ACA_ERROR_TYPE_COUNT; type++)
- aca_error_init(&error_cache->errors[type], type);
-}
-
-static void aca_error_fini(struct aca_error *aerr)
-{
- struct aca_bank_error *bank_error, *tmp;
-
- mutex_lock(&aerr->lock);
- if (list_empty(&aerr->list))
- goto out_unlock;
-
- list_for_each_entry_safe(bank_error, tmp, &aerr->list, node)
- aca_bank_error_remove(aerr, bank_error);
-
-out_unlock:
- mutex_unlock(&aerr->lock);
- mutex_destroy(&aerr->lock);
-}
-
-static void aca_fini_error_cache(struct aca_handle *handle)
-{
- struct aca_error_cache *error_cache = &handle->error_cache;
- int type;
-
- for (type = ACA_ERROR_TYPE_UE; type < ACA_ERROR_TYPE_COUNT; type++)
- aca_error_fini(&error_cache->errors[type]);
-}
-
-static int add_aca_handle(struct amdgpu_device *adev, struct aca_handle_manager *mgr, struct aca_handle *handle,
- const char *name, const struct aca_info *ras_info, void *data)
-{
- memset(handle, 0, sizeof(*handle));
-
- handle->adev = adev;
- handle->mgr = mgr;
- handle->name = name;
- handle->hwip = ras_info->hwip;
- handle->mask = ras_info->mask;
- handle->bank_ops = ras_info->bank_ops;
- handle->data = data;
- aca_init_error_cache(handle);
-
- INIT_LIST_HEAD(&handle->node);
- list_add_tail(&handle->node, &mgr->list);
- mgr->nr_handles++;
-
- return 0;
-}
-
-static ssize_t aca_sysfs_read(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct aca_handle *handle = container_of(attr, struct aca_handle, aca_attr);
-
- /* NOTE: the aca cache will be auto cleared once read,
- * So the driver should unify the query entry point, forward request to ras query interface directly */
- return amdgpu_ras_aca_sysfs_read(dev, attr, handle, buf, handle->data);
-}
-
-static int add_aca_sysfs(struct amdgpu_device *adev, struct aca_handle *handle)
-{
- struct device_attribute *aca_attr = &handle->aca_attr;
-
- snprintf(handle->attr_name, sizeof(handle->attr_name) - 1, "aca_%s", handle->name);
- aca_attr->show = aca_sysfs_read;
- aca_attr->attr.name = handle->attr_name;
- aca_attr->attr.mode = S_IRUGO;
- sysfs_attr_init(&aca_attr->attr);
-
- return sysfs_add_file_to_group(&adev->dev->kobj,
- &aca_attr->attr,
- "ras");
-}
-
-int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle,
- const char *name, const struct aca_info *ras_info, void *data)
-{
- struct amdgpu_aca *aca = &adev->aca;
- int ret;
-
- if (!amdgpu_aca_is_enabled(adev))
- return 0;
-
- ret = add_aca_handle(adev, &aca->mgr, handle, name, ras_info, data);
- if (ret)
- return ret;
-
- return add_aca_sysfs(adev, handle);
-}
-
-static void remove_aca_handle(struct aca_handle *handle)
-{
- struct aca_handle_manager *mgr = handle->mgr;
-
- aca_fini_error_cache(handle);
- list_del(&handle->node);
- mgr->nr_handles--;
-}
-
-static void remove_aca_sysfs(struct aca_handle *handle)
-{
- struct amdgpu_device *adev = handle->adev;
- struct device_attribute *aca_attr = &handle->aca_attr;
-
- if (adev->dev->kobj.sd)
- sysfs_remove_file_from_group(&adev->dev->kobj,
- &aca_attr->attr,
- "ras");
-}
-
-void amdgpu_aca_remove_handle(struct aca_handle *handle)
-{
- if (!handle || list_empty(&handle->node))
- return;
-
- remove_aca_sysfs(handle);
- remove_aca_handle(handle);
-}
-
-static int aca_manager_init(struct aca_handle_manager *mgr)
-{
- INIT_LIST_HEAD(&mgr->list);
- mgr->nr_handles = 0;
-
- return 0;
-}
-
-static void aca_manager_fini(struct aca_handle_manager *mgr)
-{
- struct aca_handle *handle, *tmp;
-
- if (list_empty(&mgr->list))
- return;
-
- list_for_each_entry_safe(handle, tmp, &mgr->list, node)
- amdgpu_aca_remove_handle(handle);
-}
-
-bool amdgpu_aca_is_enabled(struct amdgpu_device *adev)
-{
- return (adev->aca.is_enabled ||
- adev->debug_enable_ras_aca);
-}
-
-int amdgpu_aca_init(struct amdgpu_device *adev)
-{
- struct amdgpu_aca *aca = &adev->aca;
- int ret;
-
- atomic_set(&aca->ue_update_flag, 0);
-
- ret = aca_manager_init(&aca->mgr);
- if (ret)
- return ret;
-
- return 0;
-}
-
-void amdgpu_aca_fini(struct amdgpu_device *adev)
-{
- struct amdgpu_aca *aca = &adev->aca;
-
- aca_manager_fini(&aca->mgr);
-
- atomic_set(&aca->ue_update_flag, 0);
-}
-
-int amdgpu_aca_reset(struct amdgpu_device *adev)
-{
- struct amdgpu_aca *aca = &adev->aca;
-
- atomic_set(&aca->ue_update_flag, 0);
-
- return 0;
-}
-
-void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs)
-{
- struct amdgpu_aca *aca = &adev->aca;
-
- WARN_ON(aca->smu_funcs);
- aca->smu_funcs = smu_funcs;
-}
-
-int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info)
-{
- u64 ipid;
- u32 instidhi, instidlo;
-
- if (!bank || !info)
- return -EINVAL;
-
- ipid = bank->regs[ACA_REG_IDX_IPID];
- info->hwid = ACA_REG__IPID__HARDWAREID(ipid);
- info->mcatype = ACA_REG__IPID__MCATYPE(ipid);
- /*
- * Unfied DieID Format: SAASS. A:AID, S:Socket.
- * Unfied DieID[4:4] = InstanceId[0:0]
- * Unfied DieID[0:3] = InstanceIdHi[0:3]
- */
- instidhi = ACA_REG__IPID__INSTANCEIDHI(ipid);
- instidlo = ACA_REG__IPID__INSTANCEIDLO(ipid);
- info->die_id = ((instidhi >> 2) & 0x03);
- info->socket_id = ((instidlo & 0x1) << 2) | (instidhi & 0x03);
-
- return 0;
-}
-
-static int aca_bank_get_error_code(struct amdgpu_device *adev, struct aca_bank *bank)
-{
- struct amdgpu_aca *aca = &adev->aca;
- const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
-
- if (!smu_funcs || !smu_funcs->parse_error_code)
- return -EOPNOTSUPP;
-
- return smu_funcs->parse_error_code(adev, bank);
-}
-
-int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size)
-{
- int i, error_code;
-
- if (!bank || !err_codes)
- return -EINVAL;
-
- error_code = aca_bank_get_error_code(adev, bank);
- if (error_code < 0)
- return error_code;
-
- for (i = 0; i < size; i++) {
- if (err_codes[i] == error_code)
- return 0;
- }
-
- return -EINVAL;
-}
-
-int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en)
-{
- struct amdgpu_aca *aca = &adev->aca;
- const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
-
- if (!smu_funcs || !smu_funcs->set_debug_mode)
- return -EOPNOTSUPP;
-
- return smu_funcs->set_debug_mode(adev, en);
-}
-
-#if defined(CONFIG_DEBUG_FS)
-static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val)
-{
- struct amdgpu_device *adev = (struct amdgpu_device *)data;
- int ret;
-
- ret = amdgpu_ras_set_aca_debug_mode(adev, val ? true : false);
- if (ret)
- return ret;
-
- dev_info(adev->dev, "amdgpu set smu aca debug mode %s success\n", val ? "on" : "off");
-
- return 0;
-}
-
-static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_smu_type type, int idx)
-{
- struct aca_bank_info info;
- int i, ret;
-
- ret = aca_bank_info_decode(bank, &info);
- if (ret)
- return;
-
- seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_SMU_TYPE_UE ? "UE" : "CE");
- seq_printf(m, "aca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n",
- idx, info.socket_id, info.die_id, info.hwid, info.mcatype);
-
- for (i = 0; i < ARRAY_SIZE(aca_regs); i++)
- seq_printf(m, "aca entry[%d].regs[%d]: 0x%016llx\n", idx, aca_regs[i].reg_idx, bank->regs[aca_regs[i].reg_idx]);
-}
-
-struct aca_dump_context {
- struct seq_file *m;
- int idx;
-};
-
-static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- struct aca_dump_context *ctx = (struct aca_dump_context *)data;
-
- aca_dump_entry(ctx->m, bank, type, ctx->idx++);
-
- return handler_aca_log_bank_error(handle, bank, type, NULL);
-}
-
-static int aca_dump_show(struct seq_file *m, enum aca_smu_type type)
-{
- struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
- struct aca_dump_context context = {
- .m = m,
- .idx = 0,
- };
-
- return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context);
-}
-
-static int aca_dump_ce_show(struct seq_file *m, void *unused)
-{
- return aca_dump_show(m, ACA_SMU_TYPE_CE);
-}
-
-static int aca_dump_ce_open(struct inode *inode, struct file *file)
-{
- return single_open(file, aca_dump_ce_show, inode->i_private);
-}
-
-static const struct file_operations aca_ce_dump_debug_fops = {
- .owner = THIS_MODULE,
- .open = aca_dump_ce_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int aca_dump_ue_show(struct seq_file *m, void *unused)
-{
- return aca_dump_show(m, ACA_SMU_TYPE_UE);
-}
-
-static int aca_dump_ue_open(struct inode *inode, struct file *file)
-{
- return single_open(file, aca_dump_ue_show, inode->i_private);
-}
-
-static const struct file_operations aca_ue_dump_debug_fops = {
- .owner = THIS_MODULE,
- .open = aca_dump_ue_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-DEFINE_DEBUGFS_ATTRIBUTE(aca_debug_mode_fops, NULL, amdgpu_aca_smu_debug_mode_set, "%llu\n");
-#endif
-
-void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root)
-{
-#if defined(CONFIG_DEBUG_FS)
- if (!root)
- return;
-
- debugfs_create_file("aca_debug_mode", 0200, root, adev, &aca_debug_mode_fops);
- debugfs_create_file("aca_ue_dump", 0400, root, adev, &aca_ue_dump_debug_fops);
- debugfs_create_file("aca_ce_dump", 0400, root, adev, &aca_ce_dump_debug_fops);
-#endif
-}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
deleted file mode 100644
index 38c88897e1ec..000000000000
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright 2023 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#ifndef __AMDGPU_ACA_H__
-#define __AMDGPU_ACA_H__
-
-#include <linux/list.h>
-
-struct ras_err_data;
-struct ras_query_context;
-
-#define ACA_MAX_REGS_COUNT (16)
-
-#define ACA_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> l)
-#define ACA_REG__STATUS__VAL(x) ACA_REG_FIELD(x, 63, 63)
-#define ACA_REG__STATUS__OVERFLOW(x) ACA_REG_FIELD(x, 62, 62)
-#define ACA_REG__STATUS__UC(x) ACA_REG_FIELD(x, 61, 61)
-#define ACA_REG__STATUS__EN(x) ACA_REG_FIELD(x, 60, 60)
-#define ACA_REG__STATUS__MISCV(x) ACA_REG_FIELD(x, 59, 59)
-#define ACA_REG__STATUS__ADDRV(x) ACA_REG_FIELD(x, 58, 58)
-#define ACA_REG__STATUS__PCC(x) ACA_REG_FIELD(x, 57, 57)
-#define ACA_REG__STATUS__ERRCOREIDVAL(x) ACA_REG_FIELD(x, 56, 56)
-#define ACA_REG__STATUS__TCC(x) ACA_REG_FIELD(x, 55, 55)
-#define ACA_REG__STATUS__SYNDV(x) ACA_REG_FIELD(x, 53, 53)
-#define ACA_REG__STATUS__CECC(x) ACA_REG_FIELD(x, 46, 46)
-#define ACA_REG__STATUS__UECC(x) ACA_REG_FIELD(x, 45, 45)
-#define ACA_REG__STATUS__DEFERRED(x) ACA_REG_FIELD(x, 44, 44)
-#define ACA_REG__STATUS__POISON(x) ACA_REG_FIELD(x, 43, 43)
-#define ACA_REG__STATUS__SCRUB(x) ACA_REG_FIELD(x, 40, 40)
-#define ACA_REG__STATUS__ERRCOREID(x) ACA_REG_FIELD(x, 37, 32)
-#define ACA_REG__STATUS__ADDRLSB(x) ACA_REG_FIELD(x, 29, 24)
-#define ACA_REG__STATUS__ERRORCODEEXT(x) ACA_REG_FIELD(x, 21, 16)
-#define ACA_REG__STATUS__ERRORCODE(x) ACA_REG_FIELD(x, 15, 0)
-
-#define ACA_REG__IPID__MCATYPE(x) ACA_REG_FIELD(x, 63, 48)
-#define ACA_REG__IPID__INSTANCEIDHI(x) ACA_REG_FIELD(x, 47, 44)
-#define ACA_REG__IPID__HARDWAREID(x) ACA_REG_FIELD(x, 43, 32)
-#define ACA_REG__IPID__INSTANCEIDLO(x) ACA_REG_FIELD(x, 31, 0)
-
-#define ACA_REG__MISC0__VALID(x) ACA_REG_FIELD(x, 63, 63)
-#define ACA_REG__MISC0__OVRFLW(x) ACA_REG_FIELD(x, 48, 48)
-#define ACA_REG__MISC0__ERRCNT(x) ACA_REG_FIELD(x, 43, 32)
-
-#define ACA_REG__SYND__ERRORINFORMATION(x) ACA_REG_FIELD(x, 17, 0)
-
-/* NOTE: The following codes refers to the smu header file */
-#define ACA_EXTERROR_CODE_CE 0x3a
-#define ACA_EXTERROR_CODE_FAULT 0x3b
-
-#define ACA_ERROR_UE_MASK BIT_MASK(ACA_ERROR_TYPE_UE)
-#define ACA_ERROR_CE_MASK BIT_MASK(ACA_ERROR_TYPE_CE)
-#define ACA_ERROR_DEFERRED_MASK BIT_MASK(ACA_ERROR_TYPE_DEFERRED)
-
-#define mmSMNAID_AID0_MCA_SMU 0x03b30400 /* SMN AID AID0 */
-#define mmSMNAID_XCD0_MCA_SMU 0x36430400 /* SMN AID XCD0 */
-#define mmSMNAID_XCD1_MCA_SMU 0x38430400 /* SMN AID XCD1 */
-#define mmSMNXCD_XCD0_MCA_SMU 0x40430400 /* SMN XCD XCD0 */
-
-#define ACA_BANK_ERR_IS_DEFFERED(bank) \
- (ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
- ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS]))
-
-enum aca_reg_idx {
- ACA_REG_IDX_CTL = 0,
- ACA_REG_IDX_STATUS = 1,
- ACA_REG_IDX_ADDR = 2,
- ACA_REG_IDX_MISC0 = 3,
- ACA_REG_IDX_CONFIG = 4,
- ACA_REG_IDX_IPID = 5,
- ACA_REG_IDX_SYND = 6,
- ACA_REG_IDX_DESTAT = 8,
- ACA_REG_IDX_DEADDR = 9,
- ACA_REG_IDX_CTL_MASK = 10,
- ACA_REG_IDX_COUNT = 16,
-};
-
-enum aca_hwip_type {
- ACA_HWIP_TYPE_UNKNOW = -1,
- ACA_HWIP_TYPE_PSP = 0,
- ACA_HWIP_TYPE_UMC,
- ACA_HWIP_TYPE_SMU,
- ACA_HWIP_TYPE_PCS_XGMI,
- ACA_HWIP_TYPE_COUNT,
-};
-
-enum aca_error_type {
- ACA_ERROR_TYPE_INVALID = -1,
- ACA_ERROR_TYPE_UE = 0,
- ACA_ERROR_TYPE_CE,
- ACA_ERROR_TYPE_DEFERRED,
- ACA_ERROR_TYPE_COUNT
-};
-
-enum aca_smu_type {
- ACA_SMU_TYPE_INVALID = -1,
- ACA_SMU_TYPE_UE = 0,
- ACA_SMU_TYPE_CE,
- ACA_SMU_TYPE_COUNT,
-};
-
-struct aca_hwip {
- int hwid;
- int mcatype;
-};
-
-struct aca_bank {
- enum aca_error_type aca_err_type;
- enum aca_smu_type smu_err_type;
- u64 regs[ACA_MAX_REGS_COUNT];
-};
-
-struct aca_bank_node {
- struct aca_bank bank;
- struct list_head node;
-};
-
-struct aca_banks {
- int nr_banks;
- struct list_head list;
-};
-
-struct aca_bank_info {
- int die_id;
- int socket_id;
- int hwid;
- int mcatype;
-};
-
-struct aca_bank_error {
- struct list_head node;
- struct aca_bank_info info;
- u64 count;
-};
-
-struct aca_error {
- struct list_head list;
- struct mutex lock;
- enum aca_error_type type;
- int nr_errors;
-};
-
-struct aca_handle_manager {
- struct list_head list;
- int nr_handles;
-};
-
-struct aca_error_cache {
- struct aca_error errors[ACA_ERROR_TYPE_COUNT];
-};
-
-struct aca_handle {
- struct list_head node;
- enum aca_hwip_type hwip;
- struct amdgpu_device *adev;
- struct aca_handle_manager *mgr;
- struct aca_error_cache error_cache;
- const struct aca_bank_ops *bank_ops;
- struct device_attribute aca_attr;
- char attr_name[64];
- const char *name;
- u32 mask;
- void *data;
-};
-
-struct aca_bank_ops {
- int (*aca_bank_parser)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);
- bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type,
- void *data);
-};
-
-struct aca_smu_funcs {
- int max_ue_bank_count;
- int max_ce_bank_count;
- int (*set_debug_mode)(struct amdgpu_device *adev, bool enable);
- int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count);
- int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_smu_type type, int idx, struct aca_bank *bank);
- int (*parse_error_code)(struct amdgpu_device *adev, struct aca_bank *bank);
-};
-
-struct amdgpu_aca {
- struct aca_handle_manager mgr;
- const struct aca_smu_funcs *smu_funcs;
- atomic_t ue_update_flag;
- bool is_enabled;
-};
-
-struct aca_info {
- enum aca_hwip_type hwip;
- const struct aca_bank_ops *bank_ops;
- u32 mask;
-};
-
-int amdgpu_aca_init(struct amdgpu_device *adev);
-void amdgpu_aca_fini(struct amdgpu_device *adev);
-int amdgpu_aca_reset(struct amdgpu_device *adev);
-void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs);
-bool amdgpu_aca_is_enabled(struct amdgpu_device *adev);
-
-int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info);
-int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size);
-
-int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle,
- const char *name, const struct aca_info *aca_info, void *data);
-void amdgpu_aca_remove_handle(struct aca_handle *handle);
-int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle,
- enum aca_error_type type, struct ras_err_data *err_data,
- struct ras_query_context *qctx);
-int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en);
-void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root);
-int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info,
- enum aca_error_type type, u64 count);
-#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
index 516ab9cf88fc..7f5abb03be1b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
@@ -140,13 +140,15 @@ static struct amdgpu_acpi_priv {
* @atif: atif structure
* @function: the ATIF function to execute
* @params: ATIF function params
+ * @min_size: minimum size of the expected output buffer in bytes
*
* Executes the requested ATIF function (all asics).
* Returns a pointer to the acpi output buffer.
*/
static union acpi_object *amdgpu_atif_call(struct amdgpu_atif *atif,
int function,
- struct acpi_buffer *params)
+ struct acpi_buffer *params,
+ size_t min_size)
{
acpi_status status;
union acpi_object *obj;
@@ -189,6 +191,28 @@ static union acpi_object *amdgpu_atif_call(struct amdgpu_atif *atif,
return NULL;
}
+ if (obj->buffer.length < sizeof(u16)) {
+ DRM_DEBUG_DRIVER("ATIF buffer too small to hold size field: %u\n",
+ obj->buffer.length);
+ kfree(obj);
+ return NULL;
+ }
+
+ if (obj->buffer.length < *(u16 *)obj->buffer.pointer) {
+ DRM_DEBUG_DRIVER("ATIF buffer length mismatch: reported %u, actual %u\n",
+ *(u16 *)obj->buffer.pointer,
+ obj->buffer.length);
+ kfree(obj);
+ return NULL;
+ }
+
+ if (*(u16 *)obj->buffer.pointer < min_size) {
+ DRM_DEBUG_DRIVER("ATIF buffer too small: expected %zu, got %u\n",
+ min_size, *(u16 *)obj->buffer.pointer);
+ kfree(obj);
+ return NULL;
+ }
+
return obj;
}
@@ -251,19 +275,14 @@ int amdgpu_atif_verify_interface(struct amdgpu_atif *atif)
size_t size;
int err = 0;
- info = amdgpu_atif_call(atif, ATIF_FUNCTION_VERIFY_INTERFACE, NULL);
+ info = amdgpu_atif_call(atif, ATIF_FUNCTION_VERIFY_INTERFACE, NULL,
+ sizeof(output));
if (!info)
return -EIO;
memset(&output, 0, sizeof(output));
- size = *(u16 *) info->buffer.pointer;
- if (size < 12) {
- DRM_INFO("ATIF buffer is too small: %zu\n", size);
- err = -EINVAL;
- goto out;
- }
- size = min(sizeof(output), size);
+ size = min(sizeof(output), (size_t)*(u16 *)info->buffer.pointer);
memcpy(&output, info->buffer.pointer, size);
@@ -273,7 +292,6 @@ int amdgpu_atif_verify_interface(struct amdgpu_atif *atif)
amdgpu_atif_parse_notification(&atif->notifications, output.notification_mask);
amdgpu_atif_parse_functions(&atif->functions, output.function_bits);
-out:
kfree(info);
return err;
}
@@ -299,20 +317,14 @@ int amdgpu_atif_get_notification_params(struct amdgpu_atif *atif)
int err = 0;
info = amdgpu_atif_call(atif, ATIF_FUNCTION_GET_SYSTEM_PARAMETERS,
- NULL);
+ NULL, offsetof(struct atif_system_params, command_code));
if (!info) {
err = -EIO;
goto out;
}
- size = *(u16 *) info->buffer.pointer;
- if (size < 10) {
- err = -EINVAL;
- goto out;
- }
-
memset(&params, 0, sizeof(params));
- size = min(sizeof(params), size);
+ size = min(sizeof(params), (size_t)*(u16 *)info->buffer.pointer);
memcpy(&params, info->buffer.pointer, size);
DRM_DEBUG_DRIVER("SYSTEM_PARAMS: mask = %#x, flags = %#x\n",
@@ -376,20 +388,14 @@ int amdgpu_atif_query_backlight_caps(struct amdgpu_atif *atif)
info = amdgpu_atif_call(atif,
ATIF_FUNCTION_QUERY_BRIGHTNESS_TRANSFER_CHARACTERISTICS,
- &params);
+ &params, offsetof(struct atif_qbtc_output, data_points));
if (!info) {
err = -EIO;
goto out;
}
- size = *(u16 *) info->buffer.pointer;
- if (size < 10) {
- err = -EINVAL;
- goto out;
- }
-
memset(&characteristics, 0, sizeof(characteristics));
- size = min(sizeof(characteristics), size);
+ size = min(sizeof(characteristics), (size_t)*(u16 *)info->buffer.pointer);
memcpy(&characteristics, info->buffer.pointer, size);
atif->backlight_caps.caps_valid = true;
@@ -427,24 +433,18 @@ static int amdgpu_atif_get_sbios_requests(struct amdgpu_atif *atif,
int count = 0;
info = amdgpu_atif_call(atif, ATIF_FUNCTION_GET_SYSTEM_BIOS_REQUESTS,
- NULL);
+ NULL, sizeof(*req));
if (!info)
return -EIO;
- size = *(u16 *)info->buffer.pointer;
- if (size < 0xd) {
- count = -EINVAL;
- goto out;
- }
memset(req, 0, sizeof(*req));
- size = min(sizeof(*req), size);
+ size = min(sizeof(*req), (size_t)*(u16 *)info->buffer.pointer);
memcpy(req, info->buffer.pointer, size);
DRM_DEBUG_DRIVER("SBIOS pending requests: %#x\n", req->pending);
count = hweight32(req->pending);
-out:
kfree(info);
return count;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index da325863ad76..c693c508df1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -957,3 +957,17 @@ int amdgpu_amdkfd_config_sq_perfmon(struct amdgpu_device *adev, uint32_t xcp_id,
return r;
}
+
+/* Reset an MES queue */
+int amdgpu_amdkfd_reset_mes_queue(struct amdgpu_device *adev,
+ uint32_t node_id,
+ int queue_type,
+ int pipe, int queue,
+ unsigned int db)
+{
+ if (!adev->kfd.init_complete)
+ return 0;
+
+ return kgd2kfd_reset_mes_queue(adev->kfd.dev, node_id, queue_type,
+ pipe, queue, db);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index e443a7277299..338412a750ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -210,6 +210,7 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
uint32_t domain,
struct dma_fence *fence);
+int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms);
#else
static inline
bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm)
@@ -241,6 +242,11 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
{
return 0;
}
+static inline
+int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms)
+{
+ return -EOPNOTSUPP;
+}
#endif
/* Shared API */
int amdgpu_amdkfd_alloc_kernel_mem(struct amdgpu_device *adev, size_t size,
@@ -275,7 +281,11 @@ int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id);
int amdgpu_amdkfd_config_sq_perfmon(struct amdgpu_device *adev, uint32_t xcp_id,
bool core_override_enable, bool reg_override_enable, bool perfmon_override_enable);
bool amdgpu_amdkfd_compute_active(struct amdgpu_device *adev, uint32_t node_id);
-
+int amdgpu_amdkfd_reset_mes_queue(struct amdgpu_device *adev,
+ uint32_t node_id,
+ int queue_type,
+ int pipe, int queue,
+ unsigned int db);
/* Read user wptr from a specified user address space with page fault
* disabled. The memory must be pinned and mapped to the hardware when
@@ -326,9 +336,9 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
int amdgpu_amdkfd_gpuvm_sync_memory(
struct amdgpu_device *adev, struct kgd_mem *mem, bool intr);
-int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
- void **kptr, uint64_t *size);
-void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
+int amdgpu_amdkfd_gpuvm_map_bo_to_kernel(struct kgd_mem *mem, void **kptr,
+ u64 *size, u32 domain);
+void amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(struct kgd_mem *mem);
int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart);
@@ -446,6 +456,9 @@ bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entr
bool retry_fault);
void kgd2kfd_lock_kfd(void);
void kgd2kfd_teardown_processes(struct amdgpu_device *adev);
+int kgd2kfd_reset_mes_queue(struct kfd_dev *kfd, uint32_t node_id,
+ int queue_type, int pipe, int queue,
+ unsigned int db);
#else
static inline int kgd2kfd_init(void)
@@ -576,5 +589,12 @@ static inline void kgd2kfd_teardown_processes(struct amdgpu_device *adev)
{
}
+static inline int kgd2kfd_reset_mes_queue(struct kfd_dev *kfd, uint32_t node_id,
+ int queue_type, int pipe, int queue,
+ unsigned int db)
+{
+ return 0;
+}
+
#endif
#endif /* AMDGPU_AMDKFD_H_INCLUDED */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
index 6ed399163547..bc079b95fc52 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
@@ -530,6 +530,66 @@ static uint32_t kgd_v9_4_3_ptl_ctrl(struct amdgpu_device *adev,
ptl_state, fmt1, fmt2);
}
+static int kgd_gfx_v9_4_3_hqd_sdma_get_counter(struct amdgpu_device *adev,
+ void *mqd, uint32_t num_sdma_queues_per_eng,
+ uint64_t *val)
+{
+ struct v9_sdma_mqd *m = get_sdma_mqd(mqd);
+ uint32_t sdma_rlc_reg_offset = 0;
+ uint32_t sdma_rlc_rb_cntl;
+ uint32_t engine_id, queue_id;
+ uint32_t engines = adev->sdma.num_instances;
+ uint32_t sdma_rlcx_rb_base, sdma_rlcx_rb_base_hi;
+ bool found = false;
+
+ if (!m)
+ return -EINVAL;
+
+ if (((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+ amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
+ adev->gfx.mec_fw_version < 194) ||
+ (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) &&
+ adev->gfx.mec_fw_version < 44)) {
+ pr_warn_once("MEC FW doesn't support SDMA counter!\n");
+ return -EOPNOTSUPP;
+ }
+
+ /* SDMA doesn't support over-subscription, there must be
+ * a HQD associated with a MQD, so found must be true in
+ * the finding loop.
+ */
+ for (engine_id = 0; engine_id < engines && !found; engine_id++) {
+ for (queue_id = 0; queue_id < num_sdma_queues_per_eng; queue_id++) {
+ sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
+ engine_id, queue_id);
+ sdma_rlcx_rb_base = RREG32(sdma_rlc_reg_offset +
+ regSDMA_RLC0_RB_BASE);
+ sdma_rlcx_rb_base_hi = RREG32(sdma_rlc_reg_offset +
+ regSDMA_RLC0_RB_BASE_HI);
+
+ if (m->sdmax_rlcx_rb_base == sdma_rlcx_rb_base &&
+ m->sdmax_rlcx_rb_base_hi == sdma_rlcx_rb_base_hi) {
+ found = true;
+ break;
+ }
+ }
+ }
+
+ sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL);
+
+ /* Read sdma activity counter from utilization register
+ * if hw queue is enabled, otherwise read from MQD.
+ */
+ if (sdma_rlc_rb_cntl & SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK)
+ *val = (uint64_t)RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_UTILIZATION_HI) << 32 |
+ RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_UTILIZATION_LO);
+ else
+ *val = (uint64_t)m->sdmax_rlcx_utilization_hi << 32 |
+ m->sdmax_rlcx_utilization_lo;
+
+ return 0;
+}
+
const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_gfx_v9_4_3_set_pasid_vmid_mapping,
@@ -566,5 +626,6 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
.hqd_reset = kgd_gfx_v9_hqd_reset,
.hqd_sdma_get_doorbell = kgd_gfx_v9_4_3_hqd_sdma_get_doorbell,
- .ptl_ctrl = kgd_v9_4_3_ptl_ctrl
+ .ptl_ctrl = kgd_v9_4_3_ptl_ctrl,
+ .hqd_sdma_get_counter = kgd_gfx_v9_4_3_hqd_sdma_get_counter
};
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 35fe2c974699..20831dbebc31 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2271,11 +2271,14 @@ err_reserve_bo_failed:
return ret;
}
-/** amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel() - Map a GTT BO for kernel CPU access
+/** amdgpu_amdkfd_gpuvm_map_bo_to_kernel() - Map GTT or VRAM BO for kernel CPU access
*
* @mem: Buffer object to be mapped for CPU access
* @kptr[out]: pointer in kernel CPU address space
* @size[out]: size of the buffer
+ * @domain[IN]: domain for pinning (AMDGPU_GEM_DOMAIN_GTT, AMDGPU_GEM_DOMAIN_VRAM,
+ * or their combination to let the driver choose). CPU visibility is
+ * automatically enforced by amdgpu_bo_pin()
*
* Pins the BO and maps it for kernel CPU access. The eviction fence is removed
* from the BO, since pinned BOs cannot be evicted. The bo must remain on the
@@ -2284,8 +2287,8 @@ err_reserve_bo_failed:
*
* Return: 0 on success, error code on failure
*/
-int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
- void **kptr, uint64_t *size)
+int amdgpu_amdkfd_gpuvm_map_bo_to_kernel(struct kgd_mem *mem, void **kptr,
+ u64 *size, u32 domain)
{
int ret;
struct amdgpu_bo *bo = mem->bo;
@@ -2295,6 +2298,11 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
return -EINVAL;
}
+ if (!(domain & (AMDGPU_GEM_DOMAIN_GTT | AMDGPU_GEM_DOMAIN_VRAM))) {
+ pr_debug("Invalid domain 0x%x for kernel mapping\n", domain);
+ return -EINVAL;
+ }
+
mutex_lock(&mem->process_info->lock);
ret = amdgpu_bo_reserve(bo, true);
@@ -2303,7 +2311,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
goto bo_reserve_failed;
}
- ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
+ ret = amdgpu_bo_pin(bo, domain);
if (ret) {
pr_err("Failed to pin bo. ret %d\n", ret);
goto pin_failed;
@@ -2336,7 +2344,7 @@ bo_reserve_failed:
return ret;
}
-/** amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel() - Unmap a GTT BO for kernel CPU access
+/** amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel() - Unmap GTT or VRAM BO for kernel CPU access
*
* @mem: Buffer object to be unmapped for CPU access
*
@@ -2344,7 +2352,7 @@ bo_reserve_failed:
* eviction fence, so this function should only be used for cleanup before the
* BO is destroyed.
*/
-void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem)
+void amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(struct kgd_mem *mem)
{
struct amdgpu_bo *bo = mem->bo;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
index acd22bff1882..27c0dc8f6137 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
@@ -1923,7 +1923,7 @@ int amdgpu_atombios_init(struct amdgpu_device *adev)
atom_card_info->pll_read = cail_pll_read;
atom_card_info->pll_write = cail_pll_write;
- adev->mode_info.atom_context = amdgpu_atom_parse(atom_card_info, adev->bios);
+ adev->mode_info.atom_context = amdgpu_atom_parse(atom_card_info, adev->bios, adev->bios_size);
if (!adev->mode_info.atom_context) {
amdgpu_atombios_fini(adev);
return -ENOMEM;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c
index 3893e6fc2f03..e2a4644896ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c
@@ -89,6 +89,15 @@ bool amdgpu_is_atpx_hybrid(void)
return amdgpu_atpx_priv.atpx.is_hybrid;
}
+static bool amdgpu_atpx_buffer_validate(const union acpi_object *obj,
+ size_t min_size)
+{
+ return obj && obj->type == ACPI_TYPE_BUFFER &&
+ obj->buffer.length >= sizeof(u16) &&
+ obj->buffer.length >= *(u16 *)obj->buffer.pointer &&
+ *(u16 *)obj->buffer.pointer >= min_size;
+}
+
/**
* amdgpu_atpx_call - call an ATPX method
*
@@ -179,15 +188,15 @@ static int amdgpu_atpx_validate(struct amdgpu_atpx *atpx)
if (!info)
return -EIO;
- memset(&output, 0, sizeof(output));
-
- size = *(u16 *) info->buffer.pointer;
- if (size < 10) {
- pr_err("ATPX buffer is too small: %zu\n", size);
+ if (!amdgpu_atpx_buffer_validate(info, sizeof(output))) {
+ pr_err("Invalid ATPX GET_PX_PARAMETERS response\n");
kfree(info);
return -EINVAL;
}
- size = min(sizeof(output), size);
+
+ memset(&output, 0, sizeof(output));
+
+ size = min(sizeof(output), (size_t)*(u16 *)info->buffer.pointer);
memcpy(&output, info->buffer.pointer, size);
@@ -258,15 +267,15 @@ static int amdgpu_atpx_verify_interface(struct amdgpu_atpx *atpx)
if (!info)
return -EIO;
- memset(&output, 0, sizeof(output));
-
- size = *(u16 *) info->buffer.pointer;
- if (size < 8) {
- pr_err("ATPX buffer is too small: %zu\n", size);
+ if (!amdgpu_atpx_buffer_validate(info, sizeof(output))) {
+ pr_err("Invalid ATPX VERIFY_INTERFACE response\n");
err = -EINVAL;
goto out;
}
- size = min(sizeof(output), size);
+
+ memset(&output, 0, sizeof(output));
+
+ size = min(sizeof(output), (size_t)*(u16 *)info->buffer.pointer);
memcpy(&output, info->buffer.pointer, size);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
index aa039e148a5e..3ebdd792feec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
@@ -296,8 +296,14 @@ static int amdgpu_atrm_call(acpi_handle atrm_handle, uint8_t *bios,
}
obj = (union acpi_object *)buffer.pointer;
- memcpy(bios+offset, obj->buffer.pointer, obj->buffer.length);
- len = obj->buffer.length;
+ if (!obj || obj->type != ACPI_TYPE_BUFFER) {
+ DRM_ERROR("ATRM returned an invalid object\n");
+ kfree(buffer.pointer);
+ return -EINVAL;
+ }
+
+ len = min_t(size_t, obj->buffer.length, len);
+ memcpy(bios+offset, obj->buffer.pointer, len);
kfree(buffer.pointer);
return len;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
index 43864df8af04..ce1d08f112a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
@@ -38,14 +38,6 @@
#define AMDGPU_BO_LIST_NUM_BUCKETS (AMDGPU_BO_LIST_MAX_PRIORITY + 1)
#define AMDGPU_BO_LIST_MAX_ENTRIES (128 * 1024)
-static void amdgpu_bo_list_free_rcu(struct rcu_head *rcu)
-{
- struct amdgpu_bo_list *list = container_of(rcu, struct amdgpu_bo_list,
- rhead);
- mutex_destroy(&list->bo_list_mutex);
- kvfree(list);
-}
-
static void amdgpu_bo_list_free(struct kref *ref)
{
struct amdgpu_bo_list *list = container_of(ref, struct amdgpu_bo_list,
@@ -54,7 +46,8 @@ static void amdgpu_bo_list_free(struct kref *ref)
amdgpu_bo_list_for_each_entry(e, list)
amdgpu_bo_unref(&e->bo);
- call_rcu(&list->rhead, amdgpu_bo_list_free_rcu);
+
+ kvfree(list);
}
static int amdgpu_bo_list_entry_cmp(const void *_a, const void *_b)
@@ -66,9 +59,9 @@ static int amdgpu_bo_list_entry_cmp(const void *_a, const void *_b)
return (int)a->priority - (int)b->priority;
}
-int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp,
- struct drm_amdgpu_bo_list_entry *info,
- size_t num_entries, struct amdgpu_bo_list **result)
+struct amdgpu_bo_list *
+amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp,
+ struct drm_amdgpu_bo_list_entry *info, size_t num_entries)
{
unsigned last_entry = 0, first_userptr = num_entries;
struct amdgpu_bo_list_entry *array;
@@ -79,7 +72,7 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp,
list = kvzalloc_flex(*list, entries, num_entries);
if (!list)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
kref_init(&list->refcount);
@@ -134,9 +127,7 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp,
trace_amdgpu_cs_bo_status(list->num_entries, total_size);
- mutex_init(&list->bo_list_mutex);
- *result = list;
- return 0;
+ return list;
error_free:
for (i = 0; i < last_entry; ++i)
@@ -144,150 +135,125 @@ error_free:
for (i = first_userptr; i < num_entries; ++i)
amdgpu_bo_unref(&array[i].bo);
kvfree(list);
- return r;
+ return ERR_PTR(r);
}
-static void amdgpu_bo_list_destroy(struct amdgpu_fpriv *fpriv, int id)
+struct amdgpu_bo_list *amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, u32 id)
{
struct amdgpu_bo_list *list;
- mutex_lock(&fpriv->bo_list_lock);
- list = idr_remove(&fpriv->bo_list_handles, id);
- mutex_unlock(&fpriv->bo_list_lock);
+ xa_lock(&fpriv->bo_list_handles);
+ list = xa_load(&fpriv->bo_list_handles, id);
if (list)
- kref_put(&list->refcount, amdgpu_bo_list_free);
-}
-
-int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id,
- struct amdgpu_bo_list **result)
-{
- rcu_read_lock();
- *result = idr_find(&fpriv->bo_list_handles, id);
-
- if (*result && kref_get_unless_zero(&(*result)->refcount)) {
- rcu_read_unlock();
- return 0;
- }
+ kref_get(&list->refcount);
+ else
+ list = ERR_PTR(-ENOENT);
+ xa_unlock(&fpriv->bo_list_handles);
- rcu_read_unlock();
- *result = NULL;
- return -ENOENT;
+ return list;
}
void amdgpu_bo_list_put(struct amdgpu_bo_list *list)
{
- kref_put(&list->refcount, amdgpu_bo_list_free);
+ if (list)
+ kref_put(&list->refcount, amdgpu_bo_list_free);
}
-int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in,
- struct drm_amdgpu_bo_list_entry **info_param)
+struct drm_amdgpu_bo_list_entry *
+amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in)
{
- const uint32_t info_size = sizeof(struct drm_amdgpu_bo_list_entry);
const void __user *uptr = u64_to_user_ptr(in->bo_info_ptr);
- const uint32_t bo_info_size = in->bo_info_size;
const uint32_t bo_number = in->bo_number;
- struct drm_amdgpu_bo_list_entry *info;
if (bo_number > AMDGPU_BO_LIST_MAX_ENTRIES)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
- /* copy the handle array from userspace to a kernel buffer */
- if (likely(info_size == bo_info_size)) {
- info = vmemdup_array_user(uptr, bo_number, info_size);
- if (IS_ERR(info))
- return PTR_ERR(info);
- } else {
- const uint32_t bytes = min(bo_info_size, info_size);
- unsigned i;
-
- info = kvmalloc_array(bo_number, info_size, GFP_KERNEL);
- if (!info)
- return -ENOMEM;
-
- memset(info, 0, bo_number * info_size);
- for (i = 0; i < bo_number; ++i, uptr += bo_info_size) {
- if (copy_from_user(&info[i], uptr, bytes)) {
- kvfree(info);
- return -EFAULT;
- }
- }
- }
+ if (in->bo_info_size != sizeof(struct drm_amdgpu_bo_list_entry))
+ return ERR_PTR(-EINVAL);
- *info_param = info;
- return 0;
+ return vmemdup_array_user(uptr, bo_number,
+ sizeof(struct drm_amdgpu_bo_list_entry));
}
int amdgpu_bo_list_ioctl(struct drm_device *dev, void *data,
struct drm_file *filp)
{
- struct amdgpu_device *adev = drm_to_adev(dev);
struct amdgpu_fpriv *fpriv = filp->driver_priv;
+ struct amdgpu_device *adev = drm_to_adev(dev);
+ struct amdgpu_bo_list *list, *prev, *curr;
union drm_amdgpu_bo_list *args = data;
uint32_t handle = args->in.list_handle;
- struct drm_amdgpu_bo_list_entry *info = NULL;
- struct amdgpu_bo_list *list, *old;
+ struct drm_amdgpu_bo_list_entry *info;
int r;
- r = amdgpu_bo_create_list_entry_array(&args->in, &info);
- if (r)
- return r;
-
switch (args->in.operation) {
case AMDGPU_BO_LIST_OP_CREATE:
- r = amdgpu_bo_list_create(adev, filp, info, args->in.bo_number,
- &list);
- if (r)
- goto error_free;
+ case AMDGPU_BO_LIST_OP_UPDATE:
+ info = amdgpu_bo_create_list_entry_array(&args->in);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
- mutex_lock(&fpriv->bo_list_lock);
- r = idr_alloc(&fpriv->bo_list_handles, list, 1, 0, GFP_KERNEL);
- mutex_unlock(&fpriv->bo_list_lock);
- if (r < 0) {
- goto error_put_list;
- }
+ list = amdgpu_bo_list_create(adev, filp, info,
+ args->in.bo_number);
+ kvfree(info);
+ if (IS_ERR(list))
+ return PTR_ERR(list);
- handle = r;
break;
case AMDGPU_BO_LIST_OP_DESTROY:
- amdgpu_bo_list_destroy(fpriv, handle);
+ list = xa_erase(&fpriv->bo_list_handles, handle);
+ amdgpu_bo_list_put(list);
handle = 0;
+
break;
- case AMDGPU_BO_LIST_OP_UPDATE:
- r = amdgpu_bo_list_create(adev, filp, info, args->in.bo_number,
- &list);
+ default:
+ return -EINVAL;
+ };
+
+ switch (args->in.operation) {
+ case AMDGPU_BO_LIST_OP_CREATE:
+ r = xa_alloc(&fpriv->bo_list_handles, &handle, list,
+ xa_limit_32b, GFP_KERNEL);
if (r)
- goto error_free;
+ goto error_put_list;
+
+ break;
- mutex_lock(&fpriv->bo_list_lock);
- old = idr_replace(&fpriv->bo_list_handles, list, handle);
- mutex_unlock(&fpriv->bo_list_lock);
+ case AMDGPU_BO_LIST_OP_UPDATE:
+ curr = xa_load(&fpriv->bo_list_handles, handle);
+ if (!curr) {
+ r = -ENOENT;
+ goto error_put_list;
+ }
- if (IS_ERR(old)) {
- r = PTR_ERR(old);
+ prev = xa_cmpxchg(&fpriv->bo_list_handles, handle, curr, list,
+ GFP_KERNEL);
+ if (xa_is_err(prev)) {
+ r = xa_err(prev);
+ goto error_put_list;
+ } else if (prev != curr) {
+ r = -ENOENT;
goto error_put_list;
}
- amdgpu_bo_list_put(old);
+ amdgpu_bo_list_put(curr);
break;
+ case AMDGPU_BO_LIST_OP_DESTROY:
default:
- r = -EINVAL;
- goto error_free;
+ /* Handled above. */
+ break;
}
memset(args, 0, sizeof(*args));
args->out.list_handle = handle;
- kvfree(info);
return 0;
error_put_list:
amdgpu_bo_list_put(list);
-
-error_free:
- kvfree(info);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
index 2b5e7c46a39d..bde912150824 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
@@ -43,7 +43,6 @@ struct amdgpu_bo_list_entry {
};
struct amdgpu_bo_list {
- struct rcu_head rhead;
struct kref refcount;
struct amdgpu_bo *gds_obj;
struct amdgpu_bo *gws_obj;
@@ -51,24 +50,19 @@ struct amdgpu_bo_list {
unsigned first_userptr;
unsigned num_entries;
- /* Protect access during command submission.
- */
- struct mutex bo_list_mutex;
-
struct amdgpu_bo_list_entry entries[] __counted_by(num_entries);
};
-int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id,
- struct amdgpu_bo_list **result);
+struct amdgpu_bo_list *amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, u32 id);
void amdgpu_bo_list_put(struct amdgpu_bo_list *list);
-int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in,
- struct drm_amdgpu_bo_list_entry **info_param);
+struct drm_amdgpu_bo_list_entry *
+amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in);
-int amdgpu_bo_list_create(struct amdgpu_device *adev,
- struct drm_file *filp,
- struct drm_amdgpu_bo_list_entry *info,
- size_t num_entries,
- struct amdgpu_bo_list **list);
+struct amdgpu_bo_list *
+amdgpu_bo_list_create(struct amdgpu_device *adev,
+ struct drm_file *filp,
+ struct drm_amdgpu_bo_list_entry *info,
+ size_t num_entries);
#define amdgpu_bo_list_for_each_entry(e, list) \
for (e = list->entries; \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
index d5e59c24d907..6fb129025761 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
@@ -289,40 +289,6 @@ struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
return hdr;
}
-int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
- struct aca_bank *bank)
-{
- struct cper_hdr *fatal = NULL;
- struct cper_sec_crashdump_reg_data reg_data = { 0 };
- struct amdgpu_ring *ring = &adev->cper.ring_buf;
- int ret;
-
- fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
- if (!fatal) {
- dev_err(adev->dev, "fail to alloc cper entry for ue record\n");
- return -ENOMEM;
- }
-
- reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
- reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
- reg_data.addr_lo = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
- reg_data.addr_hi = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
- reg_data.ipid_lo = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
- reg_data.ipid_hi = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
- reg_data.synd_lo = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
- reg_data.synd_hi = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);
-
- amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL_UNCORRECTED);
- ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
- if (ret)
- return ret;
-
- amdgpu_cper_ring_write(ring, fatal, fatal->record_length);
- kfree(fatal);
-
- return 0;
-}
-
int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev)
{
struct cper_hdr *bp_threshold = NULL;
@@ -348,83 +314,6 @@ int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev)
return 0;
}
-static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
- enum aca_error_type aca_err_type)
-{
- switch (aca_err_type) {
- case ACA_ERROR_TYPE_UE:
- return CPER_SEV_FATAL_UNCORRECTED;
- case ACA_ERROR_TYPE_CE:
- return CPER_SEV_NON_FATAL_CORRECTED;
- case ACA_ERROR_TYPE_DEFERRED:
- return CPER_SEV_NON_FATAL_UNCORRECTED;
- default:
- dev_err(adev->dev, "Unknown ACA error type!\n");
- return CPER_SEV_FATAL_UNCORRECTED;
- }
-}
-
-int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
- struct aca_banks *banks,
- uint16_t bank_count)
-{
- struct cper_hdr *corrected = NULL;
- enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
- struct amdgpu_ring *ring = &adev->cper.ring_buf;
- uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
- struct aca_bank_node *node;
- struct aca_bank *bank;
- uint32_t i = 0;
- int ret;
-
- corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
- if (!corrected) {
- dev_err(adev->dev, "fail to allocate cper entry for ce records\n");
- return -ENOMEM;
- }
-
- /* Raise severity if any DE is detected in the ACA bank list */
- list_for_each_entry(node, &banks->list, node) {
- bank = &node->bank;
- if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
- sev = CPER_SEV_NON_FATAL_UNCORRECTED;
- break;
- }
- }
-
- amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);
-
- /* Combine CE and DE in cper record */
- list_for_each_entry(node, &banks->list, node) {
- bank = &node->bank;
- reg_data[CPER_ACA_REG_CTL_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
- reg_data[CPER_ACA_REG_CTL_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
- reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
- reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
- reg_data[CPER_ACA_REG_ADDR_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
- reg_data[CPER_ACA_REG_ADDR_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
- reg_data[CPER_ACA_REG_MISC0_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
- reg_data[CPER_ACA_REG_MISC0_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
- reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
- reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
- reg_data[CPER_ACA_REG_IPID_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
- reg_data[CPER_ACA_REG_IPID_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
- reg_data[CPER_ACA_REG_SYND_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
- reg_data[CPER_ACA_REG_SYND_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);
-
- ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
- amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
- reg_data, CPER_ACA_REG_COUNT);
- if (ret)
- return ret;
- }
-
- amdgpu_cper_ring_write(ring, corrected, corrected->record_length);
- kfree(corrected);
-
- return 0;
-}
-
static bool amdgpu_cper_is_hdr(struct amdgpu_ring *ring, u64 pos)
{
char signature[CPER_SIGNATURE_SZ];
@@ -592,8 +481,7 @@ int amdgpu_cper_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_cper_en(adev))
return 0;
- else if (!amdgpu_sriov_vf(adev) && !amdgpu_uniras_enabled(adev) &&
- !amdgpu_aca_is_enabled(adev))
+ else if (!amdgpu_sriov_vf(adev) && !amdgpu_uniras_enabled(adev))
return 0;
r = amdgpu_cper_ring_init(adev);
@@ -612,7 +500,7 @@ int amdgpu_cper_init(struct amdgpu_device *adev)
int amdgpu_cper_fini(struct amdgpu_device *adev)
{
- if (!amdgpu_aca_is_enabled(adev) && !amdgpu_sriov_ras_cper_en(adev))
+ if (amdgpu_sriov_vf(adev))
return 0;
adev->cper.enabled = false;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
index 353421807387..d12c98077d9d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
@@ -26,7 +26,6 @@
#define __AMDGPU_CPER_H__
#include "amd_cper.h"
-#include "amdgpu_aca.h"
#define CPER_MAX_ALLOWED_COUNT 0x1000
#define CPER_MAX_RING_SIZE 0X100000
@@ -88,13 +87,6 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev
struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
enum amdgpu_cper_type type,
uint16_t section_count);
-/* UE must be encoded into separated cper entries, 1 UE 1 cper */
-int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
- struct aca_bank *bank);
-/* CEs and DEs are combined into 1 cper entry */
-int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
- struct aca_banks *banks,
- uint16_t bank_count);
/* Bad page threshold is encoded into separated cper entry */
int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev);
void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index e714cee2997a..d777375e5350 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -140,24 +140,19 @@ static int amdgpu_cs_p1_bo_handles(struct amdgpu_cs_parser *p,
struct drm_amdgpu_bo_list_in *data)
{
struct drm_amdgpu_bo_list_entry *info;
- int r;
-
- r = amdgpu_bo_create_list_entry_array(data, &info);
- if (r)
- return r;
-
- r = amdgpu_bo_list_create(p->adev, p->filp, info, data->bo_number,
- &p->bo_list);
- if (r)
- goto error_free;
+ struct amdgpu_bo_list *list;
- kvfree(info);
- return 0;
+ info = amdgpu_bo_create_list_entry_array(data);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
-error_free:
+ list = amdgpu_bo_list_create(p->adev, p->filp, info, data->bo_number);
kvfree(info);
+ if (IS_ERR(list))
+ return PTR_ERR(list);
- return r;
+ p->bo_list = list;
+ return 0;
}
/* Copy the data from userspace and go over it the first time */
@@ -846,6 +841,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
{
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
struct ttm_operation_ctx ctx = { true, false };
+ struct amdgpu_bo_list *list = NULL;
struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_bo_list_entry *e;
struct drm_gem_object *obj;
@@ -857,25 +853,24 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
if (p->bo_list)
return -EINVAL;
- r = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle,
- &p->bo_list);
- if (r)
- return r;
+ list = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle);
} else if (!p->bo_list) {
/* Create a empty bo_list when no handle is provided */
- r = amdgpu_bo_list_create(p->adev, p->filp, NULL, 0,
- &p->bo_list);
- if (r)
- return r;
+ list = amdgpu_bo_list_create(p->adev, p->filp, NULL, 0);
}
- mutex_lock(&p->bo_list->bo_list_mutex);
+ if (IS_ERR(list))
+ return PTR_ERR(list);
+ else if (list)
+ p->bo_list = list;
+ else
+ list = p->bo_list;
/* Get userptr backing pages. If pages are updated after registered
* in amdgpu_gem_userptr_ioctl(), amdgpu_cs_list_validate() will do
* amdgpu_ttm_backend_bind() to flush and invalidate new pages
*/
- amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
+ amdgpu_bo_list_for_each_userptr_entry(e, list) {
bool userpage_invalidated = false;
struct amdgpu_bo *bo = e->bo;
@@ -905,7 +900,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
if (unlikely(r))
goto out_free_user_pages;
- amdgpu_bo_list_for_each_entry(e, p->bo_list) {
+ amdgpu_bo_list_for_each_entry(e, list) {
r = drm_exec_prepare_obj(&p->exec, &e->bo->tbo.base,
TTM_NUM_MOVE_FENCES + p->gang_size);
drm_exec_retry_on_contention(&p->exec);
@@ -924,7 +919,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
}
}
- amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
+ amdgpu_bo_list_for_each_userptr_entry(e, list) {
struct mm_struct *usermm;
usermm = amdgpu_ttm_tt_get_usermm(e->bo->tbo.ttm);
@@ -977,17 +972,15 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
p->bytes_moved_vis);
for (i = 0; i < p->gang_size; ++i)
- amdgpu_job_set_resources(p->jobs[i], p->bo_list->gds_obj,
- p->bo_list->gws_obj,
- p->bo_list->oa_obj);
+ amdgpu_job_set_resources(p->jobs[i], list->gds_obj,
+ list->gws_obj, list->oa_obj);
return 0;
out_free_user_pages:
- amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
+ amdgpu_bo_list_for_each_userptr_entry(e, list) {
amdgpu_hmm_range_free(e->range);
e->range = NULL;
}
- mutex_unlock(&p->bo_list->bo_list_mutex);
return r;
}
@@ -1371,7 +1364,6 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);
mutex_unlock(&p->adev->notifier_lock);
- mutex_unlock(&p->bo_list->bo_list_mutex);
return 0;
}
@@ -1443,28 +1435,25 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
r = amdgpu_cs_patch_jobs(&parser);
if (r)
- goto error_backoff;
+ goto error_fini;
r = amdgpu_cs_vm_handling(&parser);
if (r)
- goto error_backoff;
+ goto error_fini;
r = amdgpu_cs_sync_rings(&parser);
if (r)
- goto error_backoff;
+ goto error_fini;
trace_amdgpu_cs_ibs(&parser);
r = amdgpu_cs_submit(&parser, data);
if (r)
- goto error_backoff;
+ goto error_fini;
amdgpu_cs_parser_fini(&parser);
return 0;
-error_backoff:
- mutex_unlock(&parser.bo_list->bo_list_mutex);
-
error_fini:
amdgpu_cs_parser_fini(&parser);
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index ce35b415093d..d53259a5b82f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -283,6 +283,8 @@ static ktime_t amdgpu_ctx_fini_entity(struct amdgpu_device *adev,
if (!entity)
return res;
+ drm_sched_entity_destroy(&entity->entity);
+
for (i = 0; i < amdgpu_sched_jobs; ++i) {
res = ktime_add(res, amdgpu_ctx_fence_time(entity->fences[i]));
dma_fence_put(entity->fences[i]);
@@ -294,32 +296,20 @@ static ktime_t amdgpu_ctx_fini_entity(struct amdgpu_device *adev,
return res;
}
-static int amdgpu_ctx_get_stable_pstate(struct amdgpu_ctx *ctx,
- u32 *stable_pstate)
+static u32 amdgpu_get_stable_pstate(struct amdgpu_device *adev)
{
- struct amdgpu_device *adev = ctx->mgr->adev;
- enum amd_dpm_forced_level current_level;
-
- current_level = amdgpu_dpm_get_performance_level(adev);
-
- switch (current_level) {
+ switch (amdgpu_dpm_get_performance_level(adev)) {
case AMD_DPM_FORCED_LEVEL_PROFILE_STANDARD:
- *stable_pstate = AMDGPU_CTX_STABLE_PSTATE_STANDARD;
- break;
+ return AMDGPU_CTX_STABLE_PSTATE_STANDARD;
case AMD_DPM_FORCED_LEVEL_PROFILE_MIN_SCLK:
- *stable_pstate = AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK;
- break;
+ return AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK;
case AMD_DPM_FORCED_LEVEL_PROFILE_MIN_MCLK:
- *stable_pstate = AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK;
- break;
+ return AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK;
case AMD_DPM_FORCED_LEVEL_PROFILE_PEAK:
- *stable_pstate = AMDGPU_CTX_STABLE_PSTATE_PEAK;
- break;
+ return AMDGPU_CTX_STABLE_PSTATE_PEAK;
default:
- *stable_pstate = AMDGPU_CTX_STABLE_PSTATE_NONE;
- break;
+ return AMDGPU_CTX_STABLE_PSTATE_NONE;
}
- return 0;
}
static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
@@ -383,9 +373,9 @@ static int __amdgpu_ctx_set_stable_pstate(struct amdgpu_ctx *ctx,
if (current_ctx && current_ctx != ctx)
return -EBUSY;
- r = amdgpu_ctx_get_stable_pstate(ctx, &current_stable_pstate);
- if (r || current_stable_pstate == stable_pstate)
- return r;
+ current_stable_pstate = amdgpu_get_stable_pstate(adev);
+ if (current_stable_pstate == stable_pstate)
+ return 0;
r = amdgpu_dpm_force_performance_level(adev, level);
if (r)
@@ -416,7 +406,7 @@ static int amdgpu_ctx_set_stable_pstate(struct amdgpu_ctx *ctx,
return r;
}
-static void amdgpu_ctx_fini(struct kref *ref)
+void amdgpu_ctx_fini(struct kref *ref)
{
struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount);
struct amdgpu_ctx_mgr *mgr = ctx->mgr;
@@ -504,53 +494,26 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
if (!ctx)
return -ENOMEM;
- mutex_lock(&mgr->lock);
- r = idr_alloc(&mgr->ctx_handles, ctx, 1, AMDGPU_VM_MAX_NUM_CTX, GFP_KERNEL);
- if (r < 0) {
- mutex_unlock(&mgr->lock);
- kfree(ctx);
- return r;
- }
-
- *id = (uint32_t)r;
r = amdgpu_ctx_init(mgr, priority, filp, ctx);
if (r) {
- idr_remove(&mgr->ctx_handles, *id);
- *id = 0;
kfree(ctx);
+ return r;
}
- mutex_unlock(&mgr->lock);
- return r;
-}
-
-static void amdgpu_ctx_do_release(struct kref *ref)
-{
- struct amdgpu_ctx *ctx;
- u32 i, j;
- ctx = container_of(ref, struct amdgpu_ctx, refcount);
- for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
- for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
- if (!ctx->entities[i][j])
- continue;
-
- drm_sched_entity_destroy(&ctx->entities[i][j]->entity);
- }
- }
+ r = xa_alloc(&mgr->ctx_handles, id, ctx, xa_limit_32b, GFP_KERNEL);
+ if (r)
+ amdgpu_ctx_put(ctx);
- amdgpu_ctx_fini(ref);
+ return r;
}
static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)
{
- struct amdgpu_ctx_mgr *mgr = &fpriv->ctx_mgr;
struct amdgpu_ctx *ctx;
- mutex_lock(&mgr->lock);
- ctx = idr_remove(&mgr->ctx_handles, id);
- if (ctx)
- kref_put(&ctx->refcount, amdgpu_ctx_do_release);
- mutex_unlock(&mgr->lock);
+ ctx = xa_erase(&fpriv->ctx_mgr.ctx_handles, id);
+ amdgpu_ctx_put(ctx);
+
return ctx ? 0 : -EINVAL;
}
@@ -559,19 +522,11 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev,
union drm_amdgpu_ctx_out *out)
{
struct amdgpu_ctx *ctx;
- struct amdgpu_ctx_mgr *mgr;
unsigned reset_counter;
- if (!fpriv)
- return -EINVAL;
-
- mgr = &fpriv->ctx_mgr;
- mutex_lock(&mgr->lock);
- ctx = idr_find(&mgr->ctx_handles, id);
- if (!ctx) {
- mutex_unlock(&mgr->lock);
+ ctx = amdgpu_ctx_get(fpriv, id);
+ if (!ctx)
return -EINVAL;
- }
/* TODO: these two are always zero */
out->state.flags = 0x0;
@@ -586,7 +541,8 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev,
out->state.reset_status = AMDGPU_CTX_UNKNOWN_RESET;
ctx->reset_counter_query = reset_counter;
- mutex_unlock(&mgr->lock);
+ amdgpu_ctx_put(ctx);
+
return 0;
}
@@ -619,18 +575,10 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct amdgpu_ctx *ctx;
- struct amdgpu_ctx_mgr *mgr;
- if (!fpriv)
- return -EINVAL;
-
- mgr = &fpriv->ctx_mgr;
- mutex_lock(&mgr->lock);
- ctx = idr_find(&mgr->ctx_handles, id);
- if (!ctx) {
- mutex_unlock(&mgr->lock);
+ ctx = amdgpu_ctx_get(fpriv, id);
+ if (!ctx)
return -EINVAL;
- }
out->state.flags = 0x0;
out->state.hangs = 0x0;
@@ -671,7 +619,8 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
msecs_to_jiffies(AMDGPU_RAS_COUNTE_DELAY_MS));
}
- mutex_unlock(&mgr->lock);
+ amdgpu_ctx_put(ctx);
+
return 0;
}
@@ -680,26 +629,26 @@ static int amdgpu_ctx_stable_pstate(struct amdgpu_device *adev,
bool set, u32 *stable_pstate)
{
struct amdgpu_ctx *ctx;
- struct amdgpu_ctx_mgr *mgr;
- int r;
+ int r = 0;
- if (!fpriv)
+ ctx = amdgpu_ctx_get(fpriv, id);
+ if (!ctx)
return -EINVAL;
- mgr = &fpriv->ctx_mgr;
- mutex_lock(&mgr->lock);
- ctx = idr_find(&mgr->ctx_handles, id);
- if (!ctx) {
- mutex_unlock(&mgr->lock);
- return -EINVAL;
- }
+ /*
+ * The get path is odd in this uapi - it will check whether the context
+ * id exists, but otherwise does nothing with it. In other words, the
+ * uapi has historically been implemented as being able to query the
+ * global device state, as long as the caller supplies a random valid
+ * context id.
+ */
if (set)
r = amdgpu_ctx_set_stable_pstate(ctx, *stable_pstate);
else
- r = amdgpu_ctx_get_stable_pstate(ctx, stable_pstate);
+ *stable_pstate = amdgpu_get_stable_pstate(adev);
- mutex_unlock(&mgr->lock);
+ amdgpu_ctx_put(ctx);
return r;
}
@@ -778,23 +727,14 @@ struct amdgpu_ctx *amdgpu_ctx_get(struct amdgpu_fpriv *fpriv, uint32_t id)
mgr = &fpriv->ctx_mgr;
- mutex_lock(&mgr->lock);
- ctx = idr_find(&mgr->ctx_handles, id);
+ xa_lock(&mgr->ctx_handles);
+ ctx = xa_load(&mgr->ctx_handles, id);
if (ctx)
kref_get(&ctx->refcount);
- mutex_unlock(&mgr->lock);
+ xa_unlock(&mgr->ctx_handles);
return ctx;
}
-int amdgpu_ctx_put(struct amdgpu_ctx *ctx)
-{
- if (ctx == NULL)
- return -EINVAL;
-
- kref_put(&ctx->refcount, amdgpu_ctx_do_release);
- return 0;
-}
-
uint64_t amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx,
struct drm_sched_entity *entity,
struct dma_fence *fence)
@@ -928,8 +868,7 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr,
unsigned int i;
mgr->adev = adev;
- mutex_init(&mgr->lock);
- idr_init_base(&mgr->ctx_handles, 1);
+ xa_init_flags(&mgr->ctx_handles, XA_FLAGS_ALLOC1);
for (i = 0; i < AMDGPU_HW_IP_NUM; ++i)
atomic64_set(&mgr->time_spend[i], 0);
@@ -938,13 +877,13 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr,
long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout)
{
struct amdgpu_ctx *ctx;
- struct idr *idp;
- uint32_t id, i, j;
+ unsigned long id;
+ int i, j;
- idp = &mgr->ctx_handles;
-
- mutex_lock(&mgr->lock);
- idr_for_each_entry(idp, ctx, id) {
+ xa_lock(&mgr->ctx_handles);
+ xa_for_each(&mgr->ctx_handles, id, ctx) {
+ kref_get(&ctx->refcount);
+ xa_unlock(&mgr->ctx_handles);
for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
struct drm_sched_entity *entity;
@@ -956,45 +895,21 @@ long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout)
timeout = drm_sched_entity_flush(entity, timeout);
}
}
+ amdgpu_ctx_put(ctx);
+ xa_lock(&mgr->ctx_handles);
}
- mutex_unlock(&mgr->lock);
+ xa_unlock(&mgr->ctx_handles);
return timeout;
}
-static void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
+void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
{
struct amdgpu_ctx *ctx;
- struct idr *idp;
- uint32_t id, i, j;
-
- idp = &mgr->ctx_handles;
-
- idr_for_each_entry(idp, ctx, id) {
- if (kref_read(&ctx->refcount) != 1) {
- drm_err(adev_to_drm(mgr->adev), "ctx %p is still alive\n", ctx);
- continue;
- }
+ unsigned long id;
- for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
- for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
- struct drm_sched_entity *entity;
-
- if (!ctx->entities[i][j])
- continue;
-
- entity = &ctx->entities[i][j]->entity;
- drm_sched_entity_fini(entity);
- }
- }
- kref_put(&ctx->refcount, amdgpu_ctx_fini);
- }
-}
-
-void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
-{
- amdgpu_ctx_mgr_entity_fini(mgr);
- idr_destroy(&mgr->ctx_handles);
- mutex_destroy(&mgr->lock);
+ xa_for_each(&mgr->ctx_handles, id, ctx)
+ amdgpu_ctx_put(ctx);
+ xa_destroy(&mgr->ctx_handles);
}
void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
@@ -1002,21 +917,21 @@ void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
{
struct amdgpu_ctx *ctx;
unsigned int hw_ip, i;
- uint32_t id;
+ unsigned long id;
/*
* This is a little bit racy because it can be that a ctx or a fence are
* destroyed just in the moment we try to account them. But that is ok
* since exactly that case is explicitely allowed by the interface.
*/
- mutex_lock(&mgr->lock);
for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) {
uint64_t ns = atomic64_read(&mgr->time_spend[hw_ip]);
usage[hw_ip] = ns_to_ktime(ns);
}
- idr_for_each_entry(&mgr->ctx_handles, ctx, id) {
+ xa_lock(&mgr->ctx_handles);
+ xa_for_each(&mgr->ctx_handles, id, ctx) {
for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) {
for (i = 0; i < amdgpu_ctx_num_entities[hw_ip]; ++i) {
struct amdgpu_ctx_entity *centity;
@@ -1030,5 +945,5 @@ void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
}
}
}
- mutex_unlock(&mgr->lock);
+ xa_unlock(&mgr->ctx_handles);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
index e444b2088d40..a4b89eca4169 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -25,6 +25,7 @@
#include <linux/ktime.h>
#include <linux/types.h>
+#include <linux/xarray.h>
#include "amdgpu_ring.h"
@@ -60,16 +61,21 @@ struct amdgpu_ctx {
struct amdgpu_ctx_mgr {
struct amdgpu_device *adev;
- struct mutex lock;
- /* protected by lock */
- struct idr ctx_handles;
+ struct xarray ctx_handles;
atomic64_t time_spend[AMDGPU_HW_IP_NUM];
};
extern const unsigned int amdgpu_ctx_num_entities[AMDGPU_HW_IP_NUM];
struct amdgpu_ctx *amdgpu_ctx_get(struct amdgpu_fpriv *fpriv, uint32_t id);
-int amdgpu_ctx_put(struct amdgpu_ctx *ctx);
+
+void amdgpu_ctx_fini(struct kref *kref);
+
+static inline void amdgpu_ctx_put(struct amdgpu_ctx *ctx)
+{
+ if (ctx)
+ kref_put(&ctx->refcount, amdgpu_ctx_fini);
+}
int amdgpu_ctx_get_entity(struct amdgpu_ctx *ctx, u32 hw_ip, u32 instance,
u32 ring, struct drm_sched_entity **entity);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 389bad724273..0455c2cd043f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -26,6 +26,7 @@
#include <linux/kthread.h>
#include <linux/pci.h>
#include <linux/uaccess.h>
+#include <linux/security.h>
#include <linux/pm_runtime.h>
#include "amdgpu.h"
@@ -1739,6 +1740,12 @@ int amdgpu_debugfs_regs_init(struct amdgpu_device *adev)
struct dentry *ent, *root = minor->debugfs_root;
unsigned int i;
+ if (security_locked_down(LOCKDOWN_PCI_ACCESS)) {
+ drm_info(adev_to_drm(adev),
+ "amdgpu: HW debugfs nodes disabled (kernel lockdown)\n");
+ return 0;
+ }
+
for (i = 0; i < ARRAY_SIZE(debugfs_regs); i++) {
ent = debugfs_create_file(debugfs_regs_names[i],
S_IFREG | 0400, root,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
index e77db76b48b8..4fd0df3aa70d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
@@ -64,6 +64,7 @@ const char *hw_ip_names[MAX_HWIP] = {
[VCN1_HWIP] = "VCN1",
[VCE_HWIP] = "VCE",
[VPE_HWIP] = "VPE",
+ [UMSCH_HWIP] = "UMSCH",
[DF_HWIP] = "DF",
[DCE_HWIP] = "DCE",
[OSSSYS_HWIP] = "OSSSYS",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8d6502a94306..78c96c7102e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -74,6 +74,7 @@
#include "amdgpu_ras.h"
#include "amdgpu_ras_mgr.h"
#include "amdgpu_pmu.h"
+#include "amdgpu_smu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
@@ -2130,6 +2131,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
adev->cg_flags &= amdgpu_cg_mask;
adev->pg_flags &= amdgpu_pg_mask;
+ amdgpu_smu_early_init(adev);
+
return 0;
}
@@ -3677,6 +3680,10 @@ static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev)
amdgpu_pm_sysfs_fini(adev);
if (adev->ucode_sysfs_en)
amdgpu_ucode_sysfs_fini(adev);
+
+ amdgpu_discovery_sysfs_fini(adev);
+ amdgpu_preempt_mgr_sysfs_fini(adev);
+
amdgpu_device_attr_sysfs_fini(adev);
amdgpu_fru_sysfs_fini(adev);
@@ -3773,6 +3780,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
spin_lock_init(&adev->irq.lock);
+ amdgpu_early_init_rlc_reg_funcs(adev);
amdgpu_device_init_apu_flags(adev);
r = amdgpu_device_check_arguments(adev);
@@ -4208,6 +4216,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
if (adev->mman.initialized)
drain_workqueue(adev->mman.bdev.wq);
+
adev->shutdown = true;
unregister_pm_notifier(&adev->pm_nb);
@@ -4707,161 +4716,6 @@ exit:
}
/**
- * amdgpu_device_ip_check_soft_reset - did soft reset succeed
- *
- * @adev: amdgpu_device pointer
- *
- * The list of all the hardware IPs that make up the asic is walked and
- * the check_soft_reset callbacks are run. check_soft_reset determines
- * if the asic is still hung or not.
- * Returns true if any of the IPs are still in a hung state, false if not.
- */
-static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
-{
- int i;
- bool asic_hang = false;
-
- if (amdgpu_sriov_vf(adev))
- return true;
-
- if (amdgpu_asic_need_full_reset(adev))
- return true;
-
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
- continue;
- if (adev->ip_blocks[i].version->funcs->check_soft_reset)
- adev->ip_blocks[i].status.hang =
- adev->ip_blocks[i].version->funcs->check_soft_reset(
- &adev->ip_blocks[i]);
- if (adev->ip_blocks[i].status.hang) {
- dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
- asic_hang = true;
- }
- }
- return asic_hang;
-}
-
-/**
- * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
- *
- * @adev: amdgpu_device pointer
- *
- * The list of all the hardware IPs that make up the asic is walked and the
- * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
- * handles any IP specific hardware or software state changes that are
- * necessary for a soft reset to succeed.
- * Returns 0 on success, negative error code on failure.
- */
-static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
-{
- int i, r = 0;
-
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
- continue;
- if (adev->ip_blocks[i].status.hang &&
- adev->ip_blocks[i].version->funcs->pre_soft_reset) {
- r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
- if (r)
- return r;
- }
- }
-
- return 0;
-}
-
-/**
- * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
- *
- * @adev: amdgpu_device pointer
- *
- * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
- * reset is necessary to recover.
- * Returns true if a full asic reset is required, false if not.
- */
-static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
-{
- int i;
-
- if (amdgpu_asic_need_full_reset(adev))
- return true;
-
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
- continue;
- if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
- (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
- (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
- (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
- if (adev->ip_blocks[i].status.hang) {
- dev_info(adev->dev, "Some block need full reset!\n");
- return true;
- }
- }
- }
- return false;
-}
-
-/**
- * amdgpu_device_ip_soft_reset - do a soft reset
- *
- * @adev: amdgpu_device pointer
- *
- * The list of all the hardware IPs that make up the asic is walked and the
- * soft_reset callbacks are run if the block is hung. soft_reset handles any
- * IP specific hardware or software state changes that are necessary to soft
- * reset the IP.
- * Returns 0 on success, negative error code on failure.
- */
-static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
-{
- int i, r = 0;
-
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
- continue;
- if (adev->ip_blocks[i].status.hang &&
- adev->ip_blocks[i].version->funcs->soft_reset) {
- r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
- if (r)
- return r;
- }
- }
-
- return 0;
-}
-
-/**
- * amdgpu_device_ip_post_soft_reset - clean up from soft reset
- *
- * @adev: amdgpu_device pointer
- *
- * The list of all the hardware IPs that make up the asic is walked and the
- * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
- * handles any IP specific hardware or software state changes that are
- * necessary after the IP has been soft reset.
- * Returns 0 on success, negative error code on failure.
- */
-static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
-{
- int i, r = 0;
-
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
- continue;
- if (adev->ip_blocks[i].status.hang &&
- adev->ip_blocks[i].version->funcs->post_soft_reset)
- r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
- if (r)
- return r;
- }
-
- return 0;
-}
-
-/**
* amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
*
* @adev: amdgpu_device pointer
@@ -5152,20 +5006,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
if (!amdgpu_sriov_vf(adev)) {
-
- if (!need_full_reset)
- need_full_reset = amdgpu_device_ip_need_full_reset(adev);
-
- if (!need_full_reset && amdgpu_gpu_recovery &&
- amdgpu_device_ip_check_soft_reset(adev)) {
- amdgpu_device_ip_pre_soft_reset(adev);
- r = amdgpu_device_ip_soft_reset(adev);
- amdgpu_device_ip_post_soft_reset(adev);
- if (r || amdgpu_device_ip_check_soft_reset(adev)) {
- dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
- need_full_reset = true;
- }
- }
+ need_full_reset = true;
if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
dev_info(tmp_adev->dev, "Dumping IP State\n");
@@ -5618,8 +5459,7 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
drm_client_dev_suspend(adev_to_drm(tmp_adev));
/* disable ras on ALL IPs */
- if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) &&
- amdgpu_device_ip_need_full_reset(tmp_adev))
+ if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev))
amdgpu_ras_suspend(tmp_adev);
amdgpu_userq_pre_reset(tmp_adev);
@@ -6891,7 +6731,7 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
!amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
- size |= AMDGPU_RESET_TYPE_SOFT_RESET;
+ size |= AMDGPU_RESET_TYPE_SOFT_RECOVERY;
return size;
}
@@ -6907,8 +6747,8 @@ ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
}
- if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
- size += sysfs_emit_at(buf, size, "soft ");
+ if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RECOVERY)
+ size += sysfs_emit_at(buf, size, "soft_recovery ");
if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
size += sysfs_emit_at(buf, size, "queue ");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 853365dee2a7..a015d55aa158 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -22,6 +22,7 @@
*/
#include <linux/firmware.h>
+#include <linux/kernfs.h>
#include "amdgpu.h"
#include "amdgpu_discovery.h"
@@ -148,6 +149,26 @@ MODULE_FIRMWARE("amdgpu/aldebaran_ip_discovery.bin");
#define mmDRIVER_SCRATCH_1 0x95
#define mmDRIVER_SCRATCH_2 0x96
+struct ip_discovery_top {
+ struct kobject kobj;
+ struct kset die_kset;
+ struct pci_dev *pdev;
+ struct amdgpu_device *adev;
+ uint8_t *discovery_bin;
+ uint32_t bin_size;
+ bool standalone_mode;
+};
+
+/* List to track early-initialized ip_discovery_top entries */
+struct early_ip_discovery {
+ struct list_head list;
+ struct pci_dev *pdev;
+ struct ip_discovery_top *ip_top;
+};
+
+static LIST_HEAD(early_ip_discovery_list);
+static DEFINE_MUTEX(early_ip_discovery_mutex);
+
static const char *hw_id_names[HW_ID_MAX] = {
[MP1_HWID] = "MP1",
[MP2_HWID] = "MP2",
@@ -226,6 +247,7 @@ static const char *hw_id_names[HW_ID_MAX] = {
[XGBE_HWID] = "XGBE",
[MP0_HWID] = "MP0",
[VPE_HWID] = "VPE",
+ [UMSCH_HWID] = "UMSCH",
[ATU_HWID] = "ATU",
[AIGC_HWID] = "AIGC",
};
@@ -258,6 +280,7 @@ static int hw_id_map[MAX_HWIP] = {
[DCI_HWIP] = DCI_HWID,
[PCIE_HWIP] = PCIE_HWID,
[VPE_HWIP] = VPE_HWID,
+ [UMSCH_HWIP] = UMSCH_HWID,
[ISP_HWIP] = ISP_HWID,
[ATU_HWIP] = ATU_HWID,
};
@@ -542,25 +565,37 @@ static const char *amdgpu_discovery_get_fw_name(struct amdgpu_device *adev)
}
}
-static int amdgpu_discovery_get_table_info(struct amdgpu_device *adev,
- struct table_info **info,
- uint16_t table_id)
+static struct table_info *
+amdgpu_discovery_get_table_info_from_bin(uint8_t *discovery_bin,
+ uint16_t table_id)
{
- struct binary_header *bhdr =
- (struct binary_header *)adev->discovery.bin;
+ struct binary_header *bhdr = (struct binary_header *)discovery_bin;
struct binary_header_v2 *bhdrv2;
switch (bhdr->version_major) {
case 2:
- bhdrv2 = (struct binary_header_v2 *)adev->discovery.bin;
- *info = &bhdrv2->table_list[table_id];
- break;
+ bhdrv2 = (struct binary_header_v2 *)discovery_bin;
+ return &bhdrv2->table_list[table_id];
case 1:
case 0:
- *info = &bhdr->table_list[table_id];
- break;
+ return &bhdr->table_list[table_id];
default:
- dev_err(adev->dev, "Invalid ip discovery table version %d\n",bhdr->version_major);
+ return NULL;
+ }
+}
+
+static int amdgpu_discovery_get_table_info(struct amdgpu_device *adev,
+ struct table_info **info,
+ uint16_t table_id)
+{
+ struct binary_header *bhdr =
+ (struct binary_header *)adev->discovery.bin;
+
+ *info = amdgpu_discovery_get_table_info_from_bin(adev->discovery.bin,
+ table_id);
+ if (!*info) {
+ dev_err(adev->dev, "Invalid ip discovery table version %d\n",
+ bhdr->version_major);
return -EINVAL;
}
@@ -724,11 +759,11 @@ out:
return r;
}
-static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev);
-
void amdgpu_discovery_fini(struct amdgpu_device *adev)
{
- amdgpu_discovery_sysfs_fini(adev);
+ if (adev->discovery.ip_top && !adev->discovery.ip_top->standalone_mode)
+ amdgpu_discovery_sysfs_fini(adev);
+
kfree(adev->discovery.bin);
adev->discovery.bin = NULL;
}
@@ -737,15 +772,17 @@ static int amdgpu_discovery_validate_ip(struct amdgpu_device *adev,
uint8_t instance, uint16_t hw_id)
{
if (instance >= HWIP_MAX_INSTANCE) {
- dev_err(adev->dev,
- "Unexpected instance_number (%d) from ip discovery blob\n",
- instance);
+ if (adev)
+ dev_err(adev->dev,
+ "Unexpected instance_number (%d) from ip discovery blob\n",
+ instance);
return -EINVAL;
}
if (hw_id >= HW_ID_MAX) {
- dev_err(adev->dev,
- "Unexpected hw_id (%d) from ip discovery blob\n",
- hw_id);
+ if (adev)
+ dev_err(adev->dev,
+ "Unexpected hw_id (%d) from ip discovery blob\n",
+ hw_id);
return -EINVAL;
}
@@ -1111,12 +1148,6 @@ static const struct kobj_type ip_discovery_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
};
-struct ip_discovery_top {
- struct kobject kobj; /* ip_discovery/ */
- struct kset die_kset; /* ip_discovery/die/, contains ip_die_entry */
- struct amdgpu_device *adev;
-};
-
static void die_kobj_release(struct kobject *kobj)
{
struct ip_discovery_top *ip_top = container_of(to_kset(kobj),
@@ -1132,8 +1163,14 @@ static void ip_disc_release(struct kobject *kobj)
kobj);
struct amdgpu_device *adev = ip_top->adev;
+ /* In standalone mode, discovery_bin is a devm-managed allocation and is
+ * freed automatically when the driver detaches from the PCI device. Do
+ * not manually free it here to avoid a double-free.
+ */
+
kfree(ip_top);
- adev->discovery.ip_top = NULL;
+ if (adev)
+ adev->discovery.ip_top = NULL;
}
static uint8_t amdgpu_discovery_get_harvest_info(struct amdgpu_device *adev,
@@ -1141,6 +1178,10 @@ static uint8_t amdgpu_discovery_get_harvest_info(struct amdgpu_device *adev,
{
uint8_t harvest = 0;
+ /* In early init mode (adev == NULL), harvest info is not available */
+ if (!adev)
+ return 0;
+
/* Until a uniform way is figured, get mask based on hwid */
switch (hw_id) {
case VCN_HWID:
@@ -1169,11 +1210,14 @@ static uint8_t amdgpu_discovery_get_harvest_info(struct amdgpu_device *adev,
}
static int amdgpu_discovery_sysfs_ips(struct amdgpu_device *adev,
+ struct ip_discovery_top *ip_top,
struct ip_die_entry *ip_die_entry,
const size_t _ip_offset, const int num_ips,
bool reg_base_64)
{
- uint8_t *discovery_bin = adev->discovery.bin;
+ uint8_t *discovery_bin = ip_top->standalone_mode ?
+ ip_top->discovery_bin :
+ adev->discovery.bin;
int ii, jj, kk, res;
uint16_t hw_id;
uint8_t inst;
@@ -1270,10 +1314,12 @@ next_ip:
return 0;
}
-static int amdgpu_discovery_sysfs_recurse(struct amdgpu_device *adev)
+static int amdgpu_discovery_sysfs_recurse(struct amdgpu_device *adev,
+ struct ip_discovery_top *ip_top)
{
- struct ip_discovery_top *ip_top = adev->discovery.ip_top;
- uint8_t *discovery_bin = adev->discovery.bin;
+ uint8_t *discovery_bin = ip_top->standalone_mode ?
+ ip_top->discovery_bin :
+ adev->discovery.bin;
struct table_info *info;
struct ip_discovery_header *ihdr;
struct die_header *dhdr;
@@ -1282,9 +1328,10 @@ static int amdgpu_discovery_sysfs_recurse(struct amdgpu_device *adev)
size_t ip_offset;
int ii, res;
- res = amdgpu_discovery_get_table_info(adev, &info, IP_DISCOVERY);
- if (res)
- return res;
+ info = amdgpu_discovery_get_table_info_from_bin(discovery_bin,
+ IP_DISCOVERY);
+ if (!info)
+ return -EINVAL;
ihdr = (struct ip_discovery_header
*)(discovery_bin +
le16_to_cpu(info->offset));
@@ -1322,7 +1369,8 @@ static int amdgpu_discovery_sysfs_recurse(struct amdgpu_device *adev)
return res;
}
- amdgpu_discovery_sysfs_ips(adev, ip_die_entry, ip_offset, num_ips, !!ihdr->base_addr_64_bit);
+ amdgpu_discovery_sysfs_ips(adev, ip_top, ip_die_entry, ip_offset,
+ num_ips, !!ihdr->base_addr_64_bit);
}
return 0;
@@ -1338,12 +1386,30 @@ static int amdgpu_discovery_sysfs_init(struct amdgpu_device *adev)
if (!discovery_bin)
return -EINVAL;
+ /* If early init already created sysfs in standalone mode, skip normal init */
+ if (adev->discovery.ip_top && adev->discovery.ip_top->standalone_mode)
+ return 0;
+
ip_top = kzalloc_obj(*ip_top);
if (!ip_top)
return -ENOMEM;
ip_top->adev = adev;
- adev->discovery.ip_top = ip_top;
+
+ /* Check if ip_discovery already exists before creating.
+ * This shouldn't normally happen but handle it gracefully.
+ */
+ if (adev->dev->kobj.sd) {
+ struct kernfs_node *existing;
+
+ existing = kernfs_find_and_get(adev->dev->kobj.sd, "ip_discovery");
+ if (existing) {
+ kernfs_put(existing);
+ kfree(ip_top);
+ return 0;
+ }
+ }
+
res = kobject_init_and_add(&ip_top->kobj, &ip_discovery_ktype,
&adev->dev->kobj, "ip_discovery");
if (res) {
@@ -1351,6 +1417,8 @@ static int amdgpu_discovery_sysfs_init(struct amdgpu_device *adev)
goto Err;
}
+ adev->discovery.ip_top = ip_top;
+
die_kset = &ip_top->die_kset;
kobject_set_name(&die_kset->kobj, "%s", "die");
die_kset->kobj.parent = &ip_top->kobj;
@@ -1365,7 +1433,7 @@ static int amdgpu_discovery_sysfs_init(struct amdgpu_device *adev)
ip_hw_instance_attrs[ii] = &ip_hw_attr[ii].attr;
ip_hw_instance_attrs[ii] = NULL;
- res = amdgpu_discovery_sysfs_recurse(adev);
+ res = amdgpu_discovery_sysfs_recurse(adev, ip_top);
return res;
Err:
@@ -1412,7 +1480,7 @@ static void amdgpu_discovery_sysfs_die_free(struct ip_die_entry *ip_die_entry)
kobject_put(&ip_die_entry->ip_kset.kobj);
}
-static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev)
+void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev)
{
struct ip_discovery_top *ip_top = adev->discovery.ip_top;
struct list_head *el, *tmp;
@@ -1421,6 +1489,16 @@ static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev)
if (!ip_top)
return;
+ /*
+ * In standalone mode the sysfs hierarchy is tied to the PCI device
+ * lifetime and is torn down by amdgpu_discovery_sysfs_early_fini().
+ * Freeing it here would leave a dangling pointer in the early
+ * discovery list, causing a use-after-free on driver unbind.
+ */
+ if (ip_top->standalone_mode)
+ return;
+
+ adev->discovery.ip_top = NULL;
die_kset = &ip_top->die_kset;
spin_lock(&die_kset->list_lock);
list_for_each_prev_safe(el, tmp, &die_kset->list) {
@@ -1479,6 +1557,150 @@ void amdgpu_discovery_dump(struct amdgpu_device *adev, struct drm_printer *p)
spin_unlock(&die_kset->list_lock);
}
+int amdgpu_discovery_sysfs_early_init(struct amdgpu_device *adev, struct pci_dev *pdev)
+{
+ struct ip_discovery_top *ip_top;
+ struct early_ip_discovery *early_entry, *tmp;
+ struct kset *die_kset;
+ uint8_t *discovery_bin;
+ int res, ii;
+
+ if (!adev || !adev->discovery.bin)
+ return -EINVAL;
+
+ if (adev->discovery.ip_top)
+ return 0;
+
+ mutex_lock(&early_ip_discovery_mutex);
+ list_for_each_entry_safe(early_entry, tmp, &early_ip_discovery_list, list) {
+ if (early_entry->pdev == pdev) {
+ adev->discovery.ip_top = early_entry->ip_top;
+ early_entry->ip_top->adev = adev;
+ mutex_unlock(&early_ip_discovery_mutex);
+ return 0;
+ }
+ }
+ mutex_unlock(&early_ip_discovery_mutex);
+
+ discovery_bin = adev->discovery.bin;
+
+ early_entry = kzalloc(sizeof(*early_entry), GFP_KERNEL);
+ if (!early_entry)
+ return -ENOMEM;
+
+ ip_top = kzalloc(sizeof(*ip_top), GFP_KERNEL);
+ if (!ip_top) {
+ kfree(early_entry);
+ return -ENOMEM;
+ }
+
+ ip_top->discovery_bin = devm_kmemdup(&pdev->dev, discovery_bin,
+ DISCOVERY_TMR_SIZE, GFP_KERNEL);
+ if (!ip_top->discovery_bin) {
+ kfree(ip_top);
+ kfree(early_entry);
+ return -ENOMEM;
+ }
+
+ ip_top->bin_size = DISCOVERY_TMR_SIZE;
+ ip_top->pdev = pdev;
+ ip_top->adev = adev;
+ ip_top->standalone_mode = true;
+
+ /* Check if ip_discovery already exists (from previous probe attempt).
+ * This can happen if the module was unloaded and reloaded but the
+ * sysfs persisted (tied to PCI device lifetime).
+ */
+ if (pdev->dev.kobj.sd) {
+ struct kernfs_node *existing;
+
+ existing = kernfs_find_and_get(pdev->dev.kobj.sd, "ip_discovery");
+ if (existing) {
+ kernfs_put(existing);
+ kfree(ip_top);
+ kfree(early_entry);
+ return 0;
+ }
+ }
+
+ res = kobject_init_and_add(&ip_top->kobj, &ip_discovery_ktype,
+ &pdev->dev.kobj, "ip_discovery");
+ if (res)
+ goto err_put_kobj;
+
+ adev->discovery.ip_top = ip_top;
+
+ die_kset = &ip_top->die_kset;
+ kobject_set_name(&die_kset->kobj, "%s", "die");
+ die_kset->kobj.parent = &ip_top->kobj;
+ die_kset->kobj.ktype = &die_kobj_ktype;
+ res = kset_register(&ip_top->die_kset);
+ if (res)
+ goto err_put_die_kset;
+
+ for (ii = 0; ii < ARRAY_SIZE(ip_hw_attr); ii++)
+ ip_hw_instance_attrs[ii] = &ip_hw_attr[ii].attr;
+ ip_hw_instance_attrs[ii] = NULL;
+
+ res = amdgpu_discovery_sysfs_recurse(NULL, ip_top);
+ if (res)
+ goto err_put_die_kset;
+
+ early_entry->pdev = pdev;
+ early_entry->ip_top = ip_top;
+ mutex_lock(&early_ip_discovery_mutex);
+ list_add(&early_entry->list, &early_ip_discovery_list);
+ mutex_unlock(&early_ip_discovery_mutex);
+
+ return 0;
+
+err_put_die_kset:
+ kobject_put(&ip_top->die_kset.kobj);
+err_put_kobj:
+ kobject_put(&ip_top->kobj);
+ kfree(early_entry);
+ adev->discovery.ip_top = NULL;
+ return res;
+}
+
+void amdgpu_discovery_sysfs_early_fini(struct pci_dev *pdev)
+{
+ struct early_ip_discovery *entry, *tmp_entry;
+ struct ip_discovery_top *ip_top = NULL;
+ struct list_head *el, *tmp;
+ struct kset *die_kset;
+
+ /* Find the entry in our tracking list */
+ mutex_lock(&early_ip_discovery_mutex);
+ list_for_each_entry_safe(entry, tmp_entry, &early_ip_discovery_list, list) {
+ if (entry->pdev == pdev) {
+ ip_top = entry->ip_top;
+ list_del(&entry->list);
+ kfree(entry);
+ break;
+ }
+ }
+ mutex_unlock(&early_ip_discovery_mutex);
+
+ if (!ip_top)
+ return;
+
+ /* Clean up sysfs hierarchy */
+ die_kset = &ip_top->die_kset;
+
+ spin_lock(&die_kset->list_lock);
+ list_for_each_prev_safe(el, tmp, &die_kset->list) {
+ list_del_init(el);
+ spin_unlock(&die_kset->list_lock);
+ amdgpu_discovery_sysfs_die_free(to_ip_die_entry(list_to_kobj(el)));
+ spin_lock(&die_kset->list_lock);
+ }
+ spin_unlock(&die_kset->list_lock);
+
+ kobject_put(&ip_top->die_kset.kobj);
+ kobject_put(&ip_top->kobj);
+ /* ip_top itself will be freed by kobject_put via ip_disc_release */
+}
/* ================================================== */
@@ -1504,6 +1726,9 @@ static int amdgpu_discovery_reg_base_init(struct amdgpu_device *adev)
r = amdgpu_discovery_init(adev);
if (r)
return r;
+
+ amdgpu_discovery_sysfs_early_init(adev, adev->pdev);
+
discovery_bin = adev->discovery.bin;
wafl_ver = 0;
adev->gfx.xcc_mask = 0;
@@ -2636,7 +2861,12 @@ static int amdgpu_discovery_set_mm_ip_blocks(struct amdgpu_device *adev)
return -EINVAL;
}
} else {
- switch (amdgpu_ip_version(adev, UVD_HWIP, 0)) {
+ uint32_t vcn_version = amdgpu_ip_version(adev, UVD_HWIP, 0);
+
+ /* no VCN discovered; nothing to add */
+ if (!vcn_version)
+ return 0;
+ switch (vcn_version) {
case IP_VERSION(1, 0, 0):
case IP_VERSION(1, 0, 1):
amdgpu_device_ip_block_add(adev, &vcn_v1_0_ip_block);
@@ -2704,7 +2934,7 @@ static int amdgpu_discovery_set_mm_ip_blocks(struct amdgpu_device *adev)
default:
dev_err(adev->dev,
"Failed to add vcn/jpeg ip block(UVD_HWIP:0x%x)\n",
- amdgpu_ip_version(adev, UVD_HWIP, 0));
+ vcn_version);
return -EINVAL;
}
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h
index e0010f6a3eda..5b2b16f68576 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h
@@ -41,6 +41,7 @@ struct amdgpu_discovery_info {
bool reserve_tmr;
};
+void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev);
void amdgpu_discovery_fini(struct amdgpu_device *adev);
int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev);
@@ -53,4 +54,9 @@ int amdgpu_discovery_get_gc_major_minor_version(struct amdgpu_device *adev,
void amdgpu_discovery_dump(struct amdgpu_device *adev, struct drm_printer *p);
+/* Early sysfs functions for persistent ip_discovery export */
+int amdgpu_discovery_sysfs_early_init(struct amdgpu_device *adev,
+ struct pci_dev *pdev);
+void amdgpu_discovery_sysfs_early_fini(struct pci_dev *pdev);
+
#endif /* __AMDGPU_DISCOVERY__ */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 4c0c77eafbd1..ad631ad31899 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -33,7 +33,6 @@
#include <drm/drm_vblank.h>
#include <linux/cc_platform.h>
-#include <linux/console.h>
#include <linux/dynamic_debug.h>
#include <linux/module.h>
#include <linux/mmu_notifier.h>
@@ -146,7 +145,9 @@ enum AMDGPU_DEBUG_MASK {
AMDGPU_DEBUG_SMU_POOL = BIT(7),
AMDGPU_DEBUG_VM_USERPTR = BIT(8),
AMDGPU_DEBUG_DISABLE_RAS_CE_LOG = BIT(9),
- AMDGPU_DEBUG_ENABLE_CE_CS = BIT(10)
+ AMDGPU_DEBUG_ENABLE_CE_CS = BIT(10),
+ AMDGPU_DEBUG_HIBERNATION_THAW_RESUME_GPU = BIT(11),
+ AMDGPU_DEBUG_DISABLE_IP_BLOCK_SOFT_RESET = BIT(12),
};
unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -1939,6 +1940,7 @@ static const struct pci_device_id pciidlist[] = {
{0x1002, 0x6646, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE|AMD_IS_MOBILITY},
{0x1002, 0x6647, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE|AMD_IS_MOBILITY},
{0x1002, 0x6649, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE},
+ {0x1002, 0x664D, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE},
{0x1002, 0x6650, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE},
{0x1002, 0x6651, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE},
{0x1002, 0x6658, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE},
@@ -2008,6 +2010,7 @@ static const struct pci_device_id pciidlist[] = {
{0x1002, 0x6930, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TONGA},
{0x1002, 0x6938, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TONGA},
{0x1002, 0x6939, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TONGA},
+ {0x1002, 0x693B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TONGA},
/* fiji */
{0x1002, 0x7300, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_FIJI},
{0x1002, 0x730F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_FIJI},
@@ -2036,6 +2039,7 @@ static const struct pci_device_id pciidlist[] = {
{0x1002, 0x67C4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
{0x1002, 0x67C7, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
{0x1002, 0x67D0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
+ {0x1002, 0x67D4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
{0x1002, 0x67DF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
{0x1002, 0x67C8, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
{0x1002, 0x67C9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
@@ -2049,6 +2053,7 @@ static const struct pci_device_id pciidlist[] = {
{0x1002, 0x6985, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
{0x1002, 0x6986, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
{0x1002, 0x6987, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
+ {0x1002, 0x698F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
{0x1002, 0x6995, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
{0x1002, 0x6997, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
{0x1002, 0x699F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
@@ -2250,7 +2255,7 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
}
if (amdgpu_debug_mask & AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY) {
- pr_info("debug: soft reset for GPU recovery disabled\n");
+ pr_info("debug: soft recovery disabled\n");
adev->debug_disable_soft_recovery = true;
}
@@ -2291,6 +2296,16 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
pr_info("debug: allowing command submission to CE engine\n");
adev->debug_enable_ce_cs = true;
}
+
+ if (amdgpu_debug_mask & AMDGPU_DEBUG_HIBERNATION_THAW_RESUME_GPU) {
+ pr_info("debug: resume gpu in thaw() of hibernation\n");
+ adev->debug_hibernation_thaw_resume_gpu = true;
+ }
+
+ if (amdgpu_debug_mask & AMDGPU_DEBUG_DISABLE_IP_BLOCK_SOFT_RESET) {
+ pr_info("debug: IP block soft reset disabled\n");
+ adev->debug_disable_ip_block_soft_reset = true;
+ }
}
static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
@@ -2552,6 +2567,8 @@ amdgpu_pci_remove(struct pci_dev *pdev)
amdgpu_driver_unload_kms(dev);
+ amdgpu_discovery_sysfs_early_fini(pdev);
+
/*
* Flush any in flight DMA operations from device.
* Clear the Bus Master Enable bit and then wait on the PCIe Device
@@ -2705,9 +2722,10 @@ static int amdgpu_pmops_freeze(struct device *dev)
static int amdgpu_pmops_thaw(struct device *dev)
{
struct drm_device *drm_dev = dev_get_drvdata(dev);
+ struct amdgpu_device *adev = drm_to_adev(drm_dev);
/* do not resume device if it's normal hibernation */
- if (console_suspend_enabled &&
+ if (!adev->debug_hibernation_thaw_resume_gpu &&
!pm_hibernate_is_recovering() &&
!pm_hibernation_mode_is_suspend())
return 0;
@@ -3076,6 +3094,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES, amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(AMDGPU_PROC_OPTIONS, amdgpu_proc_options_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
};
static const struct drm_driver amdgpu_kms_driver = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index ea69b1bac7c6..3043ad041bb4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -727,6 +727,15 @@ void amdgpu_ring_set_fence_errors_and_reemit(struct amdgpu_ring *ring,
last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask;
seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
+ /* If there is nothing to reemit, return early and set an error on the fence
+ * if applicable. If all of the fences are signalled, this will be a nop.
+ * If there are still fences and ring_backup_entries_to_copy is 0, then
+ * we are skipping the reemit on purpose.
+ */
+ if (!ring->ring_backup_entries_to_copy) {
+ amdgpu_fence_driver_force_completion(ring, &guilty_fence->base);
+ return;
+ }
ring->reemit = true;
amdgpu_ring_alloc(ring, ring->ring_backup_entries_to_copy);
spin_lock_irqsave(&ring->fence_drv.lock, flags);
@@ -741,7 +750,8 @@ void amdgpu_ring_set_fence_errors_and_reemit(struct amdgpu_ring *ring,
if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {
fence = container_of(unprocessed, struct amdgpu_fence, base);
is_guilty_fence = fence == guilty_fence;
- is_guilty_context = fence->context == guilty_fence->context;
+ is_guilty_context = guilty_fence ?
+ (fence->context == guilty_fence->context) : false;
/* mark all fences from the guilty context with an error */
if (is_guilty_fence)
@@ -794,6 +804,17 @@ void amdgpu_ring_backup_unprocessed_commands(struct amdgpu_ring *ring,
seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
ring->ring_backup_entries_to_copy = 0;
+ /* If we've already seen this fence, return early.
+ * ring->ring_backup_entries_to_copy is set to 0 so
+ * the reemit helper will return early as well to
+ * avoid getting stuck in a reemit loop.
+ */
+ if (ring->guilty_fence == guilty_fence) {
+ ring->guilty_fence = NULL;
+ return;
+ }
+ ring->guilty_fence = guilty_fence;
+
do {
last_seq++;
last_seq &= ring->fence_drv.num_fences_mask;
@@ -811,6 +832,36 @@ void amdgpu_ring_backup_unprocessed_commands(struct amdgpu_ring *ring,
} while (last_seq != seq);
}
+struct amdgpu_fence *
+amdgpu_ring_find_guilty_fence(struct amdgpu_ring *ring)
+{
+ struct dma_fence *unprocessed;
+ struct dma_fence __rcu **ptr;
+ struct amdgpu_fence *fence;
+ u32 seq, last_seq;
+
+ last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask;
+ seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
+
+ do {
+ last_seq++;
+ last_seq &= ring->fence_drv.num_fences_mask;
+
+ ptr = &ring->fence_drv.fences[last_seq];
+ rcu_read_lock();
+ unprocessed = rcu_dereference(*ptr);
+
+ if (unprocessed && !dma_fence_is_signaled(unprocessed)) {
+ fence = container_of(unprocessed, struct amdgpu_fence, base);
+ rcu_read_unlock();
+ return fence;
+ }
+ rcu_read_unlock();
+ } while (last_seq != seq);
+
+ return NULL;
+}
+
/*
* Common fence implementation
*/
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 85372af1216d..96c9d4f00b27 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -34,6 +34,7 @@
#include "amdgpu_xcp.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_mes.h"
+#include "mes_userqueue.h"
#include "nvd.h"
/* delay 0.1 second to enable gfx off feature */
@@ -377,6 +378,30 @@ int amdgpu_gfx_kiq_init(struct amdgpu_device *adev,
return 0;
}
+static void amdgpu_gfx_mqd_reset_restore(struct amdgpu_ring *ring)
+{
+ struct amdgpu_device *adev = ring->adev;
+ int mqd_idx, mqd_size;
+
+ /* restore mqd with the backup copy */
+ if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) {
+ mqd_idx = ring - &adev->gfx.compute_ring[0];
+ mqd_size = adev->mqds[AMDGPU_HW_IP_COMPUTE].mqd_size;
+ if (adev->gfx.mec.mqd_backup[mqd_idx])
+ memcpy_toio(ring->mqd_ptr, adev->gfx.mec.mqd_backup[mqd_idx], mqd_size);
+ } else if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
+ mqd_size = adev->mqds[AMDGPU_HW_IP_GFX].mqd_size;
+ mqd_idx = ring - &adev->gfx.gfx_ring[0];
+
+ if (adev->gfx.me.mqd_backup[mqd_idx])
+ memcpy_toio(ring->mqd_ptr, adev->gfx.me.mqd_backup[mqd_idx], mqd_size);
+ }
+ /* reset the ring */
+ ring->wptr = 0;
+ atomic64_set((atomic64_t *)ring->wptr_cpu_addr, 0);
+ amdgpu_ring_clear_ring(ring);
+}
+
/* create MQD for each compute/gfx queue */
int amdgpu_gfx_mqd_sw_init(struct amdgpu_device *adev,
unsigned int mqd_size, int xcc_id)
@@ -1964,6 +1989,60 @@ static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset);
}
+static int amdgpu_gfx_mes_reset_queue_start(struct amdgpu_ring *ring,
+ unsigned int vmid,
+ struct amdgpu_fence *timedout_fence,
+ bool use_mmio)
+{
+ struct amdgpu_device *adev = ring->adev;
+ bool reinit_queue;
+ int r;
+
+ if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) &&
+ adev->mes.compute_pipe_reset_enabled)
+ reinit_queue = true;
+ else if ((ring->funcs->type == AMDGPU_RING_TYPE_GFX) &&
+ adev->mes.gfx_pipe_reset_enabled)
+ reinit_queue = true;
+ else
+ reinit_queue = use_mmio;
+
+ amdgpu_ring_reset_helper_begin(ring, timedout_fence);
+
+ r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, use_mmio, 0);
+ if (r)
+ return r;
+
+ if (reinit_queue) {
+ r = amdgpu_mes_unmap_legacy_queue(adev, ring,
+ RESET_QUEUES, 0, 0, 0);
+ if (r)
+ return r;
+ amdgpu_gfx_mqd_reset_restore(ring);
+
+ r = amdgpu_mes_map_legacy_queue(adev, ring, 0);
+ if (r) {
+ dev_err(adev->dev, "failed to remap kgq\n");
+ return r;
+ }
+ }
+ return 0;
+}
+
+int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring,
+ unsigned int vmid,
+ struct amdgpu_fence *timedout_fence,
+ bool use_mmio)
+{
+ int r;
+
+ r = amdgpu_gfx_mes_reset_queue_start(ring, vmid, timedout_fence,
+ use_mmio);
+ if (r)
+ return r;
+ return amdgpu_ring_reset_helper_end(ring, timedout_fence);
+}
+
static DEVICE_ATTR(run_cleaner_shader, 0200,
NULL, amdgpu_gfx_set_run_cleaner_shader);
@@ -2122,6 +2201,200 @@ void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev)
}
}
+/* Restart the DRM scheduler work queue of every compute ring except @guilty_ring. */
+static void amdgpu_gfx_reset_start_compute_scheds(struct amdgpu_device *adev,
+						  struct amdgpu_ring *guilty_ring)
+{
+	int idx;
+
+	for (idx = 0; idx < adev->gfx.num_compute_rings; idx++) {
+		struct amdgpu_ring *cur = &adev->gfx.compute_ring[idx];
+
+		if (cur != guilty_ring)
+			drm_sched_wqueue_start(&cur->sched);
+	}
+}
+
+/* Stop the DRM scheduler work queue of every compute ring except @guilty_ring. */
+static void amdgpu_gfx_reset_stop_compute_scheds(struct amdgpu_device *adev,
+						 struct amdgpu_ring *guilty_ring)
+{
+	int idx;
+
+	for (idx = 0; idx < adev->gfx.num_compute_rings; idx++) {
+		struct amdgpu_ring *cur = &adev->gfx.compute_ring[idx];
+
+		if (cur != guilty_ring)
+			drm_sched_wqueue_stop(&cur->sched);
+	}
+}
+
+/*
+ * Look for a compute ring (other than @guilty_ring) whose doorbell index
+ * equals the MES-reported hung doorbell @db and start its reset.
+ *
+ * On a match, *out_ring and *out_fence receive the ring and its guilty
+ * fence so the caller can run the reset end phase later, after MES has
+ * resumed all gangs. Without a match both are left NULL.
+ *
+ * Returns 0 when no ring matched or the reset start succeeded, otherwise
+ * the error from amdgpu_gfx_mes_reset_queue_start().
+ */
+static int amdgpu_gfx_reset_mes_kcq(struct amdgpu_device *adev,
+				    struct amdgpu_ring *guilty_ring,
+				    unsigned int db,
+				    struct amdgpu_ring **out_ring,
+				    struct amdgpu_fence **out_fence)
+{
+	bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
+	int idx;
+
+	*out_ring = NULL;
+	*out_fence = NULL;
+
+	for (idx = 0; idx < adev->gfx.num_compute_rings; idx++) {
+		struct amdgpu_ring *cand = &adev->gfx.compute_ring[idx];
+		struct amdgpu_fence *guilty;
+		int ret;
+
+		if (cand == guilty_ring || cand->doorbell_index != db)
+			continue;
+
+		guilty = amdgpu_ring_find_guilty_fence(cand);
+		ret = amdgpu_gfx_mes_reset_queue_start(cand, 0, guilty,
+						       use_mmio);
+		if (ret)
+			return ret;
+
+		/* Only the first matching ring is handled. */
+		*out_ring = cand;
+		*out_fence = guilty;
+		return 0;
+	}
+
+	return 0;
+}
+
+/**
+ * amdgpu_gfx_reset_mes_compute - reset hung compute queues through MES
+ *
+ * @adev: amdgpu_device pointer
+ * @ring: kernel compute ring the reset came from, or NULL
+ * @guilty_fence: fence passed to the reset helpers for @ring
+ * @uq: user queue to reset via mes_userq_reset(), or NULL
+ * @hung_queue_count: if non-NULL, receives the number of hung queues
+ * reported by amdgpu_mes_detect_and_reset_hung_queues(); it is not
+ * written when amdgpu_mes_suspend() succeeds
+ * @faulty_queue_input: optional struct mes_remove_queue_input used to
+ * remove a known bad non-KCQ queue when MES reports no hung queue
+ *
+ * Runs under adev->gfx.mec.reset_mutex. The other compute schedulers are
+ * stopped, MES is suspended, the affected queues are reset, MES is
+ * resumed and only then is amdgpu_ring_reset_helper_end() called for each
+ * kernel ring that was reset. The schedulers are restarted only when no
+ * error was recorded.
+ *
+ * NOTE(review): n_deferred is never checked against the size of
+ * deferred_end[]; this presumably relies on every hung doorbell matching
+ * a distinct compute ring - confirm.
+ *
+ * Return: when @ring is set, the first error among the reset steps and
+ * the deferred reset end calls; otherwise the result of the reset steps.
+ */
+int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
+ struct amdgpu_ring *ring,
+ struct amdgpu_fence *guilty_fence,
+ struct amdgpu_usermode_queue *uq,
+ unsigned int *hung_queue_count,
+ void *faulty_queue_input)
+{
+ struct amdgpu_mes_hung_queue_hqd_info *hqd_info =
+ (struct amdgpu_mes_hung_queue_hqd_info *)
+ &adev->gfx.mec.mes_hung_db_array[adev->mes.hung_queue_hqd_info_offset];
+ int i, r, pipe, queue, queue_type;
+ unsigned int num_hung = 0;
+ bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
+ struct mes_remove_queue_input *queue_input = (struct mes_remove_queue_input *)faulty_queue_input;
+ /* ring/fence pairs whose reset end is deferred until after MES resume */
+ struct amdgpu_gfx_deferred_entry deferred_end[AMDGPU_MAX_COMPUTE_RINGS + 1];
+ int n_deferred = 0;
+ int ring_err;
+
+ guard(mutex)(&adev->gfx.mec.reset_mutex);
+ /* stop the drm schedulers for all compute queues except the guilty ring */
+ amdgpu_gfx_reset_stop_compute_scheds(adev, ring);
+ /* suspend all will determine which queues are hung.
+ * reset detect will return the array of bad queue doorbells
+ */
+ r = amdgpu_mes_suspend(adev, 0);
+ /* if suspend all succeeded, there should be no hung queue */
+ if (!r)
+ /* always reset the KCQ/userq since we need to signal the fence
+ * and we could be stuck in a loop which is preemptable.
+ */
+ goto fence_reset;
+ r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE,
+ true, &num_hung, adev->gfx.mec.mes_hung_db_array, 0);
+ if (r)
+ goto out;
+ if (hung_queue_count)
+ *hung_queue_count = num_hung;
+
+fence_reset:
+ /* reset the queue this came from if specified */
+ if (ring) {
+ r = amdgpu_gfx_mes_reset_queue_start(ring, 0, guilty_fence,
+ use_mmio);
+ if (r)
+ goto out;
+ deferred_end[n_deferred].ring = ring;
+ deferred_end[n_deferred].fence = guilty_fence;
+ n_deferred++;
+ }
+ if (uq) {
+ r = mes_userq_reset(uq);
+ if (r)
+ goto out;
+ }
+ /* walk every hung doorbell reported by MES */
+ for (i = 0; i < num_hung; i++) {
+ struct amdgpu_ring *hr = NULL;
+ struct amdgpu_fence *hf = NULL;
+
+ pipe = hqd_info[i].pipe_index;
+ queue = hqd_info[i].queue_index;
+ queue_type = hqd_info[i].queue_type;
+
+ /* reset any KCQs */
+ r = amdgpu_gfx_reset_mes_kcq(adev, ring,
+ adev->gfx.mec.mes_hung_db_array[i],
+ &hr, &hf);
+ if (r)
+ goto out;
+ if (hr) {
+ deferred_end[n_deferred].ring = hr;
+ deferred_end[n_deferred].fence = hf;
+ n_deferred++;
+ }
+ /* reset any KFD queues */
+ r = amdgpu_amdkfd_reset_mes_queue(adev, 0, queue_type, pipe, queue,
+ adev->gfx.mec.mes_hung_db_array[i]);
+ if (r)
+ goto out;
+ /* reset KGD user queues */
+ r = mes_userq_reset_queue(adev, uq, queue_type, pipe, queue,
+ adev->gfx.mec.mes_hung_db_array[i]);
+ if (r)
+ goto out;
+ }
+
+ /* MES doesn't detect any hung queue but we have a known bad queue
+ * and it is not KCQ
+ */
+ if (!num_hung && queue_input && !ring) {
+ /* A successful MES suspend_all means this bad queue was
+ * preempted successfully. Remove it before resume all so it
+ * doesn't get mapped back
+ */
+ if (!down_read_trylock(&adev->reset_domain->sem)) {
+ r = -EIO;
+ goto out;
+ }
+ amdgpu_mes_lock(&adev->mes);
+ r = adev->mes.funcs->remove_hw_queue(&adev->mes, queue_input);
+ amdgpu_mes_unlock(&adev->mes);
+ up_read(&adev->reset_domain->sem);
+ }
+
+out:
+ /* resume all will enable the non-hung queues */
+ amdgpu_mes_resume(adev, 0);
+
+ /* Now CP is running again — replay backed-up commands and ring
+ * doorbells on each reset queue.
+ */
+ ring_err = r;
+ for (i = 0; i < n_deferred; i++) {
+ int er = amdgpu_ring_reset_helper_end(deferred_end[i].ring,
+ deferred_end[i].fence);
+
+ /* keep the first error; later ones do not overwrite it */
+ if (er && !ring_err)
+ ring_err = er;
+ }
+
+ if (!ring_err)
+ amdgpu_gfx_reset_start_compute_scheds(adev, ring);
+
+ /* If this reset is triggered by non-KCQ, the KCQ result after resume must
+ * not override the reset result; otherwise a false reset failure is returned
+ * to the non-KCQ caller
+ */
+ return ring ? ring_err : r;
+}
+
int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev,
unsigned int cleaner_shader_size)
{
@@ -2460,9 +2733,8 @@ void amdgpu_gfx_profile_ring_begin_use(struct amdgpu_ring *ring)
else
profile = PP_SMC_POWER_PROFILE_COMPUTE;
- atomic_inc(&adev->gfx.total_submission_cnt);
-
- cancel_delayed_work_sync(&adev->gfx.idle_work);
+ if (!atomic_fetch_inc(&adev->gfx.total_submission_cnt))
+ cancel_delayed_work_sync(&adev->gfx.idle_work);
/* We can safely return early here because we've cancelled the
* the delayed work so there is no one else to set it to false
@@ -2490,9 +2762,9 @@ void amdgpu_gfx_profile_ring_end_use(struct amdgpu_ring *ring)
if (amdgpu_dpm_is_overdrive_enabled(adev))
return;
- atomic_dec(&ring->adev->gfx.total_submission_cnt);
-
- schedule_delayed_work(&ring->adev->gfx.idle_work, GFX_PROFILE_IDLE_TIMEOUT);
+ if (atomic_dec_and_test(&ring->adev->gfx.total_submission_cnt))
+ schedule_delayed_work(&ring->adev->gfx.idle_work,
+ GFX_PROFILE_IDLE_TIMEOUT);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 54c1eb9c499b..aefd4f03b443 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -36,6 +36,8 @@
#include "amdgpu_ring_mux.h"
#include "amdgpu_xcp.h"
+struct amdgpu_usermode_queue;
+
/* GFX current status */
#define AMDGPU_GFX_NORMAL_MODE 0x00000000L
#define AMDGPU_GFX_SAFE_MODE 0x00000001L
@@ -116,6 +118,9 @@ struct amdgpu_mec {
u32 num_pipe_per_mec;
u32 num_queue_per_pipe;
void *mqd_backup[AMDGPU_MAX_COMPUTE_RINGS * AMDGPU_MAX_GC_INSTANCES];
+ bool use_mmio_for_reset;
+ u32 *mes_hung_db_array;
+ struct mutex reset_mutex;
};
struct amdgpu_mec_bitmap {
@@ -401,6 +406,7 @@ struct amdgpu_me {
uint32_t num_pipe_per_me;
uint32_t num_queue_per_pipe;
void *mqd_backup[AMDGPU_MAX_GFX_RINGS];
+ bool use_mmio_for_reset;
/* These are the resources for which amdgpu takes ownership */
DECLARE_BITMAP(queue_bitmap, AMDGPU_MAX_GFX_QUEUES);
@@ -479,8 +485,6 @@ struct amdgpu_gfx {
const struct amdgpu_gfx_funcs *funcs;
/* reset mask */
- uint32_t grbm_soft_reset;
- uint32_t srbm_soft_reset;
uint32_t gfx_supported_reset;
uint32_t compute_supported_reset;
@@ -543,6 +547,11 @@ struct amdgpu_gfx {
bool disable_uq;
};
+/*
+ * A ring/fence pair recorded by amdgpu_gfx_reset_mes_compute() so that
+ * amdgpu_ring_reset_helper_end() can be called for it after
+ * amdgpu_mes_resume().
+ */
+struct amdgpu_gfx_deferred_entry {
+ struct amdgpu_ring *ring;
+ struct amdgpu_fence *fence;
+};
+
struct amdgpu_gfx_ras_reg_entry {
struct amdgpu_ras_err_status_reg_entry reg_entry;
enum amdgpu_gfx_ras_mem_id_type mem_id_type;
@@ -641,6 +650,12 @@ int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
bool amdgpu_gfx_is_master_xcc(struct amdgpu_device *adev, int xcc_id);
int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev);
void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev);
+int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
+ struct amdgpu_ring *ring,
+ struct amdgpu_fence *guilty_fence,
+ struct amdgpu_usermode_queue *uq,
+ unsigned int *hung_queue_count,
+ void *faulty_queue_input);
void amdgpu_gfx_ras_error_func(struct amdgpu_device *adev,
void *ras_error_status,
void (*func)(struct amdgpu_device *adev, void *ras_error_status,
@@ -667,6 +682,11 @@ void amdgpu_debugfs_compute_sched_mask_init(struct amdgpu_device *adev);
int amdgpu_gfx_ring_preempt_ib(struct amdgpu_ring *ring);
+int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring,
+ unsigned int vmid,
+ struct amdgpu_fence *timedout_fence,
+ bool use_mmio);
+
static inline const char *amdgpu_gfx_compute_mode_desc(int mode)
{
switch (mode) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 5d6149ba7ab7..4000b2c6fc98 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -1763,10 +1763,15 @@ int amdgpu_gmc_init_mem_ranges(struct amdgpu_device *adev)
valid = true;
else
valid = amdgpu_gmc_validate_partition_info(adev);
- if (!valid) {
- /* TODO: handle invalid case */
+ if (!valid)
dev_warn(adev->dev,
"Mem ranges not matching with hardware config\n");
+
+ if (!adev->gmc.num_mem_partitions) {
+ dev_err(adev->dev, "num_mem_partitions is zero\n");
+ kfree(adev->gmc.mem_partitions);
+ adev->gmc.mem_partitions = NULL;
+ return -EINVAL;
}
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index ddb0d500e0fa..3ca187f5ade8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -286,7 +286,6 @@ struct amdgpu_gmc {
struct amdgpu_irq_src vm_fault;
uint32_t vram_type;
uint8_t vram_vendor;
- uint32_t srbm_soft_reset;
bool prt_warning;
uint32_t sdpif_register;
/* apertures */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.c
index 6aa54156bbc9..33a04113ed74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.c
@@ -369,43 +369,152 @@ int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
}
/**
- * amdgpu_device_ip_is_hw - is the hardware IP enabled
+ * amdgpu_device_ip_is_valid - is the hardware IP valid
*
* @adev: amdgpu_device pointer
* @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
*
- * Check if the hardware IP is enable or not.
- * Returns true if it the IP is enable, false if not.
+ * Check if the hardware IP is valid or not.
+ * Returns true if the IP is valid, false if not.
*/
-bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev,
- enum amd_ip_block_type block_type)
+bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
+ enum amd_ip_block_type block_type)
{
struct amdgpu_ip_block *ip_block;
ip_block = amdgpu_device_ip_get_ip_block(adev, block_type);
if (ip_block)
- return ip_block->status.hw;
+ return ip_block->status.valid;
return false;
}
/**
- * amdgpu_device_ip_is_valid - is the hardware IP valid
+ * amdgpu_ip_from_ring() - Find IP block type corresponding to ring type.
*
- * @adev: amdgpu_device pointer
- * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
+ * @ring_type: The ring type whose IP block you are looking for.
+ */
+static enum amd_ip_block_type amdgpu_ip_from_ring(const enum amdgpu_ring_type ring_type)
+{
+	/* Ring types without a mapping below yield AMD_IP_BLOCK_TYPE_NUM. */
+	enum amd_ip_block_type ip_type = AMD_IP_BLOCK_TYPE_NUM;
+
+	switch (ring_type) {
+	case AMDGPU_RING_TYPE_GFX:
+	case AMDGPU_RING_TYPE_COMPUTE:
+		ip_type = AMD_IP_BLOCK_TYPE_GFX;
+		break;
+	case AMDGPU_RING_TYPE_SDMA:
+		ip_type = AMD_IP_BLOCK_TYPE_SDMA;
+		break;
+	case AMDGPU_RING_TYPE_UVD:
+	case AMDGPU_RING_TYPE_UVD_ENC:
+		ip_type = AMD_IP_BLOCK_TYPE_UVD;
+		break;
+	case AMDGPU_RING_TYPE_VCE:
+		ip_type = AMD_IP_BLOCK_TYPE_VCE;
+		break;
+	case AMDGPU_RING_TYPE_VCN_DEC:
+	case AMDGPU_RING_TYPE_VCN_ENC:
+		ip_type = AMD_IP_BLOCK_TYPE_VCN;
+		break;
+	case AMDGPU_RING_TYPE_VCN_JPEG:
+		ip_type = AMD_IP_BLOCK_TYPE_JPEG;
+		break;
+	case AMDGPU_RING_TYPE_VPE:
+		ip_type = AMD_IP_BLOCK_TYPE_VPE;
+		break;
+	default:
+		break;
+	}
+
+	return ip_type;
+}
+
+/**
+ * amdgpu_ring_mask_from_ip() - Find mask of ring types corresponding to an IP block type.
*
- * Check if the hardware IP is valid or not.
- * Returns true if it the IP is valid, false if not.
+ * @ip_type: The IP block type whose rings you are looking for.
*/
-bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
- enum amd_ip_block_type block_type)
+static u32 amdgpu_ring_mask_from_ip(const enum amd_ip_block_type ip_type)
+{
+	/*
+	 * Every bit of the returned mask is indexed by enum amdgpu_ring_type,
+	 * never by enum amd_ip_block_type. IP types without a mapping below
+	 * yield an empty mask.
+	 */
+	switch (ip_type) {
+	case AMD_IP_BLOCK_TYPE_GFX:
+		return BIT(AMDGPU_RING_TYPE_GFX) | BIT(AMDGPU_RING_TYPE_COMPUTE);
+
+	case AMD_IP_BLOCK_TYPE_SDMA:
+		return BIT(AMDGPU_RING_TYPE_SDMA);
+
+	case AMD_IP_BLOCK_TYPE_UVD:
+		return BIT(AMDGPU_RING_TYPE_UVD) | BIT(AMDGPU_RING_TYPE_UVD_ENC);
+
+	case AMD_IP_BLOCK_TYPE_VCE:
+		/* Must be the ring type bit, not the IP block type value. */
+		return BIT(AMDGPU_RING_TYPE_VCE);
+
+	case AMD_IP_BLOCK_TYPE_VCN:
+		return BIT(AMDGPU_RING_TYPE_VCN_DEC) | BIT(AMDGPU_RING_TYPE_VCN_ENC);
+
+	case AMD_IP_BLOCK_TYPE_JPEG:
+		return BIT(AMDGPU_RING_TYPE_VCN_JPEG);
+
+	case AMD_IP_BLOCK_TYPE_VPE:
+		return BIT(AMDGPU_RING_TYPE_VPE);
+
+	default:
+		return 0;
+	}
+}
+
+/**
+ * amdgpu_device_ip_soft_reset() - Perform a graceful soft reset on an IP block.
+ *
+ * @guilty_ring: The ring which is guilty of causing a reset.
+ * @guilty_fence: The fence which didn't signal.
+ *
+ * IP block soft reset is used when attempting to recover
+ * from a GPU hang in a situation where a more fine grained
+ * reset type isn't available or didn't work. This effectively
+ * resets all rings that belong to the same device IP block
+ * and re-initializes the device IP block.
+ *
+ * The reset is handled gracefully, meaning that we try to
+ * minimize collateral damage (ie. avoid rejecting non-guilty jobs)
+ * as well as back up and restore the contents of all rings
+ * so that the system can move on from the hang.
+ *
+ * Return: 0 on success, -EOPNOTSUPP when the ring has no IP block or the
+ * IP block has no soft_reset callback, otherwise the error reported by
+ * amdgpu_multi_ring_reset_helper_end().
+ */
+int amdgpu_device_ip_soft_reset(struct amdgpu_ring *guilty_ring,
+				struct amdgpu_fence *guilty_fence)
 {
+	struct amdgpu_device *adev = guilty_ring->adev;
 struct amdgpu_ip_block *ip_block;
+	enum amd_ip_block_type ip_type;
+	u32 ring_type_mask;
+	int r;
- ip_block = amdgpu_device_ip_get_ip_block(adev, block_type);
- if (ip_block)
- return ip_block->status.valid;
+	ip_type = amdgpu_ip_from_ring(guilty_ring->funcs->type);
+	ip_block = amdgpu_device_ip_get_ip_block(adev, ip_type);
- return false;
+	/*
+	 * Handle the missing IP block separately: its name cannot be
+	 * printed because there is no ip_block to read it from.
+	 */
+	if (!ip_block) {
+		dev_warn(adev->dev, "IP block soft reset not supported on ring %s\n",
+			 guilty_ring->sched.name);
+		return -EOPNOTSUPP;
+	}
+
+	if (!ip_block->version->funcs->soft_reset) {
+		dev_warn(adev->dev, "IP block soft reset not supported on %s\n",
+			 ip_block->version->funcs->name);
+		return -EOPNOTSUPP;
+	}
+
+	dev_err(adev->dev, "Starting %s IP block soft reset\n",
+		ip_block->version->funcs->name);
+
+	ring_type_mask = amdgpu_ring_mask_from_ip(ip_type);
+
+	amdgpu_device_lock_reset_domain(adev->reset_domain);
+	amdgpu_multi_ring_reset_helper_begin(ring_type_mask, guilty_ring, guilty_fence);
+
+	r = ip_block->version->funcs->soft_reset(ip_block);
+
+	/* The end helper receives the soft reset result and returns the final status. */
+	r = amdgpu_multi_ring_reset_helper_end(ring_type_mask, guilty_ring, r);
+	amdgpu_device_unlock_reset_domain(adev->reset_domain);
+
+	if (r) {
+		dev_err(adev->dev, "Failed %s IP block soft reset: %d\n",
+			ip_block->version->funcs->name, r);
+		return r;
+	}
+
+	dev_err(adev->dev, "Successful %s IP block soft reset\n",
+		ip_block->version->funcs->name);
+	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.h
index 1d0df6d93957..70fc4e5db51f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.h
@@ -68,6 +68,7 @@ enum amd_hw_ip_block_type {
ISP_HWIP,
ATU_HWIP,
AIGC_HWIP,
+ UMSCH_HWIP,
MAX_HWIP
};
@@ -84,6 +85,9 @@ enum amd_hw_ip_block_type {
#define IP_VERSION_SUBREV(ver) ((ver) & 0xF)
#define IP_VERSION_MAJ_MIN_REV(ver) ((ver) >> 8)
+struct amdgpu_ring;
+struct amdgpu_fence;
+
struct amdgpu_ip_map_info {
/* Map of logical to actual dev instances/mask */
uint32_t dev_inst[MAX_HWIP][HWIP_MAX_INSTANCE];
@@ -146,9 +150,9 @@ void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
u64 *flags);
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
enum amd_ip_block_type block_type);
-bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev,
- enum amd_ip_block_type block_type);
bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
enum amd_ip_block_type block_type);
+int amdgpu_device_ip_soft_reset(struct amdgpu_ring *guilty_ring,
+ struct amdgpu_fence *guilty_fence);
#endif /* __AMDGPU_IP_H__ */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 9ecc6387c1eb..cff73f1b5a72 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -112,7 +112,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
amdgpu_job_core_dump(adev, job);
if (amdgpu_gpu_recovery &&
- amdgpu_ring_is_reset_type_supported(ring, AMDGPU_RESET_TYPE_SOFT_RESET) &&
+ amdgpu_ring_is_reset_type_supported(ring, AMDGPU_RESET_TYPE_SOFT_RECOVERY) &&
amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
dev_err(adev->dev, "ring %s timeout, but soft recovered\n",
s_job->sched->name);
@@ -151,6 +151,17 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name);
}
+ /* Attempt an IP block soft reset, if supported. */
+ if (amdgpu_gpu_recovery &&
+ amdgpu_ring_is_reset_type_supported(ring, AMDGPU_RESET_TYPE_IP_BLOCK_SOFT_RESET)) {
+ r = amdgpu_device_ip_soft_reset(ring, job->hw_fence);
+ if (!r) {
+ atomic_inc(&ring->adev->gpu_reset_counter);
+ drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, info);
+ goto exit;
+ }
+ }
+
if (dma_fence_get_status(&s_job->s_fence->finished) == 0)
dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 63ee6ba6a931..57935c321515 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -134,8 +134,8 @@ void amdgpu_jpeg_ring_begin_use(struct amdgpu_ring *ring)
{
struct amdgpu_device *adev = ring->adev;
- atomic_inc(&adev->jpeg.total_submission_cnt);
- cancel_delayed_work_sync(&adev->jpeg.idle_work);
+ if (!atomic_fetch_inc(&adev->jpeg.total_submission_cnt))
+ cancel_delayed_work_sync(&adev->jpeg.idle_work);
mutex_lock(&adev->jpeg.jpeg_pg_lock);
amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_JPEG,
@@ -145,8 +145,9 @@ void amdgpu_jpeg_ring_begin_use(struct amdgpu_ring *ring)
 void amdgpu_jpeg_ring_end_use(struct amdgpu_ring *ring)
 {
- atomic_dec(&ring->adev->jpeg.total_submission_cnt);
- schedule_delayed_work(&ring->adev->jpeg.idle_work, JPEG_IDLE_TIMEOUT);
+ /* only the decrement that brings the count to zero schedules the idle work */
+ if (atomic_dec_and_test(&ring->adev->jpeg.total_submission_cnt))
+ schedule_delayed_work(&ring->adev->jpeg.idle_work,
+ JPEG_IDLE_TIMEOUT);
 }
int amdgpu_jpeg_dec_ring_test_ring(struct amdgpu_ring *ring)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
index 346ae0ab09d3..fe95d9188713 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
@@ -149,6 +149,9 @@ struct amdgpu_jpeg {
u32 *ip_dump;
u32 reg_count;
const struct amdgpu_hwip_reg_entry *reg_list;
+
+ bool disable_uq;
+ bool disable_kq;
};
int amdgpu_jpeg_sw_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 71272f40feef..215aa678d1d0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1424,6 +1424,33 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
}
/**
+ * amdgpu_proc_options_ioctl - set per-fd user options
+ *
+ * @dev: drm dev pointer
+ * @data: pointer to struct drm_amdgpu_proc_options
+ * @filp: drm file
+ *
+ * Sets options stored on the per-file amdgpu_fpriv. Currently the only
+ * supported option is %AMDGPU_PROC_OPTIONS_OP_KFD_SIGBUS_DELAY which
+ * controls how KFD delivers SIGBUS for poison/RAS events to the calling
+ * process (immediate, suppressed, or delayed by N milliseconds).
+ */
+int amdgpu_proc_options_ioctl(struct drm_device *dev, void *data,
+			      struct drm_file *filp)
+{
+	struct drm_amdgpu_proc_options *opts = data;
+	int ret;
+
+	/* Dispatch on the requested operation; unknown ops are rejected. */
+	switch (opts->op) {
+	case AMDGPU_PROC_OPTIONS_OP_KFD_SIGBUS_DELAY:
+		ret = amdgpu_amdkfd_set_sigbus_delay(current,
+						     opts->kfd_sigbus_delay.value);
+		break;
+	default:
+		DRM_DEBUG_KMS("Invalid user option op %u\n", opts->op);
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+/**
* amdgpu_driver_open_kms - drm callback for open
*
* @dev: drm dev pointer
@@ -1504,8 +1531,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
if (r)
goto error_vm;
- mutex_init(&fpriv->bo_list_lock);
- idr_init_base(&fpriv->bo_list_handles, 1);
+ xa_init_flags(&fpriv->bo_list_handles, XA_FLAGS_ALLOC1);
r = amdgpu_userq_mgr_init(&fpriv->userq_mgr, file_priv, adev);
if (r)
@@ -1550,8 +1576,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
struct amdgpu_fpriv *fpriv = file_priv->driver_priv;
struct amdgpu_bo_list *list;
struct amdgpu_bo *pd;
+ unsigned long handle;
u32 pasid;
- int handle;
if (!fpriv)
return;
@@ -1587,11 +1613,9 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
amdgpu_pasid_free_delayed(pd->tbo.base.resv, pasid);
amdgpu_bo_unref(&pd);
- idr_for_each_entry(&fpriv->bo_list_handles, list, handle)
+ xa_for_each(&fpriv->bo_list_handles, handle, list)
amdgpu_bo_list_put(list);
-
- idr_destroy(&fpriv->bo_list_handles);
- mutex_destroy(&fpriv->bo_list_lock);
+ xa_destroy(&fpriv->bo_list_handles);
kfree(fpriv);
file_priv->driver_priv = NULL;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index cc6d1a4e4c3a..9a7f7d2b2767 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -27,16 +27,6 @@
#include "umc/umc_6_7_0_offset.h"
#include "umc/umc_6_7_0_sh_mask.h"
-static bool amdgpu_mca_is_deferred_error(struct amdgpu_device *adev,
- uint64_t mc_status)
-{
- if (adev->umc.ras->check_ecc_err_status)
- return adev->umc.ras->check_ecc_err_status(adev,
- AMDGPU_MCA_ERROR_TYPE_DE, &mc_status);
-
- return false;
-}
-
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr,
unsigned long *error_count)
@@ -155,479 +145,3 @@ int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev)
return 0;
}
-
-static void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set)
-{
- if (!mca_set)
- return;
-
- memset(mca_set, 0, sizeof(*mca_set));
- INIT_LIST_HEAD(&mca_set->list);
-}
-
-static int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry)
-{
- struct mca_bank_node *node;
-
- if (!entry)
- return -EINVAL;
-
- node = kvzalloc_obj(*node);
- if (!node)
- return -ENOMEM;
-
- memcpy(&node->entry, entry, sizeof(*entry));
-
- INIT_LIST_HEAD(&node->node);
- list_add_tail(&node->node, &mca_set->list);
-
- mca_set->nr_entries++;
-
- return 0;
-}
-
-static int amdgpu_mca_bank_set_merge(struct mca_bank_set *mca_set, struct mca_bank_set *new)
-{
- struct mca_bank_node *node;
-
- list_for_each_entry(node, &new->list, node)
- amdgpu_mca_bank_set_add_entry(mca_set, &node->entry);
-
- return 0;
-}
-
-static void amdgpu_mca_bank_set_remove_node(struct mca_bank_set *mca_set, struct mca_bank_node *node)
-{
- if (!node)
- return;
-
- list_del(&node->node);
- kvfree(node);
-
- mca_set->nr_entries--;
-}
-
-static void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set)
-{
- struct mca_bank_node *node, *tmp;
-
- if (list_empty(&mca_set->list))
- return;
-
- list_for_each_entry_safe(node, tmp, &mca_set->list, node)
- amdgpu_mca_bank_set_remove_node(mca_set, node);
-}
-
-void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs)
-{
- struct amdgpu_mca *mca = &adev->mca;
-
- mca->mca_funcs = mca_funcs;
-}
-
-int amdgpu_mca_init(struct amdgpu_device *adev)
-{
- struct amdgpu_mca *mca = &adev->mca;
- struct mca_bank_cache *mca_cache;
- int i;
-
- atomic_set(&mca->ue_update_flag, 0);
-
- for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) {
- mca_cache = &mca->mca_caches[i];
- mutex_init(&mca_cache->lock);
- amdgpu_mca_bank_set_init(&mca_cache->mca_set);
- }
-
- return 0;
-}
-
-void amdgpu_mca_fini(struct amdgpu_device *adev)
-{
- struct amdgpu_mca *mca = &adev->mca;
- struct mca_bank_cache *mca_cache;
- int i;
-
- atomic_set(&mca->ue_update_flag, 0);
-
- for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) {
- mca_cache = &mca->mca_caches[i];
- amdgpu_mca_bank_set_release(&mca_cache->mca_set);
- mutex_destroy(&mca_cache->lock);
- }
-}
-
-int amdgpu_mca_reset(struct amdgpu_device *adev)
-{
- amdgpu_mca_fini(adev);
-
- return amdgpu_mca_init(adev);
-}
-
-int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
-{
- const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
-
- if (mca_funcs && mca_funcs->mca_set_debug_mode)
- return mca_funcs->mca_set_debug_mode(adev, enable);
-
- return -EOPNOTSUPP;
-}
-
-static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry,
- struct ras_query_context *qctx)
-{
- u64 event_id = qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID;
-
- RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
- RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
- idx, entry->regs[MCA_REG_IDX_STATUS]);
- RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
- idx, entry->regs[MCA_REG_IDX_ADDR]);
- RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
- idx, entry->regs[MCA_REG_IDX_MISC0]);
- RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
- idx, entry->regs[MCA_REG_IDX_IPID]);
- RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
- idx, entry->regs[MCA_REG_IDX_SYND]);
-}
-
-static int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count)
-{
- const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
-
- if (!count)
- return -EINVAL;
-
- if (mca_funcs && mca_funcs->mca_get_valid_mca_count)
- return mca_funcs->mca_get_valid_mca_count(adev, type, count);
-
- return -EOPNOTSUPP;
-}
-
-static int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
- int idx, struct mca_bank_entry *entry)
-{
- const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
- int count;
-
- if (!mca_funcs || !mca_funcs->mca_get_mca_entry)
- return -EOPNOTSUPP;
-
- switch (type) {
- case AMDGPU_MCA_ERROR_TYPE_UE:
- count = mca_funcs->max_ue_count;
- break;
- case AMDGPU_MCA_ERROR_TYPE_CE:
- count = mca_funcs->max_ce_count;
- break;
- default:
- return -EINVAL;
- }
-
- if (idx >= count)
- return -EINVAL;
-
- return mca_funcs->mca_get_mca_entry(adev, type, idx, entry);
-}
-
-static bool amdgpu_mca_bank_should_update(struct amdgpu_device *adev, enum amdgpu_mca_error_type type)
-{
- struct amdgpu_mca *mca = &adev->mca;
- bool ret = true;
-
- /*
- * Because the UE Valid MCA count will only be cleared after reset,
- * in order to avoid repeated counting of the error count,
- * the aca bank is only updated once during the gpu recovery stage.
- */
- if (type == AMDGPU_MCA_ERROR_TYPE_UE) {
- if (amdgpu_ras_intr_triggered())
- ret = atomic_cmpxchg(&mca->ue_update_flag, 0, 1) == 0;
- else
- atomic_set(&mca->ue_update_flag, 0);
- }
-
- return ret;
-}
-
-static bool amdgpu_mca_bank_should_dump(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
- struct mca_bank_entry *entry)
-{
- bool ret;
-
- switch (type) {
- case AMDGPU_MCA_ERROR_TYPE_CE:
- ret = amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS]);
- break;
- case AMDGPU_MCA_ERROR_TYPE_UE:
- default:
- ret = true;
- break;
- }
-
- return ret;
-}
-
-static int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set,
- struct ras_query_context *qctx)
-{
- struct mca_bank_entry entry;
- uint32_t count = 0, i;
- int ret;
-
- if (!mca_set)
- return -EINVAL;
-
- if (!amdgpu_mca_bank_should_update(adev, type))
- return 0;
-
- ret = amdgpu_mca_smu_get_valid_mca_count(adev, type, &count);
- if (ret)
- return ret;
-
- for (i = 0; i < count; i++) {
- memset(&entry, 0, sizeof(entry));
- ret = amdgpu_mca_smu_get_mca_entry(adev, type, i, &entry);
- if (ret)
- return ret;
-
- amdgpu_mca_bank_set_add_entry(mca_set, &entry);
-
- if (amdgpu_mca_bank_should_dump(adev, type, &entry))
- amdgpu_mca_smu_mca_bank_dump(adev, i, &entry, qctx);
- }
-
- return 0;
-}
-
-static int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
- enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count)
-{
- const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
-
- if (!count || !entry)
- return -EINVAL;
-
- if (!mca_funcs || !mca_funcs->mca_parse_mca_error_count)
- return -EOPNOTSUPP;
-
- return mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, count);
-}
-
-static int amdgpu_mca_dispatch_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
- struct mca_bank_set *mca_set, struct ras_err_data *err_data)
-{
- struct amdgpu_smuio_mcm_config_info mcm_info;
- struct mca_bank_node *node, *tmp;
- struct mca_bank_entry *entry;
- uint32_t count;
- int ret;
-
- if (!mca_set)
- return -EINVAL;
-
- if (!mca_set->nr_entries)
- return 0;
-
- list_for_each_entry_safe(node, tmp, &mca_set->list, node) {
- entry = &node->entry;
-
- count = 0;
- ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count);
- if (ret && ret != -EOPNOTSUPP)
- return ret;
-
- if (!count)
- continue;
-
- memset(&mcm_info, 0, sizeof(mcm_info));
-
- mcm_info.socket_id = entry->info.socket_id;
- mcm_info.die_id = entry->info.aid;
-
- if (type == AMDGPU_MCA_ERROR_TYPE_UE) {
- amdgpu_ras_error_statistic_ue_count(err_data,
- &mcm_info, (uint64_t)count);
- } else {
- if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS]))
- amdgpu_ras_error_statistic_de_count(err_data,
- &mcm_info, (uint64_t)count);
- else
- amdgpu_ras_error_statistic_ce_count(err_data,
- &mcm_info, (uint64_t)count);
- }
-
- amdgpu_mca_bank_set_remove_node(mca_set, node);
- }
-
- return 0;
-}
-
-static int amdgpu_mca_add_mca_set_to_cache(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *new)
-{
- struct mca_bank_cache *mca_cache = &adev->mca.mca_caches[type];
- int ret;
-
- mutex_lock(&mca_cache->lock);
- ret = amdgpu_mca_bank_set_merge(&mca_cache->mca_set, new);
- mutex_unlock(&mca_cache->lock);
-
- return ret;
-}
-
-int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
- struct ras_err_data *err_data, struct ras_query_context *qctx)
-{
- struct mca_bank_set mca_set;
- struct mca_bank_cache *mca_cache = &adev->mca.mca_caches[type];
- int ret;
-
- amdgpu_mca_bank_set_init(&mca_set);
-
- ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, qctx);
- if (ret)
- goto out_mca_release;
-
- ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_set, err_data);
- if (ret)
- goto out_mca_release;
-
- /* add remain mca bank to mca cache */
- if (mca_set.nr_entries) {
- ret = amdgpu_mca_add_mca_set_to_cache(adev, type, &mca_set);
- if (ret)
- goto out_mca_release;
- }
-
- /* dispatch mca set again if mca cache has valid data */
- mutex_lock(&mca_cache->lock);
- if (mca_cache->mca_set.nr_entries)
- ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_cache->mca_set, err_data);
- mutex_unlock(&mca_cache->lock);
-
-out_mca_release:
- amdgpu_mca_bank_set_release(&mca_set);
-
- return ret;
-}
-
-#if defined(CONFIG_DEBUG_FS)
-static int amdgpu_mca_smu_debug_mode_set(void *data, u64 val)
-{
- struct amdgpu_device *adev = (struct amdgpu_device *)data;
- int ret;
-
- ret = amdgpu_ras_set_mca_debug_mode(adev, val ? true : false);
- if (ret)
- return ret;
-
- dev_info(adev->dev, "amdgpu set smu mca debug mode %s success\n", val ? "on" : "off");
-
- return 0;
-}
-
-static void mca_dump_entry(struct seq_file *m, struct mca_bank_entry *entry)
-{
- int i, idx = entry->idx;
- int reg_idx_array[] = {
- MCA_REG_IDX_STATUS,
- MCA_REG_IDX_ADDR,
- MCA_REG_IDX_MISC0,
- MCA_REG_IDX_IPID,
- MCA_REG_IDX_SYND,
- };
-
- seq_printf(m, "mca entry[%d].type: %s\n", idx, entry->type == AMDGPU_MCA_ERROR_TYPE_UE ? "UE" : "CE");
- seq_printf(m, "mca entry[%d].ip: %d\n", idx, entry->ip);
- seq_printf(m, "mca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n",
- idx, entry->info.socket_id, entry->info.aid, entry->info.hwid, entry->info.mcatype);
-
- for (i = 0; i < ARRAY_SIZE(reg_idx_array); i++)
- seq_printf(m, "mca entry[%d].regs[%d]: 0x%016llx\n", idx, reg_idx_array[i], entry->regs[reg_idx_array[i]]);
-}
-
-static int mca_dump_show(struct seq_file *m, enum amdgpu_mca_error_type type)
-{
- struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
- struct mca_bank_node *node;
- struct mca_bank_set mca_set;
- struct ras_query_context qctx;
- int ret;
-
- amdgpu_mca_bank_set_init(&mca_set);
-
- qctx.evid.event_id = RAS_EVENT_INVALID_ID;
- ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, &qctx);
- if (ret)
- goto err_free_mca_set;
-
- seq_printf(m, "amdgpu smu %s valid mca count: %d\n",
- type == AMDGPU_MCA_ERROR_TYPE_UE ? "UE" : "CE", mca_set.nr_entries);
-
- if (!mca_set.nr_entries)
- goto err_free_mca_set;
-
- list_for_each_entry(node, &mca_set.list, node)
- mca_dump_entry(m, &node->entry);
-
- /* add mca bank to mca bank cache */
- ret = amdgpu_mca_add_mca_set_to_cache(adev, type, &mca_set);
-
-err_free_mca_set:
- amdgpu_mca_bank_set_release(&mca_set);
-
- return ret;
-}
-
-static int mca_dump_ce_show(struct seq_file *m, void *unused)
-{
- return mca_dump_show(m, AMDGPU_MCA_ERROR_TYPE_CE);
-}
-
-static int mca_dump_ce_open(struct inode *inode, struct file *file)
-{
- return single_open(file, mca_dump_ce_show, inode->i_private);
-}
-
-static const struct file_operations mca_ce_dump_debug_fops = {
- .owner = THIS_MODULE,
- .open = mca_dump_ce_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int mca_dump_ue_show(struct seq_file *m, void *unused)
-{
- return mca_dump_show(m, AMDGPU_MCA_ERROR_TYPE_UE);
-}
-
-static int mca_dump_ue_open(struct inode *inode, struct file *file)
-{
- return single_open(file, mca_dump_ue_show, inode->i_private);
-}
-
-static const struct file_operations mca_ue_dump_debug_fops = {
- .owner = THIS_MODULE,
- .open = mca_dump_ue_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-DEFINE_DEBUGFS_ATTRIBUTE(mca_debug_mode_fops, NULL, amdgpu_mca_smu_debug_mode_set, "%llu\n");
-#endif
-
-void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root)
-{
-#if defined(CONFIG_DEBUG_FS)
- if (!root)
- return;
-
- debugfs_create_file("mca_debug_mode", 0200, root, adev, &mca_debug_mode_fops);
- debugfs_create_file("mca_ue_dump", 0400, root, adev, &mca_ue_dump_debug_fops);
- debugfs_create_file("mca_ce_dump", 0400, root, adev, &mca_ce_dump_debug_fops);
-#endif
-}
-
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index e80323ff90c1..6d12f8a516d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -23,45 +23,6 @@
#include "amdgpu_ras.h"
-#define MCA_MAX_REGS_COUNT (16)
-
-#define MCA_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> l)
-#define MCA_REG__STATUS__VAL(x) MCA_REG_FIELD(x, 63, 63)
-#define MCA_REG__STATUS__OVERFLOW(x) MCA_REG_FIELD(x, 62, 62)
-#define MCA_REG__STATUS__UC(x) MCA_REG_FIELD(x, 61, 61)
-#define MCA_REG__STATUS__EN(x) MCA_REG_FIELD(x, 60, 60)
-#define MCA_REG__STATUS__MISCV(x) MCA_REG_FIELD(x, 59, 59)
-#define MCA_REG__STATUS__ADDRV(x) MCA_REG_FIELD(x, 58, 58)
-#define MCA_REG__STATUS__PCC(x) MCA_REG_FIELD(x, 57, 57)
-#define MCA_REG__STATUS__ERRCOREIDVAL(x) MCA_REG_FIELD(x, 56, 56)
-#define MCA_REG__STATUS__TCC(x) MCA_REG_FIELD(x, 55, 55)
-#define MCA_REG__STATUS__SYNDV(x) MCA_REG_FIELD(x, 53, 53)
-#define MCA_REG__STATUS__CECC(x) MCA_REG_FIELD(x, 46, 46)
-#define MCA_REG__STATUS__UECC(x) MCA_REG_FIELD(x, 45, 45)
-#define MCA_REG__STATUS__DEFERRED(x) MCA_REG_FIELD(x, 44, 44)
-#define MCA_REG__STATUS__POISON(x) MCA_REG_FIELD(x, 43, 43)
-#define MCA_REG__STATUS__SCRUB(x) MCA_REG_FIELD(x, 40, 40)
-#define MCA_REG__STATUS__ERRCOREID(x) MCA_REG_FIELD(x, 37, 32)
-#define MCA_REG__STATUS__ADDRLSB(x) MCA_REG_FIELD(x, 29, 24)
-#define MCA_REG__STATUS__ERRORCODEEXT(x) MCA_REG_FIELD(x, 21, 16)
-#define MCA_REG__STATUS__ERRORCODE(x) MCA_REG_FIELD(x, 15, 0)
-
-#define MCA_REG__MISC0__ERRCNT(x) MCA_REG_FIELD(x, 43, 32)
-
-#define MCA_REG__SYND__ERRORINFORMATION(x) MCA_REG_FIELD(x, 17, 0)
-
-enum amdgpu_mca_ip {
- AMDGPU_MCA_IP_UNKNOW = -1,
- AMDGPU_MCA_IP_PSP = 0,
- AMDGPU_MCA_IP_SDMA,
- AMDGPU_MCA_IP_GC,
- AMDGPU_MCA_IP_SMU,
- AMDGPU_MCA_IP_MP5,
- AMDGPU_MCA_IP_UMC,
- AMDGPU_MCA_IP_PCS_XGMI,
- AMDGPU_MCA_IP_COUNT,
-};
-
enum amdgpu_mca_error_type {
AMDGPU_MCA_ERROR_TYPE_UE = 0,
AMDGPU_MCA_ERROR_TYPE_CE,
@@ -77,77 +38,20 @@ struct amdgpu_mca_ras {
struct amdgpu_mca_ras_block *ras;
};
-struct mca_bank_set {
- int nr_entries;
- struct list_head list;
-};
-
-struct mca_bank_cache {
- struct mca_bank_set mca_set;
- struct mutex lock;
-};
-
struct amdgpu_mca {
struct amdgpu_mca_ras mp0;
struct amdgpu_mca_ras mp1;
struct amdgpu_mca_ras mpio;
- const struct amdgpu_mca_smu_funcs *mca_funcs;
- struct mca_bank_cache mca_caches[AMDGPU_MCA_ERROR_TYPE_DE];
- atomic_t ue_update_flag;
-};
-
-enum mca_reg_idx {
- MCA_REG_IDX_STATUS = 1,
- MCA_REG_IDX_ADDR = 2,
- MCA_REG_IDX_MISC0 = 3,
- MCA_REG_IDX_IPID = 5,
- MCA_REG_IDX_SYND = 6,
- MCA_REG_IDX_COUNT = 16,
-};
-
-struct mca_bank_info {
- int socket_id;
- int aid;
- int hwid;
- int mcatype;
-};
-
-struct mca_bank_entry {
- int idx;
- enum amdgpu_mca_error_type type;
- enum amdgpu_mca_ip ip;
- struct mca_bank_info info;
- uint64_t regs[MCA_MAX_REGS_COUNT];
-};
-
-struct mca_bank_node {
- struct mca_bank_entry entry;
- struct list_head node;
-};
-
-struct amdgpu_mca_smu_funcs {
- int max_ue_count;
- int max_ce_count;
- int (*mca_set_debug_mode)(struct amdgpu_device *adev, bool enable);
- int (*mca_parse_mca_error_count)(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
- struct mca_bank_entry *entry, uint32_t *count);
- int (*mca_get_valid_mca_count)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
- uint32_t *count);
- int (*mca_get_mca_entry)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
- int idx, struct mca_bank_entry *entry);
};
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr,
unsigned long *error_count);
-
void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr,
unsigned long *error_count);
-
void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr);
-
void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr,
void *ras_error_status);
@@ -155,15 +59,4 @@ int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev);
-void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs);
-int amdgpu_mca_init(struct amdgpu_device *adev);
-void amdgpu_mca_fini(struct amdgpu_device *adev);
-int amdgpu_mca_reset(struct amdgpu_device *adev);
-int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable);
-int amdgpu_mca_smu_get_mca_set_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
- enum amdgpu_mca_error_type type, uint32_t *total);
-void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root);
-int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
- struct ras_err_data *err_data, struct ras_query_context *qctx);
-
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index e3972673fd64..6c0dde3786e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -217,7 +217,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
if (r)
goto error_doorbell;
- if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 1, 0)) {
+ if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
/* When queue/pipe reset is done in MES instead of in the
* driver, MES passes hung queues information to the driver in
* hung_queue_hqd_info. Calculate required space to store this
@@ -252,6 +252,10 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
}
}
+ adev->gfx.mec.mes_hung_db_array =
+ kcalloc(amdgpu_mes_get_hung_queue_db_array_size(adev),
+ sizeof(u32), GFP_KERNEL);
+
return 0;
error_doorbell:
@@ -279,6 +283,8 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
int i;
int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
+ kfree(adev->gfx.mec.mes_hung_db_array);
+
amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
&adev->mes.event_log_gpu_addr,
&adev->mes.event_log_cpu_addr);
@@ -439,6 +445,59 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
return r;
}
+/**
+ * amdgpu_mes_reset_queue_mmio - reset a hardware queue with use_mmio set
+ *
+ * @adev: amdgpu_device pointer
+ * @queue_type: value stored in the request's queue_type field
+ * @vmid: value stored in the request's vmid field
+ * @me: value stored in the request's me_id field
+ * @pipe: value stored in the request's pipe_id field
+ * @queue: value stored in the request's queue_id field
+ * @xcc_id: value stored in the request's xcc_id field
+ *
+ * Fills a zeroed mes_reset_queue_input, sets use_mmio and passes it to the
+ * MES reset_hw_queue callback between amdgpu_mes_lock() and
+ * amdgpu_mes_unlock().
+ *
+ * Returns the value returned by the reset_hw_queue callback. A non-zero
+ * result is additionally reported through dev_err().
+ */
+int amdgpu_mes_reset_queue_mmio(struct amdgpu_device *adev,
+				int queue_type,
+				unsigned int vmid,
+				unsigned int me,
+				unsigned int pipe,
+				unsigned int queue,
+				uint32_t xcc_id)
+{
+	struct mes_reset_queue_input queue_input;
+	int r;
+
+	memset(&queue_input, 0, sizeof(queue_input));
+
+	queue_input.xcc_id = xcc_id;
+	queue_input.me_id = me;
+	queue_input.pipe_id = pipe;
+	queue_input.queue_id = queue;
+	queue_input.vmid = vmid;
+	queue_input.queue_type = queue_type;
+	queue_input.use_mmio = true;
+
+	amdgpu_mes_lock(&adev->mes);
+	r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input);
+	amdgpu_mes_unlock(&adev->mes);
+	/* name this path in the log; it is not the legacy-queue reset helper */
+	if (r)
+		dev_err(adev->dev, "failed to reset queue via mmio\n");
+
+	return r;
+}
+
+/**
+ * amdgpu_mes_reset_user_queue - reset a queue identified by its doorbell
+ *
+ * @adev: amdgpu_device pointer
+ * @queue_type: value stored in the request's queue_type field
+ * @doorbell_index: value stored in the request's doorbell_offset field
+ * @xcc_id: value stored in the request's xcc_id field
+ *
+ * Every other field of the request, use_mmio included, is left zeroed.
+ * The request is passed to the MES reset_hw_queue callback between
+ * amdgpu_mes_lock() and amdgpu_mes_unlock().
+ *
+ * Returns the value returned by the reset_hw_queue callback. A non-zero
+ * result is additionally reported through dev_err().
+ */
+int amdgpu_mes_reset_user_queue(struct amdgpu_device *adev,
+				int queue_type,
+				unsigned int doorbell_index,
+				unsigned int xcc_id)
+{
+	struct mes_reset_queue_input input;
+	int ret;
+
+	/* start from an all-zero request, then fill in the three known fields */
+	memset(&input, 0, sizeof(input));
+	input.doorbell_offset = doorbell_index;
+	input.queue_type = queue_type;
+	input.xcc_id = xcc_id;
+
+	amdgpu_mes_lock(&adev->mes);
+	ret = adev->mes.funcs->reset_hw_queue(&adev->mes, &input);
+	amdgpu_mes_unlock(&adev->mes);
+
+	if (ret)
+		dev_err(adev->dev, "failed to reset user queue\n");
+
+	return ret;
+}
+
int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev)
{
return adev->mes.hung_queue_db_array_size;
@@ -805,8 +864,13 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev)
{
- return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) &&
- (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73);
+	u32 gc_major = IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0));
+	u32 gc_minor = IP_VERSION_MIN(amdgpu_ip_version(adev, GC_HWIP, 0));
+	u32 sched_ver = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
+
+	/* each accepted GC major/minor pair has its own minimum sched version */
+	if (gc_major == 11)
+		return sched_ver >= 0x8c;
+	if (gc_major != 12)
+		return false;
+	if (gc_minor == 0)
+		return sched_ver >= 0x8d;
+	if (gc_minor == 1)
+		return sched_ver >= 0x73;
+
+	return false;
}
/* Fix me -- node_id is used to identify the correct MES instances in the future */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 1aae49f4df49..f25cffad8efe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -168,6 +168,9 @@ struct amdgpu_mes {
int master_xcc_ids[AMDGPU_MAX_MES_INST_PIPES];
struct amdgpu_bo *shared_cmd_buf_obj[AMDGPU_MAX_MES_INST_PIPES];
uint64_t shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES];
+
+ bool compute_pipe_reset_enabled;
+ bool gfx_pipe_reset_enabled;
};
struct amdgpu_mes_hung_queue_hqd_info {
@@ -271,6 +274,7 @@ struct mes_remove_queue_input {
uint32_t xcc_id;
uint32_t doorbell_offset;
uint64_t gang_context_addr;
+ uint32_t queue_type;
bool remove_queue_after_reset;
};
@@ -461,6 +465,17 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
unsigned int vmid,
bool use_mmio,
uint32_t xcc_id);
+int amdgpu_mes_reset_queue_mmio(struct amdgpu_device *adev,
+ int queue_type,
+ unsigned int vmid,
+ unsigned int me,
+ unsigned int pipe,
+ unsigned int queue,
+ uint32_t xcc_id);
+int amdgpu_mes_reset_user_queue(struct amdgpu_device *adev,
+ int queue_type,
+ unsigned int doorbell_index,
+ unsigned int xcc_id);
int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev);
int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 6b8214650e5d..c5120ba51e24 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -21,29 +21,6 @@
#ifndef __AMDGPU_MMHUB_H__
#define __AMDGPU_MMHUB_H__
-enum amdgpu_mmhub_ras_memory_id {
- AMDGPU_MMHUB_WGMI_PAGEMEM = 0,
- AMDGPU_MMHUB_RGMI_PAGEMEM = 1,
- AMDGPU_MMHUB_WDRAM_PAGEMEM = 2,
- AMDGPU_MMHUB_RDRAM_PAGEMEM = 3,
- AMDGPU_MMHUB_WIO_CMDMEM = 4,
- AMDGPU_MMHUB_RIO_CMDMEM = 5,
- AMDGPU_MMHUB_WGMI_CMDMEM = 6,
- AMDGPU_MMHUB_RGMI_CMDMEM = 7,
- AMDGPU_MMHUB_WDRAM_CMDMEM = 8,
- AMDGPU_MMHUB_RDRAM_CMDMEM = 9,
- AMDGPU_MMHUB_MAM_DMEM0 = 10,
- AMDGPU_MMHUB_MAM_DMEM1 = 11,
- AMDGPU_MMHUB_MAM_DMEM2 = 12,
- AMDGPU_MMHUB_MAM_DMEM3 = 13,
- AMDGPU_MMHUB_WRET_TAGMEM = 19,
- AMDGPU_MMHUB_RRET_TAGMEM = 20,
- AMDGPU_MMHUB_WIO_DATAMEM = 21,
- AMDGPU_MMHUB_WGMI_DATAMEM = 22,
- AMDGPU_MMHUB_WDRAM_DATAMEM = 23,
- AMDGPU_MMHUB_MEMORY_BLOCK_LAST,
-};
-
struct amdgpu_mmhub_ras {
struct amdgpu_ras_block_object ras_block;
};
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index 4d68732d6223..ff11a0903499 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -312,46 +312,6 @@ uint32_t amdgpu_bo_mem_stats_placement(struct amdgpu_bo *bo);
uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev,
uint32_t domain);
-/*
- * sub allocation
- */
-static inline struct amdgpu_sa_manager *
-to_amdgpu_sa_manager(struct drm_suballoc_manager *manager)
-{
- return container_of(manager, struct amdgpu_sa_manager, base);
-}
-
-static inline uint64_t amdgpu_sa_bo_gpu_addr(struct drm_suballoc *sa_bo)
-{
- return to_amdgpu_sa_manager(sa_bo->manager)->gpu_addr +
- drm_suballoc_soffset(sa_bo);
-}
-
-static inline void *amdgpu_sa_bo_cpu_addr(struct drm_suballoc *sa_bo)
-{
- return to_amdgpu_sa_manager(sa_bo->manager)->cpu_ptr +
- drm_suballoc_soffset(sa_bo);
-}
-
-int amdgpu_sa_bo_manager_init(struct amdgpu_device *adev,
- struct amdgpu_sa_manager *sa_manager,
- unsigned size, u32 align, u32 domain);
-void amdgpu_sa_bo_manager_fini(struct amdgpu_device *adev,
- struct amdgpu_sa_manager *sa_manager);
-int amdgpu_sa_bo_manager_start(struct amdgpu_device *adev,
- struct amdgpu_sa_manager *sa_manager);
-int amdgpu_sa_bo_new(struct amdgpu_sa_manager *sa_manager,
- struct drm_suballoc **sa_bo,
- unsigned int size);
-void amdgpu_sa_bo_free(struct drm_suballoc **sa_bo,
- struct dma_fence *fence);
-#if defined(CONFIG_DEBUG_FS)
-void amdgpu_sa_bo_dump_debug_info(struct amdgpu_sa_manager *sa_manager,
- struct seq_file *m);
-u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m);
-#endif
-void amdgpu_debugfs_sa_init(struct amdgpu_device *adev);
-
bool amdgpu_bo_support_uswc(u64 bo_flags);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
index b1dc33301d83..e8592970aaab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
@@ -47,6 +47,17 @@ static ssize_t mem_info_preempt_used_show(struct device *dev,
static DEVICE_ATTR_RO(mem_info_preempt_used);
/**
+ * amdgpu_preempt_mgr_sysfs_fini - remove PREEMPT manager sysfs attributes
+ *
+ * @adev: amdgpu_device pointer
+ */
+void amdgpu_preempt_mgr_sysfs_fini(struct amdgpu_device *adev)
+{
+	/* skip the removal when the device kobject's sd pointer is unset */
+	if (!adev->dev->kobj.sd)
+		return;
+
+	device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);
+}
+
+/**
* amdgpu_preempt_mgr_new - allocate a new node
*
* @man: TTM memory type manager
@@ -137,9 +148,6 @@ void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev)
if (ret)
return;
- if (adev->dev->kobj.sd)
- device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);
-
ttm_resource_manager_cleanup(man);
ttm_set_driver_manager(&adev->mman.bdev, AMDGPU_PL_PREEMPT, NULL);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
index 0d3c18f04ac3..8ae72c862d11 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
@@ -166,7 +166,8 @@ static ssize_t ta_if_load_debugfs_write(struct file *fp, const char *buf, size_t
if (ret)
return -EFAULT;
- if (ta_bin_len > PSP_1_MEG)
+ if (ta_bin_len < sizeof(struct common_firmware_header) ||
+ ta_bin_len > PSP_1_MEG)
return -EINVAL;
copy_pos += sizeof(uint32_t);
@@ -321,6 +322,8 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size
ret = copy_from_user((void *)&shared_buf_len, &buf[copy_pos], sizeof(uint32_t));
if (ret)
return -EFAULT;
+ if (!shared_buf_len || shared_buf_len > PSP_1_MEG)
+ return -EINVAL;
copy_pos += sizeof(uint32_t);
shared_buf = memdup_user(&buf[copy_pos], shared_buf_len);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 764cd4950408..148bb4cb0a2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -128,12 +128,6 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
-#define MAX_UMC_POISON_POLLING_TIME_ASYNC 10
-
-#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
-
-#define MAX_FLUSH_RETIRE_DWORK_TIMES 100
-
#define BYPASS_ALLOCATED_ADDRESS 0x0
#define BYPASS_INITIALIZATION_ADDRESS 0x1
@@ -249,16 +243,12 @@ static int amdgpu_check_address_validity(struct amdgpu_device *adev,
(address >= RAS_UMC_INJECT_ADDR_LIMIT))
return -EFAULT;
- if (amdgpu_uniras_enabled(adev)) {
- if (amdgpu_sriov_vf(adev))
- count = amdgpu_virt_ras_convert_retired_address(adev, address,
- page_pfns, ARRAY_SIZE(page_pfns));
- else
- count = amdgpu_ras_mgr_lookup_bad_pages_in_a_row(adev, address,
- page_pfns, ARRAY_SIZE(page_pfns));
- } else
- count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
- address, page_pfns, ARRAY_SIZE(page_pfns));
+ if (amdgpu_sriov_vf(adev))
+ count = amdgpu_virt_ras_convert_retired_address(adev, address,
+ page_pfns, ARRAY_SIZE(page_pfns));
+ else
+ count = amdgpu_ras_mgr_lookup_bad_pages_in_a_row(adev, address,
+ page_pfns, ARRAY_SIZE(page_pfns));
if (count <= 0)
return -EPERM;
@@ -1381,76 +1371,6 @@ static void amdgpu_ras_mgr_virt_error_data_statistics_update(struct ras_manager
obj->err_data.de_count = err_data->de_count;
}
-static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
-{
- struct ras_common_if head;
-
- memset(&head, 0, sizeof(head));
- head.block = blk;
-
- return amdgpu_ras_find_obj(adev, &head);
-}
-
-int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
- const struct aca_info *aca_info, void *data)
-{
- struct ras_manager *obj;
-
- /* in resume phase, no need to create aca fs node */
- if (adev->in_suspend || amdgpu_reset_in_recovery(adev))
- return 0;
-
- obj = get_ras_manager(adev, blk);
- if (!obj)
- return -EINVAL;
-
- return amdgpu_aca_add_handle(adev, &obj->aca_handle, ras_block_str(blk), aca_info, data);
-}
-
-int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
-{
- struct ras_manager *obj;
-
- obj = get_ras_manager(adev, blk);
- if (!obj)
- return -EINVAL;
-
- amdgpu_aca_remove_handle(&obj->aca_handle);
-
- return 0;
-}
-
-static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
- enum aca_error_type type, struct ras_err_data *err_data,
- struct ras_query_context *qctx)
-{
- struct ras_manager *obj;
-
- obj = get_ras_manager(adev, blk);
- if (!obj)
- return -EINVAL;
-
- return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx);
-}
-
-ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
- struct aca_handle *handle, char *buf, void *data)
-{
- struct ras_manager *obj = container_of(handle, struct ras_manager, aca_handle);
- struct ras_query_if info = {
- .head = obj->head,
- };
-
- if (!amdgpu_ras_get_error_query_ready(obj->adev))
- return sysfs_emit(buf, "Query currently inaccessible\n");
-
- if (amdgpu_ras_query_error_status(obj->adev, &info))
- return -EINVAL;
-
- return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count,
- "ce", info.ce_count, "de", info.de_count);
-}
-
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
struct ras_query_if *info,
struct ras_err_data *err_data,
@@ -1459,7 +1379,6 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
{
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
struct amdgpu_ras_block_object *block_obj = NULL;
- int ret;
if (blk == AMDGPU_RAS_BLOCK_COUNT)
return -EINVAL;
@@ -1469,7 +1388,7 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
if (error_query_mode == AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) {
return amdgpu_virt_req_ras_err_count(adev, blk, err_data);
- } else if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
+ } else {
if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
amdgpu_ras_get_ecc_info(adev, err_data);
} else {
@@ -1490,24 +1409,6 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
block_obj->hw_ops->query_ras_error_status(adev);
}
}
- } else {
- if (amdgpu_aca_is_enabled(adev)) {
- ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data, qctx);
- if (ret)
- return ret;
-
- ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data, qctx);
- if (ret)
- return ret;
-
- ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data, qctx);
- if (ret)
- return ret;
- } else {
- /* FIXME: add code to check return value later */
- amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx);
- amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data, qctx);
- }
}
return 0;
@@ -1624,8 +1525,6 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
enum amdgpu_ras_block block)
{
struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
- const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
- const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
if (!block_obj || !block_obj->hw_ops) {
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
@@ -1633,17 +1532,14 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
return -EOPNOTSUPP;
}
- if (!amdgpu_ras_is_supported(adev, block) ||
- !amdgpu_ras_get_aca_debug_mode(adev))
+ if (!amdgpu_ras_is_supported(adev, block))
return -EOPNOTSUPP;
if (amdgpu_sriov_vf(adev))
return -EOPNOTSUPP;
/* skip ras error reset in gpu reset */
- if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) &&
- ((smu_funcs && smu_funcs->set_debug_mode) ||
- (mca_funcs && mca_funcs->mca_set_debug_mode)))
+ if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev))
return -EOPNOTSUPP;
if (block_obj->hw_ops->reset_ras_error_count)
@@ -2090,9 +1986,6 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
- if (amdgpu_aca_is_enabled(adev))
- return 0;
-
if (!obj || obj->attr_inuse)
return -EINVAL;
@@ -2130,9 +2023,6 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
- if (amdgpu_aca_is_enabled(adev))
- return 0;
-
if (!obj || !obj->attr_inuse)
return -EINVAL;
@@ -2245,25 +2135,6 @@ static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
obj, &amdgpu_ras_debugfs_ops);
}
-static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev)
-{
- bool ret;
-
- switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
- case IP_VERSION(13, 0, 6):
- case IP_VERSION(13, 0, 12):
- case IP_VERSION(13, 0, 14):
- case IP_VERSION(13, 0, 15):
- ret = true;
- break;
- default:
- ret = false;
- break;
- }
-
- return ret;
-}
-
void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -2290,13 +2161,6 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
amdgpu_ras_debugfs_create(adev, &fs_info, dir);
}
}
-
- if (amdgpu_ras_aca_is_supported(adev)) {
- if (amdgpu_aca_is_enabled(adev))
- amdgpu_aca_smu_debugfs_init(adev, dir);
- else
- amdgpu_mca_smu_debugfs_init(adev, dir);
- }
}
/* debugfs end */
@@ -2489,14 +2353,6 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj
event_id = amdgpu_ras_acquire_event_id(adev, type);
RAS_EVENT_LOG(adev, event_id, "Poison is created\n");
- if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
- struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
-
- atomic_inc(&con->page_retirement_req_cnt);
- atomic_inc(&con->poison_creation_count);
-
- wake_up(&con->page_retirement_wq);
- }
}
static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
@@ -3026,77 +2882,6 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
return 0;
}
-static int amdgpu_ras_mca2pa_by_idx(struct amdgpu_device *adev,
- struct eeprom_table_record *bps,
- struct ras_err_data *err_data)
-{
- struct ta_ras_query_address_input addr_in;
- uint32_t socket = 0;
- int ret = 0;
-
- if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
- socket = adev->smuio.funcs->get_socket_id(adev);
-
- /* reinit err_data */
- err_data->err_addr_cnt = 0;
- err_data->err_addr_len = adev->umc.retire_unit;
-
- memset(&addr_in, 0, sizeof(addr_in));
- addr_in.ma.err_addr = bps->address;
- addr_in.ma.socket_id = socket;
- addr_in.ma.ch_inst = bps->mem_channel;
- if (!amdgpu_ras_smu_eeprom_supported(adev)) {
- /* tell RAS TA the node instance is not used */
- addr_in.ma.node_inst = TA_RAS_INV_NODE;
- } else {
- addr_in.ma.umc_inst = bps->mcumc_id;
- addr_in.ma.node_inst = bps->cu;
- }
-
- if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
- ret = adev->umc.ras->convert_ras_err_addr(adev, err_data,
- &addr_in, NULL, false);
-
- return ret;
-}
-
-static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
- struct eeprom_table_record *bps,
- struct ras_err_data *err_data)
-{
- struct ta_ras_query_address_input addr_in;
- uint32_t die_id, socket = 0;
-
- if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
- socket = adev->smuio.funcs->get_socket_id(adev);
-
- /* although die id is gotten from PA in nps1 mode, the id is
- * fitable for any nps mode
- */
- if (adev->umc.ras && adev->umc.ras->get_die_id_from_pa)
- die_id = adev->umc.ras->get_die_id_from_pa(adev, bps->address,
- bps->retired_page << AMDGPU_GPU_PAGE_SHIFT);
- else
- return -EINVAL;
-
- /* reinit err_data */
- err_data->err_addr_cnt = 0;
- err_data->err_addr_len = adev->umc.retire_unit;
-
- memset(&addr_in, 0, sizeof(addr_in));
- addr_in.ma.err_addr = bps->address;
- addr_in.ma.ch_inst = bps->mem_channel;
- addr_in.ma.umc_inst = bps->mcumc_id;
- addr_in.ma.node_inst = die_id;
- addr_in.ma.socket_id = socket;
-
- if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
- return adev->umc.ras->convert_ras_err_addr(adev, err_data,
- &addr_in, NULL, false);
- else
- return -EINVAL;
-}
-
static bool __check_record_in_range(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int count)
{
@@ -3157,117 +2942,13 @@ static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
struct eeprom_table_record *bps, struct ras_err_data *err_data,
enum amdgpu_memory_partition nps)
{
- int i = 0;
- uint64_t chan_idx_v2;
- enum amdgpu_memory_partition save_nps;
-
- save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
- chan_idx_v2 = bps[0].retired_page & UMC_CHANNEL_IDX_V2;
-
/*old asics just have pa in eeprom*/
- if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
- memcpy(err_data->err_addr, bps,
- sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
- goto out;
- }
-
- for (i = 0; i < adev->umc.retire_unit; i++)
- bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
+ memcpy(err_data->err_addr, bps,
+ sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
- if (save_nps || chan_idx_v2) {
- if (save_nps == nps) {
- if (amdgpu_umc_pages_in_a_row(adev, err_data,
- bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT))
- return -EINVAL;
- for (i = 0; i < adev->umc.retire_unit; i++) {
- err_data->err_addr[i].address = bps[0].address;
- err_data->err_addr[i].mem_channel = bps[0].mem_channel;
- err_data->err_addr[i].bank = bps[0].bank;
- err_data->err_addr[i].err_type = bps[0].err_type;
- err_data->err_addr[i].mcumc_id = bps[0].mcumc_id;
- }
- } else {
- if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
- return -EINVAL;
- }
- } else {
- if (bps[0].address == 0) {
- /* for specific old eeprom data, mca address is not stored,
- * calc it from pa
- */
- if (amdgpu_umc_pa2mca(adev, bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT,
- &(bps[0].address), AMDGPU_NPS1_PARTITION_MODE))
- return -EINVAL;
- }
-
- if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
- if (nps == AMDGPU_NPS1_PARTITION_MODE)
- memcpy(err_data->err_addr, bps,
- sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
- else
- return -EOPNOTSUPP;
- }
- }
-
-out:
return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit);
}
-static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
- struct eeprom_table_record *bps, struct ras_err_data *err_data,
- enum amdgpu_memory_partition nps)
-{
- int i = 0;
- uint64_t chan_idx_v2;
- enum amdgpu_memory_partition save_nps;
-
- if (!amdgpu_ras_smu_eeprom_supported(adev)) {
- save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
- chan_idx_v2 = bps->retired_page & UMC_CHANNEL_IDX_V2;
- bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
- } else {
- /* if pmfw manages eeprom, save_nps is not stored on eeprom,
- * we should always convert mca address into physical address,
- * make save_nps different from nps
- */
- save_nps = nps + 1;
- }
-
- if (save_nps == nps) {
- if (amdgpu_umc_pages_in_a_row(adev, err_data,
- bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
- return -EINVAL;
- for (i = 0; i < adev->umc.retire_unit; i++) {
- err_data->err_addr[i].address = bps->address;
- err_data->err_addr[i].mem_channel = bps->mem_channel;
- err_data->err_addr[i].bank = bps->bank;
- err_data->err_addr[i].err_type = bps->err_type;
- err_data->err_addr[i].mcumc_id = bps->mcumc_id;
- }
- } else {
- if (save_nps || chan_idx_v2) {
- if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
- return -EINVAL;
- } else {
- /* for specific old eeprom data, mca address is not stored,
- * calc it from pa
- */
- if (bps->address == 0)
- if (amdgpu_umc_pa2mca(adev,
- bps->retired_page << AMDGPU_GPU_PAGE_SHIFT,
- &(bps->address),
- AMDGPU_NPS1_PARTITION_MODE))
- return -EINVAL;
-
- if (amdgpu_ras_mca2pa(adev, bps, err_data))
- return -EOPNOTSUPP;
- }
- }
-
- return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
- adev->umc.retire_unit);
-}
-
/* it deal with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages, bool from_rom)
@@ -3300,8 +2981,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
if (from_rom) {
/* there is no pa recs in V3, so skip pa recs processing */
- if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) &&
- !amdgpu_ras_smu_eeprom_supported(adev)) {
+ if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
for (i = 0; i < pages; i++) {
if (control->ras_num_recs - i >= adev->umc.retire_unit) {
if ((bps[i].address == bps[i + 1].address) &&
@@ -3318,10 +2998,8 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
}
}
}
- for (; i < pages; i++) {
- ret = __amdgpu_ras_convert_rec_from_rom(adev,
- &bps[i], &err_data, nps);
- }
+ for (; i < pages; i++)
+ bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
con->eh_data->count_saved = con->eh_data->count;
} else {
@@ -3346,7 +3024,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
struct amdgpu_ras_eeprom_control *control;
- int save_count, unit_num, i;
+ int save_count, unit_num;
if (!con || !con->eh_data) {
if (new_cnt)
@@ -3367,12 +3045,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
mutex_lock(&con->recovery_lock);
control = &con->eeprom_control;
data = con->eh_data;
- if (amdgpu_ras_smu_eeprom_supported(adev))
- unit_num = control->ras_num_recs -
- control->ras_num_recs_old;
- else
- unit_num = data->count / adev->umc.retire_unit -
- control->ras_num_recs;
+ unit_num = data->count / adev->umc.retire_unit - control->ras_num_recs;
save_count = con->bad_page_num - control->ras_num_bad_pages;
mutex_unlock(&con->recovery_lock);
@@ -3383,21 +3056,10 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
/* only new entries are saved */
if (unit_num && save_count) {
/*old asics only save pa to eeprom like before*/
- if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
- if (amdgpu_ras_eeprom_append(control,
- &data->bps[data->count_saved], unit_num)) {
- dev_err(adev->dev, "Failed to save EEPROM table data!");
- return -EIO;
- }
- } else {
- for (i = 0; i < unit_num; i++) {
- if (amdgpu_ras_eeprom_append(control,
- &data->bps[data->count_saved +
- i * adev->umc.retire_unit], 1)) {
- dev_err(adev->dev, "Failed to save EEPROM table data!");
- return -EIO;
- }
- }
+ if (amdgpu_ras_eeprom_append(control,
+ &data->bps[data->count_saved], unit_num)) {
+ dev_err(adev->dev, "Failed to save EEPROM table data!");
+ return -EIO;
}
dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
@@ -3416,7 +3078,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
struct amdgpu_ras_eeprom_control *control =
&adev->psp.ras_context.ras->eeprom_control;
struct eeprom_table_record *bps;
- int ret, i = 0;
+ int ret;
/* no bad page record, skip eeprom access */
if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
@@ -3430,33 +3092,6 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
if (ret) {
dev_err(adev->dev, "Failed to load EEPROM table records!");
} else {
- if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
- /*In V3, there is no pa recs, and some cases(when address==0) may be parsed
- as pa recs, so add verion check to avoid it.
- */
- if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) &&
- !amdgpu_ras_smu_eeprom_supported(adev)) {
- for (i = 0; i < control->ras_num_recs; i++) {
- if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
- if ((bps[i].address == bps[i + 1].address) &&
- (bps[i].mem_channel == bps[i + 1].mem_channel)) {
- control->ras_num_pa_recs += adev->umc.retire_unit;
- i += (adev->umc.retire_unit - 1);
- } else {
- control->ras_num_mca_recs +=
- (control->ras_num_recs - i);
- break;
- }
- } else {
- control->ras_num_mca_recs += (control->ras_num_recs - i);
- break;
- }
- }
- } else {
- control->ras_num_mca_recs = control->ras_num_recs;
- }
- }
-
ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
if (ret)
goto out;
@@ -3550,293 +3185,6 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
}
}
-int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
- enum amdgpu_ras_block block, uint16_t pasid,
- pasid_notify pasid_fn, void *data, uint32_t reset)
-{
- int ret = 0;
- struct ras_poison_msg poison_msg;
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-
- memset(&poison_msg, 0, sizeof(poison_msg));
- poison_msg.block = block;
- poison_msg.pasid = pasid;
- poison_msg.reset = reset;
- poison_msg.pasid_fn = pasid_fn;
- poison_msg.data = data;
-
- ret = kfifo_put(&con->poison_fifo, poison_msg);
- if (!ret) {
- dev_err(adev->dev, "Poison message fifo is full!\n");
- return -ENOSPC;
- }
-
- return 0;
-}
-
-static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev,
- struct ras_poison_msg *poison_msg)
-{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-
- return kfifo_get(&con->poison_fifo, poison_msg);
-}
-
-static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
-{
- mutex_init(&ecc_log->lock);
-
- INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
- ecc_log->de_queried_count = 0;
- ecc_log->consumption_q_count = 0;
-}
-
-static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
-{
- struct radix_tree_iter iter;
- void __rcu **slot;
- struct ras_ecc_err *ecc_err;
-
- mutex_lock(&ecc_log->lock);
- radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) {
- ecc_err = radix_tree_deref_slot(slot);
- kfree(ecc_err->err_pages.pfn);
- kfree(ecc_err);
- radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot);
- }
- mutex_unlock(&ecc_log->lock);
-
- mutex_destroy(&ecc_log->lock);
- ecc_log->de_queried_count = 0;
- ecc_log->consumption_q_count = 0;
-}
-
-static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con,
- uint32_t delayed_ms)
-{
- int ret;
-
- mutex_lock(&con->umc_ecc_log.lock);
- ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
- UMC_ECC_NEW_DETECTED_TAG);
- mutex_unlock(&con->umc_ecc_log.lock);
-
- if (ret)
- schedule_delayed_work(&con->page_retirement_dwork,
- msecs_to_jiffies(delayed_ms));
-
- return ret ? true : false;
-}
-
-static void amdgpu_ras_do_page_retirement(struct work_struct *work)
-{
- struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
- page_retirement_dwork.work);
- struct amdgpu_device *adev = con->adev;
- struct ras_err_data err_data;
-
- /* If gpu reset is ongoing, delay retiring the bad pages */
- if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
- amdgpu_ras_schedule_retirement_dwork(con,
- AMDGPU_RAS_RETIRE_PAGE_INTERVAL * 3);
- return;
- }
-
- amdgpu_ras_error_data_init(&err_data);
-
- amdgpu_umc_handle_bad_pages(adev, &err_data);
-
- amdgpu_ras_error_data_fini(&err_data);
-
- amdgpu_ras_schedule_retirement_dwork(con,
- AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
-}
-
-static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
- uint32_t poison_creation_count)
-{
- int ret = 0;
- struct ras_ecc_log_info *ecc_log;
- struct ras_query_if info;
- u32 timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
- struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
- u64 de_queried_count;
- u64 consumption_q_count;
- enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
-
- memset(&info, 0, sizeof(info));
- info.head.block = AMDGPU_RAS_BLOCK__UMC;
-
- ecc_log = &ras->umc_ecc_log;
- ecc_log->de_queried_count = 0;
- ecc_log->consumption_q_count = 0;
-
- do {
- ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
- if (ret)
- return ret;
-
- de_queried_count = ecc_log->de_queried_count;
- consumption_q_count = ecc_log->consumption_q_count;
-
- if (de_queried_count && consumption_q_count)
- break;
-
- msleep(100);
- } while (--timeout);
-
- if (de_queried_count)
- schedule_delayed_work(&ras->page_retirement_dwork, 0);
-
- if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0)
- amdgpu_ras_reset_gpu(adev);
-
- return 0;
-}
-
-static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev)
-{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- struct ras_poison_msg msg;
- int ret;
-
- do {
- ret = kfifo_get(&con->poison_fifo, &msg);
- } while (ret);
-}
-
-static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
- uint32_t msg_count, uint32_t *gpu_reset)
-{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- uint32_t reset_flags = 0, reset = 0;
- struct ras_poison_msg msg;
- int ret, i;
-
- kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-
- for (i = 0; i < msg_count; i++) {
- ret = amdgpu_ras_get_poison_req(adev, &msg);
- if (!ret)
- continue;
-
- if (msg.pasid_fn)
- msg.pasid_fn(adev, msg.pasid, msg.data);
-
- reset_flags |= msg.reset;
- }
-
- /*
- * Try to ensure poison creation handler is completed first
- * to set rma if bad page exceed threshold.
- */
- flush_delayed_work(&con->page_retirement_dwork);
-
- /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
- if (reset_flags && !amdgpu_ras_is_rma(adev)) {
- if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
- reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
- else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET)
- reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
- else
- reset = reset_flags;
-
- con->gpu_reset_flags |= reset;
- amdgpu_ras_reset_gpu(adev);
-
- *gpu_reset = reset;
-
- /* Wait for gpu recovery to complete */
- flush_work(&con->recovery_work);
- }
-
- return 0;
-}
-
-static int amdgpu_ras_page_retirement_thread(void *param)
-{
- struct amdgpu_device *adev = (struct amdgpu_device *)param;
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- uint32_t poison_creation_count, msg_count;
- uint32_t gpu_reset;
- int ret;
-
- while (!kthread_should_stop()) {
-
- wait_event_interruptible(con->page_retirement_wq,
- kthread_should_stop() ||
- atomic_read(&con->page_retirement_req_cnt));
-
- if (kthread_should_stop())
- break;
-
- mutex_lock(&con->poison_lock);
- gpu_reset = 0;
-
- do {
- poison_creation_count = atomic_read(&con->poison_creation_count);
- ret = amdgpu_ras_poison_creation_handler(adev, poison_creation_count);
- if (ret == -EIO)
- break;
-
- if (poison_creation_count) {
- atomic_sub(poison_creation_count, &con->poison_creation_count);
- atomic_sub(poison_creation_count, &con->page_retirement_req_cnt);
- }
- } while (atomic_read(&con->poison_creation_count) &&
- !atomic_read(&con->poison_consumption_count));
-
- if (ret != -EIO) {
- msg_count = kfifo_len(&con->poison_fifo);
- if (msg_count) {
- ret = amdgpu_ras_poison_consumption_handler(adev,
- msg_count, &gpu_reset);
- if ((ret != -EIO) &&
- (gpu_reset != AMDGPU_RAS_GPU_RESET_MODE1_RESET))
- atomic_sub(msg_count, &con->page_retirement_req_cnt);
- }
- }
-
- if ((ret == -EIO) || (gpu_reset == AMDGPU_RAS_GPU_RESET_MODE1_RESET)) {
- /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */
- /* Clear poison creation request */
- atomic_set(&con->poison_creation_count, 0);
- atomic_set(&con->poison_consumption_count, 0);
-
- /* Clear poison fifo */
- amdgpu_ras_clear_poison_fifo(adev);
-
- /* Clear all poison requests */
- atomic_set(&con->page_retirement_req_cnt, 0);
-
- if (ret == -EIO) {
- /* Wait for mode-1 reset to complete */
- down_read(&adev->reset_domain->sem);
- up_read(&adev->reset_domain->sem);
- }
-
- /* Wake up work to save bad pages to eeprom */
- schedule_delayed_work(&con->page_retirement_dwork, 0);
- } else if (gpu_reset) {
- /* gpu just completed mode-2 reset or other reset */
- /* Clear poison consumption messages cached in fifo */
- msg_count = kfifo_len(&con->poison_fifo);
- if (msg_count) {
- amdgpu_ras_clear_poison_fifo(adev);
- atomic_sub(msg_count, &con->page_retirement_req_cnt);
- }
-
- atomic_set(&con->poison_consumption_count, 0);
-
- /* Wake up work to save bad pages to eeprom */
- schedule_delayed_work(&con->page_retirement_dwork, 0);
- }
- mutex_unlock(&con->poison_lock);
- }
-
- return 0;
-}
-
int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -3846,7 +3194,14 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
if (!con || amdgpu_sriov_vf(adev))
return 0;
- if (amdgpu_uniras_enabled(adev))
+ /*
+ * On the reset-on-init path (e.g. an NPS memory partition
+ * switch) the RAS IP block hw_init has not been enabled and
+ * amdgpu_uniras_enabled() returns false, so also check the
+ * amdgpu RAS context's uniras_enabled flag; EEPROM init will
+ * be called during RAS IP block hw_init.
+ */
+ if (amdgpu_uniras_enabled(adev) || con->uniras_enabled)
return 0;
control = &con->eeprom_control;
@@ -3855,9 +3210,6 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
ret = amdgpu_ras_eeprom_init(control);
control->is_eeprom_valid = !ret;
- if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
- control->ras_num_pa_recs = control->ras_num_recs;
-
if (adev->umc.ras &&
adev->umc.ras->get_retire_flip_bits)
adev->umc.ras->get_retire_flip_bits(adev);
@@ -3877,13 +3229,6 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
adev, control->bad_channel_bitmap);
con->update_channel_flag = false;
}
-
- /* The format action is only applied to new ASICs */
- if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 &&
- control->tbl_hdr.version < RAS_TABLE_VER_V3)
- if (!amdgpu_ras_eeprom_reset_table(control))
- if (amdgpu_ras_save_bad_pages(adev, NULL))
- dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
}
return 0;
@@ -3917,10 +3262,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
}
mutex_init(&con->recovery_lock);
- mutex_init(&con->poison_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0);
- atomic_set(&con->rma_in_recovery, 0);
con->eeprom_control.bad_channel_bitmap = 0;
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
@@ -3933,21 +3276,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
}
mutex_init(&con->page_rsv_lock);
- INIT_KFIFO(con->poison_fifo);
mutex_init(&con->page_retirement_lock);
- init_waitqueue_head(&con->page_retirement_wq);
- atomic_set(&con->page_retirement_req_cnt, 0);
- atomic_set(&con->poison_creation_count, 0);
- atomic_set(&con->poison_consumption_count, 0);
- con->page_retirement_thread =
- kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement");
- if (IS_ERR(con->page_retirement_thread)) {
- con->page_retirement_thread = NULL;
- dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");
- }
-
- INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement);
- amdgpu_ras_ecc_log_init(&con->umc_ecc_log);
+
#ifdef CONFIG_X86_MCE_AMD
if ((adev->asic_type == CHIP_ALDEBARAN) &&
(adev->gmc.xgmi.connected_to_cpu))
@@ -3978,33 +3308,15 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data = con->eh_data;
- int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES;
- bool ret;
/* recovery_init failed to init it, fini is useless */
if (!data)
return 0;
- /* Save all cached bad pages to eeprom */
- do {
- flush_delayed_work(&con->page_retirement_dwork);
- ret = amdgpu_ras_schedule_retirement_dwork(con, 0);
- } while (ret && max_flush_timeout--);
-
- if (con->page_retirement_thread)
- kthread_stop(con->page_retirement_thread);
-
- atomic_set(&con->page_retirement_req_cnt, 0);
- atomic_set(&con->poison_creation_count, 0);
-
mutex_destroy(&con->page_rsv_lock);
cancel_work_sync(&con->recovery_work);
- cancel_delayed_work_sync(&con->page_retirement_dwork);
-
- amdgpu_ras_ecc_log_fini(&con->umc_ecc_log);
-
mutex_lock(&con->recovery_lock);
con->eh_data = NULL;
kfree(data->bps);
@@ -4206,15 +3518,6 @@ init_ras_enabled_flag:
adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
adev->ras_hw_enabled & amdgpu_ras_mask;
- /* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14 */
- if (!amdgpu_sriov_vf(adev)) {
- adev->aca.is_enabled =
- (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) ||
- amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) ||
- amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14) ||
- amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 15));
- }
-
/* bad page feature is not applicable to specific app platform */
if (adev->gmc.is_app_apu &&
amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0))
@@ -4435,15 +3738,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
- if (amdgpu_ras_aca_is_supported(adev)) {
- if (amdgpu_aca_is_enabled(adev))
- r = amdgpu_aca_init(adev);
- else
- r = amdgpu_mca_init(adev);
- if (r)
- goto release_con;
- }
-
con->init_task_pid = task_pid_nr(current);
get_task_comm(con->init_task_comm, current);
@@ -4541,9 +3835,9 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
goto cleanup;
}
- if (ras_obj->hw_ops &&
+ if (amdgpu_uniras_enabled(adev) || (ras_obj->hw_ops &&
(ras_obj->hw_ops->query_ras_error_count ||
- ras_obj->hw_ops->query_ras_error_status)) {
+ ras_obj->hw_ops->query_ras_error_status))) {
r = amdgpu_ras_sysfs_create(adev, ras_block);
if (r)
goto interrupt;
@@ -4671,28 +3965,13 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
amdgpu_ras_event_mgr_init(adev);
- if (amdgpu_ras_aca_is_supported(adev)) {
- if (amdgpu_reset_in_recovery(adev)) {
- if (amdgpu_aca_is_enabled(adev))
- r = amdgpu_aca_reset(adev);
- else
- r = amdgpu_mca_reset(adev);
- if (r)
- return r;
- }
-
- if (!amdgpu_sriov_vf(adev)) {
- if (amdgpu_aca_is_enabled(adev))
- amdgpu_ras_set_aca_debug_mode(adev, false);
- else
- amdgpu_ras_set_mca_debug_mode(adev, false);
- }
- }
-
/* Guest side doesn't need init ras feature */
if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev))
return 0;
+ if (amdgpu_uniras_enabled(adev))
+ amdgpu_ras_mgr_set_debug_mode(adev, false);
+
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
obj = node->ras_obj;
if (!obj) {
@@ -4773,13 +4052,6 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
amdgpu_ras_fs_fini(adev);
amdgpu_ras_interrupt_remove_all(adev);
- if (amdgpu_ras_aca_is_supported(adev)) {
- if (amdgpu_aca_is_enabled(adev))
- amdgpu_aca_fini(adev);
- else
- amdgpu_mca_fini(adev);
- }
-
WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared");
if (AMDGPU_RAS_GET_FEATURES(con->features))
@@ -5064,6 +4336,13 @@ static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
* Use this list instead of mgpu_info to find the amdgpu
* device on which the UMC error was reported.
*/
+ if (mce_adev_list.num_gpu >= MAX_GPU_INSTANCE) {
+ dev_warn_ratelimited(adev->dev,
+ "mce_adev_list full, skip notifier registration (max=%d)\n",
+ MAX_GPU_INSTANCE);
+ return;
+ }
+
mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
/*
@@ -5181,59 +4460,10 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
return 0;
}
-int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
-{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- int ret = 0;
-
- if (con) {
- ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
- if (!ret)
- con->is_aca_debug_mode = enable;
- }
-
- return ret;
-}
-
-int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable)
-{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- int ret = 0;
-
- if (con) {
- if (amdgpu_aca_is_enabled(adev))
- ret = amdgpu_aca_smu_set_debug_mode(adev, enable);
- else
- ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
- if (!ret)
- con->is_aca_debug_mode = enable;
- }
-
- return ret;
-}
-
-bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev)
-{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
- const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
-
- if (!con)
- return false;
-
- if ((amdgpu_aca_is_enabled(adev) && smu_funcs && smu_funcs->set_debug_mode) ||
- (!amdgpu_aca_is_enabled(adev) && mca_funcs && mca_funcs->mca_set_debug_mode))
- return con->is_aca_debug_mode;
- else
- return true;
-}
-
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
unsigned int *error_query_mode)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
- const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
if (!con) {
*error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
@@ -5242,9 +4472,6 @@ bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
if (amdgpu_sriov_vf(adev)) {
*error_query_mode = AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY;
- } else if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode)) {
- *error_query_mode =
- (con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
} else {
*error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
}
@@ -5834,3 +5061,8 @@ void amdgpu_ras_post_reset(struct amdgpu_device *adev,
amdgpu_ras_mgr_post_reset(tmp_adev);
}
}
+
+void amdgpu_ras_resume_after_reset(struct amdgpu_device *adev)
+{ /* Thin wrapper that only forwards @adev to amdgpu_ras_mgr_resume_after_reset(). */
+ amdgpu_ras_mgr_resume_after_reset(adev); /* callee's result, if any, is not propagated */
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index a86ab65aa2f0..a44aed7f169e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -31,7 +31,6 @@
#include "ta_ras_if.h"
#include "amdgpu_ras_eeprom.h"
#include "amdgpu_smuio.h"
-#include "amdgpu_aca.h"
struct amdgpu_iv_entry;
@@ -466,14 +465,6 @@ struct ras_query_context {
typedef int (*pasid_notify)(struct amdgpu_device *adev,
uint16_t pasid, void *data);
-struct ras_poison_msg {
- enum amdgpu_ras_block block;
- uint16_t pasid;
- uint32_t reset;
- pasid_notify pasid_fn;
- void *data;
-};
-
struct ras_err_pages {
uint32_t count;
uint64_t *pfn;
@@ -492,8 +483,6 @@ struct ras_ecc_err {
struct ras_ecc_log_info {
struct mutex lock;
struct radix_tree_root de_page_tree;
- uint64_t de_queried_count;
- uint64_t consumption_q_count;
};
struct ras_critical_region {
@@ -549,7 +538,6 @@ struct amdgpu_ras {
/* gpu recovery */
struct work_struct recovery_work;
atomic_t in_recovery;
- atomic_t rma_in_recovery;
struct amdgpu_device *adev;
/* error handler data */
struct ras_err_handler_data *eh_data;
@@ -581,22 +569,15 @@ struct amdgpu_ras {
/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
/* Record status of smu mca debug mode */
- bool is_aca_debug_mode;
+ bool is_mca_debug_mode;
bool is_rma;
/* Record special requirements of gpu reset caller */
uint32_t gpu_reset_flags;
- struct task_struct *page_retirement_thread;
- wait_queue_head_t page_retirement_wq;
struct mutex page_retirement_lock;
- atomic_t page_retirement_req_cnt;
- atomic_t poison_creation_count;
- atomic_t poison_consumption_count;
struct mutex page_rsv_lock;
- DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
struct ras_ecc_log_info umc_ecc_log;
- struct delayed_work page_retirement_dwork;
/* ras errors detected */
unsigned long ras_err_state;
@@ -615,9 +596,6 @@ struct amdgpu_ras {
struct list_head critical_region_head;
struct mutex critical_region_lock;
- /* Protect poison injection */
- struct mutex poison_lock;
-
/* Disable/Enable uniras switch */
bool uniras_enabled;
const struct ras_smu_drv *ras_smu_drv;
@@ -702,8 +680,6 @@ struct ras_manager {
struct ras_ih_data ih_data;
struct ras_err_data err_data;
-
- struct aca_handle aca_handle;
};
struct ras_badpage {
@@ -964,8 +940,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev);
int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con);
int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
-int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable);
-bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev);
+bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
unsigned int *mode);
@@ -1006,12 +981,6 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info,
u64 count);
void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances);
-int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
- const struct aca_info *aca_info, void *data);
-int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk);
-
-ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
- struct aca_handle *handle, char *buf, void *data);
void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
@@ -1029,10 +998,6 @@ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn);
int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, struct amdgpu_bo *bo);
bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr);
-int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
- enum amdgpu_ras_block block, uint16_t pasid,
- pasid_notify pasid_fn, void *data, uint32_t reset);
-
bool amdgpu_ras_in_recovery(struct amdgpu_device *adev);
__printf(3, 4)
@@ -1045,4 +1010,5 @@ void amdgpu_ras_pre_reset(struct amdgpu_device *adev,
struct list_head *device_list);
void amdgpu_ras_post_reset(struct amdgpu_device *adev,
struct list_head *device_list);
+void amdgpu_ras_resume_after_reset(struct amdgpu_device *adev);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index b265b4d9053f..baa8cc3646d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -124,8 +124,6 @@
RAS_TABLE_V2_1_INFO_SIZE) \
/ RAS_TABLE_RECORD_SIZE)
-#define RAS_SMU_MESSAGE_TIMEOUT_MS 1000 /* 1s */
-
/* Given a zero-based index of an EEPROM RAS record, yields the EEPROM
* offset off of RAS_TABLE_START. That is, this is something you can
* add to control->i2c_address, and then tell I2C layer to read
@@ -159,6 +157,9 @@
static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
{
+ if (amdgpu_sriov_vf(adev))
+ return false;
+
switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
case IP_VERSION(11, 0, 2): /* VEGA20 and ARCTURUS */
case IP_VERSION(11, 0, 7): /* Sienna cichlid */
@@ -449,57 +450,46 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- u32 erase_res = 0;
u8 csum;
int res;
mutex_lock(&control->ras_tbl_mutex);
- if (!amdgpu_ras_smu_eeprom_supported(adev)) {
- hdr->header = RAS_TABLE_HDR_VAL;
- amdgpu_ras_set_eeprom_table_version(control);
-
- if (hdr->version >= RAS_TABLE_VER_V2_1) {
- hdr->first_rec_offset = RAS_RECORD_START_V2_1;
- hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
- RAS_TABLE_V2_1_INFO_SIZE;
- rai->rma_status = GPU_HEALTH_USABLE;
-
- control->ras_record_offset = RAS_RECORD_START_V2_1;
- control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
- /**
- * GPU health represented as a percentage.
- * 0 means worst health, 100 means fully health.
- */
- rai->health_percent = 100;
- /* ecc_page_threshold = 0 means disable bad page retirement */
- rai->ecc_page_threshold = con->bad_page_cnt_threshold;
- } else {
- hdr->first_rec_offset = RAS_RECORD_START;
- hdr->tbl_size = RAS_TABLE_HEADER_SIZE;
+ hdr->header = RAS_TABLE_HDR_VAL;
+ amdgpu_ras_set_eeprom_table_version(control);
- control->ras_record_offset = RAS_RECORD_START;
- control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
- }
+ if (hdr->version >= RAS_TABLE_VER_V2_1) {
+ hdr->first_rec_offset = RAS_RECORD_START_V2_1;
+ hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
+ RAS_TABLE_V2_1_INFO_SIZE;
+ rai->rma_status = GPU_HEALTH_USABLE;
- csum = __calc_hdr_byte_sum(control);
- if (hdr->version >= RAS_TABLE_VER_V2_1)
- csum += __calc_ras_info_byte_sum(control);
- csum = -csum;
- hdr->checksum = csum;
- res = __write_table_header(control);
- if (!res && hdr->version > RAS_TABLE_VER_V1)
- res = __write_table_ras_info(control);
+ control->ras_record_offset = RAS_RECORD_START_V2_1;
+ control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
+ /**
+ * GPU health represented as a percentage.
+ * 0 means worst health, 100 means fully health.
+ */
+ rai->health_percent = 100;
+ /* ecc_page_threshold = 0 means disable bad page retirement */
+ rai->ecc_page_threshold = con->bad_page_cnt_threshold;
} else {
- res = amdgpu_ras_smu_erase_ras_table(adev, &erase_res);
- if (res || erase_res) {
- dev_warn(adev->dev, "RAS EEPROM reset failed, res:%d result:%d",
- res, erase_res);
- if (!res)
- res = -EIO;
- }
+ hdr->first_rec_offset = RAS_RECORD_START;
+ hdr->tbl_size = RAS_TABLE_HEADER_SIZE;
+
+ control->ras_record_offset = RAS_RECORD_START;
+ control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
}
+ csum = __calc_hdr_byte_sum(control);
+ if (hdr->version >= RAS_TABLE_VER_V2_1)
+ csum += __calc_ras_info_byte_sum(control);
+ csum = -csum;
+ hdr->checksum = csum;
+ res = __write_table_header(control);
+ if (!res && hdr->version > RAS_TABLE_VER_V1)
+ res = __write_table_ras_info(control);
+
control->ras_num_recs = 0;
control->ras_num_bad_pages = 0;
control->ras_num_mca_recs = 0;
@@ -662,7 +652,6 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
const u32 num)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
- struct amdgpu_device *adev = to_amdgpu_device(control);
u32 a, b, i;
u8 *buf, *pp;
int res;
@@ -767,10 +756,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
% control->ras_max_record_count;
/*old asics only save pa to eeprom like before*/
- if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12)
- control->ras_num_pa_recs += num;
- else
- control->ras_num_mca_recs += num;
+ control->ras_num_pa_recs += num;
control->ras_num_bad_pages = con->bad_page_num;
Out:
@@ -879,71 +865,6 @@ Out:
return res;
}
-int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control)
-{
- struct amdgpu_device *adev = to_amdgpu_device(control);
- int ret, retry = 20;
-
- if (!amdgpu_ras_smu_eeprom_supported(adev))
- return 0;
-
- control->ras_num_recs_old = control->ras_num_recs;
-
- do {
- /* 1000ms timeout is long enough, smu_get_badpage_count won't
- * return -EBUSY before timeout.
- */
- ret = amdgpu_ras_smu_get_badpage_count(adev,
- &(control->ras_num_recs), RAS_SMU_MESSAGE_TIMEOUT_MS);
- if (!ret &&
- (control->ras_num_recs_old == control->ras_num_recs)) {
- /* record number update in PMFW needs some time,
- * smu_get_badpage_count may return immediately without
- * count update, sleep for a while and retry again.
- */
- msleep(50);
- retry--;
- } else {
- break;
- }
- } while (retry);
-
- /* no update of record number is not a real failure,
- * don't print warning here
- */
- if (!ret && (control->ras_num_recs_old == control->ras_num_recs))
- ret = -EINVAL;
-
- return ret;
-}
-
-static int amdgpu_ras_smu_eeprom_append(struct amdgpu_ras_eeprom_control *control)
-{
- struct amdgpu_device *adev = to_amdgpu_device(control);
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-
- if (!amdgpu_ras_smu_eeprom_supported(adev) || !con)
- return 0;
-
- control->ras_num_bad_pages = con->bad_page_num;
-
- if (amdgpu_bad_page_threshold != 0 &&
- control->ras_num_bad_pages > con->bad_page_cnt_threshold) {
- dev_warn(adev->dev,
- "Saved bad pages %d reaches threshold value %d\n",
- control->ras_num_bad_pages, con->bad_page_cnt_threshold);
-
- if (adev->cper.enabled && amdgpu_cper_generate_bp_threshold_record(adev))
- dev_warn(adev->dev, "fail to generate bad page threshold cper records\n");
-
- if ((amdgpu_bad_page_threshold != -1) &&
- (amdgpu_bad_page_threshold != -2))
- con->is_rma = true;
- }
-
- return 0;
-}
-
/**
* amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table
* @control: pointer to control structure
@@ -968,9 +889,6 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
if (!__is_ras_eeprom_supported(adev))
return 0;
- if (amdgpu_ras_smu_eeprom_supported(adev))
- return amdgpu_ras_smu_eeprom_append(control);
-
if (num == 0) {
dev_err(adev->dev, "will not append 0 records\n");
return -EINVAL;
@@ -1046,52 +964,6 @@ static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
return res;
}
-int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
- struct eeprom_table_record *record, u32 rec_idx,
- const u32 num)
-{
- struct amdgpu_device *adev = to_amdgpu_device(control);
- uint64_t ts, end_idx;
- int i, ret;
- u64 mca, ipid;
- u32 cu, mem_channel, mcumc_id;
-
- if (!amdgpu_ras_smu_eeprom_supported(adev))
- return 0;
-
- if (!adev->umc.ras || !adev->umc.ras->mca_ipid_parse)
- return -EOPNOTSUPP;
-
- end_idx = rec_idx + num;
- for (i = rec_idx; i < end_idx; i++) {
- ret = amdgpu_ras_smu_get_badpage_mca_addr(adev, i, &mca);
- if (ret)
- return ret;
-
- ret = amdgpu_ras_smu_get_badpage_ipid(adev, i, &ipid);
- if (ret)
- return ret;
-
- ret = amdgpu_ras_smu_get_timestamp(adev, i, &ts);
- if (ret)
- return ret;
-
- record[i - rec_idx].address = mca;
- /* retired_page (pa) is unused now */
- record[i - rec_idx].retired_page = 0x1ULL;
- record[i - rec_idx].ts = ts;
- record[i - rec_idx].err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
-
- adev->umc.ras->mca_ipid_parse(adev, ipid,
- &cu, &mem_channel, &mcumc_id, NULL);
- record[i - rec_idx].cu = (u8)cu;
- record[i - rec_idx].mem_channel = (u8)mem_channel;
- record[i - rec_idx].mcumc_id = (u8)mcumc_id;
- }
-
- return 0;
-}
-
/**
* amdgpu_ras_eeprom_read -- read EEPROM
* @control: pointer to control structure
@@ -1113,9 +985,6 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
u8 *buf, *pp;
u32 g0, g1;
- if (amdgpu_ras_smu_eeprom_supported(adev))
- return amdgpu_ras_eeprom_read_idx(control, record, 0, num);
-
if (!__is_ras_eeprom_supported(adev))
return 0;
@@ -1396,6 +1265,86 @@ Out:
}
static ssize_t
+amdgpu_ras_debugfs_table_read_uniras(struct amdgpu_device *adev,
+ char __user *buf,
+ size_t size, loff_t *pos)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ struct ras_core_context *ras_core = ras_mgr ? ras_mgr->ras_core : NULL;
+ struct eeprom_umc_record *records = NULL;
+ struct ras_eeprom_control *control;
+ size_t bufsz, len = 0;
+ u32 num_recs;
+ char *kbuf;
+ ssize_t res;
+ int i;
+
+ if (!ras_core)
+ return 0;
+
+ /* pmfw manages eeprom data by itself */
+ if (ras_fw_eeprom_supported(ras_core))
+ return 0;
+
+ control = &ras_core->ras_eeprom;
+ num_recs = ras_eeprom_get_record_count(ras_core);
+
+ bufsz = strlen(tbl_hdr_str) + tbl_hdr_fmt_size +
+ strlen(rec_hdr_str) + (size_t)rec_hdr_fmt_size * num_recs + 1;
+
+ kbuf = kvmalloc(bufsz, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ if (num_recs) {
+ records = kvcalloc(num_recs, sizeof(*records), GFP_KERNEL);
+ if (!records) {
+ res = -ENOMEM;
+ goto out;
+ }
+
+ res = ras_eeprom_read(ras_core, records, num_recs);
+ if (res)
+ goto out;
+ }
+
+ len += scnprintf(kbuf + len, bufsz - len, "%s", tbl_hdr_str);
+ len += scnprintf(kbuf + len, bufsz - len, tbl_hdr_fmt,
+ control->tbl_hdr.header,
+ control->tbl_hdr.version,
+ control->tbl_hdr.first_rec_offset,
+ control->tbl_hdr.tbl_size,
+ control->tbl_hdr.checksum);
+ len += scnprintf(kbuf + len, bufsz - len, "%s", rec_hdr_str);
+
+ for (i = 0; i < num_recs; i++) {
+ u32 ai = RAS_RI_TO_AI(control, i);
+ int et = records[i].err_type;
+ const char *ets = (et >= 0 && et < AMDGPU_RAS_EEPROM_ERR_COUNT) ?
+ record_err_type_str[et] : "na";
+
+ len += scnprintf(kbuf + len, bufsz - len, rec_hdr_fmt,
+ i,
+ RAS_INDEX_TO_OFFSET(control, ai),
+ ets,
+ records[i].bank,
+ records[i].ts,
+ records[i].offset,
+ records[i].mem_channel,
+ records[i].mcumc_id,
+ records[i].retired_row_pfn);
+ }
+
+ res = simple_read_from_buffer(buf, size, pos, kbuf, len);
+
+out:
+ kvfree(records);
+ kvfree(kbuf);
+
+ return res;
+}
+
+static ssize_t
amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf,
size_t size, loff_t *pos)
{
@@ -1408,6 +1357,10 @@ amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf,
if (!size)
return size;
+ if (amdgpu_uniras_enabled(adev))
+ return amdgpu_ras_debugfs_table_read_uniras(adev, buf,
+ size, pos);
+
if (!ras || !control) {
res = snprintf(data, sizeof(data), "Not supported\n");
if (*pos >= res)
@@ -1521,42 +1474,6 @@ Out:
return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
}
-static int amdgpu_ras_smu_eeprom_init(struct amdgpu_ras_eeprom_control *control)
-{
- struct amdgpu_device *adev = to_amdgpu_device(control);
- struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
- struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
- uint64_t local_time;
- int res;
-
- ras->is_rma = false;
-
- if (!__is_ras_eeprom_supported(adev))
- return 0;
- mutex_init(&control->ras_tbl_mutex);
-
- res = amdgpu_ras_smu_get_table_version(adev, &(hdr->version));
- if (res)
- return res;
-
- res = amdgpu_ras_smu_get_badpage_count(adev,
- &(control->ras_num_recs), 100);
- if (res)
- return res;
-
- local_time = (uint64_t)ktime_get_real_seconds();
- res = amdgpu_ras_smu_set_timestamp(adev, local_time);
- if (res)
- return res;
-
- control->ras_max_record_count = 4000;
-
- control->ras_num_mca_recs = 0;
- control->ras_num_pa_recs = 0;
-
- return 0;
-}
-
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
@@ -1567,9 +1484,6 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
uint32_t vram_type = adev->gmc.vram_type;
int res;
- if (amdgpu_ras_smu_eeprom_supported(adev))
- return amdgpu_ras_smu_eeprom_init(control);
-
ras->is_rma = false;
if (!__is_ras_eeprom_supported(adev))
@@ -1663,47 +1577,6 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
return 0;
}
-static int amdgpu_ras_smu_eeprom_check(struct amdgpu_ras_eeprom_control *control)
-{
- struct amdgpu_device *adev = to_amdgpu_device(control);
- struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-
- if (!__is_ras_eeprom_supported(adev))
- return 0;
-
- control->ras_num_bad_pages = ras->bad_page_num;
-
- if ((ras->bad_page_cnt_threshold < control->ras_num_bad_pages) &&
- amdgpu_bad_page_threshold != 0) {
- dev_warn(adev->dev,
- "RAS records:%d exceed threshold:%d\n",
- control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
- if ((amdgpu_bad_page_threshold == -1) ||
- (amdgpu_bad_page_threshold == -2)) {
- dev_warn(adev->dev,
- "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
- } else {
- ras->is_rma = true;
- dev_warn(adev->dev,
- "User defined threshold is set, runtime service will be halt when threshold is reached\n");
- }
-
- return 0;
- }
-
- dev_dbg(adev->dev,
- "Found existing EEPROM table with %d records",
- control->ras_num_bad_pages);
-
- /* Warn if we are at 90% of the threshold or above
- */
- if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold)
- dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
- control->ras_num_bad_pages,
- ras->bad_page_cnt_threshold);
- return 0;
-}
-
int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
@@ -1711,9 +1584,6 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
int res = 0;
- if (amdgpu_ras_smu_eeprom_supported(adev))
- return amdgpu_ras_smu_eeprom_check(control);
-
if (!__is_ras_eeprom_supported(adev))
return 0;
@@ -1973,7 +1843,7 @@ void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev)
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
- if (!control || amdgpu_bad_page_threshold == 0)
+ if (!__is_ras_eeprom_supported(adev) || !control || amdgpu_bad_page_threshold == 0)
return;
if (control->ras_num_bad_pages > ras->bad_page_cnt_threshold) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index a62114800a92..3c7fcce5fe8b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -82,7 +82,6 @@ struct amdgpu_ras_eeprom_control {
/* Number of records in the table.
*/
u32 ras_num_recs;
- u32 ras_num_recs_old;
/* the bad page number is ras_num_recs or
* ras_num_recs * umc.retire_unit
@@ -191,8 +190,6 @@ int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
struct eeprom_table_record *record, u32 rec_idx,
const u32 num);
-int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control);
-
void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev);
extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index b97fa35bac23..4d417c4a5cd2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -935,6 +935,194 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
return 0;
}
+/**
+ * amdgpu_multi_ring_reset_helper_begin() - Prepare multiple rings for a reset.
+ *
+ * @ring_type_mask: Bitmask of affected ring types
+ * @guilty_ring: The ring which is guilty of causing a reset.
+ * @guilty_fence: The fence which didn't signal on the guilty ring.
+ *
+ * Useful when performing a GPU reset method that affects
+ * multiple rings at the same time, such as an IP block soft
+ * reset. For example, a GFX IP block soft reset will affect
+ * every graphics and compute queue.
+ *
+ * This function should be called before such a reset.
+ *
+ * Prepare the affected rings before the reset, make sure to
+ * minimize collateral damage, and back up the contents of
+ * the rings. Then the caller can call the actual HW-specific
+ * reset function.
+ *
+ * After the reset is complete, the caller should then call
+ * amdgpu_multi_ring_reset_helper_end() to restore the rings.
+ */
+void amdgpu_multi_ring_reset_helper_begin(const u32 ring_type_mask,
+ struct amdgpu_ring *guilty_ring,
+ struct amdgpu_fence *guilty_fence)
+{
+ struct amdgpu_device *adev = guilty_ring->adev;
+ struct amdgpu_fence *ring_guilty_fence;
+ struct amdgpu_ring *ring;
+ bool rings_busy;
+ int i;
+ u32 t;
+
+ for (i = 0; i < adev->num_rings; ++i) {
+ ring = adev->rings[i];
+
+ if (!(BIT(ring->funcs->type) & ring_type_mask))
+ continue;
+
+ /* Don't accept new submissions on the ring. */
+ if (amdgpu_ring_sched_ready(ring) && !drm_sched_is_stopped(&ring->sched))
+ drm_sched_wqueue_stop(&ring->sched);
+
+ /*
+ * Clear the preempt condition to stop the ring
+ * from starting its next submission. This ensures
+ * that only the currently executing submission
+ * can be rejected because of the reset and helps
+ * minimize collateral damage.
+ */
+ if (ring->funcs->init_cond_exec)
+ amdgpu_ring_set_preempt_cond_exec(ring, false);
+ }
+
+ /* Flush HDP cache so the GPU can see the updated COND_EXEC values */
+ amdgpu_device_flush_hdp(adev, NULL);
+
+ /*
+ * Give some time for non-guilty rings to finish their
+ * current submission, to try to minimize collateral damage.
+ *
+ * Note that this is just a best effort: there is no way to
+ * know for certain which ring is actually responsible,
+ * because different rings may share resources, e.g. a compute
+ * ring may hog shader engines, causing a graphics ring to hang.
+ */
+ for (t = 0; t < adev->usec_timeout; t += 10000) {
+ rings_busy = false;
+
+ /* Check if any of the non-guilty rings are busy */
+ for (i = 0; i < adev->num_rings; ++i) {
+ ring = adev->rings[i];
+
+ if (!(BIT(ring->funcs->type) & ring_type_mask))
+ continue;
+
+ if (ring == guilty_ring)
+ continue;
+
+ rings_busy |=
+ atomic_read(&ring->fence_drv.last_seq) !=
+ READ_ONCE(ring->fence_drv.sync_seq);
+ }
+
+ if (!rings_busy)
+ break;
+
+ mdelay(10);
+ }
+
+ for (i = 0; i < adev->num_rings; ++i) {
+ ring = adev->rings[i];
+
+ if (!(BIT(ring->funcs->type) & ring_type_mask))
+ continue;
+
+ /*
+ * Find guilty fences, ie. the fences that didn't signal
+ * on each ring. At this point there is no way to know
+ * which one is really responsible for the hang, and no
+ * way to save any of them, so we treat all of them as guilty.
+ */
+ ring_guilty_fence =
+ ring == guilty_ring ? guilty_fence :
+ amdgpu_ring_find_guilty_fence(ring);
+
+ /*
+ * Backup current contents of the ring.
+ * The helper takes care to only reemit unsignalled fences
+ * so we don't have to worry about that here.
+ */
+ amdgpu_ring_reset_helper_begin(ring, ring_guilty_fence);
+ }
+}
+
+/**
+ * amdgpu_multi_ring_reset_helper_end() - Restore multiple rings after a reset.
+ *
+ * @ring_type_mask: Bitmask of affected ring types
+ * @guilty_ring: The ring which is guilty of causing a reset.
+ * @ret: Return code from the reset function.
+ *
+ * After calling amdgpu_multi_ring_reset_helper_begin()
+ * and executing the actual reset method, call this
+ * function to restore normal operation.
+ *
+ * In case the reset failed, this function should still
+ * be called to restore preemption state, but it won't attempt to
+ * fully restore the ring contents.
+ */
+int amdgpu_multi_ring_reset_helper_end(const u32 ring_type_mask,
+ struct amdgpu_ring *guilty_ring, int ret)
+{
+ struct amdgpu_device *adev = guilty_ring->adev;
+ struct amdgpu_ring *ring;
+ int i, r;
+
+ /* Set preempt condition, rings are now allowed to execute submissions */
+ for (i = 0; i < adev->num_rings; ++i) {
+ ring = adev->rings[i];
+
+ if (!(BIT(ring->funcs->type) & ring_type_mask))
+ continue;
+
+ if (ring->funcs->init_cond_exec)
+ amdgpu_ring_set_preempt_cond_exec(ring, true);
+ }
+
+ /* Flush HDP cache so the GPU can see the updated COND_EXEC values */
+ amdgpu_device_flush_hdp(adev, NULL);
+
+ /* If the reset was unsuccessful, return without restoring anything else. */
+ if (ret)
+ return ret;
+
+ /* Restore contents of all rings */
+ for (i = 0; i < adev->num_rings; ++i) {
+ ring = adev->rings[i];
+
+ if (!(BIT(ring->funcs->type) & ring_type_mask))
+ continue;
+
+ /* Restore contents of the ring */
+ r = amdgpu_ring_reset_helper_end(ring, ring->guilty_fence);
+ if (r) {
+ dev_err(adev->dev,
+ "Failed to recover ring %s after soft reset\n",
+ ring->name);
+ return r;
+ }
+ }
+
+ /* Accept submissions on all rings again */
+ for (i = 0; i < adev->num_rings; ++i) {
+ ring = adev->rings[i];
+
+ if (!(BIT(ring->funcs->type) & ring_type_mask))
+ continue;
+
+ if (!amdgpu_ring_sched_ready(ring))
+ continue;
+
+ drm_sched_wqueue_start(&ring->sched);
+ }
+
+ return 0;
+}
+
bool amdgpu_ring_is_reset_type_supported(struct amdgpu_ring *ring,
u32 reset_type)
{
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 8f28b3bd7010..9d3934b4f106 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -314,6 +314,7 @@ struct amdgpu_ring {
uint32_t *ring_backup;
unsigned int ring_backup_entries_to_copy;
bool reemit;
+ struct amdgpu_fence *guilty_fence;
unsigned rptr_offs;
u64 rptr_gpu_addr;
u32 *rptr_cpu_addr;
@@ -588,10 +589,17 @@ int amdgpu_ib_ring_tests(struct amdgpu_device *adev);
bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring);
void amdgpu_ring_backup_unprocessed_commands(struct amdgpu_ring *ring,
struct amdgpu_fence *guilty_fence);
+struct amdgpu_fence *
+amdgpu_ring_find_guilty_fence(struct amdgpu_ring *ring);
void amdgpu_ring_reset_helper_begin(struct amdgpu_ring *ring,
struct amdgpu_fence *guilty_fence);
int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
struct amdgpu_fence *guilty_fence);
+void amdgpu_multi_ring_reset_helper_begin(const u32 ring_type_mask,
+ struct amdgpu_ring *guilty_ring,
+ struct amdgpu_fence *guilty_fence);
+int amdgpu_multi_ring_reset_helper_end(const u32 ring_type_mask,
+ struct amdgpu_ring *guilty_ring, int ret);
bool amdgpu_ring_is_reset_type_supported(struct amdgpu_ring *ring,
u32 reset_type);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
index 572a60e1b3cb..002fae3c380e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
@@ -583,3 +583,42 @@ int amdgpu_gfx_rlc_init_microcode(struct amdgpu_device *adev,
amdgpu_gfx_rlc_init_microcode_v2_5(adev);
return 0;
}
+
+static const struct amdgpu_rlc_reg_funcs amdgpu_sriov_rlc_reg_funcs = {
+ .rreg32 = amdgpu_sriov_rreg,
+ .wreg32 = amdgpu_sriov_wreg,
+};
+
+static u32
+amdgpu_rlc_rreg(struct amdgpu_device *adev, u32 reg, u32 acc_flags, u32 hwip,
+ u32 xcc_id)
+{
+ return amdgpu_device_rreg(adev, reg, 0);
+}
+
+static void
+amdgpu_rlc_wreg(struct amdgpu_device *adev, u32 reg, u32 value, u32 acc_flags,
+ u32 hwip, u32 xcc_id)
+{
+ amdgpu_device_wreg(adev, reg, value, 0);
+}
+
+static const struct amdgpu_rlc_reg_funcs amdgpu_rlc_reg_funcs = {
+ .rreg32 = amdgpu_rlc_rreg,
+ .wreg32 = amdgpu_rlc_wreg,
+};
+
+void amdgpu_early_init_rlc_reg_funcs(struct amdgpu_device *adev)
+{
+ adev->gfx.rlc.reg_funcs = &amdgpu_rlc_reg_funcs;
+}
+
+void amdgpu_init_rlc_reg_funcs(struct amdgpu_device *adev)
+{
+ if (amdgpu_sriov_vf(adev) &&
+ adev->gfx.rlc.funcs &&
+ adev->gfx.rlc.rlcg_reg_access_supported)
+ adev->gfx.rlc.reg_funcs = &amdgpu_sriov_rlc_reg_funcs;
+ else
+ adev->gfx.rlc.reg_funcs = &amdgpu_rlc_reg_funcs;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h
index e535534237a1..959d60c90dcd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h
@@ -262,6 +262,11 @@ struct amdgpu_rlc_funcs {
bool (*is_rlcg_access_range)(struct amdgpu_device *adev, uint32_t reg);
};
+struct amdgpu_rlc_reg_funcs {
+ u32 (*rreg32)(struct amdgpu_device *adev, u32 reg, u32 acc_flags, u32 hwip, u32 xcc_id);
+ void (*wreg32)(struct amdgpu_device *adev, u32 reg, u32 val, u32 acc_flags, u32 hwip, u32 xcc_id);
+};
+
struct amdgpu_rlcg_reg_access_ctrl {
uint32_t scratch_reg0;
uint32_t scratch_reg1;
@@ -303,6 +308,7 @@ struct amdgpu_rlc {
/* safe mode for updating CG/PG state */
bool in_safe_mode[AMDGPU_MAX_RLC_INSTANCES];
const struct amdgpu_rlc_funcs *funcs;
+ const struct amdgpu_rlc_reg_funcs *reg_funcs;
/* for firmware data */
u32 save_and_restore_offset;
@@ -374,4 +380,8 @@ void amdgpu_gfx_rlc_fini(struct amdgpu_device *adev);
int amdgpu_gfx_rlc_init_microcode(struct amdgpu_device *adev,
uint16_t version_major,
uint16_t version_minor);
+
+void amdgpu_early_init_rlc_reg_funcs(struct amdgpu_device *adev);
+void amdgpu_init_rlc_reg_funcs(struct amdgpu_device *adev);
+
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.h
new file mode 100644
index 000000000000..8c85c80fc119
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2026 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef AMDGPU_SA_H_
+#define AMDGPU_SA_H_
+
+#include <drm/drm_suballoc.h>
+
+struct amdgpu_device;
+struct amdgpu_bo;
+
+struct amdgpu_sa_manager {
+ struct drm_suballoc_manager base;
+ struct amdgpu_bo *bo;
+ uint64_t gpu_addr;
+ void *cpu_ptr;
+};
+
+static inline struct amdgpu_sa_manager *
+to_amdgpu_sa_manager(struct drm_suballoc_manager *manager)
+{
+ return container_of(manager, struct amdgpu_sa_manager, base);
+}
+
+static inline uint64_t amdgpu_sa_bo_gpu_addr(struct drm_suballoc *sa_bo)
+{
+ return to_amdgpu_sa_manager(sa_bo->manager)->gpu_addr +
+ drm_suballoc_soffset(sa_bo);
+}
+
+static inline void *amdgpu_sa_bo_cpu_addr(struct drm_suballoc *sa_bo)
+{
+ return to_amdgpu_sa_manager(sa_bo->manager)->cpu_ptr +
+ drm_suballoc_soffset(sa_bo);
+}
+
+int amdgpu_sa_bo_manager_init(struct amdgpu_device *adev,
+ struct amdgpu_sa_manager *sa_manager,
+ unsigned size, u32 align, u32 domain);
+void amdgpu_sa_bo_manager_fini(struct amdgpu_device *adev,
+ struct amdgpu_sa_manager *sa_manager);
+int amdgpu_sa_bo_manager_start(struct amdgpu_device *adev,
+ struct amdgpu_sa_manager *sa_manager);
+int amdgpu_sa_bo_new(struct amdgpu_sa_manager *sa_manager,
+ struct drm_suballoc **sa_bo,
+ unsigned int size);
+void amdgpu_sa_bo_free(struct drm_suballoc **sa_bo,
+ struct dma_fence *fence);
+#if defined(CONFIG_DEBUG_FS)
+void amdgpu_sa_bo_dump_debug_info(struct amdgpu_sa_manager *sa_manager,
+ struct seq_file *m);
+u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m);
+#endif
+void amdgpu_debugfs_sa_init(struct amdgpu_device *adev);
+
+#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
index 0eecfaa3a94c..8effb1158430 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
@@ -39,7 +39,7 @@ static int amdgpu_sched_process_priority_override(struct amdgpu_device *adev,
struct amdgpu_fpriv *fpriv;
struct amdgpu_ctx_mgr *mgr;
struct amdgpu_ctx *ctx;
- uint32_t id;
+ unsigned long id;
int r;
if (fd_empty(f))
@@ -50,10 +50,10 @@ static int amdgpu_sched_process_priority_override(struct amdgpu_device *adev,
return r;
mgr = &fpriv->ctx_mgr;
- mutex_lock(&mgr->lock);
- idr_for_each_entry(&mgr->ctx_handles, ctx, id)
+ xa_lock(&mgr->ctx_handles);
+ xa_for_each(&mgr->ctx_handles, id, ctx)
amdgpu_ctx_priority_override(ctx, priority);
- mutex_unlock(&mgr->lock);
+ xa_unlock(&mgr->ctx_handles);
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index fcd81242059e..fbac732f3e01 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -553,10 +553,11 @@ static int amdgpu_sdma_soft_reset(struct amdgpu_device *adev, u32 instance_id)
int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id,
bool caller_handles_kernel_queues)
{
- int ret = 0;
struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id];
struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
struct amdgpu_ring *page_ring = &sdma_instance->page;
+ struct amdgpu_fence *gfx_fence, *page_fence;
+ int ret = 0;
if (amdgpu_sriov_vf(adev))
return -EOPNOTSUPP;
@@ -569,9 +570,14 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id,
* the reset is in progress.
*/
drm_sched_wqueue_stop(&gfx_ring->sched);
+ gfx_fence = amdgpu_ring_find_guilty_fence(gfx_ring);
+ amdgpu_ring_reset_helper_begin(gfx_ring, gfx_fence);
- if (adev->sdma.has_page_queue)
+ if (adev->sdma.has_page_queue) {
drm_sched_wqueue_stop(&page_ring->sched);
+ page_fence = amdgpu_ring_find_guilty_fence(page_ring);
+ amdgpu_ring_reset_helper_begin(page_ring, page_fence);
+ }
}
if (sdma_instance->funcs->stop_kernel_queue) {
@@ -600,14 +606,19 @@ exit:
* to be submitted to the queues after the reset is complete.
*/
if (!ret) {
- amdgpu_fence_driver_force_completion(gfx_ring, NULL);
+ ret = amdgpu_ring_reset_helper_end(gfx_ring, gfx_fence);
+ if (ret)
+ goto unlock;
drm_sched_wqueue_start(&gfx_ring->sched);
if (adev->sdma.has_page_queue) {
- amdgpu_fence_driver_force_completion(page_ring, NULL);
+ ret = amdgpu_ring_reset_helper_end(page_ring, page_fence);
+ if (ret)
+ goto unlock;
drm_sched_wqueue_start(&page_ring->sched);
}
}
}
+unlock:
mutex_unlock(&sdma_instance->engine_reset_mutex);
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 2bf365609775..4f4e56022c97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -85,34 +85,6 @@ struct amdgpu_sdma_instance {
const struct amdgpu_sdma_funcs *funcs;
};
-enum amdgpu_sdma_ras_memory_id {
- AMDGPU_SDMA_MBANK_DATA_BUF0 = 1,
- AMDGPU_SDMA_MBANK_DATA_BUF1 = 2,
- AMDGPU_SDMA_MBANK_DATA_BUF2 = 3,
- AMDGPU_SDMA_MBANK_DATA_BUF3 = 4,
- AMDGPU_SDMA_MBANK_DATA_BUF4 = 5,
- AMDGPU_SDMA_MBANK_DATA_BUF5 = 6,
- AMDGPU_SDMA_MBANK_DATA_BUF6 = 7,
- AMDGPU_SDMA_MBANK_DATA_BUF7 = 8,
- AMDGPU_SDMA_MBANK_DATA_BUF8 = 9,
- AMDGPU_SDMA_MBANK_DATA_BUF9 = 10,
- AMDGPU_SDMA_MBANK_DATA_BUF10 = 11,
- AMDGPU_SDMA_MBANK_DATA_BUF11 = 12,
- AMDGPU_SDMA_MBANK_DATA_BUF12 = 13,
- AMDGPU_SDMA_MBANK_DATA_BUF13 = 14,
- AMDGPU_SDMA_MBANK_DATA_BUF14 = 15,
- AMDGPU_SDMA_MBANK_DATA_BUF15 = 16,
- AMDGPU_SDMA_UCODE_BUF = 17,
- AMDGPU_SDMA_RB_CMD_BUF = 18,
- AMDGPU_SDMA_IB_CMD_BUF = 19,
- AMDGPU_SDMA_UTCL1_RD_FIFO = 20,
- AMDGPU_SDMA_UTCL1_RDBST_FIFO = 21,
- AMDGPU_SDMA_UTCL1_WR_FIFO = 22,
- AMDGPU_SDMA_DATA_LUT_FIFO = 23,
- AMDGPU_SDMA_SPLIT_DAT_BUF = 24,
- AMDGPU_SDMA_MEMORY_BLOCK_LAST,
-};
-
struct amdgpu_sdma_ras {
struct amdgpu_ras_block_object ras_block;
};
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index 85724ec6aaf8..5324030a13f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -28,6 +28,8 @@
#include <linux/types.h>
#include <linux/tracepoint.h>
+#include "amdgpu_userq_fence.h"
+
#undef TRACE_SYSTEM
#define TRACE_SYSTEM amdgpu
#define TRACE_INCLUDE_FILE amdgpu_trace
@@ -582,6 +584,154 @@ TRACE_EVENT(amdgpu_reset_reg_dumps,
__entry->value)
);
+DECLARE_EVENT_CLASS(amdgpu_userq_queue,
+ TP_PROTO(struct amdgpu_usermode_queue *queue),
+ TP_ARGS(queue),
+ TP_STRUCT__entry(
+ __field(void *, queue)
+ __field(u64, doorbell_index)
+ __field(int, queue_type)
+ __field(int, state)
+ __field(u32, xcp_id)
+ ),
+ TP_fast_assign(
+ __entry->queue = queue;
+ __entry->doorbell_index = queue->doorbell_index;
+ __entry->queue_type = queue->queue_type;
+ __entry->state = queue->state;
+ __entry->xcp_id = queue->xcp_id;
+ ),
+ TP_printk("queue=%p, doorbell=%llu, type=%d, state=%d, xcp_id=%u",
+ __entry->queue, __entry->doorbell_index,
+ __entry->queue_type, __entry->state, __entry->xcp_id)
+);
+DEFINE_EVENT(amdgpu_userq_queue, amdgpu_userq_create_start,
+ TP_PROTO(struct amdgpu_usermode_queue *queue),
+ TP_ARGS(queue));
+DEFINE_EVENT(amdgpu_userq_queue, amdgpu_userq_destroy_start,
+ TP_PROTO(struct amdgpu_usermode_queue *queue),
+ TP_ARGS(queue));
+DECLARE_EVENT_CLASS(amdgpu_userq_queue_result,
+ TP_PROTO(struct amdgpu_usermode_queue *queue, int result),
+ TP_ARGS(queue, result),
+ TP_STRUCT__entry(
+ __field(void *, queue)
+ __field(u64, doorbell_index)
+ __field(int, queue_type)
+ __field(int, state)
+ __field(u32, xcp_id)
+ __field(int, result)
+ ),
+ TP_fast_assign(
+ __entry->queue = queue;
+ __entry->doorbell_index = queue->doorbell_index;
+ __entry->queue_type = queue->queue_type;
+ __entry->state = queue->state;
+ __entry->xcp_id = queue->xcp_id;
+ __entry->result = result;
+ ),
+ TP_printk("queue=%p, doorbell=%llu, type=%d, state=%d, xcp_id=%u, result=%d",
+ __entry->queue, __entry->doorbell_index,
+ __entry->queue_type, __entry->state,
+ __entry->xcp_id, __entry->result)
+);
+DEFINE_EVENT(amdgpu_userq_queue_result, amdgpu_userq_create_end,
+ TP_PROTO(struct amdgpu_usermode_queue *queue, int result),
+ TP_ARGS(queue, result));
+DEFINE_EVENT(amdgpu_userq_queue_result, amdgpu_userq_destroy_end,
+ TP_PROTO(struct amdgpu_usermode_queue *queue, int result),
+ TP_ARGS(queue, result));
+
+TRACE_EVENT(amdgpu_userq_emit_fence,
+ TP_PROTO(struct device *device, struct amdgpu_usermode_queue *queue, struct amdgpu_userq_fence *fence),
+ TP_ARGS(device, queue, fence),
+ TP_STRUCT__entry(
+ __field(u64, fence_context)
+ __field(u64, fence_seqno)
+ __string(dev, dev_name(device))
+ __field(u64, doorbell_index)
+ __field(u64, client_id)
+ __field(u32, queue_type)
+ ),
+ TP_fast_assign(
+ __entry->fence_context = fence->base.context;
+ __entry->fence_seqno = fence->base.seqno;
+ __assign_str(dev);
+ __entry->doorbell_index = queue->doorbell_index;
+ __entry->client_id = queue->userq_mgr->file->client_id;
+ __entry->queue_type = queue->queue_type;
+ ),
+ TP_printk("dev=%s, client_id=%llu, type=%u, doorbell=%llu, fence=%llu:%llu",
+ __get_str(dev), __entry->client_id, __entry->queue_type, __entry->doorbell_index,
+ __entry->fence_context,
+ __entry->fence_seqno)
+);
+
+TRACE_EVENT(amdgpu_userq_wait_deps,
+ TP_PROTO(struct device *device, struct amdgpu_usermode_queue *queue, struct amdgpu_userq_fence *dep),
+ TP_ARGS(device, queue, dep),
+ TP_STRUCT__entry(
+ __field(u64, context)
+ __field(u64, dep_context)
+ __field(u64, dep_seqno)
+ __string(dev, dev_name(device))
+ __field(u64, doorbell_index)
+ __field(u64, client_id)
+ __field(u32, queue_type)
+ ),
+ TP_fast_assign(
+ __assign_str(dev);
+ __entry->doorbell_index = queue->doorbell_index;
+ __entry->queue_type = queue->queue_type;
+ __entry->client_id = queue->userq_mgr->file->client_id;
+ __entry->context = queue->fence_drv->context;
+ __entry->dep_context = dep->base.context;
+ __entry->dep_seqno = dep->base.seqno;
+ ),
+ TP_printk("dev=%s, client_id=%llu, type=%u, doorbell=%llu, context=%llu depends on fence=%llu:%llu",
+ __get_str(dev), __entry->client_id, __entry->queue_type, __entry->doorbell_index, __entry->context,
+ __entry->dep_context,
+ __entry->dep_seqno)
+);
+
+TRACE_EVENT(amdgpu_userq_state_start,
+ TP_PROTO(struct amdgpu_usermode_queue *queue),
+ TP_ARGS(queue),
+ TP_STRUCT__entry(
+ __field(u64, doorbell_index)
+ __field(u64, client_id)
+ __field(u32, queue_type)
+ __field(u32, from)
+ ),
+ TP_fast_assign(
+ __entry->doorbell_index = queue->doorbell_index;
+ __entry->queue_type = queue->queue_type;
+ __entry->client_id = queue->userq_mgr->file->client_id;
+ __entry->from = queue->state;
+ ),
+ TP_printk("client_id=%llu, type=%u, doorbell=%llu, from=%d",
+ __entry->client_id, __entry->queue_type, __entry->doorbell_index, __entry->from)
+);
+
+TRACE_EVENT(amdgpu_userq_state_changed,
+ TP_PROTO(struct amdgpu_usermode_queue *queue, enum amdgpu_userq_state new_state),
+ TP_ARGS(queue, new_state),
+ TP_STRUCT__entry(
+ __field(u64, doorbell_index)
+ __field(u64, client_id)
+ __field(u32, queue_type)
+ __field(u32, to)
+ ),
+ TP_fast_assign(
+ __entry->doorbell_index = queue->doorbell_index;
+ __entry->queue_type = queue->queue_type;
+ __entry->client_id = queue->userq_mgr->file->client_id;
+ __entry->to = new_state;
+ ),
+ TP_printk("client_id=%llu, type=%u, doorbell=%llu, to=%d",
+ __entry->client_id, __entry->queue_type, __entry->doorbell_index, __entry->to)
+);
+
#undef AMDGPU_JOB_GET_TIMELINE_NAME
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 025625e7e800..b10b0878df37 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -2194,7 +2194,7 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
return r;
}
- /* Create a boorbell page for kernel usages */
+ /* Create a doorbell page for kernel usages */
r = amdgpu_doorbell_create_kernel_doorbells(adev);
if (r) {
dev_err(adev->dev, "Failed to initialize kernel doorbells.\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index b5d938b31383..ff9e2e346609 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -140,6 +140,7 @@ int amdgpu_gtt_mgr_init(struct amdgpu_device *adev, uint64_t gtt_size);
void amdgpu_gtt_mgr_fini(struct amdgpu_device *adev);
int amdgpu_preempt_mgr_init(struct amdgpu_device *adev);
void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev);
+void amdgpu_preempt_mgr_sysfs_fini(struct amdgpu_device *adev);
int amdgpu_vram_mgr_init(struct amdgpu_device *adev);
void amdgpu_vram_mgr_fini(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index b8ed931f8a40..2a5f5e6188bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -97,7 +97,6 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- struct amdgpu_ras_eeprom_control *control = &con->eeprom_control;
unsigned int error_query_mode;
int ret = 0;
unsigned long err_count;
@@ -118,77 +117,66 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
mutex_lock(&con->page_retirement_lock);
- if (!amdgpu_ras_smu_eeprom_supported(adev)) {
- ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
- if (ret == -EOPNOTSUPP &&
- error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
- if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
- adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
- adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev,
- ras_error_status);
-
- if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
- adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
- adev->umc.max_ras_err_cnt_per_query) {
- kfree(err_data->err_addr);
- err_data->err_addr =
- kzalloc_objs(struct eeprom_table_record,
- adev->umc.max_ras_err_cnt_per_query);
-
- /* still call query_ras_error_address to clear error status
- * even NOMEM error is encountered
- */
- if (!err_data->err_addr)
- dev_warn(adev->dev,
- "Failed to alloc memory for umc error address record!\n");
- else
- err_data->err_addr_len =
- adev->umc.max_ras_err_cnt_per_query;
-
- /* umc query_ras_error_address is also responsible for clearing
- * error status
- */
- adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev,
- ras_error_status);
- }
- } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
- (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
- if (adev->umc.ras &&
- adev->umc.ras->ecc_info_query_ras_error_count)
- adev->umc.ras->ecc_info_query_ras_error_count(adev,
- ras_error_status);
-
- if (adev->umc.ras &&
- adev->umc.ras->ecc_info_query_ras_error_address &&
- adev->umc.max_ras_err_cnt_per_query) {
- kfree(err_data->err_addr);
- err_data->err_addr =
- kzalloc_objs(struct eeprom_table_record,
- adev->umc.max_ras_err_cnt_per_query);
-
- /* still call query_ras_error_address to clear error status
- * even NOMEM error is encountered
- */
- if (!err_data->err_addr)
- dev_warn(adev->dev,
- "Failed to alloc memory for umc error address record!\n");
- else
- err_data->err_addr_len =
- adev->umc.max_ras_err_cnt_per_query;
-
- /* umc query_ras_error_address is also responsible for clearing
- * error status
- */
- adev->umc.ras->ecc_info_query_ras_error_address(adev,
- ras_error_status);
- }
+ ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
+ if (ret == -EOPNOTSUPP &&
+ error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
+ if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
+ adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
+ adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev,
+ ras_error_status);
+
+ if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
+ adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
+ adev->umc.max_ras_err_cnt_per_query) {
+ err_data->err_addr =
+ kzalloc_objs(struct eeprom_table_record,
+ adev->umc.max_ras_err_cnt_per_query);
+
+ /* still call query_ras_error_address to clear error status
+ * even NOMEM error is encountered
+ */
+ if (!err_data->err_addr)
+ dev_warn(adev->dev,
+ "Failed to alloc memory for umc error address record!\n");
+ else
+ err_data->err_addr_len =
+ adev->umc.max_ras_err_cnt_per_query;
+
+ /* umc query_ras_error_address is also responsible for clearing
+ * error status
+ */
+ adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev,
+ ras_error_status);
}
- } else {
- if (!amdgpu_ras_eeprom_update_record_num(control)) {
- err_data->err_addr_cnt = err_data->de_count =
- control->ras_num_recs - control->ras_num_recs_old;
- amdgpu_ras_eeprom_read_idx(control, err_data->err_addr,
- control->ras_num_recs_old, err_data->de_count);
+ } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
+ (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
+ if (adev->umc.ras &&
+ adev->umc.ras->ecc_info_query_ras_error_count)
+ adev->umc.ras->ecc_info_query_ras_error_count(adev,
+ ras_error_status);
+
+ if (adev->umc.ras &&
+ adev->umc.ras->ecc_info_query_ras_error_address &&
+ adev->umc.max_ras_err_cnt_per_query) {
+ err_data->err_addr =
+ kcalloc(adev->umc.max_ras_err_cnt_per_query,
+ sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+ /* still call query_ras_error_address to clear error status
+ * even NOMEM error is encountered
+ */
+ if (!err_data->err_addr)
+ dev_warn(adev->dev,
+ "Failed to alloc memory for umc error address record!\n");
+ else
+ err_data->err_addr_len =
+ adev->umc.max_ras_err_cnt_per_query;
+
+ /* umc query_ras_error_address is also responsible for clearing
+ * error status
+ */
+ adev->umc.ras->ecc_info_query_ras_error_address(adev,
+ ras_error_status);
}
}
@@ -276,7 +264,7 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
}
amdgpu_ras_error_data_fini(&err_data);
- } else if (amdgpu_uniras_enabled(adev)) {
+ } else {
struct ras_ih_info ih_info = {0};
ih_info.block = block;
@@ -285,17 +273,6 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
ih_info.pasid_fn = pasid_fn;
ih_info.data = data;
amdgpu_ras_mgr_handle_consumer_interrupt(adev, &ih_info);
- } else {
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- int ret;
-
- ret = amdgpu_ras_put_poison_req(adev,
- block, pasid, pasid_fn, data, reset);
- if (!ret) {
- atomic_inc(&con->page_retirement_req_cnt);
- atomic_inc(&con->poison_consumption_count);
- wake_up(&con->page_retirement_wq);
- }
}
} else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
@@ -512,129 +489,3 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
return 0;
}
-
-int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
- uint64_t status, uint64_t ipid, uint64_t addr)
-{
- if (adev->umc.ras->update_ecc_status)
- return adev->umc.ras->update_ecc_status(adev,
- status, ipid, addr);
- return 0;
-}
-
-int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
- struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err)
-{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- struct ras_ecc_log_info *ecc_log;
- int ret;
-
- ecc_log = &con->umc_ecc_log;
-
- mutex_lock(&ecc_log->lock);
- ret = radix_tree_insert(ecc_tree, ecc_err->pa_pfn, ecc_err);
- if (!ret)
- radix_tree_tag_set(ecc_tree,
- ecc_err->pa_pfn, UMC_ECC_NEW_DETECTED_TAG);
- mutex_unlock(&ecc_log->lock);
-
- return ret;
-}
-
-int amdgpu_umc_pages_in_a_row(struct amdgpu_device *adev,
- struct ras_err_data *err_data, uint64_t pa_addr)
-{
- struct ta_ras_query_address_output addr_out;
-
- /* reinit err_data */
- err_data->err_addr_cnt = 0;
- err_data->err_addr_len = adev->umc.retire_unit;
-
- addr_out.pa.pa = pa_addr;
- if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
- return adev->umc.ras->convert_ras_err_addr(adev, err_data, NULL,
- &addr_out, false);
- else
- return -EINVAL;
-}
-
-int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev,
- uint64_t pa_addr, uint64_t *pfns, int len)
-{
- int i, ret;
- struct ras_err_data err_data;
-
- err_data.err_addr = kzalloc_objs(struct eeprom_table_record,
- adev->umc.retire_unit);
- if (!err_data.err_addr) {
- dev_warn(adev->dev, "Failed to alloc memory in bad page lookup!\n");
- return 0;
- }
-
- ret = amdgpu_umc_pages_in_a_row(adev, &err_data, pa_addr);
- if (ret)
- goto out;
-
- for (i = 0; i < adev->umc.retire_unit; i++) {
- if (i >= len)
- goto out;
-
- pfns[i] = err_data.err_addr[i].retired_page;
- }
- ret = i;
- adev->umc.err_addr_cnt = err_data.err_addr_cnt;
-
-out:
- kfree(err_data.err_addr);
- return ret;
-}
-
-int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
- uint64_t err_addr, uint32_t ch, uint32_t umc,
- uint32_t node, uint32_t socket,
- struct ta_ras_query_address_output *addr_out, bool dump_addr)
-{
- struct ta_ras_query_address_input addr_in;
- int ret;
-
- memset(&addr_in, 0, sizeof(addr_in));
- addr_in.ma.err_addr = err_addr;
- addr_in.ma.ch_inst = ch;
- addr_in.ma.umc_inst = umc;
- addr_in.ma.node_inst = node;
- addr_in.ma.socket_id = socket;
-
- if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
- ret = adev->umc.ras->convert_ras_err_addr(adev, NULL, &addr_in,
- addr_out, dump_addr);
- if (ret)
- return ret;
- } else {
- return 0;
- }
-
- return 0;
-}
-
-int amdgpu_umc_pa2mca(struct amdgpu_device *adev,
- uint64_t pa, uint64_t *mca, enum amdgpu_memory_partition nps)
-{
- struct ta_ras_query_address_input addr_in;
- struct ta_ras_query_address_output addr_out;
- int ret;
-
- /* nps: the pa belongs to */
- addr_in.pa.pa = pa | ((uint64_t)nps << 58);
- addr_in.addr_type = TA_RAS_PA_TO_MCA;
- ret = psp_ras_query_address(&adev->psp, &addr_in, &addr_out);
- if (ret) {
- dev_warn(adev->dev, "Failed to query RAS MCA address for 0x%llx",
- pa);
-
- return ret;
- }
-
- *mca = addr_out.ma.err_addr;
-
- return 0;
-}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 8494a55ebf76..cf06d5f856f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -103,18 +103,7 @@ struct amdgpu_umc_ras {
void *ras_error_status);
bool (*check_ecc_err_status)(struct amdgpu_device *adev,
enum amdgpu_mca_error_type type, void *ras_error_status);
- int (*update_ecc_status)(struct amdgpu_device *adev,
- uint64_t status, uint64_t ipid, uint64_t addr);
- int (*convert_ras_err_addr)(struct amdgpu_device *adev,
- struct ras_err_data *err_data,
- struct ta_ras_query_address_input *addr_in,
- struct ta_ras_query_address_output *addr_out,
- bool dump_addr);
- uint32_t (*get_die_id_from_pa)(struct amdgpu_device *adev,
- uint64_t mca_addr, uint64_t retired_page);
void (*get_retire_flip_bits)(struct amdgpu_device *adev);
- void (*mca_ipid_parse)(struct amdgpu_device *adev, uint64_t ipid,
- uint32_t *did, uint32_t *ch, uint32_t *umc_inst, uint32_t *sid);
};
struct amdgpu_umc_funcs {
@@ -179,21 +168,6 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
umc_func func, void *data);
-int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
- uint64_t status, uint64_t ipid, uint64_t addr);
-int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
- struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
-
void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
void *ras_error_status);
-int amdgpu_umc_pages_in_a_row(struct amdgpu_device *adev,
- struct ras_err_data *err_data, uint64_t pa_addr);
-int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev,
- uint64_t pa_addr, uint64_t *pfns, int len);
-int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
- uint64_t err_addr, uint32_t ch, uint32_t umc,
- uint32_t node, uint32_t socket,
- struct ta_ras_query_address_output *addr_out, bool dump_addr);
-int amdgpu_umc_pa2mca(struct amdgpu_device *adev,
- uint64_t pa, uint64_t *mca, enum amdgpu_memory_partition nps);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index ef3f0213cc46..82c8809d1d9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -33,6 +33,7 @@
#include "amdgpu_userq.h"
#include "amdgpu_hmm.h"
#include "amdgpu_userq_fence.h"
+#include "amdgpu_trace.h"
u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
{
@@ -88,14 +89,7 @@ static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
container_of(work, struct amdgpu_userq_mgr,
reset_work);
struct amdgpu_device *adev = uq_mgr->adev;
- const int queue_types[] = {
- AMDGPU_RING_TYPE_COMPUTE,
- AMDGPU_RING_TYPE_GFX,
- AMDGPU_RING_TYPE_SDMA
- };
- const int num_queue_types = ARRAY_SIZE(queue_types);
- bool gpu_reset = false;
- int i, r;
+ struct amdgpu_reset_context reset_context;
if (unlikely(adev->debug_disable_gpu_ring_reset)) {
dev_err(adev->dev, "userq reset disabled by debug mask\n");
@@ -109,42 +103,15 @@ static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
if (!amdgpu_gpu_recovery)
return;
- /*
- * Iterate through all queue types to detect and reset problematic queues
- * Process each queue type in the defined order
- */
- for (i = 0; i < num_queue_types; i++) {
- int ring_type = queue_types[i];
- const struct amdgpu_userq_funcs *funcs =
- adev->userq_funcs[ring_type];
-
- if (!amdgpu_userq_is_reset_type_supported(adev, ring_type,
- AMDGPU_RESET_TYPE_PER_QUEUE))
- continue;
+ memset(&reset_context, 0, sizeof(reset_context));
- if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
- funcs && funcs->detect_and_reset) {
- r = funcs->detect_and_reset(adev, ring_type);
- if (r) {
- gpu_reset = true;
- break;
- }
- }
- }
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ reset_context.src = AMDGPU_RESET_SRC_USERQ;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
- if (gpu_reset) {
- struct amdgpu_reset_context reset_context;
-
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
- reset_context.src = AMDGPU_RESET_SRC_USERQ;
- set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
- /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
-
- amdgpu_device_gpu_recover(adev, NULL, &reset_context);
- }
+ amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}
static void amdgpu_userq_hang_detect_work(struct work_struct *work)
@@ -152,12 +119,45 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work)
struct amdgpu_usermode_queue *queue =
container_of(work, struct amdgpu_usermode_queue,
hang_detect_work.work);
+ struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
+ struct amdgpu_device *adev = uq_mgr->adev;
+ const struct amdgpu_userq_funcs *userq_funcs =
+ adev->userq_funcs[queue->queue_type];
+ bool gpu_reset = false;
+
+ if (unlikely(adev->debug_disable_gpu_ring_reset)) {
+ dev_err(adev->dev, "userq reset disabled by debug mask\n");
+ return;
+ }
+
+ /*
+ * If GPU recovery feature is disabled system-wide,
+ * skip all reset detection logic
+ */
+ if (!amdgpu_gpu_recovery)
+ return;
+
+ if (amdgpu_userq_is_reset_type_supported(adev, queue->queue_type,
+ AMDGPU_RESET_TYPE_PER_QUEUE)) {
+ int r;
+
+ if (queue->queue_type == AMDGPU_HW_IP_COMPUTE)
+ r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL,
+ queue, NULL, NULL);
+ else
+ r = userq_funcs->reset(queue);
+ if (r)
+ gpu_reset = true;
+ } else {
+ gpu_reset = true;
+ }
/*
* Don't schedule the work here! Scheduling or queue work from one reset
* handler to another is illegal if you don't take extra precautions!
*/
- amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work);
+ if (gpu_reset)
+ amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work);
}
/*
@@ -293,11 +293,15 @@ static int amdgpu_userq_preempt_helper(struct amdgpu_usermode_queue *queue)
int r;
if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+ trace_amdgpu_userq_state_start(queue);
+
r = userq_funcs->preempt(queue);
if (r) {
+ trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_HUNG);
queue->state = AMDGPU_USERQ_STATE_HUNG;
return r;
} else {
+ trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_PREEMPTED);
queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
}
}
@@ -313,10 +317,14 @@ static int amdgpu_userq_restore_helper(struct amdgpu_usermode_queue *queue)
int r = 0;
if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) {
+ trace_amdgpu_userq_state_start(queue);
+
r = userq_funcs->restore(queue);
if (r) {
+ trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_HUNG);
queue->state = AMDGPU_USERQ_STATE_HUNG;
} else {
+ trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_MAPPED);
queue->state = AMDGPU_USERQ_STATE_MAPPED;
}
}
@@ -334,12 +342,15 @@ static int amdgpu_userq_unmap_helper(struct amdgpu_usermode_queue *queue)
if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
(queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
+ trace_amdgpu_userq_state_start(queue);
r = userq_funcs->unmap(queue);
if (r) {
+ trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_HUNG);
queue->state = AMDGPU_USERQ_STATE_HUNG;
return r;
} else {
+ trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_UNMAPPED);
queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
}
}
@@ -356,11 +367,15 @@ static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue)
int r;
if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
+ trace_amdgpu_userq_state_start(queue);
+
r = userq_funcs->map(queue);
if (r) {
+ trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_HUNG);
queue->state = AMDGPU_USERQ_STATE_HUNG;
return r;
} else {
+ trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_MAPPED);
queue->state = AMDGPU_USERQ_STATE_MAPPED;
}
}
@@ -507,6 +522,8 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type];
int r = 0;
+ trace_amdgpu_userq_destroy_start(queue);
+
cancel_delayed_work_sync(&uq_mgr->resume_work);
/* Cancel any pending hang detection work and cleanup */
@@ -532,6 +549,7 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
amdgpu_bo_unreserve(queue->db_obj.obj);
amdgpu_bo_unref(&queue->db_obj.obj);
+ trace_amdgpu_userq_destroy_end(queue, r);
kfree(queue);
pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
@@ -629,6 +647,8 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
queue->queue_type = args->in.ip_type;
queue->vm = &fpriv->vm;
queue->priority = priority;
+ queue->xcp_id = (fpriv->xcp_id != AMDGPU_XCP_NO_PARTITION) ?
+ fpriv->xcp_id : 0;
queue->userq_mgr = uq_mgr;
INIT_DELAYED_WORK(&queue->hang_detect_work,
amdgpu_userq_hang_detect_work);
@@ -671,6 +691,8 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
}
queue->doorbell_index = index;
+ queue->doorbell_offset = (u32)args->in.doorbell_offset;
+ trace_amdgpu_userq_create_start(queue);
r = uq_funcs->mqd_create(queue, &args->in);
if (r) {
drm_file_err(uq_mgr->file, "Failed to create Queue\n");
@@ -694,6 +716,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
r = amdgpu_userq_map_helper(queue);
if (r) {
drm_file_err(uq_mgr->file, "Failed to map Queue\n");
+ trace_amdgpu_userq_create_end(queue, r);
mutex_unlock(&uq_mgr->userq_mutex);
goto erase_doorbell;
}
@@ -710,11 +733,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
* This drops the last reference which should take care of
* all cleanup.
*/
+ trace_amdgpu_userq_create_end(queue, r);
amdgpu_userq_put(queue);
return r;
}
amdgpu_debugfs_userq_init(filp, queue, qid);
+ trace_amdgpu_userq_create_end(queue, 0);
args->out.queue_id = qid;
return 0;
@@ -730,6 +755,7 @@ clean_doorbell_bo:
free_fence_drv:
amdgpu_userq_fence_driver_free(queue);
free_queue:
+ trace_amdgpu_userq_create_end(queue, r);
kfree(queue);
err_pm_runtime:
pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
@@ -862,16 +888,10 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
static int
amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
{
- struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
- struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_usermode_queue *queue;
unsigned long queue_id;
int ret = 0, r;
-
- if (amdgpu_bo_reserve(vm->root.bo, false))
- return false;
-
mutex_lock(&uq_mgr->userq_mutex);
/* Resume all the queues for this process */
xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
@@ -879,6 +899,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
if (!amdgpu_userq_buffer_vas_mapped(queue)) {
drm_file_err(uq_mgr->file,
"trying restore queue without va mapping\n");
+ trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_INVALID_VA);
queue->state = AMDGPU_USERQ_STATE_INVALID_VA;
continue;
}
@@ -886,10 +907,8 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
r = amdgpu_userq_map_helper(queue);
if (r)
ret = r;
-
}
mutex_unlock(&uq_mgr->userq_mutex);
- amdgpu_bo_unreserve(vm->root.bo);
if (ret)
drm_file_err(uq_mgr->file,
@@ -923,7 +942,8 @@ amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec,
spin_unlock(&vm->individual_lock);
bo = bo_va->base.bo;
- ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2);
+ ret = drm_exec_prepare_obj(exec, &bo->tbo.base,
+ TTM_NUM_MOVE_FENCES + 1);
if (unlikely(ret))
return ret;
@@ -946,7 +966,7 @@ amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec,
/* Make sure the whole VM is ready to be used */
static int
-amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr)
+amdgpu_userq_vm_validate_and_restore_queue(struct amdgpu_userq_mgr *uq_mgr)
{
struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
bool invalidated = false, new_addition = false;
@@ -1072,8 +1092,12 @@ retry_lock:
dma_fence_wait(vm->last_update, false);
ret = amdgpu_evf_mgr_rearm(&fpriv->evf_mgr, &exec);
- if (ret)
+ if (ret) {
drm_file_err(uq_mgr->file, "Failed to replace eviction fence\n");
+ goto unlock_all;
+ }
+
+ ret = amdgpu_userq_restore_all(uq_mgr);
unlock_all:
drm_exec_fini(&exec);
@@ -1099,18 +1123,34 @@ static void amdgpu_userq_restore_worker(struct work_struct *work)
if (!dma_fence_is_signaled(ev_fence))
goto put_fence;
- ret = amdgpu_userq_vm_validate(uq_mgr);
+ ret = amdgpu_userq_vm_validate_and_restore_queue(uq_mgr);
if (ret) {
drm_file_err(uq_mgr->file, "Failed to validate BOs to restore ret=%d\n", ret);
goto put_fence;
}
- amdgpu_userq_restore_all(uq_mgr);
-
put_fence:
dma_fence_put(ev_fence);
}
+void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev,
+ u32 pasid, u32 doorbell_offset)
+{
+ struct xarray *xa = &adev->userq_doorbell_xa;
+ struct amdgpu_usermode_queue *queue;
+ unsigned long flags, idx;
+
+ xa_lock_irqsave(xa, flags);
+ xa_for_each(xa, idx, queue) {
+ if (queue->vm && queue->vm->pasid == pasid &&
+ queue->doorbell_offset == doorbell_offset) {
+ amdgpu_userq_start_hang_detect_work(queue);
+ break;
+ }
+ }
+ xa_unlock_irqrestore(xa, flags);
+}
+
static int
amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
{
@@ -1166,6 +1206,7 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
xa_init_flags(&userq_mgr->userq_xa, XA_FLAGS_ALLOC);
userq_mgr->adev = adev;
userq_mgr->file = file_priv;
+ mutex_init(&userq_mgr->proc_ctx_lock);
INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work);
@@ -1219,6 +1260,11 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
*/
cancel_work_sync(&userq_mgr->reset_work);
+ amdgpu_bo_free_kernel(&userq_mgr->proc_ctx_obj.obj,
+ &userq_mgr->proc_ctx_obj.gpu_addr,
+ &userq_mgr->proc_ctx_obj.cpu_ptr);
+
+ mutex_destroy(&userq_mgr->proc_ctx_lock);
mutex_destroy(&userq_mgr->userq_mutex);
}
@@ -1370,12 +1416,14 @@ void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
if (queue->state != AMDGPU_USERQ_STATE_MAPPED)
continue;
+ trace_amdgpu_userq_state_start(queue);
userq_funcs = adev->userq_funcs[queue->queue_type];
userq_funcs->unmap(queue);
/* just mark all queues as hung at this point.
* if unmap succeeds, we could map again
* in amdgpu_userq_post_reset() if vram is not lost
*/
+ trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_HUNG);
queue->state = AMDGPU_USERQ_STATE_HUNG;
amdgpu_userq_fence_driver_force_completion(queue);
}
@@ -1394,6 +1442,8 @@ int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
+ trace_amdgpu_userq_state_start(queue);
+
userq_funcs = adev->userq_funcs[queue->queue_type];
/* Re-map queue */
r = userq_funcs->map(queue);
@@ -1401,6 +1451,7 @@ int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
dev_err(adev->dev, "Failed to remap queue %ld\n", queue_id);
continue;
}
+ trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_MAPPED);
queue->state = AMDGPU_USERQ_STATE_MAPPED;
}
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index d1751febaefe..61e5f8a06eb2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -53,6 +53,7 @@ struct amdgpu_usermode_queue {
enum amdgpu_userq_state state;
uint64_t doorbell_handle;
uint64_t doorbell_index;
+ u32 doorbell_offset;
uint64_t flags;
struct amdgpu_mqd_prop *userq_prop;
struct amdgpu_userq_mgr *userq_mgr;
@@ -111,8 +112,7 @@ struct amdgpu_userq_funcs {
int (*map)(struct amdgpu_usermode_queue *queue);
int (*preempt)(struct amdgpu_usermode_queue *queue);
int (*restore)(struct amdgpu_usermode_queue *queue);
- int (*detect_and_reset)(struct amdgpu_device *adev,
- int queue_type);
+ int (*reset)(struct amdgpu_usermode_queue *queue);
};
/* Usermode queues for gfx */
@@ -127,6 +127,8 @@ struct amdgpu_userq_mgr {
struct amdgpu_device *adev;
struct delayed_work resume_work;
struct drm_file *file;
+ struct mutex proc_ctx_lock;
+ struct amdgpu_userq_obj proc_ctx_obj;
/**
* @reset_work:
@@ -177,6 +179,16 @@ int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue);
void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell);
+/*
+ * CP packs the per-process doorbell_id of the queue in
+ * CTXID0[9:0] on priv-fault (same encoding KFD uses via
+ * KFD_CTXID0_DOORBELL_ID_MASK)
+ */
+#define AMDGPU_CTXID0_DOORBELL_ID_MASK 0x3ff
+
+void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev,
+ u32 pasid, u32 doorbell_offset);
+
int amdgpu_userq_input_va_validate(struct amdgpu_device *adev,
struct amdgpu_usermode_queue *queue,
u64 addr, u64 expected_size, u64 *va_out);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index f74ad378e407..7e80442ec3e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -30,7 +30,7 @@
#include <drm/drm_syncobj.h>
#include "amdgpu.h"
-#include "amdgpu_userq_fence.h"
+#include "amdgpu_trace.h"
#define AMDGPU_USERQ_MAX_HANDLES (1U << 16)
@@ -528,6 +528,8 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
/* Create the new fence */
amdgpu_userq_fence_init(queue, fence, wptr);
+ trace_amdgpu_userq_emit_fence(dev->dev, queue, fence);
+
mutex_unlock(&userq_mgr->userq_mutex);
/*
@@ -701,7 +703,7 @@ amdgpu_userq_wait_add_fence(struct drm_amdgpu_userq_wait *wait_info,
}
static int
-amdgpu_userq_wait_return_fence_info(struct drm_file *filp,
+amdgpu_userq_wait_return_fence_info(struct drm_device *dev, struct drm_file *filp,
struct drm_amdgpu_userq_wait *wait_info,
u32 *syncobj_handles, u64 *timeline_points,
u32 *timeline_handles,
@@ -869,6 +871,8 @@ amdgpu_userq_wait_return_fence_info(struct drm_file *filp,
amdgpu_userq_fence_driver_get(fence_drv);
+ trace_amdgpu_userq_wait_deps(dev->dev, waitq, userq_fence);
+
/* Store drm syncobj's gpu va address and value */
fence_info[cnt].va = fence_drv->va;
fence_info[cnt].value = fences[i]->seqno;
@@ -969,7 +973,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
gobj_write,
gobj_read);
} else {
- r = amdgpu_userq_wait_return_fence_info(filp, wait_info,
+ r = amdgpu_userq_wait_return_fence_info(dev, filp, wait_info,
syncobj_handles,
timeline_points,
timeline_handles,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 616967519869..fe504f1a3fc8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -506,9 +506,8 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
struct amdgpu_device *adev = ring->adev;
struct amdgpu_vcn_inst *vcn_inst = &adev->vcn.inst[ring->me];
- atomic_inc(&vcn_inst->total_submission_cnt);
-
- cancel_delayed_work_sync(&vcn_inst->idle_work);
+ if (!atomic_fetch_inc(&vcn_inst->total_submission_cnt))
+ cancel_delayed_work_sync(&vcn_inst->idle_work);
mutex_lock(&vcn_inst->vcn_pg_lock);
vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
@@ -550,10 +549,9 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring)
!adev->vcn.inst[ring->me].using_unified_queue)
atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
- atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
-
- schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
- VCN_IDLE_TIMEOUT);
+ if (atomic_dec_and_test(&ring->adev->vcn.inst[ring->me].total_submission_cnt))
+ schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
+ VCN_IDLE_TIMEOUT);
}
int amdgpu_vcn_dec_ring_test_ring(struct amdgpu_ring *ring)
@@ -1485,6 +1483,37 @@ int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block,
return ret;
}
+static struct amdgpu_fence *
+amdgpu_vcn_ring_reset_begin_helper(struct amdgpu_ring *ring,
+ struct amdgpu_ring *guilty_ring,
+ struct amdgpu_fence *timedout_fence)
+{
+ struct amdgpu_fence *fence;
+
+ drm_sched_wqueue_stop(&ring->sched);
+ if (ring == guilty_ring)
+ fence = timedout_fence;
+ else
+ fence = amdgpu_ring_find_guilty_fence(ring);
+ amdgpu_ring_reset_helper_begin(ring, fence);
+
+ return fence;
+}
+
+static int
+amdgpu_vcn_ring_reset_end_helper(struct amdgpu_ring *ring,
+ struct amdgpu_fence *fence)
+{
+ int r;
+
+ r = amdgpu_ring_reset_helper_end(ring, fence);
+ if (r)
+ return r;
+
+ drm_sched_wqueue_start(&ring->sched);
+ return 0;
+}
+
/**
* amdgpu_vcn_ring_reset - Reset a VCN ring
* @ring: ring to reset
@@ -1502,48 +1531,33 @@ int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
{
struct amdgpu_device *adev = ring->adev;
struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[ring->me];
+ struct amdgpu_fence *dec_fence;
+ struct amdgpu_fence *enc_fence[AMDGPU_VCN_MAX_ENC_RINGS];
int r, i;
if (adev->vcn.inst[ring->me].using_unified_queue)
return -EINVAL;
mutex_lock(&vinst->engine_reset_mutex);
- /* Stop the scheduler's work queue for the dec and enc rings if they are running.
- * This ensures that no new tasks are submitted to the queues while
- * the reset is in progress.
- */
- drm_sched_wqueue_stop(&vinst->ring_dec.sched);
+ dec_fence = amdgpu_vcn_ring_reset_begin_helper(&vinst->ring_dec, ring,
+ timedout_fence);
for (i = 0; i < vinst->num_enc_rings; i++)
- drm_sched_wqueue_stop(&vinst->ring_enc[i].sched);
+ enc_fence[i] = amdgpu_vcn_ring_reset_begin_helper(&vinst->ring_enc[i], ring,
+ timedout_fence);
/* Perform the VCN reset for the specified instance */
r = vinst->reset(vinst);
if (r)
goto unlock;
- r = amdgpu_ring_test_ring(&vinst->ring_dec);
+
+ r = amdgpu_vcn_ring_reset_end_helper(&vinst->ring_dec, dec_fence);
if (r)
goto unlock;
for (i = 0; i < vinst->num_enc_rings; i++) {
- r = amdgpu_ring_test_ring(&vinst->ring_enc[i]);
+ r = amdgpu_vcn_ring_reset_end_helper(&vinst->ring_enc[i], enc_fence[i]);
if (r)
goto unlock;
}
- amdgpu_fence_driver_force_completion(&vinst->ring_dec,
- (&vinst->ring_dec == ring) ?
- &timedout_fence->base : NULL);
- for (i = 0; i < vinst->num_enc_rings; i++)
- amdgpu_fence_driver_force_completion(&vinst->ring_enc[i],
- (&vinst->ring_enc[i] == ring) ?
- &timedout_fence->base : NULL);
-
- /* Restart the scheduler's work queue for the dec and enc rings
- * if they were stopped by this function. This allows new tasks
- * to be submitted to the queues after the reset is complete.
- */
- drm_sched_wqueue_start(&vinst->ring_dec.sched);
- for (i = 0; i < vinst->num_enc_rings; i++)
- drm_sched_wqueue_start(&vinst->ring_enc[i].sched);
-
unlock:
mutex_unlock(&vinst->engine_reset_mutex);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index 82624b44e661..bea95307fd42 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -368,6 +368,9 @@ struct amdgpu_vcn {
struct mutex workload_profile_mutex;
u32 reg_count;
const struct amdgpu_hwip_reg_entry *reg_list;
+
+ bool disable_uq;
+ bool disable_kq;
};
struct amdgpu_fw_shared_rb_ptrs_struct {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index 409e103ffe8c..35faea0ff17f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -381,7 +381,8 @@ int amdgpu_xcp_get_inst_details(struct amdgpu_xcp *xcp,
enum AMDGPU_XCP_IP_BLOCK ip,
uint32_t *inst_mask)
{
- if (!xcp->valid || !inst_mask || !(xcp->ip[ip].valid))
+ if (!xcp->valid || !inst_mask || ip >= AMDGPU_XCP_MAX_BLOCKS ||
+ !(xcp->ip[ip].valid))
return -EINVAL;
*inst_mask = xcp->ip[ip].inst_mask;
@@ -468,14 +469,18 @@ void amdgpu_xcp_release_sched(struct amdgpu_device *adev,
{
struct drm_gpu_scheduler *sched =
container_of(entity->entity.rq, typeof(*sched), rq);
+ struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
- if (!adev->xcp_mgr)
+ if (!xcp_mgr)
return;
if (drm_sched_wqueue_ready(sched)) {
struct amdgpu_ring *ring = to_amdgpu_ring(sched);
- atomic_dec(&adev->xcp_mgr->xcp[ring->xcp_id].ref_cnt);
+ mutex_lock(&xcp_mgr->xcp_lock);
+ if (ring->xcp_id < xcp_mgr->num_xcps && xcp_mgr->xcp[ring->xcp_id].valid)
+ atomic_dec(&xcp_mgr->xcp[ring->xcp_id].ref_cnt);
+ mutex_unlock(&xcp_mgr->xcp_lock);
}
}
@@ -488,7 +493,9 @@ int amdgpu_xcp_select_scheds(struct amdgpu_device *adev,
u32 sel_xcp_id;
int i;
struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
+ int r = 0;
+ mutex_lock(&xcp_mgr->xcp_lock);
if (fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION) {
u32 least_ref_cnt = ~0;
@@ -505,19 +512,27 @@ int amdgpu_xcp_select_scheds(struct amdgpu_device *adev,
}
sel_xcp_id = fpriv->xcp_id;
+ if (sel_xcp_id >= xcp_mgr->num_xcps || !xcp_mgr->xcp[sel_xcp_id].valid) {
+ dev_err(adev->dev, "Selected partition #%d is not valid.", sel_xcp_id);
+ r = -ENODEV;
+ goto out;
+ }
+
if (xcp_mgr->xcp[sel_xcp_id].gpu_sched[hw_ip][hw_prio].num_scheds) {
*num_scheds =
- xcp_mgr->xcp[fpriv->xcp_id].gpu_sched[hw_ip][hw_prio].num_scheds;
+ xcp_mgr->xcp[sel_xcp_id].gpu_sched[hw_ip][hw_prio].num_scheds;
*scheds =
- xcp_mgr->xcp[fpriv->xcp_id].gpu_sched[hw_ip][hw_prio].sched;
- atomic_inc(&adev->xcp_mgr->xcp[sel_xcp_id].ref_cnt);
+ xcp_mgr->xcp[sel_xcp_id].gpu_sched[hw_ip][hw_prio].sched;
+ atomic_inc(&xcp_mgr->xcp[sel_xcp_id].ref_cnt);
dev_dbg(adev->dev, "Selected partition #%d", sel_xcp_id);
} else {
dev_err(adev->dev, "Failed to schedule partition #%d.", sel_xcp_id);
- return -ENOENT;
+ r = -ENOENT;
}
- return 0;
+out:
+ mutex_unlock(&xcp_mgr->xcp_lock);
+ return r;
}
static void amdgpu_set_xcp_id(struct amdgpu_device *adev,
@@ -574,6 +589,9 @@ static void amdgpu_xcp_gpu_sched_update(struct amdgpu_device *adev,
{
unsigned int *num_gpu_sched;
+ if (sel_xcp_id >= MAX_XCP || sel_xcp_id == AMDGPU_XCP_NO_PARTITION)
+ return;
+
num_gpu_sched = &adev->xcp_mgr->xcp[sel_xcp_id]
.gpu_sched[ring->funcs->type][ring->hw_prio].num_scheds;
adev->xcp_mgr->xcp[sel_xcp_id].gpu_sched[ring->funcs->type][ring->hw_prio]
@@ -903,7 +921,7 @@ static void amdgpu_xcp_cfg_sysfs_init(struct amdgpu_device *adev)
{
struct amdgpu_xcp_res_details *xcp_res;
struct amdgpu_xcp_cfg *xcp_cfg;
- int i, r, j, rid, mode;
+ int i, r, rid, mode;
if (!adev->xcp_mgr)
return;
@@ -949,14 +967,16 @@ static void amdgpu_xcp_cfg_sysfs_init(struct amdgpu_device *adev)
&xcp_cfg_res_sysfs_ktype,
&xcp_cfg->kobj, "%s",
xcp_res_names[rid]);
- if (r)
+ if (r) {
+ kobject_put(&xcp_res->kobj);
goto err;
+ }
}
adev->xcp_mgr->xcp_cfg = xcp_cfg;
return;
err:
- for (j = 0; j < i; j++) {
+ while (i--) {
xcp_res = &xcp_cfg->xcp_res[i];
kobject_put(&xcp_res->kobj);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index e63d05c477a0..d2c5bb50d94a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -106,53 +106,6 @@ static const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
};
-static const int xgmi3x16_pcs_err_status_reg_v6_4[] = {
- smnPCS_XGMI3X16_PCS_ERROR_STATUS,
- smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000
-};
-
-static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = {
- smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK,
- smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
-};
-
-static const u64 xgmi_v6_4_0_mca_base_array[] = {
- 0x11a09200,
- 0x11b09200,
-};
-
-static const char *xgmi_v6_4_0_ras_error_code_ext[32] = {
- [0x00] = "XGMI PCS DataLossErr",
- [0x01] = "XGMI PCS TrainingErr",
- [0x02] = "XGMI PCS FlowCtrlAckErr",
- [0x03] = "XGMI PCS RxFifoUnderflowErr",
- [0x04] = "XGMI PCS RxFifoOverflowErr",
- [0x05] = "XGMI PCS CRCErr",
- [0x06] = "XGMI PCS BERExceededErr",
- [0x07] = "XGMI PCS TxMetaDataErr",
- [0x08] = "XGMI PCS ReplayBufParityErr",
- [0x09] = "XGMI PCS DataParityErr",
- [0x0a] = "XGMI PCS ReplayFifoOverflowErr",
- [0x0b] = "XGMI PCS ReplayFifoUnderflowErr",
- [0x0c] = "XGMI PCS ElasticFifoOverflowErr",
- [0x0d] = "XGMI PCS DeskewErr",
- [0x0e] = "XGMI PCS FlowCtrlCRCErr",
- [0x0f] = "XGMI PCS DataStartupLimitErr",
- [0x10] = "XGMI PCS FCInitTimeoutErr",
- [0x11] = "XGMI PCS RecoveryTimeoutErr",
- [0x12] = "XGMI PCS ReadySerialTimeoutErr",
- [0x13] = "XGMI PCS ReadySerialAttemptErr",
- [0x14] = "XGMI PCS RecoveryAttemptErr",
- [0x15] = "XGMI PCS RecoveryRelockAttemptErr",
- [0x16] = "XGMI PCS ReplayAttemptErr",
- [0x17] = "XGMI PCS SyncHdrErr",
- [0x18] = "XGMI PCS TxReplayTimeoutErr",
- [0x19] = "XGMI PCS RxReplayTimeoutErr",
- [0x1a] = "XGMI PCS LinkSubTxTimeoutErr",
- [0x1b] = "XGMI PCS LinkSubRxTimeoutErr",
- [0x1c] = "XGMI PCS RxCMDPktErr",
-};
-
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
{"XGMI PCS DataLossErr",
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
@@ -1152,91 +1105,15 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
return 0;
}
-static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- struct amdgpu_device *adev = handle->adev;
- struct aca_bank_info info;
- const char *error_str;
- u64 status, count;
- int ret, ext_error_code;
-
- ret = aca_bank_info_decode(bank, &info);
- if (ret)
- return ret;
-
- status = bank->regs[ACA_REG_IDX_STATUS];
- ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);
-
- error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
- xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
- if (error_str)
- dev_info(adev->dev, "%s detected\n", error_str);
-
- count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
-
- switch (type) {
- case ACA_SMU_TYPE_UE:
- if (ext_error_code != 0 && ext_error_code != 1 && ext_error_code != 9)
- count = 0ULL;
-
- bank->aca_err_type = ACA_ERROR_TYPE_UE;
- ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, count);
- break;
- case ACA_SMU_TYPE_CE:
- count = ext_error_code == 6 ? count : 0ULL;
- bank->aca_err_type = ACA_ERROR_TYPE_CE;
- ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, count);
- break;
- default:
- return -EINVAL;
- }
-
- return ret;
-}
-
-static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = {
- .aca_bank_parser = xgmi_v6_4_0_aca_bank_parser,
-};
-
-static const struct aca_info xgmi_v6_4_0_aca_info = {
- .hwip = ACA_HWIP_TYPE_PCS_XGMI,
- .mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK,
- .bank_ops = &xgmi_v6_4_0_aca_bank_ops,
-};
-
static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
- int r;
-
if (!adev->gmc.xgmi.supported ||
adev->gmc.xgmi.num_physical_nodes == 0)
return 0;
amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);
- r = amdgpu_ras_block_late_init(adev, ras_block);
- if (r)
- return r;
-
- switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
- case IP_VERSION(6, 4, 0):
- case IP_VERSION(6, 4, 1):
- r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL,
- &xgmi_v6_4_0_aca_info, NULL);
- if (r)
- goto late_fini;
- break;
- default:
- break;
- }
-
- return 0;
-
-late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
-
- return r;
+ return amdgpu_ras_block_late_init(adev, ras_block);
}
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
@@ -1252,7 +1129,7 @@ static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg
WREG32_PCIE(pcs_status_reg, 0);
}
-static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev)
+static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
{
uint32_t i;
@@ -1278,54 +1155,6 @@ static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev)
default:
break;
}
-
- switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
- case IP_VERSION(6, 4, 0):
- case IP_VERSION(6, 4, 1):
- for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++)
- pcs_clear_status(adev,
- xgmi3x16_pcs_err_status_reg_v6_4[i]);
- break;
- default:
- break;
- }
-}
-
-static void __xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst, u64 mca_base)
-{
- uint64_t smn_base =
- amdgpu_reg_get_smn_base64(adev, XGMI_HWIP, xgmi_inst);
-
- WREG64_MCA(smn_base, mca_base, ACA_REG_IDX_STATUS, 0ULL);
-}
-
-static void xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
- __xgmi_v6_4_0_reset_error_count(adev, xgmi_inst, xgmi_v6_4_0_mca_base_array[i]);
-}
-
-static void xgmi_v6_4_0_reset_ras_error_count(struct amdgpu_device *adev)
-{
- int i;
-
- for_each_inst(i, adev->aid_mask)
- xgmi_v6_4_0_reset_error_count(adev, i);
-}
-
-static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
-{
- switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
- case IP_VERSION(6, 4, 0):
- case IP_VERSION(6, 4, 1):
- xgmi_v6_4_0_reset_ras_error_count(adev);
- break;
- default:
- amdgpu_xgmi_legacy_reset_ras_error_count(adev);
- break;
- }
}
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
@@ -1343,11 +1172,7 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
if (is_xgmi_pcs) {
if (amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
- IP_VERSION(6, 1, 0) ||
- amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
- IP_VERSION(6, 4, 0) ||
- amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
- IP_VERSION(6, 4, 1)) {
+ IP_VERSION(6, 1, 0)) {
pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0];
field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields);
} else {
@@ -1381,11 +1206,11 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
return 0;
}
-static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev,
+static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
- int i, supported = 1;
+ int i;
uint32_t data, mask_data = 0;
uint32_t ue_cnt = 0, ce_cnt = 0;
@@ -1449,26 +1274,6 @@ static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev,
}
break;
default:
- supported = 0;
- break;
- }
-
- switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
- case IP_VERSION(6, 4, 0):
- case IP_VERSION(6, 4, 1):
- /* check xgmi3x16 pcs error */
- for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++) {
- data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_v6_4[i]);
- mask_data =
- RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[i]);
- if (data)
- amdgpu_xgmi_query_pcs_error_status(adev, data,
- mask_data, &ue_cnt, &ce_cnt, true, true);
- }
- break;
- default:
- if (!supported)
- dev_warn(adev->dev, "XGMI RAS error query not supported");
break;
}
@@ -1478,90 +1283,6 @@ static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev,
err_data->ce_count += ce_cnt;
}
-static enum aca_error_type xgmi_v6_4_0_pcs_mca_get_error_type(struct amdgpu_device *adev, u64 status)
-{
- const char *error_str;
- int ext_error_code;
-
- ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);
-
- error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
- xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
- if (error_str)
- dev_info(adev->dev, "%s detected\n", error_str);
-
- switch (ext_error_code) {
- case 0:
- return ACA_ERROR_TYPE_UE;
- case 6:
- return ACA_ERROR_TYPE_CE;
- default:
- return -EINVAL;
- }
-
- return -EINVAL;
-}
-
-static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct amdgpu_smuio_mcm_config_info *mcm_info,
- u64 mca_base, struct ras_err_data *err_data)
-{
- int xgmi_inst = mcm_info->die_id;
- uint64_t smn_base;
- u64 status = 0;
-
- status = RREG64_MCA(xgmi_inst, mca_base, ACA_REG_IDX_STATUS);
- if (!ACA_REG__STATUS__VAL(status))
- return;
-
- switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) {
- case ACA_ERROR_TYPE_UE:
- amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL);
- break;
- case ACA_ERROR_TYPE_CE:
- amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL);
- break;
- default:
- break;
- }
- smn_base = amdgpu_reg_get_smn_base64(adev, XGMI_HWIP, xgmi_inst);
- WREG64_MCA(smn_base, mca_base, ACA_REG_IDX_STATUS, 0ULL);
-}
-
-static void xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, int xgmi_inst, struct ras_err_data *err_data)
-{
- struct amdgpu_smuio_mcm_config_info mcm_info = {
- .socket_id = adev->smuio.funcs->get_socket_id(adev),
- .die_id = xgmi_inst,
- };
- int i;
-
- for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
- __xgmi_v6_4_0_query_error_count(adev, &mcm_info, xgmi_v6_4_0_mca_base_array[i], err_data);
-}
-
-static void xgmi_v6_4_0_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status)
-{
- struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
- int i;
-
- for_each_inst(i, adev->aid_mask)
- xgmi_v6_4_0_query_error_count(adev, i, err_data);
-}
-
-static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
- void *ras_error_status)
-{
- switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
- case IP_VERSION(6, 4, 0):
- case IP_VERSION(6, 4, 1):
- xgmi_v6_4_0_query_ras_error_count(adev, ras_error_status);
- break;
- default:
- amdgpu_xgmi_legacy_query_ras_error_count(adev, ras_error_status);
- break;
- }
-}
-
/* Trigger XGMI/WAFL error */
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
void *inject_if, uint32_t instance_mask)
@@ -1663,6 +1384,16 @@ static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work)
if (r && r != -EHWPOISON)
dev_err(tmp_adev->dev,
"error during bad page data initialization");
+
+ /*
+ * For the reset-on-init path (e.g. an NPS memory partition
+ * switch) the RAS IP block hw_init was skipped under the
+ * minimal init level, so uniras was never enabled. Bring it
+ * up now that the reset domain has been unlocked. This is a
+ * no-op for any other reset path where RAS is already
+ * initialized, and for non-uniras devices.
+ */
+ amdgpu_ras_resume_after_reset(tmp_adev);
}
}
diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
index 72ea37dbfea8..cddfe4015f53 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
@@ -273,8 +273,10 @@ static int aqua_vanjaram_get_xcp_res_info(struct amdgpu_xcp_mgr *xcp_mgr,
xcp_cfg->num_res = ARRAY_SIZE(max_res);
for (i = 0; i < xcp_cfg->num_res; i++) {
- res_lt_xcp = max_res[i] < num_xcp;
xcp_cfg->xcp_res[i].id = i;
+ if (!max_res[i])
+ continue;
+ res_lt_xcp = max_res[i] < num_xcp;
xcp_cfg->xcp_res[i].num_inst =
res_lt_xcp ? 1 : max_res[i] / num_xcp;
xcp_cfg->xcp_res[i].num_inst =
@@ -589,6 +591,29 @@ static struct aqua_reg_list pcie_reg_addrs[] = {
{ smreg_0x1A380088, 6, DW_ADDR_INCR },
};
+/*
+ * Return the GPU's internal US switch port, or NULL if it is not visible
+ * (e.g. passthrough) or the EP is parented under an unrelated bridge.
+ */
+static struct pci_dev *aqua_vanjaram_get_us_pdev(struct amdgpu_device *adev)
+{
+ struct pci_dev *ds_pdev, *us_pdev;
+
+ ds_pdev = pci_upstream_bridge(adev->pdev);
+ if (!ds_pdev || ds_pdev->vendor != PCI_VENDOR_ID_ATI ||
+ pci_pcie_type(ds_pdev) != PCI_EXP_TYPE_DOWNSTREAM)
+ return NULL;
+
+ us_pdev = pci_upstream_bridge(ds_pdev);
+ if (!us_pdev ||
+ (us_pdev->vendor != PCI_VENDOR_ID_ATI &&
+ us_pdev->vendor != PCI_VENDOR_ID_AMD) ||
+ pci_pcie_type(us_pdev) != PCI_EXP_TYPE_UPSTREAM)
+ return NULL;
+
+ return us_pdev;
+}
+
static ssize_t aqua_vanjaram_read_pcie_state(struct amdgpu_device *adev,
void *buf, size_t max_size)
{
@@ -596,7 +621,7 @@ static ssize_t aqua_vanjaram_read_pcie_state(struct amdgpu_device *adev,
uint32_t start_addr, incrx, num_regs, szbuf;
struct amdgpu_regs_pcie_v1_0 *pcie_regs;
struct amdgpu_smn_reg_data *reg_data;
- struct pci_dev *us_pdev, *ds_pdev;
+ struct pci_dev *us_pdev;
int aer_cap, r, n;
if (!buf || !max_size)
@@ -628,25 +653,27 @@ static ssize_t aqua_vanjaram_read_pcie_state(struct amdgpu_device *adev,
}
}
- ds_pdev = pci_upstream_bridge(adev->pdev);
- us_pdev = pci_upstream_bridge(ds_pdev);
+ us_pdev = aqua_vanjaram_get_us_pdev(adev);
+ if (us_pdev) {
+ pcie_capability_read_word(us_pdev, PCI_EXP_DEVSTA,
+ &pcie_regs->device_status);
+ pcie_capability_read_word(us_pdev, PCI_EXP_LNKSTA,
+ &pcie_regs->link_status);
+
+ aer_cap = pci_find_ext_capability(us_pdev, PCI_EXT_CAP_ID_ERR);
+ if (aer_cap) {
+ pci_read_config_dword(us_pdev,
+ aer_cap + PCI_ERR_COR_STATUS,
+ &pcie_regs->pcie_corr_err_status);
+ pci_read_config_dword(us_pdev,
+ aer_cap + PCI_ERR_UNCOR_STATUS,
+ &pcie_regs->pcie_uncorr_err_status);
+ }
- pcie_capability_read_word(us_pdev, PCI_EXP_DEVSTA,
- &pcie_regs->device_status);
- pcie_capability_read_word(us_pdev, PCI_EXP_LNKSTA,
- &pcie_regs->link_status);
-
- aer_cap = pci_find_ext_capability(us_pdev, PCI_EXT_CAP_ID_ERR);
- if (aer_cap) {
- pci_read_config_dword(us_pdev, aer_cap + PCI_ERR_COR_STATUS,
- &pcie_regs->pcie_corr_err_status);
- pci_read_config_dword(us_pdev, aer_cap + PCI_ERR_UNCOR_STATUS,
- &pcie_regs->pcie_uncorr_err_status);
+ pci_read_config_dword(us_pdev, PCI_PRIMARY_BUS,
+ &pcie_regs->sub_bus_number_latency);
}
- pci_read_config_dword(us_pdev, PCI_PRIMARY_BUS,
- &pcie_regs->sub_bus_number_latency);
-
pcie_reg_state->common_header.structure_size = szbuf;
pcie_reg_state->common_header.format_revision = 1;
pcie_reg_state->common_header.content_revision = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c
index ca5d091549e1..e0e585f280e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/atom.c
+++ b/drivers/gpu/drm/amd/amdgpu/atom.c
@@ -114,8 +114,10 @@ static uint32_t atom_iio_execute(struct atom_context *ctx, int base,
uint32_t index, uint32_t data)
{
uint32_t temp = 0xCDCDCDCD;
+ int start = base;
- while (1)
+ /* IIO opcodes read up to base+3; keep within the BIOS image */
+ while (base + 3 < ctx->bios_size)
switch (CU8(base)) {
case ATOM_IIO_NOP:
base++;
@@ -180,6 +182,9 @@ static uint32_t atom_iio_execute(struct atom_context *ctx, int base,
pr_info("Unknown IIO opcode\n");
return 0;
}
+
+ pr_info("IIO method starting at offset %d runs past BIOS image\n", start);
+ return 0;
}
static uint32_t atom_get_src_int(atom_exec_context *ctx, uint8_t attr,
@@ -1327,11 +1332,25 @@ static void atom_index_iio(struct atom_context *ctx, int base)
ctx->iio = kzalloc(2 * 256, GFP_KERNEL);
if (!ctx->iio)
return;
- while (CU8(base) == ATOM_IIO_START) {
- ctx->iio[CU8(base + 1)] = base + 2;
+ while (base + 1 < ctx->bios_size && CU8(base) == ATOM_IIO_START) {
+ uint8_t index = CU8(base + 1);
+ int start = base + 2;
base += 2;
- while (CU8(base) != ATOM_IIO_END)
- base += atom_iio_len[CU8(base)];
+ while (base < ctx->bios_size && CU8(base) != ATOM_IIO_END) {
+ uint8_t op = CU8(base);
+
+ /*
+ * Unknown opcode: its length is unknown so the byte
+ * stream cannot be resynced reliably.
+ */
+ if (op >= ARRAY_SIZE(atom_iio_len))
+ return;
+ base += atom_iio_len[op];
+ }
+ if (base >= ctx->bios_size)
+ return;
+ /* Only index well-formed methods, others stay 0 */
+ ctx->iio[index] = start;
base += 3;
}
}
@@ -1339,6 +1358,7 @@ static void atom_index_iio(struct atom_context *ctx, int base)
static void atom_get_vbios_name(struct atom_context *ctx)
{
unsigned char *p_rom;
+ unsigned char *p_end;
unsigned char str_num;
unsigned short off_to_vbios_str;
unsigned char *c_ptr;
@@ -1349,39 +1369,48 @@ static void atom_get_vbios_name(struct atom_context *ctx)
char *back;
p_rom = ctx->bios;
+ p_end = p_rom + ctx->bios_size;
+
+ if (p_rom + OFFSET_TO_GET_ATOMBIOS_STRING_START + 1 >= p_end)
+ goto no_name;
str_num = *(p_rom + OFFSET_TO_GET_ATOMBIOS_NUMBER_OF_STRINGS);
- if (str_num != 0) {
- off_to_vbios_str =
- *(unsigned short *)(p_rom + OFFSET_TO_GET_ATOMBIOS_STRING_START);
+ if (!str_num)
+ goto no_name;
- c_ptr = (unsigned char *)(p_rom + off_to_vbios_str);
- } else {
- /* do not know where to find name */
- memcpy(ctx->name, na, 7);
- ctx->name[7] = 0;
- return;
- }
+ off_to_vbios_str =
+ *(unsigned short *)(p_rom + OFFSET_TO_GET_ATOMBIOS_STRING_START);
+
+ c_ptr = (unsigned char *)(p_rom + off_to_vbios_str);
+ if (c_ptr >= p_end)
+ goto no_name;
/*
* skip the atombios strings, usually 4
* 1st is P/N, 2nd is ASIC, 3rd is PCI type, 4th is Memory type
*/
for (i = 0; i < str_num; i++) {
- while (*c_ptr != 0)
+ while (c_ptr < p_end && *c_ptr != 0)
c_ptr++;
c_ptr++;
}
/* skip the following 2 chars: 0x0D 0x0A */
c_ptr += 2;
+ if (c_ptr >= p_end)
+ goto no_name;
- name_size = strnlen(c_ptr, STRLEN_LONG - 1);
+ name_size = strnlen(c_ptr, min(STRLEN_LONG - 1, (int)(p_end - c_ptr)));
memcpy(ctx->name, c_ptr, name_size);
back = ctx->name + name_size;
while ((*--back) == ' ')
;
*(back + 1) = '\0';
+ return;
+
+no_name:
+ /* do not know where to find name */
+ strscpy(ctx->name, na, sizeof(ctx->name));
}
static void atom_get_vbios_date(struct atom_context *ctx)
@@ -1553,7 +1582,7 @@ static inline void atom_print_vbios_info(struct atom_context *ctx)
drm_info(ctx->card->dev, "ATOM BIOS: %s\n", vbios_info);
}
-struct atom_context *amdgpu_atom_parse(struct card_info *card, void *bios)
+struct atom_context *amdgpu_atom_parse(struct card_info *card, void *bios, uint32_t bios_size)
{
int base;
struct atom_context *ctx =
@@ -1567,6 +1596,7 @@ struct atom_context *amdgpu_atom_parse(struct card_info *card, void *bios)
ctx->card = card;
ctx->bios = bios;
+ ctx->bios_size = bios_size;
if (CU16(0) != ATOM_BIOS_MAGIC) {
pr_info("Invalid BIOS magic\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/atom.h b/drivers/gpu/drm/amd/amdgpu/atom.h
index bb3d9eb7eb6b..4687c019cbe3 100644
--- a/drivers/gpu/drm/amd/amdgpu/atom.h
+++ b/drivers/gpu/drm/amd/amdgpu/atom.h
@@ -133,6 +133,7 @@ struct atom_context {
struct card_info *card;
struct mutex mutex;
void *bios;
+ uint32_t bios_size;
uint32_t cmd_table, data_table;
uint16_t *iio;
@@ -160,7 +161,7 @@ struct atom_context {
extern int amdgpu_atom_debug;
-struct atom_context *amdgpu_atom_parse(struct card_info *card, void *bios);
+struct atom_context *amdgpu_atom_parse(struct card_info *card, void *bios, uint32_t bios_size);
int amdgpu_atom_execute_table(struct atom_context *ctx, int index, uint32_t *params, int params_size);
int amdgpu_atom_asic_init(struct atom_context *ctx);
void amdgpu_atom_destroy(struct atom_context *ctx);
diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c b/drivers/gpu/drm/amd/amdgpu/cik.c
index 29954c7d61b0..77e120a72815 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik.c
@@ -1876,12 +1876,6 @@ static void cik_invalidate_hdp(struct amdgpu_device *adev,
}
}
-static bool cik_need_full_reset(struct amdgpu_device *adev)
-{
- /* change this when we support soft reset */
- return true;
-}
-
static void cik_get_pcie_usage(struct amdgpu_device *adev, uint64_t *count0,
uint64_t *count1)
{
@@ -1971,7 +1965,6 @@ static const struct amdgpu_asic_funcs cik_asic_funcs =
.get_config_memsize = &cik_get_config_memsize,
.flush_hdp = &cik_flush_hdp,
.invalidate_hdp = &cik_invalidate_hdp,
- .need_full_reset = &cik_need_full_reset,
.init_doorbell_index = &legacy_doorbell_index_init,
.get_pcie_usage = &cik_get_pcie_usage,
.need_reset_on_init = &cik_need_reset_on_init,
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
index c8f465158e71..f2977fe6d824 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
@@ -410,36 +410,6 @@ static u32 dce_v10_0_hpd_get_gpio_reg(struct amdgpu_device *adev)
return mmDC_GPIO_HPD_A;
}
-static bool dce_v10_0_is_display_hung(struct amdgpu_device *adev)
-{
- u32 crtc_hung = 0;
- u32 crtc_status[6];
- u32 i, j, tmp;
-
- for (i = 0; i < adev->mode_info.num_crtc; i++) {
- tmp = RREG32(mmCRTC_CONTROL + crtc_offsets[i]);
- if (REG_GET_FIELD(tmp, CRTC_CONTROL, CRTC_MASTER_EN)) {
- crtc_status[i] = RREG32(mmCRTC_STATUS_HV_COUNT + crtc_offsets[i]);
- crtc_hung |= (1 << i);
- }
- }
-
- for (j = 0; j < 10; j++) {
- for (i = 0; i < adev->mode_info.num_crtc; i++) {
- if (crtc_hung & (1 << i)) {
- tmp = RREG32(mmCRTC_STATUS_HV_COUNT + crtc_offsets[i]);
- if (tmp != crtc_status[i])
- crtc_hung &= ~(1 << i);
- }
- }
- if (crtc_hung == 0)
- return false;
- udelay(100);
- }
-
- return true;
-}
-
static void dce_v10_0_set_vga_render_state(struct amdgpu_device *adev,
bool render)
{
@@ -2956,40 +2926,6 @@ static bool dce_v10_0_is_idle(struct amdgpu_ip_block *ip_block)
return true;
}
-static bool dce_v10_0_check_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
-
- return dce_v10_0_is_display_hung(adev);
-}
-
-static int dce_v10_0_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- u32 srbm_soft_reset = 0, tmp;
- struct amdgpu_device *adev = ip_block->adev;
-
- if (dce_v10_0_is_display_hung(adev))
- srbm_soft_reset |= SRBM_SOFT_RESET__SOFT_RESET_DC_MASK;
-
- if (srbm_soft_reset) {
- tmp = RREG32(mmSRBM_SOFT_RESET);
- tmp |= srbm_soft_reset;
- dev_info(adev->dev, "SRBM_SOFT_RESET=0x%08X\n", tmp);
- WREG32(mmSRBM_SOFT_RESET, tmp);
- tmp = RREG32(mmSRBM_SOFT_RESET);
-
- udelay(50);
-
- tmp &= ~srbm_soft_reset;
- WREG32(mmSRBM_SOFT_RESET, tmp);
- tmp = RREG32(mmSRBM_SOFT_RESET);
-
- /* Wait a little for things to settle down */
- udelay(50);
- }
- return 0;
-}
-
static void dce_v10_0_set_crtc_vblank_interrupt_state(struct amdgpu_device *adev,
int crtc,
enum amdgpu_interrupt_state state)
@@ -3332,8 +3268,6 @@ static const struct amd_ip_funcs dce_v10_0_ip_funcs = {
.suspend = dce_v10_0_suspend,
.resume = dce_v10_0_resume,
.is_idle = dce_v10_0_is_idle,
- .check_soft_reset = dce_v10_0_check_soft_reset,
- .soft_reset = dce_v10_0_soft_reset,
.set_clockgating_state = dce_v10_0_set_clockgating_state,
.set_powergating_state = dce_v10_0_set_powergating_state,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
index 58d0da5c2a74..c68de0fe1d7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
@@ -378,35 +378,6 @@ static u32 dce_v6_0_hpd_get_gpio_reg(struct amdgpu_device *adev)
return mmDC_GPIO_HPD_A;
}
-static bool dce_v6_0_is_display_hung(struct amdgpu_device *adev)
-{
- u32 crtc_hung = 0;
- u32 crtc_status[6];
- u32 i, j, tmp;
-
- for (i = 0; i < adev->mode_info.num_crtc; i++) {
- if (RREG32(mmCRTC_CONTROL + crtc_offsets[i]) & CRTC_CONTROL__CRTC_MASTER_EN_MASK) {
- crtc_status[i] = RREG32(mmCRTC_STATUS_HV_COUNT + crtc_offsets[i]);
- crtc_hung |= (1 << i);
- }
- }
-
- for (j = 0; j < 10; j++) {
- for (i = 0; i < adev->mode_info.num_crtc; i++) {
- if (crtc_hung & (1 << i)) {
- tmp = RREG32(mmCRTC_STATUS_HV_COUNT + crtc_offsets[i]);
- if (tmp != crtc_status[i])
- crtc_hung &= ~(1 << i);
- }
- }
- if (crtc_hung == 0)
- return false;
- udelay(100);
- }
-
- return true;
-}
-
static void dce_v6_0_set_vga_render_state(struct amdgpu_device *adev,
bool render)
{
@@ -2901,33 +2872,6 @@ static bool dce_v6_0_is_idle(struct amdgpu_ip_block *ip_block)
return true;
}
-static int dce_v6_0_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- u32 srbm_soft_reset = 0, tmp;
- struct amdgpu_device *adev = ip_block->adev;
-
- if (dce_v6_0_is_display_hung(adev))
- srbm_soft_reset |= SRBM_SOFT_RESET__SOFT_RESET_DC_MASK;
-
- if (srbm_soft_reset) {
- tmp = RREG32(mmSRBM_SOFT_RESET);
- tmp |= srbm_soft_reset;
- dev_info(adev->dev, "SRBM_SOFT_RESET=0x%08X\n", tmp);
- WREG32(mmSRBM_SOFT_RESET, tmp);
- tmp = RREG32(mmSRBM_SOFT_RESET);
-
- udelay(50);
-
- tmp &= ~srbm_soft_reset;
- WREG32(mmSRBM_SOFT_RESET, tmp);
- tmp = RREG32(mmSRBM_SOFT_RESET);
-
- /* Wait a little for things to settle down */
- udelay(50);
- }
- return 0;
-}
-
static void dce_v6_0_set_crtc_vblank_interrupt_state(struct amdgpu_device *adev,
int crtc,
enum amdgpu_interrupt_state state)
@@ -3224,7 +3168,6 @@ static const struct amd_ip_funcs dce_v6_0_ip_funcs = {
.suspend = dce_v6_0_suspend,
.resume = dce_v6_0_resume,
.is_idle = dce_v6_0_is_idle,
- .soft_reset = dce_v6_0_soft_reset,
.set_clockgating_state = dce_v6_0_set_clockgating_state,
.set_powergating_state = dce_v6_0_set_powergating_state,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
index 6d19f6d94d25..c3906270f25e 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
@@ -362,35 +362,6 @@ static u32 dce_v8_0_hpd_get_gpio_reg(struct amdgpu_device *adev)
return mmDC_GPIO_HPD_A;
}
-static bool dce_v8_0_is_display_hung(struct amdgpu_device *adev)
-{
- u32 crtc_hung = 0;
- u32 crtc_status[6];
- u32 i, j, tmp;
-
- for (i = 0; i < adev->mode_info.num_crtc; i++) {
- if (RREG32(mmCRTC_CONTROL + crtc_offsets[i]) & CRTC_CONTROL__CRTC_MASTER_EN_MASK) {
- crtc_status[i] = RREG32(mmCRTC_STATUS_HV_COUNT + crtc_offsets[i]);
- crtc_hung |= (1 << i);
- }
- }
-
- for (j = 0; j < 10; j++) {
- for (i = 0; i < adev->mode_info.num_crtc; i++) {
- if (crtc_hung & (1 << i)) {
- tmp = RREG32(mmCRTC_STATUS_HV_COUNT + crtc_offsets[i]);
- if (tmp != crtc_status[i])
- crtc_hung &= ~(1 << i);
- }
- }
- if (crtc_hung == 0)
- return false;
- udelay(100);
- }
-
- return true;
-}
-
static void dce_v8_0_set_vga_render_state(struct amdgpu_device *adev,
bool render)
{
@@ -2873,33 +2844,6 @@ static bool dce_v8_0_is_idle(struct amdgpu_ip_block *ip_block)
return true;
}
-static int dce_v8_0_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- u32 srbm_soft_reset = 0, tmp;
- struct amdgpu_device *adev = ip_block->adev;
-
- if (dce_v8_0_is_display_hung(adev))
- srbm_soft_reset |= SRBM_SOFT_RESET__SOFT_RESET_DC_MASK;
-
- if (srbm_soft_reset) {
- tmp = RREG32(mmSRBM_SOFT_RESET);
- tmp |= srbm_soft_reset;
- dev_info(adev->dev, "SRBM_SOFT_RESET=0x%08X\n", tmp);
- WREG32(mmSRBM_SOFT_RESET, tmp);
- tmp = RREG32(mmSRBM_SOFT_RESET);
-
- udelay(50);
-
- tmp &= ~srbm_soft_reset;
- WREG32(mmSRBM_SOFT_RESET, tmp);
- tmp = RREG32(mmSRBM_SOFT_RESET);
-
- /* Wait a little for things to settle down */
- udelay(50);
- }
- return 0;
-}
-
static void dce_v8_0_set_crtc_vblank_interrupt_state(struct amdgpu_device *adev,
int crtc,
enum amdgpu_interrupt_state state)
@@ -3241,7 +3185,6 @@ static const struct amd_ip_funcs dce_v8_0_ip_funcs = {
.suspend = dce_v8_0_suspend,
.resume = dce_v8_0_resume,
.is_idle = dce_v8_0_is_idle,
- .soft_reset = dce_v8_0_soft_reset,
.set_clockgating_state = dce_v8_0_set_clockgating_state,
.set_powergating_state = dce_v8_0_set_powergating_state,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index b4b27e4c495d..ddf190672530 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -7852,6 +7852,8 @@ static int gfx_v10_0_early_init(struct amdgpu_ip_block *ip_block)
/* init rlcg reg access ctrl */
gfx_v10_0_init_rlcg_reg_access_ctrl(adev);
+ amdgpu_init_rlc_reg_funcs(adev);
+
return gfx_v10_0_init_microcode(adev);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 3b12eb27a253..2a121df90574 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1923,6 +1923,11 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
if (r)
return r;
+ adev->gfx.me.use_mmio_for_reset = false;
+ adev->gfx.mec.use_mmio_for_reset = true;
+
+ mutex_init(&adev->gfx.mec.reset_mutex);
+
return 0;
}
@@ -4233,13 +4238,13 @@ static int gfx_v11_0_gfx_mqd_init(struct amdgpu_device *adev, void *m,
return 0;
}
-static int gfx_v11_0_kgq_init_queue(struct amdgpu_ring *ring, bool reset)
+static int gfx_v11_0_kgq_init_queue(struct amdgpu_ring *ring)
{
struct amdgpu_device *adev = ring->adev;
struct v11_gfx_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.gfx_ring[0];
- if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) {
+ if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(*mqd));
mutex_lock(&adev->srbm_mutex);
soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
@@ -4266,7 +4271,7 @@ static int gfx_v11_0_cp_async_gfx_ring_resume(struct amdgpu_device *adev)
int r, i;
for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
- r = gfx_v11_0_kgq_init_queue(&adev->gfx.gfx_ring[i], false);
+ r = gfx_v11_0_kgq_init_queue(&adev->gfx.gfx_ring[i]);
if (r)
return r;
}
@@ -4603,13 +4608,13 @@ static int gfx_v11_0_kiq_init_queue(struct amdgpu_ring *ring)
return 0;
}
-static int gfx_v11_0_kcq_init_queue(struct amdgpu_ring *ring, bool reset)
+static int gfx_v11_0_kcq_init_queue(struct amdgpu_ring *ring)
{
struct amdgpu_device *adev = ring->adev;
struct v11_compute_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.compute_ring[0];
- if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) {
+ if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(*mqd));
mutex_lock(&adev->srbm_mutex);
soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
@@ -4646,7 +4651,7 @@ static int gfx_v11_0_kcq_resume(struct amdgpu_device *adev)
gfx_v11_0_cp_compute_enable(adev, true);
for (i = 0; i < adev->gfx.num_compute_rings; i++) {
- r = gfx_v11_0_kcq_init_queue(&adev->gfx.compute_ring[i], false);
+ r = gfx_v11_0_kcq_init_queue(&adev->gfx.compute_ring[i]);
if (r)
return r;
}
@@ -5265,38 +5270,12 @@ static int gfx_v11_0_soft_reset(struct amdgpu_ip_block *ip_block)
amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
- return gfx_v11_0_cp_resume(adev);
-}
-
-static bool gfx_v11_0_check_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- int i, r;
- struct amdgpu_device *adev = ip_block->adev;
- struct amdgpu_ring *ring;
- long tmo = msecs_to_jiffies(1000);
-
- for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
- ring = &adev->gfx.gfx_ring[i];
- r = amdgpu_ring_test_ib(ring, tmo);
- if (r)
- return true;
- }
-
- for (i = 0; i < adev->gfx.num_compute_rings; i++) {
- ring = &adev->gfx.compute_ring[i];
- r = amdgpu_ring_test_ib(ring, tmo);
- if (r)
- return true;
- }
-
- return false;
-}
+ r = gfx_v11_0_cp_resume(adev);
+ if (r)
+ return r;
-static int gfx_v11_0_post_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
/**
- * GFX soft reset will impact MES, need resume MES when do GFX soft reset
+ * GFX soft reset impacts MES, resume MES after GFX soft reset is finished
*/
return amdgpu_mes_resume(adev, 0);
}
@@ -5420,6 +5399,8 @@ static int gfx_v11_0_early_init(struct amdgpu_ip_block *ip_block)
gfx_v11_0_init_rlcg_reg_access_ctrl(adev);
+ amdgpu_init_rlc_reg_funcs(adev);
+
return gfx_v11_0_init_microcode(adev);
}
@@ -6708,22 +6689,29 @@ static int gfx_v11_0_set_priv_inst_fault_state(struct amdgpu_device *adev,
static void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
- u8 me_id, pipe_id, queue_id;
- struct amdgpu_ring *ring;
- int i;
-
- me_id = (entry->ring_id & 0x0c) >> 2;
- pipe_id = (entry->ring_id & 0x03) >> 0;
- queue_id = (entry->ring_id & 0x70) >> 4;
+ u32 doorbell_offset = entry->src_data[0] & AMDGPU_CTXID0_DOORBELL_ID_MASK;
+ /*
+ * Try KQ first by ring_id (HW slot is authoritative). The
+ * KMD compute_hqd_mask contract guarantees KCQ and user queues
+ * never share a HW slot.
+ */
if (!adev->gfx.disable_kq) {
+ u8 me_id = (entry->ring_id & 0x0c) >> 2;
+ u8 pipe_id = (entry->ring_id & 0x03) >> 0;
+ u8 queue_id = (entry->ring_id & 0x70) >> 4;
+ struct amdgpu_ring *ring;
+ int i;
+
switch (me_id) {
case 0:
for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
ring = &adev->gfx.gfx_ring[i];
if (ring->me == me_id && ring->pipe == pipe_id &&
- ring->queue == queue_id)
+ ring->queue == queue_id) {
drm_sched_fault(&ring->sched);
+ return;
+ }
}
break;
case 1:
@@ -6731,8 +6719,10 @@ static void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev,
for (i = 0; i < adev->gfx.num_compute_rings; i++) {
ring = &adev->gfx.compute_ring[i];
if (ring->me == me_id && ring->pipe == pipe_id &&
- ring->queue == queue_id)
+ ring->queue == queue_id) {
drm_sched_fault(&ring->sched);
+ return;
+ }
}
break;
default:
@@ -6740,6 +6730,11 @@ static void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev,
break;
}
}
+
+ /* No KQ matched: HW slot is a MES-scheduled user queue. */
+ if (adev->enable_mes && doorbell_offset)
+ amdgpu_userq_process_reset_irq(adev, entry->pasid,
+ doorbell_offset);
}
static int gfx_v11_0_priv_reg_irq(struct amdgpu_device *adev,
@@ -6846,233 +6841,14 @@ static void gfx_v11_0_emit_mem_sync(struct amdgpu_ring *ring)
amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
}
-static bool gfx_v11_pipe_reset_support(struct amdgpu_device *adev)
-{
- /* Disable the pipe reset until the CPFW fully support it.*/
- dev_warn_once(adev->dev, "The CPFW hasn't support pipe reset yet.\n");
- return false;
-}
-
-
-static int gfx_v11_reset_gfx_pipe(struct amdgpu_ring *ring)
-{
- struct amdgpu_device *adev = ring->adev;
- uint32_t reset_pipe = 0, clean_pipe = 0;
- int r;
-
- if (!gfx_v11_pipe_reset_support(adev))
- return -EOPNOTSUPP;
-
- gfx_v11_0_set_safe_mode(adev, 0);
- mutex_lock(&adev->srbm_mutex);
- soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
-
- switch (ring->pipe) {
- case 0:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
- PFP_PIPE0_RESET, 1);
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
- ME_PIPE0_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
- PFP_PIPE0_RESET, 0);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
- ME_PIPE0_RESET, 0);
- break;
- case 1:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
- PFP_PIPE1_RESET, 1);
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
- ME_PIPE1_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
- PFP_PIPE1_RESET, 0);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
- ME_PIPE1_RESET, 0);
- break;
- default:
- break;
- }
-
- WREG32_SOC15(GC, 0, regCP_ME_CNTL, reset_pipe);
- WREG32_SOC15(GC, 0, regCP_ME_CNTL, clean_pipe);
-
- r = (RREG32(SOC15_REG_OFFSET(GC, 0, regCP_GFX_RS64_INSTR_PNTR1)) << 2) -
- RS64_FW_UC_START_ADDR_LO;
- soc21_grbm_select(adev, 0, 0, 0, 0);
- mutex_unlock(&adev->srbm_mutex);
- gfx_v11_0_unset_safe_mode(adev, 0);
-
- dev_info(adev->dev, "The ring %s pipe reset to the ME firmware start PC: %s\n", ring->name,
- r == 0 ? "successfully" : "failed");
- /* FIXME: Sometimes driver can't cache the ME firmware start PC correctly,
- * so the pipe reset status relies on the later gfx ring test result.
- */
- return 0;
-}
-
static int gfx_v11_0_reset_kgq(struct amdgpu_ring *ring,
unsigned int vmid,
struct amdgpu_fence *timedout_fence)
{
struct amdgpu_device *adev = ring->adev;
- bool use_mmio = false;
- int r;
+ bool use_mmio = adev->gfx.me.use_mmio_for_reset;
- amdgpu_ring_reset_helper_begin(ring, timedout_fence);
-
- r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, use_mmio, 0);
- if (r) {
-
- dev_warn(adev->dev, "reset via MES failed and try pipe reset %d\n", r);
- r = gfx_v11_reset_gfx_pipe(ring);
- if (r)
- return r;
- }
-
- if (use_mmio) {
- r = gfx_v11_0_kgq_init_queue(ring, true);
- if (r) {
- dev_err(adev->dev, "failed to init kgq\n");
- return r;
- }
-
- r = amdgpu_mes_map_legacy_queue(adev, ring, 0);
- if (r) {
- dev_err(adev->dev, "failed to remap kgq\n");
- return r;
- }
- }
-
- return amdgpu_ring_reset_helper_end(ring, timedout_fence);
-}
-
-static int gfx_v11_0_reset_compute_pipe(struct amdgpu_ring *ring)
-{
-
- struct amdgpu_device *adev = ring->adev;
- uint32_t reset_pipe = 0, clean_pipe = 0;
- int r;
-
- if (!gfx_v11_pipe_reset_support(adev))
- return -EOPNOTSUPP;
-
- gfx_v11_0_set_safe_mode(adev, 0);
- mutex_lock(&adev->srbm_mutex);
- soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
-
- reset_pipe = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL);
- clean_pipe = reset_pipe;
-
- if (adev->gfx.rs64_enable) {
-
- switch (ring->pipe) {
- case 0:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE0_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE0_RESET, 0);
- break;
- case 1:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE1_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE1_RESET, 0);
- break;
- case 2:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE2_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE2_RESET, 0);
- break;
- case 3:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE3_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE3_RESET, 0);
- break;
- default:
- break;
- }
- WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_pipe);
- WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_pipe);
- r = (RREG32_SOC15(GC, 0, regCP_MEC_RS64_INSTR_PNTR) << 2) -
- RS64_FW_UC_START_ADDR_LO;
- } else {
- if (ring->me == 1) {
- switch (ring->pipe) {
- case 0:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE0_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE0_RESET, 0);
- break;
- case 1:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE1_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE1_RESET, 0);
- break;
- case 2:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE2_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE2_RESET, 0);
- break;
- case 3:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE3_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE3_RESET, 0);
- break;
- default:
- break;
- }
- /* mec1 fw pc: CP_MEC1_INSTR_PNTR */
- } else {
- switch (ring->pipe) {
- case 0:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE0_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE0_RESET, 0);
- break;
- case 1:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE1_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE1_RESET, 0);
- break;
- case 2:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE2_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE2_RESET, 0);
- break;
- case 3:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE3_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME2_PIPE3_RESET, 0);
- break;
- default:
- break;
- }
- /* mec2 fw pc: CP:CP_MEC2_INSTR_PNTR */
- }
- WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_pipe);
- WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_pipe);
- r = RREG32(SOC15_REG_OFFSET(GC, 0, regCP_MEC1_INSTR_PNTR));
- }
-
- soc21_grbm_select(adev, 0, 0, 0, 0);
- mutex_unlock(&adev->srbm_mutex);
- gfx_v11_0_unset_safe_mode(adev, 0);
-
- dev_info(adev->dev, "The ring %s pipe resets to MEC FW start PC: %s\n", ring->name,
- r == 0 ? "successfully" : "failed");
- /*FIXME:Sometimes driver can't cache the MEC firmware start PC correctly, so the pipe
- * reset status relies on the compute ring test result.
- */
- return 0;
+ return amdgpu_gfx_mes_reset_queue(ring, vmid, timedout_fence, use_mmio);
}
static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring,
@@ -7080,30 +6856,8 @@ static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring,
struct amdgpu_fence *timedout_fence)
{
struct amdgpu_device *adev = ring->adev;
- int r = 0;
-
- amdgpu_ring_reset_helper_begin(ring, timedout_fence);
-
- r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true, 0);
- if (r) {
- dev_warn(adev->dev, "fail(%d) to reset kcq and try pipe reset\n", r);
- r = gfx_v11_0_reset_compute_pipe(ring);
- if (r)
- return r;
- }
-
- r = gfx_v11_0_kcq_init_queue(ring, true);
- if (r) {
- dev_err(adev->dev, "fail to init kcq\n");
- return r;
- }
- r = amdgpu_mes_map_legacy_queue(adev, ring, 0);
- if (r) {
- dev_err(adev->dev, "failed to remap kcq\n");
- return r;
- }
- return amdgpu_ring_reset_helper_end(ring, timedout_fence);
+ return amdgpu_gfx_reset_mes_compute(adev, ring, timedout_fence, NULL, NULL, NULL);
}
static void gfx_v11_ip_print(struct amdgpu_ip_block *ip_block, struct drm_printer *p)
@@ -7281,8 +7035,6 @@ static const struct amd_ip_funcs gfx_v11_0_ip_funcs = {
.is_idle = gfx_v11_0_is_idle,
.wait_for_idle = gfx_v11_0_wait_for_idle,
.soft_reset = gfx_v11_0_soft_reset,
- .check_soft_reset = gfx_v11_0_check_soft_reset,
- .post_soft_reset = gfx_v11_0_post_soft_reset,
.set_clockgating_state = gfx_v11_0_set_clockgating_state,
.set_powergating_state = gfx_v11_0_set_powergating_state,
.get_clockgating_state = gfx_v11_0_get_clockgating_state,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index da668a8d6abd..c765af54669c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -1603,6 +1603,11 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
if (r)
return r;
+ adev->gfx.me.use_mmio_for_reset = false;
+ adev->gfx.mec.use_mmio_for_reset = true;
+
+ mutex_init(&adev->gfx.mec.reset_mutex);
+
return 0;
}
@@ -3071,13 +3076,13 @@ static int gfx_v12_0_gfx_mqd_init(struct amdgpu_device *adev, void *m,
return 0;
}
-static int gfx_v12_0_kgq_init_queue(struct amdgpu_ring *ring, bool reset)
+static int gfx_v12_0_kgq_init_queue(struct amdgpu_ring *ring)
{
struct amdgpu_device *adev = ring->adev;
struct v12_gfx_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.gfx_ring[0];
- if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) {
+ if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(*mqd));
mutex_lock(&adev->srbm_mutex);
soc24_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
@@ -3104,7 +3109,7 @@ static int gfx_v12_0_cp_async_gfx_ring_resume(struct amdgpu_device *adev)
int i, r;
for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
- r = gfx_v12_0_kgq_init_queue(&adev->gfx.gfx_ring[i], false);
+ r = gfx_v12_0_kgq_init_queue(&adev->gfx.gfx_ring[i]);
if (r)
return r;
}
@@ -3441,13 +3446,13 @@ static int gfx_v12_0_kiq_init_queue(struct amdgpu_ring *ring)
return 0;
}
-static int gfx_v12_0_kcq_init_queue(struct amdgpu_ring *ring, bool reset)
+static int gfx_v12_0_kcq_init_queue(struct amdgpu_ring *ring)
{
struct amdgpu_device *adev = ring->adev;
struct v12_compute_mqd *mqd = ring->mqd_ptr;
int mqd_idx = ring - &adev->gfx.compute_ring[0];
- if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) {
+ if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
memset((void *)mqd, 0, sizeof(*mqd));
mutex_lock(&adev->srbm_mutex);
soc24_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
@@ -3485,7 +3490,7 @@ static int gfx_v12_0_kcq_resume(struct amdgpu_device *adev)
gfx_v12_0_cp_compute_enable(adev, true);
for (i = 0; i < adev->gfx.num_compute_rings; i++) {
- r = gfx_v12_0_kcq_init_queue(&adev->gfx.compute_ring[i], false);
+ r = gfx_v12_0_kcq_init_queue(&adev->gfx.compute_ring[i]);
if (r)
return r;
}
@@ -3986,6 +3991,8 @@ static int gfx_v12_0_early_init(struct amdgpu_ip_block *ip_block)
gfx_v12_0_init_rlcg_reg_access_ctrl(adev);
+ amdgpu_init_rlc_reg_funcs(adev);
+
return gfx_v12_0_init_microcode(adev);
}
@@ -5025,22 +5032,30 @@ static int gfx_v12_0_set_priv_inst_fault_state(struct amdgpu_device *adev,
static void gfx_v12_0_handle_priv_fault(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
- u8 me_id, pipe_id, queue_id;
- struct amdgpu_ring *ring;
- int i;
-
- me_id = (entry->ring_id & 0x0c) >> 2;
- pipe_id = (entry->ring_id & 0x03) >> 0;
- queue_id = (entry->ring_id & 0x70) >> 4;
+ u32 doorbell_offset = entry->src_data[0] & AMDGPU_CTXID0_DOORBELL_ID_MASK;
+ /*
+ * Try KQ first by ring_id; UQ as fallback. KCQ and UQ never share
+ * a HW slot (compute_hqd_mask contract).
+ */
if (!adev->gfx.disable_kq) {
+ u8 me_id, pipe_id, queue_id;
+ struct amdgpu_ring *ring;
+ int i;
+
+ me_id = (entry->ring_id & 0x0c) >> 2;
+ pipe_id = (entry->ring_id & 0x03) >> 0;
+ queue_id = (entry->ring_id & 0x70) >> 4;
+
switch (me_id) {
case 0:
for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
ring = &adev->gfx.gfx_ring[i];
if (ring->me == me_id && ring->pipe == pipe_id &&
- ring->queue == queue_id)
+ ring->queue == queue_id) {
drm_sched_fault(&ring->sched);
+ return;
+ }
}
break;
case 1:
@@ -5048,8 +5063,10 @@ static void gfx_v12_0_handle_priv_fault(struct amdgpu_device *adev,
for (i = 0; i < adev->gfx.num_compute_rings; i++) {
ring = &adev->gfx.compute_ring[i];
if (ring->me == me_id && ring->pipe == pipe_id &&
- ring->queue == queue_id)
+ ring->queue == queue_id) {
drm_sched_fault(&ring->sched);
+ return;
+ }
}
break;
default:
@@ -5057,6 +5074,11 @@ static void gfx_v12_0_handle_priv_fault(struct amdgpu_device *adev,
break;
}
}
+
+ /* No KQ matched: HW slot is a MES-scheduled user queue. */
+ if (adev->enable_mes && doorbell_offset)
+ amdgpu_userq_process_reset_irq(adev, entry->pasid,
+ doorbell_offset);
}
static int gfx_v12_0_priv_reg_irq(struct amdgpu_device *adev,
@@ -5261,185 +5283,14 @@ static void gfx_v12_ip_dump(struct amdgpu_ip_block *ip_block)
amdgpu_gfx_off_ctrl(adev, true);
}
-static bool gfx_v12_pipe_reset_support(struct amdgpu_device *adev)
-{
- /* Disable the pipe reset until the CPFW fully support it.*/
- dev_warn_once(adev->dev, "The CPFW hasn't support pipe reset yet.\n");
- return false;
-}
-
-static int gfx_v12_reset_gfx_pipe(struct amdgpu_ring *ring)
-{
- struct amdgpu_device *adev = ring->adev;
- uint32_t reset_pipe = 0, clean_pipe = 0;
- int r;
-
- if (!gfx_v12_pipe_reset_support(adev))
- return -EOPNOTSUPP;
-
- gfx_v12_0_set_safe_mode(adev, 0);
- mutex_lock(&adev->srbm_mutex);
- soc24_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
-
- switch (ring->pipe) {
- case 0:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
- PFP_PIPE0_RESET, 1);
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
- ME_PIPE0_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
- PFP_PIPE0_RESET, 0);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
- ME_PIPE0_RESET, 0);
- break;
- case 1:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
- PFP_PIPE1_RESET, 1);
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
- ME_PIPE1_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
- PFP_PIPE1_RESET, 0);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
- ME_PIPE1_RESET, 0);
- break;
- default:
- break;
- }
-
- WREG32_SOC15(GC, 0, regCP_ME_CNTL, reset_pipe);
- WREG32_SOC15(GC, 0, regCP_ME_CNTL, clean_pipe);
-
- r = (RREG32(SOC15_REG_OFFSET(GC, 0, regCP_GFX_RS64_INSTR_PNTR1)) << 2) -
- RS64_FW_UC_START_ADDR_LO;
- soc24_grbm_select(adev, 0, 0, 0, 0);
- mutex_unlock(&adev->srbm_mutex);
- gfx_v12_0_unset_safe_mode(adev, 0);
-
- dev_info(adev->dev, "The ring %s pipe reset: %s\n", ring->name,
- r == 0 ? "successfully" : "failed");
- /* Sometimes the ME start pc counter can't cache correctly, so the
- * PC check only as a reference and pipe reset result rely on the
- * later ring test.
- */
- return 0;
-}
-
static int gfx_v12_0_reset_kgq(struct amdgpu_ring *ring,
unsigned int vmid,
struct amdgpu_fence *timedout_fence)
{
struct amdgpu_device *adev = ring->adev;
- bool use_mmio = false;
- int r;
-
- amdgpu_ring_reset_helper_begin(ring, timedout_fence);
-
- r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, use_mmio, 0);
- if (r) {
- dev_warn(adev->dev, "reset via MES failed and try pipe reset %d\n", r);
- r = gfx_v12_reset_gfx_pipe(ring);
- if (r)
- return r;
- }
-
- if (use_mmio) {
- r = gfx_v12_0_kgq_init_queue(ring, true);
- if (r) {
- dev_err(adev->dev, "failed to init kgq\n");
- return r;
- }
-
- r = amdgpu_mes_map_legacy_queue(adev, ring, 0);
- if (r) {
- dev_err(adev->dev, "failed to remap kgq\n");
- return r;
- }
- }
-
- return amdgpu_ring_reset_helper_end(ring, timedout_fence);
-}
-
-static int gfx_v12_0_reset_compute_pipe(struct amdgpu_ring *ring)
-{
- struct amdgpu_device *adev = ring->adev;
- uint32_t reset_pipe = 0, clean_pipe = 0;
- int r = 0;
-
- if (!gfx_v12_pipe_reset_support(adev))
- return -EOPNOTSUPP;
-
- gfx_v12_0_set_safe_mode(adev, 0);
- mutex_lock(&adev->srbm_mutex);
- soc24_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
-
- reset_pipe = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL);
- clean_pipe = reset_pipe;
+ bool use_mmio = adev->gfx.me.use_mmio_for_reset;
- if (adev->gfx.rs64_enable) {
- switch (ring->pipe) {
- case 0:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE0_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE0_RESET, 0);
- break;
- case 1:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE1_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE1_RESET, 0);
- break;
- case 2:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE2_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE2_RESET, 0);
- break;
- case 3:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE3_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
- MEC_PIPE3_RESET, 0);
- break;
- default:
- break;
- }
- WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_pipe);
- WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_pipe);
- r = (RREG32_SOC15(GC, 0, regCP_MEC_RS64_INSTR_PNTR) << 2) -
- RS64_FW_UC_START_ADDR_LO;
- } else {
- switch (ring->pipe) {
- case 0:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE0_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE0_RESET, 0);
- break;
- case 1:
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE1_RESET, 1);
- clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
- MEC_ME1_PIPE1_RESET, 0);
- break;
- default:
- break;
- }
- WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_pipe);
- WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_pipe);
- /* Doesn't find the F32 MEC instruction pointer register, and suppose
- * the driver won't run into the F32 mode.
- */
- }
-
- soc24_grbm_select(adev, 0, 0, 0, 0);
- mutex_unlock(&adev->srbm_mutex);
- gfx_v12_0_unset_safe_mode(adev, 0);
-
- dev_info(adev->dev, "The ring %s pipe resets: %s\n", ring->name,
- r == 0 ? "successfully" : "failed");
- /* Need the ring test to verify the pipe reset result.*/
- return 0;
+ return amdgpu_gfx_mes_reset_queue(ring, vmid, timedout_fence, use_mmio);
}
static int gfx_v12_0_reset_kcq(struct amdgpu_ring *ring,
@@ -5447,30 +5298,8 @@ static int gfx_v12_0_reset_kcq(struct amdgpu_ring *ring,
struct amdgpu_fence *timedout_fence)
{
struct amdgpu_device *adev = ring->adev;
- int r;
-
- amdgpu_ring_reset_helper_begin(ring, timedout_fence);
-
- r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true, 0);
- if (r) {
- dev_warn(adev->dev, "fail(%d) to reset kcq and try pipe reset\n", r);
- r = gfx_v12_0_reset_compute_pipe(ring);
- if (r)
- return r;
- }
-
- r = gfx_v12_0_kcq_init_queue(ring, true);
- if (r) {
- dev_err(adev->dev, "failed to init kcq\n");
- return r;
- }
- r = amdgpu_mes_map_legacy_queue(adev, ring, 0);
- if (r) {
- dev_err(adev->dev, "failed to remap kcq\n");
- return r;
- }
- return amdgpu_ring_reset_helper_end(ring, timedout_fence);
+ return amdgpu_gfx_reset_mes_compute(adev, ring, timedout_fence, NULL, NULL, NULL);
}
static void gfx_v12_0_ring_begin_use(struct amdgpu_ring *ring)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
index e7e9f11b9754..e87f1baf5cb6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
@@ -1287,6 +1287,8 @@ static int gfx_v12_1_sw_init(struct amdgpu_ip_block *ip_block)
if (r)
return r;
+ mutex_init(&adev->gfx.mec.reset_mutex);
+
return 0;
}
@@ -3004,6 +3006,8 @@ static int gfx_v12_1_early_init(struct amdgpu_ip_block *ip_block)
gfx_v12_1_init_rlcg_reg_access_ctrl(adev);
+ amdgpu_init_rlc_reg_funcs(adev);
+
return gfx_v12_1_init_microcode(adev);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 70ba81e6b4d4..bee2ff6865f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1487,7 +1487,14 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
/* bail if the compute ring is not ready */
if (!ring->sched.ready)
- return 0;
+ return -EBUSY;
+
+ if (amdgpu_in_reset(adev)) {
+ /* Set preempt condition to execute IB */
+ amdgpu_ring_set_preempt_cond_exec(ring, true);
+ /* Flush HDP cache so the GPU can see the updated COND_EXEC value */
+ amdgpu_device_flush_hdp(adev, NULL);
+ }
tmp = RREG32(mmGB_EDC_MODE);
WREG32(mmGB_EDC_MODE, 0);
@@ -2028,6 +2035,11 @@ static int gfx_v8_0_sw_init(struct amdgpu_ip_block *ip_block)
adev->gfx.compute_supported_reset =
amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
+ if (!amdgpu_sriov_vf(adev) && !adev->debug_disable_ip_block_soft_reset) {
+ adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_IP_BLOCK_SOFT_RESET;
+ adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_IP_BLOCK_SOFT_RESET;
+ }
+
return 0;
}
@@ -4703,12 +4715,14 @@ static int gfx_v8_0_cp_test_all_rings(struct amdgpu_device *adev)
if (r)
return r;
+ r = 0;
+
for (i = 0; i < adev->gfx.num_compute_rings; i++) {
ring = &adev->gfx.compute_ring[i];
- amdgpu_ring_test_helper(ring);
+ r |= amdgpu_ring_test_helper(ring);
}
- return 0;
+ return r;
}
static int gfx_v8_0_cp_resume(struct amdgpu_device *adev)
@@ -4868,14 +4882,12 @@ static int gfx_v8_0_hw_fini(struct amdgpu_ip_block *ip_block)
}
amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
- if (!gfx_v8_0_wait_for_idle(ip_block))
- gfx_v8_0_cp_enable(adev, false);
- else
+ if (!amdgpu_in_reset(adev) && gfx_v8_0_wait_for_idle(ip_block))
pr_err("cp is busy, skip halt cp\n");
- if (!gfx_v8_0_wait_for_rlc_idle(adev))
- adev->gfx.rlc.funcs->stop(adev);
- else
- pr_err("rlc is busy, skip halt rlc\n");
+ if (!amdgpu_in_reset(adev) && gfx_v8_0_wait_for_rlc_idle(adev))
+ pr_err("rlc is busy\n");
+ gfx_v8_0_cp_enable(adev, false);
+ adev->gfx.rlc.funcs->stop(adev);
amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
return 0;
@@ -4891,128 +4903,49 @@ static int gfx_v8_0_resume(struct amdgpu_ip_block *ip_block)
return gfx_v8_0_hw_init(ip_block);
}
-static bool gfx_v8_0_check_soft_reset(struct amdgpu_ip_block *ip_block)
+static int gfx_v8_0_soft_reset(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
u32 grbm_soft_reset = 0, srbm_soft_reset = 0;
u32 tmp;
+ int i;
+ int r;
- /* GRBM_STATUS */
- tmp = RREG32(mmGRBM_STATUS);
- if (tmp & (GRBM_STATUS__PA_BUSY_MASK | GRBM_STATUS__SC_BUSY_MASK |
- GRBM_STATUS__BCI_BUSY_MASK | GRBM_STATUS__SX_BUSY_MASK |
- GRBM_STATUS__TA_BUSY_MASK | GRBM_STATUS__VGT_BUSY_MASK |
- GRBM_STATUS__DB_BUSY_MASK | GRBM_STATUS__CB_BUSY_MASK |
- GRBM_STATUS__GDS_BUSY_MASK | GRBM_STATUS__SPI_BUSY_MASK |
- GRBM_STATUS__IA_BUSY_MASK | GRBM_STATUS__IA_BUSY_NO_DMA_MASK |
- GRBM_STATUS__CP_BUSY_MASK | GRBM_STATUS__CP_COHERENCY_BUSY_MASK)) {
- grbm_soft_reset = REG_SET_FIELD(grbm_soft_reset,
- GRBM_SOFT_RESET, SOFT_RESET_CP, 1);
- grbm_soft_reset = REG_SET_FIELD(grbm_soft_reset,
- GRBM_SOFT_RESET, SOFT_RESET_GFX, 1);
- srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset,
- SRBM_SOFT_RESET, SOFT_RESET_GRBM, 1);
- }
-
- /* GRBM_STATUS2 */
- tmp = RREG32(mmGRBM_STATUS2);
- if (REG_GET_FIELD(tmp, GRBM_STATUS2, RLC_BUSY))
- grbm_soft_reset = REG_SET_FIELD(grbm_soft_reset,
- GRBM_SOFT_RESET, SOFT_RESET_RLC, 1);
-
- if (REG_GET_FIELD(tmp, GRBM_STATUS2, CPF_BUSY) ||
- REG_GET_FIELD(tmp, GRBM_STATUS2, CPC_BUSY) ||
- REG_GET_FIELD(tmp, GRBM_STATUS2, CPG_BUSY)) {
- grbm_soft_reset = REG_SET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET,
- SOFT_RESET_CPF, 1);
- grbm_soft_reset = REG_SET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET,
- SOFT_RESET_CPC, 1);
- grbm_soft_reset = REG_SET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET,
- SOFT_RESET_CPG, 1);
- srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET,
- SOFT_RESET_GRBM, 1);
- }
-
- /* SRBM_STATUS */
- tmp = RREG32(mmSRBM_STATUS);
- if (REG_GET_FIELD(tmp, SRBM_STATUS, GRBM_RQ_PENDING))
- srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset,
- SRBM_SOFT_RESET, SOFT_RESET_GRBM, 1);
- if (REG_GET_FIELD(tmp, SRBM_STATUS, SEM_BUSY))
- srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset,
- SRBM_SOFT_RESET, SOFT_RESET_SEM, 1);
-
- if (grbm_soft_reset || srbm_soft_reset) {
- adev->gfx.grbm_soft_reset = grbm_soft_reset;
- adev->gfx.srbm_soft_reset = srbm_soft_reset;
- return true;
- } else {
- adev->gfx.grbm_soft_reset = 0;
- adev->gfx.srbm_soft_reset = 0;
- return false;
- }
-}
-
-static int gfx_v8_0_pre_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
- u32 grbm_soft_reset = 0;
-
- if ((!adev->gfx.grbm_soft_reset) &&
- (!adev->gfx.srbm_soft_reset))
- return 0;
-
- grbm_soft_reset = adev->gfx.grbm_soft_reset;
-
- /* stop the rlc */
- adev->gfx.rlc.funcs->stop(adev);
+ grbm_soft_reset =
+ REG_SET_FIELD(0, GRBM_SOFT_RESET, SOFT_RESET_RLC, 1) |
+ REG_SET_FIELD(0, GRBM_SOFT_RESET, SOFT_RESET_GFX, 1) |
+ REG_SET_FIELD(0, GRBM_SOFT_RESET, SOFT_RESET_CP, 1) |
+ REG_SET_FIELD(0, GRBM_SOFT_RESET, SOFT_RESET_CPF, 1) |
+ REG_SET_FIELD(0, GRBM_SOFT_RESET, SOFT_RESET_CPC, 1) |
+ REG_SET_FIELD(0, GRBM_SOFT_RESET, SOFT_RESET_CPG, 1);
- if (REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CP) ||
- REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_GFX))
- /* Disable GFX parsing/prefetching */
- gfx_v8_0_cp_gfx_enable(adev, false);
+ srbm_soft_reset =
+ REG_SET_FIELD(0, SRBM_SOFT_RESET, SOFT_RESET_GRBM, 1) |
+ REG_SET_FIELD(0, SRBM_SOFT_RESET, SOFT_RESET_SEM, 1);
- if (REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CP) ||
- REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CPF) ||
- REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CPC) ||
- REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CPG)) {
- int i;
+ for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+ struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
- for (i = 0; i < adev->gfx.num_compute_rings; i++) {
- struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
+ mutex_lock(&adev->srbm_mutex);
+ vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+ gfx_v8_0_deactivate_hqd(adev, 2);
+ vi_srbm_select(adev, 0, 0, 0, 0);
+ mutex_unlock(&adev->srbm_mutex);
- mutex_lock(&adev->srbm_mutex);
- vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
- gfx_v8_0_deactivate_hqd(adev, 2);
- vi_srbm_select(adev, 0, 0, 0, 0);
- mutex_unlock(&adev->srbm_mutex);
- }
- /* Disable MEC parsing/prefetching */
- gfx_v8_0_cp_compute_enable(adev, false);
+ udelay(50);
}
- return 0;
-}
-
-static int gfx_v8_0_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
- u32 grbm_soft_reset = 0, srbm_soft_reset = 0;
- u32 tmp;
-
- if ((!adev->gfx.grbm_soft_reset) &&
- (!adev->gfx.srbm_soft_reset))
- return 0;
-
- grbm_soft_reset = adev->gfx.grbm_soft_reset;
- srbm_soft_reset = adev->gfx.srbm_soft_reset;
+ ip_block->version->funcs->set_clockgating_state(ip_block, AMD_CG_STATE_UNGATE);
+ ip_block->version->funcs->set_powergating_state(ip_block, AMD_PG_STATE_UNGATE);
+ ip_block->version->funcs->suspend(ip_block);
if (grbm_soft_reset || srbm_soft_reset) {
tmp = RREG32(mmGMCON_DEBUG);
tmp = REG_SET_FIELD(tmp, GMCON_DEBUG, GFX_STALL, 1);
tmp = REG_SET_FIELD(tmp, GMCON_DEBUG, GFX_CLEAR, 1);
WREG32(mmGMCON_DEBUG, tmp);
- udelay(50);
+
+ udelay(100);
}
if (grbm_soft_reset) {
@@ -5022,11 +4955,13 @@ static int gfx_v8_0_soft_reset(struct amdgpu_ip_block *ip_block)
WREG32(mmGRBM_SOFT_RESET, tmp);
tmp = RREG32(mmGRBM_SOFT_RESET);
- udelay(50);
+ udelay(100);
tmp &= ~grbm_soft_reset;
WREG32(mmGRBM_SOFT_RESET, tmp);
tmp = RREG32(mmGRBM_SOFT_RESET);
+
+ udelay(100);
}
if (srbm_soft_reset) {
@@ -5036,11 +4971,13 @@ static int gfx_v8_0_soft_reset(struct amdgpu_ip_block *ip_block)
WREG32(mmSRBM_SOFT_RESET, tmp);
tmp = RREG32(mmSRBM_SOFT_RESET);
- udelay(50);
+ udelay(100);
tmp &= ~srbm_soft_reset;
WREG32(mmSRBM_SOFT_RESET, tmp);
tmp = RREG32(mmSRBM_SOFT_RESET);
+
+ udelay(100);
}
if (grbm_soft_reset || srbm_soft_reset) {
@@ -5051,48 +4988,15 @@ static int gfx_v8_0_soft_reset(struct amdgpu_ip_block *ip_block)
}
/* Wait a little for things to settle down */
- udelay(50);
+ udelay(100);
- return 0;
-}
-
-static int gfx_v8_0_post_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
- u32 grbm_soft_reset = 0;
-
- if ((!adev->gfx.grbm_soft_reset) &&
- (!adev->gfx.srbm_soft_reset))
- return 0;
-
- grbm_soft_reset = adev->gfx.grbm_soft_reset;
-
- if (REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CP) ||
- REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CPF) ||
- REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CPC) ||
- REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CPG)) {
- int i;
-
- for (i = 0; i < adev->gfx.num_compute_rings; i++) {
- struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
-
- mutex_lock(&adev->srbm_mutex);
- vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
- gfx_v8_0_deactivate_hqd(adev, 2);
- vi_srbm_select(adev, 0, 0, 0, 0);
- mutex_unlock(&adev->srbm_mutex);
- }
- gfx_v8_0_kiq_resume(adev);
- gfx_v8_0_kcq_resume(adev);
- }
-
- if (REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CP) ||
- REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_GFX))
- gfx_v8_0_cp_gfx_resume(adev);
+ r = ip_block->version->funcs->resume(ip_block);
+ r |= ip_block->version->funcs->late_init(ip_block);
+ if (r)
+ return r;
- gfx_v8_0_cp_test_all_rings(adev);
-
- adev->gfx.rlc.funcs->start(adev);
+ ip_block->version->funcs->set_clockgating_state(ip_block, AMD_CG_STATE_GATE);
+ ip_block->version->funcs->set_powergating_state(ip_block, AMD_PG_STATE_GATE);
return 0;
}
@@ -6859,10 +6763,7 @@ static const struct amd_ip_funcs gfx_v8_0_ip_funcs = {
.resume = gfx_v8_0_resume,
.is_idle = gfx_v8_0_is_idle,
.wait_for_idle = gfx_v8_0_wait_for_idle,
- .check_soft_reset = gfx_v8_0_check_soft_reset,
- .pre_soft_reset = gfx_v8_0_pre_soft_reset,
.soft_reset = gfx_v8_0_soft_reset,
- .post_soft_reset = gfx_v8_0_post_soft_reset,
.set_clockgating_state = gfx_v8_0_set_clockgating_state,
.set_powergating_state = gfx_v8_0_set_powergating_state,
.get_clockgating_state = gfx_v8_0_get_clockgating_state,
@@ -6923,10 +6824,12 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
.get_wptr = gfx_v8_0_ring_get_wptr_compute,
.set_wptr = gfx_v8_0_ring_set_wptr_compute,
.emit_frame_size =
+ 5 + /* gfx_v8_0_ring_emit_init_cond_exec (from amdgpu_ib_schedule) */
20 + /* gfx_v8_0_ring_emit_gds_switch */
7 + /* gfx_v8_0_ring_emit_hdp_flush */
5 + /* hdp_invalidate */
7 + /* gfx_v8_0_ring_emit_pipeline_sync */
+ 5 + /* gfx_v8_0_ring_emit_init_cond_exec (from amdgpu_vm_flush) */
VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
7 + 7 + 7 + /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
7 + /* gfx_v8_0_emit_mem_sync_compute */
@@ -6947,6 +6850,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
.soft_recovery = gfx_v8_0_ring_soft_recovery,
.emit_mem_sync = gfx_v8_0_emit_mem_sync_compute,
.emit_wave_limit = gfx_v8_0_emit_wave_limit,
+ .init_cond_exec = gfx_v8_0_ring_emit_init_cond_exec,
};
static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 3370f542e990..9f81fd715418 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4875,6 +4875,8 @@ static int gfx_v9_0_early_init(struct amdgpu_ip_block *ip_block)
/* init rlcg reg access ctrl */
gfx_v9_0_init_rlcg_reg_access_ctrl(adev);
+ amdgpu_init_rlc_reg_funcs(adev);
+
return gfx_v9_0_init_microcode(adev);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 2a36647b975a..b89cbc2df951 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -39,7 +39,6 @@
#include "gfx_v9_4_3.h"
#include "gfx_v9_4_3_cleaner_shader.h"
#include "amdgpu_xcp.h"
-#include "amdgpu_aca.h"
MODULE_FIRMWARE("amdgpu/gc_9_4_3_mec.bin");
MODULE_FIRMWARE("amdgpu/gc_9_4_4_mec.bin");
@@ -851,73 +850,6 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_3_gfx_funcs = {
.get_hdp_flush_mask = &amdgpu_gfx_get_hdp_flush_mask,
};
-static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
- struct aca_bank *bank, enum aca_smu_type type,
- void *data)
-{
- struct aca_bank_info info;
- u64 misc0;
- u32 instlo;
- int ret;
-
- ret = aca_bank_info_decode(bank, &info);
- if (ret)
- return ret;
-
- /* NOTE: overwrite info.die_id with xcd id for gfx */
- instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
- instlo &= GENMASK(31, 1);
- info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1;
-
- misc0 = bank->regs[ACA_REG_IDX_MISC0];
-
- switch (type) {
- case ACA_SMU_TYPE_UE:
- bank->aca_err_type = ACA_ERROR_TYPE_UE;
- ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, 1ULL);
- break;
- case ACA_SMU_TYPE_CE:
- bank->aca_err_type = ACA_ERROR_TYPE_CE;
- ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
- ACA_REG__MISC0__ERRCNT(misc0));
- break;
- default:
- return -EINVAL;
- }
-
- return ret;
-}
-
-static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- u32 instlo;
-
- instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
- instlo &= GENMASK(31, 1);
- switch (instlo) {
- case mmSMNAID_XCD0_MCA_SMU:
- case mmSMNAID_XCD1_MCA_SMU:
- case mmSMNXCD_XCD0_MCA_SMU:
- return true;
- default:
- break;
- }
-
- return false;
-}
-
-static const struct aca_bank_ops gfx_v9_4_3_aca_bank_ops = {
- .aca_bank_parser = gfx_v9_4_3_aca_bank_parser,
- .aca_bank_is_valid = gfx_v9_4_3_aca_bank_is_valid,
-};
-
-static const struct aca_info gfx_v9_4_3_aca_info = {
- .hwip = ACA_HWIP_TYPE_SMU,
- .mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK,
- .bank_ops = &gfx_v9_4_3_aca_bank_ops,
-};
-
static int gfx_v9_4_3_gpu_early_init(struct amdgpu_device *adev)
{
adev->gfx.funcs = &gfx_v9_4_3_gfx_funcs;
@@ -1107,22 +1039,24 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
/* set up the compute queues - allocate horizontally across pipes */
for (xcc_id = 0; xcc_id < num_xcc; xcc_id++) {
ring_id = 0;
- for (i = 0; i < adev->gfx.mec.num_mec; ++i) {
- for (j = 0; j < adev->gfx.mec.num_queue_per_pipe; j++) {
- for (k = 0; k < adev->gfx.mec.num_pipe_per_mec;
- k++) {
- if (!amdgpu_gfx_is_mec_queue_enabled(
- adev, xcc_id, i, k, j))
- continue;
-
- r = gfx_v9_4_3_compute_ring_init(adev,
- ring_id,
- xcc_id,
- i, k, j);
- if (r)
- return r;
-
- ring_id++;
+ if (!adev->gfx.disable_kq) {
+ for (i = 0; i < adev->gfx.mec.num_mec; ++i) {
+ for (j = 0; j < adev->gfx.mec.num_queue_per_pipe; j++) {
+ for (k = 0; k < adev->gfx.mec.num_pipe_per_mec;
+ k++) {
+ if (!amdgpu_gfx_is_mec_queue_enabled(
+ adev, xcc_id, i, k, j))
+ continue;
+
+ r = gfx_v9_4_3_compute_ring_init(adev,
+ ring_id,
+ xcc_id,
+ i, k, j);
+ if (r)
+ return r;
+
+ ring_id++;
+ }
}
}
}
@@ -2350,6 +2284,65 @@ static void gfx_v9_4_3_xcc_fini(struct amdgpu_device *adev, int xcc_id)
gfx_v9_4_3_xcc_cp_compute_enable(adev, false, xcc_id);
}
+/**
+ * gfx_v9_4_3_set_userq_eop_interrupts - get or put the compute EOP IRQ references
+ * @adev: amdgpu device
+ * @enable: true to call amdgpu_irq_get(), false to call amdgpu_irq_put()
+ *
+ * Does nothing (and returns 0) unless adev->gfx.disable_kq is set.
+ * Otherwise walks every (XCC, MEC, pipe) combination and gets or puts the
+ * irq type derived from AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP on
+ * adev->gfx.eop_irq. The irq type is computed from the MEC and pipe index
+ * only, so the same type is referenced once per XCC.
+ *
+ * If a get fails, every reference taken so far by this call is dropped
+ * again before the error is returned. If a put fails, the error is returned
+ * immediately without touching the remaining entries.
+ *
+ * Return: 0 on success, otherwise the error returned by
+ * amdgpu_irq_get() / amdgpu_irq_put().
+ */
+static int gfx_v9_4_3_set_userq_eop_interrupts(struct amdgpu_device *adev,
+					       bool enable)
+{
+	int num_xcc = NUM_XCC(adev->gfx.xcc_mask);
+	unsigned int irq_type;
+	int m, p, xcc_id, r;
+
+	if (adev->gfx.disable_kq) {
+		for (xcc_id = 0; xcc_id < num_xcc; xcc_id++) {
+			for (m = 0; m < adev->gfx.mec.num_mec; ++m) {
+				for (p = 0; p < adev->gfx.mec.num_pipe_per_mec; p++) {
+					irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
+						+ (m * adev->gfx.mec.num_pipe_per_mec)
+						+ p;
+
+					if (enable)
+						r = amdgpu_irq_get(adev, &adev->gfx.eop_irq,
+								   irq_type);
+					else
+						r = amdgpu_irq_put(adev, &adev->gfx.eop_irq,
+								   irq_type);
+					if (r) {
+						/* Nothing to roll back when disabling. */
+						if (!enable)
+							return r;
+						goto err_compute;
+					}
+				}
+			}
+		}
+	}
+
+	return 0;
+
+err_compute:
+	/*
+	 * Unwind in reverse order. xcc_id, m and p still hold the indices of
+	 * the get that failed, so that entry itself is not released.
+	 */
+
+	/* Pipes already taken on the failing MEC of the failing XCC. */
+	for (p--; p >= 0; p--) {
+		irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
+			+ (m * adev->gfx.mec.num_pipe_per_mec) + p;
+		amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type);
+	}
+	/* All pipes of the earlier MECs on the failing XCC. */
+	for (m--; m >= 0; m--) {
+		for (p = adev->gfx.mec.num_pipe_per_mec - 1; p >= 0; p--) {
+			irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
+				+ (m * adev->gfx.mec.num_pipe_per_mec) + p;
+			amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type);
+		}
+	}
+	/* Every MEC/pipe of the XCCs that were completed before the failure. */
+	for (xcc_id--; xcc_id >= 0; xcc_id--) {
+		/* Descending loop: must test m >= 0, not m <= 0. */
+		for (m = adev->gfx.mec.num_mec - 1; m >= 0; m--) {
+			for (p = adev->gfx.mec.num_pipe_per_mec - 1; p >= 0; p--) {
+				irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
+					+ (m * adev->gfx.mec.num_pipe_per_mec) + p;
+				amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type);
+			}
+		}
+	}
+
+	return r;
+}
+
static int gfx_v9_4_3_hw_init(struct amdgpu_ip_block *ip_block)
{
int r;
@@ -2382,9 +2375,14 @@ static int gfx_v9_4_3_hw_init(struct amdgpu_ip_block *ip_block)
r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0);
if (r)
goto err_bad_op;
+ r = gfx_v9_4_3_set_userq_eop_interrupts(adev, true);
+ if (r)
+ goto err_bad_eop;
return 0;
+err_bad_eop:
+ amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0);
err_bad_op:
amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
err_priv_inst:
@@ -2467,6 +2465,7 @@ static int gfx_v9_4_3_hw_fini(struct amdgpu_ip_block *ip_block)
amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
+ gfx_v9_4_3_set_userq_eop_interrupts(adev, false);
num_xcc = NUM_XCC(adev->gfx.xcc_mask);
for (i = 0; i < num_xcc; i++) {
@@ -2612,8 +2611,24 @@ static int gfx_v9_4_3_early_init(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
- adev->gfx.num_compute_rings = min(amdgpu_gfx_get_num_kcq(adev),
- AMDGPU_MAX_COMPUTE_RINGS);
+ switch (amdgpu_user_queue) {
+ case -1:
+ case 0:
+ default:
+ adev->gfx.disable_kq = false;
+ adev->gfx.disable_uq = true;
+ break;
+ case 2:
+ adev->gfx.disable_kq = true;
+ adev->gfx.disable_uq = true;
+ break;
+ }
+
+ if (adev->gfx.disable_kq)
+ adev->gfx.num_compute_rings = 0;
+ else
+ adev->gfx.num_compute_rings = min(amdgpu_gfx_get_num_kcq(adev),
+ AMDGPU_MAX_COMPUTE_RINGS);
gfx_v9_4_3_set_kiq_pm4_funcs(adev);
gfx_v9_4_3_set_ring_funcs(adev);
gfx_v9_4_3_set_irq_funcs(adev);
@@ -2623,6 +2638,8 @@ static int gfx_v9_4_3_early_init(struct amdgpu_ip_block *ip_block)
/* init rlcg reg access ctrl */
gfx_v9_4_3_init_rlcg_reg_access_ctrl(adev);
+ amdgpu_init_rlc_reg_funcs(adev);
+
return gfx_v9_4_3_init_microcode(adev);
}
@@ -3709,872 +3726,6 @@ pipe_reset:
return amdgpu_ring_reset_helper_end(ring, timedout_fence);
}
-enum amdgpu_gfx_cp_ras_mem_id {
- AMDGPU_GFX_CP_MEM1 = 1,
- AMDGPU_GFX_CP_MEM2,
- AMDGPU_GFX_CP_MEM3,
- AMDGPU_GFX_CP_MEM4,
- AMDGPU_GFX_CP_MEM5,
-};
-
-enum amdgpu_gfx_gcea_ras_mem_id {
- AMDGPU_GFX_GCEA_IOWR_CMDMEM = 4,
- AMDGPU_GFX_GCEA_IORD_CMDMEM,
- AMDGPU_GFX_GCEA_GMIWR_CMDMEM,
- AMDGPU_GFX_GCEA_GMIRD_CMDMEM,
- AMDGPU_GFX_GCEA_DRAMWR_CMDMEM,
- AMDGPU_GFX_GCEA_DRAMRD_CMDMEM,
- AMDGPU_GFX_GCEA_MAM_DMEM0,
- AMDGPU_GFX_GCEA_MAM_DMEM1,
- AMDGPU_GFX_GCEA_MAM_DMEM2,
- AMDGPU_GFX_GCEA_MAM_DMEM3,
- AMDGPU_GFX_GCEA_MAM_AMEM0,
- AMDGPU_GFX_GCEA_MAM_AMEM1,
- AMDGPU_GFX_GCEA_MAM_AMEM2,
- AMDGPU_GFX_GCEA_MAM_AMEM3,
- AMDGPU_GFX_GCEA_MAM_AFLUSH_BUFFER,
- AMDGPU_GFX_GCEA_WRET_TAGMEM,
- AMDGPU_GFX_GCEA_RRET_TAGMEM,
- AMDGPU_GFX_GCEA_IOWR_DATAMEM,
- AMDGPU_GFX_GCEA_GMIWR_DATAMEM,
- AMDGPU_GFX_GCEA_DRAM_DATAMEM,
-};
-
-enum amdgpu_gfx_gc_cane_ras_mem_id {
- AMDGPU_GFX_GC_CANE_MEM0 = 0,
-};
-
-enum amdgpu_gfx_gcutcl2_ras_mem_id {
- AMDGPU_GFX_GCUTCL2_MEM2P512X95 = 160,
-};
-
-enum amdgpu_gfx_gds_ras_mem_id {
- AMDGPU_GFX_GDS_MEM0 = 0,
-};
-
-enum amdgpu_gfx_lds_ras_mem_id {
- AMDGPU_GFX_LDS_BANK0 = 0,
- AMDGPU_GFX_LDS_BANK1,
- AMDGPU_GFX_LDS_BANK2,
- AMDGPU_GFX_LDS_BANK3,
- AMDGPU_GFX_LDS_BANK4,
- AMDGPU_GFX_LDS_BANK5,
- AMDGPU_GFX_LDS_BANK6,
- AMDGPU_GFX_LDS_BANK7,
- AMDGPU_GFX_LDS_BANK8,
- AMDGPU_GFX_LDS_BANK9,
- AMDGPU_GFX_LDS_BANK10,
- AMDGPU_GFX_LDS_BANK11,
- AMDGPU_GFX_LDS_BANK12,
- AMDGPU_GFX_LDS_BANK13,
- AMDGPU_GFX_LDS_BANK14,
- AMDGPU_GFX_LDS_BANK15,
- AMDGPU_GFX_LDS_BANK16,
- AMDGPU_GFX_LDS_BANK17,
- AMDGPU_GFX_LDS_BANK18,
- AMDGPU_GFX_LDS_BANK19,
- AMDGPU_GFX_LDS_BANK20,
- AMDGPU_GFX_LDS_BANK21,
- AMDGPU_GFX_LDS_BANK22,
- AMDGPU_GFX_LDS_BANK23,
- AMDGPU_GFX_LDS_BANK24,
- AMDGPU_GFX_LDS_BANK25,
- AMDGPU_GFX_LDS_BANK26,
- AMDGPU_GFX_LDS_BANK27,
- AMDGPU_GFX_LDS_BANK28,
- AMDGPU_GFX_LDS_BANK29,
- AMDGPU_GFX_LDS_BANK30,
- AMDGPU_GFX_LDS_BANK31,
- AMDGPU_GFX_LDS_SP_BUFFER_A,
- AMDGPU_GFX_LDS_SP_BUFFER_B,
-};
-
-enum amdgpu_gfx_rlc_ras_mem_id {
- AMDGPU_GFX_RLC_GPMF32 = 1,
- AMDGPU_GFX_RLC_RLCVF32,
- AMDGPU_GFX_RLC_SCRATCH,
- AMDGPU_GFX_RLC_SRM_ARAM,
- AMDGPU_GFX_RLC_SRM_DRAM,
- AMDGPU_GFX_RLC_TCTAG,
- AMDGPU_GFX_RLC_SPM_SE,
- AMDGPU_GFX_RLC_SPM_GRBMT,
-};
-
-enum amdgpu_gfx_sp_ras_mem_id {
- AMDGPU_GFX_SP_SIMDID0 = 0,
-};
-
-enum amdgpu_gfx_spi_ras_mem_id {
- AMDGPU_GFX_SPI_MEM0 = 0,
- AMDGPU_GFX_SPI_MEM1,
- AMDGPU_GFX_SPI_MEM2,
- AMDGPU_GFX_SPI_MEM3,
-};
-
-enum amdgpu_gfx_sqc_ras_mem_id {
- AMDGPU_GFX_SQC_INST_CACHE_A = 100,
- AMDGPU_GFX_SQC_INST_CACHE_B = 101,
- AMDGPU_GFX_SQC_INST_CACHE_TAG_A = 102,
- AMDGPU_GFX_SQC_INST_CACHE_TAG_B = 103,
- AMDGPU_GFX_SQC_INST_CACHE_MISS_FIFO_A = 104,
- AMDGPU_GFX_SQC_INST_CACHE_MISS_FIFO_B = 105,
- AMDGPU_GFX_SQC_INST_CACHE_GATCL1_MISS_FIFO_A = 106,
- AMDGPU_GFX_SQC_INST_CACHE_GATCL1_MISS_FIFO_B = 107,
- AMDGPU_GFX_SQC_DATA_CACHE_A = 200,
- AMDGPU_GFX_SQC_DATA_CACHE_B = 201,
- AMDGPU_GFX_SQC_DATA_CACHE_TAG_A = 202,
- AMDGPU_GFX_SQC_DATA_CACHE_TAG_B = 203,
- AMDGPU_GFX_SQC_DATA_CACHE_MISS_FIFO_A = 204,
- AMDGPU_GFX_SQC_DATA_CACHE_MISS_FIFO_B = 205,
- AMDGPU_GFX_SQC_DATA_CACHE_HIT_FIFO_A = 206,
- AMDGPU_GFX_SQC_DATA_CACHE_HIT_FIFO_B = 207,
- AMDGPU_GFX_SQC_DIRTY_BIT_A = 208,
- AMDGPU_GFX_SQC_DIRTY_BIT_B = 209,
- AMDGPU_GFX_SQC_WRITE_DATA_BUFFER_CU0 = 210,
- AMDGPU_GFX_SQC_WRITE_DATA_BUFFER_CU1 = 211,
- AMDGPU_GFX_SQC_UTCL1_MISS_LFIFO_DATA_CACHE_A = 212,
- AMDGPU_GFX_SQC_UTCL1_MISS_LFIFO_DATA_CACHE_B = 213,
- AMDGPU_GFX_SQC_UTCL1_MISS_LFIFO_INST_CACHE = 108,
-};
-
-enum amdgpu_gfx_sq_ras_mem_id {
- AMDGPU_GFX_SQ_SGPR_MEM0 = 0,
- AMDGPU_GFX_SQ_SGPR_MEM1,
- AMDGPU_GFX_SQ_SGPR_MEM2,
- AMDGPU_GFX_SQ_SGPR_MEM3,
-};
-
-enum amdgpu_gfx_ta_ras_mem_id {
- AMDGPU_GFX_TA_FS_AFIFO_RAM_LO = 1,
- AMDGPU_GFX_TA_FS_AFIFO_RAM_HI,
- AMDGPU_GFX_TA_FS_CFIFO_RAM,
- AMDGPU_GFX_TA_FSX_LFIFO,
- AMDGPU_GFX_TA_FS_DFIFO_RAM,
-};
-
-enum amdgpu_gfx_tcc_ras_mem_id {
- AMDGPU_GFX_TCC_MEM1 = 1,
-};
-
-enum amdgpu_gfx_tca_ras_mem_id {
- AMDGPU_GFX_TCA_MEM1 = 1,
-};
-
-enum amdgpu_gfx_tci_ras_mem_id {
- AMDGPU_GFX_TCIW_MEM = 1,
-};
-
-enum amdgpu_gfx_tcp_ras_mem_id {
- AMDGPU_GFX_TCP_LFIFO0 = 1,
- AMDGPU_GFX_TCP_SET0BANK0_RAM,
- AMDGPU_GFX_TCP_SET0BANK1_RAM,
- AMDGPU_GFX_TCP_SET0BANK2_RAM,
- AMDGPU_GFX_TCP_SET0BANK3_RAM,
- AMDGPU_GFX_TCP_SET1BANK0_RAM,
- AMDGPU_GFX_TCP_SET1BANK1_RAM,
- AMDGPU_GFX_TCP_SET1BANK2_RAM,
- AMDGPU_GFX_TCP_SET1BANK3_RAM,
- AMDGPU_GFX_TCP_SET2BANK0_RAM,
- AMDGPU_GFX_TCP_SET2BANK1_RAM,
- AMDGPU_GFX_TCP_SET2BANK2_RAM,
- AMDGPU_GFX_TCP_SET2BANK3_RAM,
- AMDGPU_GFX_TCP_SET3BANK0_RAM,
- AMDGPU_GFX_TCP_SET3BANK1_RAM,
- AMDGPU_GFX_TCP_SET3BANK2_RAM,
- AMDGPU_GFX_TCP_SET3BANK3_RAM,
- AMDGPU_GFX_TCP_VM_FIFO,
- AMDGPU_GFX_TCP_DB_TAGRAM0,
- AMDGPU_GFX_TCP_DB_TAGRAM1,
- AMDGPU_GFX_TCP_DB_TAGRAM2,
- AMDGPU_GFX_TCP_DB_TAGRAM3,
- AMDGPU_GFX_TCP_UTCL1_LFIFO_PROBE0,
- AMDGPU_GFX_TCP_UTCL1_LFIFO_PROBE1,
- AMDGPU_GFX_TCP_CMD_FIFO,
-};
-
-enum amdgpu_gfx_td_ras_mem_id {
- AMDGPU_GFX_TD_UTD_CS_FIFO_MEM = 1,
- AMDGPU_GFX_TD_UTD_SS_FIFO_LO_MEM,
- AMDGPU_GFX_TD_UTD_SS_FIFO_HI_MEM,
-};
-
-enum amdgpu_gfx_tcx_ras_mem_id {
- AMDGPU_GFX_TCX_FIFOD0 = 0,
- AMDGPU_GFX_TCX_FIFOD1,
- AMDGPU_GFX_TCX_FIFOD2,
- AMDGPU_GFX_TCX_FIFOD3,
- AMDGPU_GFX_TCX_FIFOD4,
- AMDGPU_GFX_TCX_FIFOD5,
- AMDGPU_GFX_TCX_FIFOD6,
- AMDGPU_GFX_TCX_FIFOD7,
- AMDGPU_GFX_TCX_FIFOB0,
- AMDGPU_GFX_TCX_FIFOB1,
- AMDGPU_GFX_TCX_FIFOB2,
- AMDGPU_GFX_TCX_FIFOB3,
- AMDGPU_GFX_TCX_FIFOB4,
- AMDGPU_GFX_TCX_FIFOB5,
- AMDGPU_GFX_TCX_FIFOB6,
- AMDGPU_GFX_TCX_FIFOB7,
- AMDGPU_GFX_TCX_FIFOA0,
- AMDGPU_GFX_TCX_FIFOA1,
- AMDGPU_GFX_TCX_FIFOA2,
- AMDGPU_GFX_TCX_FIFOA3,
- AMDGPU_GFX_TCX_FIFOA4,
- AMDGPU_GFX_TCX_FIFOA5,
- AMDGPU_GFX_TCX_FIFOA6,
- AMDGPU_GFX_TCX_FIFOA7,
- AMDGPU_GFX_TCX_CFIFO0,
- AMDGPU_GFX_TCX_CFIFO1,
- AMDGPU_GFX_TCX_CFIFO2,
- AMDGPU_GFX_TCX_CFIFO3,
- AMDGPU_GFX_TCX_CFIFO4,
- AMDGPU_GFX_TCX_CFIFO5,
- AMDGPU_GFX_TCX_CFIFO6,
- AMDGPU_GFX_TCX_CFIFO7,
- AMDGPU_GFX_TCX_FIFO_ACKB0,
- AMDGPU_GFX_TCX_FIFO_ACKB1,
- AMDGPU_GFX_TCX_FIFO_ACKB2,
- AMDGPU_GFX_TCX_FIFO_ACKB3,
- AMDGPU_GFX_TCX_FIFO_ACKB4,
- AMDGPU_GFX_TCX_FIFO_ACKB5,
- AMDGPU_GFX_TCX_FIFO_ACKB6,
- AMDGPU_GFX_TCX_FIFO_ACKB7,
- AMDGPU_GFX_TCX_FIFO_ACKD0,
- AMDGPU_GFX_TCX_FIFO_ACKD1,
- AMDGPU_GFX_TCX_FIFO_ACKD2,
- AMDGPU_GFX_TCX_FIFO_ACKD3,
- AMDGPU_GFX_TCX_FIFO_ACKD4,
- AMDGPU_GFX_TCX_FIFO_ACKD5,
- AMDGPU_GFX_TCX_FIFO_ACKD6,
- AMDGPU_GFX_TCX_FIFO_ACKD7,
- AMDGPU_GFX_TCX_DST_FIFOA0,
- AMDGPU_GFX_TCX_DST_FIFOA1,
- AMDGPU_GFX_TCX_DST_FIFOA2,
- AMDGPU_GFX_TCX_DST_FIFOA3,
- AMDGPU_GFX_TCX_DST_FIFOA4,
- AMDGPU_GFX_TCX_DST_FIFOA5,
- AMDGPU_GFX_TCX_DST_FIFOA6,
- AMDGPU_GFX_TCX_DST_FIFOA7,
- AMDGPU_GFX_TCX_DST_FIFOB0,
- AMDGPU_GFX_TCX_DST_FIFOB1,
- AMDGPU_GFX_TCX_DST_FIFOB2,
- AMDGPU_GFX_TCX_DST_FIFOB3,
- AMDGPU_GFX_TCX_DST_FIFOB4,
- AMDGPU_GFX_TCX_DST_FIFOB5,
- AMDGPU_GFX_TCX_DST_FIFOB6,
- AMDGPU_GFX_TCX_DST_FIFOB7,
- AMDGPU_GFX_TCX_DST_FIFOD0,
- AMDGPU_GFX_TCX_DST_FIFOD1,
- AMDGPU_GFX_TCX_DST_FIFOD2,
- AMDGPU_GFX_TCX_DST_FIFOD3,
- AMDGPU_GFX_TCX_DST_FIFOD4,
- AMDGPU_GFX_TCX_DST_FIFOD5,
- AMDGPU_GFX_TCX_DST_FIFOD6,
- AMDGPU_GFX_TCX_DST_FIFOD7,
- AMDGPU_GFX_TCX_DST_FIFO_ACKB0,
- AMDGPU_GFX_TCX_DST_FIFO_ACKB1,
- AMDGPU_GFX_TCX_DST_FIFO_ACKB2,
- AMDGPU_GFX_TCX_DST_FIFO_ACKB3,
- AMDGPU_GFX_TCX_DST_FIFO_ACKB4,
- AMDGPU_GFX_TCX_DST_FIFO_ACKB5,
- AMDGPU_GFX_TCX_DST_FIFO_ACKB6,
- AMDGPU_GFX_TCX_DST_FIFO_ACKB7,
- AMDGPU_GFX_TCX_DST_FIFO_ACKD0,
- AMDGPU_GFX_TCX_DST_FIFO_ACKD1,
- AMDGPU_GFX_TCX_DST_FIFO_ACKD2,
- AMDGPU_GFX_TCX_DST_FIFO_ACKD3,
- AMDGPU_GFX_TCX_DST_FIFO_ACKD4,
- AMDGPU_GFX_TCX_DST_FIFO_ACKD5,
- AMDGPU_GFX_TCX_DST_FIFO_ACKD6,
- AMDGPU_GFX_TCX_DST_FIFO_ACKD7,
-};
-
-enum amdgpu_gfx_atc_l2_ras_mem_id {
- AMDGPU_GFX_ATC_L2_MEM0 = 0,
-};
-
-enum amdgpu_gfx_utcl2_ras_mem_id {
- AMDGPU_GFX_UTCL2_MEM0 = 0,
-};
-
-enum amdgpu_gfx_vml2_ras_mem_id {
- AMDGPU_GFX_VML2_MEM0 = 0,
-};
-
-enum amdgpu_gfx_vml2_walker_ras_mem_id {
- AMDGPU_GFX_VML2_WALKER_MEM0 = 0,
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_cp_mem_list[] = {
- {AMDGPU_GFX_CP_MEM1, "CP_MEM1"},
- {AMDGPU_GFX_CP_MEM2, "CP_MEM2"},
- {AMDGPU_GFX_CP_MEM3, "CP_MEM3"},
- {AMDGPU_GFX_CP_MEM4, "CP_MEM4"},
- {AMDGPU_GFX_CP_MEM5, "CP_MEM5"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_gcea_mem_list[] = {
- {AMDGPU_GFX_GCEA_IOWR_CMDMEM, "GCEA_IOWR_CMDMEM"},
- {AMDGPU_GFX_GCEA_IORD_CMDMEM, "GCEA_IORD_CMDMEM"},
- {AMDGPU_GFX_GCEA_GMIWR_CMDMEM, "GCEA_GMIWR_CMDMEM"},
- {AMDGPU_GFX_GCEA_GMIRD_CMDMEM, "GCEA_GMIRD_CMDMEM"},
- {AMDGPU_GFX_GCEA_DRAMWR_CMDMEM, "GCEA_DRAMWR_CMDMEM"},
- {AMDGPU_GFX_GCEA_DRAMRD_CMDMEM, "GCEA_DRAMRD_CMDMEM"},
- {AMDGPU_GFX_GCEA_MAM_DMEM0, "GCEA_MAM_DMEM0"},
- {AMDGPU_GFX_GCEA_MAM_DMEM1, "GCEA_MAM_DMEM1"},
- {AMDGPU_GFX_GCEA_MAM_DMEM2, "GCEA_MAM_DMEM2"},
- {AMDGPU_GFX_GCEA_MAM_DMEM3, "GCEA_MAM_DMEM3"},
- {AMDGPU_GFX_GCEA_MAM_AMEM0, "GCEA_MAM_AMEM0"},
- {AMDGPU_GFX_GCEA_MAM_AMEM1, "GCEA_MAM_AMEM1"},
- {AMDGPU_GFX_GCEA_MAM_AMEM2, "GCEA_MAM_AMEM2"},
- {AMDGPU_GFX_GCEA_MAM_AMEM3, "GCEA_MAM_AMEM3"},
- {AMDGPU_GFX_GCEA_MAM_AFLUSH_BUFFER, "GCEA_MAM_AFLUSH_BUFFER"},
- {AMDGPU_GFX_GCEA_WRET_TAGMEM, "GCEA_WRET_TAGMEM"},
- {AMDGPU_GFX_GCEA_RRET_TAGMEM, "GCEA_RRET_TAGMEM"},
- {AMDGPU_GFX_GCEA_IOWR_DATAMEM, "GCEA_IOWR_DATAMEM"},
- {AMDGPU_GFX_GCEA_GMIWR_DATAMEM, "GCEA_GMIWR_DATAMEM"},
- {AMDGPU_GFX_GCEA_DRAM_DATAMEM, "GCEA_DRAM_DATAMEM"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_gc_cane_mem_list[] = {
- {AMDGPU_GFX_GC_CANE_MEM0, "GC_CANE_MEM0"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_gcutcl2_mem_list[] = {
- {AMDGPU_GFX_GCUTCL2_MEM2P512X95, "GCUTCL2_MEM2P512X95"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_gds_mem_list[] = {
- {AMDGPU_GFX_GDS_MEM0, "GDS_MEM"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_lds_mem_list[] = {
- {AMDGPU_GFX_LDS_BANK0, "LDS_BANK0"},
- {AMDGPU_GFX_LDS_BANK1, "LDS_BANK1"},
- {AMDGPU_GFX_LDS_BANK2, "LDS_BANK2"},
- {AMDGPU_GFX_LDS_BANK3, "LDS_BANK3"},
- {AMDGPU_GFX_LDS_BANK4, "LDS_BANK4"},
- {AMDGPU_GFX_LDS_BANK5, "LDS_BANK5"},
- {AMDGPU_GFX_LDS_BANK6, "LDS_BANK6"},
- {AMDGPU_GFX_LDS_BANK7, "LDS_BANK7"},
- {AMDGPU_GFX_LDS_BANK8, "LDS_BANK8"},
- {AMDGPU_GFX_LDS_BANK9, "LDS_BANK9"},
- {AMDGPU_GFX_LDS_BANK10, "LDS_BANK10"},
- {AMDGPU_GFX_LDS_BANK11, "LDS_BANK11"},
- {AMDGPU_GFX_LDS_BANK12, "LDS_BANK12"},
- {AMDGPU_GFX_LDS_BANK13, "LDS_BANK13"},
- {AMDGPU_GFX_LDS_BANK14, "LDS_BANK14"},
- {AMDGPU_GFX_LDS_BANK15, "LDS_BANK15"},
- {AMDGPU_GFX_LDS_BANK16, "LDS_BANK16"},
- {AMDGPU_GFX_LDS_BANK17, "LDS_BANK17"},
- {AMDGPU_GFX_LDS_BANK18, "LDS_BANK18"},
- {AMDGPU_GFX_LDS_BANK19, "LDS_BANK19"},
- {AMDGPU_GFX_LDS_BANK20, "LDS_BANK20"},
- {AMDGPU_GFX_LDS_BANK21, "LDS_BANK21"},
- {AMDGPU_GFX_LDS_BANK22, "LDS_BANK22"},
- {AMDGPU_GFX_LDS_BANK23, "LDS_BANK23"},
- {AMDGPU_GFX_LDS_BANK24, "LDS_BANK24"},
- {AMDGPU_GFX_LDS_BANK25, "LDS_BANK25"},
- {AMDGPU_GFX_LDS_BANK26, "LDS_BANK26"},
- {AMDGPU_GFX_LDS_BANK27, "LDS_BANK27"},
- {AMDGPU_GFX_LDS_BANK28, "LDS_BANK28"},
- {AMDGPU_GFX_LDS_BANK29, "LDS_BANK29"},
- {AMDGPU_GFX_LDS_BANK30, "LDS_BANK30"},
- {AMDGPU_GFX_LDS_BANK31, "LDS_BANK31"},
- {AMDGPU_GFX_LDS_SP_BUFFER_A, "LDS_SP_BUFFER_A"},
- {AMDGPU_GFX_LDS_SP_BUFFER_B, "LDS_SP_BUFFER_B"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_rlc_mem_list[] = {
- {AMDGPU_GFX_RLC_GPMF32, "RLC_GPMF32"},
- {AMDGPU_GFX_RLC_RLCVF32, "RLC_RLCVF32"},
- {AMDGPU_GFX_RLC_SCRATCH, "RLC_SCRATCH"},
- {AMDGPU_GFX_RLC_SRM_ARAM, "RLC_SRM_ARAM"},
- {AMDGPU_GFX_RLC_SRM_DRAM, "RLC_SRM_DRAM"},
- {AMDGPU_GFX_RLC_TCTAG, "RLC_TCTAG"},
- {AMDGPU_GFX_RLC_SPM_SE, "RLC_SPM_SE"},
- {AMDGPU_GFX_RLC_SPM_GRBMT, "RLC_SPM_GRBMT"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_sp_mem_list[] = {
- {AMDGPU_GFX_SP_SIMDID0, "SP_SIMDID0"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_spi_mem_list[] = {
- {AMDGPU_GFX_SPI_MEM0, "SPI_MEM0"},
- {AMDGPU_GFX_SPI_MEM1, "SPI_MEM1"},
- {AMDGPU_GFX_SPI_MEM2, "SPI_MEM2"},
- {AMDGPU_GFX_SPI_MEM3, "SPI_MEM3"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_sqc_mem_list[] = {
- {AMDGPU_GFX_SQC_INST_CACHE_A, "SQC_INST_CACHE_A"},
- {AMDGPU_GFX_SQC_INST_CACHE_B, "SQC_INST_CACHE_B"},
- {AMDGPU_GFX_SQC_INST_CACHE_TAG_A, "SQC_INST_CACHE_TAG_A"},
- {AMDGPU_GFX_SQC_INST_CACHE_TAG_B, "SQC_INST_CACHE_TAG_B"},
- {AMDGPU_GFX_SQC_INST_CACHE_MISS_FIFO_A, "SQC_INST_CACHE_MISS_FIFO_A"},
- {AMDGPU_GFX_SQC_INST_CACHE_MISS_FIFO_B, "SQC_INST_CACHE_MISS_FIFO_B"},
- {AMDGPU_GFX_SQC_INST_CACHE_GATCL1_MISS_FIFO_A, "SQC_INST_CACHE_GATCL1_MISS_FIFO_A"},
- {AMDGPU_GFX_SQC_INST_CACHE_GATCL1_MISS_FIFO_B, "SQC_INST_CACHE_GATCL1_MISS_FIFO_B"},
- {AMDGPU_GFX_SQC_DATA_CACHE_A, "SQC_DATA_CACHE_A"},
- {AMDGPU_GFX_SQC_DATA_CACHE_B, "SQC_DATA_CACHE_B"},
- {AMDGPU_GFX_SQC_DATA_CACHE_TAG_A, "SQC_DATA_CACHE_TAG_A"},
- {AMDGPU_GFX_SQC_DATA_CACHE_TAG_B, "SQC_DATA_CACHE_TAG_B"},
- {AMDGPU_GFX_SQC_DATA_CACHE_MISS_FIFO_A, "SQC_DATA_CACHE_MISS_FIFO_A"},
- {AMDGPU_GFX_SQC_DATA_CACHE_MISS_FIFO_B, "SQC_DATA_CACHE_MISS_FIFO_B"},
- {AMDGPU_GFX_SQC_DATA_CACHE_HIT_FIFO_A, "SQC_DATA_CACHE_HIT_FIFO_A"},
- {AMDGPU_GFX_SQC_DATA_CACHE_HIT_FIFO_B, "SQC_DATA_CACHE_HIT_FIFO_B"},
- {AMDGPU_GFX_SQC_DIRTY_BIT_A, "SQC_DIRTY_BIT_A"},
- {AMDGPU_GFX_SQC_DIRTY_BIT_B, "SQC_DIRTY_BIT_B"},
- {AMDGPU_GFX_SQC_WRITE_DATA_BUFFER_CU0, "SQC_WRITE_DATA_BUFFER_CU0"},
- {AMDGPU_GFX_SQC_WRITE_DATA_BUFFER_CU1, "SQC_WRITE_DATA_BUFFER_CU1"},
- {AMDGPU_GFX_SQC_UTCL1_MISS_LFIFO_DATA_CACHE_A, "SQC_UTCL1_MISS_LFIFO_DATA_CACHE_A"},
- {AMDGPU_GFX_SQC_UTCL1_MISS_LFIFO_DATA_CACHE_B, "SQC_UTCL1_MISS_LFIFO_DATA_CACHE_B"},
- {AMDGPU_GFX_SQC_UTCL1_MISS_LFIFO_INST_CACHE, "SQC_UTCL1_MISS_LFIFO_INST_CACHE"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_sq_mem_list[] = {
- {AMDGPU_GFX_SQ_SGPR_MEM0, "SQ_SGPR_MEM0"},
- {AMDGPU_GFX_SQ_SGPR_MEM1, "SQ_SGPR_MEM1"},
- {AMDGPU_GFX_SQ_SGPR_MEM2, "SQ_SGPR_MEM2"},
- {AMDGPU_GFX_SQ_SGPR_MEM3, "SQ_SGPR_MEM3"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_ta_mem_list[] = {
- {AMDGPU_GFX_TA_FS_AFIFO_RAM_LO, "TA_FS_AFIFO_RAM_LO"},
- {AMDGPU_GFX_TA_FS_AFIFO_RAM_HI, "TA_FS_AFIFO_RAM_HI"},
- {AMDGPU_GFX_TA_FS_CFIFO_RAM, "TA_FS_CFIFO_RAM"},
- {AMDGPU_GFX_TA_FSX_LFIFO, "TA_FSX_LFIFO"},
- {AMDGPU_GFX_TA_FS_DFIFO_RAM, "TA_FS_DFIFO_RAM"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_tcc_mem_list[] = {
- {AMDGPU_GFX_TCC_MEM1, "TCC_MEM1"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_tca_mem_list[] = {
- {AMDGPU_GFX_TCA_MEM1, "TCA_MEM1"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_tci_mem_list[] = {
- {AMDGPU_GFX_TCIW_MEM, "TCIW_MEM"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_tcp_mem_list[] = {
- {AMDGPU_GFX_TCP_LFIFO0, "TCP_LFIFO0"},
- {AMDGPU_GFX_TCP_SET0BANK0_RAM, "TCP_SET0BANK0_RAM"},
- {AMDGPU_GFX_TCP_SET0BANK1_RAM, "TCP_SET0BANK1_RAM"},
- {AMDGPU_GFX_TCP_SET0BANK2_RAM, "TCP_SET0BANK2_RAM"},
- {AMDGPU_GFX_TCP_SET0BANK3_RAM, "TCP_SET0BANK3_RAM"},
- {AMDGPU_GFX_TCP_SET1BANK0_RAM, "TCP_SET1BANK0_RAM"},
- {AMDGPU_GFX_TCP_SET1BANK1_RAM, "TCP_SET1BANK1_RAM"},
- {AMDGPU_GFX_TCP_SET1BANK2_RAM, "TCP_SET1BANK2_RAM"},
- {AMDGPU_GFX_TCP_SET1BANK3_RAM, "TCP_SET1BANK3_RAM"},
- {AMDGPU_GFX_TCP_SET2BANK0_RAM, "TCP_SET2BANK0_RAM"},
- {AMDGPU_GFX_TCP_SET2BANK1_RAM, "TCP_SET2BANK1_RAM"},
- {AMDGPU_GFX_TCP_SET2BANK2_RAM, "TCP_SET2BANK2_RAM"},
- {AMDGPU_GFX_TCP_SET2BANK3_RAM, "TCP_SET2BANK3_RAM"},
- {AMDGPU_GFX_TCP_SET3BANK0_RAM, "TCP_SET3BANK0_RAM"},
- {AMDGPU_GFX_TCP_SET3BANK1_RAM, "TCP_SET3BANK1_RAM"},
- {AMDGPU_GFX_TCP_SET3BANK2_RAM, "TCP_SET3BANK2_RAM"},
- {AMDGPU_GFX_TCP_SET3BANK3_RAM, "TCP_SET3BANK3_RAM"},
- {AMDGPU_GFX_TCP_VM_FIFO, "TCP_VM_FIFO"},
- {AMDGPU_GFX_TCP_DB_TAGRAM0, "TCP_DB_TAGRAM0"},
- {AMDGPU_GFX_TCP_DB_TAGRAM1, "TCP_DB_TAGRAM1"},
- {AMDGPU_GFX_TCP_DB_TAGRAM2, "TCP_DB_TAGRAM2"},
- {AMDGPU_GFX_TCP_DB_TAGRAM3, "TCP_DB_TAGRAM3"},
- {AMDGPU_GFX_TCP_UTCL1_LFIFO_PROBE0, "TCP_UTCL1_LFIFO_PROBE0"},
- {AMDGPU_GFX_TCP_UTCL1_LFIFO_PROBE1, "TCP_UTCL1_LFIFO_PROBE1"},
- {AMDGPU_GFX_TCP_CMD_FIFO, "TCP_CMD_FIFO"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_td_mem_list[] = {
- {AMDGPU_GFX_TD_UTD_CS_FIFO_MEM, "TD_UTD_CS_FIFO_MEM"},
- {AMDGPU_GFX_TD_UTD_SS_FIFO_LO_MEM, "TD_UTD_SS_FIFO_LO_MEM"},
- {AMDGPU_GFX_TD_UTD_SS_FIFO_HI_MEM, "TD_UTD_SS_FIFO_HI_MEM"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_tcx_mem_list[] = {
- {AMDGPU_GFX_TCX_FIFOD0, "TCX_FIFOD0"},
- {AMDGPU_GFX_TCX_FIFOD1, "TCX_FIFOD1"},
- {AMDGPU_GFX_TCX_FIFOD2, "TCX_FIFOD2"},
- {AMDGPU_GFX_TCX_FIFOD3, "TCX_FIFOD3"},
- {AMDGPU_GFX_TCX_FIFOD4, "TCX_FIFOD4"},
- {AMDGPU_GFX_TCX_FIFOD5, "TCX_FIFOD5"},
- {AMDGPU_GFX_TCX_FIFOD6, "TCX_FIFOD6"},
- {AMDGPU_GFX_TCX_FIFOD7, "TCX_FIFOD7"},
- {AMDGPU_GFX_TCX_FIFOB0, "TCX_FIFOB0"},
- {AMDGPU_GFX_TCX_FIFOB1, "TCX_FIFOB1"},
- {AMDGPU_GFX_TCX_FIFOB2, "TCX_FIFOB2"},
- {AMDGPU_GFX_TCX_FIFOB3, "TCX_FIFOB3"},
- {AMDGPU_GFX_TCX_FIFOB4, "TCX_FIFOB4"},
- {AMDGPU_GFX_TCX_FIFOB5, "TCX_FIFOB5"},
- {AMDGPU_GFX_TCX_FIFOB6, "TCX_FIFOB6"},
- {AMDGPU_GFX_TCX_FIFOB7, "TCX_FIFOB7"},
- {AMDGPU_GFX_TCX_FIFOA0, "TCX_FIFOA0"},
- {AMDGPU_GFX_TCX_FIFOA1, "TCX_FIFOA1"},
- {AMDGPU_GFX_TCX_FIFOA2, "TCX_FIFOA2"},
- {AMDGPU_GFX_TCX_FIFOA3, "TCX_FIFOA3"},
- {AMDGPU_GFX_TCX_FIFOA4, "TCX_FIFOA4"},
- {AMDGPU_GFX_TCX_FIFOA5, "TCX_FIFOA5"},
- {AMDGPU_GFX_TCX_FIFOA6, "TCX_FIFOA6"},
- {AMDGPU_GFX_TCX_FIFOA7, "TCX_FIFOA7"},
- {AMDGPU_GFX_TCX_CFIFO0, "TCX_CFIFO0"},
- {AMDGPU_GFX_TCX_CFIFO1, "TCX_CFIFO1"},
- {AMDGPU_GFX_TCX_CFIFO2, "TCX_CFIFO2"},
- {AMDGPU_GFX_TCX_CFIFO3, "TCX_CFIFO3"},
- {AMDGPU_GFX_TCX_CFIFO4, "TCX_CFIFO4"},
- {AMDGPU_GFX_TCX_CFIFO5, "TCX_CFIFO5"},
- {AMDGPU_GFX_TCX_CFIFO6, "TCX_CFIFO6"},
- {AMDGPU_GFX_TCX_CFIFO7, "TCX_CFIFO7"},
- {AMDGPU_GFX_TCX_FIFO_ACKB0, "TCX_FIFO_ACKB0"},
- {AMDGPU_GFX_TCX_FIFO_ACKB1, "TCX_FIFO_ACKB1"},
- {AMDGPU_GFX_TCX_FIFO_ACKB2, "TCX_FIFO_ACKB2"},
- {AMDGPU_GFX_TCX_FIFO_ACKB3, "TCX_FIFO_ACKB3"},
- {AMDGPU_GFX_TCX_FIFO_ACKB4, "TCX_FIFO_ACKB4"},
- {AMDGPU_GFX_TCX_FIFO_ACKB5, "TCX_FIFO_ACKB5"},
- {AMDGPU_GFX_TCX_FIFO_ACKB6, "TCX_FIFO_ACKB6"},
- {AMDGPU_GFX_TCX_FIFO_ACKB7, "TCX_FIFO_ACKB7"},
- {AMDGPU_GFX_TCX_FIFO_ACKD0, "TCX_FIFO_ACKD0"},
- {AMDGPU_GFX_TCX_FIFO_ACKD1, "TCX_FIFO_ACKD1"},
- {AMDGPU_GFX_TCX_FIFO_ACKD2, "TCX_FIFO_ACKD2"},
- {AMDGPU_GFX_TCX_FIFO_ACKD3, "TCX_FIFO_ACKD3"},
- {AMDGPU_GFX_TCX_FIFO_ACKD4, "TCX_FIFO_ACKD4"},
- {AMDGPU_GFX_TCX_FIFO_ACKD5, "TCX_FIFO_ACKD5"},
- {AMDGPU_GFX_TCX_FIFO_ACKD6, "TCX_FIFO_ACKD6"},
- {AMDGPU_GFX_TCX_FIFO_ACKD7, "TCX_FIFO_ACKD7"},
- {AMDGPU_GFX_TCX_DST_FIFOA0, "TCX_DST_FIFOA0"},
- {AMDGPU_GFX_TCX_DST_FIFOA1, "TCX_DST_FIFOA1"},
- {AMDGPU_GFX_TCX_DST_FIFOA2, "TCX_DST_FIFOA2"},
- {AMDGPU_GFX_TCX_DST_FIFOA3, "TCX_DST_FIFOA3"},
- {AMDGPU_GFX_TCX_DST_FIFOA4, "TCX_DST_FIFOA4"},
- {AMDGPU_GFX_TCX_DST_FIFOA5, "TCX_DST_FIFOA5"},
- {AMDGPU_GFX_TCX_DST_FIFOA6, "TCX_DST_FIFOA6"},
- {AMDGPU_GFX_TCX_DST_FIFOA7, "TCX_DST_FIFOA7"},
- {AMDGPU_GFX_TCX_DST_FIFOB0, "TCX_DST_FIFOB0"},
- {AMDGPU_GFX_TCX_DST_FIFOB1, "TCX_DST_FIFOB1"},
- {AMDGPU_GFX_TCX_DST_FIFOB2, "TCX_DST_FIFOB2"},
- {AMDGPU_GFX_TCX_DST_FIFOB3, "TCX_DST_FIFOB3"},
- {AMDGPU_GFX_TCX_DST_FIFOB4, "TCX_DST_FIFOB4"},
- {AMDGPU_GFX_TCX_DST_FIFOB5, "TCX_DST_FIFOB5"},
- {AMDGPU_GFX_TCX_DST_FIFOB6, "TCX_DST_FIFOB6"},
- {AMDGPU_GFX_TCX_DST_FIFOB7, "TCX_DST_FIFOB7"},
- {AMDGPU_GFX_TCX_DST_FIFOD0, "TCX_DST_FIFOD0"},
- {AMDGPU_GFX_TCX_DST_FIFOD1, "TCX_DST_FIFOD1"},
- {AMDGPU_GFX_TCX_DST_FIFOD2, "TCX_DST_FIFOD2"},
- {AMDGPU_GFX_TCX_DST_FIFOD3, "TCX_DST_FIFOD3"},
- {AMDGPU_GFX_TCX_DST_FIFOD4, "TCX_DST_FIFOD4"},
- {AMDGPU_GFX_TCX_DST_FIFOD5, "TCX_DST_FIFOD5"},
- {AMDGPU_GFX_TCX_DST_FIFOD6, "TCX_DST_FIFOD6"},
- {AMDGPU_GFX_TCX_DST_FIFOD7, "TCX_DST_FIFOD7"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKB0, "TCX_DST_FIFO_ACKB0"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKB1, "TCX_DST_FIFO_ACKB1"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKB2, "TCX_DST_FIFO_ACKB2"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKB3, "TCX_DST_FIFO_ACKB3"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKB4, "TCX_DST_FIFO_ACKB4"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKB5, "TCX_DST_FIFO_ACKB5"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKB6, "TCX_DST_FIFO_ACKB6"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKB7, "TCX_DST_FIFO_ACKB7"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKD0, "TCX_DST_FIFO_ACKD0"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKD1, "TCX_DST_FIFO_ACKD1"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKD2, "TCX_DST_FIFO_ACKD2"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKD3, "TCX_DST_FIFO_ACKD3"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKD4, "TCX_DST_FIFO_ACKD4"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKD5, "TCX_DST_FIFO_ACKD5"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKD6, "TCX_DST_FIFO_ACKD6"},
- {AMDGPU_GFX_TCX_DST_FIFO_ACKD7, "TCX_DST_FIFO_ACKD7"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_atc_l2_mem_list[] = {
- {AMDGPU_GFX_ATC_L2_MEM, "ATC_L2_MEM"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_utcl2_mem_list[] = {
- {AMDGPU_GFX_UTCL2_MEM, "UTCL2_MEM"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_vml2_mem_list[] = {
- {AMDGPU_GFX_VML2_MEM, "VML2_MEM"},
-};
-
-static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_vml2_walker_mem_list[] = {
- {AMDGPU_GFX_VML2_WALKER_MEM, "VML2_WALKER_MEM"},
-};
-
-static const struct amdgpu_gfx_ras_mem_id_entry gfx_v9_4_3_ras_mem_list_array[AMDGPU_GFX_MEM_TYPE_NUM] = {
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_cp_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_gcea_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_gc_cane_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_gcutcl2_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_gds_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_lds_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_rlc_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_sp_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_spi_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_sqc_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_sq_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_ta_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_tcc_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_tca_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_tci_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_tcp_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_td_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_tcx_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_atc_l2_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_utcl2_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_vml2_mem_list)
- AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_vml2_walker_mem_list)
-};
-
-static const struct amdgpu_gfx_ras_reg_entry gfx_v9_4_3_ce_reg_list[] = {
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regRLC_CE_ERR_STATUS_LOW, regRLC_CE_ERR_STATUS_HIGH),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "RLC"},
- AMDGPU_GFX_RLC_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regCPC_CE_ERR_STATUS_LO, regCPC_CE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CPC"},
- AMDGPU_GFX_CP_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regCPF_CE_ERR_STATUS_LO, regCPF_CE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CPF"},
- AMDGPU_GFX_CP_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regCPG_CE_ERR_STATUS_LO, regCPG_CE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CPG"},
- AMDGPU_GFX_CP_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regGDS_CE_ERR_STATUS_LO, regGDS_CE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "GDS"},
- AMDGPU_GFX_GDS_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regGC_CANE_CE_ERR_STATUS_LO, regGC_CANE_CE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CANE"},
- AMDGPU_GFX_GC_CANE_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSPI_CE_ERR_STATUS_LO, regSPI_CE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SPI"},
- AMDGPU_GFX_SPI_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP0_CE_ERR_STATUS_LO, regSP0_CE_ERR_STATUS_HI),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SP0"},
- AMDGPU_GFX_SP_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP1_CE_ERR_STATUS_LO, regSP1_CE_ERR_STATUS_HI),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SP1"},
- AMDGPU_GFX_SP_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQ_CE_ERR_STATUS_LO, regSQ_CE_ERR_STATUS_HI),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SQ"},
- AMDGPU_GFX_SQ_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQC_CE_EDC_LO, regSQC_CE_EDC_HI),
- 5, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SQC"},
- AMDGPU_GFX_SQC_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCX_CE_ERR_STATUS_LO, regTCX_CE_ERR_STATUS_HI),
- 2, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCX"},
- AMDGPU_GFX_TCX_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCC_CE_ERR_STATUS_LO, regTCC_CE_ERR_STATUS_HI),
- 16, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCC"},
- AMDGPU_GFX_TCC_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTA_CE_EDC_LO, regTA_CE_EDC_HI),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TA"},
- AMDGPU_GFX_TA_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCI_CE_EDC_LO_REG, regTCI_CE_EDC_HI_REG),
- 27, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCI"},
- AMDGPU_GFX_TCI_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCP_CE_EDC_LO_REG, regTCP_CE_EDC_HI_REG),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCP"},
- AMDGPU_GFX_TCP_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTD_CE_EDC_LO, regTD_CE_EDC_HI),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TD"},
- AMDGPU_GFX_TD_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regGCEA_CE_ERR_STATUS_LO, regGCEA_CE_ERR_STATUS_HI),
- 16, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "GCEA"},
- AMDGPU_GFX_GCEA_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regLDS_CE_ERR_STATUS_LO, regLDS_CE_ERR_STATUS_HI),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "LDS"},
- AMDGPU_GFX_LDS_MEM, 4},
-};
-
-static const struct amdgpu_gfx_ras_reg_entry gfx_v9_4_3_ue_reg_list[] = {
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regRLC_UE_ERR_STATUS_LOW, regRLC_UE_ERR_STATUS_HIGH),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "RLC"},
- AMDGPU_GFX_RLC_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regCPC_UE_ERR_STATUS_LO, regCPC_UE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CPC"},
- AMDGPU_GFX_CP_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regCPF_UE_ERR_STATUS_LO, regCPF_UE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CPF"},
- AMDGPU_GFX_CP_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regCPG_UE_ERR_STATUS_LO, regCPG_UE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CPG"},
- AMDGPU_GFX_CP_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regGDS_UE_ERR_STATUS_LO, regGDS_UE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "GDS"},
- AMDGPU_GFX_GDS_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regGC_CANE_UE_ERR_STATUS_LO, regGC_CANE_UE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CANE"},
- AMDGPU_GFX_GC_CANE_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSPI_UE_ERR_STATUS_LO, regSPI_UE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SPI"},
- AMDGPU_GFX_SPI_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP0_UE_ERR_STATUS_LO, regSP0_UE_ERR_STATUS_HI),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SP0"},
- AMDGPU_GFX_SP_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP1_UE_ERR_STATUS_LO, regSP1_UE_ERR_STATUS_HI),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SP1"},
- AMDGPU_GFX_SP_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQ_UE_ERR_STATUS_LO, regSQ_UE_ERR_STATUS_HI),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SQ"},
- AMDGPU_GFX_SQ_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQC_UE_EDC_LO, regSQC_UE_EDC_HI),
- 5, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SQC"},
- AMDGPU_GFX_SQC_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCX_UE_ERR_STATUS_LO, regTCX_UE_ERR_STATUS_HI),
- 2, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCX"},
- AMDGPU_GFX_TCX_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCC_UE_ERR_STATUS_LO, regTCC_UE_ERR_STATUS_HI),
- 16, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCC"},
- AMDGPU_GFX_TCC_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTA_UE_EDC_LO, regTA_UE_EDC_HI),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TA"},
- AMDGPU_GFX_TA_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCI_UE_EDC_LO_REG, regTCI_UE_EDC_HI_REG),
- 27, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCI"},
- AMDGPU_GFX_TCI_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCP_UE_EDC_LO_REG, regTCP_UE_EDC_HI_REG),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCP"},
- AMDGPU_GFX_TCP_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTD_UE_EDC_LO, regTD_UE_EDC_HI),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TD"},
- AMDGPU_GFX_TD_MEM, 4},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCA_UE_ERR_STATUS_LO, regTCA_UE_ERR_STATUS_HI),
- 2, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCA"},
- AMDGPU_GFX_TCA_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regGCEA_UE_ERR_STATUS_LO, regGCEA_UE_ERR_STATUS_HI),
- 16, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "GCEA"},
- AMDGPU_GFX_GCEA_MEM, 1},
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regLDS_UE_ERR_STATUS_LO, regLDS_UE_ERR_STATUS_HI),
- 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "LDS"},
- AMDGPU_GFX_LDS_MEM, 4},
-};
-
-static void gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev,
- void *ras_error_status, int xcc_id)
-{
- struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
- unsigned long ce_count = 0, ue_count = 0;
- uint32_t i, j, k;
-
- /* NOTE: convert xcc_id to physical XCD ID (XCD0 or XCD1) */
- struct amdgpu_smuio_mcm_config_info mcm_info = {
- .socket_id = adev->smuio.funcs->get_socket_id(adev),
- .die_id = xcc_id & 0x01 ? 1 : 0,
- };
-
- mutex_lock(&adev->grbm_idx_mutex);
-
- for (i = 0; i < ARRAY_SIZE(gfx_v9_4_3_ce_reg_list); i++) {
- for (j = 0; j < gfx_v9_4_3_ce_reg_list[i].se_num; j++) {
- for (k = 0; k < gfx_v9_4_3_ce_reg_list[i].reg_entry.reg_inst; k++) {
- /* no need to select if instance number is 1 */
- if (gfx_v9_4_3_ce_reg_list[i].se_num > 1 ||
- gfx_v9_4_3_ce_reg_list[i].reg_entry.reg_inst > 1)
- gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, k, xcc_id);
-
- amdgpu_ras_inst_query_ras_error_count(adev,
- &(gfx_v9_4_3_ce_reg_list[i].reg_entry),
- 1,
- gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ce_reg_list[i].mem_id_type].mem_id_ent,
- gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ce_reg_list[i].mem_id_type].size,
- GET_INST(GC, xcc_id),
- AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE,
- &ce_count);
-
- amdgpu_ras_inst_query_ras_error_count(adev,
- &(gfx_v9_4_3_ue_reg_list[i].reg_entry),
- 1,
- gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].mem_id_ent,
- gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].size,
- GET_INST(GC, xcc_id),
- AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
- &ue_count);
- }
- }
- }
-
- /* handle extra register entries of UE */
- for (; i < ARRAY_SIZE(gfx_v9_4_3_ue_reg_list); i++) {
- for (j = 0; j < gfx_v9_4_3_ue_reg_list[i].se_num; j++) {
- for (k = 0; k < gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst; k++) {
- /* no need to select if instance number is 1 */
- if (gfx_v9_4_3_ue_reg_list[i].se_num > 1 ||
- gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst > 1)
- gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, k, xcc_id);
-
- amdgpu_ras_inst_query_ras_error_count(adev,
- &(gfx_v9_4_3_ue_reg_list[i].reg_entry),
- 1,
- gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].mem_id_ent,
- gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].size,
- GET_INST(GC, xcc_id),
- AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
- &ue_count);
- }
- }
- }
-
- gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff,
- xcc_id);
- mutex_unlock(&adev->grbm_idx_mutex);
-
- /* the caller should make sure initialize value of
- * err_data->ue_count and err_data->ce_count
- */
- amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
- amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
-}
-
-static void gfx_v9_4_3_inst_reset_ras_err_count(struct amdgpu_device *adev,
- void *ras_error_status, int xcc_id)
-{
- uint32_t i, j, k;
-
- mutex_lock(&adev->grbm_idx_mutex);
-
- for (i = 0; i < ARRAY_SIZE(gfx_v9_4_3_ce_reg_list); i++) {
- for (j = 0; j < gfx_v9_4_3_ce_reg_list[i].se_num; j++) {
- for (k = 0; k < gfx_v9_4_3_ce_reg_list[i].reg_entry.reg_inst; k++) {
- /* no need to select if instance number is 1 */
- if (gfx_v9_4_3_ce_reg_list[i].se_num > 1 ||
- gfx_v9_4_3_ce_reg_list[i].reg_entry.reg_inst > 1)
- gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, k, xcc_id);
-
- amdgpu_ras_inst_reset_ras_error_count(adev,
- &(gfx_v9_4_3_ce_reg_list[i].reg_entry),
- 1,
- GET_INST(GC, xcc_id));
-
- amdgpu_ras_inst_reset_ras_error_count(adev,
- &(gfx_v9_4_3_ue_reg_list[i].reg_entry),
- 1,
- GET_INST(GC, xcc_id));
- }
- }
- }
-
- /* handle extra register entries of UE */
- for (; i < ARRAY_SIZE(gfx_v9_4_3_ue_reg_list); i++) {
- for (j = 0; j < gfx_v9_4_3_ue_reg_list[i].se_num; j++) {
- for (k = 0; k < gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst; k++) {
- /* no need to select if instance number is 1 */
- if (gfx_v9_4_3_ue_reg_list[i].se_num > 1 ||
- gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst > 1)
- gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, k, xcc_id);
-
- amdgpu_ras_inst_reset_ras_error_count(adev,
- &(gfx_v9_4_3_ue_reg_list[i].reg_entry),
- 1,
- GET_INST(GC, xcc_id));
- }
- }
- }
-
- gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff,
- xcc_id);
- mutex_unlock(&adev->grbm_idx_mutex);
-}
-
static void gfx_v9_4_3_inst_enable_watchdog_timer(struct amdgpu_device *adev,
void *ras_error_status, int xcc_id)
{
@@ -4607,18 +3758,6 @@ static void gfx_v9_4_3_inst_enable_watchdog_timer(struct amdgpu_device *adev,
mutex_unlock(&adev->grbm_idx_mutex);
}
-static void gfx_v9_4_3_query_ras_error_count(struct amdgpu_device *adev,
- void *ras_error_status)
-{
- amdgpu_gfx_ras_error_func(adev, ras_error_status,
- gfx_v9_4_3_inst_query_ras_err_count);
-}
-
-static void gfx_v9_4_3_reset_ras_error_count(struct amdgpu_device *adev)
-{
- amdgpu_gfx_ras_error_func(adev, NULL, gfx_v9_4_3_inst_reset_ras_err_count);
-}
-
static void gfx_v9_4_3_enable_watchdog_timer(struct amdgpu_device *adev)
{
amdgpu_gfx_ras_error_func(adev, NULL, gfx_v9_4_3_inst_enable_watchdog_timer);
@@ -5099,37 +4238,9 @@ struct amdgpu_xcp_ip_funcs gfx_v9_4_3_xcp_funcs = {
.resume = &gfx_v9_4_3_xcp_resume
};
-struct amdgpu_ras_block_hw_ops gfx_v9_4_3_ras_ops = {
- .query_ras_error_count = &gfx_v9_4_3_query_ras_error_count,
- .reset_ras_error_count = &gfx_v9_4_3_reset_ras_error_count,
-};
-
-static int gfx_v9_4_3_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
-{
- int r;
-
- r = amdgpu_ras_block_late_init(adev, ras_block);
- if (r)
- return r;
-
- r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__GFX,
- &gfx_v9_4_3_aca_info,
- NULL);
- if (r)
- goto late_fini;
-
- return 0;
-
-late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
-
- return r;
-}
-
struct amdgpu_gfx_ras gfx_v9_4_3_ras = {
.ras_block = {
- .hw_ops = &gfx_v9_4_3_ras_ops,
- .ras_late_init = &gfx_v9_4_3_ras_late_init,
+ .hw_ops = NULL,
},
.enable_watchdog_timer = &gfx_v9_4_3_enable_watchdog_timer,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index c2a41fa3a396..64ebedc595b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -167,44 +167,6 @@ static void gmc_v8_0_init_golden_registers(struct amdgpu_device *adev)
}
}
-static void gmc_v8_0_mc_stop(struct amdgpu_device *adev)
-{
- u32 blackout;
- struct amdgpu_ip_block *ip_block;
-
- ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GMC);
- if (!ip_block)
- return;
-
- gmc_v8_0_wait_for_idle(ip_block);
-
- blackout = RREG32(mmMC_SHARED_BLACKOUT_CNTL);
- if (REG_GET_FIELD(blackout, MC_SHARED_BLACKOUT_CNTL, BLACKOUT_MODE) != 1) {
- /* Block CPU access */
- WREG32(mmBIF_FB_EN, 0);
- /* blackout the MC */
- blackout = REG_SET_FIELD(blackout,
- MC_SHARED_BLACKOUT_CNTL, BLACKOUT_MODE, 1);
- WREG32(mmMC_SHARED_BLACKOUT_CNTL, blackout);
- }
- /* wait for the MC to settle */
- udelay(100);
-}
-
-static void gmc_v8_0_mc_resume(struct amdgpu_device *adev)
-{
- u32 tmp;
-
- /* unblackout the MC */
- tmp = RREG32(mmMC_SHARED_BLACKOUT_CNTL);
- tmp = REG_SET_FIELD(tmp, MC_SHARED_BLACKOUT_CNTL, BLACKOUT_MODE, 0);
- WREG32(mmMC_SHARED_BLACKOUT_CNTL, tmp);
- /* allow CPU access */
- tmp = REG_SET_FIELD(0, BIF_FB_EN, FB_READ_EN, 1);
- tmp = REG_SET_FIELD(tmp, BIF_FB_EN, FB_WRITE_EN, 1);
- WREG32(mmBIF_FB_EN, tmp);
-}
-
/**
* gmc_v8_0_init_microcode - load ucode images from disk
*
@@ -1293,89 +1255,6 @@ static int gmc_v8_0_wait_for_idle(struct amdgpu_ip_block *ip_block)
}
-static bool gmc_v8_0_check_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- u32 srbm_soft_reset = 0;
- struct amdgpu_device *adev = ip_block->adev;
- u32 tmp = RREG32(mmSRBM_STATUS);
-
- if (tmp & SRBM_STATUS__VMC_BUSY_MASK)
- srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset,
- SRBM_SOFT_RESET, SOFT_RESET_VMC, 1);
-
- if (tmp & (SRBM_STATUS__MCB_BUSY_MASK | SRBM_STATUS__MCB_NON_DISPLAY_BUSY_MASK |
- SRBM_STATUS__MCC_BUSY_MASK | SRBM_STATUS__MCD_BUSY_MASK)) {
- if (!(adev->flags & AMD_IS_APU))
- srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset,
- SRBM_SOFT_RESET, SOFT_RESET_MC, 1);
- }
-
- if (srbm_soft_reset) {
- adev->gmc.srbm_soft_reset = srbm_soft_reset;
- return true;
- }
-
- adev->gmc.srbm_soft_reset = 0;
-
- return false;
-}
-
-static int gmc_v8_0_pre_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
-
- if (!adev->gmc.srbm_soft_reset)
- return 0;
-
- gmc_v8_0_mc_stop(adev);
- if (gmc_v8_0_wait_for_idle(ip_block))
- dev_warn(adev->dev, "Wait for GMC idle timed out !\n");
-
- return 0;
-}
-
-static int gmc_v8_0_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
- u32 srbm_soft_reset;
-
- if (!adev->gmc.srbm_soft_reset)
- return 0;
- srbm_soft_reset = adev->gmc.srbm_soft_reset;
-
- if (srbm_soft_reset) {
- u32 tmp;
-
- tmp = RREG32(mmSRBM_SOFT_RESET);
- tmp |= srbm_soft_reset;
- dev_info(adev->dev, "SRBM_SOFT_RESET=0x%08X\n", tmp);
- WREG32(mmSRBM_SOFT_RESET, tmp);
- tmp = RREG32(mmSRBM_SOFT_RESET);
-
- udelay(50);
-
- tmp &= ~srbm_soft_reset;
- WREG32(mmSRBM_SOFT_RESET, tmp);
- tmp = RREG32(mmSRBM_SOFT_RESET);
-
- /* Wait a little for things to settle down */
- udelay(50);
- }
-
- return 0;
-}
-
-static int gmc_v8_0_post_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
-
- if (!adev->gmc.srbm_soft_reset)
- return 0;
-
- gmc_v8_0_mc_resume(adev);
- return 0;
-}
-
static int gmc_v8_0_vm_fault_interrupt_state(struct amdgpu_device *adev,
struct amdgpu_irq_src *src,
unsigned int type,
@@ -1715,10 +1594,6 @@ static const struct amd_ip_funcs gmc_v8_0_ip_funcs = {
.resume = gmc_v8_0_resume,
.is_idle = gmc_v8_0_is_idle,
.wait_for_idle = gmc_v8_0_wait_for_idle,
- .check_soft_reset = gmc_v8_0_check_soft_reset,
- .pre_soft_reset = gmc_v8_0_pre_soft_reset,
- .soft_reset = gmc_v8_0_soft_reset,
- .post_soft_reset = gmc_v8_0_post_soft_reset,
.set_clockgating_state = gmc_v8_0_set_clockgating_state,
.set_powergating_state = gmc_v8_0_set_powergating_state,
.get_clockgating_state = gmc_v8_0_get_clockgating_state,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 8a5c44810ba1..1fcc0594fd0a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -57,6 +57,7 @@
#include "umc_v6_0.h"
#include "umc_v6_7.h"
#include "umc_v12_0.h"
+#include "ras_umc_v12_0.h"
#include "hdp_v4_0.h"
#include "mca_v3_0.h"
@@ -1382,7 +1383,7 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev)
case IP_VERSION(12, 0, 0):
case IP_VERSION(12, 5, 0):
adev->umc.max_ras_err_cnt_per_query =
- UMC_V12_0_TOTAL_CHANNEL_NUM(adev) * UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL;
+ UMC_V12_0_TOTAL_CHANNEL_NUM * UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL;
adev->umc.channel_inst_num = UMC_V12_0_CHANNEL_INSTANCE_NUM;
adev->umc.umc_inst_num = UMC_V12_0_UMC_INSTANCE_NUM;
adev->umc.node_inst_num /= UMC_V12_0_UMC_INSTANCE_NUM;
@@ -2025,11 +2026,19 @@ static int gmc_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
* The first KFD VMID is 8 for GPUs with graphics, 3 for
* compute-only GPUs. On compute-only GPUs that leaves 2 VMIDs
* for video processing.
+ *
+ * If kernel queues are disabled, allow KFD to use all vmids.
*/
- adev->vm_manager.first_kfd_vmid =
- (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 1) ||
- amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
- amdgpu_is_multi_aid(adev)) ?
+ if (adev->gfx.disable_kq &&
+ adev->jpeg.disable_kq &&
+ adev->vcn.disable_kq &&
+ adev->sdma.no_user_submission)
+ adev->vm_manager.first_kfd_vmid = 1;
+ else
+ adev->vm_manager.first_kfd_vmid =
+ (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 1) ||
+ amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
+ amdgpu_is_multi_aid(adev)) ?
3 :
8;
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index d8204fbc198d..0fdc32b3ae91 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -119,6 +119,19 @@ static int jpeg_v4_0_3_early_init(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
+ switch (amdgpu_user_queue) {
+ case -1:
+ case 0:
+ default:
+ adev->jpeg.disable_kq = false;
+ adev->jpeg.disable_uq = true;
+ break;
+ case 2:
+ adev->jpeg.disable_kq = true;
+ adev->jpeg.disable_uq = true;
+ break;
+ }
+
adev->jpeg.num_jpeg_rings = AMDGPU_MAX_JPEG_RINGS_4_0_3;
jpeg_v4_0_3_set_dec_ring_funcs(adev);
@@ -175,6 +188,10 @@ static int jpeg_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) {
ring = &adev->jpeg.inst[i].ring_dec[j];
ring->use_doorbell = true;
+ if (adev->jpeg.disable_kq) {
+ ring->no_scheduler = true;
+ ring->no_user_submission = true;
+ }
ring->vm_hub = AMDGPU_MMHUB0(adev->jpeg.inst[i].aid_id);
if (!amdgpu_sriov_vf(adev)) {
ring->doorbell_index =
@@ -1425,72 +1442,6 @@ static const struct amdgpu_ras_block_hw_ops jpeg_v4_0_3_ras_hw_ops = {
.query_poison_status = jpeg_v4_0_3_query_ras_poison_status,
};
-static int jpeg_v4_0_3_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- struct aca_bank_info info;
- u64 misc0;
- int ret;
-
- ret = aca_bank_info_decode(bank, &info);
- if (ret)
- return ret;
-
- misc0 = bank->regs[ACA_REG_IDX_MISC0];
- switch (type) {
- case ACA_SMU_TYPE_UE:
- bank->aca_err_type = ACA_ERROR_TYPE_UE;
- ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
- 1ULL);
- break;
- case ACA_SMU_TYPE_CE:
- bank->aca_err_type = ACA_ERROR_TYPE_CE;
- ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
- ACA_REG__MISC0__ERRCNT(misc0));
- break;
- default:
- return -EINVAL;
- }
-
- return ret;
-}
-
-/* reference to smu driver if header file */
-static int jpeg_v4_0_3_err_codes[] = {
- 16, 17, 18, 19, 20, 21, 22, 23, /* JPEG[0-7][S|D] */
- 24, 25, 26, 27, 28, 29, 30, 31
-};
-
-static bool jpeg_v4_0_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- u32 instlo;
-
- instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
- instlo &= GENMASK(31, 1);
-
- if (instlo != mmSMNAID_AID0_MCA_SMU)
- return false;
-
- if (aca_bank_check_error_codes(handle->adev, bank,
- jpeg_v4_0_3_err_codes,
- ARRAY_SIZE(jpeg_v4_0_3_err_codes)))
- return false;
-
- return true;
-}
-
-static const struct aca_bank_ops jpeg_v4_0_3_aca_bank_ops = {
- .aca_bank_parser = jpeg_v4_0_3_aca_bank_parser,
- .aca_bank_is_valid = jpeg_v4_0_3_aca_bank_is_valid,
-};
-
-static const struct aca_info jpeg_v4_0_3_aca_info = {
- .hwip = ACA_HWIP_TYPE_SMU,
- .mask = ACA_ERROR_UE_MASK,
- .bank_ops = &jpeg_v4_0_3_aca_bank_ops,
-};
-
static int jpeg_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
int r;
@@ -1506,11 +1457,6 @@ static int jpeg_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_comm
goto late_fini;
}
- r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__JPEG,
- &jpeg_v4_0_3_aca_info, NULL);
- if (r)
- goto late_fini;
-
return 0;
late_fini:
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c
index ae3afc7ab326..8846cb3ed12b 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c
@@ -118,6 +118,19 @@ static int jpeg_v5_0_1_early_init(struct amdgpu_ip_block *ip_block)
if (!adev->jpeg.num_jpeg_inst || adev->jpeg.num_jpeg_inst > AMDGPU_MAX_JPEG_INSTANCES)
return -ENOENT;
+ switch (amdgpu_user_queue) {
+ case -1:
+ case 0:
+ default:
+ adev->jpeg.disable_kq = false;
+ adev->jpeg.disable_uq = true;
+ break;
+ case 2:
+ adev->jpeg.disable_kq = true;
+ adev->jpeg.disable_uq = true;
+ break;
+ }
+
adev->jpeg.num_jpeg_rings = AMDGPU_MAX_JPEG_RINGS;
jpeg_v5_0_1_set_dec_ring_funcs(adev);
jpeg_v5_0_1_set_irq_funcs(adev);
@@ -172,6 +185,10 @@ static int jpeg_v5_0_1_sw_init(struct amdgpu_ip_block *ip_block)
for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) {
ring = &adev->jpeg.inst[i].ring_dec[j];
ring->use_doorbell = true;
+ if (adev->jpeg.disable_kq) {
+ ring->no_scheduler = true;
+ ring->no_user_submission = true;
+ }
ring->vm_hub = AMDGPU_MMHUB0(adev->jpeg.inst[i].aid_id);
if (!amdgpu_sriov_vf(adev)) {
ring->doorbell_index =
@@ -871,10 +888,7 @@ static const struct amd_ip_funcs jpeg_v5_0_1_ip_funcs = {
.resume = jpeg_v5_0_1_resume,
.is_idle = jpeg_v5_0_1_is_idle,
.wait_for_idle = jpeg_v5_0_1_wait_for_idle,
- .check_soft_reset = NULL,
- .pre_soft_reset = NULL,
.soft_reset = NULL,
- .post_soft_reset = NULL,
.set_clockgating_state = jpeg_v5_0_1_set_clockgating_state,
.set_powergating_state = jpeg_v5_0_1_set_powergating_state,
.dump_ip_state = amdgpu_jpeg_dump_ip_state,
@@ -1003,73 +1017,6 @@ static const struct amdgpu_ras_block_hw_ops jpeg_v5_0_1_ras_hw_ops = {
.query_poison_status = jpeg_v5_0_1_query_ras_poison_status,
};
-static int jpeg_v5_0_1_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- struct aca_bank_info info;
- u64 misc0;
- int ret;
-
- ret = aca_bank_info_decode(bank, &info);
- if (ret)
- return ret;
-
- misc0 = bank->regs[ACA_REG_IDX_MISC0];
- switch (type) {
- case ACA_SMU_TYPE_UE:
- bank->aca_err_type = ACA_ERROR_TYPE_UE;
- ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
- 1ULL);
- break;
- case ACA_SMU_TYPE_CE:
- bank->aca_err_type = ACA_ERROR_TYPE_CE;
- ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
- ACA_REG__MISC0__ERRCNT(misc0));
- break;
- default:
- return -EINVAL;
- }
-
- return ret;
-}
-
-/* reference to smu driver if header file */
-static int jpeg_v5_0_1_err_codes[] = {
- 16, 17, 18, 19, 20, 21, 22, 23, /* JPEG[0-9][S|D] */
- 24, 25, 26, 27, 28, 29, 30, 31,
- 48, 49, 50, 51,
-};
-
-static bool jpeg_v5_0_1_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- u32 instlo;
-
- instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
- instlo &= GENMASK(31, 1);
-
- if (instlo != mmSMNAID_AID0_MCA_SMU)
- return false;
-
- if (aca_bank_check_error_codes(handle->adev, bank,
- jpeg_v5_0_1_err_codes,
- ARRAY_SIZE(jpeg_v5_0_1_err_codes)))
- return false;
-
- return true;
-}
-
-static const struct aca_bank_ops jpeg_v5_0_1_aca_bank_ops = {
- .aca_bank_parser = jpeg_v5_0_1_aca_bank_parser,
- .aca_bank_is_valid = jpeg_v5_0_1_aca_bank_is_valid,
-};
-
-static const struct aca_info jpeg_v5_0_1_aca_info = {
- .hwip = ACA_HWIP_TYPE_SMU,
- .mask = ACA_ERROR_UE_MASK,
- .bank_ops = &jpeg_v5_0_1_aca_bank_ops,
-};
-
static int jpeg_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
int r;
@@ -1078,11 +1025,6 @@ static int jpeg_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_comm
if (r)
return r;
- r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__JPEG,
- &jpeg_v5_0_1_aca_info, NULL);
- if (r)
- goto late_fini;
-
if (amdgpu_ras_is_supported(adev, ras_block->block) &&
adev->jpeg.inst->ras_poison_irq.funcs) {
r = amdgpu_irq_get(adev, &adev->jpeg.inst->ras_poison_irq, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_2.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_2.c
index 7a4ecea6b39a..ff02f72352a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_2.c
@@ -690,10 +690,7 @@ static const struct amd_ip_funcs jpeg_v5_0_2_ip_funcs = {
.resume = jpeg_v5_0_2_resume,
.is_idle = jpeg_v5_0_2_is_idle,
.wait_for_idle = jpeg_v5_0_2_wait_for_idle,
- .check_soft_reset = NULL,
- .pre_soft_reset = NULL,
.soft_reset = NULL,
- .post_soft_reset = NULL,
.set_clockgating_state = jpeg_v5_0_2_set_clockgating_state,
.set_powergating_state = jpeg_v5_0_2_set_powergating_state,
.dump_ip_state = amdgpu_jpeg_dump_ip_state,
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index 16625c31bfd3..e947c16e694d 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -133,8 +133,8 @@ static int mes_userq_map(struct amdgpu_usermode_queue *queue)
queue_input.gang_quantum = 10000;
queue_input.paging = false;
- queue_input.process_context_addr = ctx->gpu_addr;
- queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ;
+ queue_input.process_context_addr = uq_mgr->proc_ctx_obj.gpu_addr;
+ queue_input.gang_context_addr = ctx->gpu_addr;
queue_input.inprocess_gang_priority = AMDGPU_MES_PRIORITY_LEVEL_NORMAL;
queue_input.gang_global_priority_level = convert_to_mes_priority(queue->priority);
@@ -169,7 +169,8 @@ static int mes_userq_unmap(struct amdgpu_usermode_queue *queue)
memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
queue_input.doorbell_offset = queue->doorbell_index;
- queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ;
+ queue_input.gang_context_addr = ctx->gpu_addr;
+ queue_input.queue_type = queue->queue_type;
amdgpu_mes_lock(&adev->mes);
r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input);
@@ -179,6 +180,63 @@ static int mes_userq_unmap(struct amdgpu_usermode_queue *queue)
return r;
}
+int mes_userq_reset(struct amdgpu_usermode_queue *queue)
+{
+ struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
+ struct amdgpu_device *adev = uq_mgr->adev;
+ struct mes_reset_queue_input queue_input;
+ int r;
+
+ /* XXX: add a FW version check for SDMA per queue reset */
+ memset(&queue_input, 0x0, sizeof(struct mes_reset_queue_input));
+ queue_input.doorbell_offset = queue->doorbell_index;
+ queue_input.queue_type = queue->queue_type;
+
+ amdgpu_mes_lock(&adev->mes);
+ r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input);
+ amdgpu_mes_unlock(&adev->mes);
+ if (r)
+ return r;
+ return mes_userq_unmap(queue);
+}
+
+int mes_userq_reset_queue(struct amdgpu_device *adev,
+ struct amdgpu_usermode_queue *guilty_uq,
+ int queue_type,
+ unsigned int pipe,
+ unsigned int queue,
+ unsigned int db)
+{
+ struct amdgpu_usermode_queue *uq;
+ bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
+ unsigned long uq_id;
+ int r;
+
+ xa_for_each(&adev->userq_doorbell_xa, uq_id, uq) {
+ if (uq->queue_type == queue_type) {
+ if (uq == guilty_uq)
+ continue;
+ if (uq->doorbell_index == db) {
+ uq->state = AMDGPU_USERQ_STATE_HUNG;
+ if (use_mmio)
+ r = amdgpu_mes_reset_queue_mmio(adev, queue_type, 0, 1, pipe, queue, 0);
+ else
+ r = amdgpu_mes_reset_user_queue(adev, queue_type, db, 0);
+ if (r)
+ return r;
+ r = mes_userq_unmap(uq);
+ if (r)
+ return r;
+ atomic_inc(&adev->gpu_reset_counter);
+ amdgpu_userq_fence_driver_force_completion(uq);
+ drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
+ break;
+ }
+ }
+ }
+ return 0;
+}
+
static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue,
struct drm_amdgpu_userq_in *mqd_user)
@@ -186,12 +244,8 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_userq_obj *ctx = &queue->fw_obj;
int r, size;
- /*
- * The FW expects at least one page space allocated for
- * process ctx and gang ctx each. Create an object
- * for the same.
- */
- size = AMDGPU_USERQ_PROC_CTX_SZ + AMDGPU_USERQ_GANG_CTX_SZ;
+ /* The FW expects at least one page space allocated for gang ctx. */
+ size = AMDGPU_USERQ_GANG_CTX_SZ;
r = amdgpu_bo_create_kernel(uq_mgr->adev, size, 0,
AMDGPU_GEM_DOMAIN_GTT,
&ctx->obj, &ctx->gpu_addr,
@@ -205,54 +259,26 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr,
return 0;
}
-static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
- int queue_type)
+static int mes_userq_create_proc_ctx_space(struct amdgpu_userq_mgr *uq_mgr)
{
- int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
- struct mes_detect_and_reset_queue_input input;
- struct amdgpu_usermode_queue *queue;
- unsigned int hung_db_num = 0;
- unsigned long queue_id;
- u32 db_array[8];
- bool found_hung_queue = false;
- int r, i;
-
- if (db_array_size > 8) {
- dev_err(adev->dev, "DB array size (%d vs 8) too small\n",
- db_array_size);
- return -EINVAL;
- }
-
- memset(&input, 0x0, sizeof(struct mes_detect_and_reset_queue_input));
+ int r = 0;
- input.queue_type = queue_type;
-
- amdgpu_mes_lock(&adev->mes);
- r = amdgpu_mes_detect_and_reset_hung_queues(adev, queue_type, false,
- &hung_db_num, db_array, 0);
- amdgpu_mes_unlock(&adev->mes);
- if (r) {
- dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r);
- } else if (hung_db_num) {
- xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
- if (queue->queue_type == queue_type) {
- for (i = 0; i < hung_db_num; i++) {
- if (queue->doorbell_index == db_array[i]) {
- queue->state = AMDGPU_USERQ_STATE_HUNG;
- found_hung_queue = true;
- atomic_inc(&adev->gpu_reset_counter);
- amdgpu_userq_fence_driver_force_completion(queue);
- drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
- }
- }
- }
- }
+ mutex_lock(&uq_mgr->proc_ctx_lock);
+ /* This check is necessary because amdgpu_bo_create_kernel()
+ * calls helpers like amdgpu_bo_pin() and memset() unconditionally
+ */
+ if (!uq_mgr->proc_ctx_obj.obj) {
+ r = amdgpu_bo_create_kernel(uq_mgr->adev, AMDGPU_USERQ_PROC_CTX_SZ,
+ 0, AMDGPU_GEM_DOMAIN_GTT,
+ &uq_mgr->proc_ctx_obj.obj,
+ &uq_mgr->proc_ctx_obj.gpu_addr,
+ &uq_mgr->proc_ctx_obj.cpu_ptr);
+
+ if (!r)
+ memset(uq_mgr->proc_ctx_obj.cpu_ptr, 0, AMDGPU_USERQ_PROC_CTX_SZ);
}
- if (found_hung_queue) {
- /* Resume scheduling after hang recovery */
- r = amdgpu_mes_resume(adev, input.xcc_id);
- }
+ mutex_unlock(&uq_mgr->proc_ctx_lock);
return r;
}
@@ -429,7 +455,14 @@ static int mes_userq_mqd_create(struct amdgpu_usermode_queue *queue,
goto free_mqd;
}
- /* Create BO for FW operations */
+ /* Create per-process MES process context BO */
+ r = mes_userq_create_proc_ctx_space(uq_mgr);
+ if (r) {
+ DRM_ERROR("Failed to allocate MES process context space bo, error: %d\n", r);
+ goto free_mqd;
+ }
+
+ /* Create BO of a gang for FW operations */
r = mes_userq_create_ctx_space(uq_mgr, queue, mqd_user);
if (r) {
DRM_ERROR("Failed to allocate BO for userqueue (%d)", r);
@@ -497,7 +530,7 @@ static int mes_userq_preempt(struct amdgpu_usermode_queue *queue)
*fence_ptr = 0;
memset(&queue_input, 0x0, sizeof(struct mes_suspend_gang_input));
- queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ;
+ queue_input.gang_context_addr = ctx->gpu_addr;
queue_input.suspend_fence_addr = fence_gpu_addr;
queue_input.suspend_fence_value = 1;
amdgpu_mes_lock(&adev->mes);
@@ -534,7 +567,7 @@ static int mes_userq_restore(struct amdgpu_usermode_queue *queue)
return 0;
memset(&queue_input, 0x0, sizeof(struct mes_resume_gang_input));
- queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ;
+ queue_input.gang_context_addr = ctx->gpu_addr;
amdgpu_mes_lock(&adev->mes);
r = adev->mes.funcs->resume_gang(&adev->mes, &queue_input);
@@ -549,7 +582,7 @@ const struct amdgpu_userq_funcs userq_mes_funcs = {
.mqd_destroy = mes_userq_mqd_destroy,
.unmap = mes_userq_unmap,
.map = mes_userq_map,
- .detect_and_reset = mes_userq_detect_and_reset,
.preempt = mes_userq_preempt,
.restore = mes_userq_restore,
+ .reset = mes_userq_reset,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.h b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.h
index 090ae8897770..a473360d6a8b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.h
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.h
@@ -27,4 +27,13 @@
#include "amdgpu_userq.h"
extern const struct amdgpu_userq_funcs userq_mes_funcs;
+
+int mes_userq_reset(struct amdgpu_usermode_queue *queue);
+int mes_userq_reset_queue(struct amdgpu_device *adev,
+ struct amdgpu_usermode_queue *guilty_uq,
+ int queue_type,
+ unsigned int pipe,
+ unsigned int queue,
+ unsigned int db);
+
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 1b071a3de173..8f136ff7d96f 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -387,6 +387,8 @@ static int mes_v11_0_remove_hw_queue(struct amdgpu_mes *mes,
mes_remove_queue_pkt.doorbell_offset = input->doorbell_offset;
mes_remove_queue_pkt.gang_context_addr = input->gang_context_addr;
+ mes_remove_queue_pkt.queue_type =
+ convert_to_mes_queue_type(input->queue_type);
if (mes_rev >= 0x60)
mes_remove_queue_pkt.remove_queue_after_reset = input->remove_queue_after_reset;
@@ -396,6 +398,230 @@ static int mes_v11_0_remove_hw_queue(struct amdgpu_mes *mes,
offsetof(union MESAPI__REMOVE_QUEUE, api_status));
}
+static bool mes_v11_0_pipe_reset_support(struct amdgpu_device *adev)
+{
+ /* Pipe reset stays disabled (warn once, always return false) until the CP firmware fully supports it. */
+ dev_warn_once(adev->dev, "The CPFW hasn't support pipe reset yet.\n");
+ return false;
+}
+static int mes_v11_0_reset_gfx_pipe_mmio(struct amdgpu_device *adev,
+ u32 me, u32 pipe, u32 queue)
+{
+ uint32_t reset_pipe = 0, clean_pipe = 0;
+ int r;
+ /* Gated: bails out with -EOPNOTSUPP while mes_v11_0_pipe_reset_support() returns false. */
+ if (!mes_v11_0_pipe_reset_support(adev))
+ return -EOPNOTSUPP;
+
+ amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
+ mutex_lock(&adev->srbm_mutex);
+ soc21_grbm_select(adev, me, pipe, queue, 0);
+
+ switch (pipe) {
+ case 0:
+ reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
+ PFP_PIPE0_RESET, 1);
+ reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
+ ME_PIPE0_RESET, 1);
+ clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
+ PFP_PIPE0_RESET, 0);
+ clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
+ ME_PIPE0_RESET, 0);
+ break;
+ case 1:
+ reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
+ PFP_PIPE1_RESET, 1);
+ reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
+ ME_PIPE1_RESET, 1);
+ clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
+ PFP_PIPE1_RESET, 0);
+ clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
+ ME_PIPE1_RESET, 0);
+ break;
+ default:
+ break;
+ }
+ /* Pulse the reset: write the value with the PFP/ME reset bits set, then the cleared one (both start from 0). */
+ WREG32_SOC15(GC, 0, regCP_ME_CNTL, reset_pipe);
+ WREG32_SOC15(GC, 0, regCP_ME_CNTL, clean_pipe);
+ /* r is 0 only when the RS64 instruction pointer (<< 2) equals RS64_FW_UC_START_ADDR_LO. */
+ r = (RREG32(SOC15_REG_OFFSET(GC, 0, regCP_GFX_RS64_INSTR_PNTR1)) << 2) -
+ RS64_FW_UC_START_ADDR_LO;
+ soc21_grbm_select(adev, 0, 0, 0, 0);
+ mutex_unlock(&adev->srbm_mutex);
+ amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+
+ dev_info(adev->dev, "The gfx pipe reset to the ME firmware start PC: %s\n",
+ r == 0 ? "successfully" : "failed");
+ /* FIXME: Sometimes the driver can't read back the ME firmware start PC correctly,
+ * so r is only logged; 0 is always returned and the later gfx ring test decides.
+ */
+ return 0;
+}
+
+/*
+ * With MEC pipe reset asserted, clear CP_HQD_ACTIVE / CP_HQD_DEQUEUE_REQUEST for
+ * every queue on (me, pipe). HQDs must be torn down while pipe reset stays
+ * asserted; only then clear the pipe reset bit.
+ * Caller must hold adev->srbm_mutex; GRBM is left selecting the pipe's last queue.
+ */
+static void mes_v11_0_clear_hqds_on_mec_pipe(struct amdgpu_device *adev, u32 me,
+ u32 pipe)
+{
+ unsigned int q;
+
+ for (q = 0; q < adev->gfx.mec.num_queue_per_pipe; q++) {
+ soc21_grbm_select(adev, me, pipe, q, 0);
+ /* Force the HQD inactive, then clear any pending dequeue request. */
+ WREG32_SOC15(GC, 0, regCP_HQD_ACTIVE, 0);
+ WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0);
+ }
+}
+
+static int mes_v11_0_reset_compute_pipe_mmio(struct amdgpu_device *adev,
+ u32 me, u32 pipe, u32 queue)
+{
+ uint32_t reset_val, clean_val;
+ int r;
+ /* Pulse the MEC pipe reset bit for (me, pipe) via MMIO, clearing that pipe's HQDs while reset is asserted. */
+ amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
+ mutex_lock(&adev->srbm_mutex);
+ soc21_grbm_select(adev, me, pipe, queue, 0);
+
+ if (adev->gfx.rs64_enable) {
+ reset_val = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL);
+ clean_val = reset_val;
+ /* For a pipe outside 0-3 both values stay equal to the register contents just read. */
+ switch (pipe) {
+ case 0:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE0_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE0_RESET, 0);
+ break;
+ case 1:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE1_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE1_RESET, 0);
+ break;
+ case 2:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE2_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE2_RESET, 0);
+ break;
+ case 3:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE3_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE3_RESET, 0);
+ break;
+ default:
+ break;
+ }
+ WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_val);
+ mes_v11_0_clear_hqds_on_mec_pipe(adev, me, pipe); /* leaves GRBM on the pipe's last queue */
+ WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_val);
+ r = (RREG32_SOC15(GC, 0, regCP_MEC_RS64_INSTR_PNTR) << 2) -
+ RS64_FW_UC_START_ADDR_LO;
+ } else {
+ reset_val = RREG32_SOC15(GC, 0, regCP_MEC_CNTL);
+ clean_val = reset_val;
+ /* Legacy (non-RS64) path: me == 1 uses the ME1 reset bits, any other me the ME2 bits. */
+ if (me == 1) {
+ switch (pipe) {
+ case 0:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE0_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE0_RESET, 0);
+ break;
+ case 1:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE1_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE1_RESET, 0);
+ break;
+ case 2:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE2_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE2_RESET, 0);
+ break;
+ case 3:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE3_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE3_RESET, 0);
+ break;
+ default:
+ break;
+ }
+ /* mec1 fw pc: CP_MEC1_INSTR_PNTR */
+ } else {
+ switch (pipe) {
+ case 0:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE0_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE0_RESET, 0);
+ break;
+ case 1:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE1_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE1_RESET, 0);
+ break;
+ case 2:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE2_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE2_RESET, 0);
+ break;
+ case 3:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE3_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME2_PIPE3_RESET, 0);
+ break;
+ default:
+ break;
+ }
+ /* mec2 fw pc: CP_MEC2_INSTR_PNTR */
+ }
+ WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_val);
+ mes_v11_0_clear_hqds_on_mec_pipe(adev, me, pipe);
+ WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_val);
+ r = RREG32(SOC15_REG_OFFSET(GC, 0, regCP_MEC1_INSTR_PNTR)); /* NOTE(review): MEC1 pointer is read for any me */
+ }
+
+ soc21_grbm_select(adev, 0, 0, 0, 0);
+ mutex_unlock(&adev->srbm_mutex);
+ amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+
+ dev_dbg(adev->dev, "MEC pipe me%u pipe%u queue%u resets to MEC FW start PC: %s\n",
+ me, pipe, queue, r == 0 ? "successfully" : "failed");
+ /* FIXME: Sometimes the driver can't read back the MEC firmware start PC correctly, so r is
+ * only logged; 0 is always returned and the compute ring test result decides.
+ */
+ return 0;
+}
+
+static int mes_v11_0_reset_pipe_mmio(struct amdgpu_mes *mes, uint32_t queue_type,
+ uint32_t me_id, uint32_t pipe_id,
+ uint32_t queue_id, uint32_t vmid)
+{
+ struct amdgpu_device *adev = mes->adev;
+ /* Dispatch the MMIO pipe reset by ring type; only GFX and COMPUTE are handled, vmid is unused. */
+ if (queue_type == AMDGPU_RING_TYPE_GFX)
+ return mes_v11_0_reset_gfx_pipe_mmio(adev, me_id, pipe_id, queue_id);
+ else if (queue_type == AMDGPU_RING_TYPE_COMPUTE)
+ return mes_v11_0_reset_compute_pipe_mmio(adev, me_id, pipe_id, queue_id);
+ else
+ return -EOPNOTSUPP;
+}
+
static int mes_v11_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_type,
uint32_t me_id, uint32_t pipe_id,
uint32_t queue_id, uint32_t vmid)
@@ -770,10 +996,16 @@ static int mes_v11_0_reset_hw_queue(struct amdgpu_mes *mes,
{
union MESAPI__RESET mes_reset_queue_pkt;
- if (input->use_mmio)
- return mes_v11_0_reset_queue_mmio(mes, input->queue_type,
- input->me_id, input->pipe_id,
- input->queue_id, input->vmid);
+ if (input->use_mmio) {
+ int r = mes_v11_0_reset_queue_mmio(mes, input->queue_type,
+ input->me_id, input->pipe_id,
+ input->queue_id, input->vmid);
+ if (r)
+ return mes_v11_0_reset_pipe_mmio(mes, input->queue_type,
+ input->me_id, input->pipe_id,
+ input->queue_id, input->vmid);
+ return 0;
+ }
memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index b6cbc25e1ab4..ce5064200743 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -26,7 +26,7 @@
#include "amdgpu.h"
#include "gfx_v12_0.h"
#include "soc15_common.h"
-#include "soc21.h"
+#include "soc24.h"
#include "gc/gc_12_0_0_offset.h"
#include "gc/gc_12_0_0_sh_mask.h"
#include "gc/gc_11_0_0_default.h"
@@ -371,6 +371,8 @@ static int mes_v12_0_remove_hw_queue(struct amdgpu_mes *mes,
mes_remove_queue_pkt.doorbell_offset = input->doorbell_offset;
mes_remove_queue_pkt.gang_context_addr = input->gang_context_addr;
+ mes_remove_queue_pkt.queue_type =
+ convert_to_mes_queue_type(input->queue_type);
if (mes_rev >= 0x5a)
mes_remove_queue_pkt.remove_queue_after_reset = input->remove_queue_after_reset;
@@ -413,6 +415,171 @@ int gfx_v12_0_request_gfx_index_mutex(struct amdgpu_device *adev,
return 0;
}
+static bool mes_v12_0_pipe_reset_support(struct amdgpu_device *adev)
+{
+ /* Pipe reset stays disabled (warn once, always return false) until the CP firmware fully supports it. */
+ dev_warn_once(adev->dev, "The CPFW hasn't support pipe reset yet.\n");
+ return false;
+}
+
+static int mes_v12_0_reset_gfx_pipe_mmio(struct amdgpu_device *adev,
+ u32 me, u32 pipe, u32 queue)
+{
+ uint32_t reset_pipe = 0, clean_pipe = 0;
+ int r;
+ /* Gated: bails out with -EOPNOTSUPP while mes_v12_0_pipe_reset_support() returns false. */
+ if (!mes_v12_0_pipe_reset_support(adev))
+ return -EOPNOTSUPP;
+
+ amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
+ mutex_lock(&adev->srbm_mutex);
+ soc24_grbm_select(adev, me, pipe, queue, 0);
+
+ switch (pipe) {
+ case 0:
+ reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
+ PFP_PIPE0_RESET, 1);
+ reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
+ ME_PIPE0_RESET, 1);
+ clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
+ PFP_PIPE0_RESET, 0);
+ clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
+ ME_PIPE0_RESET, 0);
+ break;
+ case 1:
+ reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
+ PFP_PIPE1_RESET, 1);
+ reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
+ ME_PIPE1_RESET, 1);
+ clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
+ PFP_PIPE1_RESET, 0);
+ clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
+ ME_PIPE1_RESET, 0);
+ break;
+ default:
+ break;
+ }
+ /* Pulse the reset: write the value with the PFP/ME reset bits set, then the cleared one (both start from 0). */
+ WREG32_SOC15(GC, 0, regCP_ME_CNTL, reset_pipe);
+ WREG32_SOC15(GC, 0, regCP_ME_CNTL, clean_pipe);
+ /* r is 0 only when the RS64 instruction pointer (<< 2) equals RS64_FW_UC_START_ADDR_LO. */
+ r = (RREG32(SOC15_REG_OFFSET(GC, 0, regCP_GFX_RS64_INSTR_PNTR1)) << 2) -
+ RS64_FW_UC_START_ADDR_LO;
+ soc24_grbm_select(adev, 0, 0, 0, 0);
+ mutex_unlock(&adev->srbm_mutex);
+ amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+
+ dev_info(adev->dev, "The gfx pipe reset: %s\n",
+ r == 0 ? "successfully" : "failed");
+ /* Sometimes the ME start PC can't be read back correctly, so the
+ * PC check is only a reference; 0 is always returned and the pipe
+ * reset result relies on the later ring test.
+ */
+ return 0;
+}
+
+/*
+ * With MEC pipe reset asserted, clear CP_HQD_ACTIVE / CP_HQD_DEQUEUE_REQUEST for
+ * every queue on (me, pipe). HQDs must be torn down while pipe reset stays
+ * asserted; only then clear the pipe reset bit.
+ * Caller must hold adev->srbm_mutex; GRBM is left selecting the pipe's last queue.
+ */
+static void mes_v12_0_clear_hqds_on_mec_pipe(struct amdgpu_device *adev, u32 me,
+ u32 pipe)
+{
+ unsigned int q;
+
+ for (q = 0; q < adev->gfx.mec.num_queue_per_pipe; q++) {
+ soc24_grbm_select(adev, me, pipe, q, 0);
+ /* Force the HQD inactive, then clear any pending dequeue request. */
+ WREG32_SOC15(GC, 0, regCP_HQD_ACTIVE, 0);
+ WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0);
+ }
+}
+
+static int mes_v12_0_reset_compute_pipe_mmio(struct amdgpu_device *adev,
+ u32 me, u32 pipe, u32 queue)
+{
+ uint32_t reset_val, clean_val;
+ int r = 0;
+ /* Pulse the MEC pipe reset bit for (me, pipe) via MMIO, clearing that pipe's HQDs while reset is asserted. */
+ amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
+ mutex_lock(&adev->srbm_mutex);
+ soc24_grbm_select(adev, me, pipe, queue, 0);
+ if (adev->gfx.rs64_enable) {
+ reset_val = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL);
+ clean_val = reset_val;
+ /* For a pipe outside 0-3 both values stay equal to the register contents just read. */
+ switch (pipe) {
+ case 0:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE0_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE0_RESET, 0);
+ break;
+ case 1:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE1_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE1_RESET, 0);
+ break;
+ case 2:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE2_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE2_RESET, 0);
+ break;
+ case 3:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE3_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+ MEC_PIPE3_RESET, 0);
+ break;
+ default:
+ break;
+ }
+ WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_val);
+ mes_v12_0_clear_hqds_on_mec_pipe(adev, me, pipe);
+ soc24_grbm_select(adev, me, pipe, queue, 0); /* clear_hqds left GRBM on the pipe's last queue */
+ WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_val);
+ r = (RREG32_SOC15(GC, 0, regCP_MEC_RS64_INSTR_PNTR) << 2) -
+ RS64_FW_UC_START_ADDR_LO;
+ } else {
+ reset_val = RREG32_SOC15(GC, 0, regCP_MEC_CNTL);
+ clean_val = reset_val;
+ /* Legacy (non-RS64) path: only the ME1 pipe 0/1 reset bits are handled, whatever me is. */
+ switch (pipe) {
+ case 0:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE0_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE0_RESET, 0);
+ break;
+ case 1:
+ reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE1_RESET, 1);
+ clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+ MEC_ME1_PIPE1_RESET, 0);
+ break;
+ default:
+ break;
+ }
+
+ WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_val);
+ mes_v12_0_clear_hqds_on_mec_pipe(adev, me, pipe);
+ soc24_grbm_select(adev, me, pipe, queue, 0);
+ WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_val);
+ }
+
+ soc24_grbm_select(adev, 0, 0, 0, 0);
+ mutex_unlock(&adev->srbm_mutex);
+ amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
+ /* r is only updated on the RS64 path; the legacy path leaves it 0 and so always logs "successfully". */
+ dev_dbg(adev->dev, "MEC pipe me%u pipe%u queue%u resets to MEC FW start PC: %s\n",
+ me, pipe, queue, r == 0 ? "successfully" : "failed");
+ return 0; /* r is informational only */
+}
+
static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_type,
uint32_t me_id, uint32_t pipe_id,
uint32_t queue_id, uint32_t vmid)
@@ -442,7 +609,7 @@ static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_typ
mutex_unlock(&adev->gfx.reset_sem_mutex);
mutex_lock(&adev->srbm_mutex);
- soc21_grbm_select(adev, me_id, pipe_id, queue_id, 0);
+ soc24_grbm_select(adev, me_id, pipe_id, queue_id, 0);
/* wait till dequeue take effects */
for (i = 0; i < adev->usec_timeout; i++) {
if (!(RREG32_SOC15(GC, 0, regCP_GFX_HQD_ACTIVE) & 1))
@@ -454,13 +621,13 @@ static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_typ
r = -ETIMEDOUT;
}
- soc21_grbm_select(adev, 0, 0, 0, 0);
+ soc24_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(&adev->srbm_mutex);
} else if (queue_type == AMDGPU_RING_TYPE_COMPUTE) {
dev_info(adev->dev, "reset compute queue (%d:%d:%d)\n",
me_id, pipe_id, queue_id);
mutex_lock(&adev->srbm_mutex);
- soc21_grbm_select(adev, me_id, pipe_id, queue_id, 0);
+ soc24_grbm_select(adev, me_id, pipe_id, queue_id, 0);
WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0x2);
WREG32_SOC15(GC, 0, regSPI_COMPUTE_QUEUE_RESET, 0x1);
@@ -474,7 +641,7 @@ static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_typ
dev_err(adev->dev, "failed to wait on hqd deactivate\n");
r = -ETIMEDOUT;
}
- soc21_grbm_select(adev, 0, 0, 0, 0);
+ soc24_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(&adev->srbm_mutex);
} else if (queue_type == AMDGPU_RING_TYPE_SDMA) {
dev_info(adev->dev, "reset sdma queue (%d:%d:%d)\n",
@@ -507,6 +674,20 @@ static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_typ
return r;
}
+static int mes_v12_0_reset_pipe_mmio(struct amdgpu_mes *mes, uint32_t queue_type,
+ uint32_t me_id, uint32_t pipe_id,
+ uint32_t queue_id, uint32_t vmid)
+{
+ struct amdgpu_device *adev = mes->adev;
+ /* Dispatch the MMIO pipe reset by ring type; only GFX and COMPUTE are handled, vmid is unused. */
+ if (queue_type == AMDGPU_RING_TYPE_GFX)
+ return mes_v12_0_reset_gfx_pipe_mmio(adev, me_id, pipe_id, queue_id);
+ else if (queue_type == AMDGPU_RING_TYPE_COMPUTE)
+ return mes_v12_0_reset_compute_pipe_mmio(adev, me_id, pipe_id, queue_id);
+ else
+ return -EOPNOTSUPP;
+}
+
static int mes_v12_0_map_legacy_queue(struct amdgpu_mes *mes,
struct mes_map_legacy_queue_input *input)
{
@@ -528,10 +709,15 @@ static int mes_v12_0_map_legacy_queue(struct amdgpu_mes *mes,
convert_to_mes_queue_type(input->queue_type);
mes_add_queue_pkt.map_legacy_kq = 1;
- if (mes->adev->enable_uni_mes)
- pipe = AMDGPU_MES_KIQ_PIPE;
- else
+ if (mes->adev->enable_uni_mes) {
+ /* Keep scheduler queue on KIQ pipe; map all other kernel queues on sched pipe. */
+ if (input->queue_type == AMDGPU_RING_TYPE_MES)
+ pipe = AMDGPU_MES_KIQ_PIPE;
+ else
+ pipe = AMDGPU_MES_SCHED_PIPE;
+ } else {
pipe = AMDGPU_MES_SCHED_PIPE;
+ }
return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
&mes_add_queue_pkt, sizeof(mes_add_queue_pkt),
@@ -565,12 +751,28 @@ static int mes_v12_0_unmap_legacy_queue(struct amdgpu_mes *mes,
mes_remove_queue_pkt.unmap_legacy_queue = 1;
mes_remove_queue_pkt.queue_type =
convert_to_mes_queue_type(input->queue_type);
+ /*
+ * A reset-time unmap: the queue was already reset via MMIO while
+ * gangs are suspended and it is on the MES hung/fail list. Tell
+ * MES to just drop its internal state for it. Without this flag
+ * MES asks CP to unmap the already-reset (still wedged) queue
+ * again, which times out and forces a GPU reset.
+ */
+ if (input->action == RESET_QUEUES &&
+ (mes->sched_version & AMDGPU_MES_VERSION_MASK) >= 0x5a)
+ mes_remove_queue_pkt.remove_queue_after_reset = 1;
+
}
- if (mes->adev->enable_uni_mes)
- pipe = AMDGPU_MES_KIQ_PIPE;
- else
+ if (mes->adev->enable_uni_mes) {
+ /* Keep scheduler queue on KIQ pipe; unmap all other kernel queues on sched pipe. */
+ if (input->queue_type == AMDGPU_RING_TYPE_MES)
+ pipe = AMDGPU_MES_KIQ_PIPE;
+ else
+ pipe = AMDGPU_MES_SCHED_PIPE;
+ } else {
pipe = AMDGPU_MES_SCHED_PIPE;
+ }
return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
&mes_remove_queue_pkt, sizeof(mes_remove_queue_pkt),
@@ -888,10 +1090,16 @@ static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes,
union MESAPI__RESET mes_reset_queue_pkt;
int pipe;
- if (input->use_mmio)
- return mes_v12_0_reset_queue_mmio(mes, input->queue_type,
- input->me_id, input->pipe_id,
- input->queue_id, input->vmid);
+ if (input->use_mmio) {
+ int r = mes_v12_0_reset_queue_mmio(mes, input->queue_type,
+ input->me_id, input->pipe_id,
+ input->queue_id, input->vmid);
+ if (r)
+ return mes_v12_0_reset_pipe_mmio(mes, input->queue_type,
+ input->me_id, input->pipe_id,
+ input->queue_id, input->vmid);
+ return 0;
+ }
memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt));
@@ -915,10 +1123,7 @@ static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes,
mes_reset_queue_pkt.doorbell_offset = input->doorbell_offset;
}
- if (input->is_kq)
- pipe = AMDGPU_MES_KIQ_PIPE;
- else
- pipe = AMDGPU_MES_SCHED_PIPE;
+ pipe = AMDGPU_MES_SCHED_PIPE;
return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
&mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt),
@@ -1094,7 +1299,7 @@ static void mes_v12_0_enable(struct amdgpu_device *adev, bool enable)
if (enable) {
mutex_lock(&adev->srbm_mutex);
for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
- soc21_grbm_select(adev, 3, pipe, 0, 0);
+ soc24_grbm_select(adev, 3, pipe, 0, 0);
if (amdgpu_mes_log_enable) {
u32 log_size = AMDGPU_MES_LOG_BUFFER_SIZE + AMDGPU_MES_MSCRATCH_SIZE;
/* In case uni mes is not enabled, only program for pipe 0 */
@@ -1133,7 +1338,7 @@ static void mes_v12_0_enable(struct amdgpu_device *adev, bool enable)
WREG32_SOC15(GC, 0, regCP_MES_CNTL, data);
}
- soc21_grbm_select(adev, 0, 0, 0, 0);
+ soc24_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(&adev->srbm_mutex);
if (amdgpu_emu_mode)
@@ -1165,7 +1370,7 @@ static void mes_v12_0_set_ucode_start_addr(struct amdgpu_device *adev)
mutex_lock(&adev->srbm_mutex);
for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
/* me=3, queue=0 */
- soc21_grbm_select(adev, 3, pipe, 0, 0);
+ soc24_grbm_select(adev, 3, pipe, 0, 0);
/* set ucode start address */
ucode_addr = adev->mes.uc_start_addr[pipe] >> 2;
@@ -1174,7 +1379,7 @@ static void mes_v12_0_set_ucode_start_addr(struct amdgpu_device *adev)
WREG32_SOC15(GC, 0, regCP_MES_PRGRM_CNTR_START_HI,
upper_32_bits(ucode_addr));
- soc21_grbm_select(adev, 0, 0, 0, 0);
+ soc24_grbm_select(adev, 0, 0, 0, 0);
}
mutex_unlock(&adev->srbm_mutex);
}
@@ -1203,7 +1408,7 @@ static int mes_v12_0_load_microcode(struct amdgpu_device *adev,
mutex_lock(&adev->srbm_mutex);
/* me=3, pipe=0, queue=0 */
- soc21_grbm_select(adev, 3, pipe, 0, 0);
+ soc24_grbm_select(adev, 3, pipe, 0, 0);
WREG32_SOC15(GC, 0, regCP_MES_IC_BASE_CNTL, 0);
@@ -1238,7 +1443,7 @@ static int mes_v12_0_load_microcode(struct amdgpu_device *adev,
WREG32_SOC15(GC, 0, regCP_MES_IC_OP_CNTL, data);
}
- soc21_grbm_select(adev, 0, 0, 0, 0);
+ soc24_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(&adev->srbm_mutex);
return 0;
@@ -1385,7 +1590,7 @@ static void mes_v12_0_queue_init_register(struct amdgpu_ring *ring)
uint32_t data = 0;
mutex_lock(&adev->srbm_mutex);
- soc21_grbm_select(adev, 3, ring->pipe, 0, 0);
+ soc24_grbm_select(adev, 3, ring->pipe, 0, 0);
/* set CP_HQD_VMID.VMID = 0. */
data = RREG32_SOC15(GC, 0, regCP_HQD_VMID);
@@ -1436,7 +1641,7 @@ static void mes_v12_0_queue_init_register(struct amdgpu_ring *ring)
/* set CP_HQD_ACTIVE.ACTIVE=1 */
WREG32_SOC15(GC, 0, regCP_HQD_ACTIVE, mqd->cp_hqd_active);
- soc21_grbm_select(adev, 0, 0, 0, 0);
+ soc24_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(&adev->srbm_mutex);
}
@@ -1502,14 +1707,14 @@ static int mes_v12_0_queue_init(struct amdgpu_device *adev,
((pipe == AMDGPU_MES_KIQ_PIPE) && !adev->mes.kiq_version)) {
/* get MES scheduler/KIQ versions */
mutex_lock(&adev->srbm_mutex);
- soc21_grbm_select(adev, 3, pipe, 0, 0);
+ soc24_grbm_select(adev, 3, pipe, 0, 0);
if (pipe == AMDGPU_MES_SCHED_PIPE)
adev->mes.sched_version = RREG32_SOC15(GC, 0, regCP_MES_GP3_LO);
else if (pipe == AMDGPU_MES_KIQ_PIPE && adev->enable_mes_kiq)
adev->mes.kiq_version = RREG32_SOC15(GC, 0, regCP_MES_GP3_LO);
- soc21_grbm_select(adev, 0, 0, 0, 0);
+ soc24_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(&adev->srbm_mutex);
}
@@ -1697,7 +1902,7 @@ static void mes_v12_0_kiq_dequeue_sched(struct amdgpu_device *adev)
int i;
mutex_lock(&adev->srbm_mutex);
- soc21_grbm_select(adev, 3, AMDGPU_MES_SCHED_PIPE, 0, 0);
+ soc24_grbm_select(adev, 3, AMDGPU_MES_SCHED_PIPE, 0, 0);
/* disable the queue if it's active */
if (RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1) {
@@ -1721,7 +1926,7 @@ static void mes_v12_0_kiq_dequeue_sched(struct amdgpu_device *adev)
WREG32_SOC15(GC, 0, regCP_HQD_PQ_WPTR_HI, 0);
WREG32_SOC15(GC, 0, regCP_HQD_PQ_RPTR, 0);
- soc21_grbm_select(adev, 0, 0, 0, 0);
+ soc24_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(&adev->srbm_mutex);
adev->mes.ring[0].sched.ready = false;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
index e13535d94c51..f7d5879c6e44 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c
@@ -362,6 +362,8 @@ static int mes_v12_1_remove_hw_queue(struct amdgpu_mes *mes,
mes_remove_queue_pkt.doorbell_offset = input->doorbell_offset;
mes_remove_queue_pkt.gang_context_addr = input->gang_context_addr;
+ mes_remove_queue_pkt.queue_type =
+ convert_to_mes_queue_type(input->queue_type);
return mes_v12_1_submit_pkt_and_poll_completion(mes,
xcc_id, AMDGPU_MES_SCHED_PIPE,
@@ -417,10 +419,15 @@ static int mes_v12_1_map_legacy_queue(struct amdgpu_mes *mes,
convert_to_mes_queue_type(input->queue_type);
mes_add_queue_pkt.map_legacy_kq = 1;
- if (mes->adev->enable_uni_mes)
- pipe = AMDGPU_MES_KIQ_PIPE;
- else
+ if (mes->adev->enable_uni_mes) {
+ /* Keep scheduler queue on KIQ pipe; map all other kernel queues on sched pipe. */
+ if (input->queue_type == AMDGPU_RING_TYPE_MES)
+ pipe = AMDGPU_MES_KIQ_PIPE;
+ else
+ pipe = AMDGPU_MES_SCHED_PIPE;
+ } else {
pipe = AMDGPU_MES_SCHED_PIPE;
+ }
return mes_v12_1_submit_pkt_and_poll_completion(mes,
input->xcc_id, pipe,
@@ -457,10 +464,15 @@ static int mes_v12_1_unmap_legacy_queue(struct amdgpu_mes *mes,
convert_to_mes_queue_type(input->queue_type);
}
- if (mes->adev->enable_uni_mes)
- pipe = AMDGPU_MES_KIQ_PIPE;
- else
+ if (mes->adev->enable_uni_mes) {
+ /* Keep scheduler queue on KIQ pipe; unmap all other kernel queues on sched pipe. */
+ if (input->queue_type == AMDGPU_RING_TYPE_MES)
+ pipe = AMDGPU_MES_KIQ_PIPE;
+ else
+ pipe = AMDGPU_MES_SCHED_PIPE;
+ } else {
pipe = AMDGPU_MES_SCHED_PIPE;
+ }
return mes_v12_1_submit_pkt_and_poll_completion(mes,
input->xcc_id, pipe,
@@ -2262,6 +2274,7 @@ static int mes_v12_1_test_queue(struct amdgpu_device *adev, int xcc_id,
remove_queue.xcc_id = xcc_id;
remove_queue.doorbell_offset = doorbell_idx;
remove_queue.gang_context_addr = add_queue.gang_context_addr;
+ remove_queue.queue_type = queue_type;
r = mes_v12_1_remove_hw_queue(&adev->mes, &remove_queue);
error:
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index cc688ae79e84..47d07cd25fc4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
@@ -29,7 +29,6 @@
#include "soc15_common.h"
#include "soc15.h"
-#include "amdgpu_ras.h"
#include "amdgpu_psp.h"
#define regVM_L2_CNTL3_DEFAULT 0x80100007
@@ -636,236 +635,8 @@ const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {
.get_clockgating = mmhub_v1_8_get_clockgating,
};
-static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ce_reg_list[] = {
- {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA0_CE_ERR_STATUS_LO, regMMEA0_CE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA0"},
- {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA1_CE_ERR_STATUS_LO, regMMEA1_CE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA1"},
- {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA2_CE_ERR_STATUS_LO, regMMEA2_CE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA2"},
- {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA3_CE_ERR_STATUS_LO, regMMEA3_CE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA3"},
- {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA4_CE_ERR_STATUS_LO, regMMEA4_CE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA4"},
- {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMM_CANE_CE_ERR_STATUS_LO, regMM_CANE_CE_ERR_STATUS_HI),
- 1, 0, "MM_CANE"},
-};
-
-static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ue_reg_list[] = {
- {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA0_UE_ERR_STATUS_LO, regMMEA0_UE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA0"},
- {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA1_UE_ERR_STATUS_LO, regMMEA1_UE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA1"},
- {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA2_UE_ERR_STATUS_LO, regMMEA2_UE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA2"},
- {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA3_UE_ERR_STATUS_LO, regMMEA3_UE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA3"},
- {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA4_UE_ERR_STATUS_LO, regMMEA4_UE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA4"},
- {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMM_CANE_UE_ERR_STATUS_LO, regMM_CANE_UE_ERR_STATUS_HI),
- 1, 0, "MM_CANE"},
-};
-
-static const struct amdgpu_ras_memory_id_entry mmhub_v1_8_ras_memory_list[] = {
- {AMDGPU_MMHUB_WGMI_PAGEMEM, "MMEA_WGMI_PAGEMEM"},
- {AMDGPU_MMHUB_RGMI_PAGEMEM, "MMEA_RGMI_PAGEMEM"},
- {AMDGPU_MMHUB_WDRAM_PAGEMEM, "MMEA_WDRAM_PAGEMEM"},
- {AMDGPU_MMHUB_RDRAM_PAGEMEM, "MMEA_RDRAM_PAGEMEM"},
- {AMDGPU_MMHUB_WIO_CMDMEM, "MMEA_WIO_CMDMEM"},
- {AMDGPU_MMHUB_RIO_CMDMEM, "MMEA_RIO_CMDMEM"},
- {AMDGPU_MMHUB_WGMI_CMDMEM, "MMEA_WGMI_CMDMEM"},
- {AMDGPU_MMHUB_RGMI_CMDMEM, "MMEA_RGMI_CMDMEM"},
- {AMDGPU_MMHUB_WDRAM_CMDMEM, "MMEA_WDRAM_CMDMEM"},
- {AMDGPU_MMHUB_RDRAM_CMDMEM, "MMEA_RDRAM_CMDMEM"},
- {AMDGPU_MMHUB_MAM_DMEM0, "MMEA_MAM_DMEM0"},
- {AMDGPU_MMHUB_MAM_DMEM1, "MMEA_MAM_DMEM1"},
- {AMDGPU_MMHUB_MAM_DMEM2, "MMEA_MAM_DMEM2"},
- {AMDGPU_MMHUB_MAM_DMEM3, "MMEA_MAM_DMEM3"},
- {AMDGPU_MMHUB_WRET_TAGMEM, "MMEA_WRET_TAGMEM"},
- {AMDGPU_MMHUB_RRET_TAGMEM, "MMEA_RRET_TAGMEM"},
- {AMDGPU_MMHUB_WIO_DATAMEM, "MMEA_WIO_DATAMEM"},
- {AMDGPU_MMHUB_WGMI_DATAMEM, "MMEA_WGMI_DATAMEM"},
- {AMDGPU_MMHUB_WDRAM_DATAMEM, "MMEA_WDRAM_DATAMEM"},
-};
-
-static void mmhub_v1_8_inst_query_ras_error_count(struct amdgpu_device *adev,
- uint32_t mmhub_inst,
- void *ras_err_status)
-{
- struct ras_err_data *err_data = (struct ras_err_data *)ras_err_status;
- unsigned long ue_count = 0, ce_count = 0;
-
- /* NOTE: mmhub is converted by aid_mask and the range is 0-3,
- * which can be used as die ID directly */
- struct amdgpu_smuio_mcm_config_info mcm_info = {
- .socket_id = adev->smuio.funcs->get_socket_id(adev),
- .die_id = mmhub_inst,
- };
-
- amdgpu_ras_inst_query_ras_error_count(adev,
- mmhub_v1_8_ce_reg_list,
- ARRAY_SIZE(mmhub_v1_8_ce_reg_list),
- mmhub_v1_8_ras_memory_list,
- ARRAY_SIZE(mmhub_v1_8_ras_memory_list),
- mmhub_inst,
- AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE,
- &ce_count);
- amdgpu_ras_inst_query_ras_error_count(adev,
- mmhub_v1_8_ue_reg_list,
- ARRAY_SIZE(mmhub_v1_8_ue_reg_list),
- mmhub_v1_8_ras_memory_list,
- ARRAY_SIZE(mmhub_v1_8_ras_memory_list),
- mmhub_inst,
- AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
- &ue_count);
-
- amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
- amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
-}
-
-static void mmhub_v1_8_query_ras_error_count(struct amdgpu_device *adev,
- void *ras_err_status)
-{
- uint32_t inst_mask;
- uint32_t i;
-
- if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB)) {
- dev_warn(adev->dev, "MMHUB RAS is not supported\n");
- return;
- }
-
- inst_mask = adev->aid_mask;
- for_each_inst(i, inst_mask)
- mmhub_v1_8_inst_query_ras_error_count(adev, i, ras_err_status);
-}
-
-static void mmhub_v1_8_inst_reset_ras_error_count(struct amdgpu_device *adev,
- uint32_t mmhub_inst)
-{
- amdgpu_ras_inst_reset_ras_error_count(adev,
- mmhub_v1_8_ce_reg_list,
- ARRAY_SIZE(mmhub_v1_8_ce_reg_list),
- mmhub_inst);
- amdgpu_ras_inst_reset_ras_error_count(adev,
- mmhub_v1_8_ue_reg_list,
- ARRAY_SIZE(mmhub_v1_8_ue_reg_list),
- mmhub_inst);
-}
-
-static void mmhub_v1_8_reset_ras_error_count(struct amdgpu_device *adev)
-{
- uint32_t inst_mask;
- uint32_t i;
-
- if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB)) {
- dev_warn(adev->dev, "MMHUB RAS is not supported\n");
- return;
- }
-
- inst_mask = adev->aid_mask;
- for_each_inst(i, inst_mask)
- mmhub_v1_8_inst_reset_ras_error_count(adev, i);
-}
-
-static const struct amdgpu_ras_block_hw_ops mmhub_v1_8_ras_hw_ops = {
- .query_ras_error_count = mmhub_v1_8_query_ras_error_count,
- .reset_ras_error_count = mmhub_v1_8_reset_ras_error_count,
-};
-
-static int mmhub_v1_8_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- struct aca_bank_info info;
- u64 misc0;
- int ret;
-
- ret = aca_bank_info_decode(bank, &info);
- if (ret)
- return ret;
-
- misc0 = bank->regs[ACA_REG_IDX_MISC0];
- switch (type) {
- case ACA_SMU_TYPE_UE:
- bank->aca_err_type = ACA_ERROR_TYPE_UE;
- ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
- 1ULL);
- break;
- case ACA_SMU_TYPE_CE:
- bank->aca_err_type = ACA_ERROR_TYPE_CE;
- ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
- ACA_REG__MISC0__ERRCNT(misc0));
- break;
- default:
- return -EINVAL;
- }
-
- return ret;
-}
-
-/* reference to smu driver if header file */
-static int mmhub_v1_8_err_codes[] = {
- 0, 1, 2, 3, 4, /* CODE_DAGB0 - 4 */
- 5, 6, 7, 8, 9, /* CODE_EA0 - 4 */
- 10, /* CODE_UTCL2_ROUTER */
- 11, /* CODE_VML2 */
- 12, /* CODE_VML2_WALKER */
- 13, /* CODE_MMCANE */
-};
-
-static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- u32 instlo;
-
- instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
- instlo &= GENMASK(31, 1);
-
- if (instlo != mmSMNAID_AID0_MCA_SMU)
- return false;
-
- if (aca_bank_check_error_codes(handle->adev, bank,
- mmhub_v1_8_err_codes,
- ARRAY_SIZE(mmhub_v1_8_err_codes)))
- return false;
-
- return true;
-}
-
-static const struct aca_bank_ops mmhub_v1_8_aca_bank_ops = {
- .aca_bank_parser = mmhub_v1_8_aca_bank_parser,
- .aca_bank_is_valid = mmhub_v1_8_aca_bank_is_valid,
-};
-
-static const struct aca_info mmhub_v1_8_aca_info = {
- .hwip = ACA_HWIP_TYPE_SMU,
- .mask = ACA_ERROR_UE_MASK,
- .bank_ops = &mmhub_v1_8_aca_bank_ops,
-};
-
-static int mmhub_v1_8_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
-{
- int r;
-
- r = amdgpu_ras_block_late_init(adev, ras_block);
- if (r)
- return r;
-
- r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__MMHUB,
- &mmhub_v1_8_aca_info, NULL);
- if (r)
- goto late_fini;
-
- return 0;
-
-late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
-
- return r;
-}
-
struct amdgpu_mmhub_ras mmhub_v1_8_ras = {
.ras_block = {
- .hw_ops = &mmhub_v1_8_ras_hw_ops,
- .ras_late_init = mmhub_v1_8_ras_late_init,
+ .hw_ops = NULL,
},
};
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index 72edf5326b05..77557ee3ca16 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -507,11 +507,6 @@ void nv_set_virt_ops(struct amdgpu_device *adev)
adev->virt.ops = &xgpu_nv_virt_ops;
}
-static bool nv_need_full_reset(struct amdgpu_device *adev)
-{
- return true;
-}
-
static bool nv_need_reset_on_init(struct amdgpu_device *adev)
{
u32 sol_reg;
@@ -595,7 +590,6 @@ static const struct amdgpu_asic_funcs nv_asic_funcs = {
.set_vce_clocks = &nv_set_vce_clocks,
.get_config_memsize = &nv_get_config_memsize,
.init_doorbell_index = &nv_init_doorbell_index,
- .need_full_reset = &nv_need_full_reset,
.need_reset_on_init = &nv_need_reset_on_init,
.get_pcie_replay_count = &amdgpu_nbio_get_pcie_replay_count,
.supports_baco = &amdgpu_dpm_is_baco_supported,
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 3fde9be74690..c2d098cd72ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -1237,65 +1237,6 @@ static int sdma_v3_0_wait_for_idle(struct amdgpu_ip_block *ip_block)
return -ETIMEDOUT;
}
-static bool sdma_v3_0_check_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
- u32 srbm_soft_reset = 0;
- u32 tmp = RREG32(mmSRBM_STATUS2);
-
- if ((tmp & SRBM_STATUS2__SDMA_BUSY_MASK) ||
- (tmp & SRBM_STATUS2__SDMA1_BUSY_MASK)) {
- srbm_soft_reset |= SRBM_SOFT_RESET__SOFT_RESET_SDMA_MASK;
- srbm_soft_reset |= SRBM_SOFT_RESET__SOFT_RESET_SDMA1_MASK;
- }
-
- if (srbm_soft_reset) {
- adev->sdma.srbm_soft_reset = srbm_soft_reset;
- return true;
- } else {
- adev->sdma.srbm_soft_reset = 0;
- return false;
- }
-}
-
-static int sdma_v3_0_pre_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
- u32 srbm_soft_reset = 0;
-
- if (!adev->sdma.srbm_soft_reset)
- return 0;
-
- srbm_soft_reset = adev->sdma.srbm_soft_reset;
-
- if (REG_GET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_SDMA) ||
- REG_GET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_SDMA1)) {
- sdma_v3_0_ctx_switch_enable(adev, false);
- sdma_v3_0_enable(adev, false);
- }
-
- return 0;
-}
-
-static int sdma_v3_0_post_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
- u32 srbm_soft_reset = 0;
-
- if (!adev->sdma.srbm_soft_reset)
- return 0;
-
- srbm_soft_reset = adev->sdma.srbm_soft_reset;
-
- if (REG_GET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_SDMA) ||
- REG_GET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_SDMA1)) {
- sdma_v3_0_gfx_resume(adev);
- sdma_v3_0_rlc_resume(adev);
- }
-
- return 0;
-}
-
static int sdma_v3_0_soft_reset(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
@@ -1552,9 +1493,6 @@ static const struct amd_ip_funcs sdma_v3_0_ip_funcs = {
.resume = sdma_v3_0_resume,
.is_idle = sdma_v3_0_is_idle,
.wait_for_idle = sdma_v3_0_wait_for_idle,
- .check_soft_reset = sdma_v3_0_check_soft_reset,
- .pre_soft_reset = sdma_v3_0_pre_soft_reset,
- .post_soft_reset = sdma_v3_0_post_soft_reset,
.soft_reset = sdma_v3_0_soft_reset,
.set_clockgating_state = sdma_v3_0_set_clockgating_state,
.set_powergating_state = sdma_v3_0_set_powergating_state,
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 8652928861ad..484f1a6b5fbc 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -95,8 +95,6 @@ static const struct amdgpu_hwip_reg_entry sdma_reg_list_4_4_2[] = {
SOC15_REG_ENTRY_STR(GC, 0, regSDMA_VM_CNTL)
};
-#define mmSMNAID_AID0_MCA_SMU 0x03b30400
-
#define WREG32_SDMA(instance, offset, value) \
WREG32(sdma_v4_4_2_get_reg_offset(adev, (instance), (offset)), value)
#define RREG32_SDMA(instance, offset) \
@@ -1359,6 +1357,19 @@ static int sdma_v4_4_2_early_init(struct amdgpu_ip_block *ip_block)
struct amdgpu_device *adev = ip_block->adev;
int r;
+ switch (amdgpu_user_queue) {
+ case -1:
+ case 0:
+ default:
+ adev->sdma.no_user_submission = false;
+ adev->sdma.disable_uq = true;
+ break;
+ case 2:
+ adev->sdma.no_user_submission = true;
+ adev->sdma.disable_uq = true;
+ break;
+ }
+
r = sdma_v4_4_2_init_microcode(adev);
if (r)
return r;
@@ -1478,6 +1489,7 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
/* doorbell size is 2 dwords, get DWORD offset */
ring->doorbell_index = adev->doorbell_index.sdma_engine[i] << 1;
ring->vm_hub = AMDGPU_MMHUB0(aid_id);
+ ring->no_user_submission = adev->sdma.no_user_submission;
sprintf(ring->name, "sdma%d.%d", aid_id,
i % adev->sdma.num_inst_per_aid);
@@ -2404,187 +2416,9 @@ struct amdgpu_xcp_ip_funcs sdma_v4_4_2_xcp_funcs = {
.resume = &sdma_v4_4_2_xcp_resume
};
-static const struct amdgpu_ras_err_status_reg_entry sdma_v4_2_2_ue_reg_list[] = {
- {AMDGPU_RAS_REG_ENTRY(SDMA0, 0, regSDMA_UE_ERR_STATUS_LO, regSDMA_UE_ERR_STATUS_HI),
- 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SDMA"},
-};
-
-static const struct amdgpu_ras_memory_id_entry sdma_v4_4_2_ras_memory_list[] = {
- {AMDGPU_SDMA_MBANK_DATA_BUF0, "SDMA_MBANK_DATA_BUF0"},
- {AMDGPU_SDMA_MBANK_DATA_BUF1, "SDMA_MBANK_DATA_BUF1"},
- {AMDGPU_SDMA_MBANK_DATA_BUF2, "SDMA_MBANK_DATA_BUF2"},
- {AMDGPU_SDMA_MBANK_DATA_BUF3, "SDMA_MBANK_DATA_BUF3"},
- {AMDGPU_SDMA_MBANK_DATA_BUF4, "SDMA_MBANK_DATA_BUF4"},
- {AMDGPU_SDMA_MBANK_DATA_BUF5, "SDMA_MBANK_DATA_BUF5"},
- {AMDGPU_SDMA_MBANK_DATA_BUF6, "SDMA_MBANK_DATA_BUF6"},
- {AMDGPU_SDMA_MBANK_DATA_BUF7, "SDMA_MBANK_DATA_BUF7"},
- {AMDGPU_SDMA_MBANK_DATA_BUF8, "SDMA_MBANK_DATA_BUF8"},
- {AMDGPU_SDMA_MBANK_DATA_BUF9, "SDMA_MBANK_DATA_BUF9"},
- {AMDGPU_SDMA_MBANK_DATA_BUF10, "SDMA_MBANK_DATA_BUF10"},
- {AMDGPU_SDMA_MBANK_DATA_BUF11, "SDMA_MBANK_DATA_BUF11"},
- {AMDGPU_SDMA_MBANK_DATA_BUF12, "SDMA_MBANK_DATA_BUF12"},
- {AMDGPU_SDMA_MBANK_DATA_BUF13, "SDMA_MBANK_DATA_BUF13"},
- {AMDGPU_SDMA_MBANK_DATA_BUF14, "SDMA_MBANK_DATA_BUF14"},
- {AMDGPU_SDMA_MBANK_DATA_BUF15, "SDMA_MBANK_DATA_BUF15"},
- {AMDGPU_SDMA_UCODE_BUF, "SDMA_UCODE_BUF"},
- {AMDGPU_SDMA_RB_CMD_BUF, "SDMA_RB_CMD_BUF"},
- {AMDGPU_SDMA_IB_CMD_BUF, "SDMA_IB_CMD_BUF"},
- {AMDGPU_SDMA_UTCL1_RD_FIFO, "SDMA_UTCL1_RD_FIFO"},
- {AMDGPU_SDMA_UTCL1_RDBST_FIFO, "SDMA_UTCL1_RDBST_FIFO"},
- {AMDGPU_SDMA_UTCL1_WR_FIFO, "SDMA_UTCL1_WR_FIFO"},
- {AMDGPU_SDMA_DATA_LUT_FIFO, "SDMA_DATA_LUT_FIFO"},
- {AMDGPU_SDMA_SPLIT_DAT_BUF, "SDMA_SPLIT_DAT_BUF"},
-};
-
-static void sdma_v4_4_2_inst_query_ras_error_count(struct amdgpu_device *adev,
- uint32_t sdma_inst,
- void *ras_err_status)
-{
- struct ras_err_data *err_data = (struct ras_err_data *)ras_err_status;
- uint32_t sdma_dev_inst = GET_INST(SDMA0, sdma_inst);
- unsigned long ue_count = 0;
- struct amdgpu_smuio_mcm_config_info mcm_info = {
- .socket_id = adev->smuio.funcs->get_socket_id(adev),
- .die_id = adev->sdma.instance[sdma_inst].aid_id,
- };
-
- /* sdma v4_4_2 doesn't support query ce counts */
- amdgpu_ras_inst_query_ras_error_count(adev,
- sdma_v4_2_2_ue_reg_list,
- ARRAY_SIZE(sdma_v4_2_2_ue_reg_list),
- sdma_v4_4_2_ras_memory_list,
- ARRAY_SIZE(sdma_v4_4_2_ras_memory_list),
- sdma_dev_inst,
- AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
- &ue_count);
-
- amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
-}
-
-static void sdma_v4_4_2_query_ras_error_count(struct amdgpu_device *adev,
- void *ras_err_status)
-{
- uint32_t inst_mask;
- int i = 0;
-
- inst_mask = GENMASK(adev->sdma.num_instances - 1, 0);
- if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
- for_each_inst(i, inst_mask)
- sdma_v4_4_2_inst_query_ras_error_count(adev, i, ras_err_status);
- } else {
- dev_warn(adev->dev, "SDMA RAS is not supported\n");
- }
-}
-
-static void sdma_v4_4_2_inst_reset_ras_error_count(struct amdgpu_device *adev,
- uint32_t sdma_inst)
-{
- uint32_t sdma_dev_inst = GET_INST(SDMA0, sdma_inst);
-
- amdgpu_ras_inst_reset_ras_error_count(adev,
- sdma_v4_2_2_ue_reg_list,
- ARRAY_SIZE(sdma_v4_2_2_ue_reg_list),
- sdma_dev_inst);
-}
-
-static void sdma_v4_4_2_reset_ras_error_count(struct amdgpu_device *adev)
-{
- uint32_t inst_mask;
- int i = 0;
-
- inst_mask = GENMASK(adev->sdma.num_instances - 1, 0);
- if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
- for_each_inst(i, inst_mask)
- sdma_v4_4_2_inst_reset_ras_error_count(adev, i);
- } else {
- dev_warn(adev->dev, "SDMA RAS is not supported\n");
- }
-}
-
-static const struct amdgpu_ras_block_hw_ops sdma_v4_4_2_ras_hw_ops = {
- .query_ras_error_count = sdma_v4_4_2_query_ras_error_count,
- .reset_ras_error_count = sdma_v4_4_2_reset_ras_error_count,
-};
-
-static int sdma_v4_4_2_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- struct aca_bank_info info;
- u64 misc0;
- int ret;
-
- ret = aca_bank_info_decode(bank, &info);
- if (ret)
- return ret;
-
- misc0 = bank->regs[ACA_REG_IDX_MISC0];
- switch (type) {
- case ACA_SMU_TYPE_UE:
- bank->aca_err_type = ACA_ERROR_TYPE_UE;
- ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
- 1ULL);
- break;
- case ACA_SMU_TYPE_CE:
- bank->aca_err_type = ACA_ERROR_TYPE_CE;
- ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
- ACA_REG__MISC0__ERRCNT(misc0));
- break;
- default:
- return -EINVAL;
- }
-
- return ret;
-}
-
-/* CODE_SDMA0 - CODE_SDMA4, reference to smu driver if header file */
-static int sdma_v4_4_2_err_codes[] = { 33, 34, 35, 36 };
-
-static bool sdma_v4_4_2_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- u32 instlo;
-
- instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
- instlo &= GENMASK(31, 1);
-
- if (instlo != mmSMNAID_AID0_MCA_SMU)
- return false;
-
- if (aca_bank_check_error_codes(handle->adev, bank,
- sdma_v4_4_2_err_codes,
- ARRAY_SIZE(sdma_v4_4_2_err_codes)))
- return false;
-
- return true;
-}
-
-static const struct aca_bank_ops sdma_v4_4_2_aca_bank_ops = {
- .aca_bank_parser = sdma_v4_4_2_aca_bank_parser,
- .aca_bank_is_valid = sdma_v4_4_2_aca_bank_is_valid,
-};
-
-static const struct aca_info sdma_v4_4_2_aca_info = {
- .hwip = ACA_HWIP_TYPE_SMU,
- .mask = ACA_ERROR_UE_MASK,
- .bank_ops = &sdma_v4_4_2_aca_bank_ops,
-};
-
-static int sdma_v4_4_2_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
-{
- int r;
-
- r = amdgpu_sdma_ras_late_init(adev, ras_block);
- if (r)
- return r;
-
- return amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__SDMA,
- &sdma_v4_4_2_aca_info, NULL);
-}
-
static struct amdgpu_sdma_ras sdma_v4_4_2_ras = {
.ras_block = {
- .hw_ops = &sdma_v4_4_2_ras_hw_ops,
- .ras_late_init = sdma_v4_4_2_ras_late_init,
+ .hw_ops = NULL,
},
};
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
index d7537888e60c..7a3f1a60b014 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
@@ -793,23 +793,6 @@ static int sdma_v6_0_soft_reset(struct amdgpu_ip_block *ip_block)
return sdma_v6_0_start(adev);
}
-static bool sdma_v6_0_check_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
- struct amdgpu_ring *ring;
- int i, r;
- long tmo = msecs_to_jiffies(1000);
-
- for (i = 0; i < adev->sdma.num_instances; i++) {
- ring = &adev->sdma.instance[i].ring;
- r = amdgpu_ring_test_ib(ring, tmo);
- if (r)
- return true;
- }
-
- return false;
-}
-
/**
* sdma_v6_0_start - setup and start the async dma engines
*
@@ -1747,7 +1730,6 @@ const struct amd_ip_funcs sdma_v6_0_ip_funcs = {
.is_idle = sdma_v6_0_is_idle,
.wait_for_idle = sdma_v6_0_wait_for_idle,
.soft_reset = sdma_v6_0_soft_reset,
- .check_soft_reset = sdma_v6_0_check_soft_reset,
.set_clockgating_state = sdma_v6_0_set_clockgating_state,
.set_powergating_state = sdma_v6_0_set_powergating_state,
.get_clockgating_state = sdma_v6_0_get_clockgating_state,
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index 49c57a38151b..84305b6800fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -784,23 +784,6 @@ static int sdma_v7_0_soft_reset(struct amdgpu_ip_block *ip_block)
return sdma_v7_0_start(adev);
}
-static bool sdma_v7_0_check_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
- struct amdgpu_ring *ring;
- int i, r;
- long tmo = msecs_to_jiffies(1000);
-
- for (i = 0; i < adev->sdma.num_instances; i++) {
- ring = &adev->sdma.instance[i].ring;
- r = amdgpu_ring_test_ib(ring, tmo);
- if (r)
- return true;
- }
-
- return false;
-}
-
static int sdma_v7_0_reset_queue(struct amdgpu_ring *ring,
unsigned int vmid,
struct amdgpu_fence *timedout_fence)
@@ -1679,7 +1662,6 @@ const struct amd_ip_funcs sdma_v7_0_ip_funcs = {
.is_idle = sdma_v7_0_is_idle,
.wait_for_idle = sdma_v7_0_wait_for_idle,
.soft_reset = sdma_v7_0_soft_reset,
- .check_soft_reset = sdma_v7_0_check_soft_reset,
.set_clockgating_state = sdma_v7_0_set_clockgating_state,
.set_powergating_state = sdma_v7_0_set_powergating_state,
.get_clockgating_state = sdma_v7_0_get_clockgating_state,
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_1.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_1.c
index b06001f6b536..322e6f4dd121 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_1.c
@@ -775,23 +775,6 @@ static int sdma_v7_1_soft_reset(struct amdgpu_ip_block *ip_block)
return sdma_v7_1_inst_start(adev, inst_mask);
}
-static bool sdma_v7_1_check_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
- struct amdgpu_ring *ring;
- int i, r;
- long tmo = msecs_to_jiffies(1000);
-
- for (i = 0; i < adev->sdma.num_instances; i++) {
- ring = &adev->sdma.instance[i].ring;
- r = amdgpu_ring_test_ib(ring, tmo);
- if (r)
- return true;
- }
-
- return false;
-}
-
static int sdma_v7_1_reset_queue(struct amdgpu_ring *ring,
unsigned int vmid,
struct amdgpu_fence *timedout_fence)
@@ -1644,7 +1627,6 @@ const struct amd_ip_funcs sdma_v7_1_ip_funcs = {
.is_idle = sdma_v7_1_is_idle,
.wait_for_idle = sdma_v7_1_wait_for_idle,
.soft_reset = sdma_v7_1_soft_reset,
- .check_soft_reset = sdma_v7_1_check_soft_reset,
.set_clockgating_state = sdma_v7_1_set_clockgating_state,
.set_powergating_state = sdma_v7_1_set_powergating_state,
.get_clockgating_state = sdma_v7_1_get_clockgating_state,
diff --git a/drivers/gpu/drm/amd/amdgpu/si.c b/drivers/gpu/drm/amd/amdgpu/si.c
index c26cb3e8bff6..b104469c38ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/si.c
+++ b/drivers/gpu/drm/amd/amdgpu/si.c
@@ -1509,12 +1509,6 @@ static void si_invalidate_hdp(struct amdgpu_device *adev,
}
}
-static bool si_need_full_reset(struct amdgpu_device *adev)
-{
- /* change this when we support soft reset */
- return true;
-}
-
static bool si_need_reset_on_init(struct amdgpu_device *adev)
{
return false;
@@ -2019,7 +2013,6 @@ static const struct amdgpu_asic_funcs si_asic_funcs =
.get_config_memsize = &si_get_config_memsize,
.flush_hdp = &si_flush_hdp,
.invalidate_hdp = &si_invalidate_hdp,
- .need_full_reset = &si_need_full_reset,
.get_pcie_usage = &si_get_pcie_usage,
.need_reset_on_init = &si_need_reset_on_init,
.get_pcie_replay_count = &si_get_pcie_replay_count,
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 87b398dd0769..ed3fd58b78d0 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -721,12 +721,6 @@ void soc15_set_virt_ops(struct amdgpu_device *adev)
soc15_reg_base_init(adev);
}
-static bool soc15_need_full_reset(struct amdgpu_device *adev)
-{
- /* change this when we implement soft reset */
- return true;
-}
-
static void soc15_get_pcie_usage(struct amdgpu_device *adev, uint64_t *count0,
uint64_t *count1)
{
@@ -878,7 +872,6 @@ static const struct amdgpu_asic_funcs soc15_asic_funcs =
.set_uvd_clocks = &soc15_set_uvd_clocks,
.set_vce_clocks = &soc15_set_vce_clocks,
.get_config_memsize = &soc15_get_config_memsize,
- .need_full_reset = &soc15_need_full_reset,
.init_doorbell_index = &vega10_doorbell_index_init,
.get_pcie_usage = &soc15_get_pcie_usage,
.need_reset_on_init = &soc15_need_reset_on_init,
@@ -899,7 +892,6 @@ static const struct amdgpu_asic_funcs vega20_asic_funcs =
.set_uvd_clocks = &soc15_set_uvd_clocks,
.set_vce_clocks = &soc15_set_vce_clocks,
.get_config_memsize = &soc15_get_config_memsize,
- .need_full_reset = &soc15_need_full_reset,
.init_doorbell_index = &vega20_doorbell_index_init,
.get_pcie_usage = &vega20_get_pcie_usage,
.need_reset_on_init = &soc15_need_reset_on_init,
@@ -920,7 +912,6 @@ static const struct amdgpu_asic_funcs aqua_vanjaram_asic_funcs =
.set_uvd_clocks = &soc15_set_uvd_clocks,
.set_vce_clocks = &soc15_set_vce_clocks,
.get_config_memsize = &soc15_get_config_memsize,
- .need_full_reset = &soc15_need_full_reset,
.init_doorbell_index = &aqua_vanjaram_doorbell_index_init,
.need_reset_on_init = &soc15_need_reset_on_init,
.get_pcie_replay_count = &amdgpu_nbio_get_pcie_replay_count,
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15_common.h b/drivers/gpu/drm/amd/amdgpu/soc15_common.h
index a7b5a95ebebb..47e0329b6f3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15_common.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15_common.h
@@ -38,30 +38,30 @@
(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + (reg)+(offset))
#define __WREG32_SOC15_RLC__(reg, value, flag, hwip, inst) \
- ((amdgpu_sriov_vf(adev) && adev->gfx.rlc.funcs && adev->gfx.rlc.rlcg_reg_access_supported) ? \
- amdgpu_sriov_wreg(adev, reg, value, flag, hwip, inst) : \
- WREG32(reg, value))
+ adev->gfx.rlc.reg_funcs->wreg32(adev, reg, value, flag, hwip, inst)
#define __RREG32_SOC15_RLC__(reg, flag, hwip, inst) \
- ((amdgpu_sriov_vf(adev) && adev->gfx.rlc.funcs && adev->gfx.rlc.rlcg_reg_access_supported) ? \
- amdgpu_sriov_rreg(adev, reg, flag, hwip, inst) : \
- RREG32(reg))
-
-#define WREG32_FIELD15(ip, idx, reg, field, val) \
- __WREG32_SOC15_RLC__(adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg, \
- (__RREG32_SOC15_RLC__( \
- adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg, \
- 0, ip##_HWIP, idx) & \
- ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field), \
- 0, ip##_HWIP, idx)
-
-#define WREG32_FIELD15_PREREG(ip, idx, reg_name, field, val) \
- __WREG32_SOC15_RLC__(adev->reg_offset[ip##_HWIP][idx][reg##reg_name##_BASE_IDX] + reg##reg_name, \
- (__RREG32_SOC15_RLC__( \
- adev->reg_offset[ip##_HWIP][idx][reg##reg_name##_BASE_IDX] + reg##reg_name, \
- 0, ip##_HWIP, idx) & \
- ~REG_FIELD_MASK(reg_name, field)) | (val) << REG_FIELD_SHIFT(reg_name, field), \
- 0, ip##_HWIP, idx)
+ adev->gfx.rlc.reg_funcs->rreg32(adev, reg, flag, hwip, inst)
+
+#define WREG32_FIELD15(ip, idx, reg_name, field, val) \
+do { \
+ u32 reg__ = adev->reg_offset[ip##_HWIP][idx][mm##reg_name##_BASE_IDX] + mm##reg_name; \
+ u32 val__ = __RREG32_SOC15_RLC__(reg__, 0, ip##_HWIP, idx); \
+\
+ val__ &= ~REG_FIELD_MASK(reg_name, field); \
+ val__ |= (val) << REG_FIELD_SHIFT(reg_name, field); \
+ __WREG32_SOC15_RLC__(reg__, val__, 0, ip##_HWIP, idx); \
+} while (0)
+
+#define WREG32_FIELD15_PREREG(ip, idx, reg_name, field, val) \
+do { \
+ u32 reg__ = adev->reg_offset[ip##_HWIP][idx][reg##reg_name##_BASE_IDX] + reg##reg_name; \
+ u32 val__ = __RREG32_SOC15_RLC__(reg__, 0, ip##_HWIP, idx); \
+\
+ val__ &= ~REG_FIELD_MASK(reg_name, field); \
+ val__ |= (val) << REG_FIELD_SHIFT(reg_name, field); \
+ __WREG32_SOC15_RLC__(reg__, val__, 0, ip##_HWIP, idx); \
+} while (0)
#define RREG32_SOC15(ip, inst, reg) \
__RREG32_SOC15_RLC__(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg, \
@@ -181,12 +181,15 @@
WREG32_RLC_EX(prefix, target_reg, value, inst); \
} while (0)
-#define WREG32_FIELD15_RLC(ip, idx, reg, field, val) \
- __WREG32_SOC15_RLC__((adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg), \
- (__RREG32_SOC15_RLC__(adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg, \
- AMDGPU_REGS_RLC, ip##_HWIP, idx) & \
- ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field), \
- AMDGPU_REGS_RLC, ip##_HWIP, idx)
+#define WREG32_FIELD15_RLC(ip, idx, reg_name, field, val) \
+do { \
+ u32 reg__ = adev->reg_offset[ip##_HWIP][idx][mm##reg_name##_BASE_IDX] + mm##reg_name; \
+ u32 val__ = __RREG32_SOC15_RLC__(reg__, AMDGPU_REGS_RLC, ip##_HWIP, idx); \
+\
+ val__ &= ~REG_FIELD_MASK(reg_name, field); \
+ val__ |= (val) << REG_FIELD_SHIFT(reg_name, field); \
+ __WREG32_SOC15_RLC__(reg__, val__, AMDGPU_REGS_RLC, ip##_HWIP, idx); \
+} while (0)
#define WREG32_SOC15_OFFSET_RLC(ip, inst, reg, offset, value) \
__WREG32_SOC15_RLC__((adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg) + offset, value, AMDGPU_REGS_RLC, ip##_HWIP, inst)
@@ -207,10 +210,4 @@
amdgpu_reg_get_smn_base64(adev, ip##_HWIP, inst), \
value)
-#define RREG64_MCA(smn_base, mca_base, idx) \
- RREG64_PCIE_EXT(smn_base + mca_base + (idx * 8))
-
-#define WREG64_MCA(smn_base, mca_base, idx, val) \
- WREG64_PCIE_EXT(smn_base + mca_base + (idx * 8), val)
-
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c
index 1677e88a4e36..09f28dbd60ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -461,17 +461,6 @@ const struct amdgpu_ip_block_version soc21_common_ip_block = {
.funcs = &soc21_common_ip_funcs,
};
-static bool soc21_need_full_reset(struct amdgpu_device *adev)
-{
- switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
- case IP_VERSION(11, 0, 0):
- case IP_VERSION(11, 0, 2):
- case IP_VERSION(11, 0, 3):
- default:
- return true;
- }
-}
-
static bool soc21_need_reset_on_init(struct amdgpu_device *adev)
{
u32 sol_reg;
@@ -550,7 +539,6 @@ static const struct amdgpu_asic_funcs soc21_asic_funcs = {
.set_vce_clocks = &soc21_set_vce_clocks,
.get_config_memsize = &soc21_get_config_memsize,
.init_doorbell_index = &soc21_init_doorbell_index,
- .need_full_reset = &soc21_need_full_reset,
.need_reset_on_init = &soc21_need_reset_on_init,
.get_pcie_replay_count = &amdgpu_nbio_get_pcie_replay_count,
.supports_baco = &amdgpu_dpm_is_baco_supported,
diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c b/drivers/gpu/drm/amd/amdgpu/soc24.c
index 9dce30d2bb8d..e5e3a460e486 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc24.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc24.c
@@ -238,16 +238,6 @@ const struct amdgpu_ip_block_version soc24_common_ip_block = {
.funcs = &soc24_common_ip_funcs,
};
-static bool soc24_need_full_reset(struct amdgpu_device *adev)
-{
- switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
- case IP_VERSION(12, 0, 0):
- case IP_VERSION(12, 0, 1):
- default:
- return true;
- }
-}
-
static bool soc24_need_reset_on_init(struct amdgpu_device *adev)
{
u32 sol_reg;
@@ -330,7 +320,6 @@ static const struct amdgpu_asic_funcs soc24_asic_funcs = {
.get_xclk = &soc24_get_xclk,
.get_config_memsize = &soc24_get_config_memsize,
.init_doorbell_index = &soc24_init_doorbell_index,
- .need_full_reset = &soc24_need_full_reset,
.need_reset_on_init = &soc24_need_reset_on_init,
.get_pcie_replay_count = &soc24_get_pcie_replay_count,
.supports_baco = &amdgpu_dpm_is_baco_supported,
diff --git a/drivers/gpu/drm/amd/amdgpu/soc_v1_0.c b/drivers/gpu/drm/amd/amdgpu/soc_v1_0.c
index 5f05c8e68297..a9039fb1a77b 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc_v1_0.c
@@ -223,15 +223,6 @@ static int soc_v1_0_read_register(struct amdgpu_device *adev,
return -EINVAL;
}
-static bool soc_v1_0_need_full_reset(struct amdgpu_device *adev)
-{
- switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
- case IP_VERSION(12, 1, 0):
- default:
- return true;
- }
-}
-
static bool soc_v1_0_need_reset_on_init(struct amdgpu_device *adev)
{
@@ -271,7 +262,6 @@ static const struct amdgpu_asic_funcs soc_v1_0_asic_funcs = {
.read_register = &soc_v1_0_read_register,
.get_config_memsize = &soc_v1_0_get_config_memsize,
.get_xclk = &soc_v1_0_get_xclk,
- .need_full_reset = &soc_v1_0_need_full_reset,
.init_doorbell_index = &soc_v1_0_doorbell_index_init,
.need_reset_on_init = &soc_v1_0_need_reset_on_init,
.encode_ext_smn_addressing = &soc_v1_0_encode_ext_smn_addressing,
@@ -600,8 +590,10 @@ static int soc_v1_0_get_xcp_res_info(struct amdgpu_xcp_mgr *xcp_mgr,
xcp_cfg->num_res = ARRAY_SIZE(max_res);
for (i = 0; i < xcp_cfg->num_res; i++) {
- res_lt_xcp = max_res[i] < num_xcp;
xcp_cfg->xcp_res[i].id = i;
+ if (!max_res[i])
+ continue;
+ res_lt_xcp = max_res[i] < num_xcp;
xcp_cfg->xcp_res[i].num_inst =
res_lt_xcp ? 1 : max_res[i] / num_xcp;
xcp_cfg->xcp_res[i].num_inst =
diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
index ee8038df17e3..a3e883f6f099 100644
--- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
@@ -390,43 +390,6 @@ static int tonga_ih_wait_for_idle(struct amdgpu_ip_block *ip_block)
return -ETIMEDOUT;
}
-static bool tonga_ih_check_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
- u32 srbm_soft_reset = 0;
- u32 tmp = RREG32(mmSRBM_STATUS);
-
- if (tmp & SRBM_STATUS__IH_BUSY_MASK)
- srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET,
- SOFT_RESET_IH, 1);
-
- if (srbm_soft_reset) {
- adev->irq.srbm_soft_reset = srbm_soft_reset;
- return true;
- } else {
- adev->irq.srbm_soft_reset = 0;
- return false;
- }
-}
-
-static int tonga_ih_pre_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- if (!ip_block->adev->irq.srbm_soft_reset)
- return 0;
-
- return tonga_ih_hw_fini(ip_block);
-}
-
-static int tonga_ih_post_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
-
- if (!adev->irq.srbm_soft_reset)
- return 0;
-
- return tonga_ih_hw_init(ip_block);
-}
-
static int tonga_ih_soft_reset(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
@@ -481,10 +444,7 @@ static const struct amd_ip_funcs tonga_ih_ip_funcs = {
.resume = tonga_ih_resume,
.is_idle = tonga_ih_is_idle,
.wait_for_idle = tonga_ih_wait_for_idle,
- .check_soft_reset = tonga_ih_check_soft_reset,
- .pre_soft_reset = tonga_ih_pre_soft_reset,
.soft_reset = tonga_ih_soft_reset,
- .post_soft_reset = tonga_ih_post_soft_reset,
.set_clockgating_state = tonga_ih_set_clockgating_state,
.set_powergating_state = tonga_ih_set_powergating_state,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 14092150336a..67bdf7303e6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -28,48 +28,6 @@
#include "umc/umc_12_0_0_sh_mask.h"
#include "mp/mp_13_0_6_sh_mask.h"
-#define MAX_ECC_NUM_PER_RETIREMENT 32
-#define DELAYED_TIME_FOR_GPU_RESET 1000 //ms
-
-static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
- uint32_t node_inst,
- uint32_t umc_inst,
- uint32_t ch_inst)
-{
- uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst;
- uint64_t cross_node_offset = (node_inst == 0) ? 0 : UMC_V12_0_CROSS_NODE_OFFSET;
-
- umc_inst = index / 4;
- ch_inst = index % 4;
-
- return adev->umc.channel_offs * ch_inst + UMC_V12_0_INST_DIST * umc_inst +
- UMC_V12_0_NODE_DIST * node_inst + cross_node_offset;
-}
-
-static int umc_v12_0_reset_error_count_per_channel(struct amdgpu_device *adev,
- uint32_t node_inst, uint32_t umc_inst,
- uint32_t ch_inst, void *data)
-{
- uint64_t odecc_err_cnt_addr;
- uint64_t umc_reg_offset =
- get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
-
- odecc_err_cnt_addr =
- SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt);
-
- /* clear error count */
- WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4,
- UMC_V12_0_CE_CNT_INIT);
-
- return 0;
-}
-
-static void umc_v12_0_reset_error_count(struct amdgpu_device *adev)
-{
- amdgpu_umc_loop_channels(adev,
- umc_v12_0_reset_error_count_per_channel, NULL);
-}
-
bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
{
dev_dbg(adev->dev,
@@ -115,65 +73,6 @@ bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_
!(umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)))));
}
-static void umc_v12_0_query_error_count_per_type(struct amdgpu_device *adev,
- uint64_t umc_reg_offset,
- unsigned long *error_count,
- check_error_type_func error_type_func)
-{
- uint64_t mc_umc_status;
- uint64_t mc_umc_status_addr;
-
- mc_umc_status_addr =
- SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
-
- /* Check MCUMC_STATUS */
- mc_umc_status =
- RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
-
- if (error_type_func(adev, mc_umc_status))
- *error_count += 1;
-}
-
-static int umc_v12_0_query_error_count(struct amdgpu_device *adev,
- uint32_t node_inst, uint32_t umc_inst,
- uint32_t ch_inst, void *data)
-{
- struct ras_err_data *err_data = (struct ras_err_data *)data;
- unsigned long ue_count = 0, ce_count = 0, de_count = 0;
-
- /* NOTE: node_inst is converted by adev->umc.active_mask and the range is [0-3],
- * which can be used as die ID directly */
- struct amdgpu_smuio_mcm_config_info mcm_info = {
- .socket_id = adev->smuio.funcs->get_socket_id(adev),
- .die_id = node_inst,
- };
-
- uint64_t umc_reg_offset =
- get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
-
- umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
- &ce_count, umc_v12_0_is_correctable_error);
- umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
- &ue_count, umc_v12_0_is_uncorrectable_error);
- umc_v12_0_query_error_count_per_type(adev, umc_reg_offset,
- &de_count, umc_v12_0_is_deferred_error);
-
- amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
- amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
- amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, de_count);
-
- return 0;
-}
-
-static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev,
- void *ras_error_status)
-{
- amdgpu_umc_loop_channels(adev,
- umc_v12_0_query_error_count, ras_error_status);
-
- umc_v12_0_reset_error_count(adev);
-}
-
static void umc_v12_0_get_retire_flip_bits(struct amdgpu_device *adev)
{
enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
@@ -279,190 +178,6 @@ static void umc_v12_0_get_retire_flip_bits(struct amdgpu_device *adev)
adev->umc.retire_unit = 0x1 << flip_bits->bit_num;
}
-static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
- struct ras_err_data *err_data,
- struct ta_ras_query_address_input *addr_in,
- struct ta_ras_query_address_output *addr_out,
- bool dump_addr)
-{
- uint32_t row = 0, row_lower = 0, row_high = 0;
- uint32_t col = 0, col_lower = 0, bank = 0;
- uint32_t channel_index = 0, umc_inst = 0;
- uint32_t i, bit_num, retire_unit, *flip_bits;
- uint64_t soc_pa, column, err_addr;
- struct ta_ras_query_address_output addr_out_tmp;
- struct ta_ras_query_address_output *paddr_out;
- int ret = 0;
-
- if (!addr_out)
- paddr_out = &addr_out_tmp;
- else
- paddr_out = addr_out;
-
- err_addr = bank = 0;
- if (addr_in) {
- err_addr = addr_in->ma.err_addr;
- addr_in->addr_type = TA_RAS_MCA_TO_PA;
- ret = psp_ras_query_address(&adev->psp, addr_in, paddr_out);
- if (ret) {
- dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx",
- err_addr);
-
- goto out;
- }
-
- bank = paddr_out->pa.bank;
- /* no need to care about umc inst if addr_in is NULL */
- umc_inst = addr_in->ma.umc_inst;
- }
-
- flip_bits = adev->umc.flip_bits.flip_bits_in_pa;
- bit_num = adev->umc.flip_bits.bit_num;
- retire_unit = adev->umc.retire_unit;
-
- soc_pa = paddr_out->pa.pa;
- channel_index = paddr_out->pa.channel_idx;
- /* clear loop bits in soc physical address */
- for (i = 0; i < bit_num; i++)
- soc_pa &= ~BIT_ULL(flip_bits[i]);
-
- paddr_out->pa.pa = soc_pa;
- /* get column bit 0 and 1 in mca address */
- col_lower = (err_addr >> 1) & 0x3ULL;
- /* extra row bit will be handled later */
- row_lower = (err_addr >> UMC_V12_0_MA_R0_BIT) & 0x1fffULL;
- row_lower &= ~BIT_ULL(adev->umc.flip_bits.flip_row_bit);
-
- if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 5, 0)) {
- row_high = (soc_pa >> adev->umc.flip_bits.r13_in_pa) & 0x3ULL;
- /* it's 2.25GB in each channel, from MCA address to PA
- * [R14 R13] is converted if the two bits value are 0x3,
- * get them from PA instead of MCA address.
- */
- row_lower |= (row_high << 13);
- }
-
- if (!err_data && !dump_addr)
- goto out;
-
- /* loop for all possibilities of retired bits */
- for (column = 0; column < retire_unit; column++) {
- soc_pa = paddr_out->pa.pa;
- for (i = 0; i < bit_num; i++)
- soc_pa |= (((column >> i) & 0x1ULL) << flip_bits[i]);
-
- col = ((column & 0x7) << 2) | col_lower;
- /* handle extra row bit */
- if (bit_num == RETIRE_FLIP_BITS_NUM)
- row = ((column >> 3) << adev->umc.flip_bits.flip_row_bit) |
- row_lower;
-
- if (dump_addr)
- dev_info(adev->dev,
- "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
- soc_pa, row, col, bank, channel_index);
-
- if (err_data)
- amdgpu_umc_fill_error_record(err_data, err_addr,
- soc_pa, channel_index, umc_inst);
- }
-
-out:
- return ret;
-}
-
-static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
- uint32_t node_inst, uint32_t umc_inst,
- uint32_t ch_inst, void *data)
-{
- struct ras_err_data *err_data = (struct ras_err_data *)data;
- struct ta_ras_query_address_input addr_in;
- uint64_t mc_umc_status_addr;
- uint64_t mc_umc_status, err_addr;
- uint64_t mc_umc_addrt0;
- uint64_t umc_reg_offset =
- get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
-
- mc_umc_status_addr =
- SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
-
- mc_umc_status = RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
-
- if (mc_umc_status == 0)
- return 0;
-
- if (!err_data->err_addr) {
- /* clear umc status */
- WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
-
- return 0;
- }
-
- /* calculate error address if ue error is detected */
- if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
- umc_v12_0_is_deferred_error(adev, mc_umc_status)) {
- mc_umc_addrt0 =
- SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
-
- err_addr = RREG64_PCIE_EXT((mc_umc_addrt0 + umc_reg_offset) * 4);
-
- err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
-
- if (!adev->aid_mask &&
- adev->smuio.funcs &&
- adev->smuio.funcs->get_socket_id)
- addr_in.ma.socket_id = adev->smuio.funcs->get_socket_id(adev);
- else
- addr_in.ma.socket_id = 0;
-
- addr_in.ma.err_addr = err_addr;
- addr_in.ma.ch_inst = ch_inst;
- addr_in.ma.umc_inst = umc_inst;
- addr_in.ma.node_inst = node_inst;
-
- umc_v12_0_convert_error_address(adev, err_data, &addr_in, NULL, true);
- }
-
- /* clear umc status */
- WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
-
- return 0;
-}
-
-static void umc_v12_0_query_ras_error_address(struct amdgpu_device *adev,
- void *ras_error_status)
-{
- amdgpu_umc_loop_channels(adev,
- umc_v12_0_query_error_address, ras_error_status);
-}
-
-static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
- uint32_t node_inst, uint32_t umc_inst,
- uint32_t ch_inst, void *data)
-{
- uint32_t odecc_cnt_sel;
- uint64_t odecc_cnt_sel_addr, odecc_err_cnt_addr;
- uint64_t umc_reg_offset =
- get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
-
- odecc_cnt_sel_addr =
- SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccCntSel);
- odecc_err_cnt_addr =
- SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt);
-
- odecc_cnt_sel = RREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4);
-
- /* set ce error interrupt type to APIC based interrupt */
- odecc_cnt_sel = REG_SET_FIELD(odecc_cnt_sel, UMCCH0_OdEccCntSel,
- OdEccErrInt, 0x1);
- WREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4, odecc_cnt_sel);
-
- /* set error count to initial value */
- WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V12_0_CE_CNT_INIT);
-
- return 0;
-}
-
static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
enum amdgpu_mca_error_type type, void *ras_error_status)
{
@@ -482,309 +197,11 @@ static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
return false;
}
-static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev)
-{
- amdgpu_umc_loop_channels(adev,
- umc_v12_0_err_cnt_init_per_channel, NULL);
-}
-
-static bool umc_v12_0_query_ras_poison_mode(struct amdgpu_device *adev)
-{
- /*
- * Force return true, because regUMCCH0_EccCtrl
- * is not accessible from host side
- */
- return true;
-}
-
-const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = {
- .query_ras_error_count = umc_v12_0_query_ras_error_count,
- .query_ras_error_address = umc_v12_0_query_ras_error_address,
-};
-
-static int umc_v12_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- struct amdgpu_device *adev = handle->adev;
- struct aca_bank_info info;
- enum aca_error_type err_type;
- u64 status, count;
- u32 ext_error_code;
- int ret;
-
- status = bank->regs[ACA_REG_IDX_STATUS];
- if (umc_v12_0_is_deferred_error(adev, status))
- err_type = ACA_ERROR_TYPE_DEFERRED;
- else if (umc_v12_0_is_uncorrectable_error(adev, status))
- err_type = ACA_ERROR_TYPE_UE;
- else if (umc_v12_0_is_correctable_error(adev, status))
- err_type = ACA_ERROR_TYPE_CE;
- else
- return 0;
- bank->aca_err_type = err_type;
-
- ret = aca_bank_info_decode(bank, &info);
- if (ret)
- return ret;
-
- amdgpu_umc_update_ecc_status(adev,
- bank->regs[ACA_REG_IDX_STATUS],
- bank->regs[ACA_REG_IDX_IPID],
- bank->regs[ACA_REG_IDX_ADDR]);
-
- ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);
- if (umc_v12_0_is_deferred_error(adev, status))
- count = ext_error_code == 0 ?
- adev->umc.err_addr_cnt / adev->umc.retire_unit : 1ULL;
- else
- count = ext_error_code == 0 ?
- ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]) : 1ULL;
-
- return aca_error_cache_log_bank_error(handle, &info, err_type, count);
-}
-
-static const struct aca_bank_ops umc_v12_0_aca_bank_ops = {
- .aca_bank_parser = umc_v12_0_aca_bank_parser,
-};
-
-const struct aca_info umc_v12_0_aca_info = {
- .hwip = ACA_HWIP_TYPE_UMC,
- .mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK | ACA_ERROR_DEFERRED_MASK,
- .bank_ops = &umc_v12_0_aca_bank_ops,
-};
-
-static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
-{
- int ret;
-
- ret = amdgpu_umc_ras_late_init(adev, ras_block);
- if (ret)
- return ret;
-
- ret = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__UMC,
- &umc_v12_0_aca_info, NULL);
- if (ret)
- return ret;
-
- return 0;
-}
-
-static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
- uint64_t status, uint64_t ipid, uint64_t addr)
-{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- uint16_t hwid, mcatype;
- uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
- uint64_t err_addr, pa_addr = 0;
- struct ras_ecc_err *ecc_err;
- struct ta_ras_query_address_output addr_out;
- uint32_t shift_bit = adev->umc.flip_bits.flip_bits_in_pa[2];
- int count, ret, i;
-
- hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
- mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
-
- /* The IP block decode of consumption is SMU */
- if (hwid != MCA_UMC_HWID_V12_0 || mcatype != MCA_UMC_MCATYPE_V12_0) {
- con->umc_ecc_log.consumption_q_count++;
- return 0;
- }
-
- if (!status)
- return 0;
-
- if (!umc_v12_0_is_deferred_error(adev, status))
- return 0;
-
- err_addr = REG_GET_FIELD(addr,
- MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
-
- dev_dbg(adev->dev,
- "UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, err_addr:0x%llx\n",
- ipid,
- MCA_IPID_2_SOCKET_ID(ipid),
- MCA_IPID_2_DIE_ID(ipid),
- MCA_IPID_2_UMC_INST(ipid),
- MCA_IPID_2_UMC_CH(ipid),
- err_addr);
-
- ret = amdgpu_umc_mca_to_addr(adev,
- err_addr, MCA_IPID_2_UMC_CH(ipid),
- MCA_IPID_2_UMC_INST(ipid), MCA_IPID_2_DIE_ID(ipid),
- MCA_IPID_2_SOCKET_ID(ipid), &addr_out, true);
- if (ret)
- return ret;
-
- ecc_err = kzalloc_obj(*ecc_err);
- if (!ecc_err)
- return -ENOMEM;
-
- pa_addr = addr_out.pa.pa;
- ecc_err->status = status;
- ecc_err->ipid = ipid;
- ecc_err->addr = addr;
- ecc_err->pa_pfn = pa_addr >> AMDGPU_GPU_PAGE_SHIFT;
- ecc_err->channel_idx = addr_out.pa.channel_idx;
-
- /* If converted pa_pfn is 0, use pa C4 pfn. */
- if (!ecc_err->pa_pfn)
- ecc_err->pa_pfn = BIT_ULL(shift_bit) >> AMDGPU_GPU_PAGE_SHIFT;
-
- ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
- if (ret) {
- if (ret == -EEXIST)
- con->umc_ecc_log.de_queried_count++;
- else
- dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
-
- kfree(ecc_err);
- return ret;
- }
-
- con->umc_ecc_log.de_queried_count++;
-
- memset(page_pfn, 0, sizeof(page_pfn));
- count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
- pa_addr,
- page_pfn, ARRAY_SIZE(page_pfn));
- if (count <= 0) {
- dev_warn(adev->dev, "Fail to convert error address! count:%d\n", count);
- return 0;
- }
-
- /* Reserve memory */
- for (i = 0; i < count; i++)
- amdgpu_ras_reserve_page(adev, page_pfn[i]);
-
- /* The problem case is as follows:
- * 1. GPU A triggers a gpu ras reset, and GPU A drives
- * GPU B to also perform a gpu ras reset.
- * 2. After gpu B ras reset started, gpu B queried a DE
- * data. Since the DE data was queried in the ras reset
- * thread instead of the page retirement thread, bad
- * page retirement work would not be triggered. Then
- * even if all gpu resets are completed, the bad pages
- * will be cached in RAM until GPU B's bad page retirement
- * work is triggered again and then saved to eeprom.
- * Trigger delayed work to save the bad pages to eeprom in time
- * after gpu ras reset is completed.
- */
- if (amdgpu_ras_in_recovery(adev))
- schedule_delayed_work(&con->page_retirement_dwork,
- msecs_to_jiffies(DELAYED_TIME_FOR_GPU_RESET));
-
- return 0;
-}
-
-static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
- struct ras_ecc_err *ecc_err, void *ras_error_status)
-{
- struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
- uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
- int ret, i, count;
-
- if (!err_data || !ecc_err)
- return -EINVAL;
-
- memset(page_pfn, 0, sizeof(page_pfn));
- count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
- ecc_err->pa_pfn << AMDGPU_GPU_PAGE_SHIFT,
- page_pfn, ARRAY_SIZE(page_pfn));
-
- for (i = 0; i < count; i++) {
- ret = amdgpu_umc_fill_error_record(err_data,
- ecc_err->addr,
- page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
- ecc_err->channel_idx,
- MCA_IPID_2_UMC_INST(ecc_err->ipid));
- if (ret)
- break;
- }
-
- err_data->de_count++;
-
- return ret;
-}
-
-static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
- void *ras_error_status)
-{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
- struct radix_tree_root *ecc_tree;
- int new_detected, ret, i;
-
- ecc_tree = &con->umc_ecc_log.de_page_tree;
-
- mutex_lock(&con->umc_ecc_log.lock);
- new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
- 0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);
- for (i = 0; i < new_detected; i++) {
- if (!entries[i])
- continue;
-
- ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status);
- if (ret) {
- dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret);
- break;
- }
- radix_tree_tag_clear(ecc_tree,
- entries[i]->pa_pfn, UMC_ECC_NEW_DETECTED_TAG);
- }
- mutex_unlock(&con->umc_ecc_log.lock);
-}
-
-static uint32_t umc_v12_0_get_die_id(struct amdgpu_device *adev,
- uint64_t mca_addr, uint64_t retired_page)
-{
- uint32_t die = 0;
-
- /* we only calculate die id for nps1 mode right now */
- die += ((((retired_page >> 12) & 0x1ULL)^
- ((retired_page >> 20) & 0x1ULL) ^
- ((retired_page >> 27) & 0x1ULL) ^
- ((retired_page >> 34) & 0x1ULL) ^
- ((retired_page >> 41) & 0x1ULL)) << 0);
-
- /* the original PA_C4 and PA_R13 may be cleared in retired_page, so
- * get them from mca_addr.
- */
- die += ((((retired_page >> 13) & 0x1ULL) ^
- ((mca_addr >> 5) & 0x1ULL) ^
- ((retired_page >> 28) & 0x1ULL) ^
- ((mca_addr >> 23) & 0x1ULL) ^
- ((retired_page >> 42) & 0x1ULL)) << 1);
- die &= 3;
-
- return die;
-}
-
-static void umc_v12_0_mca_ipid_parse(struct amdgpu_device *adev, uint64_t ipid,
- uint32_t *did, uint32_t *ch, uint32_t *umc_inst, uint32_t *sid)
-{
- if (did)
- *did = MCA_IPID_2_DIE_ID(ipid);
- if (ch)
- *ch = MCA_IPID_2_UMC_CH(ipid);
- if (umc_inst)
- *umc_inst = MCA_IPID_2_UMC_INST(ipid);
- if (sid)
- *sid = MCA_IPID_2_SOCKET_ID(ipid);
-}
-
struct amdgpu_umc_ras umc_v12_0_ras = {
.ras_block = {
- .hw_ops = &umc_v12_0_ras_hw_ops,
- .ras_late_init = umc_v12_0_ras_late_init,
+ .hw_ops = NULL,
},
- .err_cnt_init = umc_v12_0_err_cnt_init,
- .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
- .ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr,
.check_ecc_err_status = umc_v12_0_check_ecc_err_status,
- .update_ecc_status = umc_v12_0_update_ecc_status,
- .convert_ras_err_addr = umc_v12_0_convert_error_address,
- .get_die_id_from_pa = umc_v12_0_get_die_id,
.get_retire_flip_bits = umc_v12_0_get_retire_flip_bits,
- .mca_ipid_parse = umc_v12_0_mca_ipid_parse,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index 63b7e7254526..9d9e84d8d3bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -26,31 +26,6 @@
#include "soc15_common.h"
#include "amdgpu.h"
-#define UMC_V12_0_NODE_DIST 0x40000000
-#define UMC_V12_0_INST_DIST 0x40000
-
-/* UMC register per channel offset */
-#define UMC_V12_0_PER_CHANNEL_OFFSET 0x400
-
-/* UMC cross node offset */
-#define UMC_V12_0_CROSS_NODE_OFFSET 0x100000000
-
-/* OdEccErrCnt max value */
-#define UMC_V12_0_CE_CNT_MAX 0xffff
-/* umc ce interrupt threshold */
-#define UMC_V12_0_CE_INT_THRESHOLD 0xffff
-/* umc ce count initial value */
-#define UMC_V12_0_CE_CNT_INIT (UMC_V12_0_CE_CNT_MAX - UMC_V12_0_CE_INT_THRESHOLD)
-
-/* number of umc channel instance with memory map register access */
-#define UMC_V12_0_CHANNEL_INSTANCE_NUM 8
-/* number of umc instance with memory map register access */
-#define UMC_V12_0_UMC_INSTANCE_NUM 4
-
-/* Total channel instances for all available umc nodes */
-#define UMC_V12_0_TOTAL_CHANNEL_NUM(adev) \
- (UMC_V12_0_CHANNEL_INSTANCE_NUM * (adev)->gmc.num_umc)
-
/* one piece of normalized address is mapped to 8 pieces of physical address */
#define UMC_V12_0_NA_MAP_PA_NUM 8
/* R13 bit shift should be considered, double the number */
@@ -75,9 +50,6 @@
/* row bits in MCA address */
#define UMC_V12_0_MA_R0_BIT 10
-#define MCA_UMC_HWID_V12_0 0x96
-#define MCA_UMC_MCATYPE_V12_0 0x0
-
#define MCA_IPID_LO_2_UMC_CH(_ipid_lo) (((((_ipid_lo) >> 20) & 0x1) * 4) + \
(((_ipid_lo) >> 12) & 0xF))
#define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7)
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
index ecd7ead7a60b..8bb9592b0981 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
@@ -1165,36 +1165,6 @@ static int uvd_v6_0_wait_for_idle(struct amdgpu_ip_block *ip_block)
}
#define AMDGPU_UVD_STATUS_BUSY_MASK 0xfd
-static bool uvd_v6_0_check_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
- u32 srbm_soft_reset = 0;
- u32 tmp = RREG32(mmSRBM_STATUS);
-
- if (REG_GET_FIELD(tmp, SRBM_STATUS, UVD_RQ_PENDING) ||
- REG_GET_FIELD(tmp, SRBM_STATUS, UVD_BUSY) ||
- (RREG32(mmUVD_STATUS) & AMDGPU_UVD_STATUS_BUSY_MASK))
- srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_UVD, 1);
-
- if (srbm_soft_reset) {
- adev->uvd.inst->srbm_soft_reset = srbm_soft_reset;
- return true;
- } else {
- adev->uvd.inst->srbm_soft_reset = 0;
- return false;
- }
-}
-
-static int uvd_v6_0_pre_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
-
- if (!adev->uvd.inst->srbm_soft_reset)
- return 0;
-
- uvd_v6_0_stop(adev);
- return 0;
-}
static int uvd_v6_0_soft_reset(struct amdgpu_ip_block *ip_block)
{
@@ -1227,18 +1197,6 @@ static int uvd_v6_0_soft_reset(struct amdgpu_ip_block *ip_block)
return 0;
}
-static int uvd_v6_0_post_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
-
- if (!adev->uvd.inst->srbm_soft_reset)
- return 0;
-
- mdelay(5);
-
- return uvd_v6_0_start(adev);
-}
-
static int uvd_v6_0_set_interrupt_state(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
unsigned type,
@@ -1538,10 +1496,7 @@ static const struct amd_ip_funcs uvd_v6_0_ip_funcs = {
.resume = uvd_v6_0_resume,
.is_idle = uvd_v6_0_is_idle,
.wait_for_idle = uvd_v6_0_wait_for_idle,
- .check_soft_reset = uvd_v6_0_check_soft_reset,
- .pre_soft_reset = uvd_v6_0_pre_soft_reset,
.soft_reset = uvd_v6_0_soft_reset,
- .post_soft_reset = uvd_v6_0_post_soft_reset,
.set_clockgating_state = uvd_v6_0_set_clockgating_state,
.set_powergating_state = uvd_v6_0_set_powergating_state,
.get_clockgating_state = uvd_v6_0_get_clockgating_state,
diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
index c69f7d82060f..9f4e88440c0a 100644
--- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
@@ -631,47 +631,6 @@ static int vce_v3_0_wait_for_idle(struct amdgpu_ip_block *ip_block)
#define AMDGPU_VCE_STATUS_BUSY_MASK (VCE_STATUS_VCPU_REPORT_AUTO_BUSY_MASK | \
VCE_STATUS_VCPU_REPORT_RB0_BUSY_MASK)
-static bool vce_v3_0_check_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
- u32 srbm_soft_reset = 0;
-
- /* According to VCE team , we should use VCE_STATUS instead
- * SRBM_STATUS.VCE_BUSY bit for busy status checking.
- * GRBM_GFX_INDEX.INSTANCE_INDEX is used to specify which VCE
- * instance's registers are accessed
- * (0 for 1st instance, 10 for 2nd instance).
- *
- *VCE_STATUS
- *|UENC|ACPI|AUTO ACTIVE|RB1 |RB0 |RB2 | |FW_LOADED|JOB |
- *|----+----+-----------+----+----+----+----------+---------+----|
- *|bit8|bit7| bit6 |bit5|bit4|bit3| bit2 | bit1 |bit0|
- *
- * VCE team suggest use bit 3--bit 6 for busy status check
- */
- mutex_lock(&adev->grbm_idx_mutex);
- WREG32(mmGRBM_GFX_INDEX, GET_VCE_INSTANCE(0));
- if (RREG32(mmVCE_STATUS) & AMDGPU_VCE_STATUS_BUSY_MASK) {
- srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_VCE0, 1);
- srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_VCE1, 1);
- }
- WREG32(mmGRBM_GFX_INDEX, GET_VCE_INSTANCE(1));
- if (RREG32(mmVCE_STATUS) & AMDGPU_VCE_STATUS_BUSY_MASK) {
- srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_VCE0, 1);
- srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_VCE1, 1);
- }
- WREG32(mmGRBM_GFX_INDEX, GET_VCE_INSTANCE(0));
- mutex_unlock(&adev->grbm_idx_mutex);
-
- if (srbm_soft_reset) {
- adev->vce.srbm_soft_reset = srbm_soft_reset;
- return true;
- } else {
- adev->vce.srbm_soft_reset = 0;
- return false;
- }
-}
-
static int vce_v3_0_soft_reset(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
@@ -703,31 +662,6 @@ static int vce_v3_0_soft_reset(struct amdgpu_ip_block *ip_block)
return 0;
}
-static int vce_v3_0_pre_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
-
- if (!adev->vce.srbm_soft_reset)
- return 0;
-
- mdelay(5);
-
- return vce_v3_0_suspend(ip_block);
-}
-
-
-static int vce_v3_0_post_soft_reset(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
-
- if (!adev->vce.srbm_soft_reset)
- return 0;
-
- mdelay(5);
-
- return vce_v3_0_resume(ip_block);
-}
-
static int vce_v3_0_set_interrupt_state(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
unsigned type,
@@ -909,10 +843,7 @@ static const struct amd_ip_funcs vce_v3_0_ip_funcs = {
.resume = vce_v3_0_resume,
.is_idle = vce_v3_0_is_idle,
.wait_for_idle = vce_v3_0_wait_for_idle,
- .check_soft_reset = vce_v3_0_check_soft_reset,
- .pre_soft_reset = vce_v3_0_pre_soft_reset,
.soft_reset = vce_v3_0_soft_reset,
- .post_soft_reset = vce_v3_0_post_soft_reset,
.set_clockgating_state = vce_v3_0_set_clockgating_state,
.set_powergating_state = vce_v3_0_set_powergating_state,
.get_clockgating_state = vce_v3_0_get_clockgating_state,
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
index 8b8184fe6764..0d8a3cea63ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -159,9 +159,8 @@ static void vcn_v2_5_ring_begin_use(struct amdgpu_ring *ring)
struct amdgpu_device *adev = ring->adev;
struct amdgpu_vcn_inst *v = &adev->vcn.inst[ring->me];
- atomic_inc(&adev->vcn.inst[0].total_submission_cnt);
-
- cancel_delayed_work_sync(&adev->vcn.inst[0].idle_work);
+ if (!atomic_fetch_inc(&adev->vcn.inst[0].total_submission_cnt))
+ cancel_delayed_work_sync(&adev->vcn.inst[0].idle_work);
/* We can safely return early here because we've cancelled the
* the delayed work so there is no one else to set it to false
@@ -207,10 +206,9 @@ static void vcn_v2_5_ring_end_use(struct amdgpu_ring *ring)
!adev->vcn.inst[ring->me].using_unified_queue)
atomic_dec(&adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
- atomic_dec(&adev->vcn.inst[0].total_submission_cnt);
-
- schedule_delayed_work(&adev->vcn.inst[0].idle_work,
- VCN_IDLE_TIMEOUT);
+ if (atomic_dec_and_test(&adev->vcn.inst[0].total_submission_cnt))
+ schedule_delayed_work(&adev->vcn.inst[0].idle_work,
+ VCN_IDLE_TIMEOUT);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index 894780669f9c..0cce78b205a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -1995,7 +1995,7 @@ static int vcn_v4_0_ring_reset(struct amdgpu_ring *ring,
return amdgpu_ring_reset_helper_end(ring, timedout_fence);
}
-static struct amdgpu_ring_funcs vcn_v4_0_unified_ring_vm_funcs = {
+static const struct amdgpu_ring_funcs vcn_v4_0_unified_ring_vm_funcs = {
.type = AMDGPU_RING_TYPE_VCN_ENC,
.align_mask = 0x3f,
.nop = VCN_ENC_CMD_NO_OP,
@@ -2028,6 +2028,40 @@ static struct amdgpu_ring_funcs vcn_v4_0_unified_ring_vm_funcs = {
.reset = vcn_v4_0_ring_reset,
};
+static const struct amdgpu_ring_funcs vcn_v4_0_unified_ring_vm_funcs_secure = {
+ .type = AMDGPU_RING_TYPE_VCN_ENC,
+ .align_mask = 0x3f,
+ .nop = VCN_ENC_CMD_NO_OP,
+ .secure_submission_supported = true,
+ .no_user_fence = true,
+ .extra_bytes = sizeof(struct amdgpu_vcn_rb_metadata),
+ .get_rptr = vcn_v4_0_unified_ring_get_rptr,
+ .get_wptr = vcn_v4_0_unified_ring_get_wptr,
+ .set_wptr = vcn_v4_0_unified_ring_set_wptr,
+ .patch_cs_in_place = vcn_v4_0_ring_patch_cs_in_place,
+ .emit_frame_size =
+ SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
+ SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 4 +
+ 4 + /* vcn_v2_0_enc_ring_emit_vm_flush */
+ 5 + 5 + /* vcn_v2_0_enc_ring_emit_fence x2 vm fence */
+ 1, /* vcn_v2_0_enc_ring_insert_end */
+ .emit_ib_size = 5, /* vcn_v2_0_enc_ring_emit_ib */
+ .emit_ib = vcn_v2_0_enc_ring_emit_ib,
+ .emit_fence = vcn_v2_0_enc_ring_emit_fence,
+ .emit_vm_flush = vcn_v2_0_enc_ring_emit_vm_flush,
+ .test_ring = amdgpu_vcn_enc_ring_test_ring,
+ .test_ib = amdgpu_vcn_unified_ring_test_ib,
+ .insert_nop = amdgpu_ring_insert_nop,
+ .insert_end = vcn_v2_0_enc_ring_insert_end,
+ .pad_ib = amdgpu_ring_generic_pad_ib,
+ .begin_use = amdgpu_vcn_ring_begin_use,
+ .end_use = amdgpu_vcn_ring_end_use,
+ .emit_wreg = vcn_v2_0_enc_ring_emit_wreg,
+ .emit_reg_wait = vcn_v2_0_enc_ring_emit_reg_wait,
+ .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
+ .reset = vcn_v4_0_ring_reset,
+};
+
/**
* vcn_v4_0_set_unified_ring_funcs - set unified ring functions
*
@@ -2044,10 +2078,11 @@ static void vcn_v4_0_set_unified_ring_funcs(struct amdgpu_device *adev)
continue;
if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 2))
- vcn_v4_0_unified_ring_vm_funcs.secure_submission_supported = true;
-
- adev->vcn.inst[i].ring_enc[0].funcs =
- (const struct amdgpu_ring_funcs *)&vcn_v4_0_unified_ring_vm_funcs;
+ adev->vcn.inst[i].ring_enc[0].funcs =
+ &vcn_v4_0_unified_ring_vm_funcs_secure;
+ else
+ adev->vcn.inst[i].ring_enc[0].funcs =
+ &vcn_v4_0_unified_ring_vm_funcs;
adev->vcn.inst[i].ring_enc[0].me = i;
}
}
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index 7f001c32e911..179b892fb410 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -115,6 +115,19 @@ static int vcn_v4_0_3_early_init(struct amdgpu_ip_block *ip_block)
struct amdgpu_device *adev = ip_block->adev;
int i, r;
+ switch (amdgpu_user_queue) {
+ case -1:
+ case 0:
+ default:
+ adev->vcn.disable_kq = false;
+ adev->vcn.disable_uq = true;
+ break;
+ case 2:
+ adev->vcn.disable_kq = true;
+ adev->vcn.disable_uq = true;
+ break;
+ }
+
for (i = 0; i < adev->vcn.num_vcn_inst; ++i)
/* re-use enc ring as unified ring */
adev->vcn.inst[i].num_enc_rings = 1;
@@ -217,6 +230,10 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
ring = &adev->vcn.inst[i].ring_enc[0];
ring->use_doorbell = true;
+ if (adev->vcn.disable_kq) {
+ ring->no_scheduler = true;
+ ring->no_user_submission = true;
+ }
if (!amdgpu_sriov_vf(adev))
ring->doorbell_index =
@@ -2146,71 +2163,6 @@ static const struct amdgpu_ras_block_hw_ops vcn_v4_0_3_ras_hw_ops = {
.query_poison_status = vcn_v4_0_3_query_poison_status,
};
-static int vcn_v4_0_3_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- struct aca_bank_info info;
- u64 misc0;
- int ret;
-
- ret = aca_bank_info_decode(bank, &info);
- if (ret)
- return ret;
-
- misc0 = bank->regs[ACA_REG_IDX_MISC0];
- switch (type) {
- case ACA_SMU_TYPE_UE:
- bank->aca_err_type = ACA_ERROR_TYPE_UE;
- ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
- 1ULL);
- break;
- case ACA_SMU_TYPE_CE:
- bank->aca_err_type = ACA_ERROR_TYPE_CE;
- ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
- ACA_REG__MISC0__ERRCNT(misc0));
- break;
- default:
- return -EINVAL;
- }
-
- return ret;
-}
-
-/* reference to smu driver if header file */
-static int vcn_v4_0_3_err_codes[] = {
- 14, 15, /* VCN */
-};
-
-static bool vcn_v4_0_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- u32 instlo;
-
- instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
- instlo &= GENMASK(31, 1);
-
- if (instlo != mmSMNAID_AID0_MCA_SMU)
- return false;
-
- if (aca_bank_check_error_codes(handle->adev, bank,
- vcn_v4_0_3_err_codes,
- ARRAY_SIZE(vcn_v4_0_3_err_codes)))
- return false;
-
- return true;
-}
-
-static const struct aca_bank_ops vcn_v4_0_3_aca_bank_ops = {
- .aca_bank_parser = vcn_v4_0_3_aca_bank_parser,
- .aca_bank_is_valid = vcn_v4_0_3_aca_bank_is_valid,
-};
-
-static const struct aca_info vcn_v4_0_3_aca_info = {
- .hwip = ACA_HWIP_TYPE_SMU,
- .mask = ACA_ERROR_UE_MASK,
- .bank_ops = &vcn_v4_0_3_aca_bank_ops,
-};
-
static int vcn_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
int r;
@@ -2226,11 +2178,6 @@ static int vcn_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_commo
goto late_fini;
}
- r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__VCN,
- &vcn_v4_0_3_aca_info, NULL);
- if (r)
- goto late_fini;
-
return 0;
late_fini:
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
index 1571cc5a148c..c8879a6e5297 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
@@ -1479,10 +1479,11 @@ static int vcn_v4_0_5_ring_reset(struct amdgpu_ring *ring,
return amdgpu_ring_reset_helper_end(ring, timedout_fence);
}
-static struct amdgpu_ring_funcs vcn_v4_0_5_unified_ring_vm_funcs = {
+static const struct amdgpu_ring_funcs vcn_v4_0_5_unified_ring_vm_funcs = {
.type = AMDGPU_RING_TYPE_VCN_ENC,
.align_mask = 0x3f,
.nop = VCN_ENC_CMD_NO_OP,
+ .secure_submission_supported = true,
.no_user_fence = true,
.get_rptr = vcn_v4_0_5_unified_ring_get_rptr,
.get_wptr = vcn_v4_0_5_unified_ring_get_wptr,
@@ -1525,9 +1526,6 @@ static void vcn_v4_0_5_set_unified_ring_funcs(struct amdgpu_device *adev)
if (adev->vcn.harvest_config & (1 << i))
continue;
- if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 5))
- vcn_v4_0_5_unified_ring_vm_funcs.secure_submission_supported = true;
-
adev->vcn.inst[i].ring_enc[0].funcs = &vcn_v4_0_5_unified_ring_vm_funcs;
adev->vcn.inst[i].ring_enc[0].me = i;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
index d3db0494341e..1a07c3bf4425 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
@@ -94,6 +94,19 @@ static int vcn_v5_0_1_early_init(struct amdgpu_ip_block *ip_block)
struct amdgpu_device *adev = ip_block->adev;
int i, r;
+ switch (amdgpu_user_queue) {
+ case -1:
+ case 0:
+ default:
+ adev->vcn.disable_kq = false;
+ adev->vcn.disable_uq = true;
+ break;
+ case 2:
+ adev->vcn.disable_kq = true;
+ adev->vcn.disable_uq = true;
+ break;
+ }
+
for (i = 0; i < adev->vcn.num_vcn_inst; ++i)
/* re-use enc ring as unified ring */
adev->vcn.inst[i].num_enc_rings = 1;
@@ -188,6 +201,10 @@ static int vcn_v5_0_1_sw_init(struct amdgpu_ip_block *ip_block)
ring = &adev->vcn.inst[i].ring_enc[0];
ring->use_doorbell = true;
+ if (adev->vcn.disable_kq) {
+ ring->no_scheduler = true;
+ ring->no_user_submission = true;
+ }
if (!amdgpu_sriov_vf(adev))
ring->doorbell_index =
(adev->doorbell_index.vcn.vcn_ring0_1 << 1) +
@@ -1657,10 +1674,7 @@ static const struct amd_ip_funcs vcn_v5_0_1_ip_funcs = {
.resume = vcn_v5_0_1_resume,
.is_idle = vcn_v5_0_1_is_idle,
.wait_for_idle = vcn_v5_0_1_wait_for_idle,
- .check_soft_reset = NULL,
- .pre_soft_reset = NULL,
.soft_reset = NULL,
- .post_soft_reset = NULL,
.set_clockgating_state = vcn_v5_0_1_set_clockgating_state,
.set_powergating_state = vcn_set_powergating_state,
.dump_ip_state = amdgpu_vcn_dump_ip_state,
@@ -1713,71 +1727,6 @@ static const struct amdgpu_ras_block_hw_ops vcn_v5_0_1_ras_hw_ops = {
.query_poison_status = vcn_v5_0_1_query_poison_status,
};
-static int vcn_v5_0_1_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- struct aca_bank_info info;
- u64 misc0;
- int ret;
-
- ret = aca_bank_info_decode(bank, &info);
- if (ret)
- return ret;
-
- misc0 = bank->regs[ACA_REG_IDX_MISC0];
- switch (type) {
- case ACA_SMU_TYPE_UE:
- bank->aca_err_type = ACA_ERROR_TYPE_UE;
- ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
- 1ULL);
- break;
- case ACA_SMU_TYPE_CE:
- bank->aca_err_type = ACA_ERROR_TYPE_CE;
- ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type,
- ACA_REG__MISC0__ERRCNT(misc0));
- break;
- default:
- return -EINVAL;
- }
-
- return ret;
-}
-
-/* reference to smu driver if header file */
-static int vcn_v5_0_1_err_codes[] = {
- 14, 15, 47, /* VCN [D|V|S] */
-};
-
-static bool vcn_v5_0_1_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
- enum aca_smu_type type, void *data)
-{
- u32 instlo;
-
- instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
- instlo &= GENMASK(31, 1);
-
- if (instlo != mmSMNAID_AID0_MCA_SMU)
- return false;
-
- if (aca_bank_check_error_codes(handle->adev, bank,
- vcn_v5_0_1_err_codes,
- ARRAY_SIZE(vcn_v5_0_1_err_codes)))
- return false;
-
- return true;
-}
-
-static const struct aca_bank_ops vcn_v5_0_1_aca_bank_ops = {
- .aca_bank_parser = vcn_v5_0_1_aca_bank_parser,
- .aca_bank_is_valid = vcn_v5_0_1_aca_bank_is_valid,
-};
-
-static const struct aca_info vcn_v5_0_1_aca_info = {
- .hwip = ACA_HWIP_TYPE_SMU,
- .mask = ACA_ERROR_UE_MASK,
- .bank_ops = &vcn_v5_0_1_aca_bank_ops,
-};
-
static int vcn_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
int r;
@@ -1786,11 +1735,6 @@ static int vcn_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_commo
if (r)
return r;
- r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__VCN,
- &vcn_v5_0_1_aca_info, NULL);
- if (r)
- goto late_fini;
-
if (amdgpu_ras_is_supported(adev, ras_block->block) &&
adev->vcn.inst->ras_poison_irq.funcs) {
r = amdgpu_irq_get(adev, &adev->vcn.inst->ras_poison_irq, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_2.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_2.c
index bbc172db91a1..b9f6ae75ea72 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_2.c
@@ -1203,10 +1203,7 @@ static const struct amd_ip_funcs vcn_v5_0_2_ip_funcs = {
.resume = vcn_v5_0_2_resume,
.is_idle = vcn_v5_0_2_is_idle,
.wait_for_idle = vcn_v5_0_2_wait_for_idle,
- .check_soft_reset = NULL,
- .pre_soft_reset = NULL,
.soft_reset = NULL,
- .post_soft_reset = NULL,
.set_clockgating_state = vcn_v5_0_2_set_clockgating_state,
.set_powergating_state = vcn_set_powergating_state,
};
diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
index a256320b92f3..5715b6b596af 100644
--- a/drivers/gpu/drm/amd/amdgpu/vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/vi.c
@@ -1328,27 +1328,6 @@ static void vi_invalidate_hdp(struct amdgpu_device *adev,
}
}
-static bool vi_need_full_reset(struct amdgpu_device *adev)
-{
- switch (adev->asic_type) {
- case CHIP_CARRIZO:
- case CHIP_STONEY:
- /* CZ has hang issues with full reset at the moment */
- return false;
- case CHIP_FIJI:
- case CHIP_TONGA:
- /* XXX: soft reset should work on fiji and tonga */
- return true;
- case CHIP_POLARIS10:
- case CHIP_POLARIS11:
- case CHIP_POLARIS12:
- case CHIP_TOPAZ:
- default:
- /* change this when we support soft reset */
- return true;
- }
-}
-
static void vi_get_pcie_usage(struct amdgpu_device *adev, uint64_t *count0,
uint64_t *count1)
{
@@ -1437,7 +1416,6 @@ static const struct amdgpu_asic_funcs vi_asic_funcs =
.get_config_memsize = &vi_get_config_memsize,
.flush_hdp = &vi_flush_hdp,
.invalidate_hdp = &vi_invalidate_hdp,
- .need_full_reset = &vi_need_full_reset,
.init_doorbell_index = &legacy_doorbell_index_init,
.get_pcie_usage = &vi_get_pcie_usage,
.need_reset_on_init = &vi_need_reset_on_init,