From 322a7e005db78b8a46ead91b7e3df3514cb658f0 Mon Sep 17 00:00:00 2001 From: Hawking Zhang <Hawking.Zhang@amd.com> Date: Thu, 2 Feb 2023 20:54:08 +0800 Subject: drm/amdgpu: Add common helper to query ras error (v2) Add common helper to query ras error status and log error information, including memory block id and erorr count. The helpers are applicable to IP blocks that follow the new ras error logging design. For IP blocks that don't support the new design, please still implement ip specific helper to query ras error. v2: optimize struct amdgpu_ras_err_status_reg_entry and the implementaion in helper (Lijo/Tao) Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 54 +++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 17b3d1992e80..c820af7f1a4b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -314,6 +314,43 @@ enum amdgpu_ras_ret { AMDGPU_RAS_PT, }; +/* ras error status reisger fields */ +#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT 0x0 +#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK 0x00000001L +#define ERR_STATUS_LO__MEMORY_ID__SHIFT 0x18 +#define ERR_STATUS_LO__MEMORY_ID_MASK 0xFF000000L +#define ERR_STATUS_HI__ERR_INFO_VALID_FLAG__SHIFT 0x2 +#define ERR_STATUS_HI__ERR_INFO_VALID_FLAG_MASK 0x00000004L +#define ERR_STATUS__ERR_CNT__SHIFT 0x17 +#define ERR_STATUS__ERR_CNT_MASK 0x03800000L + +#define AMDGPU_RAS_REG_ENTRY(ip, inst, reg_lo, reg_hi) \ + ip##_HWIP, inst, reg_lo##_BASE_IDX, reg_lo, reg_hi##_BASE_IDX, reg_hi + +#define AMDGPU_RAS_REG_ENTRY_OFFSET(hwip, ip_inst, segment, reg) \ + (adev->reg_offset[hwip][ip_inst][segment] + (reg)) + +#define AMDGPU_RAS_ERR_INFO_VALID (1 << 0) +#define AMDGPU_RAS_ERR_STATUS_VALID (1 << 1) +#define AMDGPU_RAS_ERR_ADDRESS_VALID (1 << 2) + +struct amdgpu_ras_err_status_reg_entry { + uint32_t hwip; + uint32_t ip_inst; + uint32_t seg_lo; + uint32_t reg_lo; + uint32_t seg_hi; + uint32_t reg_hi; + uint32_t reg_inst; + uint32_t flags; + const char *block_name; +}; + +struct amdgpu_ras_memory_id_entry { + uint32_t memory_id; + const char *name; +}; + struct ras_common_if { enum amdgpu_ras_block block; enum amdgpu_ras_error_type type; @@ -696,4 +733,21 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object *ras_block_obj); void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev); +void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name); +bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_entry, + uint32_t instance, + uint32_t *memory_id); +bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_entry, + uint32_t instance, + unsigned long *err_cnt); +void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_list, + uint32_t reg_list_size, + const struct amdgpu_ras_memory_id_entry *mem_list, + uint32_t mem_list_size, + uint32_t instance, + uint32_t err_type, + unsigned long *err_count); #endif -- cgit v1.2.3 From e53a3250f76b8a0dd5b533bd0ce0dc821055e77d Mon Sep 17 00:00:00 2001 From: Hawking Zhang <Hawking.Zhang@amd.com> Date: Fri, 3 Feb 2023 16:10:37 +0800 Subject: drm/amdgpu: Add common helper to reset ras error Add common helper to reset ras error status. It applies to IP blocks that follow the new ras error logging register design, and need to write 0 to reset the error status. For IP blocks that don't support the new design, please still implement ip specific helper. Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 ++++ 2 files changed, 24 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 57e86af0c906..8a16a06cb78a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3222,3 +3222,23 @@ void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, } } } + +void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_list, + uint32_t reg_list_size, + uint32_t instance) +{ + uint32_t err_status_lo_offset, err_status_hi_offset; + uint32_t i; + + for (i = 0; i < reg_list_size; i++) { + err_status_lo_offset = + AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance, + reg_list[i].seg_lo, reg_list[i].reg_lo); + err_status_hi_offset = + AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance, + reg_list[i].seg_hi, reg_list[i].reg_hi); + WREG32(err_status_lo_offset, 0); + WREG32(err_status_hi_offset, 0); + } +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index c820af7f1a4b..e96333d0c269 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -750,4 +750,8 @@ void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, uint32_t instance, uint32_t err_type, unsigned long *err_count); +void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_list, + uint32_t reg_list_size, + uint32_t instance); #endif -- cgit v1.2.3 From 2c22ed0bdb0cb6da9408593eafa6137325576017 Mon Sep 17 00:00:00 2001 From: Tao Zhou <tao.zhou1@amd.com> Date: Mon, 27 Feb 2023 18:25:23 +0800 Subject: drm/amdgpu: add instance mask for RAS inject User can specify injected instances by the mask. For backward compatibility, the mask value is incorporated into sub block index without interface change of RAS TA. User uses logical mask and driver should convert it to physical value before sending it to RAS TA. v2: update parameter name. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 21 ++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 23 ++++++++++++++++------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 9 ++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 5 +++-- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 5 +++-- 8 files changed, 56 insertions(+), 19 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index ec79a5c2f500..59b8b26e2caf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1672,14 +1672,33 @@ int psp_ras_initialize(struct psp_context *psp) } int psp_ras_trigger_error(struct psp_context *psp, - struct ta_ras_trigger_error_input *info) + struct ta_ras_trigger_error_input *info, uint32_t instance_mask) { struct ta_ras_shared_memory *ras_cmd; + struct amdgpu_device *adev = psp->adev; int ret; + uint32_t dev_mask; if (!psp->ras_context.context.initialized) return -EINVAL; + switch (info->block_id) { + case TA_RAS_BLOCK__GFX: + dev_mask = GET_MASK(GC, instance_mask); + break; + case TA_RAS_BLOCK__SDMA: + dev_mask = GET_MASK(SDMA0, instance_mask); + break; + default: + dev_mask = instance_mask; + break; + } + + /* reuse sub_block_index for backward compatibility */ + dev_mask <<= AMDGPU_RAS_INST_SHIFT; + dev_mask &= AMDGPU_RAS_INST_MASK; + info->sub_block_index |= dev_mask; + ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf; memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 0a409da749d1..d84323923a3f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -486,7 +486,7 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id); int psp_ras_enable_features(struct psp_context *psp, union ta_ras_cmd_input *info, bool enable); int psp_ras_trigger_error(struct psp_context *psp, - struct ta_ras_trigger_error_input *info); + struct ta_ras_trigger_error_input *info, uint32_t instance_mask); int psp_ras_terminate(struct psp_context *psp); int psp_hdcp_invoke(struct psp_context *psp, uint32_t ta_cmd_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 64f80e8cbd63..7ae08f168f99 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -256,6 +256,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, int block_id; uint32_t sub_block; u64 address, value; + /* default value is 0 if the mask is not set by user */ + u32 instance_mask = 0; if (*pos) return -EINVAL; @@ -306,7 +308,11 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, data->op = op; if (op == 2) { - if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", + if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x", + &sub_block, &address, &value, &instance_mask) != 4 && + sscanf(str, "%*s %*s %*s %u %llu %llu %u", + &sub_block, &address, &value, &instance_mask) != 4 && + sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", &sub_block, &address, &value) != 3 && sscanf(str, "%*s %*s %*s %u %llu %llu", &sub_block, &address, &value) != 3) @@ -314,6 +320,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, data->head.sub_block_index = sub_block; data->inject.address = address; data->inject.value = value; + data->inject.instance_mask = instance_mask; } } else { if (size < sizeof(*data)) @@ -341,7 +348,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * sub_block_index: some IPs have subcomponets. say, GFX, sDMA. * name: the name of IP. * - * inject has two more members than head, they are address, value. + * inject has three more members than head, they are address, value and mask. * As their names indicate, inject operation will write the * value to the address. * @@ -365,7 +372,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * * echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl * echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl - * echo "inject <block> <error> <sub-block> <address> <value> > /sys/kernel/debug/dri/<N>/ras/ras_ctrl + * echo "inject <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl * * Where N, is the card which you want to affect. * @@ -382,13 +389,14 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * * The sub-block is a the sub-block index, pass 0 if there is no sub-block. * The address and value are hexadecimal numbers, leading 0x is optional. + * The mask means instance mask, is optional, default value is 0x1. * * For instance, * * .. code-block:: bash * * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl - * echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl + * echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl * * How to check the result of the operation? @@ -1117,13 +1125,14 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, if (info->head.block == AMDGPU_RAS_BLOCK__GFX) { if (block_obj->hw_ops->ras_error_inject) - ret = block_obj->hw_ops->ras_error_inject(adev, info); + ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask); } else { /* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */ if (block_obj->hw_ops->ras_error_inject) - ret = block_obj->hw_ops->ras_error_inject(adev, &block_info); + ret = block_obj->hw_ops->ras_error_inject(adev, &block_info, + info->instance_mask); else /*If not defined .ras_error_inject, use default ras_error_inject*/ - ret = psp_ras_trigger_error(&adev->psp, &block_info); + ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask); } if (ret) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index e96333d0c269..bc43f7db17cc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -32,6 +32,11 @@ struct amdgpu_iv_entry; #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) +/* position of instance value in sub_block_index of + * ta_ras_trigger_error_input, the sub block uses lower 12 bits + */ +#define AMDGPU_RAS_INST_MASK 0xfffff000 +#define AMDGPU_RAS_INST_SHIFT 0xc enum amdgpu_ras_block { AMDGPU_RAS_BLOCK__UMC = 0, @@ -508,6 +513,7 @@ struct ras_inject_if { struct ras_common_if head; uint64_t address; uint64_t value; + uint32_t instance_mask; }; struct ras_cure_if { @@ -545,7 +551,8 @@ struct amdgpu_ras_block_object { }; struct amdgpu_ras_block_hw_ops { - int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if); + int (*ras_error_inject)(struct amdgpu_device *adev, + void *inject_if, uint32_t instance_mask); void (*query_ras_error_count)(struct amdgpu_device *adev, void *ras_error_status); void (*query_ras_error_status)(struct amdgpu_device *adev); void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 439925477fb8..85ee1af963dd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -1014,7 +1014,8 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, } /* Trigger XGMI/WAFL error */ -static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if) +static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, + void *inject_if, uint32_t instance_mask) { int ret = 0; struct ta_ras_trigger_error_input *block_info = @@ -1026,7 +1027,7 @@ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *injec if (amdgpu_dpm_allow_xgmi_power_down(adev, false)) dev_warn(adev->dev, "Failed to disallow XGMI power down"); - ret = psp_ras_trigger_error(&adev->psp, block_info); + ret = psp_ras_trigger_error(&adev->psp, block_info, instance_mask); if (amdgpu_ras_intr_triggered()) return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index cc005e3bcd40..de8e70b3db75 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -770,7 +770,7 @@ static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring); static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status); static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev, - void *inject_if); + void *inject_if, uint32_t instance_mask); static void gfx_v9_0_reset_ras_error_count(struct amdgpu_device *adev); static void gfx_v9_0_kiq_set_resources(struct amdgpu_ring *kiq_ring, @@ -6335,7 +6335,7 @@ static const struct soc15_ras_field_entry gfx_v9_0_ras_fields[] = { }; static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev, - void *inject_if) + void *inject_if, uint32_t instance_mask) { struct ras_inject_if *info = (struct ras_inject_if *)inject_if; int ret; @@ -6374,7 +6374,7 @@ static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev, block_info.value = info->value; mutex_lock(&adev->grbm_idx_mutex); - ret = psp_ras_trigger_error(&adev->psp, &block_info); + ret = psp_ras_trigger_error(&adev->psp, &block_info, instance_mask); mutex_unlock(&adev->grbm_idx_mutex); return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c index c67e387a97f5..59abe162bbaf 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c @@ -971,7 +971,7 @@ static void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev) } static int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev, - void *inject_if) + void *inject_if, uint32_t instance_mask) { struct ras_inject_if *info = (struct ras_inject_if *)inject_if; int ret; @@ -987,7 +987,7 @@ static int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev, block_info.value = info->value; mutex_lock(&adev->grbm_idx_mutex); - ret = psp_ras_trigger_error(&adev->psp, &block_info); + ret = psp_ras_trigger_error(&adev->psp, &block_info, instance_mask); mutex_unlock(&adev->grbm_idx_mutex); return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index ec7c049c5952..4906affa6f8c 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -1699,7 +1699,8 @@ static void gfx_v9_4_2_reset_ras_error_count(struct amdgpu_device *adev) gfx_v9_4_2_query_utc_edc_count(adev, NULL, NULL); } -static int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if) +static int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, + void *inject_if, uint32_t instance_mask) { struct ras_inject_if *info = (struct ras_inject_if *)inject_if; int ret; @@ -1715,7 +1716,7 @@ static int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_ block_info.value = info->value; mutex_lock(&adev->grbm_idx_mutex); - ret = psp_ras_trigger_error(&adev->psp, &block_info); + ret = psp_ras_trigger_error(&adev->psp, &block_info, instance_mask); mutex_unlock(&adev->grbm_idx_mutex); return ret; -- cgit v1.2.3 From 6c47a79b3b8ba91faf89f9866da2ec16aac979e7 Mon Sep 17 00:00:00 2001 From: YiPeng Chai <YiPeng.Chai@amd.com> Date: Tue, 16 May 2023 17:34:17 +0800 Subject: drm/amdgpu: perform mode2 reset for sdma fed error on gfx v11_0_3 perform mode2 reset for sdma fed error on gfx v11_0_3. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 +++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 +++++ drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 14 +++++++++++++- 3 files changed, 25 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 6bb438642cc0..f2da69adcd9d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2053,9 +2053,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) /* Perform full reset in fatal error mode */ if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - else + else { clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) { + ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET; + reset_context.method = AMD_RESET_METHOD_MODE2; + } + } + amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); } atomic_set(&ras->in_recovery, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index bc43f7db17cc..46bf1889a9d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -339,6 +339,8 @@ enum amdgpu_ras_ret { #define AMDGPU_RAS_ERR_STATUS_VALID (1 << 1) #define AMDGPU_RAS_ERR_ADDRESS_VALID (1 << 2) +#define AMDGPU_RAS_GPU_RESET_MODE2_RESET (0x1 << 0) + struct amdgpu_ras_err_status_reg_entry { uint32_t hwip; uint32_t ip_inst; @@ -427,6 +429,9 @@ struct amdgpu_ras { /* Indicates smu whether need update bad channel info */ bool update_channel_flag; + + /* Record special requirements of gpu reset caller */ + uint32_t gpu_reset_flags; }; struct ras_fs_data { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c index 068b9586a223..26d6286d86c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c @@ -84,8 +84,20 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev, /* Workaround: when vmid and pasid are both zero, trigger gpu reset in KGD. */ if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) && (entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) && - !entry->vmid && !entry->pasid) + !entry->vmid && !entry->pasid) { + uint32_t rlc_status0 = 0; + + rlc_status0 = RREG32_SOC15(GC, 0, regRLC_RLCS_FED_STATUS_0); + + if (REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA0_FED_ERR) || + REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA1_FED_ERR)) { + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; + } + amdgpu_ras_reset_gpu(adev); + } return 0; } -- cgit v1.2.3