author		Stanley.Yang <Stanley.Yang@amd.com>		2020-09-15 16:15:05 +0800
committer	Alex Deucher <alexander.deucher@amd.com>	2020-09-22 17:37:38 -0400
commit		3f975d0f71d384825f47c3598d1d5358e40a57f5
tree		1daf6289c4aa95790897b0cf2ba2b450708b06a2 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent		d117413f5e1be245b7b3e0cd6afda402609b2572
drm/amdgpu: update athub interrupt harvesting handling
A GCEA/MMHUB EA error should not result in a DF freeze; this is
fixed in the next generation, but for some reason a GCEA/MMHUB
EA error does result in a DF freeze on previous generations.
The driver should therefore avoid reporting a GCEA/MMHUB EA error
as a HW fatal error in the kernel message, by reading the
GCEA/MMHUB err status registers instead.
Changes from V1:
    make the query_ras_error_status function more general
    make reading the mmhub err status register more friendly
Changes from V2:
    move the ras error status query function into the do_recovery workqueue
Changes from V3:
    remove useless code from V2, print the GCEA error status
    instance number
Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e5ea14774c0c..40614ac9a111 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1498,6 +1498,45 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
         }
 }
 
+/* Parse RdRspStatus and WrRspStatus */
+void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
+                struct ras_query_if *info)
+{
+        /*
+         * Only two block need to query read/write
+         * RspStatus at current state
+         */
+        switch (info->head.block) {
+        case AMDGPU_RAS_BLOCK__GFX:
+                if (adev->gfx.funcs->query_ras_error_status)
+                        adev->gfx.funcs->query_ras_error_status(adev);
+                break;
+        case AMDGPU_RAS_BLOCK__MMHUB:
+                if (adev->mmhub.funcs->query_ras_error_status)
+                        adev->mmhub.funcs->query_ras_error_status(adev);
+                break;
+        default:
+                break;
+        }
+}
+
+static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
+{
+        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+        struct ras_manager *obj;
+
+        if (!con)
+                return;
+
+        list_for_each_entry(obj, &con->head, node) {
+                struct ras_query_if info = {
+                        .head = obj->head,
+                };
+
+                amdgpu_ras_error_status_query(adev, &info);
+        }
+}
+
 /* recovery begin */
 
 /* return 0 on success.
@@ -1568,8 +1607,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
                 }
 
                 list_for_each_entry(remote_adev,
-                                device_list_handle, gmc.xgmi.head)
+                                device_list_handle, gmc.xgmi.head) {
+                        amdgpu_ras_query_err_status(remote_adev);
                         amdgpu_ras_log_on_err_counter(remote_adev);
+                }
 
                 amdgpu_put_xgmi_hive(hive);
         }
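
For context, the switch in amdgpu_ras_error_status_query() above dispatches to per-IP
callbacks (adev->gfx.funcs->query_ras_error_status and
adev->mmhub.funcs->query_ras_error_status) that are implemented elsewhere in the driver.
The sketch below is a minimal, hypothetical illustration of what such a callback might
do: read a per-instance EA err status register and log non-zero values, together with
the instance number, instead of escalating to a HW fatal error. EXAMPLE_EA_INSTANCE_NUM,
example_ea_err_status_reg(), and the function name are invented placeholders, not the
actual GCEA/MMHUB definitions.

/*
 * Hypothetical per-IP query_ras_error_status() callback, for
 * illustration only. EXAMPLE_EA_INSTANCE_NUM and
 * example_ea_err_status_reg() are placeholders for the real
 * per-ASIC instance count and EA err status register offsets.
 */
#define EXAMPLE_EA_INSTANCE_NUM 8       /* assumed number of EA instances */

static void example_query_ras_error_status(struct amdgpu_device *adev)
{
        uint32_t i, status;

        for (i = 0; i < EXAMPLE_EA_INSTANCE_NUM; i++) {
                /* Read this instance's EA err status register. */
                status = RREG32(example_ea_err_status_reg(i));

                /*
                 * A non-zero RdRspStatus/WrRspStatus field indicates an
                 * EA error; report it with its instance number rather
                 * than flagging a HW fatal error.
                 */
                if (status)
                        dev_warn(adev->dev,
                                 "EA err detected at instance: %d, status: 0x%x!\n",
                                 i, status);
        }
}

A callback of this shape would be installed through the IP block's funcs table during
hardware init, which is what makes the NULL checks in amdgpu_ras_error_status_query()
above meaningful: blocks that do not implement the query are simply skipped.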