diff options
author | John Clements <john.clements@amd.com> | 2021-03-25 17:10:10 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2021-04-09 16:54:51 -0400 |
commit | 134d16d50f0948f00e7172b509e869b6eaecf437 (patch) | |
tree | 036563baedfcf183200a94b37eb2f3bbbfac4363 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | 340c571bebbefe03da1c1139b62a55f4ec6fcdce (diff) | |
download | lwn-134d16d50f0948f00e7172b509e869b6eaecf437.tar.gz lwn-134d16d50f0948f00e7172b509e869b6eaecf437.zip |
drm/amdgpu: RAS harvest on driver load
In event of RAS UE + warm reset, error counters shall be harvested and cleared on driver load
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 29 |
1 files changed, 29 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1d905bcbc1ac..b0fe5885e4c6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2090,6 +2090,32 @@ release_con: return r; } +static int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev) +{ + if (adev->gmc.xgmi.connected_to_cpu) + return 1; + return 0; +} + +static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev, + struct ras_common_if *ras_block) +{ + struct ras_query_if info = { + .head = *ras_block, + }; + + if (!amdgpu_persistent_edc_harvesting_supported(adev)) + return 0; + + if (amdgpu_ras_query_error_status(adev, &info) != 0) + DRM_WARN("RAS init harvest failure"); + + if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) + DRM_WARN("RAS init harvest reset failure"); + + return 0; +} + /* helper function to handle common stuff in ip late init phase */ int amdgpu_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block, @@ -2119,6 +2145,9 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, return r; } + /* check for errors on warm reset edc persisant supported ASIC */ + amdgpu_persistent_edc_harvesting(adev, ras_block); + /* in resume phase, no need to create ras fs node */ if (adev->in_suspend || amdgpu_in_reset(adev)) return 0; |