summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c33
1 files changed, 21 insertions, 12 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e8875351967e..08133de21fdd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -112,7 +112,12 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
uint64_t addr);
#ifdef CONFIG_X86_MCE_AMD
-static void amdgpu_register_bad_pages_mca_notifier(void);
+static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
+struct mce_notifier_adev_list {
+ struct amdgpu_device *devs[MAX_GPU_INSTANCE];
+ int num_gpu;
+};
+static struct mce_notifier_adev_list mce_adev_list;
#endif
void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
@@ -2108,7 +2113,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
#ifdef CONFIG_X86_MCE_AMD
if ((adev->asic_type == CHIP_ALDEBARAN) &&
(adev->gmc.xgmi.connected_to_cpu))
- amdgpu_register_bad_pages_mca_notifier();
+ amdgpu_register_bad_pages_mca_notifier(adev);
#endif
return 0;
@@ -2605,24 +2610,18 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
#ifdef CONFIG_X86_MCE_AMD
static struct amdgpu_device *find_adev(uint32_t node_id)
{
- struct amdgpu_gpu_instance *gpu_instance;
int i;
struct amdgpu_device *adev = NULL;
- mutex_lock(&mgpu_info.mutex);
-
- for (i = 0; i < mgpu_info.num_gpu; i++) {
- gpu_instance = &(mgpu_info.gpu_ins[i]);
- adev = gpu_instance->adev;
+ for (i = 0; i < mce_adev_list.num_gpu; i++) {
+ adev = mce_adev_list.devs[i];
- if (adev->gmc.xgmi.connected_to_cpu &&
+ if (adev && adev->gmc.xgmi.connected_to_cpu &&
adev->gmc.xgmi.physical_node_id == node_id)
break;
adev = NULL;
}
- mutex_unlock(&mgpu_info.mutex);
-
return adev;
}
@@ -2718,9 +2717,19 @@ static struct notifier_block amdgpu_bad_page_nb = {
.priority = MCE_PRIO_UC,
};
-static void amdgpu_register_bad_pages_mca_notifier(void)
+static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
{
/*
+ * Add the adev to the mce_adev_list.
+ * During mode2 reset, amdgpu device is temporarily
+ * removed from the mgpu_info list which can cause
+ * page retirement to fail.
+ * Use this list instead of mgpu_info to find the amdgpu
+ * device on which the UMC error was reported.
+ */
+ mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
+
+ /*
* Register the x86 notifier only once
* with MCE subsystem.
*/