Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 321
1 file changed, 179 insertions(+), 142 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f0924aa3f4e4..68685aca2835 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1733,7 +1733,7 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
*/
static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
- struct kobject *kobj, struct bin_attribute *attr,
+ struct kobject *kobj, const struct bin_attribute *attr,
char *buf, loff_t ppos, size_t count)
{
struct amdgpu_ras *con =
@@ -1864,6 +1864,9 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
if (!obj || obj->attr_inuse)
return -EINVAL;
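+ /* on a VF, create the sysfs node only for blocks with RAS telemetry enabled */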
+ if (amdgpu_sriov_vf(adev) && !amdgpu_virt_ras_telemetry_block_en(adev, head->block))
+ return 0;
+
get_obj(obj);
snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
@@ -2065,8 +2068,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
/* debugfs end */
/* ras fs */
-static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
- amdgpu_ras_sysfs_badpages_read, NULL, 0);
+static const BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
+ amdgpu_ras_sysfs_badpages_read, NULL, 0);
static DEVICE_ATTR(features, S_IRUGO,
amdgpu_ras_sysfs_features_read, NULL);
static DEVICE_ATTR(version, 0444,
@@ -2088,7 +2091,7 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
&con->event_state_attr.attr,
NULL
};
- struct bin_attribute *bin_attrs[] = {
+ const struct bin_attribute *bin_attrs[] = {
NULL,
NULL,
};
@@ -2114,11 +2117,10 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
if (amdgpu_bad_page_threshold != 0) {
/* add bad_page_features entry */
- bin_attr_gpu_vram_bad_pages.private = NULL;
con->badpages_attr = bin_attr_gpu_vram_bad_pages;
+ sysfs_bin_attr_init(&con->badpages_attr);
bin_attrs[0] = &con->badpages_attr;
- group.bin_attrs = bin_attrs;
- sysfs_bin_attr_init(bin_attrs[0]);
+ group.bin_attrs_new = bin_attrs;
}
r = sysfs_create_group(&adev->dev->kobj, &group);
@@ -2796,20 +2798,108 @@ static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
return -EINVAL;
}
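+/*
+ * Append @count eeprom records to the eh_data bad page list: skip pages
+ * that are already tracked, reserve each new page, and grow the list in
+ * 256-record chunks when space runs out. Caller holds con->recovery_lock.
+ */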
+static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, int count)
+{
+ int j;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_err_handler_data *data = con->eh_data;
+
+ for (j = 0; j < count; j++) {
+ if (amdgpu_ras_check_bad_page_unlock(con,
+ bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ continue;
+
+ if (!data->space_left &&
+ amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
+ return -ENOMEM;
+ }
+
+ amdgpu_ras_reserve_page(adev, bps[j].retired_page);
+
+ memcpy(&data->bps[data->count], &(bps[j]),
+ sizeof(struct eeprom_table_record));
+ data->count++;
+ data->space_left--;
+ }
+
+ return 0;
+}
+
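+/*
+ * Convert one retire-unit group of eeprom records and queue the pages for
+ * retirement. Pre-UMC-v12 ASICs store physical addresses directly; newer
+ * records carry the NPS mode in the upper bits of retired_page and are
+ * translated to match the current memory partition mode first.
+ */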
+static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, struct ras_err_data *err_data,
+ enum amdgpu_memory_partition nps)
+{
+ int i = 0;
+ enum amdgpu_memory_partition save_nps;
+
+ save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
+
+ /* old ASICs just have PA in eeprom */
+ if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
+ memcpy(err_data->err_addr, bps,
+ sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
+ goto out;
+ }
+
+ for (i = 0; i < adev->umc.retire_unit; i++)
+ bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
+
+ if (save_nps) {
+ if (save_nps == nps) {
+ if (amdgpu_umc_pages_in_a_row(adev, err_data,
+ bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ return -EINVAL;
+ } else {
+ if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
+ return -EINVAL;
+ }
+ } else {
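+ /*
+ * No NPS mode saved in the record: it predates NPS-aware storage.
+ * If the RAS TA cannot convert the MCA address, fall back to
+ * treating retired_page as a physical address, which is only
+ * valid in NPS1 mode; otherwise the record is unsupported.
+ */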
+ if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
+ if (nps == AMDGPU_NPS1_PARTITION_MODE)
+ memcpy(err_data->err_addr, bps,
+ sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
+ else
+ return -EOPNOTSUPP;
+ }
+ }
+
+out:
+ return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit);
+}
+
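+/*
+ * Single-record variant of the conversion above: translate one eeprom
+ * record for the current NPS mode and restore the resulting pages.
+ */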
+static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, struct ras_err_data *err_data,
+ enum amdgpu_memory_partition nps)
+{
+ enum amdgpu_memory_partition save_nps;
+
+ save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
+ bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
+
+ if (save_nps == nps) {
+ if (amdgpu_umc_pages_in_a_row(adev, err_data,
+ bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ return -EINVAL;
+ } else {
+ if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
+ return -EINVAL;
+ }
+ return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
+ adev->umc.retire_unit);
+}
+
/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages, bool from_rom)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- struct ras_err_handler_data *data;
struct ras_err_data err_data;
- struct eeprom_table_record *err_rec;
struct amdgpu_ras_eeprom_control *control =
&adev->psp.ras_context.ras->eeprom_control;
enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
int ret = 0;
- uint32_t i, j, loop_cnt = 1;
- bool find_pages_per_pa = false;
+ uint32_t i;
if (!con || !con->eh_data || !bps || pages <= 0)
return 0;
@@ -2820,114 +2910,46 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
sizeof(struct eeprom_table_record), GFP_KERNEL);
if (!err_data.err_addr) {
dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
- err_rec = err_data.err_addr;
- loop_cnt = adev->umc.retire_unit;
if (adev->gmc.gmc_funcs->query_mem_partition_mode)
nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
}
mutex_lock(&con->recovery_lock);
- data = con->eh_data;
- if (!data) {
- /* Returning 0 as the absence of eh_data is acceptable */
- goto free;
- }
-
- for (i = 0; i < pages; i++) {
- if (from_rom &&
- control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) {
- if (!find_pages_per_pa) {
- if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
- if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
- /* may use old RAS TA, use PA to find pages in
- * one row
- */
- if (amdgpu_umc_pages_in_a_row(adev, &err_data,
- bps[i].retired_page <<
- AMDGPU_GPU_PAGE_SHIFT)) {
- ret = -EINVAL;
- goto free;
- } else {
- find_pages_per_pa = true;
- }
- } else {
- /* unsupported cases */
- ret = -EOPNOTSUPP;
- goto free;
- }
- }
- } else {
- if (amdgpu_umc_pages_in_a_row(adev, &err_data,
- bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
- ret = -EINVAL;
- goto free;
- }
- }
- } else {
- if (from_rom && !find_pages_per_pa) {
- if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
- /* bad page in any NPS mode in eeprom */
- if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
- ret = -EINVAL;
+
+ if (from_rom) {
+ for (i = 0; i < pages; i++) {
+ if (control->ras_num_recs - i >= adev->umc.retire_unit) {
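+ /*
+ * Records saved as physical addresses are written one retire
+ * unit at a time and share a single MCA address and channel;
+ * a matching neighbor marks the start of such a group.
+ */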
+ if ((bps[i].address == bps[i + 1].address) &&
+ (bps[i].mem_channel == bps[i + 1].mem_channel)) {
+ /* deal with retire_unit records at a time */
+ ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
+ &bps[i], &err_data, nps);
+ if (ret)
goto free;
- }
+ i += (adev->umc.retire_unit - 1);
} else {
- /* legacy bad page in eeprom, generated only in
- * NPS1 mode
- */
- if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
- /* old RAS TA or ASICs which don't support to
- * convert addrss via mca address
- */
- if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
- find_pages_per_pa = true;
- err_rec = &bps[i];
- loop_cnt = 1;
- } else {
- /* non-nps1 mode, old RAS TA
- * can't support it
- */
- ret = -EOPNOTSUPP;
- goto free;
- }
- }
+ break;
}
-
- if (!find_pages_per_pa)
- i += (adev->umc.retire_unit - 1);
} else {
- err_rec = &bps[i];
+ break;
}
}
-
- for (j = 0; j < loop_cnt; j++) {
- if (amdgpu_ras_check_bad_page_unlock(con,
- err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
- continue;
-
- if (!data->space_left &&
- amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
- ret = -ENOMEM;
+ for (; i < pages; i++) {
+ ret = __amdgpu_ras_convert_rec_from_rom(adev,
+ &bps[i], &err_data, nps);
+ if (ret)
goto free;
- }
-
- amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
-
- memcpy(&data->bps[data->count], &(err_rec[j]),
- sizeof(struct eeprom_table_record));
- data->count++;
- data->space_left--;
}
+ } else {
+ ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
}
free:
if (from_rom)
kfree(err_data.err_addr);
-out:
mutex_unlock(&con->recovery_lock);
return ret;
@@ -2966,18 +2988,18 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
/* only new entries are saved */
if (save_count > 0) {
- if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA) {
+ /* old ASICs only save PA to eeprom, as before */
+ if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
if (amdgpu_ras_eeprom_append(control,
- &data->bps[control->ras_num_recs],
- save_count)) {
+ &data->bps[bad_page_num], save_count)) {
dev_err(adev->dev, "Failed to save EEPROM table data!");
return -EIO;
}
} else {
for (i = 0; i < unit_num; i++) {
if (amdgpu_ras_eeprom_append(control,
- &data->bps[bad_page_num + i * adev->umc.retire_unit],
- 1)) {
+ &data->bps[bad_page_num +
+ i * adev->umc.retire_unit], 1)) {
dev_err(adev->dev, "Failed to save EEPROM table data!");
return -EIO;
}
@@ -2999,7 +3021,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
struct amdgpu_ras_eeprom_control *control =
&adev->psp.ras_context.ras->eeprom_control;
struct eeprom_table_record *bps;
- int ret;
+ int ret, i = 0;
/* no bad page record, skip eeprom access */
if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
@@ -3013,13 +3035,23 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
if (ret) {
dev_err(adev->dev, "Failed to load EEPROM table records!");
} else {
- if (control->ras_num_recs > 1 &&
- adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
- if ((bps[0].address == bps[1].address) &&
- (bps[0].mem_channel == bps[1].mem_channel))
- control->rec_type = AMDGPU_RAS_EEPROM_REC_PA;
- else
- control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
+ if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
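+ /*
+ * Classify the loaded records: a retire-unit-sized group
+ * sharing one address/channel pair was saved as physical
+ * addresses; everything else is counted as MCA records.
+ */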
+ for (i = 0; i < control->ras_num_recs; i++) {
+ if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
+ if ((bps[i].address == bps[i + 1].address) &&
+ (bps[i].mem_channel == bps[i + 1].mem_channel)) {
+ control->ras_num_pa_recs += adev->umc.retire_unit;
+ i += (adev->umc.retire_unit - 1);
+ } else {
+ control->ras_num_mca_recs +=
+ (control->ras_num_recs - i);
+ break;
+ }
+ } else {
+ control->ras_num_mca_recs += (control->ras_num_recs - i);
+ break;
+ }
+ }
}
ret = amdgpu_ras_eeprom_check(control);
@@ -3080,31 +3112,29 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
/*
- * Justification of value bad_page_cnt_threshold in ras structure
- *
- * Generally, 0 <= amdgpu_bad_page_threshold <= max record length
- * in eeprom or amdgpu_bad_page_threshold == -2, introduce two
- * scenarios accordingly.
- *
- * Bad page retirement enablement:
- * - If amdgpu_bad_page_threshold = -2,
- * bad_page_cnt_threshold = typical value by formula.
- *
- * - When the value from user is 0 < amdgpu_bad_page_threshold <
- * max record length in eeprom, use it directly.
- *
- * Bad page retirement disablement:
- * - If amdgpu_bad_page_threshold = 0, bad page retirement
- * functionality is disabled, and bad_page_cnt_threshold will
- * take no effect.
+ * amdgpu_bad_page_threshold is used to configure
+ * the threshold for the number of bad pages.
+ * -1: Threshold is set to a default value.
+ *     Driver will issue a warning message when the threshold is reached
+ *     and continue runtime services.
+ * 0: Disable bad page retirement.
+ *    Driver will not retire bad pages,
+ *    which is intended for debugging purposes.
+ * -2: Threshold is determined by a formula
+ *     that assumes 1 bad page per 100M of local memory.
+ *     Driver will continue runtime services when the threshold is reached.
+ * 0 < threshold < max number of bad page records in EEPROM:
+ *     A user-defined threshold is set.
+ *     Driver will halt runtime services when this custom threshold is reached.
*/
-
- if (amdgpu_bad_page_threshold < 0) {
+ if (amdgpu_bad_page_threshold == -2) {
u64 val = adev->gmc.mc_vram_size;
do_div(val, RAS_BAD_PAGE_COVER);
con->bad_page_cnt_threshold = min(lower_32_bits(val),
max_count);
+ } else if (amdgpu_bad_page_threshold == -1) {
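+ /* default threshold: 16 bad page records per 2 MiB of reserved vram */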
+ con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4;
} else {
con->bad_page_cnt_threshold = min_t(int, max_count,
amdgpu_bad_page_threshold);
@@ -3427,12 +3457,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
return ret;
if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
- control->rec_type = AMDGPU_RAS_EEPROM_REC_PA;
-
- /* default status is MCA storage */
- if (control->ras_num_recs <= 1 &&
- adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
- control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
+ control->ras_num_pa_recs = control->ras_num_recs;
if (control->ras_num_recs) {
ret = amdgpu_ras_load_bad_pages(adev);
@@ -3447,6 +3472,13 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
adev, control->bad_channel_bitmap);
con->update_channel_flag = false;
}
+
+ /* The format action is only applied to new ASICs */
+ if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 &&
+ control->tbl_hdr.version < RAS_TABLE_VER_V3)
+ if (!amdgpu_ras_eeprom_reset_table(control))
+ if (amdgpu_ras_save_bad_pages(adev, NULL))
+ dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
}
return ret;
@@ -3759,8 +3791,11 @@ init_ras_enabled_flag:
adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
adev->ras_hw_enabled & amdgpu_ras_mask;
- /* aca is disabled by default */
- adev->aca.is_enabled = false;
+ /* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14 */
+ adev->aca.is_enabled =
+ (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) ||
+ amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) ||
+ amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14));
/* bad page feature is not applicable to specific app platform */
if (adev->gmc.is_app_apu &&
@@ -3848,8 +3883,10 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 2):
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 12):
+ con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT;
+ break;
case IP_VERSION(13, 0, 14):
- con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
+ con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1);
break;
default:
break;
@@ -5127,9 +5164,9 @@ static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
"socket: %d, aid: %d, fw_status: 0x%x, data abort exception\n",
socket_id, aid_id, fw_status);
- if (AMDGPU_RAS_GPU_ERR_UNKNOWN(boot_error))
+ if (AMDGPU_RAS_GPU_ERR_GENERIC(boot_error))
dev_info(adev->dev,
- "socket: %d, aid: %d, fw_status: 0x%x, unknown boot time errors\n",
+ "socket: %d, aid: %d, fw_status: 0x%x, Boot Controller Generic Error\n",
socket_id, aid_id, fw_status);
}