diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 148 |
1 files changed, 104 insertions, 44 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index f28f6b4ba765..0ea7cfaf3587 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -161,6 +161,7 @@ static bool __is_ras_eeprom_supported(struct amdgpu_device *adev) case IP_VERSION(13, 0, 10): return true; case IP_VERSION(13, 0, 6): + case IP_VERSION(13, 0, 12): case IP_VERSION(13, 0, 14): return (adev->gmc.is_app_apu) ? false : true; default: @@ -177,7 +178,7 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev, if (!control) return false; - if (amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) { + if (adev->bios && amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) { /* The address given by VBIOS is an 8-bit, wire-format * address, i.e. the most significant byte. * @@ -223,6 +224,7 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev, return true; case IP_VERSION(13, 0, 6): case IP_VERSION(13, 0, 10): + case IP_VERSION(13, 0, 12): case IP_VERSION(13, 0, 14): control->i2c_address = EEPROM_I2C_MADDR_4; return true; @@ -413,9 +415,11 @@ static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) { case IP_VERSION(8, 10, 0): - case IP_VERSION(12, 0, 0): hdr->version = RAS_TABLE_VER_V2_1; return; + case IP_VERSION(12, 0, 0): + hdr->version = RAS_TABLE_VER_V3; + return; default: hdr->version = RAS_TABLE_VER_V1; return; @@ -443,7 +447,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) hdr->header = RAS_TABLE_HDR_VAL; amdgpu_ras_set_eeprom_table_version(control); - if (hdr->version == RAS_TABLE_VER_V2_1) { + if (hdr->version >= RAS_TABLE_VER_V2_1) { hdr->first_rec_offset = RAS_RECORD_START_V2_1; hdr->tbl_size = RAS_TABLE_HEADER_SIZE + RAS_TABLE_V2_1_INFO_SIZE; @@ -461,7 +465,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) } csum = __calc_hdr_byte_sum(control); - if (hdr->version == RAS_TABLE_VER_V2_1) + if (hdr->version >= RAS_TABLE_VER_V2_1) csum += __calc_ras_info_byte_sum(control); csum = -csum; hdr->checksum = csum; @@ -470,9 +474,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) res = __write_table_ras_info(control); control->ras_num_recs = 0; + control->ras_num_bad_pages = 0; control->ras_fri = 0; - amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs); + amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_bad_pages); control->bad_channel_bitmap = 0; amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap); @@ -557,16 +562,17 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev) return false; if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) { - if (amdgpu_bad_page_threshold == -1) { + if (con->eeprom_control.ras_num_bad_pages > con->bad_page_cnt_threshold) dev_warn(adev->dev, "RAS records:%d exceed threshold:%d", - con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold); + con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold); + if ((amdgpu_bad_page_threshold == -1) || + (amdgpu_bad_page_threshold == -2)) { dev_warn(adev->dev, - "But GPU can be operated due to bad_page_threshold = -1.\n"); + "Please consult AMD Service Action Guide (SAG) for appropriate service procedures.\n"); return false; } else { - dev_warn(adev->dev, "This GPU is in BAD status."); - dev_warn(adev->dev, "Please retire it or set a larger " - "threshold value when reloading driver.\n"); + dev_warn(adev->dev, + "Please consider adjusting the customized threshold.\n"); return true; } } @@ -621,6 +627,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, const u32 num) { struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control)); + struct amdgpu_device *adev = to_amdgpu_device(control); u32 a, b, i; u8 *buf, *pp; int res; @@ -723,6 +730,15 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, control->ras_num_recs = 1 + (control->ras_max_record_count + b - control->ras_fri) % control->ras_max_record_count; + + /*old asics only save pa to eeprom like before*/ + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) + control->ras_num_pa_recs += num; + else + control->ras_num_mca_recs += num; + + control->ras_num_bad_pages = control->ras_num_pa_recs + + control->ras_num_mca_recs * adev->umc.retire_unit; Out: kfree(buf); return res; @@ -740,24 +756,25 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) /* Modify the header if it exceeds. */ if (amdgpu_bad_page_threshold != 0 && - control->ras_num_recs >= ras->bad_page_cnt_threshold) { + control->ras_num_bad_pages > ras->bad_page_cnt_threshold) { dev_warn(adev->dev, "Saved bad pages %d reaches threshold value %d\n", - control->ras_num_recs, ras->bad_page_cnt_threshold); + control->ras_num_bad_pages, ras->bad_page_cnt_threshold); control->tbl_hdr.header = RAS_TABLE_HDR_BAD; - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) { + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) { control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD; control->tbl_rai.health_percent = 0; } - if (amdgpu_bad_page_threshold != -1) + if ((amdgpu_bad_page_threshold != -1) && + (amdgpu_bad_page_threshold != -2)) ras->is_rma = true; /* ignore the -ENOTSUPP return value */ amdgpu_dpm_send_rma_reason(adev); } - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + RAS_TABLE_V2_1_INFO_SIZE + control->ras_num_recs * RAS_TABLE_RECORD_SIZE; @@ -797,10 +814,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) * now calculate gpu health percent */ if (amdgpu_bad_page_threshold != 0 && - control->tbl_hdr.version == RAS_TABLE_VER_V2_1 && - control->ras_num_recs < ras->bad_page_cnt_threshold) + control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 && + control->ras_num_bad_pages <= ras->bad_page_cnt_threshold) control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold - - control->ras_num_recs) * 100) / + control->ras_num_bad_pages) * 100) / ras->bad_page_cnt_threshold; /* Recalc the checksum. @@ -810,7 +827,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) csum += *pp; csum += __calc_hdr_byte_sum(control); - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) csum += __calc_ras_info_byte_sum(control); /* avoid sign extension when assigning to "checksum" */ csum = -csum; @@ -841,7 +858,8 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, const u32 num) { struct amdgpu_device *adev = to_amdgpu_device(control); - int res; + int res, i; + uint64_t nps = AMDGPU_NPS1_PARTITION_MODE; if (!__is_ras_eeprom_supported(adev)) return 0; @@ -855,6 +873,13 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, return -EINVAL; } + if (adev->gmc.gmc_funcs->query_mem_partition_mode) + nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); + + /* set the new channel index flag */ + for (i = 0; i < num; i++) + record[i].retired_page |= (nps << UMC_NPS_SHIFT); + mutex_lock(&control->ras_tbl_mutex); res = amdgpu_ras_eeprom_append_table(control, record, num); @@ -864,6 +889,11 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, amdgpu_ras_debugfs_set_ret_size(control); mutex_unlock(&control->ras_tbl_mutex); + + /* clear channel index flag, the flag is only saved on eeprom */ + for (i = 0; i < num; i++) + record[i].retired_page &= ~(nps << UMC_NPS_SHIFT); + return res; } @@ -1014,7 +1044,7 @@ uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *co /* get available eeprom table version first before eeprom table init */ amdgpu_ras_set_eeprom_table_version(control); - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) return RAS_MAX_RECORD_COUNT_V2_1; else return RAS_MAX_RECORD_COUNT; @@ -1259,7 +1289,7 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control int buf_size, res; u8 csum, *buf, *pp; - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) buf_size = RAS_TABLE_HEADER_SIZE + RAS_TABLE_V2_1_INFO_SIZE + control->ras_num_recs * RAS_TABLE_RECORD_SIZE; @@ -1362,7 +1392,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) __decode_table_header_from_buf(hdr, buf); - if (hdr->version == RAS_TABLE_VER_V2_1) { + if (hdr->version >= RAS_TABLE_VER_V2_1) { control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr); control->ras_record_offset = RAS_RECORD_START_V2_1; control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1; @@ -1373,11 +1403,36 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) } control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset); + control->ras_num_mca_recs = 0; + control->ras_num_pa_recs = 0; + return 0; +} + +int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) +{ + struct amdgpu_device *adev = to_amdgpu_device(control); + struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + int res; + + if (!__is_ras_eeprom_supported(adev)) + return 0; + + /* Verify i2c adapter is initialized */ + if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo) + return -ENOENT; + + if (!__get_eeprom_i2c_addr(adev, control)) + return -EINVAL; + + control->ras_num_bad_pages = control->ras_num_pa_recs + + control->ras_num_mca_recs * adev->umc.retire_unit; + if (hdr->header == RAS_TABLE_HDR_VAL) { DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records", - control->ras_num_recs); + control->ras_num_bad_pages); - if (hdr->version == RAS_TABLE_VER_V2_1) { + if (hdr->version >= RAS_TABLE_VER_V2_1) { res = __read_table_ras_info(control); if (res) return res; @@ -1385,28 +1440,32 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) res = __verify_ras_table_checksum(control); if (res) - DRM_ERROR("RAS table incorrect checksum or error:%d\n", - res); + dev_err(adev->dev, + "RAS table incorrect checksum or error:%d\n", + res); /* Warn if we are at 90% of the threshold or above */ - if (10 * control->ras_num_recs >= 9 * ras->bad_page_cnt_threshold) + if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold) dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d", - control->ras_num_recs, + control->ras_num_bad_pages, ras->bad_page_cnt_threshold); } else if (hdr->header == RAS_TABLE_HDR_BAD && amdgpu_bad_page_threshold != 0) { - if (hdr->version == RAS_TABLE_VER_V2_1) { + if (hdr->version >= RAS_TABLE_VER_V2_1) { res = __read_table_ras_info(control); if (res) return res; } res = __verify_ras_table_checksum(control); - if (res) - DRM_ERROR("RAS Table incorrect checksum or error:%d\n", - res); - if (ras->bad_page_cnt_threshold > control->ras_num_recs) { + if (res) { + dev_err(adev->dev, + "RAS Table incorrect checksum or error:%d\n", + res); + return -EINVAL; + } + if (ras->bad_page_cnt_threshold >= control->ras_num_bad_pages) { /* This means that, the threshold was increased since * the last time the system was booted, and now, * ras->bad_page_cnt_threshold - control->num_recs > 0, @@ -1416,22 +1475,23 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) dev_info(adev->dev, "records:%d threshold:%d, resetting " "RAS table header signature", - control->ras_num_recs, + control->ras_num_bad_pages, ras->bad_page_cnt_threshold); res = amdgpu_ras_eeprom_correct_header_tag(control, RAS_TABLE_HDR_VAL); } else { - dev_err(adev->dev, "RAS records:%d exceed threshold:%d", - control->ras_num_recs, ras->bad_page_cnt_threshold); - if (amdgpu_bad_page_threshold == -1) { - dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1."); + dev_warn(adev->dev, + "RAS records:%d exceed threshold:%d\n", + control->ras_num_bad_pages, ras->bad_page_cnt_threshold); + if ((amdgpu_bad_page_threshold == -1) || + (amdgpu_bad_page_threshold == -2)) { res = 0; + dev_warn(adev->dev, + "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n"); } else { ras->is_rma = true; - dev_err(adev->dev, - "RAS records:%d exceed threshold:%d, " - "GPU will not be initialized. Replace this GPU or increase the threshold", - control->ras_num_recs, ras->bad_page_cnt_threshold); + dev_warn(adev->dev, + "User defined threshold is set, runtime service will be halt when threshold is reached\n"); } } } else { |