Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--	drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c	441
1 file changed, 375 insertions, 66 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4c9fa24dd972..bed2603ae4c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -36,6 +36,7 @@
#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include "nbio_v4_3.h"
+#include "nbif_v6_3_1.h"
#include "nbio_v7_9.h"
#include "atom.h"
#include "amdgpu_reset.h"
@@ -192,7 +193,7 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
- err_data.err_addr_cnt);
+ err_data.err_addr_cnt, false);
amdgpu_ras_save_bad_pages(adev, NULL);
}
@@ -1863,6 +1864,9 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
if (!obj || obj->attr_inuse)
return -EINVAL;
+ if (amdgpu_sriov_vf(adev) && !amdgpu_virt_ras_telemetry_block_en(adev, head->block))
+ return 0;
+
get_obj(obj);
snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
@@ -2015,6 +2019,7 @@ static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev)
switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
ret = true;
break;
@@ -2156,6 +2161,16 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
/* Fatal error events are handled on host side */
if (amdgpu_sriov_vf(adev))
return;
+ /*
+ * If the current interrupt is caused by a non-fatal RAS error, skip
+ * the fatal error check. For fatal errors, the FED status of all devices
+ * in the XGMI hive is set when the first device receives the fatal error
+ * interrupt. The error is propagated to the other devices as well, so
+ * make sure to ack the interrupt regardless of FED status.
+ */
+ if (!amdgpu_ras_get_fed_status(adev) &&
+ amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY))
+ return;
if (adev->nbio.ras &&
adev->nbio.ras->handle_ras_controller_intr_no_bifring)
@@ -2185,6 +2200,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
if (ret)
return;
+ amdgpu_ras_set_err_poison(adev, block_obj->ras_comm.block);
/* both query_poison_status and handle_poison_consumption are optional,
* but at least one of them should be implemented if we need poison
* consumption handler
@@ -2717,41 +2733,224 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
return 0;
}
-/* it deal with vram only. */
-int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
- struct eeprom_table_record *bps, int pages)
+static int amdgpu_ras_mca2pa_by_idx(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps,
+ struct ras_err_data *err_data)
{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- struct ras_err_handler_data *data;
+ struct ta_ras_query_address_input addr_in;
+ uint32_t socket = 0;
int ret = 0;
- uint32_t i;
- if (!con || !con->eh_data || !bps || pages <= 0)
- return 0;
+ if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
+ socket = adev->smuio.funcs->get_socket_id(adev);
- mutex_lock(&con->recovery_lock);
- data = con->eh_data;
- if (!data)
- goto out;
+ /* reinit err_data */
+ err_data->err_addr_cnt = 0;
+ err_data->err_addr_len = adev->umc.retire_unit;
+
+ memset(&addr_in, 0, sizeof(addr_in));
+ addr_in.ma.err_addr = bps->address;
+ addr_in.ma.socket_id = socket;
+ addr_in.ma.ch_inst = bps->mem_channel;
+ /* tell RAS TA the node instance is not used */
+ addr_in.ma.node_inst = TA_RAS_INV_NODE;
+
+ if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
+ ret = adev->umc.ras->convert_ras_err_addr(adev, err_data,
+ &addr_in, NULL, false);
+
+ return ret;
+}
- for (i = 0; i < pages; i++) {
+static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps,
+ struct ras_err_data *err_data)
+{
+ struct ta_ras_query_address_input addr_in;
+ uint32_t die_id, socket = 0;
+
+ if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
+ socket = adev->smuio.funcs->get_socket_id(adev);
+
+ /* although the die id is obtained from the PA in NPS1 mode, the id is
+ * suitable for any NPS mode
+ */
+ if (adev->umc.ras && adev->umc.ras->get_die_id_from_pa)
+ die_id = adev->umc.ras->get_die_id_from_pa(adev, bps->address,
+ bps->retired_page << AMDGPU_GPU_PAGE_SHIFT);
+ else
+ return -EINVAL;
+
+ /* reinit err_data */
+ err_data->err_addr_cnt = 0;
+ err_data->err_addr_len = adev->umc.retire_unit;
+
+ memset(&addr_in, 0, sizeof(addr_in));
+ addr_in.ma.err_addr = bps->address;
+ addr_in.ma.ch_inst = bps->mem_channel;
+ addr_in.ma.umc_inst = bps->mcumc_id;
+ addr_in.ma.node_inst = die_id;
+ addr_in.ma.socket_id = socket;
+
+ if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
+ return adev->umc.ras->convert_ras_err_addr(adev, err_data,
+ &addr_in, NULL, false);
+ else
+ return -EINVAL;
+}
+
+static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, int count)
+{
+ int j;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_err_handler_data *data = con->eh_data;
+
+ for (j = 0; j < count; j++) {
if (amdgpu_ras_check_bad_page_unlock(con,
- bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
continue;
if (!data->space_left &&
- amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
- ret = -ENOMEM;
- goto out;
+ amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
+ return -ENOMEM;
}
- amdgpu_ras_reserve_page(adev, bps[i].retired_page);
+ amdgpu_ras_reserve_page(adev, bps[j].retired_page);
- memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
+ memcpy(&data->bps[data->count], &(bps[j]),
+ sizeof(struct eeprom_table_record));
data->count++;
data->space_left--;
}
+
+ return 0;
+}
+
+static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, struct ras_err_data *err_data,
+ enum amdgpu_memory_partition nps)
+{
+ int i = 0;
+ enum amdgpu_memory_partition save_nps;
+
+ save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
+
+ /* old ASICs just have the PA in EEPROM */
+ if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
+ memcpy(err_data->err_addr, bps,
+ sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
+ goto out;
+ }
+
+ for (i = 0; i < adev->umc.retire_unit; i++)
+ bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
+
+ if (save_nps) {
+ if (save_nps == nps) {
+ if (amdgpu_umc_pages_in_a_row(adev, err_data,
+ bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ return -EINVAL;
+ } else {
+ if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
+ return -EINVAL;
+ }
+ } else {
+ if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
+ if (nps == AMDGPU_NPS1_PARTITION_MODE)
+ memcpy(err_data->err_addr, bps,
+ sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
+ else
+ return -EOPNOTSUPP;
+ }
+ }
+
out:
+ return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit);
+}
+
+static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, struct ras_err_data *err_data,
+ enum amdgpu_memory_partition nps)
+{
+ enum amdgpu_memory_partition save_nps;
+
+ save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
+ bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
+
+ if (save_nps == nps) {
+ if (amdgpu_umc_pages_in_a_row(adev, err_data,
+ bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ return -EINVAL;
+ } else {
+ if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
+ return -EINVAL;
+ }
+ return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
+ adev->umc.retire_unit);
+}
+
+/* it deals with VRAM only. */
+int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, int pages, bool from_rom)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_err_data err_data;
+ struct amdgpu_ras_eeprom_control *control =
+ &adev->psp.ras_context.ras->eeprom_control;
+ enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
+ int ret = 0;
+ uint32_t i;
+
+ if (!con || !con->eh_data || !bps || pages <= 0)
+ return 0;
+
+ if (from_rom) {
+ err_data.err_addr =
+ kcalloc(adev->umc.retire_unit,
+ sizeof(struct eeprom_table_record), GFP_KERNEL);
+ if (!err_data.err_addr) {
+ dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
+ return -ENOMEM;
+ }
+
+ if (adev->gmc.gmc_funcs->query_mem_partition_mode)
+ nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
+ }
+
+ mutex_lock(&con->recovery_lock);
+
+ if (from_rom) {
+ for (i = 0; i < pages; i++) {
+ if (control->ras_num_recs - i >= adev->umc.retire_unit) {
+ if ((bps[i].address == bps[i + 1].address) &&
+ (bps[i].mem_channel == bps[i + 1].mem_channel)) {
+ /* deal with retire_unit records at a time */
+ ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
+ &bps[i], &err_data, nps);
+ if (ret)
+ goto free;
+ i += (adev->umc.retire_unit - 1);
+ } else {
+ break;
+ }
+ } else {
+ break;
+ }
+ }
+ for (; i < pages; i++) {
+ ret = __amdgpu_ras_convert_rec_from_rom(adev,
+ &bps[i], &err_data, nps);
+ if (ret)
+ goto free;
+ }
+ } else {
+ ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
+ }
+
+free:
+ if (from_rom)
+ kfree(err_data.err_addr);
mutex_unlock(&con->recovery_lock);
return ret;
@@ -2768,7 +2967,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
struct amdgpu_ras_eeprom_control *control;
- int save_count;
+ int save_count, unit_num, bad_page_num, i;
if (!con || !con->eh_data) {
if (new_cnt)
@@ -2780,19 +2979,32 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
mutex_lock(&con->recovery_lock);
control = &con->eeprom_control;
data = con->eh_data;
- save_count = data->count - control->ras_num_recs;
+ bad_page_num = control->ras_num_bad_pages;
+ save_count = data->count - bad_page_num;
mutex_unlock(&con->recovery_lock);
+ unit_num = save_count / adev->umc.retire_unit;
if (new_cnt)
- *new_cnt = save_count / adev->umc.retire_unit;
+ *new_cnt = unit_num;
/* only new entries are saved */
if (save_count > 0) {
- if (amdgpu_ras_eeprom_append(control,
- &data->bps[control->ras_num_recs],
- save_count)) {
- dev_err(adev->dev, "Failed to save EEPROM table data!");
- return -EIO;
+ /* old ASICs only save the PA to EEPROM as before */
+ if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
+ if (amdgpu_ras_eeprom_append(control,
+ &data->bps[bad_page_num], save_count)) {
+ dev_err(adev->dev, "Failed to save EEPROM table data!");
+ return -EIO;
+ }
+ } else {
+ for (i = 0; i < unit_num; i++) {
+ if (amdgpu_ras_eeprom_append(control,
+ &data->bps[bad_page_num +
+ i * adev->umc.retire_unit], 1)) {
+ dev_err(adev->dev, "Failed to save EEPROM table data!");
+ return -EIO;
+ }
+ }
}
dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
@@ -2810,7 +3022,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
struct amdgpu_ras_eeprom_control *control =
&adev->psp.ras_context.ras->eeprom_control;
struct eeprom_table_record *bps;
- int ret;
+ int ret, i = 0;
/* no bad page record, skip eeprom access */
if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
@@ -2821,11 +3033,42 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
return -ENOMEM;
ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
- if (ret)
+ if (ret) {
dev_err(adev->dev, "Failed to load EEPROM table records!");
- else
- ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);
+ } else {
+ if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
+ for (i = 0; i < control->ras_num_recs; i++) {
+ if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
+ if ((bps[i].address == bps[i + 1].address) &&
+ (bps[i].mem_channel == bps[i + 1].mem_channel)) {
+ control->ras_num_pa_recs += adev->umc.retire_unit;
+ i += (adev->umc.retire_unit - 1);
+ } else {
+ control->ras_num_mca_recs +=
+ (control->ras_num_recs - i);
+ break;
+ }
+ } else {
+ control->ras_num_mca_recs += (control->ras_num_recs - i);
+ break;
+ }
+ }
+ }
+
+ ret = amdgpu_ras_eeprom_check(control);
+ if (ret)
+ goto out;
+
+ /* HW not usable */
+ if (amdgpu_ras_is_rma(adev)) {
+ ret = -EHWPOISON;
+ goto out;
+ }
+
+ ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
+ }
+out:
kfree(bps);
return ret;
}
@@ -2870,31 +3113,29 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
/*
- * Justification of value bad_page_cnt_threshold in ras structure
- *
- * Generally, 0 <= amdgpu_bad_page_threshold <= max record length
- * in eeprom or amdgpu_bad_page_threshold == -2, introduce two
- * scenarios accordingly.
- *
- * Bad page retirement enablement:
- * - If amdgpu_bad_page_threshold = -2,
- * bad_page_cnt_threshold = typical value by formula.
- *
- * - When the value from user is 0 < amdgpu_bad_page_threshold <
- * max record length in eeprom, use it directly.
- *
- * Bad page retirement disablement:
- * - If amdgpu_bad_page_threshold = 0, bad page retirement
- * functionality is disabled, and bad_page_cnt_threshold will
- * take no effect.
+ * amdgpu_bad_page_threshold is used to configure
+ * the threshold for the number of bad pages.
+ * -1: Threshold is set to default value
+ * Driver will issue a warning message when threshold is reached
+ * and continue runtime services.
+ * 0: Disable bad page retirement
+ * Driver will not retire bad pages,
+ * which is intended for debugging purposes.
+ * -2: Threshold is determined by a formula
+ * that assumes 1 bad page per 100M of local memory.
+ * Driver will continue runtime services when threshold is reached.
+ * 0 < threshold < max number of bad page records in EEPROM:
+ * A user-defined threshold is set.
+ * Driver will halt runtime services when this custom threshold is reached.
*/
-
- if (amdgpu_bad_page_threshold < 0) {
+ if (amdgpu_bad_page_threshold == -2) {
u64 val = adev->gmc.mc_vram_size;
do_div(val, RAS_BAD_PAGE_COVER);
con->bad_page_cnt_threshold = min(lower_32_bits(val),
max_count);
+ } else if (amdgpu_bad_page_threshold == -1) {
+ con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4;
} else {
con->bad_page_cnt_threshold = min_t(int, max_count,
amdgpu_bad_page_threshold);
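
As a rough illustration of the -2 case documented above (not part of the patch): with coverage of one bad page per 100M of local memory, a device with 64 GiB of VRAM ends up with bad_page_cnt_threshold of about 68719476736 / 104857600 ≈ 655, clamped to max_count; the exact divisor is whatever RAS_BAD_PAGE_COVER evaluates to.
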
@@ -3205,33 +3446,40 @@ static int amdgpu_ras_page_retirement_thread(void *param)
int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct amdgpu_ras_eeprom_control *control;
int ret;
if (!con || amdgpu_sriov_vf(adev))
return 0;
- ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
-
+ control = &con->eeprom_control;
+ ret = amdgpu_ras_eeprom_init(control);
if (ret)
return ret;
- /* HW not usable */
- if (amdgpu_ras_is_rma(adev))
- return -EHWPOISON;
+ if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
+ control->ras_num_pa_recs = control->ras_num_recs;
- if (con->eeprom_control.ras_num_recs) {
+ if (control->ras_num_recs) {
ret = amdgpu_ras_load_bad_pages(adev);
if (ret)
return ret;
amdgpu_dpm_send_hbm_bad_pages_num(
- adev, con->eeprom_control.ras_num_recs);
+ adev, control->ras_num_bad_pages);
if (con->update_channel_flag == true) {
amdgpu_dpm_send_hbm_bad_channel_flag(
- adev, con->eeprom_control.bad_channel_bitmap);
+ adev, control->bad_channel_bitmap);
con->update_channel_flag = false;
}
+
+ /* The format action is only applied to new ASICs */
+ if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 &&
+ control->tbl_hdr.version < RAS_TABLE_VER_V3)
+ if (!amdgpu_ras_eeprom_reset_table(control))
+ if (amdgpu_ras_save_bad_pages(adev, NULL))
+ dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
}
return ret;
@@ -3366,6 +3614,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
case IP_VERSION(13, 0, 2):
case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
return true;
default:
@@ -3378,7 +3627,9 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 0):
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 10):
+ case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
+ case IP_VERSION(14, 0, 3):
return true;
default:
return false;
@@ -3541,8 +3792,11 @@ init_ras_enabled_flag:
adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
adev->ras_hw_enabled & amdgpu_ras_mask;
- /* aca is disabled by default */
- adev->aca.is_enabled = false;
+ /* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14 */
+ adev->aca.is_enabled =
+ (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) ||
+ amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) ||
+ amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14));
/* bad page feature is not applicable to specific app platform */
if (adev->gmc.is_app_apu &&
@@ -3629,8 +3883,11 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
case IP_VERSION(13, 0, 2):
case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 12):
+ con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT;
+ break;
case IP_VERSION(13, 0, 14):
- con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
+ con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1);
break;
default:
break;
@@ -3704,7 +3961,19 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
* check DF RAS */
adev->nbio.ras = &nbio_v4_3_ras;
break;
+ case IP_VERSION(6, 3, 1):
+ if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
+ /* unlike other generations of nbio ras,
+ * nbif v6_3_1 only supports the fatal error
+ * interrupt to inform software that DF is
+ * frozen due to a system fatal error event.
+ * The driver should not enable nbio ras in
+ * such a case. Instead, check DF RAS
+ */
+ adev->nbio.ras = &nbif_v6_3_1_ras;
+ break;
case IP_VERSION(7, 9, 0):
+ case IP_VERSION(7, 9, 1):
if (!adev->gmc.is_app_apu)
adev->nbio.ras = &nbio_v7_9_ras;
break;
@@ -4083,7 +4352,7 @@ bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev)
if (!ras)
return false;
- return atomic_read(&ras->fed);
+ return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
}
void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
@@ -4091,8 +4360,48 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
struct amdgpu_ras *ras;
ras = amdgpu_ras_get_context(adev);
+ if (ras) {
+ if (status)
+ set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
+ else
+ clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
+ }
+}
+
+void amdgpu_ras_clear_err_state(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *ras;
+
+ ras = amdgpu_ras_get_context(adev);
+ if (ras)
+ ras->ras_err_state = 0;
+}
+
+void amdgpu_ras_set_err_poison(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block)
+{
+ struct amdgpu_ras *ras;
+
+ ras = amdgpu_ras_get_context(adev);
if (ras)
- atomic_set(&ras->fed, !!status);
+ set_bit(block, &ras->ras_err_state);
+}
+
+bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block)
+{
+ struct amdgpu_ras *ras;
+
+ ras = amdgpu_ras_get_context(adev);
+ if (ras) {
+ if (block == AMDGPU_RAS_BLOCK__ANY)
+ return (ras->ras_err_state != 0);
+ else
+ return test_bit(block, &ras->ras_err_state) ||
+ test_bit(AMDGPU_RAS_BLOCK__LAST,
+ &ras->ras_err_state);
+ }
+
+ return false;
}
static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev)
@@ -4856,9 +5165,9 @@ static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
"socket: %d, aid: %d, fw_status: 0x%x, data abort exception\n",
socket_id, aid_id, fw_status);
- if (AMDGPU_RAS_GPU_ERR_UNKNOWN(boot_error))
+ if (AMDGPU_RAS_GPU_ERR_GENERIC(boot_error))
dev_info(adev->dev,
- "socket: %d, aid: %d, fw_status: 0x%x, unknown boot time errors\n",
+ "socket: %d, aid: %d, fw_status: 0x%x, Boot Controller Generic Error\n",
socket_id, aid_id, fw_status);
}