summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c1182
1 files changed, 879 insertions, 303 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f0924aa3f4e4..fc9f3adf9912 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -21,6 +21,7 @@
*
*
*/
+#include "amdgpu_reg_access.h"
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
@@ -41,6 +42,8 @@
#include "atom.h"
#include "amdgpu_reset.h"
#include "amdgpu_psp.h"
+#include "amdgpu_ras_mgr.h"
+#include "amdgpu_virt_ras_cmd.h"
#ifdef CONFIG_X86_MCE_AMD
#include <asm/mce.h>
@@ -77,6 +80,7 @@ const char *ras_block_string[] = {
"jpeg",
"ih",
"mpio",
+ "mmsch",
};
const char *ras_mca_block_string[] = {
@@ -121,12 +125,15 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
-#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC 10
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
#define MAX_FLUSH_RETIRE_DWORK_TIMES 100
+#define BYPASS_ALLOCATED_ADDRESS 0x0
+#define BYPASS_INITIALIZATION_ADDRESS 0x1
+
enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -135,12 +142,18 @@ enum amdgpu_ras_retire_page_reservation {
atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
-static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
uint64_t addr);
-static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
uint64_t addr);
+
+static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev);
+static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev);
+
#ifdef CONFIG_X86_MCE_AMD
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
+static void
+amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device *adev);
struct mce_notifier_adev_list {
struct amdgpu_device *devs[MAX_GPU_INSTANCE];
int num_gpu;
@@ -168,18 +181,16 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
struct eeprom_table_record err_rec;
int ret;
- if ((address >= adev->gmc.mc_vram_size) ||
- (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+ ret = amdgpu_ras_check_bad_page(adev, address);
+ if (ret == -EINVAL) {
dev_warn(adev->dev,
- "RAS WARN: input address 0x%llx is invalid.\n",
- address);
+ "RAS WARN: input address 0x%llx is invalid.\n",
+ address);
return -EINVAL;
- }
-
- if (amdgpu_ras_check_bad_page(adev, address)) {
+ } else if (ret == 1) {
dev_warn(adev->dev,
- "RAS WARN: 0x%llx has already been marked as bad page!\n",
- address);
+ "RAS WARN: 0x%llx has already been marked as bad page!\n",
+ address);
return 0;
}
@@ -206,6 +217,72 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
return 0;
}
+static int amdgpu_check_address_validity(struct amdgpu_device *adev,
+ uint64_t address, uint64_t flags)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct amdgpu_vram_block_info blk_info;
+ uint64_t page_pfns[32] = {0};
+ int i, ret, count;
+ bool hit = false;
+
+ if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0))
+ return 0;
+
+ if (amdgpu_sriov_vf(adev)) {
+ if (amdgpu_uniras_enabled(adev)) {
+ if (amdgpu_virt_ras_check_address_validity(adev, address, &hit))
+ return -EPERM;
+ if (hit)
+ return -EACCES;
+ } else {
+ if (amdgpu_virt_check_vf_critical_region(adev, address, &hit))
+ return -EPERM;
+ return hit ? -EACCES : 0;
+ }
+ }
+
+ if ((address >= adev->gmc.mc_vram_size) ||
+ (address >= RAS_UMC_INJECT_ADDR_LIMIT))
+ return -EFAULT;
+
+ if (amdgpu_uniras_enabled(adev)) {
+ if (amdgpu_sriov_vf(adev))
+ count = amdgpu_virt_ras_convert_retired_address(adev, address,
+ page_pfns, ARRAY_SIZE(page_pfns));
+ else
+ count = amdgpu_ras_mgr_lookup_bad_pages_in_a_row(adev, address,
+ page_pfns, ARRAY_SIZE(page_pfns));
+ } else
+ count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
+ address, page_pfns, ARRAY_SIZE(page_pfns));
+
+ if (count <= 0)
+ return -EPERM;
+
+ for (i = 0; i < count; i++) {
+ memset(&blk_info, 0, sizeof(blk_info));
+ ret = amdgpu_vram_mgr_query_address_block_info(&adev->mman.vram_mgr,
+ page_pfns[i] << AMDGPU_GPU_PAGE_SHIFT, &blk_info);
+ if (!ret) {
+ /* The input address that needs to be checked is allocated by
+ * current calling process, so it is necessary to exclude
+ * the calling process.
+ */
+ if ((flags == BYPASS_ALLOCATED_ADDRESS) &&
+ ((blk_info.task.pid != task_pid_nr(current)) ||
+ strncmp(blk_info.task.comm, current->comm, TASK_COMM_LEN)))
+ return -EACCES;
+ else if ((flags == BYPASS_INITIALIZATION_ADDRESS) &&
+ (blk_info.task.pid == con->init_task_pid) &&
+ !strncmp(blk_info.task.comm, con->init_task_comm, TASK_COMM_LEN))
+ return -EACCES;
+ }
+ }
+
+ return 0;
+}
+
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
size_t size, loff_t *pos)
{
@@ -296,6 +373,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
op = 2;
else if (strstr(str, "retire_page") != NULL)
op = 3;
+ else if (strstr(str, "check_address") != NULL)
+ op = 4;
else if (str[0] && str[1] && str[2] && str[3])
/* ascii string, but commands are not matched. */
return -EINVAL;
@@ -310,6 +389,15 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
data->inject.address = address;
return 0;
+ } else if (op == 4) {
+ if (sscanf(str, "%*s 0x%llx 0x%llx", &address, &value) != 2 &&
+ sscanf(str, "%*s %llu %llu", &address, &value) != 2)
+ return -EINVAL;
+
+ data->op = op;
+ data->inject.address = address;
+ data->inject.value = value;
+ return 0;
}
if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
@@ -499,6 +587,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
return size;
else
return ret;
+ } else if (data.op == 4) {
+ ret = amdgpu_check_address_validity(adev, data.inject.address, data.inject.value);
+ return ret ? ret : size;
}
if (!amdgpu_ras_is_supported(adev, data.head.block))
@@ -512,22 +603,16 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
break;
case 2:
- if ((data.inject.address >= adev->gmc.mc_vram_size &&
- adev->gmc.mc_vram_size) ||
- (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
- dev_warn(adev->dev, "RAS WARN: input address "
- "0x%llx is invalid.",
+ /* umc ce/ue error injection for a bad page is not allowed */
+ if (data.head.block == AMDGPU_RAS_BLOCK__UMC)
+ ret = amdgpu_ras_check_bad_page(adev, data.inject.address);
+ if (ret == -EINVAL) {
+ dev_warn(adev->dev, "RAS WARN: input address 0x%llx is invalid.",
data.inject.address);
- ret = -EINVAL;
break;
- }
-
- /* umc ce/ue error injection for a bad page is not allowed */
- if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
- amdgpu_ras_check_bad_page(adev, data.inject.address)) {
- dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
- "already been marked as bad!\n",
- data.inject.address);
+ } else if (ret == 1) {
+ dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has already been marked as bad!\n",
+ data.inject.address);
break;
}
@@ -547,6 +632,8 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
return size;
}
+static int amdgpu_uniras_clear_badpages_info(struct amdgpu_device *adev);
+
/**
* DOC: AMDGPU RAS debugfs EEPROM table reset interface
*
@@ -571,6 +658,11 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
(struct amdgpu_device *)file_inode(f)->i_private;
int ret;
+ if (amdgpu_uniras_enabled(adev)) {
+ ret = amdgpu_uniras_clear_badpages_info(adev);
+ return ret ? ret : size;
+ }
+
ret = amdgpu_ras_eeprom_reset_table(
&(amdgpu_ras_get_context(adev)->eeprom_control));
@@ -812,7 +904,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
if (head->block == AMDGPU_RAS_BLOCK__GFX &&
!amdgpu_sriov_vf(adev) &&
!amdgpu_ras_intr_triggered()) {
- info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
+ info = kzalloc_obj(union ta_ras_cmd_input);
if (!info)
return -ENOMEM;
@@ -1106,6 +1198,9 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
err_info->de_count, blk_name);
}
} else {
+ if (adev->debug_disable_ce_logs)
+ return;
+
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
@@ -1475,9 +1570,51 @@ out_fini_err_data:
return ret;
}
+static int amdgpu_uniras_clear_badpages_info(struct amdgpu_device *adev)
+{
+ struct ras_cmd_dev_handle req = {0};
+ int ret;
+
+ ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__CLEAR_BAD_PAGE_INFO,
+ &req, sizeof(req), NULL, 0);
+ if (ret) {
+ dev_err(adev->dev, "Failed to clear bad pages info, ret: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int amdgpu_uniras_query_block_ecc(struct amdgpu_device *adev,
+ struct ras_query_if *info)
+{
+ struct ras_cmd_block_ecc_info_req req = {0};
+ struct ras_cmd_block_ecc_info_rsp rsp = {0};
+ int ret;
+
+ if (!info)
+ return -EINVAL;
+
+ req.block_id = info->head.block;
+ req.subblock_id = info->head.sub_block_index;
+
+ ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BLOCK_ECC_STATUS,
+ &req, sizeof(req), &rsp, sizeof(rsp));
+ if (!ret) {
+ info->ce_count = rsp.ce_count;
+ info->ue_count = rsp.ue_count;
+ info->de_count = rsp.de_count;
+ }
+
+ return ret;
+}
+
int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
{
- return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID);
+ if (amdgpu_uniras_enabled(adev))
+ return amdgpu_uniras_query_block_ecc(adev, info);
+ else
+ return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID);
}
int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
@@ -1497,6 +1634,9 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
!amdgpu_ras_get_aca_debug_mode(adev))
return -EOPNOTSUPP;
+ if (amdgpu_sriov_vf(adev))
+ return -EOPNOTSUPP;
+
/* skip ras error reset in gpu reset */
if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) &&
((smu_funcs && smu_funcs->set_debug_mode) ||
@@ -1526,6 +1666,27 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
return 0;
}
+static int amdgpu_uniras_error_inject(struct amdgpu_device *adev,
+ struct ras_inject_if *info)
+{
+ struct ras_cmd_inject_error_req inject_req;
+ struct ras_cmd_inject_error_rsp rsp;
+
+ if (!info)
+ return -EINVAL;
+
+ memset(&inject_req, 0, sizeof(inject_req));
+ inject_req.block_id = info->head.block;
+ inject_req.subblock_id = info->head.sub_block_index;
+ inject_req.address = info->address;
+ inject_req.error_type = info->head.type;
+ inject_req.instance_mask = info->instance_mask;
+ inject_req.method = info->value;
+
+ return amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__INJECT_ERROR,
+ &inject_req, sizeof(inject_req), &rsp, sizeof(rsp));
+}
+
/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
struct ras_inject_if *info)
@@ -1543,6 +1704,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
info->head.block,
info->head.sub_block_index);
+ if (amdgpu_uniras_enabled(adev))
+ return amdgpu_uniras_error_inject(adev, info);
+
/* inject on guest isn't allowed, return success directly */
if (amdgpu_sriov_vf(adev))
return 0;
@@ -1687,7 +1851,9 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
/* sysfs begin */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
- struct ras_badpage **bps, unsigned int *count);
+ struct ras_badpage *bps, uint32_t count, uint32_t start);
+static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev,
+ struct ras_badpage *bps, uint32_t count, uint32_t start);
static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
@@ -1733,7 +1899,7 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
*/
static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
- struct kobject *kobj, struct bin_attribute *attr,
+ struct kobject *kobj, const struct bin_attribute *attr,
char *buf, loff_t ppos, size_t count)
{
struct amdgpu_ras *con =
@@ -1745,19 +1911,52 @@ static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
unsigned int end = div64_ul(ppos + count - 1, element_size);
ssize_t s = 0;
struct ras_badpage *bps = NULL;
- unsigned int bps_count = 0;
+ int bps_count = 0, i, status;
+ uint64_t address;
memset(buf, 0, count);
- if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
+ bps_count = end - start;
+ bps = kmalloc_objs(*bps, bps_count);
+ if (!bps)
return 0;
- for (; start < end && start < bps_count; start++)
+ memset(bps, 0, sizeof(*bps) * bps_count);
+
+ if (amdgpu_uniras_enabled(adev))
+ bps_count = amdgpu_uniras_badpages_read(adev, bps, bps_count, start);
+ else
+ bps_count = amdgpu_ras_badpages_read(adev, bps, bps_count, start);
+
+ if (bps_count <= 0) {
+ kfree(bps);
+ return 0;
+ }
+
+ for (i = 0; i < bps_count; i++) {
+ address = ((uint64_t)bps[i].bp) << AMDGPU_GPU_PAGE_SHIFT;
+
+ bps[i].size = AMDGPU_GPU_PAGE_SIZE;
+
+ status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
+ address);
+ if (status == -EBUSY)
+ bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
+ else if (status == -ENOENT)
+ bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
+ else
+ bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED;
+
+ if ((bps[i].flags != AMDGPU_RAS_RETIRE_PAGE_RESERVED) &&
+ amdgpu_ras_check_critical_address(adev, address))
+ bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED;
+
s += scnprintf(&buf[s], element_size + 1,
"0x%08x : 0x%08x : %1s\n",
- bps[start].bp,
- bps[start].size,
- amdgpu_ras_badpage_flags_str(bps[start].flags));
+ bps[i].bp,
+ bps[i].size,
+ amdgpu_ras_badpage_flags_str(bps[i].flags));
+ }
kfree(bps);
@@ -1773,12 +1972,42 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
return sysfs_emit(buf, "feature mask: 0x%x\n", con->features);
}
+static bool amdgpu_ras_get_version_info(struct amdgpu_device *adev, u32 *major,
+ u32 *minor, u32 *rev)
+{
+ int i;
+
+ if (!adev || !major || !minor || !rev || !amdgpu_uniras_enabled(adev))
+ return false;
+
+ for (i = 0; i < adev->num_ip_blocks; i++) {
+ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_RAS) {
+ *major = adev->ip_blocks[i].version->major;
+ *minor = adev->ip_blocks[i].version->minor;
+ *rev = adev->ip_blocks[i].version->rev;
+ return true;
+ }
+ }
+
+ return false;
+}
+
static ssize_t amdgpu_ras_sysfs_version_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct amdgpu_ras *con =
container_of(attr, struct amdgpu_ras, version_attr);
- return sysfs_emit(buf, "table version: 0x%x\n", con->eeprom_control.tbl_hdr.version);
+ u32 major, minor, rev;
+ ssize_t size = 0;
+
+ size += sysfs_emit_at(buf, size, "table version: 0x%x\n",
+ con->eeprom_control.tbl_hdr.version);
+
+ if (amdgpu_ras_get_version_info(con->adev, &major, &minor, &rev))
+ size += sysfs_emit_at(buf, size, "ras version: %u.%u.%u\n",
+ major, minor, rev);
+
+ return size;
}
static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev,
@@ -1864,6 +2093,9 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
if (!obj || obj->attr_inuse)
return -EINVAL;
+ if (amdgpu_sriov_vf(adev) && !amdgpu_virt_ras_telemetry_block_en(adev, head->block))
+ return 0;
+
get_obj(obj);
snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
@@ -2018,6 +2250,7 @@ static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
+ case IP_VERSION(13, 0, 15):
ret = true;
break;
default:
@@ -2047,7 +2280,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
list_for_each_entry(obj, &con->head, node) {
if (amdgpu_ras_is_supported(adev, obj->head.block) &&
(obj->attr_inuse == 1)) {
- sprintf(fs_info.debugfs_name, "%s_err_inject",
+ snprintf(fs_info.debugfs_name, sizeof(fs_info.debugfs_name),
+ "%s_err_inject",
get_ras_block_str(&obj->head));
fs_info.head = obj->head;
amdgpu_ras_debugfs_create(adev, &fs_info, dir);
@@ -2065,8 +2299,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
/* debugfs end */
/* ras fs */
-static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
- amdgpu_ras_sysfs_badpages_read, NULL, 0);
+static const BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
+ amdgpu_ras_sysfs_badpages_read, NULL, 0);
static DEVICE_ATTR(features, S_IRUGO,
amdgpu_ras_sysfs_features_read, NULL);
static DEVICE_ATTR(version, 0444,
@@ -2088,7 +2322,7 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
&con->event_state_attr.attr,
NULL
};
- struct bin_attribute *bin_attrs[] = {
+ const struct bin_attribute *bin_attrs[] = {
NULL,
NULL,
};
@@ -2114,11 +2348,10 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
if (amdgpu_bad_page_threshold != 0) {
/* add bad_page_features entry */
- bin_attr_gpu_vram_bad_pages.private = NULL;
con->badpages_attr = bin_attr_gpu_vram_bad_pages;
+ sysfs_bin_attr_init(&con->badpages_attr);
bin_attrs[0] = &con->badpages_attr;
group.bin_attrs = bin_attrs;
- sysfs_bin_attr_init(bin_attrs[0]);
}
r = sysfs_create_group(&adev->dev->kobj, &group);
@@ -2158,7 +2391,7 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
/* Fatal error events are handled on host side */
if (amdgpu_sriov_vf(adev))
return;
- /**
+ /*
* If the current interrupt is caused by a non-fatal RAS error, skip
* check for fatal error. For fatal errors, FED status of all devices
* in XGMI hive gets set when the first device gets fatal error
@@ -2169,6 +2402,11 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY))
return;
+ if (amdgpu_uniras_enabled(adev)) {
+ amdgpu_ras_mgr_handle_fatal_interrupt(adev, NULL);
+ return;
+ }
+
if (adev->nbio.ras &&
adev->nbio.ras->handle_ras_controller_intr_no_bifring)
adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
@@ -2339,6 +2577,16 @@ int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
struct ras_manager *obj;
struct ras_ih_data *data;
+ if (amdgpu_uniras_enabled(adev)) {
+ struct ras_ih_info ih_info;
+
+ memset(&ih_info, 0, sizeof(ih_info));
+ ih_info.block = info->head.block;
+ memcpy(&ih_info.iv_entry, info->entry, sizeof(struct amdgpu_iv_entry));
+
+ return amdgpu_ras_mgr_handle_controller_interrupt(adev, &ih_info);
+ }
+
obj = amdgpu_ras_find_obj(adev, &info->head);
if (!obj)
return -EINVAL;
@@ -2533,54 +2781,87 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
}
}
-/* recovery begin */
-
-/* return 0 on success.
- * caller need free bps.
- */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
- struct ras_badpage **bps, unsigned int *count)
+ struct ras_badpage *bps, uint32_t count, uint32_t start)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
- int i = 0;
- int ret = 0, status;
+ int r = 0;
+ uint32_t i;
if (!con || !con->eh_data || !bps || !count)
return -EINVAL;
mutex_lock(&con->recovery_lock);
data = con->eh_data;
- if (!data || data->count == 0) {
- *bps = NULL;
- ret = -EINVAL;
- goto out;
- }
+ if (start < data->count) {
+ for (i = start; i < data->count; i++) {
+ if (!data->bps[i].ts)
+ continue;
- *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
- if (!*bps) {
- ret = -ENOMEM;
- goto out;
+ /* U64_MAX is used to mark the record as invalid */
+ if (data->bps[i].retired_page == U64_MAX)
+ continue;
+
+ bps[r].bp = data->bps[i].retired_page;
+ r++;
+ if (r >= count)
+ break;
+ }
}
+ mutex_unlock(&con->recovery_lock);
- for (; i < data->count; i++) {
- (*bps)[i] = (struct ras_badpage){
- .bp = data->bps[i].retired_page,
- .size = AMDGPU_GPU_PAGE_SIZE,
- .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
- };
- status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
- data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT);
- if (status == -EBUSY)
- (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
- else if (status == -ENOENT)
- (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
+ return r;
+}
+
+static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev,
+ struct ras_badpage *bps, uint32_t count, uint32_t start)
+{
+ struct ras_cmd_bad_pages_info_req cmd_input;
+ struct ras_cmd_bad_pages_info_rsp *output;
+ uint32_t group, start_group, end_group;
+ uint32_t pos, pos_in_group;
+ int r = 0, i;
+
+ if (!bps || !count)
+ return -EINVAL;
+
+ output = kmalloc_obj(*output);
+ if (!output)
+ return -ENOMEM;
+
+ memset(&cmd_input, 0, sizeof(cmd_input));
+
+ start_group = start / RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
+ end_group = (start + count + RAS_CMD_MAX_BAD_PAGES_PER_GROUP - 1) /
+ RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
+
+ pos = start;
+ for (group = start_group; group < end_group; group++) {
+ memset(output, 0, sizeof(*output));
+ cmd_input.group_index = group;
+ if (amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BAD_PAGES,
+ &cmd_input, sizeof(cmd_input), output, sizeof(*output)))
+ goto out;
+
+ if (pos >= output->bp_total_cnt)
+ goto out;
+
+ pos_in_group = pos - group * RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
+ for (i = pos_in_group; i < output->bp_in_group; i++, pos++) {
+ if (!output->records[i].ts)
+ continue;
+
+ bps[r].bp = output->records[i].retired_page;
+ r++;
+ if (r >= count)
+ goto out;
+ }
}
- *count = data->count;
out:
- mutex_unlock(&con->recovery_lock);
- return ret;
+ kfree(output);
+ return r;
}
static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
@@ -2629,6 +2910,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
struct amdgpu_device *adev = ras->adev;
struct list_head device_list, *device_list_handle = NULL;
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+ unsigned int error_query_mode;
enum ras_event_type type;
if (hive) {
@@ -2657,11 +2939,22 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
device_list_handle = &device_list;
}
+ if (amdgpu_ras_get_error_query_mode(adev, &error_query_mode)) {
+ if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY) {
+ /* wait 500ms to ensure pmfw polling mca bank info done */
+ msleep(500);
+ }
+ }
+
type = amdgpu_ras_get_fatal_error_event(adev);
list_for_each_entry(remote_adev,
device_list_handle, gmc.xgmi.head) {
- amdgpu_ras_query_err_status(remote_adev);
- amdgpu_ras_log_on_err_counter(remote_adev, type);
+ if (amdgpu_uniras_enabled(remote_adev)) {
+ amdgpu_ras_mgr_update_ras_ecc(remote_adev);
+ } else {
+ amdgpu_ras_query_err_status(remote_adev);
+ amdgpu_ras_log_on_err_counter(remote_adev, type);
+ }
}
}
@@ -2713,7 +3006,7 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
unsigned int old_space = data->count + data->space_left;
unsigned int new_space = old_space + pages;
unsigned int align_space = ALIGN(new_space, 512);
- void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
+ void *bps = kmalloc_objs(*data->bps, align_space);
if (!bps) {
return -ENOMEM;
@@ -2749,8 +3042,13 @@ static int amdgpu_ras_mca2pa_by_idx(struct amdgpu_device *adev,
addr_in.ma.err_addr = bps->address;
addr_in.ma.socket_id = socket;
addr_in.ma.ch_inst = bps->mem_channel;
- /* tell RAS TA the node instance is not used */
- addr_in.ma.node_inst = TA_RAS_INV_NODE;
+ if (!amdgpu_ras_smu_eeprom_supported(adev)) {
+ /* tell RAS TA the node instance is not used */
+ addr_in.ma.node_inst = TA_RAS_INV_NODE;
+ } else {
+ addr_in.ma.umc_inst = bps->mcumc_id;
+ addr_in.ma.node_inst = bps->cu;
+ }
if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
ret = adev->umc.ras->convert_ras_err_addr(adev, err_data,
@@ -2796,138 +3094,217 @@ static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
return -EINVAL;
}
+static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, int count)
+{
+ int j;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_err_handler_data *data = con->eh_data;
+
+ for (j = 0; j < count; j++) {
+ if (!data->space_left &&
+ amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
+ return -ENOMEM;
+ }
+
+ if (amdgpu_ras_check_bad_page_unlock(con,
+ bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
+ /* set to U64_MAX to mark it as invalid */
+ data->bps[data->count].retired_page = U64_MAX;
+ data->count++;
+ data->space_left--;
+ continue;
+ }
+
+ amdgpu_ras_reserve_page(adev, bps[j].retired_page);
+
+ memcpy(&data->bps[data->count], &(bps[j]),
+ sizeof(struct eeprom_table_record));
+ data->count++;
+ data->space_left--;
+ con->bad_page_num++;
+ }
+
+ return 0;
+}
+
+static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, struct ras_err_data *err_data,
+ enum amdgpu_memory_partition nps)
+{
+ int i = 0;
+ uint64_t chan_idx_v2;
+ enum amdgpu_memory_partition save_nps;
+
+ save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
+ chan_idx_v2 = bps[0].retired_page & UMC_CHANNEL_IDX_V2;
+
+ /*old asics just have pa in eeprom*/
+ if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
+ memcpy(err_data->err_addr, bps,
+ sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
+ goto out;
+ }
+
+ for (i = 0; i < adev->umc.retire_unit; i++)
+ bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
+
+ if (save_nps || chan_idx_v2) {
+ if (save_nps == nps) {
+ if (amdgpu_umc_pages_in_a_row(adev, err_data,
+ bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ return -EINVAL;
+ for (i = 0; i < adev->umc.retire_unit; i++) {
+ err_data->err_addr[i].address = bps[0].address;
+ err_data->err_addr[i].mem_channel = bps[0].mem_channel;
+ err_data->err_addr[i].bank = bps[0].bank;
+ err_data->err_addr[i].err_type = bps[0].err_type;
+ err_data->err_addr[i].mcumc_id = bps[0].mcumc_id;
+ }
+ } else {
+ if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
+ return -EINVAL;
+ }
+ } else {
+ if (bps[0].address == 0) {
+ /* for specific old eeprom data, mca address is not stored,
+ * calc it from pa
+ */
+ if (amdgpu_umc_pa2mca(adev, bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT,
+ &(bps[0].address), AMDGPU_NPS1_PARTITION_MODE))
+ return -EINVAL;
+ }
+
+ if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
+ if (nps == AMDGPU_NPS1_PARTITION_MODE)
+ memcpy(err_data->err_addr, bps,
+ sizeof(struct eeprom_table_record) * adev->umc.retire_unit);
+ else
+ return -EOPNOTSUPP;
+ }
+ }
+
+out:
+ return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit);
+}
+
+static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps, struct ras_err_data *err_data,
+ enum amdgpu_memory_partition nps)
+{
+ int i = 0;
+ uint64_t chan_idx_v2;
+ enum amdgpu_memory_partition save_nps;
+
+ if (!amdgpu_ras_smu_eeprom_supported(adev)) {
+ save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
+ chan_idx_v2 = bps->retired_page & UMC_CHANNEL_IDX_V2;
+ bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
+ } else {
+ /* if pmfw manages eeprom, save_nps is not stored on eeprom,
+ * we should always convert mca address into physical address,
+ * make save_nps different from nps
+ */
+ save_nps = nps + 1;
+ }
+
+ if (save_nps == nps) {
+ if (amdgpu_umc_pages_in_a_row(adev, err_data,
+ bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ return -EINVAL;
+ for (i = 0; i < adev->umc.retire_unit; i++) {
+ err_data->err_addr[i].address = bps->address;
+ err_data->err_addr[i].mem_channel = bps->mem_channel;
+ err_data->err_addr[i].bank = bps->bank;
+ err_data->err_addr[i].err_type = bps->err_type;
+ err_data->err_addr[i].mcumc_id = bps->mcumc_id;
+ }
+ } else {
+ if (save_nps || chan_idx_v2) {
+ if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
+ return -EINVAL;
+ } else {
+ /* for specific old eeprom data, mca address is not stored,
+ * calc it from pa
+ */
+ if (bps->address == 0)
+ if (amdgpu_umc_pa2mca(adev,
+ bps->retired_page << AMDGPU_GPU_PAGE_SHIFT,
+ &(bps->address),
+ AMDGPU_NPS1_PARTITION_MODE))
+ return -EINVAL;
+
+ if (amdgpu_ras_mca2pa(adev, bps, err_data))
+ return -EOPNOTSUPP;
+ }
+ }
+
+ return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
+ adev->umc.retire_unit);
+}
+
/* it deal with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages, bool from_rom)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- struct ras_err_handler_data *data;
struct ras_err_data err_data;
- struct eeprom_table_record *err_rec;
struct amdgpu_ras_eeprom_control *control =
&adev->psp.ras_context.ras->eeprom_control;
enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
int ret = 0;
- uint32_t i, j, loop_cnt = 1;
- bool find_pages_per_pa = false;
+ uint32_t i = 0;
if (!con || !con->eh_data || !bps || pages <= 0)
return 0;
if (from_rom) {
err_data.err_addr =
- kcalloc(adev->umc.retire_unit,
- sizeof(struct eeprom_table_record), GFP_KERNEL);
+ kzalloc_objs(struct eeprom_table_record,
+ adev->umc.retire_unit);
if (!err_data.err_addr) {
dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
- err_rec = err_data.err_addr;
- loop_cnt = adev->umc.retire_unit;
if (adev->gmc.gmc_funcs->query_mem_partition_mode)
nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
}
mutex_lock(&con->recovery_lock);
- data = con->eh_data;
- if (!data) {
- /* Returning 0 as the absence of eh_data is acceptable */
- goto free;
- }
-
- for (i = 0; i < pages; i++) {
- if (from_rom &&
- control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) {
- if (!find_pages_per_pa) {
- if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
- if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
- /* may use old RAS TA, use PA to find pages in
- * one row
- */
- if (amdgpu_umc_pages_in_a_row(adev, &err_data,
- bps[i].retired_page <<
- AMDGPU_GPU_PAGE_SHIFT)) {
- ret = -EINVAL;
- goto free;
- } else {
- find_pages_per_pa = true;
- }
+
+ if (from_rom) {
+ /* there is no pa recs in V3, so skip pa recs processing */
+ if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) &&
+ !amdgpu_ras_smu_eeprom_supported(adev)) {
+ for (i = 0; i < pages; i++) {
+ if (control->ras_num_recs - i >= adev->umc.retire_unit) {
+ if ((bps[i].address == bps[i + 1].address) &&
+ (bps[i].mem_channel == bps[i + 1].mem_channel)) {
+ /* deal with retire_unit records a time */
+ ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
+ &bps[i], &err_data, nps);
+ i += (adev->umc.retire_unit - 1);
} else {
- /* unsupported cases */
- ret = -EOPNOTSUPP;
- goto free;
- }
- }
- } else {
- if (amdgpu_umc_pages_in_a_row(adev, &err_data,
- bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
- ret = -EINVAL;
- goto free;
- }
- }
- } else {
- if (from_rom && !find_pages_per_pa) {
- if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
- /* bad page in any NPS mode in eeprom */
- if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
- ret = -EINVAL;
- goto free;
+ break;
}
} else {
- /* legacy bad page in eeprom, generated only in
- * NPS1 mode
- */
- if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
- /* old RAS TA or ASICs which don't support to
- * convert addrss via mca address
- */
- if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
- find_pages_per_pa = true;
- err_rec = &bps[i];
- loop_cnt = 1;
- } else {
- /* non-nps1 mode, old RAS TA
- * can't support it
- */
- ret = -EOPNOTSUPP;
- goto free;
- }
- }
+ break;
}
-
- if (!find_pages_per_pa)
- i += (adev->umc.retire_unit - 1);
- } else {
- err_rec = &bps[i];
}
}
-
- for (j = 0; j < loop_cnt; j++) {
- if (amdgpu_ras_check_bad_page_unlock(con,
- err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
- continue;
-
- if (!data->space_left &&
- amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
- ret = -ENOMEM;
- goto free;
- }
-
- amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
-
- memcpy(&data->bps[data->count], &(err_rec[j]),
- sizeof(struct eeprom_table_record));
- data->count++;
- data->space_left--;
+ for (; i < pages; i++) {
+ ret = __amdgpu_ras_convert_rec_from_rom(adev,
+ &bps[i], &err_data, nps);
}
+
+ con->eh_data->count_saved = con->eh_data->count;
+ } else {
+ ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
}
-free:
if (from_rom)
kfree(err_data.err_addr);
-out:
mutex_unlock(&con->recovery_lock);
return ret;
@@ -2944,7 +3321,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
struct amdgpu_ras_eeprom_control *control;
- int save_count, unit_num, bad_page_num, i;
+ int save_count, unit_num, i;
if (!con || !con->eh_data) {
if (new_cnt)
@@ -2953,31 +3330,45 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
return 0;
}
+ if (!con->eeprom_control.is_eeprom_valid) {
+ dev_warn(adev->dev,
+ "Failed to save EEPROM table data because of EEPROM data corruption!");
+ if (new_cnt)
+ *new_cnt = 0;
+
+ return 0;
+ }
+
mutex_lock(&con->recovery_lock);
control = &con->eeprom_control;
data = con->eh_data;
- bad_page_num = control->ras_num_bad_pages;
- save_count = data->count - bad_page_num;
+ if (amdgpu_ras_smu_eeprom_supported(adev))
+ unit_num = control->ras_num_recs -
+ control->ras_num_recs_old;
+ else
+ unit_num = data->count / adev->umc.retire_unit -
+ control->ras_num_recs;
+
+ save_count = con->bad_page_num - control->ras_num_bad_pages;
mutex_unlock(&con->recovery_lock);
- unit_num = save_count / adev->umc.retire_unit;
if (new_cnt)
*new_cnt = unit_num;
/* only new entries are saved */
- if (save_count > 0) {
- if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA) {
+ if (unit_num && save_count) {
+ /*old asics only save pa to eeprom like before*/
+ if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
if (amdgpu_ras_eeprom_append(control,
- &data->bps[control->ras_num_recs],
- save_count)) {
+ &data->bps[data->count_saved], unit_num)) {
dev_err(adev->dev, "Failed to save EEPROM table data!");
return -EIO;
}
} else {
for (i = 0; i < unit_num; i++) {
if (amdgpu_ras_eeprom_append(control,
- &data->bps[bad_page_num + i * adev->umc.retire_unit],
- 1)) {
+ &data->bps[data->count_saved +
+ i * adev->umc.retire_unit], 1)) {
dev_err(adev->dev, "Failed to save EEPROM table data!");
return -EIO;
}
@@ -2985,6 +3376,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
}
dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
+ data->count_saved = data->count;
}
return 0;
@@ -2999,13 +3391,13 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
struct amdgpu_ras_eeprom_control *control =
&adev->psp.ras_context.ras->eeprom_control;
struct eeprom_table_record *bps;
- int ret;
+ int ret, i = 0;
/* no bad page record, skip eeprom access */
if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
return 0;
- bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL);
+ bps = kzalloc_objs(*bps, control->ras_num_recs);
if (!bps)
return -ENOMEM;
@@ -3013,26 +3405,44 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
if (ret) {
dev_err(adev->dev, "Failed to load EEPROM table records!");
} else {
- if (control->ras_num_recs > 1 &&
- adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
- if ((bps[0].address == bps[1].address) &&
- (bps[0].mem_channel == bps[1].mem_channel))
- control->rec_type = AMDGPU_RAS_EEPROM_REC_PA;
- else
- control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
+ if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
+ /*In V3, there is no pa recs, and some cases(when address==0) may be parsed
+ as pa recs, so add verion check to avoid it.
+ */
+ if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) &&
+ !amdgpu_ras_smu_eeprom_supported(adev)) {
+ for (i = 0; i < control->ras_num_recs; i++) {
+ if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
+ if ((bps[i].address == bps[i + 1].address) &&
+ (bps[i].mem_channel == bps[i + 1].mem_channel)) {
+ control->ras_num_pa_recs += adev->umc.retire_unit;
+ i += (adev->umc.retire_unit - 1);
+ } else {
+ control->ras_num_mca_recs +=
+ (control->ras_num_recs - i);
+ break;
+ }
+ } else {
+ control->ras_num_mca_recs += (control->ras_num_recs - i);
+ break;
+ }
+ }
+ } else {
+ control->ras_num_mca_recs = control->ras_num_recs;
+ }
}
+ ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
+ if (ret)
+ goto out;
+
ret = amdgpu_ras_eeprom_check(control);
if (ret)
goto out;
/* HW not usable */
- if (amdgpu_ras_is_rma(adev)) {
+ if (amdgpu_ras_is_rma(adev))
ret = -EHWPOISON;
- goto out;
- }
-
- ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
}
out:
@@ -3040,18 +3450,24 @@ out:
return ret;
}
-static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
uint64_t addr)
{
struct ras_err_handler_data *data = con->eh_data;
+ struct amdgpu_device *adev = con->adev;
int i;
+ if ((addr >= adev->gmc.mc_vram_size &&
+ adev->gmc.mc_vram_size) ||
+ (addr >= RAS_UMC_INJECT_ADDR_LIMIT))
+ return -EINVAL;
+
addr >>= AMDGPU_GPU_PAGE_SHIFT;
for (i = 0; i < data->count; i++)
if (addr == data->bps[i].retired_page)
- return true;
+ return 1;
- return false;
+ return 0;
}
/*
@@ -3059,11 +3475,11 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
*
* Note: this check is only for umc block
*/
-static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
uint64_t addr)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- bool ret = false;
+ int ret = 0;
if (!con || !con->eh_data)
return ret;
@@ -3080,31 +3496,29 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
/*
- * Justification of value bad_page_cnt_threshold in ras structure
- *
- * Generally, 0 <= amdgpu_bad_page_threshold <= max record length
- * in eeprom or amdgpu_bad_page_threshold == -2, introduce two
- * scenarios accordingly.
- *
- * Bad page retirement enablement:
- * - If amdgpu_bad_page_threshold = -2,
- * bad_page_cnt_threshold = typical value by formula.
- *
- * - When the value from user is 0 < amdgpu_bad_page_threshold <
- * max record length in eeprom, use it directly.
- *
- * Bad page retirement disablement:
- * - If amdgpu_bad_page_threshold = 0, bad page retirement
- * functionality is disabled, and bad_page_cnt_threshold will
- * take no effect.
+ * amdgpu_bad_page_threshold is used to config
+ * the threshold for the number of bad pages.
+ * -1: Threshold is set to default value
+ * Driver will issue a warning message when threshold is reached
+ * and continue runtime services.
+ * 0: Disable bad page retirement
+ * Driver will not retire bad pages
+ * which is intended for debugging purpose.
+ * -2: Threshold is determined by a formula
+ * that assumes 1 bad page per 100M of local memory.
+ * Driver will continue runtime services when threhold is reached.
+ * 0 < threshold < max number of bad page records in EEPROM,
+ * A user-defined threshold is set
+ * Driver will halt runtime services when this custom threshold is reached.
*/
-
- if (amdgpu_bad_page_threshold < 0) {
+ if (amdgpu_bad_page_threshold == -2) {
u64 val = adev->gmc.mc_vram_size;
do_div(val, RAS_BAD_PAGE_COVER);
con->bad_page_cnt_threshold = min(lower_32_bits(val),
max_count);
+ } else if (amdgpu_bad_page_threshold == -1) {
+ con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4;
} else {
con->bad_page_cnt_threshold = min_t(int, max_count,
amdgpu_bad_page_threshold);
@@ -3149,7 +3563,7 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
ecc_log->de_queried_count = 0;
- ecc_log->prev_de_queried_count = 0;
+ ecc_log->consumption_q_count = 0;
}
static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
@@ -3169,7 +3583,7 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
mutex_destroy(&ecc_log->lock);
ecc_log->de_queried_count = 0;
- ecc_log->prev_de_queried_count = 0;
+ ecc_log->consumption_q_count = 0;
}
static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con,
@@ -3195,7 +3609,6 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
page_retirement_dwork.work);
struct amdgpu_device *adev = con->adev;
struct ras_err_data err_data;
- unsigned long err_cnt;
/* If gpu reset is ongoing, delay retiring the bad pages */
if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
@@ -3207,13 +3620,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
amdgpu_ras_error_data_init(&err_data);
amdgpu_umc_handle_bad_pages(adev, &err_data);
- err_cnt = err_data.err_addr_cnt;
amdgpu_ras_error_data_fini(&err_data);
- if (err_cnt && amdgpu_ras_is_rma(adev))
- amdgpu_ras_reset_gpu(adev);
-
amdgpu_ras_schedule_retirement_dwork(con,
AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
}
@@ -3224,58 +3633,39 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
int ret = 0;
struct ras_ecc_log_info *ecc_log;
struct ras_query_if info;
- uint32_t timeout = 0;
+ u32 timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
- uint64_t de_queried_count;
- uint32_t new_detect_count, total_detect_count;
- uint32_t need_query_count = poison_creation_count;
- bool query_data_timeout = false;
+ u64 de_queried_count;
+ u64 consumption_q_count;
enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
memset(&info, 0, sizeof(info));
info.head.block = AMDGPU_RAS_BLOCK__UMC;
ecc_log = &ras->umc_ecc_log;
- total_detect_count = 0;
+ ecc_log->de_queried_count = 0;
+ ecc_log->consumption_q_count = 0;
+
do {
ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
if (ret)
return ret;
de_queried_count = ecc_log->de_queried_count;
- if (de_queried_count > ecc_log->prev_de_queried_count) {
- new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
- ecc_log->prev_de_queried_count = de_queried_count;
- timeout = 0;
- } else {
- new_detect_count = 0;
- }
-
- if (new_detect_count) {
- total_detect_count += new_detect_count;
- } else {
- if (!timeout && need_query_count)
- timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
+ consumption_q_count = ecc_log->consumption_q_count;
- if (timeout) {
- if (!--timeout) {
- query_data_timeout = true;
- break;
- }
- msleep(1);
- }
- }
- } while (total_detect_count < need_query_count);
+ if (de_queried_count && consumption_q_count)
+ break;
- if (query_data_timeout) {
- dev_warn(adev->dev, "Can't find deferred error! count: %u\n",
- (need_query_count - total_detect_count));
- return -ENOENT;
- }
+ msleep(100);
+ } while (--timeout);
- if (total_detect_count)
+ if (de_queried_count)
schedule_delayed_work(&ras->page_retirement_dwork, 0);
+ if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0)
+ amdgpu_ras_reset_gpu(adev);
+
return 0;
}
@@ -3311,6 +3701,12 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
reset_flags |= msg.reset;
}
+ /*
+ * Try to ensure poison creation handler is completed first
+ * to set rma if bad page exceed threshold.
+ */
+ flush_delayed_work(&con->page_retirement_dwork);
+
/* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
if (reset_flags && !amdgpu_ras_is_rma(adev)) {
if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
@@ -3320,8 +3716,6 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
else
reset = reset_flags;
- flush_delayed_work(&con->page_retirement_dwork);
-
con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
@@ -3351,6 +3745,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
if (kthread_should_stop())
break;
+ mutex_lock(&con->poison_lock);
gpu_reset = 0;
do {
@@ -3363,7 +3758,8 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_sub(poison_creation_count, &con->poison_creation_count);
atomic_sub(poison_creation_count, &con->page_retirement_req_cnt);
}
- } while (atomic_read(&con->poison_creation_count));
+ } while (atomic_read(&con->poison_creation_count) &&
+ !atomic_read(&con->poison_consumption_count));
if (ret != -EIO) {
msg_count = kfifo_len(&con->poison_fifo);
@@ -3380,6 +3776,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
/* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */
/* Clear poison creation request */
atomic_set(&con->poison_creation_count, 0);
+ atomic_set(&con->poison_consumption_count, 0);
/* Clear poison fifo */
amdgpu_ras_clear_poison_fifo(adev);
@@ -3404,9 +3801,12 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_sub(msg_count, &con->page_retirement_req_cnt);
}
+ atomic_set(&con->poison_consumption_count, 0);
+
/* Wake up work to save bad pages to eeprom */
schedule_delayed_work(&con->page_retirement_dwork, 0);
}
+ mutex_unlock(&con->poison_lock);
}
return 0;
@@ -3421,23 +3821,28 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
if (!con || amdgpu_sriov_vf(adev))
return 0;
+ if (amdgpu_uniras_enabled(adev))
+ return 0;
+
control = &con->eeprom_control;
+ con->ras_smu_drv = amdgpu_dpm_get_ras_smu_driver(adev);
+
ret = amdgpu_ras_eeprom_init(control);
- if (ret)
- return ret;
+ control->is_eeprom_valid = !ret;
if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
- control->rec_type = AMDGPU_RAS_EEPROM_REC_PA;
+ control->ras_num_pa_recs = control->ras_num_recs;
- /* default status is MCA storage */
- if (control->ras_num_recs <= 1 &&
- adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
- control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
+ if (adev->umc.ras &&
+ adev->umc.ras->get_retire_flip_bits)
+ adev->umc.ras->get_retire_flip_bits(adev);
- if (control->ras_num_recs) {
+ if (control->ras_num_recs && control->is_eeprom_valid) {
ret = amdgpu_ras_load_bad_pages(adev);
- if (ret)
- return ret;
+ if (ret) {
+ control->is_eeprom_valid = false;
+ return 0;
+ }
amdgpu_dpm_send_hbm_bad_pages_num(
adev, control->ras_num_bad_pages);
@@ -3447,9 +3852,16 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
adev, control->bad_channel_bitmap);
con->update_channel_flag = false;
}
+
+ /* The format action is only applied to new ASICs */
+ if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 &&
+ control->tbl_hdr.version < RAS_TABLE_VER_V3)
+ if (!amdgpu_ras_eeprom_reset_table(control))
+ if (amdgpu_ras_save_bad_pages(adev, NULL))
+ dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
}
- return ret;
+ return 0;
}
int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
@@ -3473,15 +3885,17 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
return 0;
data = &con->eh_data;
- *data = kzalloc(sizeof(**data), GFP_KERNEL);
+ *data = kzalloc_obj(**data);
if (!*data) {
ret = -ENOMEM;
goto out;
}
mutex_init(&con->recovery_lock);
+ mutex_init(&con->poison_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0);
+ atomic_set(&con->rma_in_recovery, 0);
con->eeprom_control.bad_channel_bitmap = 0;
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
@@ -3499,6 +3913,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
init_waitqueue_head(&con->page_retirement_wq);
atomic_set(&con->page_retirement_req_cnt, 0);
atomic_set(&con->poison_creation_count, 0);
+ atomic_set(&con->poison_consumption_count, 0);
con->page_retirement_thread =
kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement");
if (IS_ERR(con->page_retirement_thread)) {
@@ -3571,6 +3986,10 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
kfree(data);
mutex_unlock(&con->recovery_lock);
+ amdgpu_ras_critical_region_init(adev);
+#ifdef CONFIG_X86_MCE_AMD
+ amdgpu_unregister_bad_pages_mca_notifier(adev);
+#endif
return 0;
}
/* recovery end */
@@ -3583,6 +4002,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
+ case IP_VERSION(13, 0, 15):
return true;
default:
return false;
@@ -3596,6 +4016,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 10):
case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
+ case IP_VERSION(13, 0, 15):
case IP_VERSION(14, 0, 3):
return true;
default:
@@ -3658,7 +4079,8 @@ static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev
*/
if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) ||
amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) ||
- amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3))
+ amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3) ||
+ amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(5, 0, 1))
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
1 << AMDGPU_RAS_BLOCK__JPEG);
else
@@ -3759,8 +4181,14 @@ init_ras_enabled_flag:
adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
adev->ras_hw_enabled & amdgpu_ras_mask;
- /* aca is disabled by default */
- adev->aca.is_enabled = false;
+ /* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14 */
+ if (!amdgpu_sriov_vf(adev)) {
+ adev->aca.is_enabled =
+ (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) ||
+ amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) ||
+ amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14) ||
+ amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 15));
+ }
/* bad page feature is not applicable to specific app platform */
if (adev->gmc.is_app_apu &&
@@ -3788,7 +4216,6 @@ static void amdgpu_ras_counte_dw(struct work_struct *work)
atomic_set(&con->ras_ue_count, ue_count);
}
- pm_runtime_mark_last_busy(dev->dev);
Out:
pm_runtime_put_autosuspend(dev->dev);
}
@@ -3848,8 +4275,11 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 2):
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 12):
+ case IP_VERSION(13, 0, 15):
+ con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT;
+ break;
case IP_VERSION(13, 0, 14):
- con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
+ con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1);
break;
default:
break;
@@ -3948,7 +4378,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
* to handle fatal error */
r = amdgpu_nbio_ras_sw_init(adev);
if (r)
- return r;
+ goto release_con;
if (adev->nbio.ras &&
adev->nbio.ras->init_ras_controller_interrupt) {
@@ -3989,6 +4419,12 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
+ con->init_task_pid = task_pid_nr(current);
+ get_task_comm(con->init_task_comm, current);
+
+ mutex_init(&con->critical_region_lock);
+ INIT_LIST_HEAD(&con->critical_region_head);
+
dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
"hardware ability[%x] ras_mask[%x]\n",
adev->ras_hw_enabled, adev->ras_enabled);
@@ -4020,10 +4456,10 @@ static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
return 0;
if (amdgpu_ras_query_error_status(adev, &info) != 0)
- DRM_WARN("RAS init harvest failure");
+ drm_warn(adev_to_drm(adev), "RAS init query failure");
if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
- DRM_WARN("RAS init harvest reset failure");
+ drm_warn(adev_to_drm(adev), "RAS init harvest reset failure");
return 0;
}
@@ -4089,7 +4525,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
/* Those are the cached values at init.
*/
- query_info = kzalloc(sizeof(*query_info), GFP_KERNEL);
+ query_info = kzalloc_obj(*query_info);
if (!query_info)
return -ENOMEM;
memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
@@ -4240,6 +4676,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
}
+ amdgpu_ras_check_bad_page_status(adev);
+
return 0;
}
@@ -4268,6 +4706,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
if (!adev->ras_enabled || !con)
return 0;
+ amdgpu_ras_critical_region_fini(adev);
+ mutex_destroy(&con->critical_region_lock);
+
list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
if (ras_node->ras_obj) {
obj = ras_node->ras_obj;
@@ -4335,8 +4776,10 @@ void amdgpu_ras_clear_err_state(struct amdgpu_device *adev)
struct amdgpu_ras *ras;
ras = amdgpu_ras_get_context(adev);
- if (ras)
+ if (ras) {
ras->ras_err_state = 0;
+ ras->gpu_reset_flags = 0;
+ }
}
void amdgpu_ras_set_err_poison(struct amdgpu_device *adev,
@@ -4384,6 +4827,9 @@ int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_
struct ras_event_state *event_state;
int ret = 0;
+ if (amdgpu_uniras_enabled(adev))
+ return 0;
+
if (type >= RAS_EVENT_TYPE_COUNT) {
ret = -EINVAL;
goto out;
@@ -4434,17 +4880,18 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type
return id;
}
-void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
+int amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
enum ras_event_type type = RAS_EVENT_TYPE_FATAL;
- u64 event_id;
+ u64 event_id = RAS_EVENT_INVALID_ID;
- if (amdgpu_ras_mark_ras_event(adev, type))
- return;
+ if (amdgpu_uniras_enabled(adev))
+ return 0;
- event_id = amdgpu_ras_acquire_event_id(adev, type);
+ if (!amdgpu_ras_mark_ras_event(adev, type))
+ event_id = amdgpu_ras_acquire_event_id(adev, type);
RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
@@ -4453,6 +4900,8 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
amdgpu_ras_reset_gpu(adev);
}
+
+ return -EBUSY;
}
bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
@@ -4580,6 +5029,28 @@ static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
notifier_registered = true;
}
}
+static void amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device *adev)
+{
+ int i, j;
+
+ if (!notifier_registered && !mce_adev_list.num_gpu)
+ return;
+ for (i = 0, j = 0; i < mce_adev_list.num_gpu; i++) {
+ if (mce_adev_list.devs[i] == adev)
+ mce_adev_list.devs[i] = NULL;
+ if (!mce_adev_list.devs[i])
+ ++j;
+ }
+
+ if (j == mce_adev_list.num_gpu) {
+ mce_adev_list.num_gpu = 0;
+ /* Unregister x86 notifier with MCE subsystem. */
+ if (notifier_registered) {
+ mce_unregister_decode_chain(&amdgpu_bad_page_nb);
+ notifier_registered = false;
+ }
+ }
+}
#endif
struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
@@ -4743,7 +5214,7 @@ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
if (!adev || !ras_block_obj)
return -EINVAL;
- ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
+ ras_node = kzalloc_obj(*ras_node);
if (!ras_node)
return -ENOMEM;
@@ -4944,7 +5415,7 @@ static struct ras_err_node *amdgpu_ras_error_node_new(void)
{
struct ras_err_node *err_node;
- err_node = kvzalloc(sizeof(*err_node), GFP_KERNEL);
+ err_node = kvzalloc_obj(*err_node);
if (!err_node)
return NULL;
@@ -5071,11 +5542,11 @@ static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
* is changed. In such case, replace the aqua_vanjaram implementation
* with more common helper */
reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
- aqua_vanjaram_encode_ext_smn_addressing(instance);
+ amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance);
fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
- aqua_vanjaram_encode_ext_smn_addressing(instance);
+ amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance);
boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
@@ -5127,9 +5598,9 @@ static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
"socket: %d, aid: %d, fw_status: 0x%x, data abort exception\n",
socket_id, aid_id, fw_status);
- if (AMDGPU_RAS_GPU_ERR_UNKNOWN(boot_error))
+ if (AMDGPU_RAS_GPU_ERR_GENERIC(boot_error))
dev_info(adev->dev,
- "socket: %d, aid: %d, fw_status: 0x%x, unknown boot time errors\n",
+ "socket: %d, aid: %d, fw_status: 0x%x, Boot Controller Generic Error\n",
socket_id, aid_id, fw_status);
}
@@ -5141,7 +5612,7 @@ static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev,
int retry_loop;
reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
- aqua_vanjaram_encode_ext_smn_addressing(instance);
+ amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance);
for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {
reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
@@ -5171,6 +5642,9 @@ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)
uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT;
int ret = 0;
+ if (amdgpu_ras_check_critical_address(adev, start))
+ return 0;
+
mutex_lock(&con->page_rsv_lock);
ret = amdgpu_vram_mgr_query_page_status(mgr, start);
if (ret == -ENOENT)
@@ -5202,8 +5676,110 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ if (amdgpu_uniras_enabled(adev))
+ return amdgpu_ras_mgr_is_rma(adev);
+
if (!con)
return false;
return con->is_rma;
}
+
+int amdgpu_ras_add_critical_region(struct amdgpu_device *adev,
+ struct amdgpu_bo *bo)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct amdgpu_vram_mgr_resource *vres;
+ struct ras_critical_region *region;
+ struct gpu_buddy_block *block;
+ int ret = 0;
+
+ if (!bo || !bo->tbo.resource)
+ return -EINVAL;
+
+ vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource);
+
+ mutex_lock(&con->critical_region_lock);
+
+ /* Check if the bo had been recorded */
+ list_for_each_entry(region, &con->critical_region_head, node)
+ if (region->bo == bo)
+ goto out;
+
+ /* Record new critical amdgpu bo */
+ list_for_each_entry(block, &vres->blocks, link) {
+ region = kzalloc_obj(*region);
+ if (!region) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ region->bo = bo;
+ region->start = amdgpu_vram_mgr_block_start(block);
+ region->size = amdgpu_vram_mgr_block_size(block);
+ list_add_tail(&region->node, &con->critical_region_head);
+ }
+
+out:
+ mutex_unlock(&con->critical_region_lock);
+
+ return ret;
+}
+
+static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev)
+{
+ amdgpu_ras_add_critical_region(adev, adev->mman.resv_region[AMDGPU_RESV_FW].bo);
+}
+
+static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_critical_region *region, *tmp;
+
+ mutex_lock(&con->critical_region_lock);
+ list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) {
+ list_del(&region->node);
+ kfree(region);
+ }
+ mutex_unlock(&con->critical_region_lock);
+}
+
+bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_critical_region *region;
+ bool ret = false;
+
+ mutex_lock(&con->critical_region_lock);
+ list_for_each_entry(region, &con->critical_region_head, node) {
+ if ((region->start <= addr) &&
+ (addr < (region->start + region->size))) {
+ ret = true;
+ break;
+ }
+ }
+ mutex_unlock(&con->critical_region_lock);
+
+ return ret;
+}
+
+void amdgpu_ras_pre_reset(struct amdgpu_device *adev,
+ struct list_head *device_list)
+{
+ struct amdgpu_device *tmp_adev = NULL;
+
+ list_for_each_entry(tmp_adev, device_list, reset_list) {
+ if (amdgpu_uniras_enabled(tmp_adev))
+ amdgpu_ras_mgr_pre_reset(tmp_adev);
+ }
+}
+
+void amdgpu_ras_post_reset(struct amdgpu_device *adev,
+ struct list_head *device_list)
+{
+ struct amdgpu_device *tmp_adev = NULL;
+
+ list_for_each_entry(tmp_adev, device_list, reset_list) {
+ if (amdgpu_uniras_enabled(tmp_adev))
+ amdgpu_ras_mgr_post_reset(tmp_adev);
+ }
+}