diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1182 |
1 files changed, 879 insertions, 303 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f0924aa3f4e4..fc9f3adf9912 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -21,6 +21,7 @@ * * */ +#include "amdgpu_reg_access.h" #include <linux/debugfs.h> #include <linux/list.h> #include <linux/module.h> @@ -41,6 +42,8 @@ #include "atom.h" #include "amdgpu_reset.h" #include "amdgpu_psp.h" +#include "amdgpu_ras_mgr.h" +#include "amdgpu_virt_ras_cmd.h" #ifdef CONFIG_X86_MCE_AMD #include <asm/mce.h> @@ -77,6 +80,7 @@ const char *ras_block_string[] = { "jpeg", "ih", "mpio", + "mmsch", }; const char *ras_mca_block_string[] = { @@ -121,12 +125,15 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) -#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms +#define MAX_UMC_POISON_POLLING_TIME_ASYNC 10 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms #define MAX_FLUSH_RETIRE_DWORK_TIMES 100 +#define BYPASS_ALLOCATED_ADDRESS 0x0 +#define BYPASS_INITIALIZATION_ADDRESS 0x1 + enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -135,12 +142,18 @@ enum amdgpu_ras_retire_page_reservation { atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); -static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, +static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, uint64_t addr); -static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, +static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr); + +static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev); +static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev); + #ifdef CONFIG_X86_MCE_AMD static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev); +static void +amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device *adev); struct mce_notifier_adev_list { struct amdgpu_device *devs[MAX_GPU_INSTANCE]; int num_gpu; @@ -168,18 +181,16 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre struct eeprom_table_record err_rec; int ret; - if ((address >= adev->gmc.mc_vram_size) || - (address >= RAS_UMC_INJECT_ADDR_LIMIT)) { + ret = amdgpu_ras_check_bad_page(adev, address); + if (ret == -EINVAL) { dev_warn(adev->dev, - "RAS WARN: input address 0x%llx is invalid.\n", - address); + "RAS WARN: input address 0x%llx is invalid.\n", + address); return -EINVAL; - } - - if (amdgpu_ras_check_bad_page(adev, address)) { + } else if (ret == 1) { dev_warn(adev->dev, - "RAS WARN: 0x%llx has already been marked as bad page!\n", - address); + "RAS WARN: 0x%llx has already been marked as bad page!\n", + address); return 0; } @@ -206,6 +217,72 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre return 0; } +static int amdgpu_check_address_validity(struct amdgpu_device *adev, + uint64_t address, uint64_t flags) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct amdgpu_vram_block_info blk_info; + uint64_t page_pfns[32] = {0}; + int i, ret, count; + bool hit = false; + + if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) + return 0; + + if (amdgpu_sriov_vf(adev)) { + if (amdgpu_uniras_enabled(adev)) { + if (amdgpu_virt_ras_check_address_validity(adev, address, &hit)) + return -EPERM; + if (hit) + return -EACCES; + } else { + if (amdgpu_virt_check_vf_critical_region(adev, address, &hit)) + return -EPERM; + return hit ? -EACCES : 0; + } + } + + if ((address >= adev->gmc.mc_vram_size) || + (address >= RAS_UMC_INJECT_ADDR_LIMIT)) + return -EFAULT; + + if (amdgpu_uniras_enabled(adev)) { + if (amdgpu_sriov_vf(adev)) + count = amdgpu_virt_ras_convert_retired_address(adev, address, + page_pfns, ARRAY_SIZE(page_pfns)); + else + count = amdgpu_ras_mgr_lookup_bad_pages_in_a_row(adev, address, + page_pfns, ARRAY_SIZE(page_pfns)); + } else + count = amdgpu_umc_lookup_bad_pages_in_a_row(adev, + address, page_pfns, ARRAY_SIZE(page_pfns)); + + if (count <= 0) + return -EPERM; + + for (i = 0; i < count; i++) { + memset(&blk_info, 0, sizeof(blk_info)); + ret = amdgpu_vram_mgr_query_address_block_info(&adev->mman.vram_mgr, + page_pfns[i] << AMDGPU_GPU_PAGE_SHIFT, &blk_info); + if (!ret) { + /* The input address that needs to be checked is allocated by + * current calling process, so it is necessary to exclude + * the calling process. + */ + if ((flags == BYPASS_ALLOCATED_ADDRESS) && + ((blk_info.task.pid != task_pid_nr(current)) || + strncmp(blk_info.task.comm, current->comm, TASK_COMM_LEN))) + return -EACCES; + else if ((flags == BYPASS_INITIALIZATION_ADDRESS) && + (blk_info.task.pid == con->init_task_pid) && + !strncmp(blk_info.task.comm, con->init_task_comm, TASK_COMM_LEN)) + return -EACCES; + } + } + + return 0; +} + static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { @@ -296,6 +373,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 2; else if (strstr(str, "retire_page") != NULL) op = 3; + else if (strstr(str, "check_address") != NULL) + op = 4; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ return -EINVAL; @@ -310,6 +389,15 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, data->inject.address = address; return 0; + } else if (op == 4) { + if (sscanf(str, "%*s 0x%llx 0x%llx", &address, &value) != 2 && + sscanf(str, "%*s %llu %llu", &address, &value) != 2) + return -EINVAL; + + data->op = op; + data->inject.address = address; + data->inject.value = value; + return 0; } if (amdgpu_ras_find_block_id_by_name(block_name, &block_id)) @@ -499,6 +587,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, return size; else return ret; + } else if (data.op == 4) { + ret = amdgpu_check_address_validity(adev, data.inject.address, data.inject.value); + return ret ? ret : size; } if (!amdgpu_ras_is_supported(adev, data.head.block)) @@ -512,22 +603,16 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, ret = amdgpu_ras_feature_enable(adev, &data.head, 1); break; case 2: - if ((data.inject.address >= adev->gmc.mc_vram_size && - adev->gmc.mc_vram_size) || - (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) { - dev_warn(adev->dev, "RAS WARN: input address " - "0x%llx is invalid.", + /* umc ce/ue error injection for a bad page is not allowed */ + if (data.head.block == AMDGPU_RAS_BLOCK__UMC) + ret = amdgpu_ras_check_bad_page(adev, data.inject.address); + if (ret == -EINVAL) { + dev_warn(adev->dev, "RAS WARN: input address 0x%llx is invalid.", data.inject.address); - ret = -EINVAL; break; - } - - /* umc ce/ue error injection for a bad page is not allowed */ - if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) && - amdgpu_ras_check_bad_page(adev, data.inject.address)) { - dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has " - "already been marked as bad!\n", - data.inject.address); + } else if (ret == 1) { + dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has already been marked as bad!\n", + data.inject.address); break; } @@ -547,6 +632,8 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, return size; } +static int amdgpu_uniras_clear_badpages_info(struct amdgpu_device *adev); + /** * DOC: AMDGPU RAS debugfs EEPROM table reset interface * @@ -571,6 +658,11 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, (struct amdgpu_device *)file_inode(f)->i_private; int ret; + if (amdgpu_uniras_enabled(adev)) { + ret = amdgpu_uniras_clear_badpages_info(adev); + return ret ? ret : size; + } + ret = amdgpu_ras_eeprom_reset_table( &(amdgpu_ras_get_context(adev)->eeprom_control)); @@ -812,7 +904,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (head->block == AMDGPU_RAS_BLOCK__GFX && !amdgpu_sriov_vf(adev) && !amdgpu_ras_intr_triggered()) { - info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL); + info = kzalloc_obj(union ta_ras_cmd_input); if (!info) return -ENOMEM; @@ -1106,6 +1198,9 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, err_info->de_count, blk_name); } } else { + if (adev->debug_disable_ce_logs) + return; + for_each_ras_error(err_node, err_data) { err_info = &err_node->err_info; mcm_info = &err_info->mcm_info; @@ -1475,9 +1570,51 @@ out_fini_err_data: return ret; } +static int amdgpu_uniras_clear_badpages_info(struct amdgpu_device *adev) +{ + struct ras_cmd_dev_handle req = {0}; + int ret; + + ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__CLEAR_BAD_PAGE_INFO, + &req, sizeof(req), NULL, 0); + if (ret) { + dev_err(adev->dev, "Failed to clear bad pages info, ret: %d\n", ret); + return ret; + } + + return 0; +} + +static int amdgpu_uniras_query_block_ecc(struct amdgpu_device *adev, + struct ras_query_if *info) +{ + struct ras_cmd_block_ecc_info_req req = {0}; + struct ras_cmd_block_ecc_info_rsp rsp = {0}; + int ret; + + if (!info) + return -EINVAL; + + req.block_id = info->head.block; + req.subblock_id = info->head.sub_block_index; + + ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BLOCK_ECC_STATUS, + &req, sizeof(req), &rsp, sizeof(rsp)); + if (!ret) { + info->ce_count = rsp.ce_count; + info->ue_count = rsp.ue_count; + info->de_count = rsp.de_count; + } + + return ret; +} + int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) { - return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID); + if (amdgpu_uniras_enabled(adev)) + return amdgpu_uniras_query_block_ecc(adev, info); + else + return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID); } int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, @@ -1497,6 +1634,9 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, !amdgpu_ras_get_aca_debug_mode(adev)) return -EOPNOTSUPP; + if (amdgpu_sriov_vf(adev)) + return -EOPNOTSUPP; + /* skip ras error reset in gpu reset */ if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) && ((smu_funcs && smu_funcs->set_debug_mode) || @@ -1526,6 +1666,27 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, return 0; } +static int amdgpu_uniras_error_inject(struct amdgpu_device *adev, + struct ras_inject_if *info) +{ + struct ras_cmd_inject_error_req inject_req; + struct ras_cmd_inject_error_rsp rsp; + + if (!info) + return -EINVAL; + + memset(&inject_req, 0, sizeof(inject_req)); + inject_req.block_id = info->head.block; + inject_req.subblock_id = info->head.sub_block_index; + inject_req.address = info->address; + inject_req.error_type = info->head.type; + inject_req.instance_mask = info->instance_mask; + inject_req.method = info->value; + + return amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__INJECT_ERROR, + &inject_req, sizeof(inject_req), &rsp, sizeof(rsp)); +} + /* wrapper of psp_ras_trigger_error */ int amdgpu_ras_error_inject(struct amdgpu_device *adev, struct ras_inject_if *info) @@ -1543,6 +1704,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, info->head.block, info->head.sub_block_index); + if (amdgpu_uniras_enabled(adev)) + return amdgpu_uniras_error_inject(adev, info); + /* inject on guest isn't allowed, return success directly */ if (amdgpu_sriov_vf(adev)) return 0; @@ -1687,7 +1851,9 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev, /* sysfs begin */ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, - struct ras_badpage **bps, unsigned int *count); + struct ras_badpage *bps, uint32_t count, uint32_t start); +static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev, + struct ras_badpage *bps, uint32_t count, uint32_t start); static char *amdgpu_ras_badpage_flags_str(unsigned int flags) { @@ -1733,7 +1899,7 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags) */ static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, - struct kobject *kobj, struct bin_attribute *attr, + struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t ppos, size_t count) { struct amdgpu_ras *con = @@ -1745,19 +1911,52 @@ static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, unsigned int end = div64_ul(ppos + count - 1, element_size); ssize_t s = 0; struct ras_badpage *bps = NULL; - unsigned int bps_count = 0; + int bps_count = 0, i, status; + uint64_t address; memset(buf, 0, count); - if (amdgpu_ras_badpages_read(adev, &bps, &bps_count)) + bps_count = end - start; + bps = kmalloc_objs(*bps, bps_count); + if (!bps) return 0; - for (; start < end && start < bps_count; start++) + memset(bps, 0, sizeof(*bps) * bps_count); + + if (amdgpu_uniras_enabled(adev)) + bps_count = amdgpu_uniras_badpages_read(adev, bps, bps_count, start); + else + bps_count = amdgpu_ras_badpages_read(adev, bps, bps_count, start); + + if (bps_count <= 0) { + kfree(bps); + return 0; + } + + for (i = 0; i < bps_count; i++) { + address = ((uint64_t)bps[i].bp) << AMDGPU_GPU_PAGE_SHIFT; + + bps[i].size = AMDGPU_GPU_PAGE_SIZE; + + status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, + address); + if (status == -EBUSY) + bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING; + else if (status == -ENOENT) + bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; + else + bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED; + + if ((bps[i].flags != AMDGPU_RAS_RETIRE_PAGE_RESERVED) && + amdgpu_ras_check_critical_address(adev, address)) + bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED; + s += scnprintf(&buf[s], element_size + 1, "0x%08x : 0x%08x : %1s\n", - bps[start].bp, - bps[start].size, - amdgpu_ras_badpage_flags_str(bps[start].flags)); + bps[i].bp, + bps[i].size, + amdgpu_ras_badpage_flags_str(bps[i].flags)); + } kfree(bps); @@ -1773,12 +1972,42 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, return sysfs_emit(buf, "feature mask: 0x%x\n", con->features); } +static bool amdgpu_ras_get_version_info(struct amdgpu_device *adev, u32 *major, + u32 *minor, u32 *rev) +{ + int i; + + if (!adev || !major || !minor || !rev || !amdgpu_uniras_enabled(adev)) + return false; + + for (i = 0; i < adev->num_ip_blocks; i++) { + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_RAS) { + *major = adev->ip_blocks[i].version->major; + *minor = adev->ip_blocks[i].version->minor; + *rev = adev->ip_blocks[i].version->rev; + return true; + } + } + + return false; +} + static ssize_t amdgpu_ras_sysfs_version_show(struct device *dev, struct device_attribute *attr, char *buf) { struct amdgpu_ras *con = container_of(attr, struct amdgpu_ras, version_attr); - return sysfs_emit(buf, "table version: 0x%x\n", con->eeprom_control.tbl_hdr.version); + u32 major, minor, rev; + ssize_t size = 0; + + size += sysfs_emit_at(buf, size, "table version: 0x%x\n", + con->eeprom_control.tbl_hdr.version); + + if (amdgpu_ras_get_version_info(con->adev, &major, &minor, &rev)) + size += sysfs_emit_at(buf, size, "ras version: %u.%u.%u\n", + major, minor, rev); + + return size; } static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev, @@ -1864,6 +2093,9 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, if (!obj || obj->attr_inuse) return -EINVAL; + if (amdgpu_sriov_vf(adev) && !amdgpu_virt_ras_telemetry_block_en(adev, head->block)) + return 0; + get_obj(obj); snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name), @@ -2018,6 +2250,7 @@ static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev) case IP_VERSION(13, 0, 6): case IP_VERSION(13, 0, 12): case IP_VERSION(13, 0, 14): + case IP_VERSION(13, 0, 15): ret = true; break; default: @@ -2047,7 +2280,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) list_for_each_entry(obj, &con->head, node) { if (amdgpu_ras_is_supported(adev, obj->head.block) && (obj->attr_inuse == 1)) { - sprintf(fs_info.debugfs_name, "%s_err_inject", + snprintf(fs_info.debugfs_name, sizeof(fs_info.debugfs_name), + "%s_err_inject", get_ras_block_str(&obj->head)); fs_info.head = obj->head; amdgpu_ras_debugfs_create(adev, &fs_info, dir); @@ -2065,8 +2299,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) /* debugfs end */ /* ras fs */ -static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO, - amdgpu_ras_sysfs_badpages_read, NULL, 0); +static const BIN_ATTR(gpu_vram_bad_pages, S_IRUGO, + amdgpu_ras_sysfs_badpages_read, NULL, 0); static DEVICE_ATTR(features, S_IRUGO, amdgpu_ras_sysfs_features_read, NULL); static DEVICE_ATTR(version, 0444, @@ -2088,7 +2322,7 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev) &con->event_state_attr.attr, NULL }; - struct bin_attribute *bin_attrs[] = { + const struct bin_attribute *bin_attrs[] = { NULL, NULL, }; @@ -2114,11 +2348,10 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev) if (amdgpu_bad_page_threshold != 0) { /* add bad_page_features entry */ - bin_attr_gpu_vram_bad_pages.private = NULL; con->badpages_attr = bin_attr_gpu_vram_bad_pages; + sysfs_bin_attr_init(&con->badpages_attr); bin_attrs[0] = &con->badpages_attr; group.bin_attrs = bin_attrs; - sysfs_bin_attr_init(bin_attrs[0]); } r = sysfs_create_group(&adev->dev->kobj, &group); @@ -2158,7 +2391,7 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) /* Fatal error events are handled on host side */ if (amdgpu_sriov_vf(adev)) return; - /** + /* * If the current interrupt is caused by a non-fatal RAS error, skip * check for fatal error. For fatal errors, FED status of all devices * in XGMI hive gets set when the first device gets fatal error @@ -2169,6 +2402,11 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY)) return; + if (amdgpu_uniras_enabled(adev)) { + amdgpu_ras_mgr_handle_fatal_interrupt(adev, NULL); + return; + } + if (adev->nbio.ras && adev->nbio.ras->handle_ras_controller_intr_no_bifring) adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); @@ -2339,6 +2577,16 @@ int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, struct ras_manager *obj; struct ras_ih_data *data; + if (amdgpu_uniras_enabled(adev)) { + struct ras_ih_info ih_info; + + memset(&ih_info, 0, sizeof(ih_info)); + ih_info.block = info->head.block; + memcpy(&ih_info.iv_entry, info->entry, sizeof(struct amdgpu_iv_entry)); + + return amdgpu_ras_mgr_handle_controller_interrupt(adev, &ih_info); + } + obj = amdgpu_ras_find_obj(adev, &info->head); if (!obj) return -EINVAL; @@ -2533,54 +2781,87 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) } } -/* recovery begin */ - -/* return 0 on success. - * caller need free bps. - */ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, - struct ras_badpage **bps, unsigned int *count) + struct ras_badpage *bps, uint32_t count, uint32_t start) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; - int i = 0; - int ret = 0, status; + int r = 0; + uint32_t i; if (!con || !con->eh_data || !bps || !count) return -EINVAL; mutex_lock(&con->recovery_lock); data = con->eh_data; - if (!data || data->count == 0) { - *bps = NULL; - ret = -EINVAL; - goto out; - } + if (start < data->count) { + for (i = start; i < data->count; i++) { + if (!data->bps[i].ts) + continue; - *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); - if (!*bps) { - ret = -ENOMEM; - goto out; + /* U64_MAX is used to mark the record as invalid */ + if (data->bps[i].retired_page == U64_MAX) + continue; + + bps[r].bp = data->bps[i].retired_page; + r++; + if (r >= count) + break; + } } + mutex_unlock(&con->recovery_lock); - for (; i < data->count; i++) { - (*bps)[i] = (struct ras_badpage){ - .bp = data->bps[i].retired_page, - .size = AMDGPU_GPU_PAGE_SIZE, - .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED, - }; - status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, - data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT); - if (status == -EBUSY) - (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING; - else if (status == -ENOENT) - (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; + return r; +} + +static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev, + struct ras_badpage *bps, uint32_t count, uint32_t start) +{ + struct ras_cmd_bad_pages_info_req cmd_input; + struct ras_cmd_bad_pages_info_rsp *output; + uint32_t group, start_group, end_group; + uint32_t pos, pos_in_group; + int r = 0, i; + + if (!bps || !count) + return -EINVAL; + + output = kmalloc_obj(*output); + if (!output) + return -ENOMEM; + + memset(&cmd_input, 0, sizeof(cmd_input)); + + start_group = start / RAS_CMD_MAX_BAD_PAGES_PER_GROUP; + end_group = (start + count + RAS_CMD_MAX_BAD_PAGES_PER_GROUP - 1) / + RAS_CMD_MAX_BAD_PAGES_PER_GROUP; + + pos = start; + for (group = start_group; group < end_group; group++) { + memset(output, 0, sizeof(*output)); + cmd_input.group_index = group; + if (amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BAD_PAGES, + &cmd_input, sizeof(cmd_input), output, sizeof(*output))) + goto out; + + if (pos >= output->bp_total_cnt) + goto out; + + pos_in_group = pos - group * RAS_CMD_MAX_BAD_PAGES_PER_GROUP; + for (i = pos_in_group; i < output->bp_in_group; i++, pos++) { + if (!output->records[i].ts) + continue; + + bps[r].bp = output->records[i].retired_page; + r++; + if (r >= count) + goto out; + } } - *count = data->count; out: - mutex_unlock(&con->recovery_lock); - return ret; + kfree(output); + return r; } static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev, @@ -2629,6 +2910,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + unsigned int error_query_mode; enum ras_event_type type; if (hive) { @@ -2657,11 +2939,22 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) device_list_handle = &device_list; } + if (amdgpu_ras_get_error_query_mode(adev, &error_query_mode)) { + if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY) { + /* wait 500ms to ensure pmfw polling mca bank info done */ + msleep(500); + } + } + type = amdgpu_ras_get_fatal_error_event(adev); list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) { - amdgpu_ras_query_err_status(remote_adev); - amdgpu_ras_log_on_err_counter(remote_adev, type); + if (amdgpu_uniras_enabled(remote_adev)) { + amdgpu_ras_mgr_update_ras_ecc(remote_adev); + } else { + amdgpu_ras_query_err_status(remote_adev); + amdgpu_ras_log_on_err_counter(remote_adev, type); + } } } @@ -2713,7 +3006,7 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, unsigned int old_space = data->count + data->space_left; unsigned int new_space = old_space + pages; unsigned int align_space = ALIGN(new_space, 512); - void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); + void *bps = kmalloc_objs(*data->bps, align_space); if (!bps) { return -ENOMEM; @@ -2749,8 +3042,13 @@ static int amdgpu_ras_mca2pa_by_idx(struct amdgpu_device *adev, addr_in.ma.err_addr = bps->address; addr_in.ma.socket_id = socket; addr_in.ma.ch_inst = bps->mem_channel; - /* tell RAS TA the node instance is not used */ - addr_in.ma.node_inst = TA_RAS_INV_NODE; + if (!amdgpu_ras_smu_eeprom_supported(adev)) { + /* tell RAS TA the node instance is not used */ + addr_in.ma.node_inst = TA_RAS_INV_NODE; + } else { + addr_in.ma.umc_inst = bps->mcumc_id; + addr_in.ma.node_inst = bps->cu; + } if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) ret = adev->umc.ras->convert_ras_err_addr(adev, err_data, @@ -2796,138 +3094,217 @@ static int amdgpu_ras_mca2pa(struct amdgpu_device *adev, return -EINVAL; } +static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev, + struct eeprom_table_record *bps, int count) +{ + int j; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_err_handler_data *data = con->eh_data; + + for (j = 0; j < count; j++) { + if (!data->space_left && + amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { + return -ENOMEM; + } + + if (amdgpu_ras_check_bad_page_unlock(con, + bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) { + /* set to U64_MAX to mark it as invalid */ + data->bps[data->count].retired_page = U64_MAX; + data->count++; + data->space_left--; + continue; + } + + amdgpu_ras_reserve_page(adev, bps[j].retired_page); + + memcpy(&data->bps[data->count], &(bps[j]), + sizeof(struct eeprom_table_record)); + data->count++; + data->space_left--; + con->bad_page_num++; + } + + return 0; +} + +static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev, + struct eeprom_table_record *bps, struct ras_err_data *err_data, + enum amdgpu_memory_partition nps) +{ + int i = 0; + uint64_t chan_idx_v2; + enum amdgpu_memory_partition save_nps; + + save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; + chan_idx_v2 = bps[0].retired_page & UMC_CHANNEL_IDX_V2; + + /*old asics just have pa in eeprom*/ + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) { + memcpy(err_data->err_addr, bps, + sizeof(struct eeprom_table_record) * adev->umc.retire_unit); + goto out; + } + + for (i = 0; i < adev->umc.retire_unit; i++) + bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); + + if (save_nps || chan_idx_v2) { + if (save_nps == nps) { + if (amdgpu_umc_pages_in_a_row(adev, err_data, + bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT)) + return -EINVAL; + for (i = 0; i < adev->umc.retire_unit; i++) { + err_data->err_addr[i].address = bps[0].address; + err_data->err_addr[i].mem_channel = bps[0].mem_channel; + err_data->err_addr[i].bank = bps[0].bank; + err_data->err_addr[i].err_type = bps[0].err_type; + err_data->err_addr[i].mcumc_id = bps[0].mcumc_id; + } + } else { + if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data)) + return -EINVAL; + } + } else { + if (bps[0].address == 0) { + /* for specific old eeprom data, mca address is not stored, + * calc it from pa + */ + if (amdgpu_umc_pa2mca(adev, bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT, + &(bps[0].address), AMDGPU_NPS1_PARTITION_MODE)) + return -EINVAL; + } + + if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) { + if (nps == AMDGPU_NPS1_PARTITION_MODE) + memcpy(err_data->err_addr, bps, + sizeof(struct eeprom_table_record) * adev->umc.retire_unit); + else + return -EOPNOTSUPP; + } + } + +out: + return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit); +} + +static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev, + struct eeprom_table_record *bps, struct ras_err_data *err_data, + enum amdgpu_memory_partition nps) +{ + int i = 0; + uint64_t chan_idx_v2; + enum amdgpu_memory_partition save_nps; + + if (!amdgpu_ras_smu_eeprom_supported(adev)) { + save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; + chan_idx_v2 = bps->retired_page & UMC_CHANNEL_IDX_V2; + bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); + } else { + /* if pmfw manages eeprom, save_nps is not stored on eeprom, + * we should always convert mca address into physical address, + * make save_nps different from nps + */ + save_nps = nps + 1; + } + + if (save_nps == nps) { + if (amdgpu_umc_pages_in_a_row(adev, err_data, + bps->retired_page << AMDGPU_GPU_PAGE_SHIFT)) + return -EINVAL; + for (i = 0; i < adev->umc.retire_unit; i++) { + err_data->err_addr[i].address = bps->address; + err_data->err_addr[i].mem_channel = bps->mem_channel; + err_data->err_addr[i].bank = bps->bank; + err_data->err_addr[i].err_type = bps->err_type; + err_data->err_addr[i].mcumc_id = bps->mcumc_id; + } + } else { + if (save_nps || chan_idx_v2) { + if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) + return -EINVAL; + } else { + /* for specific old eeprom data, mca address is not stored, + * calc it from pa + */ + if (bps->address == 0) + if (amdgpu_umc_pa2mca(adev, + bps->retired_page << AMDGPU_GPU_PAGE_SHIFT, + &(bps->address), + AMDGPU_NPS1_PARTITION_MODE)) + return -EINVAL; + + if (amdgpu_ras_mca2pa(adev, bps, err_data)) + return -EOPNOTSUPP; + } + } + + return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, + adev->umc.retire_unit); +} + /* it deal with vram only. */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, struct eeprom_table_record *bps, int pages, bool from_rom) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct ras_err_handler_data *data; struct ras_err_data err_data; - struct eeprom_table_record *err_rec; struct amdgpu_ras_eeprom_control *control = &adev->psp.ras_context.ras->eeprom_control; enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE; int ret = 0; - uint32_t i, j, loop_cnt = 1; - bool find_pages_per_pa = false; + uint32_t i = 0; if (!con || !con->eh_data || !bps || pages <= 0) return 0; if (from_rom) { err_data.err_addr = - kcalloc(adev->umc.retire_unit, - sizeof(struct eeprom_table_record), GFP_KERNEL); + kzalloc_objs(struct eeprom_table_record, + adev->umc.retire_unit); if (!err_data.err_addr) { dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n"); - ret = -ENOMEM; - goto out; + return -ENOMEM; } - err_rec = err_data.err_addr; - loop_cnt = adev->umc.retire_unit; if (adev->gmc.gmc_funcs->query_mem_partition_mode) nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); } mutex_lock(&con->recovery_lock); - data = con->eh_data; - if (!data) { - /* Returning 0 as the absence of eh_data is acceptable */ - goto free; - } - - for (i = 0; i < pages; i++) { - if (from_rom && - control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) { - if (!find_pages_per_pa) { - if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) { - if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) { - /* may use old RAS TA, use PA to find pages in - * one row - */ - if (amdgpu_umc_pages_in_a_row(adev, &err_data, - bps[i].retired_page << - AMDGPU_GPU_PAGE_SHIFT)) { - ret = -EINVAL; - goto free; - } else { - find_pages_per_pa = true; - } + + if (from_rom) { + /* there is no pa recs in V3, so skip pa recs processing */ + if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) && + !amdgpu_ras_smu_eeprom_supported(adev)) { + for (i = 0; i < pages; i++) { + if (control->ras_num_recs - i >= adev->umc.retire_unit) { + if ((bps[i].address == bps[i + 1].address) && + (bps[i].mem_channel == bps[i + 1].mem_channel)) { + /* deal with retire_unit records a time */ + ret = __amdgpu_ras_convert_rec_array_from_rom(adev, + &bps[i], &err_data, nps); + i += (adev->umc.retire_unit - 1); } else { - /* unsupported cases */ - ret = -EOPNOTSUPP; - goto free; - } - } - } else { - if (amdgpu_umc_pages_in_a_row(adev, &err_data, - bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) { - ret = -EINVAL; - goto free; - } - } - } else { - if (from_rom && !find_pages_per_pa) { - if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) { - /* bad page in any NPS mode in eeprom */ - if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) { - ret = -EINVAL; - goto free; + break; } } else { - /* legacy bad page in eeprom, generated only in - * NPS1 mode - */ - if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) { - /* old RAS TA or ASICs which don't support to - * convert addrss via mca address - */ - if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) { - find_pages_per_pa = true; - err_rec = &bps[i]; - loop_cnt = 1; - } else { - /* non-nps1 mode, old RAS TA - * can't support it - */ - ret = -EOPNOTSUPP; - goto free; - } - } + break; } - - if (!find_pages_per_pa) - i += (adev->umc.retire_unit - 1); - } else { - err_rec = &bps[i]; } } - - for (j = 0; j < loop_cnt; j++) { - if (amdgpu_ras_check_bad_page_unlock(con, - err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) - continue; - - if (!data->space_left && - amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { - ret = -ENOMEM; - goto free; - } - - amdgpu_ras_reserve_page(adev, err_rec[j].retired_page); - - memcpy(&data->bps[data->count], &(err_rec[j]), - sizeof(struct eeprom_table_record)); - data->count++; - data->space_left--; + for (; i < pages; i++) { + ret = __amdgpu_ras_convert_rec_from_rom(adev, + &bps[i], &err_data, nps); } + + con->eh_data->count_saved = con->eh_data->count; + } else { + ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages); } -free: if (from_rom) kfree(err_data.err_addr); -out: mutex_unlock(&con->recovery_lock); return ret; @@ -2944,7 +3321,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; struct amdgpu_ras_eeprom_control *control; - int save_count, unit_num, bad_page_num, i; + int save_count, unit_num, i; if (!con || !con->eh_data) { if (new_cnt) @@ -2953,31 +3330,45 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, return 0; } + if (!con->eeprom_control.is_eeprom_valid) { + dev_warn(adev->dev, + "Failed to save EEPROM table data because of EEPROM data corruption!"); + if (new_cnt) + *new_cnt = 0; + + return 0; + } + mutex_lock(&con->recovery_lock); control = &con->eeprom_control; data = con->eh_data; - bad_page_num = control->ras_num_bad_pages; - save_count = data->count - bad_page_num; + if (amdgpu_ras_smu_eeprom_supported(adev)) + unit_num = control->ras_num_recs - + control->ras_num_recs_old; + else + unit_num = data->count / adev->umc.retire_unit - + control->ras_num_recs; + + save_count = con->bad_page_num - control->ras_num_bad_pages; mutex_unlock(&con->recovery_lock); - unit_num = save_count / adev->umc.retire_unit; if (new_cnt) *new_cnt = unit_num; /* only new entries are saved */ - if (save_count > 0) { - if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA) { + if (unit_num && save_count) { + /*old asics only save pa to eeprom like before*/ + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) { if (amdgpu_ras_eeprom_append(control, - &data->bps[control->ras_num_recs], - save_count)) { + &data->bps[data->count_saved], unit_num)) { dev_err(adev->dev, "Failed to save EEPROM table data!"); return -EIO; } } else { for (i = 0; i < unit_num; i++) { if (amdgpu_ras_eeprom_append(control, - &data->bps[bad_page_num + i * adev->umc.retire_unit], - 1)) { + &data->bps[data->count_saved + + i * adev->umc.retire_unit], 1)) { dev_err(adev->dev, "Failed to save EEPROM table data!"); return -EIO; } @@ -2985,6 +3376,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, } dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); + data->count_saved = data->count; } return 0; @@ -2999,13 +3391,13 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) struct amdgpu_ras_eeprom_control *control = &adev->psp.ras_context.ras->eeprom_control; struct eeprom_table_record *bps; - int ret; + int ret, i = 0; /* no bad page record, skip eeprom access */ if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0) return 0; - bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL); + bps = kzalloc_objs(*bps, control->ras_num_recs); if (!bps) return -ENOMEM; @@ -3013,26 +3405,44 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) if (ret) { dev_err(adev->dev, "Failed to load EEPROM table records!"); } else { - if (control->ras_num_recs > 1 && - adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { - if ((bps[0].address == bps[1].address) && - (bps[0].mem_channel == bps[1].mem_channel)) - control->rec_type = AMDGPU_RAS_EEPROM_REC_PA; - else - control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA; + if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { + /*In V3, there is no pa recs, and some cases(when address==0) may be parsed + as pa recs, so add verion check to avoid it. + */ + if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) && + !amdgpu_ras_smu_eeprom_supported(adev)) { + for (i = 0; i < control->ras_num_recs; i++) { + if ((control->ras_num_recs - i) >= adev->umc.retire_unit) { + if ((bps[i].address == bps[i + 1].address) && + (bps[i].mem_channel == bps[i + 1].mem_channel)) { + control->ras_num_pa_recs += adev->umc.retire_unit; + i += (adev->umc.retire_unit - 1); + } else { + control->ras_num_mca_recs += + (control->ras_num_recs - i); + break; + } + } else { + control->ras_num_mca_recs += (control->ras_num_recs - i); + break; + } + } + } else { + control->ras_num_mca_recs = control->ras_num_recs; + } } + ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true); + if (ret) + goto out; + ret = amdgpu_ras_eeprom_check(control); if (ret) goto out; /* HW not usable */ - if (amdgpu_ras_is_rma(adev)) { + if (amdgpu_ras_is_rma(adev)) ret = -EHWPOISON; - goto out; - } - - ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true); } out: @@ -3040,18 +3450,24 @@ out: return ret; } -static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, +static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, uint64_t addr) { struct ras_err_handler_data *data = con->eh_data; + struct amdgpu_device *adev = con->adev; int i; + if ((addr >= adev->gmc.mc_vram_size && + adev->gmc.mc_vram_size) || + (addr >= RAS_UMC_INJECT_ADDR_LIMIT)) + return -EINVAL; + addr >>= AMDGPU_GPU_PAGE_SHIFT; for (i = 0; i < data->count; i++) if (addr == data->bps[i].retired_page) - return true; + return 1; - return false; + return 0; } /* @@ -3059,11 +3475,11 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, * * Note: this check is only for umc block */ -static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, +static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - bool ret = false; + int ret = 0; if (!con || !con->eh_data) return ret; @@ -3080,31 +3496,29 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); /* - * Justification of value bad_page_cnt_threshold in ras structure - * - * Generally, 0 <= amdgpu_bad_page_threshold <= max record length - * in eeprom or amdgpu_bad_page_threshold == -2, introduce two - * scenarios accordingly. - * - * Bad page retirement enablement: - * - If amdgpu_bad_page_threshold = -2, - * bad_page_cnt_threshold = typical value by formula. - * - * - When the value from user is 0 < amdgpu_bad_page_threshold < - * max record length in eeprom, use it directly. - * - * Bad page retirement disablement: - * - If amdgpu_bad_page_threshold = 0, bad page retirement - * functionality is disabled, and bad_page_cnt_threshold will - * take no effect. + * amdgpu_bad_page_threshold is used to config + * the threshold for the number of bad pages. + * -1: Threshold is set to default value + * Driver will issue a warning message when threshold is reached + * and continue runtime services. + * 0: Disable bad page retirement + * Driver will not retire bad pages + * which is intended for debugging purpose. + * -2: Threshold is determined by a formula + * that assumes 1 bad page per 100M of local memory. + * Driver will continue runtime services when threhold is reached. + * 0 < threshold < max number of bad page records in EEPROM, + * A user-defined threshold is set + * Driver will halt runtime services when this custom threshold is reached. */ - - if (amdgpu_bad_page_threshold < 0) { + if (amdgpu_bad_page_threshold == -2) { u64 val = adev->gmc.mc_vram_size; do_div(val, RAS_BAD_PAGE_COVER); con->bad_page_cnt_threshold = min(lower_32_bits(val), max_count); + } else if (amdgpu_bad_page_threshold == -1) { + con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4; } else { con->bad_page_cnt_threshold = min_t(int, max_count, amdgpu_bad_page_threshold); @@ -3149,7 +3563,7 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; + ecc_log->consumption_q_count = 0; } static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -3169,7 +3583,7 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) mutex_destroy(&ecc_log->lock); ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; + ecc_log->consumption_q_count = 0; } static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, @@ -3195,7 +3609,6 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) page_retirement_dwork.work); struct amdgpu_device *adev = con->adev; struct ras_err_data err_data; - unsigned long err_cnt; /* If gpu reset is ongoing, delay retiring the bad pages */ if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) { @@ -3207,13 +3620,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) amdgpu_ras_error_data_init(&err_data); amdgpu_umc_handle_bad_pages(adev, &err_data); - err_cnt = err_data.err_addr_cnt; amdgpu_ras_error_data_fini(&err_data); - if (err_cnt && amdgpu_ras_is_rma(adev)) - amdgpu_ras_reset_gpu(adev); - amdgpu_ras_schedule_retirement_dwork(con, AMDGPU_RAS_RETIRE_PAGE_INTERVAL); } @@ -3224,58 +3633,39 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, int ret = 0; struct ras_ecc_log_info *ecc_log; struct ras_query_if info; - uint32_t timeout = 0; + u32 timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - uint64_t de_queried_count; - uint32_t new_detect_count, total_detect_count; - uint32_t need_query_count = poison_creation_count; - bool query_data_timeout = false; + u64 de_queried_count; + u64 consumption_q_count; enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; memset(&info, 0, sizeof(info)); info.head.block = AMDGPU_RAS_BLOCK__UMC; ecc_log = &ras->umc_ecc_log; - total_detect_count = 0; + ecc_log->de_queried_count = 0; + ecc_log->consumption_q_count = 0; + do { ret = amdgpu_ras_query_error_status_with_event(adev, &info, type); if (ret) return ret; de_queried_count = ecc_log->de_queried_count; - if (de_queried_count > ecc_log->prev_de_queried_count) { - new_detect_count = de_queried_count - ecc_log->prev_de_queried_count; - ecc_log->prev_de_queried_count = de_queried_count; - timeout = 0; - } else { - new_detect_count = 0; - } - - if (new_detect_count) { - total_detect_count += new_detect_count; - } else { - if (!timeout && need_query_count) - timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; + consumption_q_count = ecc_log->consumption_q_count; - if (timeout) { - if (!--timeout) { - query_data_timeout = true; - break; - } - msleep(1); - } - } - } while (total_detect_count < need_query_count); + if (de_queried_count && consumption_q_count) + break; - if (query_data_timeout) { - dev_warn(adev->dev, "Can't find deferred error! count: %u\n", - (need_query_count - total_detect_count)); - return -ENOENT; - } + msleep(100); + } while (--timeout); - if (total_detect_count) + if (de_queried_count) schedule_delayed_work(&ras->page_retirement_dwork, 0); + if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0) + amdgpu_ras_reset_gpu(adev); + return 0; } @@ -3311,6 +3701,12 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, reset_flags |= msg.reset; } + /* + * Try to ensure poison creation handler is completed first + * to set rma if bad page exceed threshold. + */ + flush_delayed_work(&con->page_retirement_dwork); + /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ if (reset_flags && !amdgpu_ras_is_rma(adev)) { if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) @@ -3320,8 +3716,6 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, else reset = reset_flags; - flush_delayed_work(&con->page_retirement_dwork); - con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); @@ -3351,6 +3745,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) if (kthread_should_stop()) break; + mutex_lock(&con->poison_lock); gpu_reset = 0; do { @@ -3363,7 +3758,8 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_sub(poison_creation_count, &con->poison_creation_count); atomic_sub(poison_creation_count, &con->page_retirement_req_cnt); } - } while (atomic_read(&con->poison_creation_count)); + } while (atomic_read(&con->poison_creation_count) && + !atomic_read(&con->poison_consumption_count)); if (ret != -EIO) { msg_count = kfifo_len(&con->poison_fifo); @@ -3380,6 +3776,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */ /* Clear poison creation request */ atomic_set(&con->poison_creation_count, 0); + atomic_set(&con->poison_consumption_count, 0); /* Clear poison fifo */ amdgpu_ras_clear_poison_fifo(adev); @@ -3404,9 +3801,12 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_sub(msg_count, &con->page_retirement_req_cnt); } + atomic_set(&con->poison_consumption_count, 0); + /* Wake up work to save bad pages to eeprom */ schedule_delayed_work(&con->page_retirement_dwork, 0); } + mutex_unlock(&con->poison_lock); } return 0; @@ -3421,23 +3821,28 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) if (!con || amdgpu_sriov_vf(adev)) return 0; + if (amdgpu_uniras_enabled(adev)) + return 0; + control = &con->eeprom_control; + con->ras_smu_drv = amdgpu_dpm_get_ras_smu_driver(adev); + ret = amdgpu_ras_eeprom_init(control); - if (ret) - return ret; + control->is_eeprom_valid = !ret; if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) - control->rec_type = AMDGPU_RAS_EEPROM_REC_PA; + control->ras_num_pa_recs = control->ras_num_recs; - /* default status is MCA storage */ - if (control->ras_num_recs <= 1 && - adev->umc.ras && adev->umc.ras->convert_ras_err_addr) - control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA; + if (adev->umc.ras && + adev->umc.ras->get_retire_flip_bits) + adev->umc.ras->get_retire_flip_bits(adev); - if (control->ras_num_recs) { + if (control->ras_num_recs && control->is_eeprom_valid) { ret = amdgpu_ras_load_bad_pages(adev); - if (ret) - return ret; + if (ret) { + control->is_eeprom_valid = false; + return 0; + } amdgpu_dpm_send_hbm_bad_pages_num( adev, control->ras_num_bad_pages); @@ -3447,9 +3852,16 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) adev, control->bad_channel_bitmap); con->update_channel_flag = false; } + + /* The format action is only applied to new ASICs */ + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 && + control->tbl_hdr.version < RAS_TABLE_VER_V3) + if (!amdgpu_ras_eeprom_reset_table(control)) + if (amdgpu_ras_save_bad_pages(adev, NULL)) + dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n"); } - return ret; + return 0; } int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) @@ -3473,15 +3885,17 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) return 0; data = &con->eh_data; - *data = kzalloc(sizeof(**data), GFP_KERNEL); + *data = kzalloc_obj(**data); if (!*data) { ret = -ENOMEM; goto out; } mutex_init(&con->recovery_lock); + mutex_init(&con->poison_lock); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); atomic_set(&con->in_recovery, 0); + atomic_set(&con->rma_in_recovery, 0); con->eeprom_control.bad_channel_bitmap = 0; max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); @@ -3499,6 +3913,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) init_waitqueue_head(&con->page_retirement_wq); atomic_set(&con->page_retirement_req_cnt, 0); atomic_set(&con->poison_creation_count, 0); + atomic_set(&con->poison_consumption_count, 0); con->page_retirement_thread = kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement"); if (IS_ERR(con->page_retirement_thread)) { @@ -3571,6 +3986,10 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) kfree(data); mutex_unlock(&con->recovery_lock); + amdgpu_ras_critical_region_init(adev); +#ifdef CONFIG_X86_MCE_AMD + amdgpu_unregister_bad_pages_mca_notifier(adev); +#endif return 0; } /* recovery end */ @@ -3583,6 +4002,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) case IP_VERSION(13, 0, 6): case IP_VERSION(13, 0, 12): case IP_VERSION(13, 0, 14): + case IP_VERSION(13, 0, 15): return true; default: return false; @@ -3596,6 +4016,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) case IP_VERSION(13, 0, 10): case IP_VERSION(13, 0, 12): case IP_VERSION(13, 0, 14): + case IP_VERSION(13, 0, 15): case IP_VERSION(14, 0, 3): return true; default: @@ -3658,7 +4079,8 @@ static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev */ if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) || amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) || - amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3)) + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3) || + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(5, 0, 1)) adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | 1 << AMDGPU_RAS_BLOCK__JPEG); else @@ -3759,8 +4181,14 @@ init_ras_enabled_flag: adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : adev->ras_hw_enabled & amdgpu_ras_mask; - /* aca is disabled by default */ - adev->aca.is_enabled = false; + /* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14 */ + if (!amdgpu_sriov_vf(adev)) { + adev->aca.is_enabled = + (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 15)); + } /* bad page feature is not applicable to specific app platform */ if (adev->gmc.is_app_apu && @@ -3788,7 +4216,6 @@ static void amdgpu_ras_counte_dw(struct work_struct *work) atomic_set(&con->ras_ue_count, ue_count); } - pm_runtime_mark_last_busy(dev->dev); Out: pm_runtime_put_autosuspend(dev->dev); } @@ -3848,8 +4275,11 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev) case IP_VERSION(13, 0, 2): case IP_VERSION(13, 0, 6): case IP_VERSION(13, 0, 12): + case IP_VERSION(13, 0, 15): + con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT; + break; case IP_VERSION(13, 0, 14): - con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE; + con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1); break; default: break; @@ -3948,7 +4378,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) * to handle fatal error */ r = amdgpu_nbio_ras_sw_init(adev); if (r) - return r; + goto release_con; if (adev->nbio.ras && adev->nbio.ras->init_ras_controller_interrupt) { @@ -3989,6 +4419,12 @@ int amdgpu_ras_init(struct amdgpu_device *adev) goto release_con; } + con->init_task_pid = task_pid_nr(current); + get_task_comm(con->init_task_comm, current); + + mutex_init(&con->critical_region_lock); + INIT_LIST_HEAD(&con->critical_region_head); + dev_info(adev->dev, "RAS INFO: ras initialized successfully, " "hardware ability[%x] ras_mask[%x]\n", adev->ras_hw_enabled, adev->ras_enabled); @@ -4020,10 +4456,10 @@ static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev, return 0; if (amdgpu_ras_query_error_status(adev, &info) != 0) - DRM_WARN("RAS init harvest failure"); + drm_warn(adev_to_drm(adev), "RAS init query failure"); if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) - DRM_WARN("RAS init harvest reset failure"); + drm_warn(adev_to_drm(adev), "RAS init harvest reset failure"); return 0; } @@ -4089,7 +4525,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev, /* Those are the cached values at init. */ - query_info = kzalloc(sizeof(*query_info), GFP_KERNEL); + query_info = kzalloc_obj(*query_info); if (!query_info) return -ENOMEM; memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); @@ -4240,6 +4676,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); } + amdgpu_ras_check_bad_page_status(adev); + return 0; } @@ -4268,6 +4706,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) if (!adev->ras_enabled || !con) return 0; + amdgpu_ras_critical_region_fini(adev); + mutex_destroy(&con->critical_region_lock); + list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { if (ras_node->ras_obj) { obj = ras_node->ras_obj; @@ -4335,8 +4776,10 @@ void amdgpu_ras_clear_err_state(struct amdgpu_device *adev) struct amdgpu_ras *ras; ras = amdgpu_ras_get_context(adev); - if (ras) + if (ras) { ras->ras_err_state = 0; + ras->gpu_reset_flags = 0; + } } void amdgpu_ras_set_err_poison(struct amdgpu_device *adev, @@ -4384,6 +4827,9 @@ int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_ struct ras_event_state *event_state; int ret = 0; + if (amdgpu_uniras_enabled(adev)) + return 0; + if (type >= RAS_EVENT_TYPE_COUNT) { ret = -EINVAL; goto out; @@ -4434,17 +4880,18 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type return id; } -void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) +int amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); enum ras_event_type type = RAS_EVENT_TYPE_FATAL; - u64 event_id; + u64 event_id = RAS_EVENT_INVALID_ID; - if (amdgpu_ras_mark_ras_event(adev, type)) - return; + if (amdgpu_uniras_enabled(adev)) + return 0; - event_id = amdgpu_ras_acquire_event_id(adev, type); + if (!amdgpu_ras_mark_ras_event(adev, type)) + event_id = amdgpu_ras_acquire_event_id(adev, type); RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error" "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); @@ -4453,6 +4900,8 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; amdgpu_ras_reset_gpu(adev); } + + return -EBUSY; } bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) @@ -4580,6 +5029,28 @@ static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev) notifier_registered = true; } } +static void amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device *adev) +{ + int i, j; + + if (!notifier_registered && !mce_adev_list.num_gpu) + return; + for (i = 0, j = 0; i < mce_adev_list.num_gpu; i++) { + if (mce_adev_list.devs[i] == adev) + mce_adev_list.devs[i] = NULL; + if (!mce_adev_list.devs[i]) + ++j; + } + + if (j == mce_adev_list.num_gpu) { + mce_adev_list.num_gpu = 0; + /* Unregister x86 notifier with MCE subsystem. */ + if (notifier_registered) { + mce_unregister_decode_chain(&amdgpu_bad_page_nb); + notifier_registered = false; + } + } +} #endif struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev) @@ -4743,7 +5214,7 @@ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, if (!adev || !ras_block_obj) return -EINVAL; - ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL); + ras_node = kzalloc_obj(*ras_node); if (!ras_node) return -ENOMEM; @@ -4944,7 +5415,7 @@ static struct ras_err_node *amdgpu_ras_error_node_new(void) { struct ras_err_node *err_node; - err_node = kvzalloc(sizeof(*err_node), GFP_KERNEL); + err_node = kvzalloc_obj(*err_node); if (!err_node) return NULL; @@ -5071,11 +5542,11 @@ static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, * is changed. In such case, replace the aqua_vanjaram implementation * with more common helper */ reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + - aqua_vanjaram_encode_ext_smn_addressing(instance); + amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance); fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr); reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + - aqua_vanjaram_encode_ext_smn_addressing(instance); + amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance); boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr); socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); @@ -5127,9 +5598,9 @@ static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, "socket: %d, aid: %d, fw_status: 0x%x, data abort exception\n", socket_id, aid_id, fw_status); - if (AMDGPU_RAS_GPU_ERR_UNKNOWN(boot_error)) + if (AMDGPU_RAS_GPU_ERR_GENERIC(boot_error)) dev_info(adev->dev, - "socket: %d, aid: %d, fw_status: 0x%x, unknown boot time errors\n", + "socket: %d, aid: %d, fw_status: 0x%x, Boot Controller Generic Error\n", socket_id, aid_id, fw_status); } @@ -5141,7 +5612,7 @@ static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev, int retry_loop; reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + - aqua_vanjaram_encode_ext_smn_addressing(instance); + amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance); for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); @@ -5171,6 +5642,9 @@ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn) uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT; int ret = 0; + if (amdgpu_ras_check_critical_address(adev, start)) + return 0; + mutex_lock(&con->page_rsv_lock); ret = amdgpu_vram_mgr_query_page_status(mgr, start); if (ret == -ENOENT) @@ -5202,8 +5676,110 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + if (amdgpu_uniras_enabled(adev)) + return amdgpu_ras_mgr_is_rma(adev); + if (!con) return false; return con->is_rma; } + +int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, + struct amdgpu_bo *bo) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct amdgpu_vram_mgr_resource *vres; + struct ras_critical_region *region; + struct gpu_buddy_block *block; + int ret = 0; + + if (!bo || !bo->tbo.resource) + return -EINVAL; + + vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource); + + mutex_lock(&con->critical_region_lock); + + /* Check if the bo had been recorded */ + list_for_each_entry(region, &con->critical_region_head, node) + if (region->bo == bo) + goto out; + + /* Record new critical amdgpu bo */ + list_for_each_entry(block, &vres->blocks, link) { + region = kzalloc_obj(*region); + if (!region) { + ret = -ENOMEM; + goto out; + } + region->bo = bo; + region->start = amdgpu_vram_mgr_block_start(block); + region->size = amdgpu_vram_mgr_block_size(block); + list_add_tail(®ion->node, &con->critical_region_head); + } + +out: + mutex_unlock(&con->critical_region_lock); + + return ret; +} + +static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev) +{ + amdgpu_ras_add_critical_region(adev, adev->mman.resv_region[AMDGPU_RESV_FW].bo); +} + +static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_critical_region *region, *tmp; + + mutex_lock(&con->critical_region_lock); + list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) { + list_del(®ion->node); + kfree(region); + } + mutex_unlock(&con->critical_region_lock); +} + +bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_critical_region *region; + bool ret = false; + + mutex_lock(&con->critical_region_lock); + list_for_each_entry(region, &con->critical_region_head, node) { + if ((region->start <= addr) && + (addr < (region->start + region->size))) { + ret = true; + break; + } + } + mutex_unlock(&con->critical_region_lock); + + return ret; +} + +void amdgpu_ras_pre_reset(struct amdgpu_device *adev, + struct list_head *device_list) +{ + struct amdgpu_device *tmp_adev = NULL; + + list_for_each_entry(tmp_adev, device_list, reset_list) { + if (amdgpu_uniras_enabled(tmp_adev)) + amdgpu_ras_mgr_pre_reset(tmp_adev); + } +} + +void amdgpu_ras_post_reset(struct amdgpu_device *adev, + struct list_head *device_list) +{ + struct amdgpu_device *tmp_adev = NULL; + + list_for_each_entry(tmp_adev, device_list, reset_list) { + if (amdgpu_uniras_enabled(tmp_adev)) + amdgpu_ras_mgr_post_reset(tmp_adev); + } +} |
