diff options
author | Dave Airlie <airlied@redhat.com> | 2023-06-15 14:11:22 +1000 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2023-06-15 14:11:22 +1000 |
commit | 901bdf5ea1a836400ee69aa32b04e9c209271ec7 (patch) | |
tree | ccb1851c8a71e776dbccf1ccae132dc9b5f093c6 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | ba57b9b11f78530146f02b776854b2b6b6d344a4 (diff) | |
parent | 3b718dcaf163d17fe907ea098c8449e0cd6bc271 (diff) | |
download | lwn-901bdf5ea1a836400ee69aa32b04e9c209271ec7.tar.gz lwn-901bdf5ea1a836400ee69aa32b04e9c209271ec7.zip |
Merge tag 'amd-drm-next-6.5-2023-06-09' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.5-2023-06-02:
amdgpu:
- SR-IOV fixes
- Warning fixes
- Misc code cleanups and spelling fixes
- DCN 3.2 updates
- Improved DC FAMS support for better power management
- Improved DC SubVP support for better power management
- DCN 3.1.x fixes
- Max IB size query
- DC GPU reset fixes
- RAS updates
- DCN 3.0.x fixes
- S/G display fixes
- CP shadow buffer support
- Implement connector force callback
- Z8 power improvements
- PSP 13.0.10 vbflash support
- Mode2 reset fixes
- Store MQDs in VRAM to improve queue switch latency
- VCN 3.x fixes
- JPEG 3.x fixes
- Enable DC_FP on LoongArch
- GFXOFF fixes
- GC 9.4.3 partition support
- SDMA 4.4.2 partition support
- VCN/JPEG 4.0.3 partition support
- VCN 4.0.3 updates
- NBIO 7.9 updates
- GC 9.4.3 updates
- Take NUMA into account when allocating memory
- Handle NUMA for partitions
- SMU 13.0.6 updates
- GC 9.4.3 RAS updates
- Stop including unused swiotlb.h
- SMU 13.0.7 fixes
- Fix clock output ordering on some APUs
- Clean up DC FPGA code
- GFX9 preemption fixes
- Misc irq fixes
- S0ix fixes
- Add new DRM_AMDGPU_WERROR config parameter to help with CI
- PCIe fix for RDNA2
- kdoc fixes
- Documentation updates
amdkfd:
- Query TTM mem limit rather than hardcoding it
- GC 9.4.3 partition support
- Handle NUMA for partitions
radeon:
- Fix possible double free
- Stop including unused swiotlb.h
- Fix possible division by zero
ttm:
- Add query for TTM mem limit
- Add NUMA awareness to pools
- Export ttm_pool_fini()
UAPI:
- Add new ctx query flag to better handle GPU resets
Mesa MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22290
- Add new interface to query and set shadow buffer for RDNA3
Mesa MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21986
- Add new INFO query for max IB size
Proposed userspace: https://gitlab.freedesktop.org/bnieuwenhuizen/mesa/-/commits/ib-rejection-v3
amd-drm-next-6.5-2023-06-09:
amdgpu:
- S0ix fixes
- Initial SMU13 Overdrive support
- kdoc fixes
- Misc clode cleanups
- Flexible array fixes
- Display OTG fixes
- SMU 13.0.6 updates
- Revert some broken clock counter updates
- Misc display fixes
- GFX9 preemption fixes
- Add support for newer EEPROM bad page table format
- Add missing radeon secondary id
- Add support for new colorspace KMS API
- CSA fix
- Stable pstate fixes for APUs
- make vbl interface admin only
- Handle PCI accelerator class
amdkfd:
- Add debugger support for gdb
radeon:
- Fix possible UAF
drm:
- Add Colorspace functionality
UAPI:
- Add debugger interface for enabling gdb
Proposed userspace: https://github.com/ROCm-Developer-Tools/ROCdbgapi/tree/wip-dbgapi
- Add KMS colorspace API
Discussion: https://lists.freedesktop.org/archives/dri-devel/2023-June/408128.html
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230609174817.7764-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 230 |
1 files changed, 213 insertions, 17 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3ab8a88789c8..a6c3265cdbc4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -256,6 +256,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, int block_id; uint32_t sub_block; u64 address, value; + /* default value is 0 if the mask is not set by user */ + u32 instance_mask = 0; if (*pos) return -EINVAL; @@ -306,7 +308,11 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, data->op = op; if (op == 2) { - if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", + if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x", + &sub_block, &address, &value, &instance_mask) != 4 && + sscanf(str, "%*s %*s %*s %u %llu %llu %u", + &sub_block, &address, &value, &instance_mask) != 4 && + sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", &sub_block, &address, &value) != 3 && sscanf(str, "%*s %*s %*s %u %llu %llu", &sub_block, &address, &value) != 3) @@ -314,6 +320,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, data->head.sub_block_index = sub_block; data->inject.address = address; data->inject.value = value; + data->inject.instance_mask = instance_mask; } } else { if (size < sizeof(*data)) @@ -326,6 +333,46 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, return 0; } +static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev, + struct ras_debug_if *data) +{ + int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1; + uint32_t mask, inst_mask = data->inject.instance_mask; + + /* no need to set instance mask if there is only one instance */ + if (num_xcc <= 1 && inst_mask) { + data->inject.instance_mask = 0; + dev_dbg(adev->dev, + "RAS inject mask(0x%x) isn't supported and force it to 0.\n", + inst_mask); + + return; + } + + switch (data->head.block) { + case AMDGPU_RAS_BLOCK__GFX: + mask = GENMASK(num_xcc - 1, 0); + break; + case AMDGPU_RAS_BLOCK__SDMA: + mask = GENMASK(adev->sdma.num_instances - 1, 0); + break; + case AMDGPU_RAS_BLOCK__VCN: + case AMDGPU_RAS_BLOCK__JPEG: + mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0); + break; + default: + mask = inst_mask; + break; + } + + /* remove invalid bits in instance mask */ + data->inject.instance_mask &= mask; + if (inst_mask != data->inject.instance_mask) + dev_dbg(adev->dev, + "Adjust RAS inject mask 0x%x to 0x%x\n", + inst_mask, data->inject.instance_mask); +} + /** * DOC: AMDGPU RAS debugfs control interface * @@ -341,7 +388,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * sub_block_index: some IPs have subcomponets. say, GFX, sDMA. * name: the name of IP. * - * inject has two more members than head, they are address, value. + * inject has three more members than head, they are address, value and mask. * As their names indicate, inject operation will write the * value to the address. * @@ -365,7 +412,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * * echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl * echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl - * echo "inject <block> <error> <sub-block> <address> <value> > /sys/kernel/debug/dri/<N>/ras/ras_ctrl + * echo "inject <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl * * Where N, is the card which you want to affect. * @@ -382,13 +429,14 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * * The sub-block is a the sub-block index, pass 0 if there is no sub-block. * The address and value are hexadecimal numbers, leading 0x is optional. + * The mask means instance mask, is optional, default value is 0x1. * * For instance, * * .. code-block:: bash * * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl - * echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl + * echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl * * How to check the result of the operation? @@ -460,6 +508,8 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, break; } + amdgpu_ras_instance_mask_check(adev, &data); + /* data.inject.address is offset instead of absolute gpu address */ ret = amdgpu_ras_error_inject(adev, &data.inject); break; @@ -1115,15 +1165,15 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, block_info.address); } - if (info->head.block == AMDGPU_RAS_BLOCK__GFX) { - if (block_obj->hw_ops->ras_error_inject) - ret = block_obj->hw_ops->ras_error_inject(adev, info); + if (block_obj->hw_ops->ras_error_inject) { + if (info->head.block == AMDGPU_RAS_BLOCK__GFX) + ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask); + else /* Special ras_error_inject is defined (e.g: xgmi) */ + ret = block_obj->hw_ops->ras_error_inject(adev, &block_info, + info->instance_mask); } else { - /* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */ - if (block_obj->hw_ops->ras_error_inject) - ret = block_obj->hw_ops->ras_error_inject(adev, &block_info); - else /*If not defined .ras_error_inject, use default ras_error_inject*/ - ret = psp_ras_trigger_error(&adev->psp, &block_info); + /* default path */ + ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask); } if (ret) @@ -1597,8 +1647,7 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) { /* Fatal error events are handled on host side */ - if (amdgpu_sriov_vf(adev) || - !amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) + if (amdgpu_sriov_vf(adev)) return; if (adev->nbio.ras && @@ -2008,9 +2057,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) /* Perform full reset in fatal error mode */ if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - else + else { clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) { + ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET; + reset_context.method = AMD_RESET_METHOD_MODE2; + } + } + amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); } atomic_set(&ras->in_recovery, 0); @@ -2259,7 +2314,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(&con->in_recovery, 0); con->eeprom_control.bad_channel_bitmap = 0; - max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(); + max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); /* Todo: During test the SMU might fail to read the eeprom through I2C @@ -2625,7 +2680,8 @@ release_con: int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev) { - if (adev->gmc.xgmi.connected_to_cpu) + if (adev->gmc.xgmi.connected_to_cpu || + adev->gmc.is_app_apu) return 1; return 0; } @@ -3104,3 +3160,143 @@ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, return 0; } + +void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name) +{ + if (!err_type_name) + return; + + switch (err_type) { + case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE: + sprintf(err_type_name, "correctable"); + break; + case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE: + sprintf(err_type_name, "uncorrectable"); + break; + default: + sprintf(err_type_name, "unknown"); + break; + } +} + +bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_entry, + uint32_t instance, + uint32_t *memory_id) +{ + uint32_t err_status_lo_data, err_status_lo_offset; + + if (!reg_entry) + return false; + + err_status_lo_offset = + AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, + reg_entry->seg_lo, reg_entry->reg_lo); + err_status_lo_data = RREG32(err_status_lo_offset); + + if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) && + !REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG)) + return false; + + *memory_id = REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, MEMORY_ID); + + return true; +} + +bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_entry, + uint32_t instance, + unsigned long *err_cnt) +{ + uint32_t err_status_hi_data, err_status_hi_offset; + + if (!reg_entry) + return false; + + err_status_hi_offset = + AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, + reg_entry->seg_hi, reg_entry->reg_hi); + err_status_hi_data = RREG32(err_status_hi_offset); + + if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) && + !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG)) + /* keep the check here in case we need to refer to the result later */ + dev_dbg(adev->dev, "Invalid err_info field\n"); + + /* read err count */ + *err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT); + + return true; +} + +void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_list, + uint32_t reg_list_size, + const struct amdgpu_ras_memory_id_entry *mem_list, + uint32_t mem_list_size, + uint32_t instance, + uint32_t err_type, + unsigned long *err_count) +{ + uint32_t memory_id; + unsigned long err_cnt; + char err_type_name[16]; + uint32_t i, j; + + for (i = 0; i < reg_list_size; i++) { + /* query memory_id from err_status_lo */ + if (!amdgpu_ras_inst_get_memory_id_field(adev, ®_list[i], + instance, &memory_id)) + continue; + + /* query err_cnt from err_status_hi */ + if (!amdgpu_ras_inst_get_err_cnt_field(adev, ®_list[i], + instance, &err_cnt) || + !err_cnt) + continue; + + *err_count += err_cnt; + + /* log the errors */ + amdgpu_ras_get_error_type_name(err_type, err_type_name); + if (!mem_list) { + /* memory_list is not supported */ + dev_info(adev->dev, + "%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n", + err_cnt, err_type_name, + reg_list[i].block_name, + instance, memory_id); + } else { + for (j = 0; j < mem_list_size; j++) { + if (memory_id == mem_list[j].memory_id) { + dev_info(adev->dev, + "%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n", + err_cnt, err_type_name, + reg_list[i].block_name, + instance, mem_list[j].name); + break; + } + } + } + } +} + +void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_list, + uint32_t reg_list_size, + uint32_t instance) +{ + uint32_t err_status_lo_offset, err_status_hi_offset; + uint32_t i; + + for (i = 0; i < reg_list_size; i++) { + err_status_lo_offset = + AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance, + reg_list[i].seg_lo, reg_list[i].reg_lo); + err_status_hi_offset = + AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance, + reg_list[i].seg_hi, reg_list[i].reg_hi); + WREG32(err_status_lo_offset, 0); + WREG32(err_status_hi_offset, 0); + } +} |