diff options
author | Dave Airlie <airlied@redhat.com> | 2023-06-15 14:11:22 +1000 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2023-06-15 14:11:22 +1000 |
commit | 901bdf5ea1a836400ee69aa32b04e9c209271ec7 (patch) | |
tree | ccb1851c8a71e776dbccf1ccae132dc9b5f093c6 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | |
parent | ba57b9b11f78530146f02b776854b2b6b6d344a4 (diff) | |
parent | 3b718dcaf163d17fe907ea098c8449e0cd6bc271 (diff) | |
download | lwn-901bdf5ea1a836400ee69aa32b04e9c209271ec7.tar.gz lwn-901bdf5ea1a836400ee69aa32b04e9c209271ec7.zip |
Merge tag 'amd-drm-next-6.5-2023-06-09' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.5-2023-06-02:
amdgpu:
- SR-IOV fixes
- Warning fixes
- Misc code cleanups and spelling fixes
- DCN 3.2 updates
- Improved DC FAMS support for better power management
- Improved DC SubVP support for better power management
- DCN 3.1.x fixes
- Max IB size query
- DC GPU reset fixes
- RAS updates
- DCN 3.0.x fixes
- S/G display fixes
- CP shadow buffer support
- Implement connector force callback
- Z8 power improvements
- PSP 13.0.10 vbflash support
- Mode2 reset fixes
- Store MQDs in VRAM to improve queue switch latency
- VCN 3.x fixes
- JPEG 3.x fixes
- Enable DC_FP on LoongArch
- GFXOFF fixes
- GC 9.4.3 partition support
- SDMA 4.4.2 partition support
- VCN/JPEG 4.0.3 partition support
- VCN 4.0.3 updates
- NBIO 7.9 updates
- GC 9.4.3 updates
- Take NUMA into account when allocating memory
- Handle NUMA for partitions
- SMU 13.0.6 updates
- GC 9.4.3 RAS updates
- Stop including unused swiotlb.h
- SMU 13.0.7 fixes
- Fix clock output ordering on some APUs
- Clean up DC FPGA code
- GFX9 preemption fixes
- Misc irq fixes
- S0ix fixes
- Add new DRM_AMDGPU_WERROR config parameter to help with CI
- PCIe fix for RDNA2
- kdoc fixes
- Documentation updates
amdkfd:
- Query TTM mem limit rather than hardcoding it
- GC 9.4.3 partition support
- Handle NUMA for partitions
radeon:
- Fix possible double free
- Stop including unused swiotlb.h
- Fix possible division by zero
ttm:
- Add query for TTM mem limit
- Add NUMA awareness to pools
- Export ttm_pool_fini()
UAPI:
- Add new ctx query flag to better handle GPU resets
Mesa MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22290
- Add new interface to query and set shadow buffer for RDNA3
Mesa MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21986
- Add new INFO query for max IB size
Proposed userspace: https://gitlab.freedesktop.org/bnieuwenhuizen/mesa/-/commits/ib-rejection-v3
amd-drm-next-6.5-2023-06-09:
amdgpu:
- S0ix fixes
- Initial SMU13 Overdrive support
- kdoc fixes
- Misc clode cleanups
- Flexible array fixes
- Display OTG fixes
- SMU 13.0.6 updates
- Revert some broken clock counter updates
- Misc display fixes
- GFX9 preemption fixes
- Add support for newer EEPROM bad page table format
- Add missing radeon secondary id
- Add support for new colorspace KMS API
- CSA fix
- Stable pstate fixes for APUs
- make vbl interface admin only
- Handle PCI accelerator class
amdkfd:
- Add debugger support for gdb
radeon:
- Fix possible UAF
drm:
- Add Colorspace functionality
UAPI:
- Add debugger interface for enabling gdb
Proposed userspace: https://github.com/ROCm-Developer-Tools/ROCdbgapi/tree/wip-dbgapi
- Add KMS colorspace API
Discussion: https://lists.freedesktop.org/archives/dri-devel/2023-June/408128.html
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230609174817.7764-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 72 |
1 files changed, 71 insertions, 1 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 17b3d1992e80..46bf1889a9d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -32,6 +32,11 @@ struct amdgpu_iv_entry; #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) +/* position of instance value in sub_block_index of + * ta_ras_trigger_error_input, the sub block uses lower 12 bits + */ +#define AMDGPU_RAS_INST_MASK 0xfffff000 +#define AMDGPU_RAS_INST_SHIFT 0xc enum amdgpu_ras_block { AMDGPU_RAS_BLOCK__UMC = 0, @@ -314,6 +319,45 @@ enum amdgpu_ras_ret { AMDGPU_RAS_PT, }; +/* ras error status reisger fields */ +#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT 0x0 +#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK 0x00000001L +#define ERR_STATUS_LO__MEMORY_ID__SHIFT 0x18 +#define ERR_STATUS_LO__MEMORY_ID_MASK 0xFF000000L +#define ERR_STATUS_HI__ERR_INFO_VALID_FLAG__SHIFT 0x2 +#define ERR_STATUS_HI__ERR_INFO_VALID_FLAG_MASK 0x00000004L +#define ERR_STATUS__ERR_CNT__SHIFT 0x17 +#define ERR_STATUS__ERR_CNT_MASK 0x03800000L + +#define AMDGPU_RAS_REG_ENTRY(ip, inst, reg_lo, reg_hi) \ + ip##_HWIP, inst, reg_lo##_BASE_IDX, reg_lo, reg_hi##_BASE_IDX, reg_hi + +#define AMDGPU_RAS_REG_ENTRY_OFFSET(hwip, ip_inst, segment, reg) \ + (adev->reg_offset[hwip][ip_inst][segment] + (reg)) + +#define AMDGPU_RAS_ERR_INFO_VALID (1 << 0) +#define AMDGPU_RAS_ERR_STATUS_VALID (1 << 1) +#define AMDGPU_RAS_ERR_ADDRESS_VALID (1 << 2) + +#define AMDGPU_RAS_GPU_RESET_MODE2_RESET (0x1 << 0) + +struct amdgpu_ras_err_status_reg_entry { + uint32_t hwip; + uint32_t ip_inst; + uint32_t seg_lo; + uint32_t reg_lo; + uint32_t seg_hi; + uint32_t reg_hi; + uint32_t reg_inst; + uint32_t flags; + const char *block_name; +}; + +struct amdgpu_ras_memory_id_entry { + uint32_t memory_id; + const char *name; +}; + struct ras_common_if { enum amdgpu_ras_block block; enum amdgpu_ras_error_type type; @@ -385,6 +429,9 @@ struct amdgpu_ras { /* Indicates smu whether need update bad channel info */ bool update_channel_flag; + + /* Record special requirements of gpu reset caller */ + uint32_t gpu_reset_flags; }; struct ras_fs_data { @@ -471,6 +518,7 @@ struct ras_inject_if { struct ras_common_if head; uint64_t address; uint64_t value; + uint32_t instance_mask; }; struct ras_cure_if { @@ -508,7 +556,8 @@ struct amdgpu_ras_block_object { }; struct amdgpu_ras_block_hw_ops { - int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if); + int (*ras_error_inject)(struct amdgpu_device *adev, + void *inject_if, uint32_t instance_mask); void (*query_ras_error_count)(struct amdgpu_device *adev, void *ras_error_status); void (*query_ras_error_status)(struct amdgpu_device *adev); void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status); @@ -696,4 +745,25 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object *ras_block_obj); void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev); +void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name); +bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_entry, + uint32_t instance, + uint32_t *memory_id); +bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_entry, + uint32_t instance, + unsigned long *err_cnt); +void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_list, + uint32_t reg_list_size, + const struct amdgpu_ras_memory_id_entry *mem_list, + uint32_t mem_list_size, + uint32_t instance, + uint32_t err_type, + unsigned long *err_count); +void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_list, + uint32_t reg_list_size, + uint32_t instance); #endif |