diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
117 files changed, 3397 insertions, 8260 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index ba80542ead9d..5100e35027ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -70,7 +70,7 @@ amdgpu-y += amdgpu_device.o amdgpu_reg_access.o amdgpu_doorbell_mgr.o amdgpu_kms amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \ amdgpu_fw_attestation.o amdgpu_securedisplay.o \ amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o amdgpu_lockdep.o \ - amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o amdgpu_dev_coredump.o \ + amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_dev_coredump.o \ amdgpu_cper.o amdgpu_userq_fence.o amdgpu_eviction_fence.o amdgpu_ip.o amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 7b09410d6d8f..dd8ea71077af 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -44,6 +44,7 @@ #include <linux/hashtable.h> #include <linux/dma-fence.h> #include <linux/pci.h> +#include <linux/xarray.h> #include <drm/ttm/ttm_bo.h> #include <drm/ttm/ttm_placement.h> @@ -103,7 +104,6 @@ #include "amdgpu_smuio.h" #include "amdgpu_fdinfo.h" #include "amdgpu_mca.h" -#include "amdgpu_aca.h" #include "amdgpu_ras.h" #include "amdgpu_lockdep.h" #include "amdgpu_cper.h" @@ -113,6 +113,7 @@ #include "amdgpu_userq.h" #include "amdgpu_eviction_fence.h" #include "amdgpu_ip.h" +#include "amdgpu_sa.h" #if defined(CONFIG_DRM_AMD_ISP) #include "amdgpu_isp.h" #endif @@ -272,7 +273,6 @@ extern int amdgpu_ptl; extern uint amdgpu_hdmi_hpd_debounce_delay_ms; -#define AMDGPU_VM_MAX_NUM_CTX 4096 #define AMDGPU_SG_THRESHOLD (256*1024*1024) #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS 3000 #define AMDGPU_MAX_USEC_TIMEOUT 100000 /* 100 ms */ @@ -305,9 +305,10 @@ extern uint amdgpu_hdmi_hpd_debounce_delay_ms; /* reset mask */ #define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, mode1/mode2/BACO/etc. 
*/ -#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */ +#define AMDGPU_RESET_TYPE_SOFT_RECOVERY (1 << 1) /* soft recovery, eg. kill shaders */ #define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */ #define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */ +#define AMDGPU_RESET_TYPE_IP_BLOCK_SOFT_RESET (1 << 4) /* soft-resets an IP block */ /* max cursor sizes (in pixels) */ #define CIK_CURSOR_WIDTH 128 @@ -387,37 +388,6 @@ struct amdgpu_clock { uint32_t max_pixel_clock; }; -/* sub-allocation manager, it has to be protected by another lock. - * By conception this is an helper for other part of the driver - * like the indirect buffer or semaphore, which both have their - * locking. - * - * Principe is simple, we keep a list of sub allocation in offset - * order (first entry has offset == 0, last entry has the highest - * offset). - * - * When allocating new object we first check if there is room at - * the end total_size - (last_object_offset + last_object_size) >= - * alloc_size. If so we allocate new object there. - * - * When there is not enough room at the end, we start waiting for - * each sub object until we reach object_offset+object_size >= - * alloc_size, this object then become the sub object we return. - * - * Alignment can't be bigger than page size. - * - * Hole are not considered for allocation to keep things simple. - * Assumption is that there won't be hole (all object on same - * alignment). - */ - -struct amdgpu_sa_manager { - struct drm_suballoc_manager base; - struct amdgpu_bo *bo; - uint64_t gpu_addr; - void *cpu_ptr; -}; - /* * IRQS. 
*/ @@ -446,8 +416,7 @@ struct amdgpu_fpriv { struct amdgpu_bo_va *prt_va; struct amdgpu_bo_va *csa_va; struct amdgpu_bo_va *seq64_va; - struct mutex bo_list_lock; - struct idr bo_list_handles; + struct xarray bo_list_handles; struct amdgpu_ctx_mgr ctx_mgr; struct amdgpu_userq_mgr userq_mgr; @@ -587,8 +556,6 @@ struct amdgpu_asic_funcs { /* invalidate hdp read cache */ void (*invalidate_hdp)(struct amdgpu_device *adev, struct amdgpu_ring *ring); - /* check if the asic needs a full reset of if soft reset will work */ - bool (*need_full_reset)(struct amdgpu_device *adev); /* initialize doorbell layout for specific asic*/ void (*init_doorbell_index)(struct amdgpu_device *adev); /* PCIe bandwidth usage */ @@ -851,6 +818,7 @@ struct amdgpu_device { struct dev_pm_domain vga_pm_domain; bool have_disp_power_ref; bool have_atomics_support; + bool is_sw_smu; /* BIOS */ bool is_atom_fw; @@ -1022,9 +990,6 @@ struct amdgpu_device { /* MCA */ struct amdgpu_mca mca; - /* ACA */ - struct amdgpu_aca aca; - /* CPER */ struct amdgpu_cper cper; @@ -1136,6 +1101,8 @@ struct amdgpu_device { bool debug_vm_userptr; bool debug_disable_ce_logs; bool debug_enable_ce_cs; + bool debug_hibernation_thaw_resume_gpu; + bool debug_disable_ip_block_soft_reset; /* Protection for the following isolation structure */ struct mutex enforce_isolation_mutex; @@ -1356,7 +1323,6 @@ int emu_soc_asic_init(struct amdgpu_device *adev); #define amdgpu_asic_read_bios_from_rom(adev, b, l) (adev)->asic_funcs->read_bios_from_rom((adev), (b), (l)) #define amdgpu_asic_read_register(adev, se, sh, offset, v)((adev)->asic_funcs->read_register((adev), (se), (sh), (offset), (v))) #define amdgpu_asic_get_config_memsize(adev) (adev)->asic_funcs->get_config_memsize((adev)) -#define amdgpu_asic_need_full_reset(adev) (adev)->asic_funcs->need_full_reset((adev)) #define amdgpu_asic_init_doorbell_index(adev) (adev)->asic_funcs->init_doorbell_index((adev)) #define amdgpu_asic_get_pcie_usage(adev, cnt0, cnt1) 
((adev)->asic_funcs->get_pcie_usage((adev), (cnt0), (cnt1))) #define amdgpu_asic_need_reset_on_init(adev) (adev)->asic_funcs->need_reset_on_init((adev)) @@ -1468,6 +1434,8 @@ int amdgpu_enable_vblank_kms(struct drm_crtc *crtc); void amdgpu_disable_vblank_kms(struct drm_crtc *crtc); int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); +int amdgpu_proc_options_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp); /* * functions used by amdgpu_encoder.c diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c deleted file mode 100644 index db7858fe0c3d..000000000000 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ /dev/null @@ -1,985 +0,0 @@ -/* - * Copyright 2023 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ - -#include <linux/list.h> -#include "amdgpu.h" -#include "amdgpu_aca.h" -#include "amdgpu_ras.h" - -#define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype} - -typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data); - -static struct aca_hwip aca_hwid_mcatypes[ACA_HWIP_TYPE_COUNT] = { - ACA_BANK_HWID(SMU, 0x01, 0x01), - ACA_BANK_HWID(PCS_XGMI, 0x50, 0x00), - ACA_BANK_HWID(UMC, 0x96, 0x00), -}; - -static void aca_banks_init(struct aca_banks *banks) -{ - if (!banks) - return; - - memset(banks, 0, sizeof(*banks)); - INIT_LIST_HEAD(&banks->list); -} - -static int aca_banks_add_bank(struct aca_banks *banks, struct aca_bank *bank) -{ - struct aca_bank_node *node; - - if (!bank) - return -EINVAL; - - node = kvzalloc_obj(*node); - if (!node) - return -ENOMEM; - - memcpy(&node->bank, bank, sizeof(*bank)); - - INIT_LIST_HEAD(&node->node); - list_add_tail(&node->node, &banks->list); - - banks->nr_banks++; - - return 0; -} - -static void aca_banks_release(struct aca_banks *banks) -{ - struct aca_bank_node *node, *tmp; - - if (list_empty(&banks->list)) - return; - - list_for_each_entry_safe(node, tmp, &banks->list, node) { - list_del(&node->node); - kvfree(node); - banks->nr_banks--; - } -} - -static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count) -{ - struct amdgpu_aca *aca = &adev->aca; - const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; - - if (!count) - return -EINVAL; - - if (!smu_funcs || !smu_funcs->get_valid_aca_count) - return -EOPNOTSUPP; - - return smu_funcs->get_valid_aca_count(adev, type, count); -} - -static struct aca_regs_dump { - const char *name; - int reg_idx; -} aca_regs[] = { - {"CONTROL", ACA_REG_IDX_CTL}, - {"STATUS", ACA_REG_IDX_STATUS}, - {"ADDR", ACA_REG_IDX_ADDR}, - {"MISC", ACA_REG_IDX_MISC0}, - {"CONFIG", ACA_REG_IDX_CONFIG}, - {"IPID", ACA_REG_IDX_IPID}, - {"SYND", ACA_REG_IDX_SYND}, - {"DESTAT", 
ACA_REG_IDX_DESTAT}, - {"DEADDR", ACA_REG_IDX_DEADDR}, - {"CONTROL_MASK", ACA_REG_IDX_CTL_MASK}, -}; - -static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank, - struct ras_query_context *qctx) -{ - u64 event_id = qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID; - int i; - - if (adev->debug_disable_ce_logs && - bank->smu_err_type == ACA_SMU_TYPE_CE && - !ACA_BANK_ERR_IS_DEFFERED(bank)) - return; - - RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n"); - /* plus 1 for output format, e.g: ACA[08/08]: xxxx */ - for (i = 0; i < ARRAY_SIZE(aca_regs); i++) - RAS_EVENT_LOG(adev, event_id, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n", - idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]); - - if (ACA_REG__STATUS__SCRUB(bank->regs[ACA_REG_IDX_STATUS])) - RAS_EVENT_LOG(adev, event_id, HW_ERR "hardware error logged by the scrubber\n"); -} - -static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type type) -{ - - struct aca_hwip *hwip; - int hwid, mcatype; - u64 ipid; - - if (!bank || type == ACA_HWIP_TYPE_UNKNOW) - return false; - - hwip = &aca_hwid_mcatypes[type]; - if (!hwip->hwid) - return false; - - ipid = bank->regs[ACA_REG_IDX_IPID]; - hwid = ACA_REG__IPID__HARDWAREID(ipid); - mcatype = ACA_REG__IPID__MCATYPE(ipid); - - return hwip->hwid == hwid && hwip->mcatype == mcatype; -} - -static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type, - int start, int count, - struct aca_banks *banks, struct ras_query_context *qctx) -{ - struct amdgpu_aca *aca = &adev->aca; - const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; - struct aca_bank bank; - int i, max_count, ret; - - if (!count) - return 0; - - if (!smu_funcs || !smu_funcs->get_valid_aca_bank) - return -EOPNOTSUPP; - - switch (type) { - case ACA_SMU_TYPE_UE: - max_count = smu_funcs->max_ue_bank_count; - break; - case ACA_SMU_TYPE_CE: - max_count = smu_funcs->max_ce_bank_count; 
- break; - default: - return -EINVAL; - } - - if (start + count > max_count) - return -EINVAL; - - count = min_t(int, count, max_count); - for (i = 0; i < count; i++) { - memset(&bank, 0, sizeof(bank)); - ret = smu_funcs->get_valid_aca_bank(adev, type, start + i, &bank); - if (ret) - return ret; - - bank.smu_err_type = type; - - /* - * Poison being consumed when injecting a UE while running background workloads, - * which are unexpected. - */ - if (type == ACA_SMU_TYPE_UE && - ACA_REG__STATUS__POISON(bank.regs[ACA_REG_IDX_STATUS]) && - !aca_bank_hwip_is_matched(&bank, ACA_HWIP_TYPE_UMC)) - continue; - - aca_smu_bank_dump(adev, i, count, &bank, qctx); - - ret = aca_banks_add_bank(banks, &bank); - if (ret) - return ret; - } - - return 0; -} - -static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type) -{ - const struct aca_bank_ops *bank_ops = handle->bank_ops; - - /* Parse all deferred errors with UMC aca handle */ - if (ACA_BANK_ERR_IS_DEFFERED(bank)) - return handle->hwip == ACA_HWIP_TYPE_UMC; - - if (!aca_bank_hwip_is_matched(bank, handle->hwip)) - return false; - - if (!bank_ops->aca_bank_is_valid) - return true; - - return bank_ops->aca_bank_is_valid(handle, bank, type, handle->data); -} - -static struct aca_bank_error *new_bank_error(struct aca_error *aerr, struct aca_bank_info *info) -{ - struct aca_bank_error *bank_error; - - bank_error = kvzalloc_obj(*bank_error); - if (!bank_error) - return NULL; - - INIT_LIST_HEAD(&bank_error->node); - memcpy(&bank_error->info, info, sizeof(*info)); - - mutex_lock(&aerr->lock); - list_add_tail(&bank_error->node, &aerr->list); - aerr->nr_errors++; - mutex_unlock(&aerr->lock); - - return bank_error; -} - -static struct aca_bank_error *find_bank_error(struct aca_error *aerr, struct aca_bank_info *info) -{ - struct aca_bank_error *bank_error = NULL; - struct aca_bank_info *tmp_info; - bool found = false; - - mutex_lock(&aerr->lock); - list_for_each_entry(bank_error, &aerr->list, 
node) { - tmp_info = &bank_error->info; - if (tmp_info->socket_id == info->socket_id && - tmp_info->die_id == info->die_id) { - found = true; - goto out_unlock; - } - } - -out_unlock: - mutex_unlock(&aerr->lock); - - return found ? bank_error : NULL; -} - -static void aca_bank_error_remove(struct aca_error *aerr, struct aca_bank_error *bank_error) -{ - if (!aerr || !bank_error) - return; - - list_del(&bank_error->node); - aerr->nr_errors--; - - kvfree(bank_error); -} - -static struct aca_bank_error *get_bank_error(struct aca_error *aerr, struct aca_bank_info *info) -{ - struct aca_bank_error *bank_error; - - if (!aerr || !info) - return NULL; - - bank_error = find_bank_error(aerr, info); - if (bank_error) - return bank_error; - - return new_bank_error(aerr, info); -} - -int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info, - enum aca_error_type type, u64 count) -{ - struct aca_error_cache *error_cache = &handle->error_cache; - struct aca_bank_error *bank_error; - struct aca_error *aerr; - - if (!handle || !info || type >= ACA_ERROR_TYPE_COUNT) - return -EINVAL; - - if (!count) - return 0; - - aerr = &error_cache->errors[type]; - bank_error = get_bank_error(aerr, info); - if (!bank_error) - return -ENOMEM; - - bank_error->count += count; - - return 0; -} - -static int aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type) -{ - const struct aca_bank_ops *bank_ops = handle->bank_ops; - - if (!bank) - return -EINVAL; - - if (!bank_ops->aca_bank_parser) - return -EOPNOTSUPP; - - return bank_ops->aca_bank_parser(handle, bank, type, - handle->data); -} - -static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - int ret; - - ret = aca_bank_parser(handle, bank, type); - if (ret) - return ret; - - return 0; -} - -static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank, - enum aca_smu_type type, 
bank_handler_t handler, void *data) -{ - struct aca_handle *handle; - int ret; - - if (list_empty(&mgr->list)) - return 0; - - list_for_each_entry(handle, &mgr->list, node) { - if (!aca_bank_is_valid(handle, bank, type)) - continue; - - ret = handler(handle, bank, type, data); - if (ret) - return ret; - } - - return 0; -} - -static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks, - enum aca_smu_type type, bank_handler_t handler, void *data) -{ - struct aca_bank_node *node; - struct aca_bank *bank; - int ret; - - if (!mgr || !banks) - return -EINVAL; - - /* pre check to avoid unnecessary operations */ - if (list_empty(&mgr->list) || list_empty(&banks->list)) - return 0; - - list_for_each_entry(node, &banks->list, node) { - bank = &node->bank; - - ret = aca_dispatch_bank(mgr, bank, type, handler, data); - if (ret) - return ret; - } - - return 0; -} - -static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type) -{ - struct amdgpu_aca *aca = &adev->aca; - bool ret = true; - - /* - * Because the UE Valid MCA count will only be cleared after reset, - * in order to avoid repeated counting of the error count, - * the aca bank is only updated once during the gpu recovery stage. 
- */ - if (type == ACA_SMU_TYPE_UE) { - if (amdgpu_ras_intr_triggered()) - ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0; - else - atomic_set(&aca->ue_update_flag, 0); - } - - return ret; -} - -static void aca_banks_generate_cper(struct amdgpu_device *adev, - enum aca_smu_type type, - struct aca_banks *banks, - int count) -{ - struct aca_bank_node *node; - struct aca_bank *bank; - int r; - - if (!adev->cper.enabled) - return; - - if (!banks || !count) { - dev_warn(adev->dev, "fail to generate cper records\n"); - return; - } - - /* UEs must be encoded into separate CPER entries */ - if (type == ACA_SMU_TYPE_UE) { - struct aca_banks de_banks; - - aca_banks_init(&de_banks); - list_for_each_entry(node, &banks->list, node) { - bank = &node->bank; - if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) { - r = aca_banks_add_bank(&de_banks, bank); - if (r) - dev_warn(adev->dev, "fail to add de banks, ret = %d\n", r); - } else { - if (amdgpu_cper_generate_ue_record(adev, bank)) - dev_warn(adev->dev, "fail to generate ue cper records\n"); - } - } - - if (!list_empty(&de_banks.list)) { - if (amdgpu_cper_generate_ce_records(adev, &de_banks, de_banks.nr_banks)) - dev_warn(adev->dev, "fail to generate de cper records\n"); - } - - aca_banks_release(&de_banks); - } else { - /* - * SMU_TYPE_CE banks are combined into 1 CPER entries, - * they could be CEs or DEs or both - */ - if (amdgpu_cper_generate_ce_records(adev, banks, count)) - dev_warn(adev->dev, "fail to generate ce cper records\n"); - } -} - -static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type, - bank_handler_t handler, struct ras_query_context *qctx, void *data) -{ - struct amdgpu_aca *aca = &adev->aca; - struct aca_banks banks; - u32 count = 0; - int ret; - - if (list_empty(&aca->mgr.list)) - return 0; - - if (!aca_bank_should_update(adev, type)) - return 0; - - ret = aca_smu_get_valid_aca_count(adev, type, &count); - if (ret) - return ret; - - if (!count) - return 0; - - 
aca_banks_init(&banks); - - ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx); - if (ret) - goto err_release_banks; - - if (list_empty(&banks.list)) { - ret = 0; - goto err_release_banks; - } - - ret = aca_dispatch_banks(&aca->mgr, &banks, type, - handler, data); - if (ret) - goto err_release_banks; - - aca_banks_generate_cper(adev, type, &banks, count); - -err_release_banks: - aca_banks_release(&banks); - - return ret; -} - -static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_error_type type, struct ras_err_data *err_data) -{ - struct aca_bank_info *info; - struct amdgpu_smuio_mcm_config_info mcm_info; - u64 count; - - if (type >= ACA_ERROR_TYPE_COUNT) - return -EINVAL; - - count = bank_error->count; - if (!count) - return 0; - - info = &bank_error->info; - mcm_info.die_id = info->die_id; - mcm_info.socket_id = info->socket_id; - - switch (type) { - case ACA_ERROR_TYPE_UE: - amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, count); - break; - case ACA_ERROR_TYPE_CE: - amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, count); - break; - case ACA_ERROR_TYPE_DEFERRED: - amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, count); - break; - default: - break; - } - - return 0; -} - -static int aca_log_aca_error(struct aca_handle *handle, enum aca_error_type type, struct ras_err_data *err_data) -{ - struct aca_error_cache *error_cache = &handle->error_cache; - struct aca_error *aerr = &error_cache->errors[type]; - struct aca_bank_error *bank_error, *tmp; - - mutex_lock(&aerr->lock); - - if (list_empty(&aerr->list)) - goto out_unlock; - - list_for_each_entry_safe(bank_error, tmp, &aerr->list, node) { - aca_log_aca_error_data(bank_error, type, err_data); - aca_bank_error_remove(aerr, bank_error); - } - -out_unlock: - mutex_unlock(&aerr->lock); - - return 0; -} - -static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type, - struct ras_err_data *err_data, 
struct ras_query_context *qctx) -{ - enum aca_smu_type smu_type; - int ret; - - switch (type) { - case ACA_ERROR_TYPE_UE: - smu_type = ACA_SMU_TYPE_UE; - break; - case ACA_ERROR_TYPE_CE: - case ACA_ERROR_TYPE_DEFERRED: - smu_type = ACA_SMU_TYPE_CE; - break; - default: - return -EINVAL; - } - - /* update aca bank to aca source error_cache first */ - ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL); - if (ret) - return ret; - - /* DEs may contain in CEs or UEs */ - if (type != ACA_ERROR_TYPE_DEFERRED) - aca_log_aca_error(handle, ACA_ERROR_TYPE_DEFERRED, err_data); - - return aca_log_aca_error(handle, type, err_data); -} - -static bool aca_handle_is_valid(struct aca_handle *handle) -{ - if (!handle->mask || !list_empty(&handle->node)) - return false; - - return true; -} - -int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, - enum aca_error_type type, struct ras_err_data *err_data, - struct ras_query_context *qctx) -{ - if (!handle || !err_data) - return -EINVAL; - - if (aca_handle_is_valid(handle)) - return -EOPNOTSUPP; - - if ((type < 0) || (!(BIT(type) & handle->mask))) - return 0; - - return __aca_get_error_data(adev, handle, type, err_data, qctx); -} - -static void aca_error_init(struct aca_error *aerr, enum aca_error_type type) -{ - mutex_init(&aerr->lock); - INIT_LIST_HEAD(&aerr->list); - aerr->type = type; - aerr->nr_errors = 0; -} - -static void aca_init_error_cache(struct aca_handle *handle) -{ - struct aca_error_cache *error_cache = &handle->error_cache; - int type; - - for (type = ACA_ERROR_TYPE_UE; type < ACA_ERROR_TYPE_COUNT; type++) - aca_error_init(&error_cache->errors[type], type); -} - -static void aca_error_fini(struct aca_error *aerr) -{ - struct aca_bank_error *bank_error, *tmp; - - mutex_lock(&aerr->lock); - if (list_empty(&aerr->list)) - goto out_unlock; - - list_for_each_entry_safe(bank_error, tmp, &aerr->list, node) - aca_bank_error_remove(aerr, bank_error); - -out_unlock: - 
mutex_unlock(&aerr->lock); - mutex_destroy(&aerr->lock); -} - -static void aca_fini_error_cache(struct aca_handle *handle) -{ - struct aca_error_cache *error_cache = &handle->error_cache; - int type; - - for (type = ACA_ERROR_TYPE_UE; type < ACA_ERROR_TYPE_COUNT; type++) - aca_error_fini(&error_cache->errors[type]); -} - -static int add_aca_handle(struct amdgpu_device *adev, struct aca_handle_manager *mgr, struct aca_handle *handle, - const char *name, const struct aca_info *ras_info, void *data) -{ - memset(handle, 0, sizeof(*handle)); - - handle->adev = adev; - handle->mgr = mgr; - handle->name = name; - handle->hwip = ras_info->hwip; - handle->mask = ras_info->mask; - handle->bank_ops = ras_info->bank_ops; - handle->data = data; - aca_init_error_cache(handle); - - INIT_LIST_HEAD(&handle->node); - list_add_tail(&handle->node, &mgr->list); - mgr->nr_handles++; - - return 0; -} - -static ssize_t aca_sysfs_read(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct aca_handle *handle = container_of(attr, struct aca_handle, aca_attr); - - /* NOTE: the aca cache will be auto cleared once read, - * So the driver should unify the query entry point, forward request to ras query interface directly */ - return amdgpu_ras_aca_sysfs_read(dev, attr, handle, buf, handle->data); -} - -static int add_aca_sysfs(struct amdgpu_device *adev, struct aca_handle *handle) -{ - struct device_attribute *aca_attr = &handle->aca_attr; - - snprintf(handle->attr_name, sizeof(handle->attr_name) - 1, "aca_%s", handle->name); - aca_attr->show = aca_sysfs_read; - aca_attr->attr.name = handle->attr_name; - aca_attr->attr.mode = S_IRUGO; - sysfs_attr_init(&aca_attr->attr); - - return sysfs_add_file_to_group(&adev->dev->kobj, - &aca_attr->attr, - "ras"); -} - -int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle, - const char *name, const struct aca_info *ras_info, void *data) -{ - struct amdgpu_aca *aca = &adev->aca; - int ret; - - if 
(!amdgpu_aca_is_enabled(adev)) - return 0; - - ret = add_aca_handle(adev, &aca->mgr, handle, name, ras_info, data); - if (ret) - return ret; - - return add_aca_sysfs(adev, handle); -} - -static void remove_aca_handle(struct aca_handle *handle) -{ - struct aca_handle_manager *mgr = handle->mgr; - - aca_fini_error_cache(handle); - list_del(&handle->node); - mgr->nr_handles--; -} - -static void remove_aca_sysfs(struct aca_handle *handle) -{ - struct amdgpu_device *adev = handle->adev; - struct device_attribute *aca_attr = &handle->aca_attr; - - if (adev->dev->kobj.sd) - sysfs_remove_file_from_group(&adev->dev->kobj, - &aca_attr->attr, - "ras"); -} - -void amdgpu_aca_remove_handle(struct aca_handle *handle) -{ - if (!handle || list_empty(&handle->node)) - return; - - remove_aca_sysfs(handle); - remove_aca_handle(handle); -} - -static int aca_manager_init(struct aca_handle_manager *mgr) -{ - INIT_LIST_HEAD(&mgr->list); - mgr->nr_handles = 0; - - return 0; -} - -static void aca_manager_fini(struct aca_handle_manager *mgr) -{ - struct aca_handle *handle, *tmp; - - if (list_empty(&mgr->list)) - return; - - list_for_each_entry_safe(handle, tmp, &mgr->list, node) - amdgpu_aca_remove_handle(handle); -} - -bool amdgpu_aca_is_enabled(struct amdgpu_device *adev) -{ - return (adev->aca.is_enabled || - adev->debug_enable_ras_aca); -} - -int amdgpu_aca_init(struct amdgpu_device *adev) -{ - struct amdgpu_aca *aca = &adev->aca; - int ret; - - atomic_set(&aca->ue_update_flag, 0); - - ret = aca_manager_init(&aca->mgr); - if (ret) - return ret; - - return 0; -} - -void amdgpu_aca_fini(struct amdgpu_device *adev) -{ - struct amdgpu_aca *aca = &adev->aca; - - aca_manager_fini(&aca->mgr); - - atomic_set(&aca->ue_update_flag, 0); -} - -int amdgpu_aca_reset(struct amdgpu_device *adev) -{ - struct amdgpu_aca *aca = &adev->aca; - - atomic_set(&aca->ue_update_flag, 0); - - return 0; -} - -void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs) -{ - 
struct amdgpu_aca *aca = &adev->aca; - - WARN_ON(aca->smu_funcs); - aca->smu_funcs = smu_funcs; -} - -int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info) -{ - u64 ipid; - u32 instidhi, instidlo; - - if (!bank || !info) - return -EINVAL; - - ipid = bank->regs[ACA_REG_IDX_IPID]; - info->hwid = ACA_REG__IPID__HARDWAREID(ipid); - info->mcatype = ACA_REG__IPID__MCATYPE(ipid); - /* - * Unfied DieID Format: SAASS. A:AID, S:Socket. - * Unfied DieID[4:4] = InstanceId[0:0] - * Unfied DieID[0:3] = InstanceIdHi[0:3] - */ - instidhi = ACA_REG__IPID__INSTANCEIDHI(ipid); - instidlo = ACA_REG__IPID__INSTANCEIDLO(ipid); - info->die_id = ((instidhi >> 2) & 0x03); - info->socket_id = ((instidlo & 0x1) << 2) | (instidhi & 0x03); - - return 0; -} - -static int aca_bank_get_error_code(struct amdgpu_device *adev, struct aca_bank *bank) -{ - struct amdgpu_aca *aca = &adev->aca; - const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; - - if (!smu_funcs || !smu_funcs->parse_error_code) - return -EOPNOTSUPP; - - return smu_funcs->parse_error_code(adev, bank); -} - -int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size) -{ - int i, error_code; - - if (!bank || !err_codes) - return -EINVAL; - - error_code = aca_bank_get_error_code(adev, bank); - if (error_code < 0) - return error_code; - - for (i = 0; i < size; i++) { - if (err_codes[i] == error_code) - return 0; - } - - return -EINVAL; -} - -int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en) -{ - struct amdgpu_aca *aca = &adev->aca; - const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; - - if (!smu_funcs || !smu_funcs->set_debug_mode) - return -EOPNOTSUPP; - - return smu_funcs->set_debug_mode(adev, en); -} - -#if defined(CONFIG_DEBUG_FS) -static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)data; - int ret; - - ret = amdgpu_ras_set_aca_debug_mode(adev, val ? 
true : false); - if (ret) - return ret; - - dev_info(adev->dev, "amdgpu set smu aca debug mode %s success\n", val ? "on" : "off"); - - return 0; -} - -static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_smu_type type, int idx) -{ - struct aca_bank_info info; - int i, ret; - - ret = aca_bank_info_decode(bank, &info); - if (ret) - return; - - seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_SMU_TYPE_UE ? "UE" : "CE"); - seq_printf(m, "aca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n", - idx, info.socket_id, info.die_id, info.hwid, info.mcatype); - - for (i = 0; i < ARRAY_SIZE(aca_regs); i++) - seq_printf(m, "aca entry[%d].regs[%d]: 0x%016llx\n", idx, aca_regs[i].reg_idx, bank->regs[aca_regs[i].reg_idx]); -} - -struct aca_dump_context { - struct seq_file *m; - int idx; -}; - -static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - struct aca_dump_context *ctx = (struct aca_dump_context *)data; - - aca_dump_entry(ctx->m, bank, type, ctx->idx++); - - return handler_aca_log_bank_error(handle, bank, type, NULL); -} - -static int aca_dump_show(struct seq_file *m, enum aca_smu_type type) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)m->private; - struct aca_dump_context context = { - .m = m, - .idx = 0, - }; - - return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context); -} - -static int aca_dump_ce_show(struct seq_file *m, void *unused) -{ - return aca_dump_show(m, ACA_SMU_TYPE_CE); -} - -static int aca_dump_ce_open(struct inode *inode, struct file *file) -{ - return single_open(file, aca_dump_ce_show, inode->i_private); -} - -static const struct file_operations aca_ce_dump_debug_fops = { - .owner = THIS_MODULE, - .open = aca_dump_ce_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int aca_dump_ue_show(struct seq_file *m, void *unused) -{ - return aca_dump_show(m, 
ACA_SMU_TYPE_UE); -} - -static int aca_dump_ue_open(struct inode *inode, struct file *file) -{ - return single_open(file, aca_dump_ue_show, inode->i_private); -} - -static const struct file_operations aca_ue_dump_debug_fops = { - .owner = THIS_MODULE, - .open = aca_dump_ue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -DEFINE_DEBUGFS_ATTRIBUTE(aca_debug_mode_fops, NULL, amdgpu_aca_smu_debug_mode_set, "%llu\n"); -#endif - -void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root) -{ -#if defined(CONFIG_DEBUG_FS) - if (!root) - return; - - debugfs_create_file("aca_debug_mode", 0200, root, adev, &aca_debug_mode_fops); - debugfs_create_file("aca_ue_dump", 0400, root, adev, &aca_ue_dump_debug_fops); - debugfs_create_file("aca_ce_dump", 0400, root, adev, &aca_ce_dump_debug_fops); -#endif -} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h deleted file mode 100644 index 38c88897e1ec..000000000000 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright 2023 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#ifndef __AMDGPU_ACA_H__ -#define __AMDGPU_ACA_H__ - -#include <linux/list.h> - -struct ras_err_data; -struct ras_query_context; - -#define ACA_MAX_REGS_COUNT (16) - -#define ACA_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> l) -#define ACA_REG__STATUS__VAL(x) ACA_REG_FIELD(x, 63, 63) -#define ACA_REG__STATUS__OVERFLOW(x) ACA_REG_FIELD(x, 62, 62) -#define ACA_REG__STATUS__UC(x) ACA_REG_FIELD(x, 61, 61) -#define ACA_REG__STATUS__EN(x) ACA_REG_FIELD(x, 60, 60) -#define ACA_REG__STATUS__MISCV(x) ACA_REG_FIELD(x, 59, 59) -#define ACA_REG__STATUS__ADDRV(x) ACA_REG_FIELD(x, 58, 58) -#define ACA_REG__STATUS__PCC(x) ACA_REG_FIELD(x, 57, 57) -#define ACA_REG__STATUS__ERRCOREIDVAL(x) ACA_REG_FIELD(x, 56, 56) -#define ACA_REG__STATUS__TCC(x) ACA_REG_FIELD(x, 55, 55) -#define ACA_REG__STATUS__SYNDV(x) ACA_REG_FIELD(x, 53, 53) -#define ACA_REG__STATUS__CECC(x) ACA_REG_FIELD(x, 46, 46) -#define ACA_REG__STATUS__UECC(x) ACA_REG_FIELD(x, 45, 45) -#define ACA_REG__STATUS__DEFERRED(x) ACA_REG_FIELD(x, 44, 44) -#define ACA_REG__STATUS__POISON(x) ACA_REG_FIELD(x, 43, 43) -#define ACA_REG__STATUS__SCRUB(x) ACA_REG_FIELD(x, 40, 40) -#define ACA_REG__STATUS__ERRCOREID(x) ACA_REG_FIELD(x, 37, 32) -#define ACA_REG__STATUS__ADDRLSB(x) ACA_REG_FIELD(x, 29, 24) -#define ACA_REG__STATUS__ERRORCODEEXT(x) ACA_REG_FIELD(x, 21, 16) -#define ACA_REG__STATUS__ERRORCODE(x) ACA_REG_FIELD(x, 15, 0) - -#define ACA_REG__IPID__MCATYPE(x) ACA_REG_FIELD(x, 63, 48) -#define ACA_REG__IPID__INSTANCEIDHI(x) ACA_REG_FIELD(x, 47, 44) -#define ACA_REG__IPID__HARDWAREID(x) ACA_REG_FIELD(x, 43, 32) -#define ACA_REG__IPID__INSTANCEIDLO(x) ACA_REG_FIELD(x, 31, 0) - -#define ACA_REG__MISC0__VALID(x) ACA_REG_FIELD(x, 63, 63) 
-#define ACA_REG__MISC0__OVRFLW(x) ACA_REG_FIELD(x, 48, 48) -#define ACA_REG__MISC0__ERRCNT(x) ACA_REG_FIELD(x, 43, 32) - -#define ACA_REG__SYND__ERRORINFORMATION(x) ACA_REG_FIELD(x, 17, 0) - -/* NOTE: The following codes refers to the smu header file */ -#define ACA_EXTERROR_CODE_CE 0x3a -#define ACA_EXTERROR_CODE_FAULT 0x3b - -#define ACA_ERROR_UE_MASK BIT_MASK(ACA_ERROR_TYPE_UE) -#define ACA_ERROR_CE_MASK BIT_MASK(ACA_ERROR_TYPE_CE) -#define ACA_ERROR_DEFERRED_MASK BIT_MASK(ACA_ERROR_TYPE_DEFERRED) - -#define mmSMNAID_AID0_MCA_SMU 0x03b30400 /* SMN AID AID0 */ -#define mmSMNAID_XCD0_MCA_SMU 0x36430400 /* SMN AID XCD0 */ -#define mmSMNAID_XCD1_MCA_SMU 0x38430400 /* SMN AID XCD1 */ -#define mmSMNXCD_XCD0_MCA_SMU 0x40430400 /* SMN XCD XCD0 */ - -#define ACA_BANK_ERR_IS_DEFFERED(bank) \ - (ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \ - ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS])) - -enum aca_reg_idx { - ACA_REG_IDX_CTL = 0, - ACA_REG_IDX_STATUS = 1, - ACA_REG_IDX_ADDR = 2, - ACA_REG_IDX_MISC0 = 3, - ACA_REG_IDX_CONFIG = 4, - ACA_REG_IDX_IPID = 5, - ACA_REG_IDX_SYND = 6, - ACA_REG_IDX_DESTAT = 8, - ACA_REG_IDX_DEADDR = 9, - ACA_REG_IDX_CTL_MASK = 10, - ACA_REG_IDX_COUNT = 16, -}; - -enum aca_hwip_type { - ACA_HWIP_TYPE_UNKNOW = -1, - ACA_HWIP_TYPE_PSP = 0, - ACA_HWIP_TYPE_UMC, - ACA_HWIP_TYPE_SMU, - ACA_HWIP_TYPE_PCS_XGMI, - ACA_HWIP_TYPE_COUNT, -}; - -enum aca_error_type { - ACA_ERROR_TYPE_INVALID = -1, - ACA_ERROR_TYPE_UE = 0, - ACA_ERROR_TYPE_CE, - ACA_ERROR_TYPE_DEFERRED, - ACA_ERROR_TYPE_COUNT -}; - -enum aca_smu_type { - ACA_SMU_TYPE_INVALID = -1, - ACA_SMU_TYPE_UE = 0, - ACA_SMU_TYPE_CE, - ACA_SMU_TYPE_COUNT, -}; - -struct aca_hwip { - int hwid; - int mcatype; -}; - -struct aca_bank { - enum aca_error_type aca_err_type; - enum aca_smu_type smu_err_type; - u64 regs[ACA_MAX_REGS_COUNT]; -}; - -struct aca_bank_node { - struct aca_bank bank; - struct list_head node; -}; - -struct aca_banks { - int nr_banks; - struct list_head 
list; -}; - -struct aca_bank_info { - int die_id; - int socket_id; - int hwid; - int mcatype; -}; - -struct aca_bank_error { - struct list_head node; - struct aca_bank_info info; - u64 count; -}; - -struct aca_error { - struct list_head list; - struct mutex lock; - enum aca_error_type type; - int nr_errors; -}; - -struct aca_handle_manager { - struct list_head list; - int nr_handles; -}; - -struct aca_error_cache { - struct aca_error errors[ACA_ERROR_TYPE_COUNT]; -}; - -struct aca_handle { - struct list_head node; - enum aca_hwip_type hwip; - struct amdgpu_device *adev; - struct aca_handle_manager *mgr; - struct aca_error_cache error_cache; - const struct aca_bank_ops *bank_ops; - struct device_attribute aca_attr; - char attr_name[64]; - const char *name; - u32 mask; - void *data; -}; - -struct aca_bank_ops { - int (*aca_bank_parser)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data); - bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, - void *data); -}; - -struct aca_smu_funcs { - int max_ue_bank_count; - int max_ce_bank_count; - int (*set_debug_mode)(struct amdgpu_device *adev, bool enable); - int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count); - int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_smu_type type, int idx, struct aca_bank *bank); - int (*parse_error_code)(struct amdgpu_device *adev, struct aca_bank *bank); -}; - -struct amdgpu_aca { - struct aca_handle_manager mgr; - const struct aca_smu_funcs *smu_funcs; - atomic_t ue_update_flag; - bool is_enabled; -}; - -struct aca_info { - enum aca_hwip_type hwip; - const struct aca_bank_ops *bank_ops; - u32 mask; -}; - -int amdgpu_aca_init(struct amdgpu_device *adev); -void amdgpu_aca_fini(struct amdgpu_device *adev); -int amdgpu_aca_reset(struct amdgpu_device *adev); -void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs); -bool 
amdgpu_aca_is_enabled(struct amdgpu_device *adev); - -int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info); -int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size); - -int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle, - const char *name, const struct aca_info *aca_info, void *data); -void amdgpu_aca_remove_handle(struct aca_handle *handle); -int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, - enum aca_error_type type, struct ras_err_data *err_data, - struct ras_query_context *qctx); -int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en); -void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root); -int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info, - enum aca_error_type type, u64 count); -#endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index 516ab9cf88fc..7f5abb03be1b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -140,13 +140,15 @@ static struct amdgpu_acpi_priv { * @atif: atif structure * @function: the ATIF function to execute * @params: ATIF function params + * @min_size: minimum size of the expected output buffer in bytes * * Executes the requested ATIF function (all asics). * Returns a pointer to the acpi output buffer. 
*/ static union acpi_object *amdgpu_atif_call(struct amdgpu_atif *atif, int function, - struct acpi_buffer *params) + struct acpi_buffer *params, + size_t min_size) { acpi_status status; union acpi_object *obj; @@ -189,6 +191,28 @@ static union acpi_object *amdgpu_atif_call(struct amdgpu_atif *atif, return NULL; } + if (obj->buffer.length < sizeof(u16)) { + DRM_DEBUG_DRIVER("ATIF buffer too small to hold size field: %u\n", + obj->buffer.length); + kfree(obj); + return NULL; + } + + if (obj->buffer.length < *(u16 *)obj->buffer.pointer) { + DRM_DEBUG_DRIVER("ATIF buffer length mismatch: reported %u, actual %u\n", + *(u16 *)obj->buffer.pointer, + obj->buffer.length); + kfree(obj); + return NULL; + } + + if (*(u16 *)obj->buffer.pointer < min_size) { + DRM_DEBUG_DRIVER("ATIF buffer too small: expected %zu, got %u\n", + min_size, *(u16 *)obj->buffer.pointer); + kfree(obj); + return NULL; + } + return obj; } @@ -251,19 +275,14 @@ int amdgpu_atif_verify_interface(struct amdgpu_atif *atif) size_t size; int err = 0; - info = amdgpu_atif_call(atif, ATIF_FUNCTION_VERIFY_INTERFACE, NULL); + info = amdgpu_atif_call(atif, ATIF_FUNCTION_VERIFY_INTERFACE, NULL, + sizeof(output)); if (!info) return -EIO; memset(&output, 0, sizeof(output)); - size = *(u16 *) info->buffer.pointer; - if (size < 12) { - DRM_INFO("ATIF buffer is too small: %zu\n", size); - err = -EINVAL; - goto out; - } - size = min(sizeof(output), size); + size = min(sizeof(output), (size_t)*(u16 *)info->buffer.pointer); memcpy(&output, info->buffer.pointer, size); @@ -273,7 +292,6 @@ int amdgpu_atif_verify_interface(struct amdgpu_atif *atif) amdgpu_atif_parse_notification(&atif->notifications, output.notification_mask); amdgpu_atif_parse_functions(&atif->functions, output.function_bits); -out: kfree(info); return err; } @@ -299,20 +317,14 @@ int amdgpu_atif_get_notification_params(struct amdgpu_atif *atif) int err = 0; info = amdgpu_atif_call(atif, ATIF_FUNCTION_GET_SYSTEM_PARAMETERS, - NULL); + NULL, offsetof(struct 
atif_system_params, command_code)); if (!info) { err = -EIO; goto out; } - size = *(u16 *) info->buffer.pointer; - if (size < 10) { - err = -EINVAL; - goto out; - } - memset(&params, 0, sizeof(params)); - size = min(sizeof(params), size); + size = min(sizeof(params), (size_t)*(u16 *)info->buffer.pointer); memcpy(&params, info->buffer.pointer, size); DRM_DEBUG_DRIVER("SYSTEM_PARAMS: mask = %#x, flags = %#x\n", @@ -376,20 +388,14 @@ int amdgpu_atif_query_backlight_caps(struct amdgpu_atif *atif) info = amdgpu_atif_call(atif, ATIF_FUNCTION_QUERY_BRIGHTNESS_TRANSFER_CHARACTERISTICS, - &params); + &params, offsetof(struct atif_qbtc_output, data_points)); if (!info) { err = -EIO; goto out; } - size = *(u16 *) info->buffer.pointer; - if (size < 10) { - err = -EINVAL; - goto out; - } - memset(&characteristics, 0, sizeof(characteristics)); - size = min(sizeof(characteristics), size); + size = min(sizeof(characteristics), (size_t)*(u16 *)info->buffer.pointer); memcpy(&characteristics, info->buffer.pointer, size); atif->backlight_caps.caps_valid = true; @@ -427,24 +433,18 @@ static int amdgpu_atif_get_sbios_requests(struct amdgpu_atif *atif, int count = 0; info = amdgpu_atif_call(atif, ATIF_FUNCTION_GET_SYSTEM_BIOS_REQUESTS, - NULL); + NULL, sizeof(*req)); if (!info) return -EIO; - size = *(u16 *)info->buffer.pointer; - if (size < 0xd) { - count = -EINVAL; - goto out; - } memset(req, 0, sizeof(*req)); - size = min(sizeof(*req), size); + size = min(sizeof(*req), (size_t)*(u16 *)info->buffer.pointer); memcpy(req, info->buffer.pointer, size); DRM_DEBUG_DRIVER("SBIOS pending requests: %#x\n", req->pending); count = hweight32(req->pending); -out: kfree(info); return count; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index da325863ad76..c693c508df1a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -957,3 +957,17 @@ int amdgpu_amdkfd_config_sq_perfmon(struct amdgpu_device *adev, 
uint32_t xcp_id, return r; } + +/* Reset an MES queue */ +int amdgpu_amdkfd_reset_mes_queue(struct amdgpu_device *adev, + uint32_t node_id, + int queue_type, + int pipe, int queue, + unsigned int db) +{ + if (!adev->kfd.init_complete) + return 0; + + return kgd2kfd_reset_mes_queue(adev->kfd.dev, node_id, queue_type, + pipe, queue, db); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index e443a7277299..338412a750ed 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -210,6 +210,7 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, uint32_t domain, struct dma_fence *fence); +int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms); #else static inline bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm) @@ -241,6 +242,11 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, { return 0; } +static inline +int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms) +{ + return -EOPNOTSUPP; +} #endif /* Shared API */ int amdgpu_amdkfd_alloc_kernel_mem(struct amdgpu_device *adev, size_t size, @@ -275,7 +281,11 @@ int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id); int amdgpu_amdkfd_config_sq_perfmon(struct amdgpu_device *adev, uint32_t xcp_id, bool core_override_enable, bool reg_override_enable, bool perfmon_override_enable); bool amdgpu_amdkfd_compute_active(struct amdgpu_device *adev, uint32_t node_id); - +int amdgpu_amdkfd_reset_mes_queue(struct amdgpu_device *adev, + uint32_t node_id, + int queue_type, + int pipe, int queue, + unsigned int db); /* Read user wptr from a specified user address space with page fault * disabled. 
The memory must be pinned and mapped to the hardware when @@ -326,9 +336,9 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv); int amdgpu_amdkfd_gpuvm_sync_memory( struct amdgpu_device *adev, struct kgd_mem *mem, bool intr); -int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem, - void **kptr, uint64_t *size); -void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem); +int amdgpu_amdkfd_gpuvm_map_bo_to_kernel(struct kgd_mem *mem, void **kptr, + u64 *size, u32 domain); +void amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(struct kgd_mem *mem); int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart); @@ -446,6 +456,9 @@ bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entr bool retry_fault); void kgd2kfd_lock_kfd(void); void kgd2kfd_teardown_processes(struct amdgpu_device *adev); +int kgd2kfd_reset_mes_queue(struct kfd_dev *kfd, uint32_t node_id, + int queue_type, int pipe, int queue, + unsigned int db); #else static inline int kgd2kfd_init(void) @@ -576,5 +589,12 @@ static inline void kgd2kfd_teardown_processes(struct amdgpu_device *adev) { } +static inline int kgd2kfd_reset_mes_queue(struct kfd_dev *kfd, uint32_t node_id, + int queue_type, int pipe, int queue, + unsigned int db) +{ + return 0; +} + #endif #endif /* AMDGPU_AMDKFD_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c index 6ed399163547..bc079b95fc52 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c @@ -530,6 +530,66 @@ static uint32_t kgd_v9_4_3_ptl_ctrl(struct amdgpu_device *adev, ptl_state, fmt1, fmt2); } +static int kgd_gfx_v9_4_3_hqd_sdma_get_counter(struct amdgpu_device *adev, + void *mqd, uint32_t num_sdma_queues_per_eng, + uint64_t *val) +{ + struct v9_sdma_mqd *m = get_sdma_mqd(mqd); + uint32_t 
sdma_rlc_reg_offset = 0; + uint32_t sdma_rlc_rb_cntl; + uint32_t engine_id, queue_id; + uint32_t engines = adev->sdma.num_instances; + uint32_t sdma_rlcx_rb_base, sdma_rlcx_rb_base_hi; + bool found = false; + + if (!m) + return -EINVAL; + + if (((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && + adev->gfx.mec_fw_version < 194) || + (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) && + adev->gfx.mec_fw_version < 44)) { + pr_warn_once("MEC FW doesn't support SDMA counter!\n"); + return -EOPNOTSUPP; + } + + /* SDMA doesn't support over-subscription, there must be + * a HQD associated with a MQD, so found must be true in + * the finding loop. + */ + for (engine_id = 0; engine_id < engines && !found; engine_id++) { + for (queue_id = 0; queue_id < num_sdma_queues_per_eng; queue_id++) { + sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, + engine_id, queue_id); + sdma_rlcx_rb_base = RREG32(sdma_rlc_reg_offset + + regSDMA_RLC0_RB_BASE); + sdma_rlcx_rb_base_hi = RREG32(sdma_rlc_reg_offset + + regSDMA_RLC0_RB_BASE_HI); + + if (m->sdmax_rlcx_rb_base == sdma_rlcx_rb_base && + m->sdmax_rlcx_rb_base_hi == sdma_rlcx_rb_base_hi) { + found = true; + break; + } + } + } + + sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL); + + /* Read sdma activity counter from utilization register + * if hw queue is enabled, otherwise read from MQD. 
+ */ + if (sdma_rlc_rb_cntl & SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK) + *val = (uint64_t)RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_UTILIZATION_HI) << 32 | + RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_UTILIZATION_LO); + else + *val = (uint64_t)m->sdmax_rlcx_utilization_hi << 32 | + m->sdmax_rlcx_utilization_lo; + + return 0; +} + const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = { .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings, .set_pasid_vmid_mapping = kgd_gfx_v9_4_3_set_pasid_vmid_mapping, @@ -566,5 +626,6 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = { .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, .hqd_reset = kgd_gfx_v9_hqd_reset, .hqd_sdma_get_doorbell = kgd_gfx_v9_4_3_hqd_sdma_get_doorbell, - .ptl_ctrl = kgd_v9_4_3_ptl_ctrl + .ptl_ctrl = kgd_v9_4_3_ptl_ctrl, + .hqd_sdma_get_counter = kgd_gfx_v9_4_3_hqd_sdma_get_counter }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 35fe2c974699..20831dbebc31 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2271,11 +2271,14 @@ err_reserve_bo_failed: return ret; } -/** amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel() - Map a GTT BO for kernel CPU access +/** amdgpu_amdkfd_gpuvm_map_bo_to_kernel() - Map GTT or VRAM BO for kernel CPU access * * @mem: Buffer object to be mapped for CPU access * @kptr[out]: pointer in kernel CPU address space * @size[out]: size of the buffer + * @domain[IN]: domain for pinning (AMDGPU_GEM_DOMAIN_GTT, AMDGPU_GEM_DOMAIN_VRAM, + * or their combination to let the driver choose). CPU visibility is + * automatically enforced by amdgpu_bo_pin() * * Pins the BO and maps it for kernel CPU access. The eviction fence is removed * from the BO, since pinned BOs cannot be evicted. 
The bo must remain on the @@ -2284,8 +2287,8 @@ err_reserve_bo_failed: * * Return: 0 on success, error code on failure */ -int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem, - void **kptr, uint64_t *size) +int amdgpu_amdkfd_gpuvm_map_bo_to_kernel(struct kgd_mem *mem, void **kptr, + u64 *size, u32 domain) { int ret; struct amdgpu_bo *bo = mem->bo; @@ -2295,6 +2298,11 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem, return -EINVAL; } + if (!(domain & (AMDGPU_GEM_DOMAIN_GTT | AMDGPU_GEM_DOMAIN_VRAM))) { + pr_debug("Invalid domain 0x%x for kernel mapping\n", domain); + return -EINVAL; + } + mutex_lock(&mem->process_info->lock); ret = amdgpu_bo_reserve(bo, true); @@ -2303,7 +2311,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem, goto bo_reserve_failed; } - ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT); + ret = amdgpu_bo_pin(bo, domain); if (ret) { pr_err("Failed to pin bo. ret %d\n", ret); goto pin_failed; @@ -2336,7 +2344,7 @@ bo_reserve_failed: return ret; } -/** amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel() - Unmap a GTT BO for kernel CPU access +/** amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel() - Unmap GTT or VRAM BO for kernel CPU access * * @mem: Buffer object to be unmapped for CPU access * @@ -2344,7 +2352,7 @@ bo_reserve_failed: * eviction fence, so this function should only be used for cleanup before the * BO is destroyed. 
*/ -void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem) +void amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(struct kgd_mem *mem) { struct amdgpu_bo *bo = mem->bo; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c index acd22bff1882..27c0dc8f6137 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c @@ -1923,7 +1923,7 @@ int amdgpu_atombios_init(struct amdgpu_device *adev) atom_card_info->pll_read = cail_pll_read; atom_card_info->pll_write = cail_pll_write; - adev->mode_info.atom_context = amdgpu_atom_parse(atom_card_info, adev->bios); + adev->mode_info.atom_context = amdgpu_atom_parse(atom_card_info, adev->bios, adev->bios_size); if (!adev->mode_info.atom_context) { amdgpu_atombios_fini(adev); return -ENOMEM; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c index 3893e6fc2f03..e2a4644896ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c @@ -89,6 +89,15 @@ bool amdgpu_is_atpx_hybrid(void) return amdgpu_atpx_priv.atpx.is_hybrid; } +static bool amdgpu_atpx_buffer_validate(const union acpi_object *obj, + size_t min_size) +{ + return obj && obj->type == ACPI_TYPE_BUFFER && + obj->buffer.length >= sizeof(u16) && + obj->buffer.length >= *(u16 *)obj->buffer.pointer && + *(u16 *)obj->buffer.pointer >= min_size; +} + /** * amdgpu_atpx_call - call an ATPX method * @@ -179,15 +188,15 @@ static int amdgpu_atpx_validate(struct amdgpu_atpx *atpx) if (!info) return -EIO; - memset(&output, 0, sizeof(output)); - - size = *(u16 *) info->buffer.pointer; - if (size < 10) { - pr_err("ATPX buffer is too small: %zu\n", size); + if (!amdgpu_atpx_buffer_validate(info, sizeof(output))) { + pr_err("Invalid ATPX GET_PX_PARAMETERS response\n"); kfree(info); return -EINVAL; } - size = min(sizeof(output), size); + + memset(&output, 0, 
sizeof(output)); + + size = min(sizeof(output), (size_t)*(u16 *)info->buffer.pointer); memcpy(&output, info->buffer.pointer, size); @@ -258,15 +267,15 @@ static int amdgpu_atpx_verify_interface(struct amdgpu_atpx *atpx) if (!info) return -EIO; - memset(&output, 0, sizeof(output)); - - size = *(u16 *) info->buffer.pointer; - if (size < 8) { - pr_err("ATPX buffer is too small: %zu\n", size); + if (!amdgpu_atpx_buffer_validate(info, sizeof(output))) { + pr_err("Invalid ATPX VERIFY_INTERFACE response\n"); err = -EINVAL; goto out; } - size = min(sizeof(output), size); + + memset(&output, 0, sizeof(output)); + + size = min(sizeof(output), (size_t)*(u16 *)info->buffer.pointer); memcpy(&output, info->buffer.pointer, size); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c index aa039e148a5e..3ebdd792feec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c @@ -296,8 +296,14 @@ static int amdgpu_atrm_call(acpi_handle atrm_handle, uint8_t *bios, } obj = (union acpi_object *)buffer.pointer; - memcpy(bios+offset, obj->buffer.pointer, obj->buffer.length); - len = obj->buffer.length; + if (!obj || obj->type != ACPI_TYPE_BUFFER) { + DRM_ERROR("ATRM returned an invalid object\n"); + kfree(buffer.pointer); + return -EINVAL; + } + + len = min_t(size_t, obj->buffer.length, len); + memcpy(bios+offset, obj->buffer.pointer, len); kfree(buffer.pointer); return len; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c index 43864df8af04..ce1d08f112a8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c @@ -38,14 +38,6 @@ #define AMDGPU_BO_LIST_NUM_BUCKETS (AMDGPU_BO_LIST_MAX_PRIORITY + 1) #define AMDGPU_BO_LIST_MAX_ENTRIES (128 * 1024) -static void amdgpu_bo_list_free_rcu(struct rcu_head *rcu) -{ - struct amdgpu_bo_list *list = container_of(rcu, struct amdgpu_bo_list, - rhead); - 
mutex_destroy(&list->bo_list_mutex); - kvfree(list); -} - static void amdgpu_bo_list_free(struct kref *ref) { struct amdgpu_bo_list *list = container_of(ref, struct amdgpu_bo_list, @@ -54,7 +46,8 @@ static void amdgpu_bo_list_free(struct kref *ref) amdgpu_bo_list_for_each_entry(e, list) amdgpu_bo_unref(&e->bo); - call_rcu(&list->rhead, amdgpu_bo_list_free_rcu); + + kvfree(list); } static int amdgpu_bo_list_entry_cmp(const void *_a, const void *_b) @@ -66,9 +59,9 @@ static int amdgpu_bo_list_entry_cmp(const void *_a, const void *_b) return (int)a->priority - (int)b->priority; } -int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp, - struct drm_amdgpu_bo_list_entry *info, - size_t num_entries, struct amdgpu_bo_list **result) +struct amdgpu_bo_list * +amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp, + struct drm_amdgpu_bo_list_entry *info, size_t num_entries) { unsigned last_entry = 0, first_userptr = num_entries; struct amdgpu_bo_list_entry *array; @@ -79,7 +72,7 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp, list = kvzalloc_flex(*list, entries, num_entries); if (!list) - return -ENOMEM; + return ERR_PTR(-ENOMEM); kref_init(&list->refcount); @@ -134,9 +127,7 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp, trace_amdgpu_cs_bo_status(list->num_entries, total_size); - mutex_init(&list->bo_list_mutex); - *result = list; - return 0; + return list; error_free: for (i = 0; i < last_entry; ++i) @@ -144,150 +135,125 @@ error_free: for (i = first_userptr; i < num_entries; ++i) amdgpu_bo_unref(&array[i].bo); kvfree(list); - return r; + return ERR_PTR(r); } -static void amdgpu_bo_list_destroy(struct amdgpu_fpriv *fpriv, int id) +struct amdgpu_bo_list *amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, u32 id) { struct amdgpu_bo_list *list; - mutex_lock(&fpriv->bo_list_lock); - list = idr_remove(&fpriv->bo_list_handles, id); - mutex_unlock(&fpriv->bo_list_lock); + 
xa_lock(&fpriv->bo_list_handles); + list = xa_load(&fpriv->bo_list_handles, id); if (list) - kref_put(&list->refcount, amdgpu_bo_list_free); -} - -int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id, - struct amdgpu_bo_list **result) -{ - rcu_read_lock(); - *result = idr_find(&fpriv->bo_list_handles, id); - - if (*result && kref_get_unless_zero(&(*result)->refcount)) { - rcu_read_unlock(); - return 0; - } + kref_get(&list->refcount); + else + list = ERR_PTR(-ENOENT); + xa_unlock(&fpriv->bo_list_handles); - rcu_read_unlock(); - *result = NULL; - return -ENOENT; + return list; } void amdgpu_bo_list_put(struct amdgpu_bo_list *list) { - kref_put(&list->refcount, amdgpu_bo_list_free); + if (list) + kref_put(&list->refcount, amdgpu_bo_list_free); } -int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in, - struct drm_amdgpu_bo_list_entry **info_param) +struct drm_amdgpu_bo_list_entry * +amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in) { - const uint32_t info_size = sizeof(struct drm_amdgpu_bo_list_entry); const void __user *uptr = u64_to_user_ptr(in->bo_info_ptr); - const uint32_t bo_info_size = in->bo_info_size; const uint32_t bo_number = in->bo_number; - struct drm_amdgpu_bo_list_entry *info; if (bo_number > AMDGPU_BO_LIST_MAX_ENTRIES) - return -EINVAL; + return ERR_PTR(-EINVAL); - /* copy the handle array from userspace to a kernel buffer */ - if (likely(info_size == bo_info_size)) { - info = vmemdup_array_user(uptr, bo_number, info_size); - if (IS_ERR(info)) - return PTR_ERR(info); - } else { - const uint32_t bytes = min(bo_info_size, info_size); - unsigned i; - - info = kvmalloc_array(bo_number, info_size, GFP_KERNEL); - if (!info) - return -ENOMEM; - - memset(info, 0, bo_number * info_size); - for (i = 0; i < bo_number; ++i, uptr += bo_info_size) { - if (copy_from_user(&info[i], uptr, bytes)) { - kvfree(info); - return -EFAULT; - } - } - } + if (in->bo_info_size != sizeof(struct drm_amdgpu_bo_list_entry)) + return 
ERR_PTR(-EINVAL); - *info_param = info; - return 0; + return vmemdup_array_user(uptr, bo_number, + sizeof(struct drm_amdgpu_bo_list_entry)); } int amdgpu_bo_list_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { - struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_fpriv *fpriv = filp->driver_priv; + struct amdgpu_device *adev = drm_to_adev(dev); + struct amdgpu_bo_list *list, *prev, *curr; union drm_amdgpu_bo_list *args = data; uint32_t handle = args->in.list_handle; - struct drm_amdgpu_bo_list_entry *info = NULL; - struct amdgpu_bo_list *list, *old; + struct drm_amdgpu_bo_list_entry *info; int r; - r = amdgpu_bo_create_list_entry_array(&args->in, &info); - if (r) - return r; - switch (args->in.operation) { case AMDGPU_BO_LIST_OP_CREATE: - r = amdgpu_bo_list_create(adev, filp, info, args->in.bo_number, - &list); - if (r) - goto error_free; + case AMDGPU_BO_LIST_OP_UPDATE: + info = amdgpu_bo_create_list_entry_array(&args->in); + if (IS_ERR(info)) + return PTR_ERR(info); - mutex_lock(&fpriv->bo_list_lock); - r = idr_alloc(&fpriv->bo_list_handles, list, 1, 0, GFP_KERNEL); - mutex_unlock(&fpriv->bo_list_lock); - if (r < 0) { - goto error_put_list; - } + list = amdgpu_bo_list_create(adev, filp, info, + args->in.bo_number); + kvfree(info); + if (IS_ERR(list)) + return PTR_ERR(list); - handle = r; break; case AMDGPU_BO_LIST_OP_DESTROY: - amdgpu_bo_list_destroy(fpriv, handle); + list = xa_erase(&fpriv->bo_list_handles, handle); + amdgpu_bo_list_put(list); handle = 0; + break; - case AMDGPU_BO_LIST_OP_UPDATE: - r = amdgpu_bo_list_create(adev, filp, info, args->in.bo_number, - &list); + default: + return -EINVAL; + }; + + switch (args->in.operation) { + case AMDGPU_BO_LIST_OP_CREATE: + r = xa_alloc(&fpriv->bo_list_handles, &handle, list, + xa_limit_32b, GFP_KERNEL); if (r) - goto error_free; + goto error_put_list; + + break; - mutex_lock(&fpriv->bo_list_lock); - old = idr_replace(&fpriv->bo_list_handles, list, handle); - 
mutex_unlock(&fpriv->bo_list_lock); + case AMDGPU_BO_LIST_OP_UPDATE: + curr = xa_load(&fpriv->bo_list_handles, handle); + if (!curr) { + r = -ENOENT; + goto error_put_list; + } - if (IS_ERR(old)) { - r = PTR_ERR(old); + prev = xa_cmpxchg(&fpriv->bo_list_handles, handle, curr, list, + GFP_KERNEL); + if (xa_is_err(prev)) { + r = xa_err(prev); + goto error_put_list; + } else if (prev != curr) { + r = -ENOENT; goto error_put_list; } - amdgpu_bo_list_put(old); + amdgpu_bo_list_put(curr); break; + case AMDGPU_BO_LIST_OP_DESTROY: default: - r = -EINVAL; - goto error_free; + /* Handled above. */ + break; } memset(args, 0, sizeof(*args)); args->out.list_handle = handle; - kvfree(info); return 0; error_put_list: amdgpu_bo_list_put(list); - -error_free: - kvfree(info); return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h index 2b5e7c46a39d..bde912150824 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h @@ -43,7 +43,6 @@ struct amdgpu_bo_list_entry { }; struct amdgpu_bo_list { - struct rcu_head rhead; struct kref refcount; struct amdgpu_bo *gds_obj; struct amdgpu_bo *gws_obj; @@ -51,24 +50,19 @@ struct amdgpu_bo_list { unsigned first_userptr; unsigned num_entries; - /* Protect access during command submission. 
- */ - struct mutex bo_list_mutex; - struct amdgpu_bo_list_entry entries[] __counted_by(num_entries); }; -int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id, - struct amdgpu_bo_list **result); +struct amdgpu_bo_list *amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, u32 id); void amdgpu_bo_list_put(struct amdgpu_bo_list *list); -int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in, - struct drm_amdgpu_bo_list_entry **info_param); +struct drm_amdgpu_bo_list_entry * +amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in); -int amdgpu_bo_list_create(struct amdgpu_device *adev, - struct drm_file *filp, - struct drm_amdgpu_bo_list_entry *info, - size_t num_entries, - struct amdgpu_bo_list **list); +struct amdgpu_bo_list * +amdgpu_bo_list_create(struct amdgpu_device *adev, + struct drm_file *filp, + struct drm_amdgpu_bo_list_entry *info, + size_t num_entries); #define amdgpu_bo_list_for_each_entry(e, list) \ for (e = list->entries; \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c index d5e59c24d907..6fb129025761 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c @@ -289,40 +289,6 @@ struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev, return hdr; } -int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev, - struct aca_bank *bank) -{ - struct cper_hdr *fatal = NULL; - struct cper_sec_crashdump_reg_data reg_data = { 0 }; - struct amdgpu_ring *ring = &adev->cper.ring_buf; - int ret; - - fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1); - if (!fatal) { - dev_err(adev->dev, "fail to alloc cper entry for ue record\n"); - return -ENOMEM; - } - - reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]); - reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]); - reg_data.addr_lo = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]); - reg_data.addr_hi = 
upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]); - reg_data.ipid_lo = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]); - reg_data.ipid_hi = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]); - reg_data.synd_lo = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]); - reg_data.synd_hi = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]); - - amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL_UNCORRECTED); - ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data); - if (ret) - return ret; - - amdgpu_cper_ring_write(ring, fatal, fatal->record_length); - kfree(fatal); - - return 0; -} - int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev) { struct cper_hdr *bp_threshold = NULL; @@ -348,83 +314,6 @@ int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev) return 0; } -static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev, - enum aca_error_type aca_err_type) -{ - switch (aca_err_type) { - case ACA_ERROR_TYPE_UE: - return CPER_SEV_FATAL_UNCORRECTED; - case ACA_ERROR_TYPE_CE: - return CPER_SEV_NON_FATAL_CORRECTED; - case ACA_ERROR_TYPE_DEFERRED: - return CPER_SEV_NON_FATAL_UNCORRECTED; - default: - dev_err(adev->dev, "Unknown ACA error type!\n"); - return CPER_SEV_FATAL_UNCORRECTED; - } -} - -int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev, - struct aca_banks *banks, - uint16_t bank_count) -{ - struct cper_hdr *corrected = NULL; - enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED; - struct amdgpu_ring *ring = &adev->cper.ring_buf; - uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 }; - struct aca_bank_node *node; - struct aca_bank *bank; - uint32_t i = 0; - int ret; - - corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count); - if (!corrected) { - dev_err(adev->dev, "fail to allocate cper entry for ce records\n"); - return -ENOMEM; - } - - /* Raise severity if any DE is detected in the ACA bank list */ - list_for_each_entry(node, &banks->list, 
node) { - bank = &node->bank; - if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) { - sev = CPER_SEV_NON_FATAL_UNCORRECTED; - break; - } - } - - amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev); - - /* Combine CE and DE in cper record */ - list_for_each_entry(node, &banks->list, node) { - bank = &node->bank; - reg_data[CPER_ACA_REG_CTL_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]); - reg_data[CPER_ACA_REG_CTL_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]); - reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]); - reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]); - reg_data[CPER_ACA_REG_ADDR_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]); - reg_data[CPER_ACA_REG_ADDR_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]); - reg_data[CPER_ACA_REG_MISC0_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]); - reg_data[CPER_ACA_REG_MISC0_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]); - reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]); - reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]); - reg_data[CPER_ACA_REG_IPID_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]); - reg_data[CPER_ACA_REG_IPID_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]); - reg_data[CPER_ACA_REG_SYND_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]); - reg_data[CPER_ACA_REG_SYND_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]); - - ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++, - amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type), - reg_data, CPER_ACA_REG_COUNT); - if (ret) - return ret; - } - - amdgpu_cper_ring_write(ring, corrected, corrected->record_length); - kfree(corrected); - - return 0; -} - static bool amdgpu_cper_is_hdr(struct amdgpu_ring *ring, u64 pos) { char signature[CPER_SIGNATURE_SZ]; @@ -592,8 +481,7 @@ int amdgpu_cper_init(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev) && 
!amdgpu_sriov_ras_cper_en(adev)) return 0; - else if (!amdgpu_sriov_vf(adev) && !amdgpu_uniras_enabled(adev) && - !amdgpu_aca_is_enabled(adev)) + else if (!amdgpu_sriov_vf(adev) && !amdgpu_uniras_enabled(adev)) return 0; r = amdgpu_cper_ring_init(adev); @@ -612,7 +500,7 @@ int amdgpu_cper_init(struct amdgpu_device *adev) int amdgpu_cper_fini(struct amdgpu_device *adev) { - if (!amdgpu_aca_is_enabled(adev) && !amdgpu_sriov_ras_cper_en(adev)) + if (amdgpu_sriov_vf(adev)) return 0; adev->cper.enabled = false; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h index 353421807387..d12c98077d9d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h @@ -26,7 +26,6 @@ #define __AMDGPU_CPER_H__ #include "amd_cper.h" -#include "amdgpu_aca.h" #define CPER_MAX_ALLOWED_COUNT 0x1000 #define CPER_MAX_RING_SIZE 0X100000 @@ -88,13 +87,6 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev, enum amdgpu_cper_type type, uint16_t section_count); -/* UE must be encoded into separated cper entries, 1 UE 1 cper */ -int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev, - struct aca_bank *bank); -/* CEs and DEs are combined into 1 cper entry */ -int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev, - struct aca_banks *banks, - uint16_t bank_count); /* Bad page threshold is encoded into separated cper entry */ int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev); void amdgpu_cper_ring_write(struct amdgpu_ring *ring, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index e714cee2997a..d777375e5350 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -140,24 +140,19 @@ static int amdgpu_cs_p1_bo_handles(struct amdgpu_cs_parser *p, struct drm_amdgpu_bo_list_in *data) { struct 
drm_amdgpu_bo_list_entry *info; - int r; - - r = amdgpu_bo_create_list_entry_array(data, &info); - if (r) - return r; - - r = amdgpu_bo_list_create(p->adev, p->filp, info, data->bo_number, - &p->bo_list); - if (r) - goto error_free; + struct amdgpu_bo_list *list; - kvfree(info); - return 0; + info = amdgpu_bo_create_list_entry_array(data); + if (IS_ERR(info)) + return PTR_ERR(info); -error_free: + list = amdgpu_bo_list_create(p->adev, p->filp, info, data->bo_number); kvfree(info); + if (IS_ERR(list)) + return PTR_ERR(list); - return r; + p->bo_list = list; + return 0; } /* Copy the data from userspace and go over it the first time */ @@ -846,6 +841,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, { struct amdgpu_fpriv *fpriv = p->filp->driver_priv; struct ttm_operation_ctx ctx = { true, false }; + struct amdgpu_bo_list *list = NULL; struct amdgpu_vm *vm = &fpriv->vm; struct amdgpu_bo_list_entry *e; struct drm_gem_object *obj; @@ -857,25 +853,24 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, if (p->bo_list) return -EINVAL; - r = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle, - &p->bo_list); - if (r) - return r; + list = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle); } else if (!p->bo_list) { /* Create a empty bo_list when no handle is provided */ - r = amdgpu_bo_list_create(p->adev, p->filp, NULL, 0, - &p->bo_list); - if (r) - return r; + list = amdgpu_bo_list_create(p->adev, p->filp, NULL, 0); } - mutex_lock(&p->bo_list->bo_list_mutex); + if (IS_ERR(list)) + return PTR_ERR(list); + else if (list) + p->bo_list = list; + else + list = p->bo_list; /* Get userptr backing pages. 
If pages are updated after registered * in amdgpu_gem_userptr_ioctl(), amdgpu_cs_list_validate() will do * amdgpu_ttm_backend_bind() to flush and invalidate new pages */ - amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) { + amdgpu_bo_list_for_each_userptr_entry(e, list) { bool userpage_invalidated = false; struct amdgpu_bo *bo = e->bo; @@ -905,7 +900,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, if (unlikely(r)) goto out_free_user_pages; - amdgpu_bo_list_for_each_entry(e, p->bo_list) { + amdgpu_bo_list_for_each_entry(e, list) { r = drm_exec_prepare_obj(&p->exec, &e->bo->tbo.base, TTM_NUM_MOVE_FENCES + p->gang_size); drm_exec_retry_on_contention(&p->exec); @@ -924,7 +919,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, } } - amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) { + amdgpu_bo_list_for_each_userptr_entry(e, list) { struct mm_struct *usermm; usermm = amdgpu_ttm_tt_get_usermm(e->bo->tbo.ttm); @@ -977,17 +972,15 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, p->bytes_moved_vis); for (i = 0; i < p->gang_size; ++i) - amdgpu_job_set_resources(p->jobs[i], p->bo_list->gds_obj, - p->bo_list->gws_obj, - p->bo_list->oa_obj); + amdgpu_job_set_resources(p->jobs[i], list->gds_obj, + list->gws_obj, list->oa_obj); return 0; out_free_user_pages: - amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) { + amdgpu_bo_list_for_each_userptr_entry(e, list) { amdgpu_hmm_range_free(e->range); e->range = NULL; } - mutex_unlock(&p->bo_list->bo_list_mutex); return r; } @@ -1371,7 +1364,6 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm); mutex_unlock(&p->adev->notifier_lock); - mutex_unlock(&p->bo_list->bo_list_mutex); return 0; } @@ -1443,28 +1435,25 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) r = amdgpu_cs_patch_jobs(&parser); if (r) - goto error_backoff; + goto error_fini; r = amdgpu_cs_vm_handling(&parser); if (r) - goto 
error_backoff; + goto error_fini; r = amdgpu_cs_sync_rings(&parser); if (r) - goto error_backoff; + goto error_fini; trace_amdgpu_cs_ibs(&parser); r = amdgpu_cs_submit(&parser, data); if (r) - goto error_backoff; + goto error_fini; amdgpu_cs_parser_fini(&parser); return 0; -error_backoff: - mutex_unlock(&parser.bo_list->bo_list_mutex); - error_fini: amdgpu_cs_parser_fini(&parser); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index ce35b415093d..d53259a5b82f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -283,6 +283,8 @@ static ktime_t amdgpu_ctx_fini_entity(struct amdgpu_device *adev, if (!entity) return res; + drm_sched_entity_destroy(&entity->entity); + for (i = 0; i < amdgpu_sched_jobs; ++i) { res = ktime_add(res, amdgpu_ctx_fence_time(entity->fences[i])); dma_fence_put(entity->fences[i]); @@ -294,32 +296,20 @@ static ktime_t amdgpu_ctx_fini_entity(struct amdgpu_device *adev, return res; } -static int amdgpu_ctx_get_stable_pstate(struct amdgpu_ctx *ctx, - u32 *stable_pstate) +static u32 amdgpu_get_stable_pstate(struct amdgpu_device *adev) { - struct amdgpu_device *adev = ctx->mgr->adev; - enum amd_dpm_forced_level current_level; - - current_level = amdgpu_dpm_get_performance_level(adev); - - switch (current_level) { + switch (amdgpu_dpm_get_performance_level(adev)) { case AMD_DPM_FORCED_LEVEL_PROFILE_STANDARD: - *stable_pstate = AMDGPU_CTX_STABLE_PSTATE_STANDARD; - break; + return AMDGPU_CTX_STABLE_PSTATE_STANDARD; case AMD_DPM_FORCED_LEVEL_PROFILE_MIN_SCLK: - *stable_pstate = AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK; - break; + return AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK; case AMD_DPM_FORCED_LEVEL_PROFILE_MIN_MCLK: - *stable_pstate = AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK; - break; + return AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK; case AMD_DPM_FORCED_LEVEL_PROFILE_PEAK: - *stable_pstate = AMDGPU_CTX_STABLE_PSTATE_PEAK; - break; + return AMDGPU_CTX_STABLE_PSTATE_PEAK; 
default: - *stable_pstate = AMDGPU_CTX_STABLE_PSTATE_NONE; - break; + return AMDGPU_CTX_STABLE_PSTATE_NONE; } - return 0; } static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority, @@ -383,9 +373,9 @@ static int __amdgpu_ctx_set_stable_pstate(struct amdgpu_ctx *ctx, if (current_ctx && current_ctx != ctx) return -EBUSY; - r = amdgpu_ctx_get_stable_pstate(ctx, &current_stable_pstate); - if (r || current_stable_pstate == stable_pstate) - return r; + current_stable_pstate = amdgpu_get_stable_pstate(adev); + if (current_stable_pstate == stable_pstate) + return 0; r = amdgpu_dpm_force_performance_level(adev, level); if (r) @@ -416,7 +406,7 @@ static int amdgpu_ctx_set_stable_pstate(struct amdgpu_ctx *ctx, return r; } -static void amdgpu_ctx_fini(struct kref *ref) +void amdgpu_ctx_fini(struct kref *ref) { struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount); struct amdgpu_ctx_mgr *mgr = ctx->mgr; @@ -504,53 +494,26 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev, if (!ctx) return -ENOMEM; - mutex_lock(&mgr->lock); - r = idr_alloc(&mgr->ctx_handles, ctx, 1, AMDGPU_VM_MAX_NUM_CTX, GFP_KERNEL); - if (r < 0) { - mutex_unlock(&mgr->lock); - kfree(ctx); - return r; - } - - *id = (uint32_t)r; r = amdgpu_ctx_init(mgr, priority, filp, ctx); if (r) { - idr_remove(&mgr->ctx_handles, *id); - *id = 0; kfree(ctx); + return r; } - mutex_unlock(&mgr->lock); - return r; -} - -static void amdgpu_ctx_do_release(struct kref *ref) -{ - struct amdgpu_ctx *ctx; - u32 i, j; - ctx = container_of(ref, struct amdgpu_ctx, refcount); - for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) { - for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) { - if (!ctx->entities[i][j]) - continue; - - drm_sched_entity_destroy(&ctx->entities[i][j]->entity); - } - } + r = xa_alloc(&mgr->ctx_handles, id, ctx, xa_limit_32b, GFP_KERNEL); + if (r) + amdgpu_ctx_put(ctx); - amdgpu_ctx_fini(ref); + return r; } static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id) { - struct
amdgpu_ctx_mgr *mgr = &fpriv->ctx_mgr; struct amdgpu_ctx *ctx; - mutex_lock(&mgr->lock); - ctx = idr_remove(&mgr->ctx_handles, id); - if (ctx) - kref_put(&ctx->refcount, amdgpu_ctx_do_release); - mutex_unlock(&mgr->lock); + ctx = xa_erase(&fpriv->ctx_mgr.ctx_handles, id); + amdgpu_ctx_put(ctx); + return ctx ? 0 : -EINVAL; } @@ -559,19 +522,11 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev, union drm_amdgpu_ctx_out *out) { struct amdgpu_ctx *ctx; - struct amdgpu_ctx_mgr *mgr; unsigned reset_counter; - if (!fpriv) - return -EINVAL; - - mgr = &fpriv->ctx_mgr; - mutex_lock(&mgr->lock); - ctx = idr_find(&mgr->ctx_handles, id); - if (!ctx) { - mutex_unlock(&mgr->lock); + ctx = amdgpu_ctx_get(fpriv, id); + if (!ctx) return -EINVAL; - } /* TODO: these two are always zero */ out->state.flags = 0x0; @@ -586,7 +541,8 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev, out->state.reset_status = AMDGPU_CTX_UNKNOWN_RESET; ctx->reset_counter_query = reset_counter; - mutex_unlock(&mgr->lock); + amdgpu_ctx_put(ctx); + return 0; } @@ -619,18 +575,10 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ctx *ctx; - struct amdgpu_ctx_mgr *mgr; - if (!fpriv) - return -EINVAL; - - mgr = &fpriv->ctx_mgr; - mutex_lock(&mgr->lock); - ctx = idr_find(&mgr->ctx_handles, id); - if (!ctx) { - mutex_unlock(&mgr->lock); + ctx = amdgpu_ctx_get(fpriv, id); + if (!ctx) return -EINVAL; - } out->state.flags = 0x0; out->state.hangs = 0x0; @@ -671,7 +619,8 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, msecs_to_jiffies(AMDGPU_RAS_COUNTE_DELAY_MS)); } - mutex_unlock(&mgr->lock); + amdgpu_ctx_put(ctx); + return 0; } @@ -680,26 +629,26 @@ static int amdgpu_ctx_stable_pstate(struct amdgpu_device *adev, bool set, u32 *stable_pstate) { struct amdgpu_ctx *ctx; - struct amdgpu_ctx_mgr *mgr; - int r; + int r = 0; - if (!fpriv) + ctx = amdgpu_ctx_get(fpriv, id); + if (!ctx) return -EINVAL; - mgr = 
&fpriv->ctx_mgr; - mutex_lock(&mgr->lock); - ctx = idr_find(&mgr->ctx_handles, id); - if (!ctx) { - mutex_unlock(&mgr->lock); - return -EINVAL; - } + /* + * The get path is odd in this uapi - it will check whether the context + * id exist, but otherwise does nothing with it. In other words, the + * uapi has historically been implemented as being able to query the + * global device state, as long as the caller supplies a random valid + * context id. + */ if (set) r = amdgpu_ctx_set_stable_pstate(ctx, *stable_pstate); else - r = amdgpu_ctx_get_stable_pstate(ctx, stable_pstate); + *stable_pstate = amdgpu_get_stable_pstate(adev); - mutex_unlock(&mgr->lock); + amdgpu_ctx_put(ctx); return r; } @@ -778,23 +727,14 @@ struct amdgpu_ctx *amdgpu_ctx_get(struct amdgpu_fpriv *fpriv, uint32_t id) mgr = &fpriv->ctx_mgr; - mutex_lock(&mgr->lock); - ctx = idr_find(&mgr->ctx_handles, id); + xa_lock(&mgr->ctx_handles); + ctx = xa_load(&mgr->ctx_handles, id); if (ctx) kref_get(&ctx->refcount); - mutex_unlock(&mgr->lock); + xa_unlock(&mgr->ctx_handles); return ctx; } -int amdgpu_ctx_put(struct amdgpu_ctx *ctx) -{ - if (ctx == NULL) - return -EINVAL; - - kref_put(&ctx->refcount, amdgpu_ctx_do_release); - return 0; -} - uint64_t amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct drm_sched_entity *entity, struct dma_fence *fence) @@ -928,8 +868,7 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr, unsigned int i; mgr->adev = adev; - mutex_init(&mgr->lock); - idr_init_base(&mgr->ctx_handles, 1); + xa_init_flags(&mgr->ctx_handles, XA_FLAGS_ALLOC1); for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) atomic64_set(&mgr->time_spend[i], 0); @@ -938,13 +877,13 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr, long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout) { struct amdgpu_ctx *ctx; - struct idr *idp; - uint32_t id, i, j; + unsigned long id; + int i, j; - idp = &mgr->ctx_handles; - - mutex_lock(&mgr->lock); - idr_for_each_entry(idp, ctx, id) { + 
xa_lock(&mgr->ctx_handles); + xa_for_each(&mgr->ctx_handles, id, ctx) { + kref_get(&ctx->refcount); + xa_unlock(&mgr->ctx_handles); for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) { for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) { struct drm_sched_entity *entity; @@ -956,45 +895,21 @@ long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout) timeout = drm_sched_entity_flush(entity, timeout); } } + amdgpu_ctx_put(ctx); + xa_lock(&mgr->ctx_handles); } - mutex_unlock(&mgr->lock); + xa_unlock(&mgr->ctx_handles); return timeout; } -static void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr) +void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr) { struct amdgpu_ctx *ctx; - struct idr *idp; - uint32_t id, i, j; - - idp = &mgr->ctx_handles; - - idr_for_each_entry(idp, ctx, id) { - if (kref_read(&ctx->refcount) != 1) { - drm_err(adev_to_drm(mgr->adev), "ctx %p is still alive\n", ctx); - continue; - } + unsigned long id; - for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) { - for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) { - struct drm_sched_entity *entity; - - if (!ctx->entities[i][j]) - continue; - - entity = &ctx->entities[i][j]->entity; - drm_sched_entity_fini(entity); - } - } - kref_put(&ctx->refcount, amdgpu_ctx_fini); - } -} - -void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr) -{ - amdgpu_ctx_mgr_entity_fini(mgr); - idr_destroy(&mgr->ctx_handles); - mutex_destroy(&mgr->lock); + xa_for_each(&mgr->ctx_handles, id, ctx) + amdgpu_ctx_put(ctx); + xa_destroy(&mgr->ctx_handles); } void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr, @@ -1002,21 +917,21 @@ void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr, { struct amdgpu_ctx *ctx; unsigned int hw_ip, i; - uint32_t id; + unsigned long id; /* * This is a little bit racy because it can be that a ctx or a fence are * destroyed just in the moment we try to account them. But that is ok * since exactly that case is explicitely allowed by the interface. 
*/ - mutex_lock(&mgr->lock); for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) { uint64_t ns = atomic64_read(&mgr->time_spend[hw_ip]); usage[hw_ip] = ns_to_ktime(ns); } - idr_for_each_entry(&mgr->ctx_handles, ctx, id) { + xa_lock(&mgr->ctx_handles); + xa_for_each(&mgr->ctx_handles, id, ctx) { for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) { for (i = 0; i < amdgpu_ctx_num_entities[hw_ip]; ++i) { struct amdgpu_ctx_entity *centity; @@ -1030,5 +945,5 @@ void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr, } } } - mutex_unlock(&mgr->lock); + xa_unlock(&mgr->ctx_handles); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h index e444b2088d40..a4b89eca4169 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h @@ -25,6 +25,7 @@ #include <linux/ktime.h> #include <linux/types.h> +#include <linux/xarray.h> #include "amdgpu_ring.h" @@ -60,16 +61,21 @@ struct amdgpu_ctx { struct amdgpu_ctx_mgr { struct amdgpu_device *adev; - struct mutex lock; - /* protected by lock */ - struct idr ctx_handles; + struct xarray ctx_handles; atomic64_t time_spend[AMDGPU_HW_IP_NUM]; }; extern const unsigned int amdgpu_ctx_num_entities[AMDGPU_HW_IP_NUM]; struct amdgpu_ctx *amdgpu_ctx_get(struct amdgpu_fpriv *fpriv, uint32_t id); -int amdgpu_ctx_put(struct amdgpu_ctx *ctx); + +void amdgpu_ctx_fini(struct kref *kref); + +static inline void amdgpu_ctx_put(struct amdgpu_ctx *ctx) +{ + if (ctx) + kref_put(&ctx->refcount, amdgpu_ctx_fini); +} int amdgpu_ctx_get_entity(struct amdgpu_ctx *ctx, u32 hw_ip, u32 instance, u32 ring, struct drm_sched_entity **entity); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 389bad724273..0455c2cd043f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -26,6 +26,7 @@ #include <linux/kthread.h> #include <linux/pci.h> #include <linux/uaccess.h> +#include 
<linux/security.h> #include <linux/pm_runtime.h> #include "amdgpu.h" @@ -1739,6 +1740,12 @@ int amdgpu_debugfs_regs_init(struct amdgpu_device *adev) struct dentry *ent, *root = minor->debugfs_root; unsigned int i; + if (security_locked_down(LOCKDOWN_PCI_ACCESS)) { + drm_info(adev_to_drm(adev), + "amdgpu: HW debugfs nodes disabled (kernel lockdown)\n"); + return 0; + } + for (i = 0; i < ARRAY_SIZE(debugfs_regs); i++) { ent = debugfs_create_file(debugfs_regs_names[i], S_IFREG | 0400, root, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index e77db76b48b8..4fd0df3aa70d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -64,6 +64,7 @@ const char *hw_ip_names[MAX_HWIP] = { [VCN1_HWIP] = "VCN1", [VCE_HWIP] = "VCE", [VPE_HWIP] = "VPE", + [UMSCH_HWIP] = "UMSCH", [DF_HWIP] = "DF", [DCE_HWIP] = "DCE", [OSSSYS_HWIP] = "OSSSYS", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8d6502a94306..78c96c7102e4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -74,6 +74,7 @@ #include "amdgpu_ras.h" #include "amdgpu_ras_mgr.h" #include "amdgpu_pmu.h" +#include "amdgpu_smu.h" #include "amdgpu_fru_eeprom.h" #include "amdgpu_reset.h" #include "amdgpu_virt.h" @@ -2130,6 +2131,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) adev->cg_flags &= amdgpu_cg_mask; adev->pg_flags &= amdgpu_pg_mask; + amdgpu_smu_early_init(adev); + return 0; } @@ -3677,6 +3680,10 @@ static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev) amdgpu_pm_sysfs_fini(adev); if (adev->ucode_sysfs_en) amdgpu_ucode_sysfs_fini(adev); + + amdgpu_discovery_sysfs_fini(adev); + amdgpu_preempt_mgr_sysfs_fini(adev); + amdgpu_device_attr_sysfs_fini(adev); amdgpu_fru_sysfs_fini(adev); @@ -3773,6 +3780,7 @@ int amdgpu_device_init(struct amdgpu_device 
*adev, spin_lock_init(&adev->irq.lock); + amdgpu_early_init_rlc_reg_funcs(adev); amdgpu_device_init_apu_flags(adev); r = amdgpu_device_check_arguments(adev); @@ -4208,6 +4216,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) if (adev->mman.initialized) drain_workqueue(adev->mman.bdev.wq); + adev->shutdown = true; unregister_pm_notifier(&adev->pm_nb); @@ -4707,161 +4716,6 @@ exit: } /** - * amdgpu_device_ip_check_soft_reset - did soft reset succeed - * - * @adev: amdgpu_device pointer - * - * The list of all the hardware IPs that make up the asic is walked and - * the check_soft_reset callbacks are run. check_soft_reset determines - * if the asic is still hung or not. - * Returns true if any of the IPs are still in a hung state, false if not. - */ -static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) -{ - int i; - bool asic_hang = false; - - if (amdgpu_sriov_vf(adev)) - return true; - - if (amdgpu_asic_need_full_reset(adev)) - return true; - - for (i = 0; i < adev->num_ip_blocks; i++) { - if (!adev->ip_blocks[i].status.valid) - continue; - if (adev->ip_blocks[i].version->funcs->check_soft_reset) - adev->ip_blocks[i].status.hang = - adev->ip_blocks[i].version->funcs->check_soft_reset( - &adev->ip_blocks[i]); - if (adev->ip_blocks[i].status.hang) { - dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); - asic_hang = true; - } - } - return asic_hang; -} - -/** - * amdgpu_device_ip_pre_soft_reset - prepare for soft reset - * - * @adev: amdgpu_device pointer - * - * The list of all the hardware IPs that make up the asic is walked and the - * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset - * handles any IP specific hardware or software state changes that are - * necessary for a soft reset to succeed. - * Returns 0 on success, negative error code on failure. 
- */ -static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) -{ - int i, r = 0; - - for (i = 0; i < adev->num_ip_blocks; i++) { - if (!adev->ip_blocks[i].status.valid) - continue; - if (adev->ip_blocks[i].status.hang && - adev->ip_blocks[i].version->funcs->pre_soft_reset) { - r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); - if (r) - return r; - } - } - - return 0; -} - -/** - * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed - * - * @adev: amdgpu_device pointer - * - * Some hardware IPs cannot be soft reset. If they are hung, a full gpu - * reset is necessary to recover. - * Returns true if a full asic reset is required, false if not. - */ -static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) -{ - int i; - - if (amdgpu_asic_need_full_reset(adev)) - return true; - - for (i = 0; i < adev->num_ip_blocks; i++) { - if (!adev->ip_blocks[i].status.valid) - continue; - if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || - (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || - (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || - (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { - if (adev->ip_blocks[i].status.hang) { - dev_info(adev->dev, "Some block need full reset!\n"); - return true; - } - } - } - return false; -} - -/** - * amdgpu_device_ip_soft_reset - do a soft reset - * - * @adev: amdgpu_device pointer - * - * The list of all the hardware IPs that make up the asic is walked and the - * soft_reset callbacks are run if the block is hung. soft_reset handles any - * IP specific hardware or software state changes that are necessary to soft - * reset the IP. - * Returns 0 on success, negative error code on failure. 
- */ -static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) -{ - int i, r = 0; - - for (i = 0; i < adev->num_ip_blocks; i++) { - if (!adev->ip_blocks[i].status.valid) - continue; - if (adev->ip_blocks[i].status.hang && - adev->ip_blocks[i].version->funcs->soft_reset) { - r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); - if (r) - return r; - } - } - - return 0; -} - -/** - * amdgpu_device_ip_post_soft_reset - clean up from soft reset - * - * @adev: amdgpu_device pointer - * - * The list of all the hardware IPs that make up the asic is walked and the - * post_soft_reset callbacks are run if the asic was hung. post_soft_reset - * handles any IP specific hardware or software state changes that are - * necessary after the IP has been soft reset. - * Returns 0 on success, negative error code on failure. - */ -static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) -{ - int i, r = 0; - - for (i = 0; i < adev->num_ip_blocks; i++) { - if (!adev->ip_blocks[i].status.valid) - continue; - if (adev->ip_blocks[i].status.hang && - adev->ip_blocks[i].version->funcs->post_soft_reset) - r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); - if (r) - return r; - } - - return 0; -} - -/** * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf * * @adev: amdgpu_device pointer @@ -5152,20 +5006,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ if (!amdgpu_sriov_vf(adev)) { - - if (!need_full_reset) - need_full_reset = amdgpu_device_ip_need_full_reset(adev); - - if (!need_full_reset && amdgpu_gpu_recovery && - amdgpu_device_ip_check_soft_reset(adev)) { - amdgpu_device_ip_pre_soft_reset(adev); - r = amdgpu_device_ip_soft_reset(adev); - amdgpu_device_ip_post_soft_reset(adev); - if (r || amdgpu_device_ip_check_soft_reset(adev)) { - dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); - need_full_reset = 
true; - } - } + need_full_reset = true; if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { dev_info(tmp_adev->dev, "Dumping IP State\n"); @@ -5618,8 +5459,7 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev, drm_client_dev_suspend(adev_to_drm(tmp_adev)); /* disable ras on ALL IPs */ - if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && - amdgpu_device_ip_need_full_reset(tmp_adev)) + if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev)) amdgpu_ras_suspend(tmp_adev); amdgpu_userq_pre_reset(tmp_adev); @@ -6891,7 +6731,7 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) if (unlikely(!ring->adev->debug_disable_soft_recovery) && !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) - size |= AMDGPU_RESET_TYPE_SOFT_RESET; + size |= AMDGPU_RESET_TYPE_SOFT_RECOVERY; return size; } @@ -6907,8 +6747,8 @@ ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) } - if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) - size += sysfs_emit_at(buf, size, "soft "); + if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RECOVERY) + size += sysfs_emit_at(buf, size, "soft_recovery "); if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) size += sysfs_emit_at(buf, size, "queue "); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index 853365dee2a7..a015d55aa158 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -22,6 +22,7 @@ */ #include <linux/firmware.h> +#include <linux/kernfs.h> #include "amdgpu.h" #include "amdgpu_discovery.h" @@ -148,6 +149,26 @@ MODULE_FIRMWARE("amdgpu/aldebaran_ip_discovery.bin"); #define mmDRIVER_SCRATCH_1 0x95 #define mmDRIVER_SCRATCH_2 0x96 +struct ip_discovery_top { + struct kobject kobj; + struct kset die_kset; + struct pci_dev *pdev; + struct amdgpu_device *adev; + uint8_t *discovery_bin; + uint32_t bin_size; + bool standalone_mode; +}; + +/* List to track 
early-initialized ip_discovery_top entries */ +struct early_ip_discovery { + struct list_head list; + struct pci_dev *pdev; + struct ip_discovery_top *ip_top; +}; + +static LIST_HEAD(early_ip_discovery_list); +static DEFINE_MUTEX(early_ip_discovery_mutex); + static const char *hw_id_names[HW_ID_MAX] = { [MP1_HWID] = "MP1", [MP2_HWID] = "MP2", @@ -226,6 +247,7 @@ static const char *hw_id_names[HW_ID_MAX] = { [XGBE_HWID] = "XGBE", [MP0_HWID] = "MP0", [VPE_HWID] = "VPE", + [UMSCH_HWID] = "UMSCH", [ATU_HWID] = "ATU", [AIGC_HWID] = "AIGC", }; @@ -258,6 +280,7 @@ static int hw_id_map[MAX_HWIP] = { [DCI_HWIP] = DCI_HWID, [PCIE_HWIP] = PCIE_HWID, [VPE_HWIP] = VPE_HWID, + [UMSCH_HWIP] = UMSCH_HWID, [ISP_HWIP] = ISP_HWID, [ATU_HWIP] = ATU_HWID, }; @@ -542,25 +565,37 @@ static const char *amdgpu_discovery_get_fw_name(struct amdgpu_device *adev) } } -static int amdgpu_discovery_get_table_info(struct amdgpu_device *adev, - struct table_info **info, - uint16_t table_id) +static struct table_info * +amdgpu_discovery_get_table_info_from_bin(uint8_t *discovery_bin, + uint16_t table_id) { - struct binary_header *bhdr = - (struct binary_header *)adev->discovery.bin; + struct binary_header *bhdr = (struct binary_header *)discovery_bin; struct binary_header_v2 *bhdrv2; switch (bhdr->version_major) { case 2: - bhdrv2 = (struct binary_header_v2 *)adev->discovery.bin; - *info = &bhdrv2->table_list[table_id]; - break; + bhdrv2 = (struct binary_header_v2 *)discovery_bin; + return &bhdrv2->table_list[table_id]; case 1: case 0: - *info = &bhdr->table_list[table_id]; - break; + return &bhdr->table_list[table_id]; default: - dev_err(adev->dev, "Invalid ip discovery table version %d\n",bhdr->version_major); + return NULL; + } +} + +static int amdgpu_discovery_get_table_info(struct amdgpu_device *adev, + struct table_info **info, + uint16_t table_id) +{ + struct binary_header *bhdr = + (struct binary_header *)adev->discovery.bin; + + *info = 
amdgpu_discovery_get_table_info_from_bin(adev->discovery.bin, + table_id); + if (!*info) { + dev_err(adev->dev, "Invalid ip discovery table version %d\n", + bhdr->version_major); return -EINVAL; } @@ -724,11 +759,11 @@ out: return r; } -static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev); - void amdgpu_discovery_fini(struct amdgpu_device *adev) { - amdgpu_discovery_sysfs_fini(adev); + if (adev->discovery.ip_top && !adev->discovery.ip_top->standalone_mode) + amdgpu_discovery_sysfs_fini(adev); + kfree(adev->discovery.bin); adev->discovery.bin = NULL; } @@ -737,15 +772,17 @@ static int amdgpu_discovery_validate_ip(struct amdgpu_device *adev, uint8_t instance, uint16_t hw_id) { if (instance >= HWIP_MAX_INSTANCE) { - dev_err(adev->dev, - "Unexpected instance_number (%d) from ip discovery blob\n", - instance); + if (adev) + dev_err(adev->dev, + "Unexpected instance_number (%d) from ip discovery blob\n", + instance); return -EINVAL; } if (hw_id >= HW_ID_MAX) { - dev_err(adev->dev, - "Unexpected hw_id (%d) from ip discovery blob\n", - hw_id); + if (adev) + dev_err(adev->dev, + "Unexpected hw_id (%d) from ip discovery blob\n", + hw_id); return -EINVAL; } @@ -1111,12 +1148,6 @@ static const struct kobj_type ip_discovery_ktype = { .sysfs_ops = &kobj_sysfs_ops, }; -struct ip_discovery_top { - struct kobject kobj; /* ip_discovery/ */ - struct kset die_kset; /* ip_discovery/die/, contains ip_die_entry */ - struct amdgpu_device *adev; -}; - static void die_kobj_release(struct kobject *kobj) { struct ip_discovery_top *ip_top = container_of(to_kset(kobj), @@ -1132,8 +1163,14 @@ static void ip_disc_release(struct kobject *kobj) kobj); struct amdgpu_device *adev = ip_top->adev; + /* In standalone mode, discovery_bin is managed by devm and will be + * freed automatically when the PCI device is removed. Do not manually + * free it here to avoid double-free. 
+ */ + kfree(ip_top); - adev->discovery.ip_top = NULL; + if (adev) + adev->discovery.ip_top = NULL; } static uint8_t amdgpu_discovery_get_harvest_info(struct amdgpu_device *adev, @@ -1141,6 +1178,10 @@ static uint8_t amdgpu_discovery_get_harvest_info(struct amdgpu_device *adev, { uint8_t harvest = 0; + /* In early init mode (adev == NULL), harvest info is not available */ + if (!adev) + return 0; + /* Until a uniform way is figured, get mask based on hwid */ switch (hw_id) { case VCN_HWID: @@ -1169,11 +1210,14 @@ static uint8_t amdgpu_discovery_get_harvest_info(struct amdgpu_device *adev, } static int amdgpu_discovery_sysfs_ips(struct amdgpu_device *adev, + struct ip_discovery_top *ip_top, struct ip_die_entry *ip_die_entry, const size_t _ip_offset, const int num_ips, bool reg_base_64) { - uint8_t *discovery_bin = adev->discovery.bin; + uint8_t *discovery_bin = ip_top->standalone_mode ? + ip_top->discovery_bin : + adev->discovery.bin; int ii, jj, kk, res; uint16_t hw_id; uint8_t inst; @@ -1270,10 +1314,12 @@ next_ip: return 0; } -static int amdgpu_discovery_sysfs_recurse(struct amdgpu_device *adev) +static int amdgpu_discovery_sysfs_recurse(struct amdgpu_device *adev, + struct ip_discovery_top *ip_top) { - struct ip_discovery_top *ip_top = adev->discovery.ip_top; - uint8_t *discovery_bin = adev->discovery.bin; + uint8_t *discovery_bin = ip_top->standalone_mode ? 
+ ip_top->discovery_bin : + adev->discovery.bin; struct table_info *info; struct ip_discovery_header *ihdr; struct die_header *dhdr; @@ -1282,9 +1328,10 @@ static int amdgpu_discovery_sysfs_recurse(struct amdgpu_device *adev) size_t ip_offset; int ii, res; - res = amdgpu_discovery_get_table_info(adev, &info, IP_DISCOVERY); - if (res) - return res; + info = amdgpu_discovery_get_table_info_from_bin(discovery_bin, + IP_DISCOVERY); + if (!info) + return -EINVAL; ihdr = (struct ip_discovery_header *)(discovery_bin + le16_to_cpu(info->offset)); @@ -1322,7 +1369,8 @@ static int amdgpu_discovery_sysfs_recurse(struct amdgpu_device *adev) return res; } - amdgpu_discovery_sysfs_ips(adev, ip_die_entry, ip_offset, num_ips, !!ihdr->base_addr_64_bit); + amdgpu_discovery_sysfs_ips(adev, ip_top, ip_die_entry, ip_offset, + num_ips, !!ihdr->base_addr_64_bit); } return 0; @@ -1338,12 +1386,30 @@ static int amdgpu_discovery_sysfs_init(struct amdgpu_device *adev) if (!discovery_bin) return -EINVAL; + /* If early init already created sysfs in standalone mode, skip normal init */ + if (adev->discovery.ip_top && adev->discovery.ip_top->standalone_mode) + return 0; + ip_top = kzalloc_obj(*ip_top); if (!ip_top) return -ENOMEM; ip_top->adev = adev; - adev->discovery.ip_top = ip_top; + + /* Check if ip_discovery already exists before creating. + * This shouldn't normally happen but handle it gracefully. 
+ */ + if (adev->dev->kobj.sd) { + struct kernfs_node *existing; + + existing = kernfs_find_and_get(adev->dev->kobj.sd, "ip_discovery"); + if (existing) { + kernfs_put(existing); + kfree(ip_top); + return 0; + } + } + res = kobject_init_and_add(&ip_top->kobj, &ip_discovery_ktype, &adev->dev->kobj, "ip_discovery"); if (res) { @@ -1351,6 +1417,8 @@ static int amdgpu_discovery_sysfs_init(struct amdgpu_device *adev) goto Err; } + adev->discovery.ip_top = ip_top; + die_kset = &ip_top->die_kset; kobject_set_name(&die_kset->kobj, "%s", "die"); die_kset->kobj.parent = &ip_top->kobj; @@ -1365,7 +1433,7 @@ static int amdgpu_discovery_sysfs_init(struct amdgpu_device *adev) ip_hw_instance_attrs[ii] = &ip_hw_attr[ii].attr; ip_hw_instance_attrs[ii] = NULL; - res = amdgpu_discovery_sysfs_recurse(adev); + res = amdgpu_discovery_sysfs_recurse(adev, ip_top); return res; Err: @@ -1412,7 +1480,7 @@ static void amdgpu_discovery_sysfs_die_free(struct ip_die_entry *ip_die_entry) kobject_put(&ip_die_entry->ip_kset.kobj); } -static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev) +void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev) { struct ip_discovery_top *ip_top = adev->discovery.ip_top; struct list_head *el, *tmp; @@ -1421,6 +1489,16 @@ static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev) if (!ip_top) return; + /* + * In standalone mode the sysfs hierarchy is tied to the PCI device + * lifetime and is torn down by amdgpu_discovery_sysfs_early_fini(). + * Freeing it here would leave a dangling pointer in the early + * discovery list, causing a use-after-free on driver unbind. 
+ */ + if (ip_top->standalone_mode) + return; + + adev->discovery.ip_top = NULL; die_kset = &ip_top->die_kset; spin_lock(&die_kset->list_lock); list_for_each_prev_safe(el, tmp, &die_kset->list) { @@ -1479,6 +1557,150 @@ void amdgpu_discovery_dump(struct amdgpu_device *adev, struct drm_printer *p) spin_unlock(&die_kset->list_lock); } +int amdgpu_discovery_sysfs_early_init(struct amdgpu_device *adev, struct pci_dev *pdev) +{ + struct ip_discovery_top *ip_top; + struct early_ip_discovery *early_entry, *tmp; + struct kset *die_kset; + uint8_t *discovery_bin; + int res, ii; + + if (!adev || !adev->discovery.bin) + return -EINVAL; + + if (adev->discovery.ip_top) + return 0; + + mutex_lock(&early_ip_discovery_mutex); + list_for_each_entry_safe(early_entry, tmp, &early_ip_discovery_list, list) { + if (early_entry->pdev == pdev) { + adev->discovery.ip_top = early_entry->ip_top; + early_entry->ip_top->adev = adev; + mutex_unlock(&early_ip_discovery_mutex); + return 0; + } + } + mutex_unlock(&early_ip_discovery_mutex); + + discovery_bin = adev->discovery.bin; + + early_entry = kzalloc(sizeof(*early_entry), GFP_KERNEL); + if (!early_entry) + return -ENOMEM; + + ip_top = kzalloc(sizeof(*ip_top), GFP_KERNEL); + if (!ip_top) { + kfree(early_entry); + return -ENOMEM; + } + + ip_top->discovery_bin = devm_kmemdup(&pdev->dev, discovery_bin, + DISCOVERY_TMR_SIZE, GFP_KERNEL); + if (!ip_top->discovery_bin) { + kfree(ip_top); + kfree(early_entry); + return -ENOMEM; + } + + ip_top->bin_size = DISCOVERY_TMR_SIZE; + ip_top->pdev = pdev; + ip_top->adev = adev; + ip_top->standalone_mode = true; + + /* Check if ip_discovery already exists (from previous probe attempt). + * This can happen if the module was unloaded and reloaded but the + * sysfs persisted (tied to PCI device lifetime). 
+ */ + if (pdev->dev.kobj.sd) { + struct kernfs_node *existing; + + existing = kernfs_find_and_get(pdev->dev.kobj.sd, "ip_discovery"); + if (existing) { + kernfs_put(existing); + kfree(ip_top); + kfree(early_entry); + return 0; + } + } + + res = kobject_init_and_add(&ip_top->kobj, &ip_discovery_ktype, + &pdev->dev.kobj, "ip_discovery"); + if (res) + goto err_put_kobj; + + adev->discovery.ip_top = ip_top; + + die_kset = &ip_top->die_kset; + kobject_set_name(&die_kset->kobj, "%s", "die"); + die_kset->kobj.parent = &ip_top->kobj; + die_kset->kobj.ktype = &die_kobj_ktype; + res = kset_register(&ip_top->die_kset); + if (res) + goto err_put_die_kset; + + for (ii = 0; ii < ARRAY_SIZE(ip_hw_attr); ii++) + ip_hw_instance_attrs[ii] = &ip_hw_attr[ii].attr; + ip_hw_instance_attrs[ii] = NULL; + + res = amdgpu_discovery_sysfs_recurse(NULL, ip_top); + if (res) + goto err_put_die_kset; + + early_entry->pdev = pdev; + early_entry->ip_top = ip_top; + mutex_lock(&early_ip_discovery_mutex); + list_add(&early_entry->list, &early_ip_discovery_list); + mutex_unlock(&early_ip_discovery_mutex); + + return 0; + +err_put_die_kset: + kobject_put(&ip_top->die_kset.kobj); +err_put_kobj: + kobject_put(&ip_top->kobj); + kfree(early_entry); + adev->discovery.ip_top = NULL; + return res; +} + +void amdgpu_discovery_sysfs_early_fini(struct pci_dev *pdev) +{ + struct early_ip_discovery *entry, *tmp_entry; + struct ip_discovery_top *ip_top = NULL; + struct list_head *el, *tmp; + struct kset *die_kset; + + /* Find the entry in our tracking list */ + mutex_lock(&early_ip_discovery_mutex); + list_for_each_entry_safe(entry, tmp_entry, &early_ip_discovery_list, list) { + if (entry->pdev == pdev) { + ip_top = entry->ip_top; + list_del(&entry->list); + kfree(entry); + break; + } + } + mutex_unlock(&early_ip_discovery_mutex); + + if (!ip_top) + return; + + /* Clean up sysfs hierarchy */ + die_kset = &ip_top->die_kset; + + spin_lock(&die_kset->list_lock); + list_for_each_prev_safe(el, tmp, &die_kset->list) { + 
list_del_init(el); + spin_unlock(&die_kset->list_lock); + amdgpu_discovery_sysfs_die_free(to_ip_die_entry(list_to_kobj(el))); + spin_lock(&die_kset->list_lock); + } + spin_unlock(&die_kset->list_lock); + + kobject_put(&ip_top->die_kset.kobj); + kobject_put(&ip_top->kobj); + /* ip_top itself will be freed by kobject_put via ip_disc_release */ +} /* ================================================== */ @@ -1504,6 +1726,9 @@ static int amdgpu_discovery_reg_base_init(struct amdgpu_device *adev) r = amdgpu_discovery_init(adev); if (r) return r; + + amdgpu_discovery_sysfs_early_init(adev, adev->pdev); + discovery_bin = adev->discovery.bin; wafl_ver = 0; adev->gfx.xcc_mask = 0; @@ -2636,7 +2861,12 @@ static int amdgpu_discovery_set_mm_ip_blocks(struct amdgpu_device *adev) return -EINVAL; } } else { - switch (amdgpu_ip_version(adev, UVD_HWIP, 0)) { + uint32_t vcn_version = amdgpu_ip_version(adev, UVD_HWIP, 0); + + /* no VCN discovered; nothing to add */ + if (!vcn_version) + return 0; + switch (vcn_version) { case IP_VERSION(1, 0, 0): case IP_VERSION(1, 0, 1): amdgpu_device_ip_block_add(adev, &vcn_v1_0_ip_block); @@ -2704,7 +2934,7 @@ static int amdgpu_discovery_set_mm_ip_blocks(struct amdgpu_device *adev) default: dev_err(adev->dev, "Failed to add vcn/jpeg ip block(UVD_HWIP:0x%x)\n", - amdgpu_ip_version(adev, UVD_HWIP, 0)); + vcn_version); return -EINVAL; } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h index e0010f6a3eda..5b2b16f68576 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h @@ -41,6 +41,7 @@ struct amdgpu_discovery_info { bool reserve_tmr; }; +void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev); void amdgpu_discovery_fini(struct amdgpu_device *adev); int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev); @@ -53,4 +54,9 @@ int amdgpu_discovery_get_gc_major_minor_version(struct amdgpu_device *adev, void 
amdgpu_discovery_dump(struct amdgpu_device *adev, struct drm_printer *p); +/* Early sysfs functions for persistent ip_discovery export */ +int amdgpu_discovery_sysfs_early_init(struct amdgpu_device *adev, + struct pci_dev *pdev); +void amdgpu_discovery_sysfs_early_fini(struct pci_dev *pdev); + #endif /* __AMDGPU_DISCOVERY__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 4c0c77eafbd1..ad631ad31899 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -33,7 +33,6 @@ #include <drm/drm_vblank.h> #include <linux/cc_platform.h> -#include <linux/console.h> #include <linux/dynamic_debug.h> #include <linux/module.h> #include <linux/mmu_notifier.h> @@ -146,7 +145,9 @@ enum AMDGPU_DEBUG_MASK { AMDGPU_DEBUG_SMU_POOL = BIT(7), AMDGPU_DEBUG_VM_USERPTR = BIT(8), AMDGPU_DEBUG_DISABLE_RAS_CE_LOG = BIT(9), - AMDGPU_DEBUG_ENABLE_CE_CS = BIT(10) + AMDGPU_DEBUG_ENABLE_CE_CS = BIT(10), + AMDGPU_DEBUG_HIBERNATION_THAW_RESUME_GPU = BIT(11), + AMDGPU_DEBUG_DISABLE_IP_BLOCK_SOFT_RESET = BIT(12), }; unsigned int amdgpu_vram_limit = UINT_MAX; @@ -1939,6 +1940,7 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x6646, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE|AMD_IS_MOBILITY}, {0x1002, 0x6647, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE|AMD_IS_MOBILITY}, {0x1002, 0x6649, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE}, + {0x1002, 0x664D, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE}, {0x1002, 0x6650, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE}, {0x1002, 0x6651, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE}, {0x1002, 0x6658, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BONAIRE}, @@ -2008,6 +2010,7 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x6930, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TONGA}, {0x1002, 0x6938, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TONGA}, {0x1002, 0x6939, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TONGA}, + {0x1002, 0x693B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TONGA}, /* fiji 
*/ {0x1002, 0x7300, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_FIJI}, {0x1002, 0x730F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_FIJI}, @@ -2036,6 +2039,7 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x67C4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, {0x1002, 0x67C7, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, {0x1002, 0x67D0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, + {0x1002, 0x67D4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, {0x1002, 0x67DF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, {0x1002, 0x67C8, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, {0x1002, 0x67C9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, @@ -2049,6 +2053,7 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x6985, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, {0x1002, 0x6986, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, {0x1002, 0x6987, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, + {0x1002, 0x698F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, {0x1002, 0x6995, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, {0x1002, 0x6997, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, {0x1002, 0x699F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, @@ -2250,7 +2255,7 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev) } if (amdgpu_debug_mask & AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY) { - pr_info("debug: soft reset for GPU recovery disabled\n"); + pr_info("debug: soft recovery disabled\n"); adev->debug_disable_soft_recovery = true; } @@ -2291,6 +2296,16 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev) pr_info("debug: allowing command submission to CE engine\n"); adev->debug_enable_ce_cs = true; } + + if (amdgpu_debug_mask & AMDGPU_DEBUG_HIBERNATION_THAW_RESUME_GPU) { + pr_info("debug: resume gpu in thaw() of hibernation\n"); + adev->debug_hibernation_thaw_resume_gpu = true; + } + + if (amdgpu_debug_mask & AMDGPU_DEBUG_DISABLE_IP_BLOCK_SOFT_RESET) { + pr_info("debug: IP block soft reset disabled\n"); + 
adev->debug_disable_ip_block_soft_reset = true; + } } static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags) @@ -2552,6 +2567,8 @@ amdgpu_pci_remove(struct pci_dev *pdev) amdgpu_driver_unload_kms(dev); + amdgpu_discovery_sysfs_early_fini(pdev); + /* * Flush any in flight DMA operations from device. * Clear the Bus Master Enable bit and then wait on the PCIe Device @@ -2705,9 +2722,10 @@ static int amdgpu_pmops_freeze(struct device *dev) static int amdgpu_pmops_thaw(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(drm_dev); /* do not resume device if it's normal hibernation */ - if (console_suspend_enabled && + if (!adev->debug_hibernation_thaw_resume_gpu && !pm_hibernate_is_recovering() && !pm_hibernation_mode_is_suspend()) return 0; @@ -3076,6 +3094,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES, amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_PROC_OPTIONS, amdgpu_proc_options_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), }; static const struct drm_driver amdgpu_kms_driver = { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index ea69b1bac7c6..3043ad041bb4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -727,6 +727,15 @@ void amdgpu_ring_set_fence_errors_and_reemit(struct amdgpu_ring *ring, last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask; seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask; + /* If there is nothing to reemit, return early and set an error on the fence + * if applicable. If all of the fences are signalled, this will be a nop.
+ * if there are still fences and ring_backup_entries_to_copy is 0, then + * we are skipping it on purpose. + */ + if (!ring->ring_backup_entries_to_copy) { + amdgpu_fence_driver_force_completion(ring, &guilty_fence->base); + return; + } ring->reemit = true; amdgpu_ring_alloc(ring, ring->ring_backup_entries_to_copy); spin_lock_irqsave(&ring->fence_drv.lock, flags); @@ -741,7 +750,8 @@ void amdgpu_ring_set_fence_errors_and_reemit(struct amdgpu_ring *ring, if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) { fence = container_of(unprocessed, struct amdgpu_fence, base); is_guilty_fence = fence == guilty_fence; - is_guilty_context = fence->context == guilty_fence->context; + is_guilty_context = guilty_fence ? + (fence->context == guilty_fence->context) : false; /* mark all fences from the guilty context with an error */ if (is_guilty_fence) @@ -794,6 +804,17 @@ void amdgpu_ring_backup_unprocessed_commands(struct amdgpu_ring *ring, seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask; ring->ring_backup_entries_to_copy = 0; + /* if we've already seen this fence, return early. + * ring->ring_backup_entries_to_copy is set to 0 so + * the reemit helper will return early as well to + * avoid getting stuck in a reemit loop. 
+ */ + if (ring->guilty_fence == guilty_fence) { + ring->guilty_fence = NULL; + return; + } + ring->guilty_fence = guilty_fence; + do { last_seq++; last_seq &= ring->fence_drv.num_fences_mask; @@ -811,6 +832,36 @@ void amdgpu_ring_backup_unprocessed_commands(struct amdgpu_ring *ring, } while (last_seq != seq); } +struct amdgpu_fence * +amdgpu_ring_find_guilty_fence(struct amdgpu_ring *ring) +{ + struct dma_fence *unprocessed; + struct dma_fence __rcu **ptr; + struct amdgpu_fence *fence; + u32 seq, last_seq; + + last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask; + seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask; + + do { + last_seq++; + last_seq &= ring->fence_drv.num_fences_mask; + + ptr = &ring->fence_drv.fences[last_seq]; + rcu_read_lock(); + unprocessed = rcu_dereference(*ptr); + + if (unprocessed && !dma_fence_is_signaled(unprocessed)) { + fence = container_of(unprocessed, struct amdgpu_fence, base); + rcu_read_unlock(); + return fence; + } + rcu_read_unlock(); + } while (last_seq != seq); + + return NULL; +} + /* * Common fence implementation */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 85372af1216d..96c9d4f00b27 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -34,6 +34,7 @@ #include "amdgpu_xcp.h" #include "amdgpu_xgmi.h" #include "amdgpu_mes.h" +#include "mes_userqueue.h" #include "nvd.h" /* delay 0.1 second to enable gfx off feature */ @@ -377,6 +378,30 @@ int amdgpu_gfx_kiq_init(struct amdgpu_device *adev, return 0; } +static void amdgpu_gfx_mqd_reset_restore(struct amdgpu_ring *ring) +{ + struct amdgpu_device *adev = ring->adev; + int mqd_idx, mqd_size; + + /* restore mqd with the backup copy */ + if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) { + mqd_idx = ring - &adev->gfx.compute_ring[0]; + mqd_size = adev->mqds[AMDGPU_HW_IP_COMPUTE].mqd_size; + if (adev->gfx.mec.mqd_backup[mqd_idx]) + 
memcpy_toio(ring->mqd_ptr, adev->gfx.mec.mqd_backup[mqd_idx], mqd_size); + } else if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) { + mqd_size = adev->mqds[AMDGPU_HW_IP_GFX].mqd_size; + mqd_idx = ring - &adev->gfx.gfx_ring[0]; + + if (adev->gfx.me.mqd_backup[mqd_idx]) + memcpy_toio(ring->mqd_ptr, adev->gfx.me.mqd_backup[mqd_idx], mqd_size); + } + /* reset the ring */ + ring->wptr = 0; + atomic64_set((atomic64_t *)ring->wptr_cpu_addr, 0); + amdgpu_ring_clear_ring(ring); +} + /* create MQD for each compute/gfx queue */ int amdgpu_gfx_mqd_sw_init(struct amdgpu_device *adev, unsigned int mqd_size, int xcc_id) @@ -1964,6 +1989,60 @@ static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev, return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset); } +static int amdgpu_gfx_mes_reset_queue_start(struct amdgpu_ring *ring, + unsigned int vmid, + struct amdgpu_fence *timedout_fence, + bool use_mmio) +{ + struct amdgpu_device *adev = ring->adev; + bool reinit_queue; + int r; + + if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) && + adev->mes.compute_pipe_reset_enabled) + reinit_queue = true; + else if ((ring->funcs->type == AMDGPU_RING_TYPE_GFX) && + adev->mes.gfx_pipe_reset_enabled) + reinit_queue = true; + else + reinit_queue = use_mmio; + + amdgpu_ring_reset_helper_begin(ring, timedout_fence); + + r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, use_mmio, 0); + if (r) + return r; + + if (reinit_queue) { + r = amdgpu_mes_unmap_legacy_queue(adev, ring, + RESET_QUEUES, 0, 0, 0); + if (r) + return r; + amdgpu_gfx_mqd_reset_restore(ring); + + r = amdgpu_mes_map_legacy_queue(adev, ring, 0); + if (r) { + dev_err(adev->dev, "failed to remap kgq\n"); + return r; + } + } + return 0; +} + +int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring, + unsigned int vmid, + struct amdgpu_fence *timedout_fence, + bool use_mmio) +{ + int r; + + r = amdgpu_gfx_mes_reset_queue_start(ring, vmid, timedout_fence, + use_mmio); + if (r) + return r; + return 
amdgpu_ring_reset_helper_end(ring, timedout_fence); +} + static DEVICE_ATTR(run_cleaner_shader, 0200, NULL, amdgpu_gfx_set_run_cleaner_shader); @@ -2122,6 +2201,200 @@ void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev) } } +static void amdgpu_gfx_reset_start_compute_scheds(struct amdgpu_device *adev, + struct amdgpu_ring *guilty_ring) +{ + struct amdgpu_ring *ring; + int i; + + for (i = 0; i < adev->gfx.num_compute_rings; i++) { + ring = &adev->gfx.compute_ring[i]; + if (ring == guilty_ring) + continue; + drm_sched_wqueue_start(&ring->sched); + } +} + +static void amdgpu_gfx_reset_stop_compute_scheds(struct amdgpu_device *adev, + struct amdgpu_ring *guilty_ring) +{ + struct amdgpu_ring *ring; + int i; + + for (i = 0; i < adev->gfx.num_compute_rings; i++) { + ring = &adev->gfx.compute_ring[i]; + if (ring == guilty_ring) + continue; + drm_sched_wqueue_stop(&ring->sched); + } +} + +/* + * Match the MES-reported hung doorbell against a compute ring and run + * the reset. On hit, the matched ring and its guilty fence are returned + * via *out_ring / *out_fence so the caller can defer reset end until + * after MES has resumed all gangs. 
+ */ +static int amdgpu_gfx_reset_mes_kcq(struct amdgpu_device *adev, + struct amdgpu_ring *guilty_ring, + unsigned int db, + struct amdgpu_ring **out_ring, + struct amdgpu_fence **out_fence) +{ + bool use_mmio = adev->gfx.mec.use_mmio_for_reset; + struct amdgpu_fence *fence; + struct amdgpu_ring *ring; + int i, r; + + *out_ring = NULL; + *out_fence = NULL; + for (i = 0; i < adev->gfx.num_compute_rings; i++) { + ring = &adev->gfx.compute_ring[i]; + if (ring == guilty_ring) + continue; + if (ring->doorbell_index == db) { + fence = amdgpu_ring_find_guilty_fence(ring); + r = amdgpu_gfx_mes_reset_queue_start(ring, 0, fence, + use_mmio); + if (r) + return r; + *out_ring = ring; + *out_fence = fence; + break; + } + } + return 0; +} + +int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev, + struct amdgpu_ring *ring, + struct amdgpu_fence *guilty_fence, + struct amdgpu_usermode_queue *uq, + unsigned int *hung_queue_count, + void *faulty_queue_input) +{ + struct amdgpu_mes_hung_queue_hqd_info *hqd_info = + (struct amdgpu_mes_hung_queue_hqd_info *) + &adev->gfx.mec.mes_hung_db_array[adev->mes.hung_queue_hqd_info_offset]; + int i, r, pipe, queue, queue_type; + unsigned int num_hung = 0; + bool use_mmio = adev->gfx.mec.use_mmio_for_reset; + struct mes_remove_queue_input *queue_input = (struct mes_remove_queue_input *)faulty_queue_input; + struct amdgpu_gfx_deferred_entry deferred_end[AMDGPU_MAX_COMPUTE_RINGS + 1]; + int n_deferred = 0; + int ring_err; + + guard(mutex)(&adev->gfx.mec.reset_mutex); + /* stop the drm schedulers for all compute queues */ + amdgpu_gfx_reset_stop_compute_scheds(adev, ring); + /* suspend all will determine which queues are hung. + * reset detect will return the array of bad queue doorbells + */ + r = amdgpu_mes_suspend(adev, 0); + /* if suspend all succeeds, there should be no hung queue */ + if (!r) + /* always reset the KCQ/userq since we need to signal the fence + * and we could be stuck in a loop which is preemptable.
+ */ + goto fence_reset; + r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE, + true, &num_hung, adev->gfx.mec.mes_hung_db_array, 0); + if (r) + goto out; + if (hung_queue_count) + *hung_queue_count = num_hung; + +fence_reset: + /* reset the queue this came from if specified */ + if (ring) { + r = amdgpu_gfx_mes_reset_queue_start(ring, 0, guilty_fence, + use_mmio); + if (r) + goto out; + deferred_end[n_deferred].ring = ring; + deferred_end[n_deferred].fence = guilty_fence; + n_deferred++; + } + if (uq) { + r = mes_userq_reset(uq); + if (r) + goto out; + } + for (i = 0; i < num_hung; i++) { + struct amdgpu_ring *hr = NULL; + struct amdgpu_fence *hf = NULL; + + pipe = hqd_info[i].pipe_index; + queue = hqd_info[i].queue_index; + queue_type = hqd_info[i].queue_type; + + /* reset any KCQs */ + r = amdgpu_gfx_reset_mes_kcq(adev, ring, + adev->gfx.mec.mes_hung_db_array[i], + &hr, &hf); + if (r) + goto out; + if (hr) { + deferred_end[n_deferred].ring = hr; + deferred_end[n_deferred].fence = hf; + n_deferred++; + } + /* reset any KFD queues */ + r = amdgpu_amdkfd_reset_mes_queue(adev, 0, queue_type, pipe, queue, + adev->gfx.mec.mes_hung_db_array[i]); + if (r) + goto out; + /* reset KGD user queues */ + r = mes_userq_reset_queue(adev, uq, queue_type, pipe, queue, + adev->gfx.mec.mes_hung_db_array[i]); + if (r) + goto out; + } + + /* MES doesn't detect any hung queue but we have a known bad queue + * and it is not KCQ + */ + if (!num_hung && queue_input && !ring) { + /* MES suspend_all is successful means this bad queue is + * preempted successfuly. 
Remove it before resume all so it + * doesn't get mapped back + */ + if (!down_read_trylock(&adev->reset_domain->sem)) { + r = -EIO; + goto out; + } + amdgpu_mes_lock(&adev->mes); + r = adev->mes.funcs->remove_hw_queue(&adev->mes, queue_input); + amdgpu_mes_unlock(&adev->mes); + up_read(&adev->reset_domain->sem); + } + +out: + /* resume all will enable the non-hung queues */ + amdgpu_mes_resume(adev, 0); + + /* Now CP is running again — replay backed-up commands and ring + * doorbells on each reset queue. + */ + ring_err = r; + for (i = 0; i < n_deferred; i++) { + int er = amdgpu_ring_reset_helper_end(deferred_end[i].ring, + deferred_end[i].fence); + + if (er && !ring_err) + ring_err = er; + } + + if (!ring_err) + amdgpu_gfx_reset_start_compute_scheds(adev, ring); + + /* If this reset is triggered by non-KCQ, the KCQ result after resume must + * not override the reset result; otherwise a false reset failure is returned + * to the non-KCQ caller + */ + return ring ? ring_err : r; +} + int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev, unsigned int cleaner_shader_size) { @@ -2460,9 +2733,8 @@ void amdgpu_gfx_profile_ring_begin_use(struct amdgpu_ring *ring) else profile = PP_SMC_POWER_PROFILE_COMPUTE; - atomic_inc(&adev->gfx.total_submission_cnt); - - cancel_delayed_work_sync(&adev->gfx.idle_work); + if (!atomic_fetch_inc(&adev->gfx.total_submission_cnt)) + cancel_delayed_work_sync(&adev->gfx.idle_work); /* We can safely return early here because we've cancelled the * the delayed work so there is no one else to set it to false @@ -2490,9 +2762,9 @@ void amdgpu_gfx_profile_ring_end_use(struct amdgpu_ring *ring) if (amdgpu_dpm_is_overdrive_enabled(adev)) return; - atomic_dec(&ring->adev->gfx.total_submission_cnt); - - schedule_delayed_work(&ring->adev->gfx.idle_work, GFX_PROFILE_IDLE_TIMEOUT); + if (atomic_dec_and_test(&ring->adev->gfx.total_submission_cnt)) + schedule_delayed_work(&ring->adev->gfx.idle_work, + GFX_PROFILE_IDLE_TIMEOUT); } /** diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 54c1eb9c499b..aefd4f03b443 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -36,6 +36,8 @@ #include "amdgpu_ring_mux.h" #include "amdgpu_xcp.h" +struct amdgpu_usermode_queue; + /* GFX current status */ #define AMDGPU_GFX_NORMAL_MODE 0x00000000L #define AMDGPU_GFX_SAFE_MODE 0x00000001L @@ -116,6 +118,9 @@ struct amdgpu_mec { u32 num_pipe_per_mec; u32 num_queue_per_pipe; void *mqd_backup[AMDGPU_MAX_COMPUTE_RINGS * AMDGPU_MAX_GC_INSTANCES]; + bool use_mmio_for_reset; + u32 *mes_hung_db_array; + struct mutex reset_mutex; }; struct amdgpu_mec_bitmap { @@ -401,6 +406,7 @@ struct amdgpu_me { uint32_t num_pipe_per_me; uint32_t num_queue_per_pipe; void *mqd_backup[AMDGPU_MAX_GFX_RINGS]; + bool use_mmio_for_reset; /* These are the resources for which amdgpu takes ownership */ DECLARE_BITMAP(queue_bitmap, AMDGPU_MAX_GFX_QUEUES); @@ -479,8 +485,6 @@ struct amdgpu_gfx { const struct amdgpu_gfx_funcs *funcs; /* reset mask */ - uint32_t grbm_soft_reset; - uint32_t srbm_soft_reset; uint32_t gfx_supported_reset; uint32_t compute_supported_reset; @@ -543,6 +547,11 @@ struct amdgpu_gfx { bool disable_uq; }; +struct amdgpu_gfx_deferred_entry { + struct amdgpu_ring *ring; + struct amdgpu_fence *fence; +}; + struct amdgpu_gfx_ras_reg_entry { struct amdgpu_ras_err_status_reg_entry reg_entry; enum amdgpu_gfx_ras_mem_id_type mem_id_type; @@ -641,6 +650,12 @@ int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev, bool amdgpu_gfx_is_master_xcc(struct amdgpu_device *adev, int xcc_id); int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev); void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev); +int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev, + struct amdgpu_ring *ring, + struct amdgpu_fence *guilty_fence, + struct amdgpu_usermode_queue *uq, + unsigned int *hung_queue_count, + void *faulty_queue_input); void 
amdgpu_gfx_ras_error_func(struct amdgpu_device *adev, void *ras_error_status, void (*func)(struct amdgpu_device *adev, void *ras_error_status, @@ -667,6 +682,11 @@ void amdgpu_debugfs_compute_sched_mask_init(struct amdgpu_device *adev); int amdgpu_gfx_ring_preempt_ib(struct amdgpu_ring *ring); +int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring, + unsigned int vmid, + struct amdgpu_fence *timedout_fence, + bool use_mmio); + static inline const char *amdgpu_gfx_compute_mode_desc(int mode) { switch (mode) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index 5d6149ba7ab7..4000b2c6fc98 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -1763,10 +1763,15 @@ int amdgpu_gmc_init_mem_ranges(struct amdgpu_device *adev) valid = true; else valid = amdgpu_gmc_validate_partition_info(adev); - if (!valid) { - /* TODO: handle invalid case */ + if (!valid) dev_warn(adev->dev, "Mem ranges not matching with hardware config\n"); + + if (!adev->gmc.num_mem_partitions) { + dev_err(adev->dev, "num_mem_partitions is zero\n"); + kfree(adev->gmc.mem_partitions); + adev->gmc.mem_partitions = NULL; + return -EINVAL; } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index ddb0d500e0fa..3ca187f5ade8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -286,7 +286,6 @@ struct amdgpu_gmc { struct amdgpu_irq_src vm_fault; uint32_t vram_type; uint8_t vram_vendor; - uint32_t srbm_soft_reset; bool prt_warning; uint32_t sdpif_register; /* apertures */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.c index 6aa54156bbc9..33a04113ed74 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.c @@ -369,43 +369,152 @@ int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, } /** - * amdgpu_device_ip_is_hw - is 
the hardware IP enabled + * amdgpu_device_ip_is_valid - is the hardware IP valid * * @adev: amdgpu_device pointer * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) * - * Check if the hardware IP is enable or not. - * Returns true if it the IP is enable, false if not. + * Check if the hardware IP is valid or not. + * Returns true if it the IP is valid, false if not. */ -bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev, - enum amd_ip_block_type block_type) +bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, + enum amd_ip_block_type block_type) { struct amdgpu_ip_block *ip_block; ip_block = amdgpu_device_ip_get_ip_block(adev, block_type); if (ip_block) - return ip_block->status.hw; + return ip_block->status.valid; return false; } /** - * amdgpu_device_ip_is_valid - is the hardware IP valid + * amdgpu_ip_from_ring() - Find IP block type corresponding to ring type. * - * @adev: amdgpu_device pointer - * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) + * @ring_type: The ring type whose IP block you are looking for. + */ +static enum amd_ip_block_type amdgpu_ip_from_ring(const enum amdgpu_ring_type ring_type) +{ + switch (ring_type) { + case AMDGPU_RING_TYPE_GFX: + case AMDGPU_RING_TYPE_COMPUTE: + return AMD_IP_BLOCK_TYPE_GFX; + + case AMDGPU_RING_TYPE_SDMA: + return AMD_IP_BLOCK_TYPE_SDMA; + + case AMDGPU_RING_TYPE_UVD: + case AMDGPU_RING_TYPE_UVD_ENC: + return AMD_IP_BLOCK_TYPE_UVD; + + case AMDGPU_RING_TYPE_VCE: + return AMD_IP_BLOCK_TYPE_VCE; + + case AMDGPU_RING_TYPE_VCN_DEC: + case AMDGPU_RING_TYPE_VCN_ENC: + return AMD_IP_BLOCK_TYPE_VCN; + + case AMDGPU_RING_TYPE_VCN_JPEG: + return AMD_IP_BLOCK_TYPE_JPEG; + + case AMDGPU_RING_TYPE_VPE: + return AMD_IP_BLOCK_TYPE_VPE; + + default: + return AMD_IP_BLOCK_TYPE_NUM; + } +} + +/** + * amdgpu_ring_mask_from_ip() - Find mask of ring types corresponding to an IP block type. * - * Check if the hardware IP is valid or not. - * Returns true if it the IP is valid, false if not. 
+ * @ip_type: The IP block type whose rings you are looking for. */ -bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, - enum amd_ip_block_type block_type) +static u32 amdgpu_ring_mask_from_ip(const enum amd_ip_block_type ip_type) +{ + switch (ip_type) { + case AMD_IP_BLOCK_TYPE_GFX: + return BIT(AMDGPU_RING_TYPE_GFX) | BIT(AMDGPU_RING_TYPE_COMPUTE); + + case AMD_IP_BLOCK_TYPE_SDMA: + return BIT(AMDGPU_RING_TYPE_SDMA); + + case AMD_IP_BLOCK_TYPE_UVD: + return BIT(AMDGPU_RING_TYPE_UVD) | BIT(AMDGPU_RING_TYPE_UVD_ENC); + + case AMD_IP_BLOCK_TYPE_VCE: + return BIT(AMD_IP_BLOCK_TYPE_VCE); + + case AMD_IP_BLOCK_TYPE_VCN: + return BIT(AMDGPU_RING_TYPE_VCN_DEC) | BIT(AMDGPU_RING_TYPE_VCN_ENC); + + case AMD_IP_BLOCK_TYPE_JPEG: + return BIT(AMDGPU_RING_TYPE_VCN_JPEG); + + case AMD_IP_BLOCK_TYPE_VPE: + return BIT(AMDGPU_RING_TYPE_VPE); + + default: + return 0; + } +} + +/** + * amdgpu_device_ip_soft_reset() - Perform a graceful soft reset on an IP block. + * + * @guilty_ring: The ring which is guilty of causing a reset. + * @guilty_fence: The fence which didn't signal. + * + * IP block soft reset is used when attempting to recover + * from a GPU hang in a situation where a more fine grained + * reset type isn't available or didn't work. This effectively + * resets all rings that belong to the same device IP block + * and re-initializes the device IP block. + * + * The reset is handled gracefully, meaning that we try to + * minimize collateral damage (ie. avoid rejecting non-guilty jobs) + * as well as back up and restore the contents of all rings + * so that the system can move on from the hang. 
+ */ +int amdgpu_device_ip_soft_reset(struct amdgpu_ring *guilty_ring, + struct amdgpu_fence *guilty_fence) { + struct amdgpu_device *adev = guilty_ring->adev; struct amdgpu_ip_block *ip_block; + enum amd_ip_block_type ip_type; + u32 ring_type_mask; + int r; - ip_block = amdgpu_device_ip_get_ip_block(adev, block_type); - if (ip_block) - return ip_block->status.valid; + ip_type = amdgpu_ip_from_ring(guilty_ring->funcs->type); + ip_block = amdgpu_device_ip_get_ip_block(adev, ip_type); - return false; + if (!ip_block || !ip_block->version->funcs->soft_reset) { + dev_warn(adev->dev, "IP block soft reset not supported on %s\n", + ip_block->version->funcs->name); + return -EOPNOTSUPP; + } + + dev_err(adev->dev, "Starting %s IP block soft reset\n", + ip_block->version->funcs->name); + + ring_type_mask = amdgpu_ring_mask_from_ip(ip_type); + + amdgpu_device_lock_reset_domain(adev->reset_domain); + amdgpu_multi_ring_reset_helper_begin(ring_type_mask, guilty_ring, guilty_fence); + + r = ip_block->version->funcs->soft_reset(ip_block); + + r = amdgpu_multi_ring_reset_helper_end(ring_type_mask, guilty_ring, r); + amdgpu_device_unlock_reset_domain(adev->reset_domain); + + if (r) { + dev_err(adev->dev, "Failed %s IP block soft reset: %d\n", + ip_block->version->funcs->name, r); + return r; + } + + dev_err(adev->dev, "Successful %s IP block soft reset\n", + ip_block->version->funcs->name); + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.h index 1d0df6d93957..70fc4e5db51f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ip.h @@ -68,6 +68,7 @@ enum amd_hw_ip_block_type { ISP_HWIP, ATU_HWIP, AIGC_HWIP, + UMSCH_HWIP, MAX_HWIP }; @@ -84,6 +85,9 @@ enum amd_hw_ip_block_type { #define IP_VERSION_SUBREV(ver) ((ver) & 0xF) #define IP_VERSION_MAJ_MIN_REV(ver) ((ver) >> 8) +struct amdgpu_ring; +struct amdgpu_fence; + struct amdgpu_ip_map_info { /* Map of logical to actual dev instances/mask */ 
uint32_t dev_inst[MAX_HWIP][HWIP_MAX_INSTANCE]; @@ -146,9 +150,9 @@ void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, u64 *flags); int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, enum amd_ip_block_type block_type); -bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev, - enum amd_ip_block_type block_type); bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, enum amd_ip_block_type block_type); +int amdgpu_device_ip_soft_reset(struct amdgpu_ring *guilty_ring, + struct amdgpu_fence *guilty_fence); #endif /* __AMDGPU_IP_H__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 9ecc6387c1eb..cff73f1b5a72 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -112,7 +112,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) amdgpu_job_core_dump(adev, job); if (amdgpu_gpu_recovery && - amdgpu_ring_is_reset_type_supported(ring, AMDGPU_RESET_TYPE_SOFT_RESET) && + amdgpu_ring_is_reset_type_supported(ring, AMDGPU_RESET_TYPE_SOFT_RECOVERY) && amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) { dev_err(adev->dev, "ring %s timeout, but soft recovered\n", s_job->sched->name); @@ -151,6 +151,17 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name); } + /* Attempt an IP block soft reset, if supported. 
*/ + if (amdgpu_gpu_recovery && + amdgpu_ring_is_reset_type_supported(ring, AMDGPU_RESET_TYPE_IP_BLOCK_SOFT_RESET)) { + r = amdgpu_device_ip_soft_reset(ring, job->hw_fence); + if (!r) { + atomic_inc(&ring->adev->gpu_reset_counter); + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, info); + goto exit; + } + } + if (dma_fence_get_status(&s_job->s_fence->finished) == 0) dma_fence_set_error(&s_job->s_fence->finished, -ETIME); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c index 63ee6ba6a931..57935c321515 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c @@ -134,8 +134,8 @@ void amdgpu_jpeg_ring_begin_use(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; - atomic_inc(&adev->jpeg.total_submission_cnt); - cancel_delayed_work_sync(&adev->jpeg.idle_work); + if (!atomic_fetch_inc(&adev->jpeg.total_submission_cnt)) + cancel_delayed_work_sync(&adev->jpeg.idle_work); mutex_lock(&adev->jpeg.jpeg_pg_lock); amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_JPEG, @@ -145,8 +145,9 @@ void amdgpu_jpeg_ring_begin_use(struct amdgpu_ring *ring) void amdgpu_jpeg_ring_end_use(struct amdgpu_ring *ring) { - atomic_dec(&ring->adev->jpeg.total_submission_cnt); - schedule_delayed_work(&ring->adev->jpeg.idle_work, JPEG_IDLE_TIMEOUT); + if (atomic_dec_and_test(&ring->adev->jpeg.total_submission_cnt)) + schedule_delayed_work(&ring->adev->jpeg.idle_work, + JPEG_IDLE_TIMEOUT); } int amdgpu_jpeg_dec_ring_test_ring(struct amdgpu_ring *ring) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h index 346ae0ab09d3..fe95d9188713 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h @@ -149,6 +149,9 @@ struct amdgpu_jpeg { u32 *ip_dump; u32 reg_count; const struct amdgpu_hwip_reg_entry *reg_list; + + bool disable_uq; + bool disable_kq; }; int amdgpu_jpeg_sw_init(struct 
amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 71272f40feef..215aa678d1d0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1424,6 +1424,33 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) } /** + * amdgpu_proc_options_ioctl - set per-fd user options + * + * @dev: drm dev pointer + * @data: pointer to struct drm_amdgpu_proc_options + * @filp: drm file + * + * Sets options stored on the per-file amdgpu_fpriv. Currently the only + * supported option is %AMDGPU_PROC_OPTIONS_OP_KFD_SIGBUS_DELAY which + * controls how KFD delivers SIGBUS for poison/RAS events to the calling + * process (immediate, suppressed, or delayed by N milliseconds). + */ +int amdgpu_proc_options_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp) +{ + struct drm_amdgpu_proc_options *args = data; + + switch (args->op) { + case AMDGPU_PROC_OPTIONS_OP_KFD_SIGBUS_DELAY: + return amdgpu_amdkfd_set_sigbus_delay(current, + args->kfd_sigbus_delay.value); + default: + DRM_DEBUG_KMS("Invalid user option op %u\n", args->op); + return -EINVAL; + } +} + +/** * amdgpu_driver_open_kms - drm callback for open * * @dev: drm dev pointer @@ -1504,8 +1531,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) if (r) goto error_vm; - mutex_init(&fpriv->bo_list_lock); - idr_init_base(&fpriv->bo_list_handles, 1); + xa_init_flags(&fpriv->bo_list_handles, XA_FLAGS_ALLOC1); r = amdgpu_userq_mgr_init(&fpriv->userq_mgr, file_priv, adev); if (r) @@ -1550,8 +1576,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev, struct amdgpu_fpriv *fpriv = file_priv->driver_priv; struct amdgpu_bo_list *list; struct amdgpu_bo *pd; + unsigned long handle; u32 pasid; - int handle; if (!fpriv) return; @@ -1587,11 +1613,9 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev, amdgpu_pasid_free_delayed(pd->tbo.base.resv, 
pasid); amdgpu_bo_unref(&pd); - idr_for_each_entry(&fpriv->bo_list_handles, list, handle) + xa_for_each(&fpriv->bo_list_handles, handle, list) amdgpu_bo_list_put(list); - - idr_destroy(&fpriv->bo_list_handles); - mutex_destroy(&fpriv->bo_list_lock); + xa_destroy(&fpriv->bo_list_handles); kfree(fpriv); file_priv->driver_priv = NULL; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index cc6d1a4e4c3a..9a7f7d2b2767 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -27,16 +27,6 @@ #include "umc/umc_6_7_0_offset.h" #include "umc/umc_6_7_0_sh_mask.h" -static bool amdgpu_mca_is_deferred_error(struct amdgpu_device *adev, - uint64_t mc_status) -{ - if (adev->umc.ras->check_ecc_err_status) - return adev->umc.ras->check_ecc_err_status(adev, - AMDGPU_MCA_ERROR_TYPE_DE, &mc_status); - - return false; -} - void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev, uint64_t mc_status_addr, unsigned long *error_count) @@ -155,479 +145,3 @@ int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev) return 0; } - -static void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set) -{ - if (!mca_set) - return; - - memset(mca_set, 0, sizeof(*mca_set)); - INIT_LIST_HEAD(&mca_set->list); -} - -static int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry) -{ - struct mca_bank_node *node; - - if (!entry) - return -EINVAL; - - node = kvzalloc_obj(*node); - if (!node) - return -ENOMEM; - - memcpy(&node->entry, entry, sizeof(*entry)); - - INIT_LIST_HEAD(&node->node); - list_add_tail(&node->node, &mca_set->list); - - mca_set->nr_entries++; - - return 0; -} - -static int amdgpu_mca_bank_set_merge(struct mca_bank_set *mca_set, struct mca_bank_set *new) -{ - struct mca_bank_node *node; - - list_for_each_entry(node, &new->list, node) - amdgpu_mca_bank_set_add_entry(mca_set, &node->entry); - - return 0; -} - -static void 
amdgpu_mca_bank_set_remove_node(struct mca_bank_set *mca_set, struct mca_bank_node *node) -{ - if (!node) - return; - - list_del(&node->node); - kvfree(node); - - mca_set->nr_entries--; -} - -static void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set) -{ - struct mca_bank_node *node, *tmp; - - if (list_empty(&mca_set->list)) - return; - - list_for_each_entry_safe(node, tmp, &mca_set->list, node) - amdgpu_mca_bank_set_remove_node(mca_set, node); -} - -void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs) -{ - struct amdgpu_mca *mca = &adev->mca; - - mca->mca_funcs = mca_funcs; -} - -int amdgpu_mca_init(struct amdgpu_device *adev) -{ - struct amdgpu_mca *mca = &adev->mca; - struct mca_bank_cache *mca_cache; - int i; - - atomic_set(&mca->ue_update_flag, 0); - - for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) { - mca_cache = &mca->mca_caches[i]; - mutex_init(&mca_cache->lock); - amdgpu_mca_bank_set_init(&mca_cache->mca_set); - } - - return 0; -} - -void amdgpu_mca_fini(struct amdgpu_device *adev) -{ - struct amdgpu_mca *mca = &adev->mca; - struct mca_bank_cache *mca_cache; - int i; - - atomic_set(&mca->ue_update_flag, 0); - - for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) { - mca_cache = &mca->mca_caches[i]; - amdgpu_mca_bank_set_release(&mca_cache->mca_set); - mutex_destroy(&mca_cache->lock); - } -} - -int amdgpu_mca_reset(struct amdgpu_device *adev) -{ - amdgpu_mca_fini(adev); - - return amdgpu_mca_init(adev); -} - -int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) -{ - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - - if (mca_funcs && mca_funcs->mca_set_debug_mode) - return mca_funcs->mca_set_debug_mode(adev, enable); - - return -EOPNOTSUPP; -} - -static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry, - struct ras_query_context *qctx) -{ - u64 event_id = qctx ? 
qctx->evid.event_id : RAS_EVENT_INVALID_ID; - - RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n"); - RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_STATUS]); - RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_ADDR]); - RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_MISC0]); - RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].IPID=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_IPID]); - RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_SYND]); -} - -static int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count) -{ - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - - if (!count) - return -EINVAL; - - if (mca_funcs && mca_funcs->mca_get_valid_mca_count) - return mca_funcs->mca_get_valid_mca_count(adev, type, count); - - return -EOPNOTSUPP; -} - -static int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, - int idx, struct mca_bank_entry *entry) -{ - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - int count; - - if (!mca_funcs || !mca_funcs->mca_get_mca_entry) - return -EOPNOTSUPP; - - switch (type) { - case AMDGPU_MCA_ERROR_TYPE_UE: - count = mca_funcs->max_ue_count; - break; - case AMDGPU_MCA_ERROR_TYPE_CE: - count = mca_funcs->max_ce_count; - break; - default: - return -EINVAL; - } - - if (idx >= count) - return -EINVAL; - - return mca_funcs->mca_get_mca_entry(adev, type, idx, entry); -} - -static bool amdgpu_mca_bank_should_update(struct amdgpu_device *adev, enum amdgpu_mca_error_type type) -{ - struct amdgpu_mca *mca = &adev->mca; - bool ret = true; - - /* - * Because the UE Valid MCA count will only be cleared after reset, - * in order to avoid repeated 
counting of the error count, - * the aca bank is only updated once during the gpu recovery stage. - */ - if (type == AMDGPU_MCA_ERROR_TYPE_UE) { - if (amdgpu_ras_intr_triggered()) - ret = atomic_cmpxchg(&mca->ue_update_flag, 0, 1) == 0; - else - atomic_set(&mca->ue_update_flag, 0); - } - - return ret; -} - -static bool amdgpu_mca_bank_should_dump(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, - struct mca_bank_entry *entry) -{ - bool ret; - - switch (type) { - case AMDGPU_MCA_ERROR_TYPE_CE: - ret = amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS]); - break; - case AMDGPU_MCA_ERROR_TYPE_UE: - default: - ret = true; - break; - } - - return ret; -} - -static int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set, - struct ras_query_context *qctx) -{ - struct mca_bank_entry entry; - uint32_t count = 0, i; - int ret; - - if (!mca_set) - return -EINVAL; - - if (!amdgpu_mca_bank_should_update(adev, type)) - return 0; - - ret = amdgpu_mca_smu_get_valid_mca_count(adev, type, &count); - if (ret) - return ret; - - for (i = 0; i < count; i++) { - memset(&entry, 0, sizeof(entry)); - ret = amdgpu_mca_smu_get_mca_entry(adev, type, i, &entry); - if (ret) - return ret; - - amdgpu_mca_bank_set_add_entry(mca_set, &entry); - - if (amdgpu_mca_bank_should_dump(adev, type, &entry)) - amdgpu_mca_smu_mca_bank_dump(adev, i, &entry, qctx); - } - - return 0; -} - -static int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count) -{ - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - - if (!count || !entry) - return -EINVAL; - - if (!mca_funcs || !mca_funcs->mca_parse_mca_error_count) - return -EOPNOTSUPP; - - return mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, count); -} - -static int amdgpu_mca_dispatch_mca_set(struct amdgpu_device *adev, enum 
amdgpu_ras_block blk, enum amdgpu_mca_error_type type, - struct mca_bank_set *mca_set, struct ras_err_data *err_data) -{ - struct amdgpu_smuio_mcm_config_info mcm_info; - struct mca_bank_node *node, *tmp; - struct mca_bank_entry *entry; - uint32_t count; - int ret; - - if (!mca_set) - return -EINVAL; - - if (!mca_set->nr_entries) - return 0; - - list_for_each_entry_safe(node, tmp, &mca_set->list, node) { - entry = &node->entry; - - count = 0; - ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count); - if (ret && ret != -EOPNOTSUPP) - return ret; - - if (!count) - continue; - - memset(&mcm_info, 0, sizeof(mcm_info)); - - mcm_info.socket_id = entry->info.socket_id; - mcm_info.die_id = entry->info.aid; - - if (type == AMDGPU_MCA_ERROR_TYPE_UE) { - amdgpu_ras_error_statistic_ue_count(err_data, - &mcm_info, (uint64_t)count); - } else { - if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS])) - amdgpu_ras_error_statistic_de_count(err_data, - &mcm_info, (uint64_t)count); - else - amdgpu_ras_error_statistic_ce_count(err_data, - &mcm_info, (uint64_t)count); - } - - amdgpu_mca_bank_set_remove_node(mca_set, node); - } - - return 0; -} - -static int amdgpu_mca_add_mca_set_to_cache(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *new) -{ - struct mca_bank_cache *mca_cache = &adev->mca.mca_caches[type]; - int ret; - - mutex_lock(&mca_cache->lock); - ret = amdgpu_mca_bank_set_merge(&mca_cache->mca_set, new); - mutex_unlock(&mca_cache->lock); - - return ret; -} - -int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, - struct ras_err_data *err_data, struct ras_query_context *qctx) -{ - struct mca_bank_set mca_set; - struct mca_bank_cache *mca_cache = &adev->mca.mca_caches[type]; - int ret; - - amdgpu_mca_bank_set_init(&mca_set); - - ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, qctx); - if (ret) - goto out_mca_release; - - ret = 
amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_set, err_data); - if (ret) - goto out_mca_release; - - /* add remain mca bank to mca cache */ - if (mca_set.nr_entries) { - ret = amdgpu_mca_add_mca_set_to_cache(adev, type, &mca_set); - if (ret) - goto out_mca_release; - } - - /* dispatch mca set again if mca cache has valid data */ - mutex_lock(&mca_cache->lock); - if (mca_cache->mca_set.nr_entries) - ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_cache->mca_set, err_data); - mutex_unlock(&mca_cache->lock); - -out_mca_release: - amdgpu_mca_bank_set_release(&mca_set); - - return ret; -} - -#if defined(CONFIG_DEBUG_FS) -static int amdgpu_mca_smu_debug_mode_set(void *data, u64 val) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)data; - int ret; - - ret = amdgpu_ras_set_mca_debug_mode(adev, val ? true : false); - if (ret) - return ret; - - dev_info(adev->dev, "amdgpu set smu mca debug mode %s success\n", val ? "on" : "off"); - - return 0; -} - -static void mca_dump_entry(struct seq_file *m, struct mca_bank_entry *entry) -{ - int i, idx = entry->idx; - int reg_idx_array[] = { - MCA_REG_IDX_STATUS, - MCA_REG_IDX_ADDR, - MCA_REG_IDX_MISC0, - MCA_REG_IDX_IPID, - MCA_REG_IDX_SYND, - }; - - seq_printf(m, "mca entry[%d].type: %s\n", idx, entry->type == AMDGPU_MCA_ERROR_TYPE_UE ? 
"UE" : "CE"); - seq_printf(m, "mca entry[%d].ip: %d\n", idx, entry->ip); - seq_printf(m, "mca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n", - idx, entry->info.socket_id, entry->info.aid, entry->info.hwid, entry->info.mcatype); - - for (i = 0; i < ARRAY_SIZE(reg_idx_array); i++) - seq_printf(m, "mca entry[%d].regs[%d]: 0x%016llx\n", idx, reg_idx_array[i], entry->regs[reg_idx_array[i]]); -} - -static int mca_dump_show(struct seq_file *m, enum amdgpu_mca_error_type type) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)m->private; - struct mca_bank_node *node; - struct mca_bank_set mca_set; - struct ras_query_context qctx; - int ret; - - amdgpu_mca_bank_set_init(&mca_set); - - qctx.evid.event_id = RAS_EVENT_INVALID_ID; - ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, &qctx); - if (ret) - goto err_free_mca_set; - - seq_printf(m, "amdgpu smu %s valid mca count: %d\n", - type == AMDGPU_MCA_ERROR_TYPE_UE ? "UE" : "CE", mca_set.nr_entries); - - if (!mca_set.nr_entries) - goto err_free_mca_set; - - list_for_each_entry(node, &mca_set.list, node) - mca_dump_entry(m, &node->entry); - - /* add mca bank to mca bank cache */ - ret = amdgpu_mca_add_mca_set_to_cache(adev, type, &mca_set); - -err_free_mca_set: - amdgpu_mca_bank_set_release(&mca_set); - - return ret; -} - -static int mca_dump_ce_show(struct seq_file *m, void *unused) -{ - return mca_dump_show(m, AMDGPU_MCA_ERROR_TYPE_CE); -} - -static int mca_dump_ce_open(struct inode *inode, struct file *file) -{ - return single_open(file, mca_dump_ce_show, inode->i_private); -} - -static const struct file_operations mca_ce_dump_debug_fops = { - .owner = THIS_MODULE, - .open = mca_dump_ce_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int mca_dump_ue_show(struct seq_file *m, void *unused) -{ - return mca_dump_show(m, AMDGPU_MCA_ERROR_TYPE_UE); -} - -static int mca_dump_ue_open(struct inode *inode, struct file *file) -{ - return single_open(file, 
mca_dump_ue_show, inode->i_private); -} - -static const struct file_operations mca_ue_dump_debug_fops = { - .owner = THIS_MODULE, - .open = mca_dump_ue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -DEFINE_DEBUGFS_ATTRIBUTE(mca_debug_mode_fops, NULL, amdgpu_mca_smu_debug_mode_set, "%llu\n"); -#endif - -void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root) -{ -#if defined(CONFIG_DEBUG_FS) - if (!root) - return; - - debugfs_create_file("mca_debug_mode", 0200, root, adev, &mca_debug_mode_fops); - debugfs_create_file("mca_ue_dump", 0400, root, adev, &mca_ue_dump_debug_fops); - debugfs_create_file("mca_ce_dump", 0400, root, adev, &mca_ce_dump_debug_fops); -#endif -} - diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h index e80323ff90c1..6d12f8a516d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h @@ -23,45 +23,6 @@ #include "amdgpu_ras.h" -#define MCA_MAX_REGS_COUNT (16) - -#define MCA_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> l) -#define MCA_REG__STATUS__VAL(x) MCA_REG_FIELD(x, 63, 63) -#define MCA_REG__STATUS__OVERFLOW(x) MCA_REG_FIELD(x, 62, 62) -#define MCA_REG__STATUS__UC(x) MCA_REG_FIELD(x, 61, 61) -#define MCA_REG__STATUS__EN(x) MCA_REG_FIELD(x, 60, 60) -#define MCA_REG__STATUS__MISCV(x) MCA_REG_FIELD(x, 59, 59) -#define MCA_REG__STATUS__ADDRV(x) MCA_REG_FIELD(x, 58, 58) -#define MCA_REG__STATUS__PCC(x) MCA_REG_FIELD(x, 57, 57) -#define MCA_REG__STATUS__ERRCOREIDVAL(x) MCA_REG_FIELD(x, 56, 56) -#define MCA_REG__STATUS__TCC(x) MCA_REG_FIELD(x, 55, 55) -#define MCA_REG__STATUS__SYNDV(x) MCA_REG_FIELD(x, 53, 53) -#define MCA_REG__STATUS__CECC(x) MCA_REG_FIELD(x, 46, 46) -#define MCA_REG__STATUS__UECC(x) MCA_REG_FIELD(x, 45, 45) -#define MCA_REG__STATUS__DEFERRED(x) MCA_REG_FIELD(x, 44, 44) -#define MCA_REG__STATUS__POISON(x) MCA_REG_FIELD(x, 43, 43) -#define MCA_REG__STATUS__SCRUB(x) MCA_REG_FIELD(x, 
40, 40) -#define MCA_REG__STATUS__ERRCOREID(x) MCA_REG_FIELD(x, 37, 32) -#define MCA_REG__STATUS__ADDRLSB(x) MCA_REG_FIELD(x, 29, 24) -#define MCA_REG__STATUS__ERRORCODEEXT(x) MCA_REG_FIELD(x, 21, 16) -#define MCA_REG__STATUS__ERRORCODE(x) MCA_REG_FIELD(x, 15, 0) - -#define MCA_REG__MISC0__ERRCNT(x) MCA_REG_FIELD(x, 43, 32) - -#define MCA_REG__SYND__ERRORINFORMATION(x) MCA_REG_FIELD(x, 17, 0) - -enum amdgpu_mca_ip { - AMDGPU_MCA_IP_UNKNOW = -1, - AMDGPU_MCA_IP_PSP = 0, - AMDGPU_MCA_IP_SDMA, - AMDGPU_MCA_IP_GC, - AMDGPU_MCA_IP_SMU, - AMDGPU_MCA_IP_MP5, - AMDGPU_MCA_IP_UMC, - AMDGPU_MCA_IP_PCS_XGMI, - AMDGPU_MCA_IP_COUNT, -}; - enum amdgpu_mca_error_type { AMDGPU_MCA_ERROR_TYPE_UE = 0, AMDGPU_MCA_ERROR_TYPE_CE, @@ -77,77 +38,20 @@ struct amdgpu_mca_ras { struct amdgpu_mca_ras_block *ras; }; -struct mca_bank_set { - int nr_entries; - struct list_head list; -}; - -struct mca_bank_cache { - struct mca_bank_set mca_set; - struct mutex lock; -}; - struct amdgpu_mca { struct amdgpu_mca_ras mp0; struct amdgpu_mca_ras mp1; struct amdgpu_mca_ras mpio; - const struct amdgpu_mca_smu_funcs *mca_funcs; - struct mca_bank_cache mca_caches[AMDGPU_MCA_ERROR_TYPE_DE]; - atomic_t ue_update_flag; -}; - -enum mca_reg_idx { - MCA_REG_IDX_STATUS = 1, - MCA_REG_IDX_ADDR = 2, - MCA_REG_IDX_MISC0 = 3, - MCA_REG_IDX_IPID = 5, - MCA_REG_IDX_SYND = 6, - MCA_REG_IDX_COUNT = 16, -}; - -struct mca_bank_info { - int socket_id; - int aid; - int hwid; - int mcatype; -}; - -struct mca_bank_entry { - int idx; - enum amdgpu_mca_error_type type; - enum amdgpu_mca_ip ip; - struct mca_bank_info info; - uint64_t regs[MCA_MAX_REGS_COUNT]; -}; - -struct mca_bank_node { - struct mca_bank_entry entry; - struct list_head node; -}; - -struct amdgpu_mca_smu_funcs { - int max_ue_count; - int max_ce_count; - int (*mca_set_debug_mode)(struct amdgpu_device *adev, bool enable); - int (*mca_parse_mca_error_count)(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, - struct 
mca_bank_entry *entry, uint32_t *count); - int (*mca_get_valid_mca_count)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, - uint32_t *count); - int (*mca_get_mca_entry)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, - int idx, struct mca_bank_entry *entry); }; void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev, uint64_t mc_status_addr, unsigned long *error_count); - void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev, uint64_t mc_status_addr, unsigned long *error_count); - void amdgpu_mca_reset_error_count(struct amdgpu_device *adev, uint64_t mc_status_addr); - void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev, uint64_t mc_status_addr, void *ras_error_status); @@ -155,15 +59,4 @@ int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev); int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev); int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev); -void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs); -int amdgpu_mca_init(struct amdgpu_device *adev); -void amdgpu_mca_fini(struct amdgpu_device *adev); -int amdgpu_mca_reset(struct amdgpu_device *adev); -int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable); -int amdgpu_mca_smu_get_mca_set_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - enum amdgpu_mca_error_type type, uint32_t *total); -void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root); -int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, - struct ras_err_data *err_data, struct ras_query_context *qctx); - #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index e3972673fd64..6c0dde3786e3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -217,7 +217,7 @@ int amdgpu_mes_init(struct 
amdgpu_device *adev) if (r) goto error_doorbell; - if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 1, 0)) { + if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { /* When queue/pipe reset is done in MES instead of in the * driver, MES passes hung queues information to the driver in * hung_queue_hqd_info. Calculate required space to store this @@ -252,6 +252,10 @@ int amdgpu_mes_init(struct amdgpu_device *adev) } } + adev->gfx.mec.mes_hung_db_array = + kcalloc(amdgpu_mes_get_hung_queue_db_array_size(adev), + sizeof(u32), GFP_KERNEL); + return 0; error_doorbell: @@ -279,6 +283,8 @@ void amdgpu_mes_fini(struct amdgpu_device *adev) int i; int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1; + kfree(adev->gfx.mec.mes_hung_db_array); + amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj, &adev->mes.event_log_gpu_addr, &adev->mes.event_log_cpu_addr); @@ -439,6 +445,59 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev, return r; } +int amdgpu_mes_reset_queue_mmio(struct amdgpu_device *adev, + int queue_type, + unsigned int vmid, + unsigned int me, + unsigned int pipe, + unsigned int queue, + uint32_t xcc_id) +{ + struct mes_reset_queue_input queue_input; + int r; + + memset(&queue_input, 0, sizeof(queue_input)); + + queue_input.xcc_id = xcc_id; + queue_input.me_id = me; + queue_input.pipe_id = pipe; + queue_input.queue_id = queue; + queue_input.vmid = vmid; + queue_input.queue_type = queue_type; + queue_input.use_mmio = true; + + amdgpu_mes_lock(&adev->mes); + r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input); + amdgpu_mes_unlock(&adev->mes); + if (r) + dev_err(adev->dev, "failed to reset legacy queue\n"); + + return r; +} + +int amdgpu_mes_reset_user_queue(struct amdgpu_device *adev, + int queue_type, + unsigned int doorbell_index, + unsigned int xcc_id) +{ + struct mes_reset_queue_input queue_input; + int r; + + memset(&queue_input, 0, sizeof(queue_input)); + + queue_input.xcc_id = xcc_id; + 
queue_input.queue_type = queue_type; + queue_input.doorbell_offset = doorbell_index; + + amdgpu_mes_lock(&adev->mes); + r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input); + amdgpu_mes_unlock(&adev->mes); + if (r) + dev_err(adev->dev, "failed to reset user queue\n"); + + return r; +} + int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev) { return adev->mes.hung_queue_db_array_size; @@ -805,8 +864,13 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev) bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev) { - return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) && - (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73); + u32 ip_maj = IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)); + u32 ip_min = IP_VERSION_MIN(amdgpu_ip_version(adev, GC_HWIP, 0)); + u32 mes_sched = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK; + + return (ip_maj == 11 && mes_sched >= 0x8c) || + ((ip_maj == 12 && ip_min == 0) && mes_sched >= 0x8d) || + ((ip_maj == 12 && ip_min == 1) && mes_sched >= 0x73); } /* Fix me -- node_id is used to identify the correct MES instances in the future */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 1aae49f4df49..f25cffad8efe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -168,6 +168,9 @@ struct amdgpu_mes { int master_xcc_ids[AMDGPU_MAX_MES_INST_PIPES]; struct amdgpu_bo *shared_cmd_buf_obj[AMDGPU_MAX_MES_INST_PIPES]; uint64_t shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES]; + + bool compute_pipe_reset_enabled; + bool gfx_pipe_reset_enabled; }; struct amdgpu_mes_hung_queue_hqd_info { @@ -271,6 +274,7 @@ struct mes_remove_queue_input { uint32_t xcc_id; uint32_t doorbell_offset; uint64_t gang_context_addr; + uint32_t queue_type; bool remove_queue_after_reset; }; @@ -461,6 +465,17 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev, unsigned 
int vmid, bool use_mmio, uint32_t xcc_id); +int amdgpu_mes_reset_queue_mmio(struct amdgpu_device *adev, + int queue_type, + unsigned int vmid, + unsigned int me, + unsigned int pipe, + unsigned int queue, + uint32_t xcc_id); +int amdgpu_mes_reset_user_queue(struct amdgpu_device *adev, + int queue_type, + unsigned int doorbell_index, + unsigned int xcc_id); int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev); int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h index 6b8214650e5d..c5120ba51e24 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h @@ -21,29 +21,6 @@ #ifndef __AMDGPU_MMHUB_H__ #define __AMDGPU_MMHUB_H__ -enum amdgpu_mmhub_ras_memory_id { - AMDGPU_MMHUB_WGMI_PAGEMEM = 0, - AMDGPU_MMHUB_RGMI_PAGEMEM = 1, - AMDGPU_MMHUB_WDRAM_PAGEMEM = 2, - AMDGPU_MMHUB_RDRAM_PAGEMEM = 3, - AMDGPU_MMHUB_WIO_CMDMEM = 4, - AMDGPU_MMHUB_RIO_CMDMEM = 5, - AMDGPU_MMHUB_WGMI_CMDMEM = 6, - AMDGPU_MMHUB_RGMI_CMDMEM = 7, - AMDGPU_MMHUB_WDRAM_CMDMEM = 8, - AMDGPU_MMHUB_RDRAM_CMDMEM = 9, - AMDGPU_MMHUB_MAM_DMEM0 = 10, - AMDGPU_MMHUB_MAM_DMEM1 = 11, - AMDGPU_MMHUB_MAM_DMEM2 = 12, - AMDGPU_MMHUB_MAM_DMEM3 = 13, - AMDGPU_MMHUB_WRET_TAGMEM = 19, - AMDGPU_MMHUB_RRET_TAGMEM = 20, - AMDGPU_MMHUB_WIO_DATAMEM = 21, - AMDGPU_MMHUB_WGMI_DATAMEM = 22, - AMDGPU_MMHUB_WDRAM_DATAMEM = 23, - AMDGPU_MMHUB_MEMORY_BLOCK_LAST, -}; - struct amdgpu_mmhub_ras { struct amdgpu_ras_block_object ras_block; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index 4d68732d6223..ff11a0903499 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -312,46 +312,6 @@ uint32_t amdgpu_bo_mem_stats_placement(struct amdgpu_bo *bo); uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev, uint32_t domain); -/* - * sub 
allocation - */ -static inline struct amdgpu_sa_manager * -to_amdgpu_sa_manager(struct drm_suballoc_manager *manager) -{ - return container_of(manager, struct amdgpu_sa_manager, base); -} - -static inline uint64_t amdgpu_sa_bo_gpu_addr(struct drm_suballoc *sa_bo) -{ - return to_amdgpu_sa_manager(sa_bo->manager)->gpu_addr + - drm_suballoc_soffset(sa_bo); -} - -static inline void *amdgpu_sa_bo_cpu_addr(struct drm_suballoc *sa_bo) -{ - return to_amdgpu_sa_manager(sa_bo->manager)->cpu_ptr + - drm_suballoc_soffset(sa_bo); -} - -int amdgpu_sa_bo_manager_init(struct amdgpu_device *adev, - struct amdgpu_sa_manager *sa_manager, - unsigned size, u32 align, u32 domain); -void amdgpu_sa_bo_manager_fini(struct amdgpu_device *adev, - struct amdgpu_sa_manager *sa_manager); -int amdgpu_sa_bo_manager_start(struct amdgpu_device *adev, - struct amdgpu_sa_manager *sa_manager); -int amdgpu_sa_bo_new(struct amdgpu_sa_manager *sa_manager, - struct drm_suballoc **sa_bo, - unsigned int size); -void amdgpu_sa_bo_free(struct drm_suballoc **sa_bo, - struct dma_fence *fence); -#if defined(CONFIG_DEBUG_FS) -void amdgpu_sa_bo_dump_debug_info(struct amdgpu_sa_manager *sa_manager, - struct seq_file *m); -u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m); -#endif -void amdgpu_debugfs_sa_init(struct amdgpu_device *adev); - bool amdgpu_bo_support_uswc(u64 bo_flags); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c index b1dc33301d83..e8592970aaab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c @@ -47,6 +47,17 @@ static ssize_t mem_info_preempt_used_show(struct device *dev, static DEVICE_ATTR_RO(mem_info_preempt_used); /** + * amdgpu_preempt_mgr_sysfs_fini - remove PREEMPT manager sysfs attributes + * + * @adev: amdgpu_device pointer + */ +void amdgpu_preempt_mgr_sysfs_fini(struct amdgpu_device *adev) +{ + if (adev->dev->kobj.sd) + 
device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used); +} + +/** * amdgpu_preempt_mgr_new - allocate a new node * * @man: TTM memory type manager @@ -137,9 +148,6 @@ void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev) if (ret) return; - if (adev->dev->kobj.sd) - device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used); - ttm_resource_manager_cleanup(man); ttm_set_driver_manager(&adev->mman.bdev, AMDGPU_PL_PREEMPT, NULL); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c index 0d3c18f04ac3..8ae72c862d11 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c @@ -166,7 +166,8 @@ static ssize_t ta_if_load_debugfs_write(struct file *fp, const char *buf, size_t if (ret) return -EFAULT; - if (ta_bin_len > PSP_1_MEG) + if (ta_bin_len < sizeof(struct common_firmware_header) || + ta_bin_len > PSP_1_MEG) return -EINVAL; copy_pos += sizeof(uint32_t); @@ -321,6 +322,8 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size ret = copy_from_user((void *)&shared_buf_len, &buf[copy_pos], sizeof(uint32_t)); if (ret) return -EFAULT; + if (!shared_buf_len || shared_buf_len > PSP_1_MEG) + return -EINVAL; copy_pos += sizeof(uint32_t); shared_buf = memdup_user(&buf[copy_pos], shared_buf_len); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 764cd4950408..148bb4cb0a2d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -128,12 +128,6 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) -#define MAX_UMC_POISON_POLLING_TIME_ASYNC 10 - -#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms - -#define MAX_FLUSH_RETIRE_DWORK_TIMES 100 - #define BYPASS_ALLOCATED_ADDRESS 0x0 #define BYPASS_INITIALIZATION_ADDRESS 0x1 @@ -249,16 
+243,12 @@ static int amdgpu_check_address_validity(struct amdgpu_device *adev, (address >= RAS_UMC_INJECT_ADDR_LIMIT)) return -EFAULT; - if (amdgpu_uniras_enabled(adev)) { - if (amdgpu_sriov_vf(adev)) - count = amdgpu_virt_ras_convert_retired_address(adev, address, - page_pfns, ARRAY_SIZE(page_pfns)); - else - count = amdgpu_ras_mgr_lookup_bad_pages_in_a_row(adev, address, - page_pfns, ARRAY_SIZE(page_pfns)); - } else - count = amdgpu_umc_lookup_bad_pages_in_a_row(adev, - address, page_pfns, ARRAY_SIZE(page_pfns)); + if (amdgpu_sriov_vf(adev)) + count = amdgpu_virt_ras_convert_retired_address(adev, address, + page_pfns, ARRAY_SIZE(page_pfns)); + else + count = amdgpu_ras_mgr_lookup_bad_pages_in_a_row(adev, address, + page_pfns, ARRAY_SIZE(page_pfns)); if (count <= 0) return -EPERM; @@ -1381,76 +1371,6 @@ static void amdgpu_ras_mgr_virt_error_data_statistics_update(struct ras_manager obj->err_data.de_count = err_data->de_count; } -static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk) -{ - struct ras_common_if head; - - memset(&head, 0, sizeof(head)); - head.block = blk; - - return amdgpu_ras_find_obj(adev, &head); -} - -int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - const struct aca_info *aca_info, void *data) -{ - struct ras_manager *obj; - - /* in resume phase, no need to create aca fs node */ - if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) - return 0; - - obj = get_ras_manager(adev, blk); - if (!obj) - return -EINVAL; - - return amdgpu_aca_add_handle(adev, &obj->aca_handle, ras_block_str(blk), aca_info, data); -} - -int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk) -{ - struct ras_manager *obj; - - obj = get_ras_manager(adev, blk); - if (!obj) - return -EINVAL; - - amdgpu_aca_remove_handle(&obj->aca_handle); - - return 0; -} - -static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - enum aca_error_type 
type, struct ras_err_data *err_data, - struct ras_query_context *qctx) -{ - struct ras_manager *obj; - - obj = get_ras_manager(adev, blk); - if (!obj) - return -EINVAL; - - return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx); -} - -ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr, - struct aca_handle *handle, char *buf, void *data) -{ - struct ras_manager *obj = container_of(handle, struct ras_manager, aca_handle); - struct ras_query_if info = { - .head = obj->head, - }; - - if (!amdgpu_ras_get_error_query_ready(obj->adev)) - return sysfs_emit(buf, "Query currently inaccessible\n"); - - if (amdgpu_ras_query_error_status(obj->adev, &info)) - return -EINVAL; - - return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count, - "ce", info.ce_count, "de", info.de_count); -} - static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, struct ras_query_if *info, struct ras_err_data *err_data, @@ -1459,7 +1379,6 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, { enum amdgpu_ras_block blk = info ? 
info->head.block : AMDGPU_RAS_BLOCK_COUNT; struct amdgpu_ras_block_object *block_obj = NULL; - int ret; if (blk == AMDGPU_RAS_BLOCK_COUNT) return -EINVAL; @@ -1469,7 +1388,7 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, if (error_query_mode == AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) { return amdgpu_virt_req_ras_err_count(adev, blk, err_data); - } else if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) { + } else { if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { amdgpu_ras_get_ecc_info(adev, err_data); } else { @@ -1490,24 +1409,6 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, block_obj->hw_ops->query_ras_error_status(adev); } } - } else { - if (amdgpu_aca_is_enabled(adev)) { - ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data, qctx); - if (ret) - return ret; - - ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data, qctx); - if (ret) - return ret; - - ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data, qctx); - if (ret) - return ret; - } else { - /* FIXME: add code to check return value later */ - amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx); - amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data, qctx); - } } return 0; @@ -1624,8 +1525,6 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block block) { struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; if (!block_obj || !block_obj->hw_ops) { dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", @@ -1633,17 +1532,14 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, return -EOPNOTSUPP; } - if (!amdgpu_ras_is_supported(adev, block) || - !amdgpu_ras_get_aca_debug_mode(adev)) + if 
(!amdgpu_ras_is_supported(adev, block)) return -EOPNOTSUPP; if (amdgpu_sriov_vf(adev)) return -EOPNOTSUPP; /* skip ras error reset in gpu reset */ - if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) && - ((smu_funcs && smu_funcs->set_debug_mode) || - (mca_funcs && mca_funcs->mca_set_debug_mode))) + if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) return -EOPNOTSUPP; if (block_obj->hw_ops->reset_ras_error_count) @@ -2090,9 +1986,6 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, { struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); - if (amdgpu_aca_is_enabled(adev)) - return 0; - if (!obj || obj->attr_inuse) return -EINVAL; @@ -2130,9 +2023,6 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, { struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); - if (amdgpu_aca_is_enabled(adev)) - return 0; - if (!obj || !obj->attr_inuse) return -EINVAL; @@ -2245,25 +2135,6 @@ static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, obj, &amdgpu_ras_debugfs_ops); } -static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev) -{ - bool ret; - - switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { - case IP_VERSION(13, 0, 6): - case IP_VERSION(13, 0, 12): - case IP_VERSION(13, 0, 14): - case IP_VERSION(13, 0, 15): - ret = true; - break; - default: - ret = false; - break; - } - - return ret; -} - void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -2290,13 +2161,6 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) amdgpu_ras_debugfs_create(adev, &fs_info, dir); } } - - if (amdgpu_ras_aca_is_supported(adev)) { - if (amdgpu_aca_is_enabled(adev)) - amdgpu_aca_smu_debugfs_init(adev, dir); - else - amdgpu_mca_smu_debugfs_init(adev, dir); - } } /* debugfs end */ @@ -2489,14 +2353,6 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj event_id = amdgpu_ras_acquire_event_id(adev, type); RAS_EVENT_LOG(adev, 
event_id, "Poison is created\n"); - if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) { - struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev); - - atomic_inc(&con->page_retirement_req_cnt); - atomic_inc(&con->poison_creation_count); - - wake_up(&con->page_retirement_wq); - } } static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, @@ -3026,77 +2882,6 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, return 0; } -static int amdgpu_ras_mca2pa_by_idx(struct amdgpu_device *adev, - struct eeprom_table_record *bps, - struct ras_err_data *err_data) -{ - struct ta_ras_query_address_input addr_in; - uint32_t socket = 0; - int ret = 0; - - if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) - socket = adev->smuio.funcs->get_socket_id(adev); - - /* reinit err_data */ - err_data->err_addr_cnt = 0; - err_data->err_addr_len = adev->umc.retire_unit; - - memset(&addr_in, 0, sizeof(addr_in)); - addr_in.ma.err_addr = bps->address; - addr_in.ma.socket_id = socket; - addr_in.ma.ch_inst = bps->mem_channel; - if (!amdgpu_ras_smu_eeprom_supported(adev)) { - /* tell RAS TA the node instance is not used */ - addr_in.ma.node_inst = TA_RAS_INV_NODE; - } else { - addr_in.ma.umc_inst = bps->mcumc_id; - addr_in.ma.node_inst = bps->cu; - } - - if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) - ret = adev->umc.ras->convert_ras_err_addr(adev, err_data, - &addr_in, NULL, false); - - return ret; -} - -static int amdgpu_ras_mca2pa(struct amdgpu_device *adev, - struct eeprom_table_record *bps, - struct ras_err_data *err_data) -{ - struct ta_ras_query_address_input addr_in; - uint32_t die_id, socket = 0; - - if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) - socket = adev->smuio.funcs->get_socket_id(adev); - - /* although die id is gotten from PA in nps1 mode, the id is - * fitable for any nps mode - */ - if (adev->umc.ras && adev->umc.ras->get_die_id_from_pa) - die_id = adev->umc.ras->get_die_id_from_pa(adev, 
bps->address, - bps->retired_page << AMDGPU_GPU_PAGE_SHIFT); - else - return -EINVAL; - - /* reinit err_data */ - err_data->err_addr_cnt = 0; - err_data->err_addr_len = adev->umc.retire_unit; - - memset(&addr_in, 0, sizeof(addr_in)); - addr_in.ma.err_addr = bps->address; - addr_in.ma.ch_inst = bps->mem_channel; - addr_in.ma.umc_inst = bps->mcumc_id; - addr_in.ma.node_inst = die_id; - addr_in.ma.socket_id = socket; - - if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) - return adev->umc.ras->convert_ras_err_addr(adev, err_data, - &addr_in, NULL, false); - else - return -EINVAL; -} - static bool __check_record_in_range(struct amdgpu_device *adev, struct eeprom_table_record *bps, int count) { @@ -3157,117 +2942,13 @@ static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev, struct eeprom_table_record *bps, struct ras_err_data *err_data, enum amdgpu_memory_partition nps) { - int i = 0; - uint64_t chan_idx_v2; - enum amdgpu_memory_partition save_nps; - - save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; - chan_idx_v2 = bps[0].retired_page & UMC_CHANNEL_IDX_V2; - /*old asics just have pa in eeprom*/ - if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) { - memcpy(err_data->err_addr, bps, - sizeof(struct eeprom_table_record) * adev->umc.retire_unit); - goto out; - } - - for (i = 0; i < adev->umc.retire_unit; i++) - bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); + memcpy(err_data->err_addr, bps, + sizeof(struct eeprom_table_record) * adev->umc.retire_unit); - if (save_nps || chan_idx_v2) { - if (save_nps == nps) { - if (amdgpu_umc_pages_in_a_row(adev, err_data, - bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT)) - return -EINVAL; - for (i = 0; i < adev->umc.retire_unit; i++) { - err_data->err_addr[i].address = bps[0].address; - err_data->err_addr[i].mem_channel = bps[0].mem_channel; - err_data->err_addr[i].bank = bps[0].bank; - err_data->err_addr[i].err_type = bps[0].err_type; - err_data->err_addr[i].mcumc_id = 
bps[0].mcumc_id; - } - } else { - if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data)) - return -EINVAL; - } - } else { - if (bps[0].address == 0) { - /* for specific old eeprom data, mca address is not stored, - * calc it from pa - */ - if (amdgpu_umc_pa2mca(adev, bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT, - &(bps[0].address), AMDGPU_NPS1_PARTITION_MODE)) - return -EINVAL; - } - - if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) { - if (nps == AMDGPU_NPS1_PARTITION_MODE) - memcpy(err_data->err_addr, bps, - sizeof(struct eeprom_table_record) * adev->umc.retire_unit); - else - return -EOPNOTSUPP; - } - } - -out: return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit); } -static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev, - struct eeprom_table_record *bps, struct ras_err_data *err_data, - enum amdgpu_memory_partition nps) -{ - int i = 0; - uint64_t chan_idx_v2; - enum amdgpu_memory_partition save_nps; - - if (!amdgpu_ras_smu_eeprom_supported(adev)) { - save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; - chan_idx_v2 = bps->retired_page & UMC_CHANNEL_IDX_V2; - bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); - } else { - /* if pmfw manages eeprom, save_nps is not stored on eeprom, - * we should always convert mca address into physical address, - * make save_nps different from nps - */ - save_nps = nps + 1; - } - - if (save_nps == nps) { - if (amdgpu_umc_pages_in_a_row(adev, err_data, - bps->retired_page << AMDGPU_GPU_PAGE_SHIFT)) - return -EINVAL; - for (i = 0; i < adev->umc.retire_unit; i++) { - err_data->err_addr[i].address = bps->address; - err_data->err_addr[i].mem_channel = bps->mem_channel; - err_data->err_addr[i].bank = bps->bank; - err_data->err_addr[i].err_type = bps->err_type; - err_data->err_addr[i].mcumc_id = bps->mcumc_id; - } - } else { - if (save_nps || chan_idx_v2) { - if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) - return -EINVAL; - } else { - /* for specific old eeprom 
data, mca address is not stored, - * calc it from pa - */ - if (bps->address == 0) - if (amdgpu_umc_pa2mca(adev, - bps->retired_page << AMDGPU_GPU_PAGE_SHIFT, - &(bps->address), - AMDGPU_NPS1_PARTITION_MODE)) - return -EINVAL; - - if (amdgpu_ras_mca2pa(adev, bps, err_data)) - return -EOPNOTSUPP; - } - } - - return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, - adev->umc.retire_unit); -} - /* it deal with vram only. */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, struct eeprom_table_record *bps, int pages, bool from_rom) @@ -3300,8 +2981,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, if (from_rom) { /* there is no pa recs in V3, so skip pa recs processing */ - if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) && - !amdgpu_ras_smu_eeprom_supported(adev)) { + if (control->tbl_hdr.version < RAS_TABLE_VER_V3) { for (i = 0; i < pages; i++) { if (control->ras_num_recs - i >= adev->umc.retire_unit) { if ((bps[i].address == bps[i + 1].address) && @@ -3318,10 +2998,8 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, } } } - for (; i < pages; i++) { - ret = __amdgpu_ras_convert_rec_from_rom(adev, - &bps[i], &err_data, nps); - } + for (; i < pages; i++) + bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); con->eh_data->count_saved = con->eh_data->count; } else { @@ -3346,7 +3024,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; struct amdgpu_ras_eeprom_control *control; - int save_count, unit_num, i; + int save_count, unit_num; if (!con || !con->eh_data) { if (new_cnt) @@ -3367,12 +3045,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, mutex_lock(&con->recovery_lock); control = &con->eeprom_control; data = con->eh_data; - if (amdgpu_ras_smu_eeprom_supported(adev)) - unit_num = control->ras_num_recs - - control->ras_num_recs_old; - else - unit_num = data->count / adev->umc.retire_unit - - 
control->ras_num_recs; + unit_num = data->count / adev->umc.retire_unit - control->ras_num_recs; save_count = con->bad_page_num - control->ras_num_bad_pages; mutex_unlock(&con->recovery_lock); @@ -3383,21 +3056,10 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, /* only new entries are saved */ if (unit_num && save_count) { /*old asics only save pa to eeprom like before*/ - if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) { - if (amdgpu_ras_eeprom_append(control, - &data->bps[data->count_saved], unit_num)) { - dev_err(adev->dev, "Failed to save EEPROM table data!"); - return -EIO; - } - } else { - for (i = 0; i < unit_num; i++) { - if (amdgpu_ras_eeprom_append(control, - &data->bps[data->count_saved + - i * adev->umc.retire_unit], 1)) { - dev_err(adev->dev, "Failed to save EEPROM table data!"); - return -EIO; - } - } + if (amdgpu_ras_eeprom_append(control, + &data->bps[data->count_saved], unit_num)) { + dev_err(adev->dev, "Failed to save EEPROM table data!"); + return -EIO; } dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); @@ -3416,7 +3078,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) struct amdgpu_ras_eeprom_control *control = &adev->psp.ras_context.ras->eeprom_control; struct eeprom_table_record *bps; - int ret, i = 0; + int ret; /* no bad page record, skip eeprom access */ if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0) @@ -3430,33 +3092,6 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) if (ret) { dev_err(adev->dev, "Failed to load EEPROM table records!"); } else { - if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { - /*In V3, there is no pa recs, and some cases(when address==0) may be parsed - as pa recs, so add verion check to avoid it. 
- */ - if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) && - !amdgpu_ras_smu_eeprom_supported(adev)) { - for (i = 0; i < control->ras_num_recs; i++) { - if ((control->ras_num_recs - i) >= adev->umc.retire_unit) { - if ((bps[i].address == bps[i + 1].address) && - (bps[i].mem_channel == bps[i + 1].mem_channel)) { - control->ras_num_pa_recs += adev->umc.retire_unit; - i += (adev->umc.retire_unit - 1); - } else { - control->ras_num_mca_recs += - (control->ras_num_recs - i); - break; - } - } else { - control->ras_num_mca_recs += (control->ras_num_recs - i); - break; - } - } - } else { - control->ras_num_mca_recs = control->ras_num_recs; - } - } - ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true); if (ret) goto out; @@ -3550,293 +3185,6 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, } } -int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, - enum amdgpu_ras_block block, uint16_t pasid, - pasid_notify pasid_fn, void *data, uint32_t reset) -{ - int ret = 0; - struct ras_poison_msg poison_msg; - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - - memset(&poison_msg, 0, sizeof(poison_msg)); - poison_msg.block = block; - poison_msg.pasid = pasid; - poison_msg.reset = reset; - poison_msg.pasid_fn = pasid_fn; - poison_msg.data = data; - - ret = kfifo_put(&con->poison_fifo, poison_msg); - if (!ret) { - dev_err(adev->dev, "Poison message fifo is full!\n"); - return -ENOSPC; - } - - return 0; -} - -static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev, - struct ras_poison_msg *poison_msg) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - - return kfifo_get(&con->poison_fifo, poison_msg); -} - -static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) -{ - mutex_init(&ecc_log->lock); - - INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); - ecc_log->de_queried_count = 0; - ecc_log->consumption_q_count = 0; -} - -static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) -{ - 
struct radix_tree_iter iter; - void __rcu **slot; - struct ras_ecc_err *ecc_err; - - mutex_lock(&ecc_log->lock); - radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) { - ecc_err = radix_tree_deref_slot(slot); - kfree(ecc_err->err_pages.pfn); - kfree(ecc_err); - radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot); - } - mutex_unlock(&ecc_log->lock); - - mutex_destroy(&ecc_log->lock); - ecc_log->de_queried_count = 0; - ecc_log->consumption_q_count = 0; -} - -static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, - uint32_t delayed_ms) -{ - int ret; - - mutex_lock(&con->umc_ecc_log.lock); - ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree, - UMC_ECC_NEW_DETECTED_TAG); - mutex_unlock(&con->umc_ecc_log.lock); - - if (ret) - schedule_delayed_work(&con->page_retirement_dwork, - msecs_to_jiffies(delayed_ms)); - - return ret ? true : false; -} - -static void amdgpu_ras_do_page_retirement(struct work_struct *work) -{ - struct amdgpu_ras *con = container_of(work, struct amdgpu_ras, - page_retirement_dwork.work); - struct amdgpu_device *adev = con->adev; - struct ras_err_data err_data; - - /* If gpu reset is ongoing, delay retiring the bad pages */ - if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) { - amdgpu_ras_schedule_retirement_dwork(con, - AMDGPU_RAS_RETIRE_PAGE_INTERVAL * 3); - return; - } - - amdgpu_ras_error_data_init(&err_data); - - amdgpu_umc_handle_bad_pages(adev, &err_data); - - amdgpu_ras_error_data_fini(&err_data); - - amdgpu_ras_schedule_retirement_dwork(con, - AMDGPU_RAS_RETIRE_PAGE_INTERVAL); -} - -static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, - uint32_t poison_creation_count) -{ - int ret = 0; - struct ras_ecc_log_info *ecc_log; - struct ras_query_if info; - u32 timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - u64 de_queried_count; - u64 consumption_q_count; - enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; - - 
memset(&info, 0, sizeof(info)); - info.head.block = AMDGPU_RAS_BLOCK__UMC; - - ecc_log = &ras->umc_ecc_log; - ecc_log->de_queried_count = 0; - ecc_log->consumption_q_count = 0; - - do { - ret = amdgpu_ras_query_error_status_with_event(adev, &info, type); - if (ret) - return ret; - - de_queried_count = ecc_log->de_queried_count; - consumption_q_count = ecc_log->consumption_q_count; - - if (de_queried_count && consumption_q_count) - break; - - msleep(100); - } while (--timeout); - - if (de_queried_count) - schedule_delayed_work(&ras->page_retirement_dwork, 0); - - if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0) - amdgpu_ras_reset_gpu(adev); - - return 0; -} - -static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct ras_poison_msg msg; - int ret; - - do { - ret = kfifo_get(&con->poison_fifo, &msg); - } while (ret); -} - -static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, - uint32_t msg_count, uint32_t *gpu_reset) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - uint32_t reset_flags = 0, reset = 0; - struct ras_poison_msg msg; - int ret, i; - - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - - for (i = 0; i < msg_count; i++) { - ret = amdgpu_ras_get_poison_req(adev, &msg); - if (!ret) - continue; - - if (msg.pasid_fn) - msg.pasid_fn(adev, msg.pasid, msg.data); - - reset_flags |= msg.reset; - } - - /* - * Try to ensure poison creation handler is completed first - * to set rma if bad page exceed threshold. 
- */ - flush_delayed_work(&con->page_retirement_dwork); - - /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ - if (reset_flags && !amdgpu_ras_is_rma(adev)) { - if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; - else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) - reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; - else - reset = reset_flags; - - con->gpu_reset_flags |= reset; - amdgpu_ras_reset_gpu(adev); - - *gpu_reset = reset; - - /* Wait for gpu recovery to complete */ - flush_work(&con->recovery_work); - } - - return 0; -} - -static int amdgpu_ras_page_retirement_thread(void *param) -{ - struct amdgpu_device *adev = (struct amdgpu_device *)param; - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - uint32_t poison_creation_count, msg_count; - uint32_t gpu_reset; - int ret; - - while (!kthread_should_stop()) { - - wait_event_interruptible(con->page_retirement_wq, - kthread_should_stop() || - atomic_read(&con->page_retirement_req_cnt)); - - if (kthread_should_stop()) - break; - - mutex_lock(&con->poison_lock); - gpu_reset = 0; - - do { - poison_creation_count = atomic_read(&con->poison_creation_count); - ret = amdgpu_ras_poison_creation_handler(adev, poison_creation_count); - if (ret == -EIO) - break; - - if (poison_creation_count) { - atomic_sub(poison_creation_count, &con->poison_creation_count); - atomic_sub(poison_creation_count, &con->page_retirement_req_cnt); - } - } while (atomic_read(&con->poison_creation_count) && - !atomic_read(&con->poison_consumption_count)); - - if (ret != -EIO) { - msg_count = kfifo_len(&con->poison_fifo); - if (msg_count) { - ret = amdgpu_ras_poison_consumption_handler(adev, - msg_count, &gpu_reset); - if ((ret != -EIO) && - (gpu_reset != AMDGPU_RAS_GPU_RESET_MODE1_RESET)) - atomic_sub(msg_count, &con->page_retirement_req_cnt); - } - } - - if ((ret == -EIO) || (gpu_reset == AMDGPU_RAS_GPU_RESET_MODE1_RESET)) { - /* gpu mode-1 reset is ongoing or just 
completed ras mode-1 reset */ - /* Clear poison creation request */ - atomic_set(&con->poison_creation_count, 0); - atomic_set(&con->poison_consumption_count, 0); - - /* Clear poison fifo */ - amdgpu_ras_clear_poison_fifo(adev); - - /* Clear all poison requests */ - atomic_set(&con->page_retirement_req_cnt, 0); - - if (ret == -EIO) { - /* Wait for mode-1 reset to complete */ - down_read(&adev->reset_domain->sem); - up_read(&adev->reset_domain->sem); - } - - /* Wake up work to save bad pages to eeprom */ - schedule_delayed_work(&con->page_retirement_dwork, 0); - } else if (gpu_reset) { - /* gpu just completed mode-2 reset or other reset */ - /* Clear poison consumption messages cached in fifo */ - msg_count = kfifo_len(&con->poison_fifo); - if (msg_count) { - amdgpu_ras_clear_poison_fifo(adev); - atomic_sub(msg_count, &con->page_retirement_req_cnt); - } - - atomic_set(&con->poison_consumption_count, 0); - - /* Wake up work to save bad pages to eeprom */ - schedule_delayed_work(&con->page_retirement_dwork, 0); - } - mutex_unlock(&con->poison_lock); - } - - return 0; -} - int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -3846,7 +3194,14 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) if (!con || amdgpu_sriov_vf(adev)) return 0; - if (amdgpu_uniras_enabled(adev)) + /* + * For the reset-on-init path (e.g. an NPS memory partition, + * switch) the RAS IP block hw_init has not been enabled and + * the amdgpu_uniras_enabled return false, check amdgpu ras + * context uniras_enabled flag, eeprom init will be called + * during RAS IP block hw_init. 
+ */ + if (amdgpu_uniras_enabled(adev) || con->uniras_enabled) return 0; control = &con->eeprom_control; @@ -3855,9 +3210,6 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) ret = amdgpu_ras_eeprom_init(control); control->is_eeprom_valid = !ret; - if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) - control->ras_num_pa_recs = control->ras_num_recs; - if (adev->umc.ras && adev->umc.ras->get_retire_flip_bits) adev->umc.ras->get_retire_flip_bits(adev); @@ -3877,13 +3229,6 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) adev, control->bad_channel_bitmap); con->update_channel_flag = false; } - - /* The format action is only applied to new ASICs */ - if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 && - control->tbl_hdr.version < RAS_TABLE_VER_V3) - if (!amdgpu_ras_eeprom_reset_table(control)) - if (amdgpu_ras_save_bad_pages(adev, NULL)) - dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n"); } return 0; @@ -3917,10 +3262,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) } mutex_init(&con->recovery_lock); - mutex_init(&con->poison_lock); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); atomic_set(&con->in_recovery, 0); - atomic_set(&con->rma_in_recovery, 0); con->eeprom_control.bad_channel_bitmap = 0; max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); @@ -3933,21 +3276,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) } mutex_init(&con->page_rsv_lock); - INIT_KFIFO(con->poison_fifo); mutex_init(&con->page_retirement_lock); - init_waitqueue_head(&con->page_retirement_wq); - atomic_set(&con->page_retirement_req_cnt, 0); - atomic_set(&con->poison_creation_count, 0); - atomic_set(&con->poison_consumption_count, 0); - con->page_retirement_thread = - kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement"); - if (IS_ERR(con->page_retirement_thread)) { - con->page_retirement_thread = 
NULL; - dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n"); - } - - INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement); - amdgpu_ras_ecc_log_init(&con->umc_ecc_log); + #ifdef CONFIG_X86_MCE_AMD if ((adev->asic_type == CHIP_ALDEBARAN) && (adev->gmc.xgmi.connected_to_cpu)) @@ -3978,33 +3308,15 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data = con->eh_data; - int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES; - bool ret; /* recovery_init failed to init it, fini is useless */ if (!data) return 0; - /* Save all cached bad pages to eeprom */ - do { - flush_delayed_work(&con->page_retirement_dwork); - ret = amdgpu_ras_schedule_retirement_dwork(con, 0); - } while (ret && max_flush_timeout--); - - if (con->page_retirement_thread) - kthread_stop(con->page_retirement_thread); - - atomic_set(&con->page_retirement_req_cnt, 0); - atomic_set(&con->poison_creation_count, 0); - mutex_destroy(&con->page_rsv_lock); cancel_work_sync(&con->recovery_work); - cancel_delayed_work_sync(&con->page_retirement_dwork); - - amdgpu_ras_ecc_log_fini(&con->umc_ecc_log); - mutex_lock(&con->recovery_lock); con->eh_data = NULL; kfree(data->bps); @@ -4206,15 +3518,6 @@ init_ras_enabled_flag: adev->ras_enabled = amdgpu_ras_enable == 0 ? 
0 : adev->ras_hw_enabled & amdgpu_ras_mask; - /* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14 */ - if (!amdgpu_sriov_vf(adev)) { - adev->aca.is_enabled = - (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || - amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) || - amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14) || - amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 15)); - } - /* bad page feature is not applicable to specific app platform */ if (adev->gmc.is_app_apu && amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0)) @@ -4435,15 +3738,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) goto release_con; } - if (amdgpu_ras_aca_is_supported(adev)) { - if (amdgpu_aca_is_enabled(adev)) - r = amdgpu_aca_init(adev); - else - r = amdgpu_mca_init(adev); - if (r) - goto release_con; - } - con->init_task_pid = task_pid_nr(current); get_task_comm(con->init_task_comm, current); @@ -4541,9 +3835,9 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev, goto cleanup; } - if (ras_obj->hw_ops && + if (amdgpu_uniras_enabled(adev) || (ras_obj->hw_ops && (ras_obj->hw_ops->query_ras_error_count || - ras_obj->hw_ops->query_ras_error_status)) { + ras_obj->hw_ops->query_ras_error_status))) { r = amdgpu_ras_sysfs_create(adev, ras_block); if (r) goto interrupt; @@ -4671,28 +3965,13 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) amdgpu_ras_event_mgr_init(adev); - if (amdgpu_ras_aca_is_supported(adev)) { - if (amdgpu_reset_in_recovery(adev)) { - if (amdgpu_aca_is_enabled(adev)) - r = amdgpu_aca_reset(adev); - else - r = amdgpu_mca_reset(adev); - if (r) - return r; - } - - if (!amdgpu_sriov_vf(adev)) { - if (amdgpu_aca_is_enabled(adev)) - amdgpu_ras_set_aca_debug_mode(adev, false); - else - amdgpu_ras_set_mca_debug_mode(adev, false); - } - } - /* Guest side doesn't need init ras feature */ if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev)) return 0; + if 
(amdgpu_uniras_enabled(adev)) + amdgpu_ras_mgr_set_debug_mode(adev, false); + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { obj = node->ras_obj; if (!obj) { @@ -4773,13 +4052,6 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) amdgpu_ras_fs_fini(adev); amdgpu_ras_interrupt_remove_all(adev); - if (amdgpu_ras_aca_is_supported(adev)) { - if (amdgpu_aca_is_enabled(adev)) - amdgpu_aca_fini(adev); - else - amdgpu_mca_fini(adev); - } - WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared"); if (AMDGPU_RAS_GET_FEATURES(con->features)) @@ -5064,6 +4336,13 @@ static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev) * Use this list instead of mgpu_info to find the amdgpu * device on which the UMC error was reported. */ + if (mce_adev_list.num_gpu >= MAX_GPU_INSTANCE) { + dev_warn_ratelimited(adev->dev, + "mce_adev_list full, skip notifier registration (max=%d)\n", + MAX_GPU_INSTANCE); + return; + } + mce_adev_list.devs[mce_adev_list.num_gpu++] = adev; /* @@ -5181,59 +4460,10 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) return 0; } -int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - int ret = 0; - - if (con) { - ret = amdgpu_mca_smu_set_debug_mode(adev, enable); - if (!ret) - con->is_aca_debug_mode = enable; - } - - return ret; -} - -int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - int ret = 0; - - if (con) { - if (amdgpu_aca_is_enabled(adev)) - ret = amdgpu_aca_smu_set_debug_mode(adev, enable); - else - ret = amdgpu_mca_smu_set_debug_mode(adev, enable); - if (!ret) - con->is_aca_debug_mode = enable; - } - - return ret; -} - -bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; - const struct 
amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - - if (!con) - return false; - - if ((amdgpu_aca_is_enabled(adev) && smu_funcs && smu_funcs->set_debug_mode) || - (!amdgpu_aca_is_enabled(adev) && mca_funcs && mca_funcs->mca_set_debug_mode)) - return con->is_aca_debug_mode; - else - return true; -} - bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, unsigned int *error_query_mode) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; if (!con) { *error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY; @@ -5242,9 +4472,6 @@ bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, if (amdgpu_sriov_vf(adev)) { *error_query_mode = AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY; - } else if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode)) { - *error_query_mode = - (con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY; } else { *error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY; } @@ -5834,3 +5061,8 @@ void amdgpu_ras_post_reset(struct amdgpu_device *adev, amdgpu_ras_mgr_post_reset(tmp_adev); } } + +void amdgpu_ras_resume_after_reset(struct amdgpu_device *adev) +{ + amdgpu_ras_mgr_resume_after_reset(adev); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index a86ab65aa2f0..a44aed7f169e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -31,7 +31,6 @@ #include "ta_ras_if.h" #include "amdgpu_ras_eeprom.h" #include "amdgpu_smuio.h" -#include "amdgpu_aca.h" struct amdgpu_iv_entry; @@ -466,14 +465,6 @@ struct ras_query_context { typedef int (*pasid_notify)(struct amdgpu_device *adev, uint16_t pasid, void *data); -struct ras_poison_msg { - enum amdgpu_ras_block block; - uint16_t pasid; - uint32_t reset; - pasid_notify pasid_fn; - void *data; -}; - struct 
ras_err_pages { uint32_t count; uint64_t *pfn; @@ -492,8 +483,6 @@ struct ras_ecc_err { struct ras_ecc_log_info { struct mutex lock; struct radix_tree_root de_page_tree; - uint64_t de_queried_count; - uint64_t consumption_q_count; }; struct ras_critical_region { @@ -549,7 +538,6 @@ struct amdgpu_ras { /* gpu recovery */ struct work_struct recovery_work; atomic_t in_recovery; - atomic_t rma_in_recovery; struct amdgpu_device *adev; /* error handler data */ struct ras_err_handler_data *eh_data; @@ -581,22 +569,15 @@ struct amdgpu_ras { /* Indicates smu whether need update bad channel info */ bool update_channel_flag; /* Record status of smu mca debug mode */ - bool is_aca_debug_mode; + bool is_mca_debug_mode; bool is_rma; /* Record special requirements of gpu reset caller */ uint32_t gpu_reset_flags; - struct task_struct *page_retirement_thread; - wait_queue_head_t page_retirement_wq; struct mutex page_retirement_lock; - atomic_t page_retirement_req_cnt; - atomic_t poison_creation_count; - atomic_t poison_consumption_count; struct mutex page_rsv_lock; - DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128); struct ras_ecc_log_info umc_ecc_log; - struct delayed_work page_retirement_dwork; /* ras errors detected */ unsigned long ras_err_state; @@ -615,9 +596,6 @@ struct amdgpu_ras { struct list_head critical_region_head; struct mutex critical_region_lock; - /* Protect poison injection */ - struct mutex poison_lock; - /* Disable/Enable uniras switch */ bool uniras_enabled; const struct ras_smu_drv *ras_smu_drv; @@ -702,8 +680,6 @@ struct ras_manager { struct ras_ih_data ih_data; struct ras_err_data err_data; - - struct aca_handle aca_handle; }; struct ras_badpage { @@ -964,8 +940,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); -int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device 
*adev, bool enable); -bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev); +bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev); bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, unsigned int *mode); @@ -1006,12 +981,6 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count); void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances); -int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - const struct aca_info *aca_info, void *data); -int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk); - -ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr, - struct aca_handle *handle, char *buf, void *data); void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status); bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev); @@ -1029,10 +998,6 @@ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn); int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, struct amdgpu_bo *bo); bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr); -int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, - enum amdgpu_ras_block block, uint16_t pasid, - pasid_notify pasid_fn, void *data, uint32_t reset); - bool amdgpu_ras_in_recovery(struct amdgpu_device *adev); __printf(3, 4) @@ -1045,4 +1010,5 @@ void amdgpu_ras_pre_reset(struct amdgpu_device *adev, struct list_head *device_list); void amdgpu_ras_post_reset(struct amdgpu_device *adev, struct list_head *device_list); +void amdgpu_ras_resume_after_reset(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index b265b4d9053f..baa8cc3646d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -124,8 +124,6 @@ 
RAS_TABLE_V2_1_INFO_SIZE) \ / RAS_TABLE_RECORD_SIZE) -#define RAS_SMU_MESSAGE_TIMEOUT_MS 1000 /* 1s */ - /* Given a zero-based index of an EEPROM RAS record, yields the EEPROM * offset off of RAS_TABLE_START. That is, this is something you can * add to control->i2c_address, and then tell I2C layer to read @@ -159,6 +157,9 @@ static bool __is_ras_eeprom_supported(struct amdgpu_device *adev) { + if (amdgpu_sriov_vf(adev)) + return false; + switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { case IP_VERSION(11, 0, 2): /* VEGA20 and ARCTURUS */ case IP_VERSION(11, 0, 7): /* Sienna cichlid */ @@ -449,57 +450,46 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - u32 erase_res = 0; u8 csum; int res; mutex_lock(&control->ras_tbl_mutex); - if (!amdgpu_ras_smu_eeprom_supported(adev)) { - hdr->header = RAS_TABLE_HDR_VAL; - amdgpu_ras_set_eeprom_table_version(control); - - if (hdr->version >= RAS_TABLE_VER_V2_1) { - hdr->first_rec_offset = RAS_RECORD_START_V2_1; - hdr->tbl_size = RAS_TABLE_HEADER_SIZE + - RAS_TABLE_V2_1_INFO_SIZE; - rai->rma_status = GPU_HEALTH_USABLE; - - control->ras_record_offset = RAS_RECORD_START_V2_1; - control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1; - /** - * GPU health represented as a percentage. - * 0 means worst health, 100 means fully health. 
- */ - rai->health_percent = 100; - /* ecc_page_threshold = 0 means disable bad page retirement */ - rai->ecc_page_threshold = con->bad_page_cnt_threshold; - } else { - hdr->first_rec_offset = RAS_RECORD_START; - hdr->tbl_size = RAS_TABLE_HEADER_SIZE; + hdr->header = RAS_TABLE_HDR_VAL; + amdgpu_ras_set_eeprom_table_version(control); - control->ras_record_offset = RAS_RECORD_START; - control->ras_max_record_count = RAS_MAX_RECORD_COUNT; - } + if (hdr->version >= RAS_TABLE_VER_V2_1) { + hdr->first_rec_offset = RAS_RECORD_START_V2_1; + hdr->tbl_size = RAS_TABLE_HEADER_SIZE + + RAS_TABLE_V2_1_INFO_SIZE; + rai->rma_status = GPU_HEALTH_USABLE; - csum = __calc_hdr_byte_sum(control); - if (hdr->version >= RAS_TABLE_VER_V2_1) - csum += __calc_ras_info_byte_sum(control); - csum = -csum; - hdr->checksum = csum; - res = __write_table_header(control); - if (!res && hdr->version > RAS_TABLE_VER_V1) - res = __write_table_ras_info(control); + control->ras_record_offset = RAS_RECORD_START_V2_1; + control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1; + /** + * GPU health represented as a percentage. + * 0 means worst health, 100 means fully health. 
+ */ + rai->health_percent = 100; + /* ecc_page_threshold = 0 means disable bad page retirement */ + rai->ecc_page_threshold = con->bad_page_cnt_threshold; } else { - res = amdgpu_ras_smu_erase_ras_table(adev, &erase_res); - if (res || erase_res) { - dev_warn(adev->dev, "RAS EEPROM reset failed, res:%d result:%d", - res, erase_res); - if (!res) - res = -EIO; - } + hdr->first_rec_offset = RAS_RECORD_START; + hdr->tbl_size = RAS_TABLE_HEADER_SIZE; + + control->ras_record_offset = RAS_RECORD_START; + control->ras_max_record_count = RAS_MAX_RECORD_COUNT; } + csum = __calc_hdr_byte_sum(control); + if (hdr->version >= RAS_TABLE_VER_V2_1) + csum += __calc_ras_info_byte_sum(control); + csum = -csum; + hdr->checksum = csum; + res = __write_table_header(control); + if (!res && hdr->version > RAS_TABLE_VER_V1) + res = __write_table_ras_info(control); + control->ras_num_recs = 0; control->ras_num_bad_pages = 0; control->ras_num_mca_recs = 0; @@ -662,7 +652,6 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, const u32 num) { struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control)); - struct amdgpu_device *adev = to_amdgpu_device(control); u32 a, b, i; u8 *buf, *pp; int res; @@ -767,10 +756,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, % control->ras_max_record_count; /*old asics only save pa to eeprom like before*/ - if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) - control->ras_num_pa_recs += num; - else - control->ras_num_mca_recs += num; + control->ras_num_pa_recs += num; control->ras_num_bad_pages = con->bad_page_num; Out: @@ -879,71 +865,6 @@ Out: return res; } -int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control) -{ - struct amdgpu_device *adev = to_amdgpu_device(control); - int ret, retry = 20; - - if (!amdgpu_ras_smu_eeprom_supported(adev)) - return 0; - - control->ras_num_recs_old = control->ras_num_recs; - - do { - /* 1000ms timeout is long 
enough, smu_get_badpage_count won't - * return -EBUSY before timeout. - */ - ret = amdgpu_ras_smu_get_badpage_count(adev, - &(control->ras_num_recs), RAS_SMU_MESSAGE_TIMEOUT_MS); - if (!ret && - (control->ras_num_recs_old == control->ras_num_recs)) { - /* record number update in PMFW needs some time, - * smu_get_badpage_count may return immediately without - * count update, sleep for a while and retry again. - */ - msleep(50); - retry--; - } else { - break; - } - } while (retry); - - /* no update of record number is not a real failure, - * don't print warning here - */ - if (!ret && (control->ras_num_recs_old == control->ras_num_recs)) - ret = -EINVAL; - - return ret; -} - -static int amdgpu_ras_smu_eeprom_append(struct amdgpu_ras_eeprom_control *control) -{ - struct amdgpu_device *adev = to_amdgpu_device(control); - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - - if (!amdgpu_ras_smu_eeprom_supported(adev) || !con) - return 0; - - control->ras_num_bad_pages = con->bad_page_num; - - if (amdgpu_bad_page_threshold != 0 && - control->ras_num_bad_pages > con->bad_page_cnt_threshold) { - dev_warn(adev->dev, - "Saved bad pages %d reaches threshold value %d\n", - control->ras_num_bad_pages, con->bad_page_cnt_threshold); - - if (adev->cper.enabled && amdgpu_cper_generate_bp_threshold_record(adev)) - dev_warn(adev->dev, "fail to generate bad page threshold cper records\n"); - - if ((amdgpu_bad_page_threshold != -1) && - (amdgpu_bad_page_threshold != -2)) - con->is_rma = true; - } - - return 0; -} - /** * amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table * @control: pointer to control structure @@ -968,9 +889,6 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, if (!__is_ras_eeprom_supported(adev)) return 0; - if (amdgpu_ras_smu_eeprom_supported(adev)) - return amdgpu_ras_smu_eeprom_append(control); - if (num == 0) { dev_err(adev->dev, "will not append 0 records\n"); return -EINVAL; @@ -1046,52 +964,6 @@ static int 
__amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, return res; } -int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control, - struct eeprom_table_record *record, u32 rec_idx, - const u32 num) -{ - struct amdgpu_device *adev = to_amdgpu_device(control); - uint64_t ts, end_idx; - int i, ret; - u64 mca, ipid; - u32 cu, mem_channel, mcumc_id; - - if (!amdgpu_ras_smu_eeprom_supported(adev)) - return 0; - - if (!adev->umc.ras || !adev->umc.ras->mca_ipid_parse) - return -EOPNOTSUPP; - - end_idx = rec_idx + num; - for (i = rec_idx; i < end_idx; i++) { - ret = amdgpu_ras_smu_get_badpage_mca_addr(adev, i, &mca); - if (ret) - return ret; - - ret = amdgpu_ras_smu_get_badpage_ipid(adev, i, &ipid); - if (ret) - return ret; - - ret = amdgpu_ras_smu_get_timestamp(adev, i, &ts); - if (ret) - return ret; - - record[i - rec_idx].address = mca; - /* retired_page (pa) is unused now */ - record[i - rec_idx].retired_page = 0x1ULL; - record[i - rec_idx].ts = ts; - record[i - rec_idx].err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; - - adev->umc.ras->mca_ipid_parse(adev, ipid, - &cu, &mem_channel, &mcumc_id, NULL); - record[i - rec_idx].cu = (u8)cu; - record[i - rec_idx].mem_channel = (u8)mem_channel; - record[i - rec_idx].mcumc_id = (u8)mcumc_id; - } - - return 0; -} - /** * amdgpu_ras_eeprom_read -- read EEPROM * @control: pointer to control structure @@ -1113,9 +985,6 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, u8 *buf, *pp; u32 g0, g1; - if (amdgpu_ras_smu_eeprom_supported(adev)) - return amdgpu_ras_eeprom_read_idx(control, record, 0, num); - if (!__is_ras_eeprom_supported(adev)) return 0; @@ -1396,6 +1265,86 @@ Out: } static ssize_t +amdgpu_ras_debugfs_table_read_uniras(struct amdgpu_device *adev, + char __user *buf, + size_t size, loff_t *pos) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + struct ras_core_context *ras_core = ras_mgr ? 
ras_mgr->ras_core : NULL; + struct eeprom_umc_record *records = NULL; + struct ras_eeprom_control *control; + size_t bufsz, len = 0; + u32 num_recs; + char *kbuf; + ssize_t res; + int i; + + if (!ras_core) + return 0; + + /* pmfw manages eeprom data by itself */ + if (ras_fw_eeprom_supported(ras_core)) + return 0; + + control = &ras_core->ras_eeprom; + num_recs = ras_eeprom_get_record_count(ras_core); + + bufsz = strlen(tbl_hdr_str) + tbl_hdr_fmt_size + + strlen(rec_hdr_str) + (size_t)rec_hdr_fmt_size * num_recs + 1; + + kbuf = kvmalloc(bufsz, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + if (num_recs) { + records = kvcalloc(num_recs, sizeof(*records), GFP_KERNEL); + if (!records) { + res = -ENOMEM; + goto out; + } + + res = ras_eeprom_read(ras_core, records, num_recs); + if (res) + goto out; + } + + len += scnprintf(kbuf + len, bufsz - len, "%s", tbl_hdr_str); + len += scnprintf(kbuf + len, bufsz - len, tbl_hdr_fmt, + control->tbl_hdr.header, + control->tbl_hdr.version, + control->tbl_hdr.first_rec_offset, + control->tbl_hdr.tbl_size, + control->tbl_hdr.checksum); + len += scnprintf(kbuf + len, bufsz - len, "%s", rec_hdr_str); + + for (i = 0; i < num_recs; i++) { + u32 ai = RAS_RI_TO_AI(control, i); + int et = records[i].err_type; + const char *ets = (et >= 0 && et < AMDGPU_RAS_EEPROM_ERR_COUNT) ? 
+ record_err_type_str[et] : "na"; + + len += scnprintf(kbuf + len, bufsz - len, rec_hdr_fmt, + i, + RAS_INDEX_TO_OFFSET(control, ai), + ets, + records[i].bank, + records[i].ts, + records[i].offset, + records[i].mem_channel, + records[i].mcumc_id, + records[i].retired_row_pfn); + } + + res = simple_read_from_buffer(buf, size, pos, kbuf, len); + +out: + kvfree(records); + kvfree(kbuf); + + return res; +} + +static ssize_t amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { @@ -1408,6 +1357,10 @@ amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf, if (!size) return size; + if (amdgpu_uniras_enabled(adev)) + return amdgpu_ras_debugfs_table_read_uniras(adev, buf, + size, pos); + if (!ras || !control) { res = snprintf(data, sizeof(data), "Not supported\n"); if (*pos >= res) @@ -1521,42 +1474,6 @@ Out: return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res; } -static int amdgpu_ras_smu_eeprom_init(struct amdgpu_ras_eeprom_control *control) -{ - struct amdgpu_device *adev = to_amdgpu_device(control); - struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - uint64_t local_time; - int res; - - ras->is_rma = false; - - if (!__is_ras_eeprom_supported(adev)) - return 0; - mutex_init(&control->ras_tbl_mutex); - - res = amdgpu_ras_smu_get_table_version(adev, &(hdr->version)); - if (res) - return res; - - res = amdgpu_ras_smu_get_badpage_count(adev, - &(control->ras_num_recs), 100); - if (res) - return res; - - local_time = (uint64_t)ktime_get_real_seconds(); - res = amdgpu_ras_smu_set_timestamp(adev, local_time); - if (res) - return res; - - control->ras_max_record_count = 4000; - - control->ras_num_mca_recs = 0; - control->ras_num_pa_recs = 0; - - return 0; -} - int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) { struct amdgpu_device *adev = to_amdgpu_device(control); @@ -1567,9 +1484,6 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control) uint32_t vram_type = adev->gmc.vram_type; int res; - if (amdgpu_ras_smu_eeprom_supported(adev)) - return amdgpu_ras_smu_eeprom_init(control); - ras->is_rma = false; if (!__is_ras_eeprom_supported(adev)) @@ -1663,47 +1577,6 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) return 0; } -static int amdgpu_ras_smu_eeprom_check(struct amdgpu_ras_eeprom_control *control) -{ - struct amdgpu_device *adev = to_amdgpu_device(control); - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - - if (!__is_ras_eeprom_supported(adev)) - return 0; - - control->ras_num_bad_pages = ras->bad_page_num; - - if ((ras->bad_page_cnt_threshold < control->ras_num_bad_pages) && - amdgpu_bad_page_threshold != 0) { - dev_warn(adev->dev, - "RAS records:%d exceed threshold:%d\n", - control->ras_num_bad_pages, ras->bad_page_cnt_threshold); - if ((amdgpu_bad_page_threshold == -1) || - (amdgpu_bad_page_threshold == -2)) { - dev_warn(adev->dev, - "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n"); - } else { - ras->is_rma = true; - dev_warn(adev->dev, - "User defined threshold is set, runtime service will be halt when threshold is reached\n"); - } - - return 0; - } - - dev_dbg(adev->dev, - "Found existing EEPROM table with %d records", - control->ras_num_bad_pages); - - /* Warn if we are at 90% of the threshold or above - */ - if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold) - dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d", - control->ras_num_bad_pages, - ras->bad_page_cnt_threshold); - return 0; -} - int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) { struct amdgpu_device *adev = to_amdgpu_device(control); @@ -1711,9 +1584,6 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); int res = 0; - if (amdgpu_ras_smu_eeprom_supported(adev)) - return 
amdgpu_ras_smu_eeprom_check(control); - if (!__is_ras_eeprom_supported(adev)) return 0; @@ -1973,7 +1843,7 @@ void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev) struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL; - if (!control || amdgpu_bad_page_threshold == 0) + if (!__is_ras_eeprom_supported(adev) || !control || amdgpu_bad_page_threshold == 0) return; if (control->ras_num_bad_pages > ras->bad_page_cnt_threshold) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index a62114800a92..3c7fcce5fe8b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -82,7 +82,6 @@ struct amdgpu_ras_eeprom_control { /* Number of records in the table. */ u32 ras_num_recs; - u32 ras_num_recs_old; /* the bad page number is ras_num_recs or * ras_num_recs * umc.retire_unit @@ -191,8 +190,6 @@ int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *record, u32 rec_idx, const u32 num); -int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control); - void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev); extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index b97fa35bac23..4d417c4a5cd2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -935,6 +935,194 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring, return 0; } +/** + * amdgpu_multi_ring_reset_helper_begin() - Prepare multiple rings for a reset. + * + * @ring_type_mask: Bitmask of affected ring types + * @guilty_ring: The ring which is guilty of causing a reset. + * @guilty_fence: The fence which didn't signal on the guilty ring. 
+ * + * Useful when performing a GPU reset method that affects + * multiple rings at the same time, such as an IP block soft + * reset. For example, a GFX IP block soft reset will affect + * every graphics and compute queue. + * + * This function should be called before such a reset. + * + * Prepare the affected rings before the reset, make sure to + * minimize collateral damage, and backup the contents of + * the rings. Then the caller can call the actual HW specific + * reset function. + * + * After the reset is complete, the caller should then call + * amdgpu_multi_ring_reset_helper_end() to restore the rings. + */ +void amdgpu_multi_ring_reset_helper_begin(const u32 ring_type_mask, + struct amdgpu_ring *guilty_ring, + struct amdgpu_fence *guilty_fence) +{ + struct amdgpu_device *adev = guilty_ring->adev; + struct amdgpu_fence *ring_guilty_fence; + struct amdgpu_ring *ring; + bool rings_busy; + int i; + u32 t; + + for (i = 0; i < adev->num_rings; ++i) { + ring = adev->rings[i]; + + if (!(BIT(ring->funcs->type) & ring_type_mask)) + continue; + + /* Don't accept new submissions on the ring. */ + if (amdgpu_ring_sched_ready(ring) && !drm_sched_is_stopped(&ring->sched)) + drm_sched_wqueue_stop(&ring->sched); + + /* + * Clear the preempt condition to stop the ring + * from starting its next submission. This ensures + * that only the currently executing submission + * can be rejected because of the reset and helps + * minimize collateral damage. + */ + if (ring->funcs->init_cond_exec) + amdgpu_ring_set_preempt_cond_exec(ring, false); + } + + /* Flush HDP cache so the GPU can see the updated COND_EXEC values */ + amdgpu_device_flush_hdp(adev, NULL); + + /* + * Give some time for non-guilty rings to finish their + * current submission, to try to minimize collateral damage. + * + * Note that this is just a best effort, but really there + * is no way to really know which ring is actually responsible + * because different rings may share resources, eg. 
a compute + * ring may hog shader engines, causing a graphics ring to hang. + */ + for (t = 0; t < adev->usec_timeout; t += 10000) { + rings_busy = false; + + /* Check if any of the non-guilty rings are busy */ + for (i = 0; i < adev->num_rings; ++i) { + ring = adev->rings[i]; + + if (!(BIT(ring->funcs->type) & ring_type_mask)) + continue; + + if (ring == guilty_ring) + continue; + + rings_busy |= + atomic_read(&ring->fence_drv.last_seq) != + READ_ONCE(ring->fence_drv.sync_seq); + } + + if (!rings_busy) + break; + + mdelay(10); + } + + for (i = 0; i < adev->num_rings; ++i) { + ring = adev->rings[i]; + + if (!(BIT(ring->funcs->type) & ring_type_mask)) + continue; + + /* + * Find guilty fences, ie. the fences that didn't signal + * on each ring. At this point there is no way to know + * which one is really responsible for the hang, and no + * way to save any of them, so we treat all of them as guilty. + */ + ring_guilty_fence = + ring == guilty_ring ? guilty_fence : + amdgpu_ring_find_guilty_fence(ring); + + /* + * Backup current contents of the ring. + * The helper takes care to only reemit unsignalled fences + * so we don't have to worry about that here. + */ + amdgpu_ring_reset_helper_begin(ring, ring_guilty_fence); + } +} + +/** + * amdgpu_multi_ring_reset_helper_end() - Prepare multiple rings for a reset. + * + * @ring_type_mask: Bitmask of affected ring types + * @guilty_ring: The ring which is guilty of causing a reset. + * @ret: Return code from the reset function. + * + * After calling amdgpu_multi_ring_reset_helper_begin() + * and executing the actual reset method, call this + * function to restore normal operation. + * + * In case the reset failed, this function should still + * be called to restore preemption state, but it won't attempt to + * fully restore the ring contents. 
+ */ +int amdgpu_multi_ring_reset_helper_end(const u32 ring_type_mask, + struct amdgpu_ring *guilty_ring, int ret) +{ + struct amdgpu_device *adev = guilty_ring->adev; + struct amdgpu_ring *ring; + int i, r; + + /* Set preempt condition, rings are now allowed to execute submissions */ + for (i = 0; i < adev->num_rings; ++i) { + ring = adev->rings[i]; + + if (!(BIT(ring->funcs->type) & ring_type_mask)) + continue; + + if (ring->funcs->init_cond_exec) + amdgpu_ring_set_preempt_cond_exec(ring, true); + } + + /* Flush HDP cache so the GPU can see the updated COND_EXEC values */ + amdgpu_device_flush_hdp(adev, NULL); + + /* If the reset was unsuccessful, return without restoring anything else. */ + if (ret) + return ret; + + /* Restore contents of all rings */ + for (i = 0; i < adev->num_rings; ++i) { + ring = adev->rings[i]; + + if (!(BIT(ring->funcs->type) & ring_type_mask)) + continue; + + /* Restore contents of the ring */ + r = amdgpu_ring_reset_helper_end(ring, ring->guilty_fence); + if (r) { + dev_err(adev->dev, + "Failed to recover ring %s after soft reset\n", + ring->name); + return r; + } + } + + /* Accept submissions on all rings again */ + for (i = 0; i < adev->num_rings; ++i) { + ring = adev->rings[i]; + + if (!(BIT(ring->funcs->type) & ring_type_mask)) + continue; + + if (!amdgpu_ring_sched_ready(ring)) + continue; + + drm_sched_wqueue_start(&ring->sched); + } + + return 0; +} + bool amdgpu_ring_is_reset_type_supported(struct amdgpu_ring *ring, u32 reset_type) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 8f28b3bd7010..9d3934b4f106 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -314,6 +314,7 @@ struct amdgpu_ring { uint32_t *ring_backup; unsigned int ring_backup_entries_to_copy; bool reemit; + struct amdgpu_fence *guilty_fence; unsigned rptr_offs; u64 rptr_gpu_addr; u32 *rptr_cpu_addr; @@ -588,10 +589,17 @@ int amdgpu_ib_ring_tests(struct 
amdgpu_device *adev); bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring); void amdgpu_ring_backup_unprocessed_commands(struct amdgpu_ring *ring, struct amdgpu_fence *guilty_fence); +struct amdgpu_fence * +amdgpu_ring_find_guilty_fence(struct amdgpu_ring *ring); void amdgpu_ring_reset_helper_begin(struct amdgpu_ring *ring, struct amdgpu_fence *guilty_fence); int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring, struct amdgpu_fence *guilty_fence); +void amdgpu_multi_ring_reset_helper_begin(const u32 ring_type_mask, + struct amdgpu_ring *guilty_ring, + struct amdgpu_fence *guilty_fence); +int amdgpu_multi_ring_reset_helper_end(const u32 ring_type_mask, + struct amdgpu_ring *guilty_ring, int ret); bool amdgpu_ring_is_reset_type_supported(struct amdgpu_ring *ring, u32 reset_type); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c index 572a60e1b3cb..002fae3c380e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c @@ -583,3 +583,42 @@ int amdgpu_gfx_rlc_init_microcode(struct amdgpu_device *adev, amdgpu_gfx_rlc_init_microcode_v2_5(adev); return 0; } + +static const struct amdgpu_rlc_reg_funcs amdgpu_sriov_rlc_reg_funcs = { + .rreg32 = amdgpu_sriov_rreg, + .wreg32 = amdgpu_sriov_wreg, +}; + +static u32 +amdgpu_rlc_rreg(struct amdgpu_device *adev, u32 reg, u32 acc_flags, u32 hwip, + u32 xcc_id) +{ + return amdgpu_device_rreg(adev, reg, 0); +} + +static void +amdgpu_rlc_wreg(struct amdgpu_device *adev, u32 reg, u32 value, u32 acc_flags, + u32 hwip, u32 xcc_id) +{ + amdgpu_device_wreg(adev, reg, value, 0); +} + +static const struct amdgpu_rlc_reg_funcs amdgpu_rlc_reg_funcs = { + .rreg32 = amdgpu_rlc_rreg, + .wreg32 = amdgpu_rlc_wreg, +}; + +void amdgpu_early_init_rlc_reg_funcs(struct amdgpu_device *adev) +{ + adev->gfx.rlc.reg_funcs = &amdgpu_rlc_reg_funcs; +} + +void amdgpu_init_rlc_reg_funcs(struct amdgpu_device *adev) +{ + if (amdgpu_sriov_vf(adev) && + 
adev->gfx.rlc.funcs && + adev->gfx.rlc.rlcg_reg_access_supported) + adev->gfx.rlc.reg_funcs = &amdgpu_sriov_rlc_reg_funcs; + else + adev->gfx.rlc.reg_funcs = &amdgpu_rlc_reg_funcs; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h index e535534237a1..959d60c90dcd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h @@ -262,6 +262,11 @@ struct amdgpu_rlc_funcs { bool (*is_rlcg_access_range)(struct amdgpu_device *adev, uint32_t reg); }; +struct amdgpu_rlc_reg_funcs { + u32 (*rreg32)(struct amdgpu_device *adev, u32 reg, u32 acc_flags, u32 hwip, u32 xcc_id); + void (*wreg32)(struct amdgpu_device *adev, u32 reg, u32 val, u32 acc_flags, u32 hwip, u32 xcc_id); +}; + struct amdgpu_rlcg_reg_access_ctrl { uint32_t scratch_reg0; uint32_t scratch_reg1; @@ -303,6 +308,7 @@ struct amdgpu_rlc { /* safe mode for updating CG/PG state */ bool in_safe_mode[AMDGPU_MAX_RLC_INSTANCES]; const struct amdgpu_rlc_funcs *funcs; + const struct amdgpu_rlc_reg_funcs *reg_funcs; /* for firmware data */ u32 save_and_restore_offset; @@ -374,4 +380,8 @@ void amdgpu_gfx_rlc_fini(struct amdgpu_device *adev); int amdgpu_gfx_rlc_init_microcode(struct amdgpu_device *adev, uint16_t version_major, uint16_t version_minor); + +void amdgpu_early_init_rlc_reg_funcs(struct amdgpu_device *adev); +void amdgpu_init_rlc_reg_funcs(struct amdgpu_device *adev); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.h new file mode 100644 index 000000000000..8c85c80fc119 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2026 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#ifndef AMDGPU_SA_H_ +#define AMDGPU_SA_H_ + +#include <drm/drm_suballoc.h> + +struct amdgpu_device; +struct amdgpu_bo; + +struct amdgpu_sa_manager { + struct drm_suballoc_manager base; + struct amdgpu_bo *bo; + uint64_t gpu_addr; + void *cpu_ptr; +}; + +static inline struct amdgpu_sa_manager * +to_amdgpu_sa_manager(struct drm_suballoc_manager *manager) +{ + return container_of(manager, struct amdgpu_sa_manager, base); +} + +static inline uint64_t amdgpu_sa_bo_gpu_addr(struct drm_suballoc *sa_bo) +{ + return to_amdgpu_sa_manager(sa_bo->manager)->gpu_addr + + drm_suballoc_soffset(sa_bo); +} + +static inline void *amdgpu_sa_bo_cpu_addr(struct drm_suballoc *sa_bo) +{ + return to_amdgpu_sa_manager(sa_bo->manager)->cpu_ptr + + drm_suballoc_soffset(sa_bo); +} + +int amdgpu_sa_bo_manager_init(struct amdgpu_device *adev, + struct amdgpu_sa_manager *sa_manager, + unsigned size, u32 align, u32 domain); +void amdgpu_sa_bo_manager_fini(struct amdgpu_device *adev, + struct amdgpu_sa_manager *sa_manager); +int amdgpu_sa_bo_manager_start(struct amdgpu_device *adev, + struct amdgpu_sa_manager *sa_manager); +int amdgpu_sa_bo_new(struct amdgpu_sa_manager *sa_manager, + struct drm_suballoc **sa_bo, + unsigned int size); +void amdgpu_sa_bo_free(struct drm_suballoc **sa_bo, + struct dma_fence *fence); +#if defined(CONFIG_DEBUG_FS) +void amdgpu_sa_bo_dump_debug_info(struct amdgpu_sa_manager *sa_manager, + struct seq_file *m); +u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m); +#endif +void amdgpu_debugfs_sa_init(struct amdgpu_device *adev); + +#endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c index 0eecfaa3a94c..8effb1158430 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c @@ -39,7 +39,7 @@ static int amdgpu_sched_process_priority_override(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv; struct amdgpu_ctx_mgr *mgr; struct amdgpu_ctx 
*ctx; - uint32_t id; + unsigned long id; int r; if (fd_empty(f)) @@ -50,10 +50,10 @@ static int amdgpu_sched_process_priority_override(struct amdgpu_device *adev, return r; mgr = &fpriv->ctx_mgr; - mutex_lock(&mgr->lock); - idr_for_each_entry(&mgr->ctx_handles, ctx, id) + xa_lock(&mgr->ctx_handles); + xa_for_each(&mgr->ctx_handles, id, ctx) amdgpu_ctx_priority_override(ctx, priority); - mutex_unlock(&mgr->lock); + xa_unlock(&mgr->ctx_handles); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index fcd81242059e..fbac732f3e01 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -553,10 +553,11 @@ static int amdgpu_sdma_soft_reset(struct amdgpu_device *adev, u32 instance_id) int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, bool caller_handles_kernel_queues) { - int ret = 0; struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id]; struct amdgpu_ring *gfx_ring = &sdma_instance->ring; struct amdgpu_ring *page_ring = &sdma_instance->page; + struct amdgpu_fence *gfx_fence, *page_fence; + int ret = 0; if (amdgpu_sriov_vf(adev)) return -EOPNOTSUPP; @@ -569,9 +570,14 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, * the reset is in progress. */ drm_sched_wqueue_stop(&gfx_ring->sched); + gfx_fence = amdgpu_ring_find_guilty_fence(gfx_ring); + amdgpu_ring_reset_helper_begin(gfx_ring, gfx_fence); - if (adev->sdma.has_page_queue) + if (adev->sdma.has_page_queue) { drm_sched_wqueue_stop(&page_ring->sched); + page_fence = amdgpu_ring_find_guilty_fence(page_ring); + amdgpu_ring_reset_helper_begin(page_ring, page_fence); + } } if (sdma_instance->funcs->stop_kernel_queue) { @@ -600,14 +606,19 @@ exit: * to be submitted to the queues after the reset is complete. 
*/ if (!ret) { - amdgpu_fence_driver_force_completion(gfx_ring, NULL); + ret = amdgpu_ring_reset_helper_end(gfx_ring, gfx_fence); + if (ret) + goto unlock; drm_sched_wqueue_start(&gfx_ring->sched); if (adev->sdma.has_page_queue) { - amdgpu_fence_driver_force_completion(page_ring, NULL); + ret = amdgpu_ring_reset_helper_end(page_ring, page_fence); + if (ret) + goto unlock; drm_sched_wqueue_start(&page_ring->sched); } } } +unlock: mutex_unlock(&sdma_instance->engine_reset_mutex); return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h index 2bf365609775..4f4e56022c97 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h @@ -85,34 +85,6 @@ struct amdgpu_sdma_instance { const struct amdgpu_sdma_funcs *funcs; }; -enum amdgpu_sdma_ras_memory_id { - AMDGPU_SDMA_MBANK_DATA_BUF0 = 1, - AMDGPU_SDMA_MBANK_DATA_BUF1 = 2, - AMDGPU_SDMA_MBANK_DATA_BUF2 = 3, - AMDGPU_SDMA_MBANK_DATA_BUF3 = 4, - AMDGPU_SDMA_MBANK_DATA_BUF4 = 5, - AMDGPU_SDMA_MBANK_DATA_BUF5 = 6, - AMDGPU_SDMA_MBANK_DATA_BUF6 = 7, - AMDGPU_SDMA_MBANK_DATA_BUF7 = 8, - AMDGPU_SDMA_MBANK_DATA_BUF8 = 9, - AMDGPU_SDMA_MBANK_DATA_BUF9 = 10, - AMDGPU_SDMA_MBANK_DATA_BUF10 = 11, - AMDGPU_SDMA_MBANK_DATA_BUF11 = 12, - AMDGPU_SDMA_MBANK_DATA_BUF12 = 13, - AMDGPU_SDMA_MBANK_DATA_BUF13 = 14, - AMDGPU_SDMA_MBANK_DATA_BUF14 = 15, - AMDGPU_SDMA_MBANK_DATA_BUF15 = 16, - AMDGPU_SDMA_UCODE_BUF = 17, - AMDGPU_SDMA_RB_CMD_BUF = 18, - AMDGPU_SDMA_IB_CMD_BUF = 19, - AMDGPU_SDMA_UTCL1_RD_FIFO = 20, - AMDGPU_SDMA_UTCL1_RDBST_FIFO = 21, - AMDGPU_SDMA_UTCL1_WR_FIFO = 22, - AMDGPU_SDMA_DATA_LUT_FIFO = 23, - AMDGPU_SDMA_SPLIT_DAT_BUF = 24, - AMDGPU_SDMA_MEMORY_BLOCK_LAST, -}; - struct amdgpu_sdma_ras { struct amdgpu_ras_block_object ras_block; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h index 85724ec6aaf8..5324030a13f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h @@ -28,6 +28,8 @@ #include <linux/types.h> #include <linux/tracepoint.h> +#include "amdgpu_userq_fence.h" + #undef TRACE_SYSTEM #define TRACE_SYSTEM amdgpu #define TRACE_INCLUDE_FILE amdgpu_trace @@ -582,6 +584,154 @@ TRACE_EVENT(amdgpu_reset_reg_dumps, __entry->value) ); +DECLARE_EVENT_CLASS(amdgpu_userq_queue, + TP_PROTO(struct amdgpu_usermode_queue *queue), + TP_ARGS(queue), + TP_STRUCT__entry( + __field(void *, queue) + __field(u64, doorbell_index) + __field(int, queue_type) + __field(int, state) + __field(u32, xcp_id) + ), + TP_fast_assign( + __entry->queue = queue; + __entry->doorbell_index = queue->doorbell_index; + __entry->queue_type = queue->queue_type; + __entry->state = queue->state; + __entry->xcp_id = queue->xcp_id; + ), + TP_printk("queue=%p, doorbell=%llu, type=%d, state=%d, xcp_id=%u", + __entry->queue, __entry->doorbell_index, + __entry->queue_type, __entry->state, __entry->xcp_id) +); +DEFINE_EVENT(amdgpu_userq_queue, amdgpu_userq_create_start, + TP_PROTO(struct amdgpu_usermode_queue *queue), + TP_ARGS(queue)); +DEFINE_EVENT(amdgpu_userq_queue, amdgpu_userq_destroy_start, + TP_PROTO(struct amdgpu_usermode_queue *queue), + TP_ARGS(queue)); +DECLARE_EVENT_CLASS(amdgpu_userq_queue_result, + TP_PROTO(struct amdgpu_usermode_queue *queue, int result), + TP_ARGS(queue, result), + TP_STRUCT__entry( + __field(void *, queue) + __field(u64, doorbell_index) + __field(int, queue_type) + __field(int, state) + __field(u32, xcp_id) + __field(int, result) + ), + TP_fast_assign( + __entry->queue = queue; + __entry->doorbell_index = queue->doorbell_index; + __entry->queue_type = queue->queue_type; + __entry->state = queue->state; + __entry->xcp_id = queue->xcp_id; + __entry->result = result; + ), + TP_printk("queue=%p, doorbell=%llu, type=%d, state=%d, xcp_id=%u, result=%d", + __entry->queue, __entry->doorbell_index, + __entry->queue_type, __entry->state, + __entry->xcp_id, __entry->result) +); 
+DEFINE_EVENT(amdgpu_userq_queue_result, amdgpu_userq_create_end, + TP_PROTO(struct amdgpu_usermode_queue *queue, int result), + TP_ARGS(queue, result)); +DEFINE_EVENT(amdgpu_userq_queue_result, amdgpu_userq_destroy_end, + TP_PROTO(struct amdgpu_usermode_queue *queue, int result), + TP_ARGS(queue, result)); + +TRACE_EVENT(amdgpu_userq_emit_fence, + TP_PROTO(struct device *device, struct amdgpu_usermode_queue *queue, struct amdgpu_userq_fence *fence), + TP_ARGS(device, queue, fence), + TP_STRUCT__entry( + __field(u64, fence_context) + __field(u64, fence_seqno) + __string(dev, dev_name(device)) + __field(u64, doorbell_index) + __field(u64, client_id) + __field(u32, queue_type) + ), + TP_fast_assign( + __entry->fence_context = fence->base.context; + __entry->fence_seqno = fence->base.seqno; + __assign_str(dev); + __entry->doorbell_index = queue->doorbell_index; + __entry->client_id = queue->userq_mgr->file->client_id; + __entry->queue_type = queue->queue_type; + ), + TP_printk("dev=%s, client_id=%llu, type=%u, doorbell=%llu, fence=%llu:%llu", + __get_str(dev), __entry->client_id, __entry->queue_type, __entry->doorbell_index, + __entry->fence_context, + __entry->fence_seqno) +); + +TRACE_EVENT(amdgpu_userq_wait_deps, + TP_PROTO(struct device *device, struct amdgpu_usermode_queue *queue, struct amdgpu_userq_fence *dep), + TP_ARGS(device, queue, dep), + TP_STRUCT__entry( + __field(u64, context) + __field(u64, dep_context) + __field(u64, dep_seqno) + __string(dev, dev_name(device)) + __field(u64, doorbell_index) + __field(u64, client_id) + __field(u32, queue_type) + ), + TP_fast_assign( + __assign_str(dev); + __entry->doorbell_index = queue->doorbell_index; + __entry->queue_type = queue->queue_type; + __entry->client_id = queue->userq_mgr->file->client_id; + __entry->context = queue->fence_drv->context; + __entry->dep_context = dep->base.context; + __entry->dep_seqno = dep->base.seqno; + ), + TP_printk("dev=%s, client_id=%llu, type=%u, doorbell=%llu, context=%llu depends 
on fence=%llu:%llu", + __get_str(dev), __entry->client_id, __entry->queue_type, __entry->doorbell_index, __entry->context, + __entry->dep_context, + __entry->dep_seqno) +); + +TRACE_EVENT(amdgpu_userq_state_start, + TP_PROTO(struct amdgpu_usermode_queue *queue), + TP_ARGS(queue), + TP_STRUCT__entry( + __field(u64, doorbell_index) + __field(u64, client_id) + __field(u32, queue_type) + __field(u32, from) + ), + TP_fast_assign( + __entry->doorbell_index = queue->doorbell_index; + __entry->queue_type = queue->queue_type; + __entry->client_id = queue->userq_mgr->file->client_id; + __entry->from = queue->state; + ), + TP_printk("client_id=%llu, type=%u, doorbell=%llu, from=%d", + __entry->client_id, __entry->queue_type, __entry->doorbell_index, __entry->from) +); + +TRACE_EVENT(amdgpu_userq_state_changed, + TP_PROTO(struct amdgpu_usermode_queue *queue, enum amdgpu_userq_state new_state), + TP_ARGS(queue, new_state), + TP_STRUCT__entry( + __field(u64, doorbell_index) + __field(u64, client_id) + __field(u32, queue_type) + __field(u32, to) + ), + TP_fast_assign( + __entry->doorbell_index = queue->doorbell_index; + __entry->queue_type = queue->queue_type; + __entry->client_id = queue->userq_mgr->file->client_id; + __entry->to = new_state; + ), + TP_printk("client_id=%llu, type=%u, doorbell=%llu, to=%d", + __entry->client_id, __entry->queue_type, __entry->doorbell_index, __entry->to) +); + #undef AMDGPU_JOB_GET_TIMELINE_NAME #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 025625e7e800..b10b0878df37 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -2194,7 +2194,7 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) return r; } - /* Create a boorbell page for kernel usages */ + /* Create a doorbell page for kernel usages */ r = amdgpu_doorbell_create_kernel_doorbells(adev); if (r) { dev_err(adev->dev, "Failed to initialize kernel doorbells.\n"); diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index b5d938b31383..ff9e2e346609 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -140,6 +140,7 @@ int amdgpu_gtt_mgr_init(struct amdgpu_device *adev, uint64_t gtt_size); void amdgpu_gtt_mgr_fini(struct amdgpu_device *adev); int amdgpu_preempt_mgr_init(struct amdgpu_device *adev); void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev); +void amdgpu_preempt_mgr_sysfs_fini(struct amdgpu_device *adev); int amdgpu_vram_mgr_init(struct amdgpu_device *adev); void amdgpu_vram_mgr_fini(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index b8ed931f8a40..2a5f5e6188bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -97,7 +97,6 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct amdgpu_ras_eeprom_control *control = &con->eeprom_control; unsigned int error_query_mode; int ret = 0; unsigned long err_count; @@ -118,77 +117,66 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query; mutex_lock(&con->page_retirement_lock); - if (!amdgpu_ras_smu_eeprom_supported(adev)) { - ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc)); - if (ret == -EOPNOTSUPP && - error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) { - if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && - adev->umc.ras->ras_block.hw_ops->query_ras_error_count) - adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, - ras_error_status); - - if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && - adev->umc.ras->ras_block.hw_ops->query_ras_error_address && - adev->umc.max_ras_err_cnt_per_query) { - kfree(err_data->err_addr); - 
err_data->err_addr = - kzalloc_objs(struct eeprom_table_record, - adev->umc.max_ras_err_cnt_per_query); - - /* still call query_ras_error_address to clear error status - * even NOMEM error is encountered - */ - if (!err_data->err_addr) - dev_warn(adev->dev, - "Failed to alloc memory for umc error address record!\n"); - else - err_data->err_addr_len = - adev->umc.max_ras_err_cnt_per_query; - - /* umc query_ras_error_address is also responsible for clearing - * error status - */ - adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, - ras_error_status); - } - } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY || - (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) { - if (adev->umc.ras && - adev->umc.ras->ecc_info_query_ras_error_count) - adev->umc.ras->ecc_info_query_ras_error_count(adev, - ras_error_status); - - if (adev->umc.ras && - adev->umc.ras->ecc_info_query_ras_error_address && - adev->umc.max_ras_err_cnt_per_query) { - kfree(err_data->err_addr); - err_data->err_addr = - kzalloc_objs(struct eeprom_table_record, - adev->umc.max_ras_err_cnt_per_query); - - /* still call query_ras_error_address to clear error status - * even NOMEM error is encountered - */ - if (!err_data->err_addr) - dev_warn(adev->dev, - "Failed to alloc memory for umc error address record!\n"); - else - err_data->err_addr_len = - adev->umc.max_ras_err_cnt_per_query; - - /* umc query_ras_error_address is also responsible for clearing - * error status - */ - adev->umc.ras->ecc_info_query_ras_error_address(adev, - ras_error_status); - } + ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc)); + if (ret == -EOPNOTSUPP && + error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) { + if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && + adev->umc.ras->ras_block.hw_ops->query_ras_error_count) + adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, + ras_error_status); + + if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && + 
adev->umc.ras->ras_block.hw_ops->query_ras_error_address && + adev->umc.max_ras_err_cnt_per_query) { + err_data->err_addr = + kzalloc_objs(struct eeprom_table_record, + adev->umc.max_ras_err_cnt_per_query); + + /* still call query_ras_error_address to clear error status + * even NOMEM error is encountered + */ + if (!err_data->err_addr) + dev_warn(adev->dev, + "Failed to alloc memory for umc error address record!\n"); + else + err_data->err_addr_len = + adev->umc.max_ras_err_cnt_per_query; + + /* umc query_ras_error_address is also responsible for clearing + * error status + */ + adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, + ras_error_status); } - } else { - if (!amdgpu_ras_eeprom_update_record_num(control)) { - err_data->err_addr_cnt = err_data->de_count = - control->ras_num_recs - control->ras_num_recs_old; - amdgpu_ras_eeprom_read_idx(control, err_data->err_addr, - control->ras_num_recs_old, err_data->de_count); + } else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY || + (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) { + if (adev->umc.ras && + adev->umc.ras->ecc_info_query_ras_error_count) + adev->umc.ras->ecc_info_query_ras_error_count(adev, + ras_error_status); + + if (adev->umc.ras && + adev->umc.ras->ecc_info_query_ras_error_address && + adev->umc.max_ras_err_cnt_per_query) { + err_data->err_addr = + kcalloc(adev->umc.max_ras_err_cnt_per_query, + sizeof(struct eeprom_table_record), GFP_KERNEL); + + /* still call query_ras_error_address to clear error status + * even NOMEM error is encountered + */ + if (!err_data->err_addr) + dev_warn(adev->dev, + "Failed to alloc memory for umc error address record!\n"); + else + err_data->err_addr_len = + adev->umc.max_ras_err_cnt_per_query; + + /* umc query_ras_error_address is also responsible for clearing + * error status + */ + adev->umc.ras->ecc_info_query_ras_error_address(adev, + ras_error_status); } } @@ -276,7 +264,7 @@ int amdgpu_umc_pasid_poison_handler(struct 
amdgpu_device *adev, } amdgpu_ras_error_data_fini(&err_data); - } else if (amdgpu_uniras_enabled(adev)) { + } else { struct ras_ih_info ih_info = {0}; ih_info.block = block; @@ -285,17 +273,6 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev, ih_info.pasid_fn = pasid_fn; ih_info.data = data; amdgpu_ras_mgr_handle_consumer_interrupt(adev, &ih_info); - } else { - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - int ret; - - ret = amdgpu_ras_put_poison_req(adev, - block, pasid, pasid_fn, data, reset); - if (!ret) { - atomic_inc(&con->page_retirement_req_cnt); - atomic_inc(&con->poison_consumption_count); - wake_up(&con->page_retirement_wq); - } } } else { if (adev->virt.ops && adev->virt.ops->ras_poison_handler) @@ -512,129 +489,3 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev, return 0; } - -int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev, - uint64_t status, uint64_t ipid, uint64_t addr) -{ - if (adev->umc.ras->update_ecc_status) - return adev->umc.ras->update_ecc_status(adev, - status, ipid, addr); - return 0; -} - -int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev, - struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct ras_ecc_log_info *ecc_log; - int ret; - - ecc_log = &con->umc_ecc_log; - - mutex_lock(&ecc_log->lock); - ret = radix_tree_insert(ecc_tree, ecc_err->pa_pfn, ecc_err); - if (!ret) - radix_tree_tag_set(ecc_tree, - ecc_err->pa_pfn, UMC_ECC_NEW_DETECTED_TAG); - mutex_unlock(&ecc_log->lock); - - return ret; -} - -int amdgpu_umc_pages_in_a_row(struct amdgpu_device *adev, - struct ras_err_data *err_data, uint64_t pa_addr) -{ - struct ta_ras_query_address_output addr_out; - - /* reinit err_data */ - err_data->err_addr_cnt = 0; - err_data->err_addr_len = adev->umc.retire_unit; - - addr_out.pa.pa = pa_addr; - if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) - return adev->umc.ras->convert_ras_err_addr(adev, err_data, NULL, 
- &addr_out, false); - else - return -EINVAL; -} - -int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev, - uint64_t pa_addr, uint64_t *pfns, int len) -{ - int i, ret; - struct ras_err_data err_data; - - err_data.err_addr = kzalloc_objs(struct eeprom_table_record, - adev->umc.retire_unit); - if (!err_data.err_addr) { - dev_warn(adev->dev, "Failed to alloc memory in bad page lookup!\n"); - return 0; - } - - ret = amdgpu_umc_pages_in_a_row(adev, &err_data, pa_addr); - if (ret) - goto out; - - for (i = 0; i < adev->umc.retire_unit; i++) { - if (i >= len) - goto out; - - pfns[i] = err_data.err_addr[i].retired_page; - } - ret = i; - adev->umc.err_addr_cnt = err_data.err_addr_cnt; - -out: - kfree(err_data.err_addr); - return ret; -} - -int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev, - uint64_t err_addr, uint32_t ch, uint32_t umc, - uint32_t node, uint32_t socket, - struct ta_ras_query_address_output *addr_out, bool dump_addr) -{ - struct ta_ras_query_address_input addr_in; - int ret; - - memset(&addr_in, 0, sizeof(addr_in)); - addr_in.ma.err_addr = err_addr; - addr_in.ma.ch_inst = ch; - addr_in.ma.umc_inst = umc; - addr_in.ma.node_inst = node; - addr_in.ma.socket_id = socket; - - if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { - ret = adev->umc.ras->convert_ras_err_addr(adev, NULL, &addr_in, - addr_out, dump_addr); - if (ret) - return ret; - } else { - return 0; - } - - return 0; -} - -int amdgpu_umc_pa2mca(struct amdgpu_device *adev, - uint64_t pa, uint64_t *mca, enum amdgpu_memory_partition nps) -{ - struct ta_ras_query_address_input addr_in; - struct ta_ras_query_address_output addr_out; - int ret; - - /* nps: the pa belongs to */ - addr_in.pa.pa = pa | ((uint64_t)nps << 58); - addr_in.addr_type = TA_RAS_PA_TO_MCA; - ret = psp_ras_query_address(&adev->psp, &addr_in, &addr_out); - if (ret) { - dev_warn(adev->dev, "Failed to query RAS MCA address for 0x%llx", - pa); - - return ret; - } - - *mca = addr_out.ma.err_addr; - - return 0; -} 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index 8494a55ebf76..cf06d5f856f9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -103,18 +103,7 @@ struct amdgpu_umc_ras { void *ras_error_status); bool (*check_ecc_err_status)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, void *ras_error_status); - int (*update_ecc_status)(struct amdgpu_device *adev, - uint64_t status, uint64_t ipid, uint64_t addr); - int (*convert_ras_err_addr)(struct amdgpu_device *adev, - struct ras_err_data *err_data, - struct ta_ras_query_address_input *addr_in, - struct ta_ras_query_address_output *addr_out, - bool dump_addr); - uint32_t (*get_die_id_from_pa)(struct amdgpu_device *adev, - uint64_t mca_addr, uint64_t retired_page); void (*get_retire_flip_bits)(struct amdgpu_device *adev); - void (*mca_ipid_parse)(struct amdgpu_device *adev, uint64_t ipid, - uint32_t *did, uint32_t *ch, uint32_t *umc_inst, uint32_t *sid); }; struct amdgpu_umc_funcs { @@ -179,21 +168,6 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, int amdgpu_umc_loop_channels(struct amdgpu_device *adev, umc_func func, void *data); -int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev, - uint64_t status, uint64_t ipid, uint64_t addr); -int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev, - struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err); - void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, void *ras_error_status); -int amdgpu_umc_pages_in_a_row(struct amdgpu_device *adev, - struct ras_err_data *err_data, uint64_t pa_addr); -int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev, - uint64_t pa_addr, uint64_t *pfns, int len); -int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev, - uint64_t err_addr, uint32_t ch, uint32_t umc, - uint32_t node, uint32_t socket, - struct ta_ras_query_address_output *addr_out, bool dump_addr); -int 
amdgpu_umc_pa2mca(struct amdgpu_device *adev, - uint64_t pa, uint64_t *mca, enum amdgpu_memory_partition nps); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index ef3f0213cc46..82c8809d1d9c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -33,6 +33,7 @@ #include "amdgpu_userq.h" #include "amdgpu_hmm.h" #include "amdgpu_userq_fence.h" +#include "amdgpu_trace.h" u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev) { @@ -88,14 +89,7 @@ static void amdgpu_userq_mgr_reset_work(struct work_struct *work) container_of(work, struct amdgpu_userq_mgr, reset_work); struct amdgpu_device *adev = uq_mgr->adev; - const int queue_types[] = { - AMDGPU_RING_TYPE_COMPUTE, - AMDGPU_RING_TYPE_GFX, - AMDGPU_RING_TYPE_SDMA - }; - const int num_queue_types = ARRAY_SIZE(queue_types); - bool gpu_reset = false; - int i, r; + struct amdgpu_reset_context reset_context; if (unlikely(adev->debug_disable_gpu_ring_reset)) { dev_err(adev->dev, "userq reset disabled by debug mask\n"); @@ -109,42 +103,15 @@ static void amdgpu_userq_mgr_reset_work(struct work_struct *work) if (!amdgpu_gpu_recovery) return; - /* - * Iterate through all queue types to detect and reset problematic queues - * Process each queue type in the defined order - */ - for (i = 0; i < num_queue_types; i++) { - int ring_type = queue_types[i]; - const struct amdgpu_userq_funcs *funcs = - adev->userq_funcs[ring_type]; - - if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, - AMDGPU_RESET_TYPE_PER_QUEUE)) - continue; + memset(&reset_context, 0, sizeof(reset_context)); - if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 && - funcs && funcs->detect_and_reset) { - r = funcs->detect_and_reset(adev, ring_type); - if (r) { - gpu_reset = true; - break; - } - } - } + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + reset_context.src = AMDGPU_RESET_SRC_USERQ; + 
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/ - if (gpu_reset) { - struct amdgpu_reset_context reset_context; - - memset(&reset_context, 0, sizeof(reset_context)); - - reset_context.method = AMD_RESET_METHOD_NONE; - reset_context.reset_req_dev = adev; - reset_context.src = AMDGPU_RESET_SRC_USERQ; - set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/ - - amdgpu_device_gpu_recover(adev, NULL, &reset_context); - } + amdgpu_device_gpu_recover(adev, NULL, &reset_context); } static void amdgpu_userq_hang_detect_work(struct work_struct *work) @@ -152,12 +119,45 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work) struct amdgpu_usermode_queue *queue = container_of(work, struct amdgpu_usermode_queue, hang_detect_work.work); + struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr; + struct amdgpu_device *adev = uq_mgr->adev; + const struct amdgpu_userq_funcs *userq_funcs = + adev->userq_funcs[queue->queue_type]; + bool gpu_reset = false; + + if (unlikely(adev->debug_disable_gpu_ring_reset)) { + dev_err(adev->dev, "userq reset disabled by debug mask\n"); + return; + } + + /* + * If GPU recovery feature is disabled system-wide, + * skip all reset detection logic + */ + if (!amdgpu_gpu_recovery) + return; + + if (amdgpu_userq_is_reset_type_supported(adev, queue->queue_type, + AMDGPU_RESET_TYPE_PER_QUEUE)) { + int r; + + if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) + r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, + queue, NULL, NULL); + else + r = userq_funcs->reset(queue); + if (r) + gpu_reset = true; + } else { + gpu_reset = true; + } /* * Don't schedule the work here! Scheduling or queue work from one reset * handler to another is illegal if you don't take extra precautions! 
*/ - amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work); + if (gpu_reset) + amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work); } /* @@ -293,11 +293,15 @@ static int amdgpu_userq_preempt_helper(struct amdgpu_usermode_queue *queue) int r; if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { + trace_amdgpu_userq_state_start(queue); + r = userq_funcs->preempt(queue); if (r) { + trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_HUNG); queue->state = AMDGPU_USERQ_STATE_HUNG; return r; } else { + trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_PREEMPTED); queue->state = AMDGPU_USERQ_STATE_PREEMPTED; } } @@ -313,10 +317,14 @@ static int amdgpu_userq_restore_helper(struct amdgpu_usermode_queue *queue) int r = 0; if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) { + trace_amdgpu_userq_state_start(queue); + r = userq_funcs->restore(queue); if (r) { + trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_HUNG); queue->state = AMDGPU_USERQ_STATE_HUNG; } else { + trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_MAPPED); queue->state = AMDGPU_USERQ_STATE_MAPPED; } } @@ -334,12 +342,15 @@ static int amdgpu_userq_unmap_helper(struct amdgpu_usermode_queue *queue) if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) || (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) { + trace_amdgpu_userq_state_start(queue); r = userq_funcs->unmap(queue); if (r) { + trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_HUNG); queue->state = AMDGPU_USERQ_STATE_HUNG; return r; } else { + trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_UNMAPPED); queue->state = AMDGPU_USERQ_STATE_UNMAPPED; } } @@ -356,11 +367,15 @@ static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue) int r; if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) { + trace_amdgpu_userq_state_start(queue); + r = userq_funcs->map(queue); if (r) { + trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_HUNG); queue->state = AMDGPU_USERQ_STATE_HUNG; return r; } else 
{ + trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_MAPPED); queue->state = AMDGPU_USERQ_STATE_MAPPED; } } @@ -507,6 +522,8 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type]; int r = 0; + trace_amdgpu_userq_destroy_start(queue); + cancel_delayed_work_sync(&uq_mgr->resume_work); /* Cancel any pending hang detection work and cleanup */ @@ -532,6 +549,7 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que amdgpu_bo_unreserve(queue->db_obj.obj); amdgpu_bo_unref(&queue->db_obj.obj); + trace_amdgpu_userq_destroy_end(queue, r); kfree(queue); pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); @@ -629,6 +647,8 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) queue->queue_type = args->in.ip_type; queue->vm = &fpriv->vm; queue->priority = priority; + queue->xcp_id = (fpriv->xcp_id != AMDGPU_XCP_NO_PARTITION) ? + fpriv->xcp_id : 0; queue->userq_mgr = uq_mgr; INIT_DELAYED_WORK(&queue->hang_detect_work, amdgpu_userq_hang_detect_work); @@ -671,6 +691,8 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) } queue->doorbell_index = index; + queue->doorbell_offset = (u32)args->in.doorbell_offset; + trace_amdgpu_userq_create_start(queue); r = uq_funcs->mqd_create(queue, &args->in); if (r) { drm_file_err(uq_mgr->file, "Failed to create Queue\n"); @@ -694,6 +716,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) r = amdgpu_userq_map_helper(queue); if (r) { drm_file_err(uq_mgr->file, "Failed to map Queue\n"); + trace_amdgpu_userq_create_end(queue, r); mutex_unlock(&uq_mgr->userq_mutex); goto erase_doorbell; } @@ -710,11 +733,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) * This drops the last reference which should take care of * all cleanup. 
*/ + trace_amdgpu_userq_create_end(queue, r); amdgpu_userq_put(queue); return r; } amdgpu_debugfs_userq_init(filp, queue, qid); + trace_amdgpu_userq_create_end(queue, 0); args->out.queue_id = qid; return 0; @@ -730,6 +755,7 @@ clean_doorbell_bo: free_fence_drv: amdgpu_userq_fence_driver_free(queue); free_queue: + trace_amdgpu_userq_create_end(queue, r); kfree(queue); err_pm_runtime: pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); @@ -862,16 +888,10 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, static int amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr) { - struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); - struct amdgpu_vm *vm = &fpriv->vm; struct amdgpu_usermode_queue *queue; unsigned long queue_id; int ret = 0, r; - - if (amdgpu_bo_reserve(vm->root.bo, false)) - return false; - mutex_lock(&uq_mgr->userq_mutex); /* Resume all the queues for this process */ xa_for_each(&uq_mgr->userq_xa, queue_id, queue) { @@ -879,6 +899,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr) if (!amdgpu_userq_buffer_vas_mapped(queue)) { drm_file_err(uq_mgr->file, "trying restore queue without va mapping\n"); + trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_INVALID_VA); queue->state = AMDGPU_USERQ_STATE_INVALID_VA; continue; } @@ -886,10 +907,8 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr) r = amdgpu_userq_map_helper(queue); if (r) ret = r; - } mutex_unlock(&uq_mgr->userq_mutex); - amdgpu_bo_unreserve(vm->root.bo); if (ret) drm_file_err(uq_mgr->file, @@ -923,7 +942,8 @@ amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec, spin_unlock(&vm->individual_lock); bo = bo_va->base.bo; - ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2); + ret = drm_exec_prepare_obj(exec, &bo->tbo.base, + TTM_NUM_MOVE_FENCES + 1); if (unlikely(ret)) return ret; @@ -946,7 +966,7 @@ amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec, /* Make sure the whole VM is ready to be used */ static int 
-amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr) +amdgpu_userq_vm_validate_and_restore_queue(struct amdgpu_userq_mgr *uq_mgr) { struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); bool invalidated = false, new_addition = false; @@ -1072,8 +1092,12 @@ retry_lock: dma_fence_wait(vm->last_update, false); ret = amdgpu_evf_mgr_rearm(&fpriv->evf_mgr, &exec); - if (ret) + if (ret) { drm_file_err(uq_mgr->file, "Failed to replace eviction fence\n"); + goto unlock_all; + } + + ret = amdgpu_userq_restore_all(uq_mgr); unlock_all: drm_exec_fini(&exec); @@ -1099,18 +1123,34 @@ static void amdgpu_userq_restore_worker(struct work_struct *work) if (!dma_fence_is_signaled(ev_fence)) goto put_fence; - ret = amdgpu_userq_vm_validate(uq_mgr); + ret = amdgpu_userq_vm_validate_and_restore_queue(uq_mgr); if (ret) { drm_file_err(uq_mgr->file, "Failed to validate BOs to restore ret=%d\n", ret); goto put_fence; } - amdgpu_userq_restore_all(uq_mgr); - put_fence: dma_fence_put(ev_fence); } +void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, + u32 pasid, u32 doorbell_offset) +{ + struct xarray *xa = &adev->userq_doorbell_xa; + struct amdgpu_usermode_queue *queue; + unsigned long flags, idx; + + xa_lock_irqsave(xa, flags); + xa_for_each(xa, idx, queue) { + if (queue->vm && queue->vm->pasid == pasid && + queue->doorbell_offset == doorbell_offset) { + amdgpu_userq_start_hang_detect_work(queue); + break; + } + } + xa_unlock_irqrestore(xa, flags); +} + static int amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) { @@ -1166,6 +1206,7 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f xa_init_flags(&userq_mgr->userq_xa, XA_FLAGS_ALLOC); userq_mgr->adev = adev; userq_mgr->file = file_priv; + mutex_init(&userq_mgr->proc_ctx_lock); INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker); INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work); @@ -1219,6 +1260,11 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr 
*userq_mgr) */ cancel_work_sync(&userq_mgr->reset_work); + amdgpu_bo_free_kernel(&userq_mgr->proc_ctx_obj.obj, + &userq_mgr->proc_ctx_obj.gpu_addr, + &userq_mgr->proc_ctx_obj.cpu_ptr); + + mutex_destroy(&userq_mgr->proc_ctx_lock); mutex_destroy(&userq_mgr->userq_mutex); } @@ -1370,12 +1416,14 @@ void amdgpu_userq_pre_reset(struct amdgpu_device *adev) if (queue->state != AMDGPU_USERQ_STATE_MAPPED) continue; + trace_amdgpu_userq_state_start(queue); userq_funcs = adev->userq_funcs[queue->queue_type]; userq_funcs->unmap(queue); /* just mark all queues as hung at this point. * if unmap succeeds, we could map again * in amdgpu_userq_post_reset() if vram is not lost */ + trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_HUNG); queue->state = AMDGPU_USERQ_STATE_HUNG; amdgpu_userq_fence_driver_force_completion(queue); } @@ -1394,6 +1442,8 @@ int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost) xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) { + trace_amdgpu_userq_state_start(queue); + userq_funcs = adev->userq_funcs[queue->queue_type]; /* Re-map queue */ r = userq_funcs->map(queue); @@ -1401,6 +1451,7 @@ int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost) dev_err(adev->dev, "Failed to remap queue %ld\n", queue_id); continue; } + trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_MAPPED); queue->state = AMDGPU_USERQ_STATE_MAPPED; } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index d1751febaefe..61e5f8a06eb2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -53,6 +53,7 @@ struct amdgpu_usermode_queue { enum amdgpu_userq_state state; uint64_t doorbell_handle; uint64_t doorbell_index; + u32 doorbell_offset; uint64_t flags; struct amdgpu_mqd_prop *userq_prop; struct amdgpu_userq_mgr *userq_mgr; @@ -111,8 +112,7 @@ struct amdgpu_userq_funcs { int 
(*map)(struct amdgpu_usermode_queue *queue); int (*preempt)(struct amdgpu_usermode_queue *queue); int (*restore)(struct amdgpu_usermode_queue *queue); - int (*detect_and_reset)(struct amdgpu_device *adev, - int queue_type); + int (*reset)(struct amdgpu_usermode_queue *queue); }; /* Usermode queues for gfx */ @@ -127,6 +127,8 @@ struct amdgpu_userq_mgr { struct amdgpu_device *adev; struct delayed_work resume_work; struct drm_file *file; + struct mutex proc_ctx_lock; + struct amdgpu_userq_obj proc_ctx_obj; /** * @reset_work: @@ -177,6 +179,16 @@ int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost); void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue); void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell); +/* + * CP packs the per-process doorbell_id of the queue in + * CTXID0[9:0] on priv-fault (same encoding KFD uses via + * KFD_CTXID0_DOORBELL_ID_MASK) + */ +#define AMDGPU_CTXID0_DOORBELL_ID_MASK 0x3ff + +void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, + u32 pasid, u32 doorbell_offset); + int amdgpu_userq_input_va_validate(struct amdgpu_device *adev, struct amdgpu_usermode_queue *queue, u64 addr, u64 expected_size, u64 *va_out); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index f74ad378e407..7e80442ec3e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -30,7 +30,7 @@ #include <drm/drm_syncobj.h> #include "amdgpu.h" -#include "amdgpu_userq_fence.h" +#include "amdgpu_trace.h" #define AMDGPU_USERQ_MAX_HANDLES (1U << 16) @@ -528,6 +528,8 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, /* Create the new fence */ amdgpu_userq_fence_init(queue, fence, wptr); + trace_amdgpu_userq_emit_fence(dev->dev, queue, fence); + mutex_unlock(&userq_mgr->userq_mutex); /* @@ -701,7 +703,7 @@ amdgpu_userq_wait_add_fence(struct drm_amdgpu_userq_wait 
*wait_info, } static int -amdgpu_userq_wait_return_fence_info(struct drm_file *filp, +amdgpu_userq_wait_return_fence_info(struct drm_device *dev, struct drm_file *filp, struct drm_amdgpu_userq_wait *wait_info, u32 *syncobj_handles, u64 *timeline_points, u32 *timeline_handles, @@ -869,6 +871,8 @@ amdgpu_userq_wait_return_fence_info(struct drm_file *filp, amdgpu_userq_fence_driver_get(fence_drv); + trace_amdgpu_userq_wait_deps(dev->dev, waitq, userq_fence); + /* Store drm syncobj's gpu va address and value */ fence_info[cnt].va = fence_drv->va; fence_info[cnt].value = fences[i]->seqno; @@ -969,7 +973,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, gobj_write, gobj_read); } else { - r = amdgpu_userq_wait_return_fence_info(filp, wait_info, + r = amdgpu_userq_wait_return_fence_info(dev, filp, wait_info, syncobj_handles, timeline_points, timeline_handles, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index 616967519869..fe504f1a3fc8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -506,9 +506,8 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring) struct amdgpu_device *adev = ring->adev; struct amdgpu_vcn_inst *vcn_inst = &adev->vcn.inst[ring->me]; - atomic_inc(&vcn_inst->total_submission_cnt); - - cancel_delayed_work_sync(&vcn_inst->idle_work); + if (!atomic_fetch_inc(&vcn_inst->total_submission_cnt)) + cancel_delayed_work_sync(&vcn_inst->idle_work); mutex_lock(&vcn_inst->vcn_pg_lock); vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE); @@ -550,10 +549,9 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring) !adev->vcn.inst[ring->me].using_unified_queue) atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt); - atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt); - - schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work, - VCN_IDLE_TIMEOUT); + if 
(atomic_dec_and_test(&ring->adev->vcn.inst[ring->me].total_submission_cnt)) + schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work, + VCN_IDLE_TIMEOUT); } int amdgpu_vcn_dec_ring_test_ring(struct amdgpu_ring *ring) @@ -1485,6 +1483,37 @@ int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block, return ret; } +static struct amdgpu_fence * +amdgpu_vcn_ring_reset_begin_helper(struct amdgpu_ring *ring, + struct amdgpu_ring *guilty_ring, + struct amdgpu_fence *timedout_fence) +{ + struct amdgpu_fence *fence; + + drm_sched_wqueue_stop(&ring->sched); + if (ring == guilty_ring) + fence = timedout_fence; + else + fence = amdgpu_ring_find_guilty_fence(ring); + amdgpu_ring_reset_helper_begin(ring, fence); + + return fence; +} + +static int +amdgpu_vcn_ring_reset_end_helper(struct amdgpu_ring *ring, + struct amdgpu_fence *fence) +{ + int r; + + r = amdgpu_ring_reset_helper_end(ring, fence); + if (r) + return r; + + drm_sched_wqueue_start(&ring->sched); + return 0; +} + /** * amdgpu_vcn_ring_reset - Reset a VCN ring * @ring: ring to reset @@ -1502,48 +1531,33 @@ int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring, { struct amdgpu_device *adev = ring->adev; struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[ring->me]; + struct amdgpu_fence *dec_fence; + struct amdgpu_fence *enc_fence[AMDGPU_VCN_MAX_ENC_RINGS]; int r, i; if (adev->vcn.inst[ring->me].using_unified_queue) return -EINVAL; mutex_lock(&vinst->engine_reset_mutex); - /* Stop the scheduler's work queue for the dec and enc rings if they are running. - * This ensures that no new tasks are submitted to the queues while - * the reset is in progress. 
- */ - drm_sched_wqueue_stop(&vinst->ring_dec.sched); + dec_fence = amdgpu_vcn_ring_reset_begin_helper(&vinst->ring_dec, ring, + timedout_fence); for (i = 0; i < vinst->num_enc_rings; i++) - drm_sched_wqueue_stop(&vinst->ring_enc[i].sched); + enc_fence[i] = amdgpu_vcn_ring_reset_begin_helper(&vinst->ring_enc[i], ring, + timedout_fence); /* Perform the VCN reset for the specified instance */ r = vinst->reset(vinst); if (r) goto unlock; - r = amdgpu_ring_test_ring(&vinst->ring_dec); + + r = amdgpu_vcn_ring_reset_end_helper(&vinst->ring_dec, dec_fence); if (r) goto unlock; for (i = 0; i < vinst->num_enc_rings; i++) { - r = amdgpu_ring_test_ring(&vinst->ring_enc[i]); + r = amdgpu_vcn_ring_reset_end_helper(&vinst->ring_enc[i], enc_fence[i]); if (r) goto unlock; } - amdgpu_fence_driver_force_completion(&vinst->ring_dec, - (&vinst->ring_dec == ring) ? - &timedout_fence->base : NULL); - for (i = 0; i < vinst->num_enc_rings; i++) - amdgpu_fence_driver_force_completion(&vinst->ring_enc[i], - (&vinst->ring_enc[i] == ring) ? - &timedout_fence->base : NULL); - - /* Restart the scheduler's work queue for the dec and enc rings - * if they were stopped by this function. This allows new tasks - * to be submitted to the queues after the reset is complete. 
- */ - drm_sched_wqueue_start(&vinst->ring_dec.sched); - for (i = 0; i < vinst->num_enc_rings; i++) - drm_sched_wqueue_start(&vinst->ring_enc[i].sched); - unlock: mutex_unlock(&vinst->engine_reset_mutex); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h index 82624b44e661..bea95307fd42 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h @@ -368,6 +368,9 @@ struct amdgpu_vcn { struct mutex workload_profile_mutex; u32 reg_count; const struct amdgpu_hwip_reg_entry *reg_list; + + bool disable_uq; + bool disable_kq; }; struct amdgpu_fw_shared_rb_ptrs_struct { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c index 409e103ffe8c..35faea0ff17f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c @@ -381,7 +381,8 @@ int amdgpu_xcp_get_inst_details(struct amdgpu_xcp *xcp, enum AMDGPU_XCP_IP_BLOCK ip, uint32_t *inst_mask) { - if (!xcp->valid || !inst_mask || !(xcp->ip[ip].valid)) + if (!xcp->valid || !inst_mask || ip >= AMDGPU_XCP_MAX_BLOCKS || + !(xcp->ip[ip].valid)) return -EINVAL; *inst_mask = xcp->ip[ip].inst_mask; @@ -468,14 +469,18 @@ void amdgpu_xcp_release_sched(struct amdgpu_device *adev, { struct drm_gpu_scheduler *sched = container_of(entity->entity.rq, typeof(*sched), rq); + struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; - if (!adev->xcp_mgr) + if (!xcp_mgr) return; if (drm_sched_wqueue_ready(sched)) { struct amdgpu_ring *ring = to_amdgpu_ring(sched); - atomic_dec(&adev->xcp_mgr->xcp[ring->xcp_id].ref_cnt); + mutex_lock(&xcp_mgr->xcp_lock); + if (ring->xcp_id < xcp_mgr->num_xcps && xcp_mgr->xcp[ring->xcp_id].valid) + atomic_dec(&xcp_mgr->xcp[ring->xcp_id].ref_cnt); + mutex_unlock(&xcp_mgr->xcp_lock); } } @@ -488,7 +493,9 @@ int amdgpu_xcp_select_scheds(struct amdgpu_device *adev, u32 sel_xcp_id; int i; struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; + int r = 0; + 
mutex_lock(&xcp_mgr->xcp_lock); if (fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION) { u32 least_ref_cnt = ~0; @@ -505,19 +512,27 @@ int amdgpu_xcp_select_scheds(struct amdgpu_device *adev, } sel_xcp_id = fpriv->xcp_id; + if (sel_xcp_id >= xcp_mgr->num_xcps || !xcp_mgr->xcp[sel_xcp_id].valid) { + dev_err(adev->dev, "Selected partition #%d is not valid.", sel_xcp_id); + r = -ENODEV; + goto out; + } + if (xcp_mgr->xcp[sel_xcp_id].gpu_sched[hw_ip][hw_prio].num_scheds) { *num_scheds = - xcp_mgr->xcp[fpriv->xcp_id].gpu_sched[hw_ip][hw_prio].num_scheds; + xcp_mgr->xcp[sel_xcp_id].gpu_sched[hw_ip][hw_prio].num_scheds; *scheds = - xcp_mgr->xcp[fpriv->xcp_id].gpu_sched[hw_ip][hw_prio].sched; - atomic_inc(&adev->xcp_mgr->xcp[sel_xcp_id].ref_cnt); + xcp_mgr->xcp[sel_xcp_id].gpu_sched[hw_ip][hw_prio].sched; + atomic_inc(&xcp_mgr->xcp[sel_xcp_id].ref_cnt); dev_dbg(adev->dev, "Selected partition #%d", sel_xcp_id); } else { dev_err(adev->dev, "Failed to schedule partition #%d.", sel_xcp_id); - return -ENOENT; + r = -ENOENT; } - return 0; +out: + mutex_unlock(&xcp_mgr->xcp_lock); + return r; } static void amdgpu_set_xcp_id(struct amdgpu_device *adev, @@ -574,6 +589,9 @@ static void amdgpu_xcp_gpu_sched_update(struct amdgpu_device *adev, { unsigned int *num_gpu_sched; + if (sel_xcp_id >= MAX_XCP || sel_xcp_id == AMDGPU_XCP_NO_PARTITION) + return; + num_gpu_sched = &adev->xcp_mgr->xcp[sel_xcp_id] .gpu_sched[ring->funcs->type][ring->hw_prio].num_scheds; adev->xcp_mgr->xcp[sel_xcp_id].gpu_sched[ring->funcs->type][ring->hw_prio] @@ -903,7 +921,7 @@ static void amdgpu_xcp_cfg_sysfs_init(struct amdgpu_device *adev) { struct amdgpu_xcp_res_details *xcp_res; struct amdgpu_xcp_cfg *xcp_cfg; - int i, r, j, rid, mode; + int i, r, rid, mode; if (!adev->xcp_mgr) return; @@ -949,14 +967,16 @@ static void amdgpu_xcp_cfg_sysfs_init(struct amdgpu_device *adev) &xcp_cfg_res_sysfs_ktype, &xcp_cfg->kobj, "%s", xcp_res_names[rid]); - if (r) + if (r) { + kobject_put(&xcp_res->kobj); goto err; + } } 
adev->xcp_mgr->xcp_cfg = xcp_cfg; return; err: - for (j = 0; j < i; j++) { + while (i--) { xcp_res = &xcp_cfg->xcp_res[i]; kobject_put(&xcp_res->kobj); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index e63d05c477a0..d2c5bb50d94a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -106,53 +106,6 @@ static const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = { smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000 }; -static const int xgmi3x16_pcs_err_status_reg_v6_4[] = { - smnPCS_XGMI3X16_PCS_ERROR_STATUS, - smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000 -}; - -static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = { - smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK, - smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000 -}; - -static const u64 xgmi_v6_4_0_mca_base_array[] = { - 0x11a09200, - 0x11b09200, -}; - -static const char *xgmi_v6_4_0_ras_error_code_ext[32] = { - [0x00] = "XGMI PCS DataLossErr", - [0x01] = "XGMI PCS TrainingErr", - [0x02] = "XGMI PCS FlowCtrlAckErr", - [0x03] = "XGMI PCS RxFifoUnderflowErr", - [0x04] = "XGMI PCS RxFifoOverflowErr", - [0x05] = "XGMI PCS CRCErr", - [0x06] = "XGMI PCS BERExceededErr", - [0x07] = "XGMI PCS TxMetaDataErr", - [0x08] = "XGMI PCS ReplayBufParityErr", - [0x09] = "XGMI PCS DataParityErr", - [0x0a] = "XGMI PCS ReplayFifoOverflowErr", - [0x0b] = "XGMI PCS ReplayFifoUnderflowErr", - [0x0c] = "XGMI PCS ElasticFifoOverflowErr", - [0x0d] = "XGMI PCS DeskewErr", - [0x0e] = "XGMI PCS FlowCtrlCRCErr", - [0x0f] = "XGMI PCS DataStartupLimitErr", - [0x10] = "XGMI PCS FCInitTimeoutErr", - [0x11] = "XGMI PCS RecoveryTimeoutErr", - [0x12] = "XGMI PCS ReadySerialTimeoutErr", - [0x13] = "XGMI PCS ReadySerialAttemptErr", - [0x14] = "XGMI PCS RecoveryAttemptErr", - [0x15] = "XGMI PCS RecoveryRelockAttemptErr", - [0x16] = "XGMI PCS ReplayAttemptErr", - [0x17] = "XGMI PCS SyncHdrErr", - [0x18] = "XGMI PCS 
TxReplayTimeoutErr", - [0x19] = "XGMI PCS RxReplayTimeoutErr", - [0x1a] = "XGMI PCS LinkSubTxTimeoutErr", - [0x1b] = "XGMI PCS LinkSubRxTimeoutErr", - [0x1c] = "XGMI PCS RxCMDPktErr", -}; - static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = { {"XGMI PCS DataLossErr", SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)}, @@ -1152,91 +1105,15 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) return 0; } -static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - struct amdgpu_device *adev = handle->adev; - struct aca_bank_info info; - const char *error_str; - u64 status, count; - int ret, ext_error_code; - - ret = aca_bank_info_decode(bank, &info); - if (ret) - return ret; - - status = bank->regs[ACA_REG_IDX_STATUS]; - ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status); - - error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ? - xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL; - if (error_str) - dev_info(adev->dev, "%s detected\n", error_str); - - count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]); - - switch (type) { - case ACA_SMU_TYPE_UE: - if (ext_error_code != 0 && ext_error_code != 1 && ext_error_code != 9) - count = 0ULL; - - bank->aca_err_type = ACA_ERROR_TYPE_UE; - ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, count); - break; - case ACA_SMU_TYPE_CE: - count = ext_error_code == 6 ? 
count : 0ULL; - bank->aca_err_type = ACA_ERROR_TYPE_CE; - ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, count); - break; - default: - return -EINVAL; - } - - return ret; -} - -static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = { - .aca_bank_parser = xgmi_v6_4_0_aca_bank_parser, -}; - -static const struct aca_info xgmi_v6_4_0_aca_info = { - .hwip = ACA_HWIP_TYPE_PCS_XGMI, - .mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK, - .bank_ops = &xgmi_v6_4_0_aca_bank_ops, -}; - static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { - int r; - if (!adev->gmc.xgmi.supported || adev->gmc.xgmi.num_physical_nodes == 0) return 0; amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL); - r = amdgpu_ras_block_late_init(adev, ras_block); - if (r) - return r; - - switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { - case IP_VERSION(6, 4, 0): - case IP_VERSION(6, 4, 1): - r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL, - &xgmi_v6_4_0_aca_info, NULL); - if (r) - goto late_fini; - break; - default: - break; - } - - return 0; - -late_fini: - amdgpu_ras_block_late_fini(adev, ras_block); - - return r; + return amdgpu_ras_block_late_init(adev, ras_block); } uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, @@ -1252,7 +1129,7 @@ static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg WREG32_PCIE(pcs_status_reg, 0); } -static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev) +static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) { uint32_t i; @@ -1278,54 +1155,6 @@ static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev) default: break; } - - switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { - case IP_VERSION(6, 4, 0): - case IP_VERSION(6, 4, 1): - for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++) - pcs_clear_status(adev, - xgmi3x16_pcs_err_status_reg_v6_4[i]); - break; - 
default: - break; - } -} - -static void __xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst, u64 mca_base) -{ - uint64_t smn_base = - amdgpu_reg_get_smn_base64(adev, XGMI_HWIP, xgmi_inst); - - WREG64_MCA(smn_base, mca_base, ACA_REG_IDX_STATUS, 0ULL); -} - -static void xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++) - __xgmi_v6_4_0_reset_error_count(adev, xgmi_inst, xgmi_v6_4_0_mca_base_array[i]); -} - -static void xgmi_v6_4_0_reset_ras_error_count(struct amdgpu_device *adev) -{ - int i; - - for_each_inst(i, adev->aid_mask) - xgmi_v6_4_0_reset_error_count(adev, i); -} - -static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) -{ - switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { - case IP_VERSION(6, 4, 0): - case IP_VERSION(6, 4, 1): - xgmi_v6_4_0_reset_ras_error_count(adev); - break; - default: - amdgpu_xgmi_legacy_reset_ras_error_count(adev); - break; - } } static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, @@ -1343,11 +1172,7 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, if (is_xgmi_pcs) { if (amdgpu_ip_version(adev, XGMI_HWIP, 0) == - IP_VERSION(6, 1, 0) || - amdgpu_ip_version(adev, XGMI_HWIP, 0) == - IP_VERSION(6, 4, 0) || - amdgpu_ip_version(adev, XGMI_HWIP, 0) == - IP_VERSION(6, 4, 1)) { + IP_VERSION(6, 1, 0)) { pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0]; field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields); } else { @@ -1381,11 +1206,11 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, return 0; } -static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev, +static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status) { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - int i, supported = 1; + int i; uint32_t data, mask_data = 0; uint32_t ue_cnt = 0, 
ce_cnt = 0; @@ -1449,26 +1274,6 @@ static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev, } break; default: - supported = 0; - break; - } - - switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { - case IP_VERSION(6, 4, 0): - case IP_VERSION(6, 4, 1): - /* check xgmi3x16 pcs error */ - for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++) { - data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_v6_4[i]); - mask_data = - RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[i]); - if (data) - amdgpu_xgmi_query_pcs_error_status(adev, data, - mask_data, &ue_cnt, &ce_cnt, true, true); - } - break; - default: - if (!supported) - dev_warn(adev->dev, "XGMI RAS error query not supported"); break; } @@ -1478,90 +1283,6 @@ static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev, err_data->ce_count += ce_cnt; } -static enum aca_error_type xgmi_v6_4_0_pcs_mca_get_error_type(struct amdgpu_device *adev, u64 status) -{ - const char *error_str; - int ext_error_code; - - ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status); - - error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ? 
- xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL; - if (error_str) - dev_info(adev->dev, "%s detected\n", error_str); - - switch (ext_error_code) { - case 0: - return ACA_ERROR_TYPE_UE; - case 6: - return ACA_ERROR_TYPE_CE; - default: - return -EINVAL; - } - - return -EINVAL; -} - -static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct amdgpu_smuio_mcm_config_info *mcm_info, - u64 mca_base, struct ras_err_data *err_data) -{ - int xgmi_inst = mcm_info->die_id; - uint64_t smn_base; - u64 status = 0; - - status = RREG64_MCA(xgmi_inst, mca_base, ACA_REG_IDX_STATUS); - if (!ACA_REG__STATUS__VAL(status)) - return; - - switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) { - case ACA_ERROR_TYPE_UE: - amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL); - break; - case ACA_ERROR_TYPE_CE: - amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL); - break; - default: - break; - } - smn_base = amdgpu_reg_get_smn_base64(adev, XGMI_HWIP, xgmi_inst); - WREG64_MCA(smn_base, mca_base, ACA_REG_IDX_STATUS, 0ULL); -} - -static void xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, int xgmi_inst, struct ras_err_data *err_data) -{ - struct amdgpu_smuio_mcm_config_info mcm_info = { - .socket_id = adev->smuio.funcs->get_socket_id(adev), - .die_id = xgmi_inst, - }; - int i; - - for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++) - __xgmi_v6_4_0_query_error_count(adev, &mcm_info, xgmi_v6_4_0_mca_base_array[i], err_data); -} - -static void xgmi_v6_4_0_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status) -{ - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - int i; - - for_each_inst(i, adev->aid_mask) - xgmi_v6_4_0_query_error_count(adev, i, err_data); -} - -static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, - void *ras_error_status) -{ - switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { - case IP_VERSION(6, 4, 0): - case IP_VERSION(6, 4, 1): - 
xgmi_v6_4_0_query_ras_error_count(adev, ras_error_status); - break; - default: - amdgpu_xgmi_legacy_query_ras_error_count(adev, ras_error_status); - break; - } -} - /* Trigger XGMI/WAFL error */ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if, uint32_t instance_mask) @@ -1663,6 +1384,16 @@ static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work) if (r && r != -EHWPOISON) dev_err(tmp_adev->dev, "error during bad page data initialization"); + + /* + * For the reset-on-init path (e.g. an NPS memory partition + * switch) the RAS IP block hw_init was skipped under the + * minimal init level, so uniras was never enabled. Bring it + * up now that the reset domain has been unlocked. This is a + * no-op for any other reset path where RAS is already + * initialized, and for non-uniras devices. + */ + amdgpu_ras_resume_after_reset(tmp_adev); } } diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c index 72ea37dbfea8..cddfe4015f53 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c @@ -273,8 +273,10 @@ static int aqua_vanjaram_get_xcp_res_info(struct amdgpu_xcp_mgr *xcp_mgr, xcp_cfg->num_res = ARRAY_SIZE(max_res); for (i = 0; i < xcp_cfg->num_res; i++) { - res_lt_xcp = max_res[i] < num_xcp; xcp_cfg->xcp_res[i].id = i; + if (!max_res[i]) + continue; + res_lt_xcp = max_res[i] < num_xcp; xcp_cfg->xcp_res[i].num_inst = res_lt_xcp ? 1 : max_res[i] / num_xcp; xcp_cfg->xcp_res[i].num_inst = @@ -589,6 +591,29 @@ static struct aqua_reg_list pcie_reg_addrs[] = { { smreg_0x1A380088, 6, DW_ADDR_INCR }, }; +/* + * Return the GPU's internal US switch port, or NULL if it is not visible + * (e.g. passthrough) or the EP is parented under an unrelated bridge. 
+ */ +static struct pci_dev *aqua_vanjaram_get_us_pdev(struct amdgpu_device *adev) +{ + struct pci_dev *ds_pdev, *us_pdev; + + ds_pdev = pci_upstream_bridge(adev->pdev); + if (!ds_pdev || ds_pdev->vendor != PCI_VENDOR_ID_ATI || + pci_pcie_type(ds_pdev) != PCI_EXP_TYPE_DOWNSTREAM) + return NULL; + + us_pdev = pci_upstream_bridge(ds_pdev); + if (!us_pdev || + (us_pdev->vendor != PCI_VENDOR_ID_ATI && + us_pdev->vendor != PCI_VENDOR_ID_AMD) || + pci_pcie_type(us_pdev) != PCI_EXP_TYPE_UPSTREAM) + return NULL; + + return us_pdev; +} + static ssize_t aqua_vanjaram_read_pcie_state(struct amdgpu_device *adev, void *buf, size_t max_size) { @@ -596,7 +621,7 @@ static ssize_t aqua_vanjaram_read_pcie_state(struct amdgpu_device *adev, uint32_t start_addr, incrx, num_regs, szbuf; struct amdgpu_regs_pcie_v1_0 *pcie_regs; struct amdgpu_smn_reg_data *reg_data; - struct pci_dev *us_pdev, *ds_pdev; + struct pci_dev *us_pdev; int aer_cap, r, n; if (!buf || !max_size) @@ -628,25 +653,27 @@ static ssize_t aqua_vanjaram_read_pcie_state(struct amdgpu_device *adev, } } - ds_pdev = pci_upstream_bridge(adev->pdev); - us_pdev = pci_upstream_bridge(ds_pdev); + us_pdev = aqua_vanjaram_get_us_pdev(adev); + if (us_pdev) { + pcie_capability_read_word(us_pdev, PCI_EXP_DEVSTA, + &pcie_regs->device_status); + pcie_capability_read_word(us_pdev, PCI_EXP_LNKSTA, + &pcie_regs->link_status); + + aer_cap = pci_find_ext_capability(us_pdev, PCI_EXT_CAP_ID_ERR); + if (aer_cap) { + pci_read_config_dword(us_pdev, + aer_cap + PCI_ERR_COR_STATUS, + &pcie_regs->pcie_corr_err_status); + pci_read_config_dword(us_pdev, + aer_cap + PCI_ERR_UNCOR_STATUS, + &pcie_regs->pcie_uncorr_err_status); + } - pcie_capability_read_word(us_pdev, PCI_EXP_DEVSTA, - &pcie_regs->device_status); - pcie_capability_read_word(us_pdev, PCI_EXP_LNKSTA, - &pcie_regs->link_status); - - aer_cap = pci_find_ext_capability(us_pdev, PCI_EXT_CAP_ID_ERR); - if (aer_cap) { - pci_read_config_dword(us_pdev, aer_cap + PCI_ERR_COR_STATUS, - 
&pcie_regs->pcie_corr_err_status); - pci_read_config_dword(us_pdev, aer_cap + PCI_ERR_UNCOR_STATUS, - &pcie_regs->pcie_uncorr_err_status); + pci_read_config_dword(us_pdev, PCI_PRIMARY_BUS, + &pcie_regs->sub_bus_number_latency); } - pci_read_config_dword(us_pdev, PCI_PRIMARY_BUS, - &pcie_regs->sub_bus_number_latency); - pcie_reg_state->common_header.structure_size = szbuf; pcie_reg_state->common_header.format_revision = 1; pcie_reg_state->common_header.content_revision = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c index ca5d091549e1..e0e585f280e2 100644 --- a/drivers/gpu/drm/amd/amdgpu/atom.c +++ b/drivers/gpu/drm/amd/amdgpu/atom.c @@ -114,8 +114,10 @@ static uint32_t atom_iio_execute(struct atom_context *ctx, int base, uint32_t index, uint32_t data) { uint32_t temp = 0xCDCDCDCD; + int start = base; - while (1) + /* IIO opcodes read up to base+3; keep within the BIOS image */ + while (base + 3 < ctx->bios_size) switch (CU8(base)) { case ATOM_IIO_NOP: base++; @@ -180,6 +182,9 @@ static uint32_t atom_iio_execute(struct atom_context *ctx, int base, pr_info("Unknown IIO opcode\n"); return 0; } + + pr_info("IIO method starting at offset %d runs past BIOS image\n", start); + return 0; } static uint32_t atom_get_src_int(atom_exec_context *ctx, uint8_t attr, @@ -1327,11 +1332,25 @@ static void atom_index_iio(struct atom_context *ctx, int base) ctx->iio = kzalloc(2 * 256, GFP_KERNEL); if (!ctx->iio) return; - while (CU8(base) == ATOM_IIO_START) { - ctx->iio[CU8(base + 1)] = base + 2; + while (base + 1 < ctx->bios_size && CU8(base) == ATOM_IIO_START) { + uint8_t index = CU8(base + 1); + int start = base + 2; base += 2; - while (CU8(base) != ATOM_IIO_END) - base += atom_iio_len[CU8(base)]; + while (base < ctx->bios_size && CU8(base) != ATOM_IIO_END) { + uint8_t op = CU8(base); + + /* + * Unknown opcode: its length is unknown so the byte + * stream cannot be resynced reliably. 
+ */ + if (op >= ARRAY_SIZE(atom_iio_len)) + return; + base += atom_iio_len[op]; + } + if (base >= ctx->bios_size) + return; + /* Only index well-formed methods, others stay 0 */ + ctx->iio[index] = start; base += 3; } } @@ -1339,6 +1358,7 @@ static void atom_index_iio(struct atom_context *ctx, int base) static void atom_get_vbios_name(struct atom_context *ctx) { unsigned char *p_rom; + unsigned char *p_end; unsigned char str_num; unsigned short off_to_vbios_str; unsigned char *c_ptr; @@ -1349,39 +1369,48 @@ static void atom_get_vbios_name(struct atom_context *ctx) char *back; p_rom = ctx->bios; + p_end = p_rom + ctx->bios_size; + + if (p_rom + OFFSET_TO_GET_ATOMBIOS_STRING_START + 1 >= p_end) + goto no_name; str_num = *(p_rom + OFFSET_TO_GET_ATOMBIOS_NUMBER_OF_STRINGS); - if (str_num != 0) { - off_to_vbios_str = - *(unsigned short *)(p_rom + OFFSET_TO_GET_ATOMBIOS_STRING_START); + if (!str_num) + goto no_name; - c_ptr = (unsigned char *)(p_rom + off_to_vbios_str); - } else { - /* do not know where to find name */ - memcpy(ctx->name, na, 7); - ctx->name[7] = 0; - return; - } + off_to_vbios_str = + *(unsigned short *)(p_rom + OFFSET_TO_GET_ATOMBIOS_STRING_START); + + c_ptr = (unsigned char *)(p_rom + off_to_vbios_str); + if (c_ptr >= p_end) + goto no_name; /* * skip the atombios strings, usually 4 * 1st is P/N, 2nd is ASIC, 3rd is PCI type, 4th is Memory type */ for (i = 0; i < str_num; i++) { - while (*c_ptr != 0) + while (c_ptr < p_end && *c_ptr != 0) c_ptr++; c_ptr++; } /* skip the following 2 chars: 0x0D 0x0A */ c_ptr += 2; + if (c_ptr >= p_end) + goto no_name; - name_size = strnlen(c_ptr, STRLEN_LONG - 1); + name_size = strnlen(c_ptr, min(STRLEN_LONG - 1, (int)(p_end - c_ptr))); memcpy(ctx->name, c_ptr, name_size); back = ctx->name + name_size; while ((*--back) == ' ') ; *(back + 1) = '\0'; + return; + +no_name: + /* do not know where to find name */ + strscpy(ctx->name, na, sizeof(ctx->name)); } static void atom_get_vbios_date(struct atom_context *ctx) @@ 
-1553,7 +1582,7 @@ static inline void atom_print_vbios_info(struct atom_context *ctx) drm_info(ctx->card->dev, "ATOM BIOS: %s\n", vbios_info); } -struct atom_context *amdgpu_atom_parse(struct card_info *card, void *bios) +struct atom_context *amdgpu_atom_parse(struct card_info *card, void *bios, uint32_t bios_size) { int base; struct atom_context *ctx = @@ -1567,6 +1596,7 @@ struct atom_context *amdgpu_atom_parse(struct card_info *card, void *bios) ctx->card = card; ctx->bios = bios; + ctx->bios_size = bios_size; if (CU16(0) != ATOM_BIOS_MAGIC) { pr_info("Invalid BIOS magic\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/atom.h b/drivers/gpu/drm/amd/amdgpu/atom.h index bb3d9eb7eb6b..4687c019cbe3 100644 --- a/drivers/gpu/drm/amd/amdgpu/atom.h +++ b/drivers/gpu/drm/amd/amdgpu/atom.h @@ -133,6 +133,7 @@ struct atom_context { struct card_info *card; struct mutex mutex; void *bios; + uint32_t bios_size; uint32_t cmd_table, data_table; uint16_t *iio; @@ -160,7 +161,7 @@ struct atom_context { extern int amdgpu_atom_debug; -struct atom_context *amdgpu_atom_parse(struct card_info *card, void *bios); +struct atom_context *amdgpu_atom_parse(struct card_info *card, void *bios, uint32_t bios_size); int amdgpu_atom_execute_table(struct atom_context *ctx, int index, uint32_t *params, int params_size); int amdgpu_atom_asic_init(struct atom_context *ctx); void amdgpu_atom_destroy(struct atom_context *ctx); diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c b/drivers/gpu/drm/amd/amdgpu/cik.c index 29954c7d61b0..77e120a72815 100644 --- a/drivers/gpu/drm/amd/amdgpu/cik.c +++ b/drivers/gpu/drm/amd/amdgpu/cik.c @@ -1876,12 +1876,6 @@ static void cik_invalidate_hdp(struct amdgpu_device *adev, } } -static bool cik_need_full_reset(struct amdgpu_device *adev) -{ - /* change this when we support soft reset */ - return true; -} - static void cik_get_pcie_usage(struct amdgpu_device *adev, uint64_t *count0, uint64_t *count1) { @@ -1971,7 +1965,6 @@ static const struct amdgpu_asic_funcs cik_asic_funcs = 
.get_config_memsize = &cik_get_config_memsize, .flush_hdp = &cik_flush_hdp, .invalidate_hdp = &cik_invalidate_hdp, - .need_full_reset = &cik_need_full_reset, .init_doorbell_index = &legacy_doorbell_index_init, .get_pcie_usage = &cik_get_pcie_usage, .need_reset_on_init = &cik_need_reset_on_init, diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c index c8f465158e71..f2977fe6d824 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c @@ -410,36 +410,6 @@ static u32 dce_v10_0_hpd_get_gpio_reg(struct amdgpu_device *adev) return mmDC_GPIO_HPD_A; } -static bool dce_v10_0_is_display_hung(struct amdgpu_device *adev) -{ - u32 crtc_hung = 0; - u32 crtc_status[6]; - u32 i, j, tmp; - - for (i = 0; i < adev->mode_info.num_crtc; i++) { - tmp = RREG32(mmCRTC_CONTROL + crtc_offsets[i]); - if (REG_GET_FIELD(tmp, CRTC_CONTROL, CRTC_MASTER_EN)) { - crtc_status[i] = RREG32(mmCRTC_STATUS_HV_COUNT + crtc_offsets[i]); - crtc_hung |= (1 << i); - } - } - - for (j = 0; j < 10; j++) { - for (i = 0; i < adev->mode_info.num_crtc; i++) { - if (crtc_hung & (1 << i)) { - tmp = RREG32(mmCRTC_STATUS_HV_COUNT + crtc_offsets[i]); - if (tmp != crtc_status[i]) - crtc_hung &= ~(1 << i); - } - } - if (crtc_hung == 0) - return false; - udelay(100); - } - - return true; -} - static void dce_v10_0_set_vga_render_state(struct amdgpu_device *adev, bool render) { @@ -2956,40 +2926,6 @@ static bool dce_v10_0_is_idle(struct amdgpu_ip_block *ip_block) return true; } -static bool dce_v10_0_check_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - return dce_v10_0_is_display_hung(adev); -} - -static int dce_v10_0_soft_reset(struct amdgpu_ip_block *ip_block) -{ - u32 srbm_soft_reset = 0, tmp; - struct amdgpu_device *adev = ip_block->adev; - - if (dce_v10_0_is_display_hung(adev)) - srbm_soft_reset |= SRBM_SOFT_RESET__SOFT_RESET_DC_MASK; - - if (srbm_soft_reset) { - tmp = 
RREG32(mmSRBM_SOFT_RESET); - tmp |= srbm_soft_reset; - dev_info(adev->dev, "SRBM_SOFT_RESET=0x%08X\n", tmp); - WREG32(mmSRBM_SOFT_RESET, tmp); - tmp = RREG32(mmSRBM_SOFT_RESET); - - udelay(50); - - tmp &= ~srbm_soft_reset; - WREG32(mmSRBM_SOFT_RESET, tmp); - tmp = RREG32(mmSRBM_SOFT_RESET); - - /* Wait a little for things to settle down */ - udelay(50); - } - return 0; -} - static void dce_v10_0_set_crtc_vblank_interrupt_state(struct amdgpu_device *adev, int crtc, enum amdgpu_interrupt_state state) @@ -3332,8 +3268,6 @@ static const struct amd_ip_funcs dce_v10_0_ip_funcs = { .suspend = dce_v10_0_suspend, .resume = dce_v10_0_resume, .is_idle = dce_v10_0_is_idle, - .check_soft_reset = dce_v10_0_check_soft_reset, - .soft_reset = dce_v10_0_soft_reset, .set_clockgating_state = dce_v10_0_set_clockgating_state, .set_powergating_state = dce_v10_0_set_powergating_state, }; diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c index 58d0da5c2a74..c68de0fe1d7d 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c @@ -378,35 +378,6 @@ static u32 dce_v6_0_hpd_get_gpio_reg(struct amdgpu_device *adev) return mmDC_GPIO_HPD_A; } -static bool dce_v6_0_is_display_hung(struct amdgpu_device *adev) -{ - u32 crtc_hung = 0; - u32 crtc_status[6]; - u32 i, j, tmp; - - for (i = 0; i < adev->mode_info.num_crtc; i++) { - if (RREG32(mmCRTC_CONTROL + crtc_offsets[i]) & CRTC_CONTROL__CRTC_MASTER_EN_MASK) { - crtc_status[i] = RREG32(mmCRTC_STATUS_HV_COUNT + crtc_offsets[i]); - crtc_hung |= (1 << i); - } - } - - for (j = 0; j < 10; j++) { - for (i = 0; i < adev->mode_info.num_crtc; i++) { - if (crtc_hung & (1 << i)) { - tmp = RREG32(mmCRTC_STATUS_HV_COUNT + crtc_offsets[i]); - if (tmp != crtc_status[i]) - crtc_hung &= ~(1 << i); - } - } - if (crtc_hung == 0) - return false; - udelay(100); - } - - return true; -} - static void dce_v6_0_set_vga_render_state(struct amdgpu_device *adev, bool render) { @@ -2901,33 +2872,6 @@ 
static bool dce_v6_0_is_idle(struct amdgpu_ip_block *ip_block) return true; } -static int dce_v6_0_soft_reset(struct amdgpu_ip_block *ip_block) -{ - u32 srbm_soft_reset = 0, tmp; - struct amdgpu_device *adev = ip_block->adev; - - if (dce_v6_0_is_display_hung(adev)) - srbm_soft_reset |= SRBM_SOFT_RESET__SOFT_RESET_DC_MASK; - - if (srbm_soft_reset) { - tmp = RREG32(mmSRBM_SOFT_RESET); - tmp |= srbm_soft_reset; - dev_info(adev->dev, "SRBM_SOFT_RESET=0x%08X\n", tmp); - WREG32(mmSRBM_SOFT_RESET, tmp); - tmp = RREG32(mmSRBM_SOFT_RESET); - - udelay(50); - - tmp &= ~srbm_soft_reset; - WREG32(mmSRBM_SOFT_RESET, tmp); - tmp = RREG32(mmSRBM_SOFT_RESET); - - /* Wait a little for things to settle down */ - udelay(50); - } - return 0; -} - static void dce_v6_0_set_crtc_vblank_interrupt_state(struct amdgpu_device *adev, int crtc, enum amdgpu_interrupt_state state) @@ -3224,7 +3168,6 @@ static const struct amd_ip_funcs dce_v6_0_ip_funcs = { .suspend = dce_v6_0_suspend, .resume = dce_v6_0_resume, .is_idle = dce_v6_0_is_idle, - .soft_reset = dce_v6_0_soft_reset, .set_clockgating_state = dce_v6_0_set_clockgating_state, .set_powergating_state = dce_v6_0_set_powergating_state, }; diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c index 6d19f6d94d25..c3906270f25e 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c @@ -362,35 +362,6 @@ static u32 dce_v8_0_hpd_get_gpio_reg(struct amdgpu_device *adev) return mmDC_GPIO_HPD_A; } -static bool dce_v8_0_is_display_hung(struct amdgpu_device *adev) -{ - u32 crtc_hung = 0; - u32 crtc_status[6]; - u32 i, j, tmp; - - for (i = 0; i < adev->mode_info.num_crtc; i++) { - if (RREG32(mmCRTC_CONTROL + crtc_offsets[i]) & CRTC_CONTROL__CRTC_MASTER_EN_MASK) { - crtc_status[i] = RREG32(mmCRTC_STATUS_HV_COUNT + crtc_offsets[i]); - crtc_hung |= (1 << i); - } - } - - for (j = 0; j < 10; j++) { - for (i = 0; i < adev->mode_info.num_crtc; i++) { - if (crtc_hung & (1 << i)) { - tmp = 
RREG32(mmCRTC_STATUS_HV_COUNT + crtc_offsets[i]); - if (tmp != crtc_status[i]) - crtc_hung &= ~(1 << i); - } - } - if (crtc_hung == 0) - return false; - udelay(100); - } - - return true; -} - static void dce_v8_0_set_vga_render_state(struct amdgpu_device *adev, bool render) { @@ -2873,33 +2844,6 @@ static bool dce_v8_0_is_idle(struct amdgpu_ip_block *ip_block) return true; } -static int dce_v8_0_soft_reset(struct amdgpu_ip_block *ip_block) -{ - u32 srbm_soft_reset = 0, tmp; - struct amdgpu_device *adev = ip_block->adev; - - if (dce_v8_0_is_display_hung(adev)) - srbm_soft_reset |= SRBM_SOFT_RESET__SOFT_RESET_DC_MASK; - - if (srbm_soft_reset) { - tmp = RREG32(mmSRBM_SOFT_RESET); - tmp |= srbm_soft_reset; - dev_info(adev->dev, "SRBM_SOFT_RESET=0x%08X\n", tmp); - WREG32(mmSRBM_SOFT_RESET, tmp); - tmp = RREG32(mmSRBM_SOFT_RESET); - - udelay(50); - - tmp &= ~srbm_soft_reset; - WREG32(mmSRBM_SOFT_RESET, tmp); - tmp = RREG32(mmSRBM_SOFT_RESET); - - /* Wait a little for things to settle down */ - udelay(50); - } - return 0; -} - static void dce_v8_0_set_crtc_vblank_interrupt_state(struct amdgpu_device *adev, int crtc, enum amdgpu_interrupt_state state) @@ -3241,7 +3185,6 @@ static const struct amd_ip_funcs dce_v8_0_ip_funcs = { .suspend = dce_v8_0_suspend, .resume = dce_v8_0_resume, .is_idle = dce_v8_0_is_idle, - .soft_reset = dce_v8_0_soft_reset, .set_clockgating_state = dce_v8_0_set_clockgating_state, .set_powergating_state = dce_v8_0_set_powergating_state, }; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index b4b27e4c495d..ddf190672530 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -7852,6 +7852,8 @@ static int gfx_v10_0_early_init(struct amdgpu_ip_block *ip_block) /* init rlcg reg access ctrl */ gfx_v10_0_init_rlcg_reg_access_ctrl(adev); + amdgpu_init_rlc_reg_funcs(adev); + return gfx_v10_0_init_microcode(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 3b12eb27a253..2a121df90574 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1923,6 +1923,11 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + adev->gfx.me.use_mmio_for_reset = false; + adev->gfx.mec.use_mmio_for_reset = true; + + mutex_init(&adev->gfx.mec.reset_mutex); + return 0; } @@ -4233,13 +4238,13 @@ static int gfx_v11_0_gfx_mqd_init(struct amdgpu_device *adev, void *m, return 0; } -static int gfx_v11_0_kgq_init_queue(struct amdgpu_ring *ring, bool reset) +static int gfx_v11_0_kgq_init_queue(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; struct v11_gfx_mqd *mqd = ring->mqd_ptr; int mqd_idx = ring - &adev->gfx.gfx_ring[0]; - if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) { + if (!amdgpu_in_reset(adev) && !adev->in_suspend) { memset((void *)mqd, 0, sizeof(*mqd)); mutex_lock(&adev->srbm_mutex); soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); @@ -4266,7 +4271,7 @@ static int gfx_v11_0_cp_async_gfx_ring_resume(struct amdgpu_device *adev) int r, i; for (i = 0; i < adev->gfx.num_gfx_rings; i++) { - r = gfx_v11_0_kgq_init_queue(&adev->gfx.gfx_ring[i], false); + r = gfx_v11_0_kgq_init_queue(&adev->gfx.gfx_ring[i]); if (r) return r; } @@ -4603,13 +4608,13 @@ static int gfx_v11_0_kiq_init_queue(struct amdgpu_ring *ring) return 0; } -static int gfx_v11_0_kcq_init_queue(struct amdgpu_ring *ring, bool reset) +static int gfx_v11_0_kcq_init_queue(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; struct v11_compute_mqd *mqd = ring->mqd_ptr; int mqd_idx = ring - &adev->gfx.compute_ring[0]; - if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) { + if (!amdgpu_in_reset(adev) && !adev->in_suspend) { memset((void *)mqd, 0, sizeof(*mqd)); mutex_lock(&adev->srbm_mutex); soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); @@ -4646,7 +4651,7 @@ static int 
gfx_v11_0_kcq_resume(struct amdgpu_device *adev) gfx_v11_0_cp_compute_enable(adev, true); for (i = 0; i < adev->gfx.num_compute_rings; i++) { - r = gfx_v11_0_kcq_init_queue(&adev->gfx.compute_ring[i], false); + r = gfx_v11_0_kcq_init_queue(&adev->gfx.compute_ring[i]); if (r) return r; } @@ -5265,38 +5270,12 @@ static int gfx_v11_0_soft_reset(struct amdgpu_ip_block *ip_block) amdgpu_gfx_rlc_exit_safe_mode(adev, 0); - return gfx_v11_0_cp_resume(adev); -} - -static bool gfx_v11_0_check_soft_reset(struct amdgpu_ip_block *ip_block) -{ - int i, r; - struct amdgpu_device *adev = ip_block->adev; - struct amdgpu_ring *ring; - long tmo = msecs_to_jiffies(1000); - - for (i = 0; i < adev->gfx.num_gfx_rings; i++) { - ring = &adev->gfx.gfx_ring[i]; - r = amdgpu_ring_test_ib(ring, tmo); - if (r) - return true; - } - - for (i = 0; i < adev->gfx.num_compute_rings; i++) { - ring = &adev->gfx.compute_ring[i]; - r = amdgpu_ring_test_ib(ring, tmo); - if (r) - return true; - } - - return false; -} + r = gfx_v11_0_cp_resume(adev); + if (r) + return r; -static int gfx_v11_0_post_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; /** - * GFX soft reset will impact MES, need resume MES when do GFX soft reset + * GFX soft reset impacts MES, resume MES after GFX soft reset is finished */ return amdgpu_mes_resume(adev, 0); } @@ -5420,6 +5399,8 @@ static int gfx_v11_0_early_init(struct amdgpu_ip_block *ip_block) gfx_v11_0_init_rlcg_reg_access_ctrl(adev); + amdgpu_init_rlc_reg_funcs(adev); + return gfx_v11_0_init_microcode(adev); } @@ -6708,22 +6689,29 @@ static int gfx_v11_0_set_priv_inst_fault_state(struct amdgpu_device *adev, static void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { - u8 me_id, pipe_id, queue_id; - struct amdgpu_ring *ring; - int i; - - me_id = (entry->ring_id & 0x0c) >> 2; - pipe_id = (entry->ring_id & 0x03) >> 0; - queue_id = (entry->ring_id & 0x70) >> 4; + u32 doorbell_offset = 
entry->src_data[0] & AMDGPU_CTXID0_DOORBELL_ID_MASK; + /* + * Try KQ first by ring_id (HW slot is authoritative). The + * KMD compute_hqd_mask contract guarantees KCQ and user queues + * never share a HW slot. + */ if (!adev->gfx.disable_kq) { + u8 me_id = (entry->ring_id & 0x0c) >> 2; + u8 pipe_id = (entry->ring_id & 0x03) >> 0; + u8 queue_id = (entry->ring_id & 0x70) >> 4; + struct amdgpu_ring *ring; + int i; + switch (me_id) { case 0: for (i = 0; i < adev->gfx.num_gfx_rings; i++) { ring = &adev->gfx.gfx_ring[i]; if (ring->me == me_id && ring->pipe == pipe_id && - ring->queue == queue_id) + ring->queue == queue_id) { drm_sched_fault(&ring->sched); + return; + } } break; case 1: @@ -6731,8 +6719,10 @@ static void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev, for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = &adev->gfx.compute_ring[i]; if (ring->me == me_id && ring->pipe == pipe_id && - ring->queue == queue_id) + ring->queue == queue_id) { drm_sched_fault(&ring->sched); + return; + } } break; default: @@ -6740,6 +6730,11 @@ static void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev, break; } } + + /* No KQ matched: HW slot is a MES-scheduled user queue. 
*/ + if (adev->enable_mes && doorbell_offset) + amdgpu_userq_process_reset_irq(adev, entry->pasid, + doorbell_offset); } static int gfx_v11_0_priv_reg_irq(struct amdgpu_device *adev, @@ -6846,233 +6841,14 @@ static void gfx_v11_0_emit_mem_sync(struct amdgpu_ring *ring) amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */ } -static bool gfx_v11_pipe_reset_support(struct amdgpu_device *adev) -{ - /* Disable the pipe reset until the CPFW fully support it.*/ - dev_warn_once(adev->dev, "The CPFW hasn't support pipe reset yet.\n"); - return false; -} - - -static int gfx_v11_reset_gfx_pipe(struct amdgpu_ring *ring) -{ - struct amdgpu_device *adev = ring->adev; - uint32_t reset_pipe = 0, clean_pipe = 0; - int r; - - if (!gfx_v11_pipe_reset_support(adev)) - return -EOPNOTSUPP; - - gfx_v11_0_set_safe_mode(adev, 0); - mutex_lock(&adev->srbm_mutex); - soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); - - switch (ring->pipe) { - case 0: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, - PFP_PIPE0_RESET, 1); - reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, - ME_PIPE0_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, - PFP_PIPE0_RESET, 0); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, - ME_PIPE0_RESET, 0); - break; - case 1: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, - PFP_PIPE1_RESET, 1); - reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, - ME_PIPE1_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, - PFP_PIPE1_RESET, 0); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, - ME_PIPE1_RESET, 0); - break; - default: - break; - } - - WREG32_SOC15(GC, 0, regCP_ME_CNTL, reset_pipe); - WREG32_SOC15(GC, 0, regCP_ME_CNTL, clean_pipe); - - r = (RREG32(SOC15_REG_OFFSET(GC, 0, regCP_GFX_RS64_INSTR_PNTR1)) << 2) - - RS64_FW_UC_START_ADDR_LO; - soc21_grbm_select(adev, 0, 0, 0, 0); - mutex_unlock(&adev->srbm_mutex); - gfx_v11_0_unset_safe_mode(adev, 0); - - dev_info(adev->dev, "The ring %s pipe reset to the ME firmware start PC: 
%s\n", ring->name, - r == 0 ? "successfully" : "failed"); - /* FIXME: Sometimes driver can't cache the ME firmware start PC correctly, - * so the pipe reset status relies on the later gfx ring test result. - */ - return 0; -} - static int gfx_v11_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid, struct amdgpu_fence *timedout_fence) { struct amdgpu_device *adev = ring->adev; - bool use_mmio = false; - int r; + bool use_mmio = adev->gfx.me.use_mmio_for_reset; - amdgpu_ring_reset_helper_begin(ring, timedout_fence); - - r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, use_mmio, 0); - if (r) { - - dev_warn(adev->dev, "reset via MES failed and try pipe reset %d\n", r); - r = gfx_v11_reset_gfx_pipe(ring); - if (r) - return r; - } - - if (use_mmio) { - r = gfx_v11_0_kgq_init_queue(ring, true); - if (r) { - dev_err(adev->dev, "failed to init kgq\n"); - return r; - } - - r = amdgpu_mes_map_legacy_queue(adev, ring, 0); - if (r) { - dev_err(adev->dev, "failed to remap kgq\n"); - return r; - } - } - - return amdgpu_ring_reset_helper_end(ring, timedout_fence); -} - -static int gfx_v11_0_reset_compute_pipe(struct amdgpu_ring *ring) -{ - - struct amdgpu_device *adev = ring->adev; - uint32_t reset_pipe = 0, clean_pipe = 0; - int r; - - if (!gfx_v11_pipe_reset_support(adev)) - return -EOPNOTSUPP; - - gfx_v11_0_set_safe_mode(adev, 0); - mutex_lock(&adev->srbm_mutex); - soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); - - reset_pipe = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL); - clean_pipe = reset_pipe; - - if (adev->gfx.rs64_enable) { - - switch (ring->pipe) { - case 0: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE0_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE0_RESET, 0); - break; - case 1: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE1_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE1_RESET, 0); - break; - case 2: - reset_pipe = 
REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE2_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE2_RESET, 0); - break; - case 3: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE3_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE3_RESET, 0); - break; - default: - break; - } - WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_pipe); - WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_pipe); - r = (RREG32_SOC15(GC, 0, regCP_MEC_RS64_INSTR_PNTR) << 2) - - RS64_FW_UC_START_ADDR_LO; - } else { - if (ring->me == 1) { - switch (ring->pipe) { - case 0: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, - MEC_ME1_PIPE0_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, - MEC_ME1_PIPE0_RESET, 0); - break; - case 1: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, - MEC_ME1_PIPE1_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, - MEC_ME1_PIPE1_RESET, 0); - break; - case 2: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, - MEC_ME1_PIPE2_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, - MEC_ME1_PIPE2_RESET, 0); - break; - case 3: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, - MEC_ME1_PIPE3_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, - MEC_ME1_PIPE3_RESET, 0); - break; - default: - break; - } - /* mec1 fw pc: CP_MEC1_INSTR_PNTR */ - } else { - switch (ring->pipe) { - case 0: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, - MEC_ME2_PIPE0_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, - MEC_ME2_PIPE0_RESET, 0); - break; - case 1: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, - MEC_ME2_PIPE1_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, - MEC_ME2_PIPE1_RESET, 0); - break; - case 2: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, - MEC_ME2_PIPE2_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, - MEC_ME2_PIPE2_RESET, 0); - break; - case 3: 
- reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, - MEC_ME2_PIPE3_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, - MEC_ME2_PIPE3_RESET, 0); - break; - default: - break; - } - /* mec2 fw pc: CP:CP_MEC2_INSTR_PNTR */ - } - WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_pipe); - WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_pipe); - r = RREG32(SOC15_REG_OFFSET(GC, 0, regCP_MEC1_INSTR_PNTR)); - } - - soc21_grbm_select(adev, 0, 0, 0, 0); - mutex_unlock(&adev->srbm_mutex); - gfx_v11_0_unset_safe_mode(adev, 0); - - dev_info(adev->dev, "The ring %s pipe resets to MEC FW start PC: %s\n", ring->name, - r == 0 ? "successfully" : "failed"); - /*FIXME:Sometimes driver can't cache the MEC firmware start PC correctly, so the pipe - * reset status relies on the compute ring test result. - */ - return 0; + return amdgpu_gfx_mes_reset_queue(ring, vmid, timedout_fence, use_mmio); } static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring, @@ -7080,30 +6856,8 @@ static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring, struct amdgpu_fence *timedout_fence) { struct amdgpu_device *adev = ring->adev; - int r = 0; - - amdgpu_ring_reset_helper_begin(ring, timedout_fence); - - r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true, 0); - if (r) { - dev_warn(adev->dev, "fail(%d) to reset kcq and try pipe reset\n", r); - r = gfx_v11_0_reset_compute_pipe(ring); - if (r) - return r; - } - - r = gfx_v11_0_kcq_init_queue(ring, true); - if (r) { - dev_err(adev->dev, "fail to init kcq\n"); - return r; - } - r = amdgpu_mes_map_legacy_queue(adev, ring, 0); - if (r) { - dev_err(adev->dev, "failed to remap kcq\n"); - return r; - } - return amdgpu_ring_reset_helper_end(ring, timedout_fence); + return amdgpu_gfx_reset_mes_compute(adev, ring, timedout_fence, NULL, NULL, NULL); } static void gfx_v11_ip_print(struct amdgpu_ip_block *ip_block, struct drm_printer *p) @@ -7281,8 +7035,6 @@ static const struct amd_ip_funcs gfx_v11_0_ip_funcs = { .is_idle = gfx_v11_0_is_idle, .wait_for_idle = 
gfx_v11_0_wait_for_idle, .soft_reset = gfx_v11_0_soft_reset, - .check_soft_reset = gfx_v11_0_check_soft_reset, - .post_soft_reset = gfx_v11_0_post_soft_reset, .set_clockgating_state = gfx_v11_0_set_clockgating_state, .set_powergating_state = gfx_v11_0_set_powergating_state, .get_clockgating_state = gfx_v11_0_get_clockgating_state, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index da668a8d6abd..c765af54669c 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -1603,6 +1603,11 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + adev->gfx.me.use_mmio_for_reset = false; + adev->gfx.mec.use_mmio_for_reset = true; + + mutex_init(&adev->gfx.mec.reset_mutex); + return 0; } @@ -3071,13 +3076,13 @@ static int gfx_v12_0_gfx_mqd_init(struct amdgpu_device *adev, void *m, return 0; } -static int gfx_v12_0_kgq_init_queue(struct amdgpu_ring *ring, bool reset) +static int gfx_v12_0_kgq_init_queue(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; struct v12_gfx_mqd *mqd = ring->mqd_ptr; int mqd_idx = ring - &adev->gfx.gfx_ring[0]; - if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) { + if (!amdgpu_in_reset(adev) && !adev->in_suspend) { memset((void *)mqd, 0, sizeof(*mqd)); mutex_lock(&adev->srbm_mutex); soc24_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); @@ -3104,7 +3109,7 @@ static int gfx_v12_0_cp_async_gfx_ring_resume(struct amdgpu_device *adev) int i, r; for (i = 0; i < adev->gfx.num_gfx_rings; i++) { - r = gfx_v12_0_kgq_init_queue(&adev->gfx.gfx_ring[i], false); + r = gfx_v12_0_kgq_init_queue(&adev->gfx.gfx_ring[i]); if (r) return r; } @@ -3441,13 +3446,13 @@ static int gfx_v12_0_kiq_init_queue(struct amdgpu_ring *ring) return 0; } -static int gfx_v12_0_kcq_init_queue(struct amdgpu_ring *ring, bool reset) +static int gfx_v12_0_kcq_init_queue(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; 
struct v12_compute_mqd *mqd = ring->mqd_ptr; int mqd_idx = ring - &adev->gfx.compute_ring[0]; - if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) { + if (!amdgpu_in_reset(adev) && !adev->in_suspend) { memset((void *)mqd, 0, sizeof(*mqd)); mutex_lock(&adev->srbm_mutex); soc24_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); @@ -3485,7 +3490,7 @@ static int gfx_v12_0_kcq_resume(struct amdgpu_device *adev) gfx_v12_0_cp_compute_enable(adev, true); for (i = 0; i < adev->gfx.num_compute_rings; i++) { - r = gfx_v12_0_kcq_init_queue(&adev->gfx.compute_ring[i], false); + r = gfx_v12_0_kcq_init_queue(&adev->gfx.compute_ring[i]); if (r) return r; } @@ -3986,6 +3991,8 @@ static int gfx_v12_0_early_init(struct amdgpu_ip_block *ip_block) gfx_v12_0_init_rlcg_reg_access_ctrl(adev); + amdgpu_init_rlc_reg_funcs(adev); + return gfx_v12_0_init_microcode(adev); } @@ -5025,22 +5032,30 @@ static int gfx_v12_0_set_priv_inst_fault_state(struct amdgpu_device *adev, static void gfx_v12_0_handle_priv_fault(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { - u8 me_id, pipe_id, queue_id; - struct amdgpu_ring *ring; - int i; - - me_id = (entry->ring_id & 0x0c) >> 2; - pipe_id = (entry->ring_id & 0x03) >> 0; - queue_id = (entry->ring_id & 0x70) >> 4; + u32 doorbell_offset = entry->src_data[0] & AMDGPU_CTXID0_DOORBELL_ID_MASK; + /* + * Try KQ first by ring_id; UQ as fallback. KCQ and UQ never share + * a HW slot (compute_hqd_mask contract). 
+ */ if (!adev->gfx.disable_kq) { + u8 me_id, pipe_id, queue_id; + struct amdgpu_ring *ring; + int i; + + me_id = (entry->ring_id & 0x0c) >> 2; + pipe_id = (entry->ring_id & 0x03) >> 0; + queue_id = (entry->ring_id & 0x70) >> 4; + switch (me_id) { case 0: for (i = 0; i < adev->gfx.num_gfx_rings; i++) { ring = &adev->gfx.gfx_ring[i]; if (ring->me == me_id && ring->pipe == pipe_id && - ring->queue == queue_id) + ring->queue == queue_id) { drm_sched_fault(&ring->sched); + return; + } } break; case 1: @@ -5048,8 +5063,10 @@ static void gfx_v12_0_handle_priv_fault(struct amdgpu_device *adev, for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = &adev->gfx.compute_ring[i]; if (ring->me == me_id && ring->pipe == pipe_id && - ring->queue == queue_id) + ring->queue == queue_id) { drm_sched_fault(&ring->sched); + return; + } } break; default: @@ -5057,6 +5074,11 @@ static void gfx_v12_0_handle_priv_fault(struct amdgpu_device *adev, break; } } + + /* No KQ matched: HW slot is a MES-scheduled user queue. 
*/ + if (adev->enable_mes && doorbell_offset) + amdgpu_userq_process_reset_irq(adev, entry->pasid, + doorbell_offset); } static int gfx_v12_0_priv_reg_irq(struct amdgpu_device *adev, @@ -5261,185 +5283,14 @@ static void gfx_v12_ip_dump(struct amdgpu_ip_block *ip_block) amdgpu_gfx_off_ctrl(adev, true); } -static bool gfx_v12_pipe_reset_support(struct amdgpu_device *adev) -{ - /* Disable the pipe reset until the CPFW fully support it.*/ - dev_warn_once(adev->dev, "The CPFW hasn't support pipe reset yet.\n"); - return false; -} - -static int gfx_v12_reset_gfx_pipe(struct amdgpu_ring *ring) -{ - struct amdgpu_device *adev = ring->adev; - uint32_t reset_pipe = 0, clean_pipe = 0; - int r; - - if (!gfx_v12_pipe_reset_support(adev)) - return -EOPNOTSUPP; - - gfx_v12_0_set_safe_mode(adev, 0); - mutex_lock(&adev->srbm_mutex); - soc24_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); - - switch (ring->pipe) { - case 0: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, - PFP_PIPE0_RESET, 1); - reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, - ME_PIPE0_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, - PFP_PIPE0_RESET, 0); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, - ME_PIPE0_RESET, 0); - break; - case 1: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, - PFP_PIPE1_RESET, 1); - reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, - ME_PIPE1_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, - PFP_PIPE1_RESET, 0); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, - ME_PIPE1_RESET, 0); - break; - default: - break; - } - - WREG32_SOC15(GC, 0, regCP_ME_CNTL, reset_pipe); - WREG32_SOC15(GC, 0, regCP_ME_CNTL, clean_pipe); - - r = (RREG32(SOC15_REG_OFFSET(GC, 0, regCP_GFX_RS64_INSTR_PNTR1)) << 2) - - RS64_FW_UC_START_ADDR_LO; - soc24_grbm_select(adev, 0, 0, 0, 0); - mutex_unlock(&adev->srbm_mutex); - gfx_v12_0_unset_safe_mode(adev, 0); - - dev_info(adev->dev, "The ring %s pipe reset: %s\n", ring->name, - r == 0 ? 
"successfully" : "failed"); - /* Sometimes the ME start pc counter can't cache correctly, so the - * PC check only as a reference and pipe reset result rely on the - * later ring test. - */ - return 0; -} - static int gfx_v12_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid, struct amdgpu_fence *timedout_fence) { struct amdgpu_device *adev = ring->adev; - bool use_mmio = false; - int r; - - amdgpu_ring_reset_helper_begin(ring, timedout_fence); - - r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, use_mmio, 0); - if (r) { - dev_warn(adev->dev, "reset via MES failed and try pipe reset %d\n", r); - r = gfx_v12_reset_gfx_pipe(ring); - if (r) - return r; - } - - if (use_mmio) { - r = gfx_v12_0_kgq_init_queue(ring, true); - if (r) { - dev_err(adev->dev, "failed to init kgq\n"); - return r; - } - - r = amdgpu_mes_map_legacy_queue(adev, ring, 0); - if (r) { - dev_err(adev->dev, "failed to remap kgq\n"); - return r; - } - } - - return amdgpu_ring_reset_helper_end(ring, timedout_fence); -} - -static int gfx_v12_0_reset_compute_pipe(struct amdgpu_ring *ring) -{ - struct amdgpu_device *adev = ring->adev; - uint32_t reset_pipe = 0, clean_pipe = 0; - int r = 0; - - if (!gfx_v12_pipe_reset_support(adev)) - return -EOPNOTSUPP; - - gfx_v12_0_set_safe_mode(adev, 0); - mutex_lock(&adev->srbm_mutex); - soc24_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); - - reset_pipe = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL); - clean_pipe = reset_pipe; + bool use_mmio = adev->gfx.me.use_mmio_for_reset; - if (adev->gfx.rs64_enable) { - switch (ring->pipe) { - case 0: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE0_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE0_RESET, 0); - break; - case 1: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE1_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE1_RESET, 0); - break; - case 2: - reset_pipe = REG_SET_FIELD(reset_pipe, 
CP_MEC_RS64_CNTL, - MEC_PIPE2_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE2_RESET, 0); - break; - case 3: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE3_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL, - MEC_PIPE3_RESET, 0); - break; - default: - break; - } - WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_pipe); - WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_pipe); - r = (RREG32_SOC15(GC, 0, regCP_MEC_RS64_INSTR_PNTR) << 2) - - RS64_FW_UC_START_ADDR_LO; - } else { - switch (ring->pipe) { - case 0: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, - MEC_ME1_PIPE0_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, - MEC_ME1_PIPE0_RESET, 0); - break; - case 1: - reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL, - MEC_ME1_PIPE1_RESET, 1); - clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL, - MEC_ME1_PIPE1_RESET, 0); - break; - default: - break; - } - WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_pipe); - WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_pipe); - /* Doesn't find the F32 MEC instruction pointer register, and suppose - * the driver won't run into the F32 mode. - */ - } - - soc24_grbm_select(adev, 0, 0, 0, 0); - mutex_unlock(&adev->srbm_mutex); - gfx_v12_0_unset_safe_mode(adev, 0); - - dev_info(adev->dev, "The ring %s pipe resets: %s\n", ring->name, - r == 0 ? 
"successfully" : "failed"); - /* Need the ring test to verify the pipe reset result.*/ - return 0; + return amdgpu_gfx_mes_reset_queue(ring, vmid, timedout_fence, use_mmio); } static int gfx_v12_0_reset_kcq(struct amdgpu_ring *ring, @@ -5447,30 +5298,8 @@ static int gfx_v12_0_reset_kcq(struct amdgpu_ring *ring, struct amdgpu_fence *timedout_fence) { struct amdgpu_device *adev = ring->adev; - int r; - - amdgpu_ring_reset_helper_begin(ring, timedout_fence); - - r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true, 0); - if (r) { - dev_warn(adev->dev, "fail(%d) to reset kcq and try pipe reset\n", r); - r = gfx_v12_0_reset_compute_pipe(ring); - if (r) - return r; - } - - r = gfx_v12_0_kcq_init_queue(ring, true); - if (r) { - dev_err(adev->dev, "failed to init kcq\n"); - return r; - } - r = amdgpu_mes_map_legacy_queue(adev, ring, 0); - if (r) { - dev_err(adev->dev, "failed to remap kcq\n"); - return r; - } - return amdgpu_ring_reset_helper_end(ring, timedout_fence); + return amdgpu_gfx_reset_mes_compute(adev, ring, timedout_fence, NULL, NULL, NULL); } static void gfx_v12_0_ring_begin_use(struct amdgpu_ring *ring) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c index e7e9f11b9754..e87f1baf5cb6 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c @@ -1287,6 +1287,8 @@ static int gfx_v12_1_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + mutex_init(&adev->gfx.mec.reset_mutex); + return 0; } @@ -3004,6 +3006,8 @@ static int gfx_v12_1_early_init(struct amdgpu_ip_block *ip_block) gfx_v12_1_init_rlcg_reg_access_ctrl(adev); + amdgpu_init_rlc_reg_funcs(adev); + return gfx_v12_1_init_microcode(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 70ba81e6b4d4..bee2ff6865f9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -1487,7 +1487,14 @@ static int 
gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev) /* bail if the compute ring is not ready */ if (!ring->sched.ready) - return 0; + return -EBUSY; + + if (amdgpu_in_reset(adev)) { + /* Set preempt condition to execute IB */ + amdgpu_ring_set_preempt_cond_exec(ring, true); + /* Flush HDP cache so the GPU can see the updated COND_EXEC value */ + amdgpu_device_flush_hdp(adev, NULL); + } tmp = RREG32(mmGB_EDC_MODE); WREG32(mmGB_EDC_MODE, 0); @@ -2028,6 +2035,11 @@ static int gfx_v8_0_sw_init(struct amdgpu_ip_block *ip_block) adev->gfx.compute_supported_reset = amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + if (!amdgpu_sriov_vf(adev) && !adev->debug_disable_ip_block_soft_reset) { + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_IP_BLOCK_SOFT_RESET; + adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_IP_BLOCK_SOFT_RESET; + } + return 0; } @@ -4703,12 +4715,14 @@ static int gfx_v8_0_cp_test_all_rings(struct amdgpu_device *adev) if (r) return r; + r = 0; + for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = &adev->gfx.compute_ring[i]; - amdgpu_ring_test_helper(ring); + r |= amdgpu_ring_test_helper(ring); } - return 0; + return r; } static int gfx_v8_0_cp_resume(struct amdgpu_device *adev) @@ -4868,14 +4882,12 @@ static int gfx_v8_0_hw_fini(struct amdgpu_ip_block *ip_block) } amdgpu_gfx_rlc_enter_safe_mode(adev, 0); - if (!gfx_v8_0_wait_for_idle(ip_block)) - gfx_v8_0_cp_enable(adev, false); - else + if (!amdgpu_in_reset(adev) && gfx_v8_0_wait_for_idle(ip_block)) pr_err("cp is busy, skip halt cp\n"); - if (!gfx_v8_0_wait_for_rlc_idle(adev)) - adev->gfx.rlc.funcs->stop(adev); - else - pr_err("rlc is busy, skip halt rlc\n"); + if (!amdgpu_in_reset(adev) && gfx_v8_0_wait_for_rlc_idle(adev)) + pr_err("rlc is busy\n"); + gfx_v8_0_cp_enable(adev, false); + adev->gfx.rlc.funcs->stop(adev); amdgpu_gfx_rlc_exit_safe_mode(adev, 0); return 0; @@ -4891,128 +4903,49 @@ static int gfx_v8_0_resume(struct amdgpu_ip_block *ip_block) return 
gfx_v8_0_hw_init(ip_block); } -static bool gfx_v8_0_check_soft_reset(struct amdgpu_ip_block *ip_block) +static int gfx_v8_0_soft_reset(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; u32 grbm_soft_reset = 0, srbm_soft_reset = 0; u32 tmp; + int i; + int r; - /* GRBM_STATUS */ - tmp = RREG32(mmGRBM_STATUS); - if (tmp & (GRBM_STATUS__PA_BUSY_MASK | GRBM_STATUS__SC_BUSY_MASK | - GRBM_STATUS__BCI_BUSY_MASK | GRBM_STATUS__SX_BUSY_MASK | - GRBM_STATUS__TA_BUSY_MASK | GRBM_STATUS__VGT_BUSY_MASK | - GRBM_STATUS__DB_BUSY_MASK | GRBM_STATUS__CB_BUSY_MASK | - GRBM_STATUS__GDS_BUSY_MASK | GRBM_STATUS__SPI_BUSY_MASK | - GRBM_STATUS__IA_BUSY_MASK | GRBM_STATUS__IA_BUSY_NO_DMA_MASK | - GRBM_STATUS__CP_BUSY_MASK | GRBM_STATUS__CP_COHERENCY_BUSY_MASK)) { - grbm_soft_reset = REG_SET_FIELD(grbm_soft_reset, - GRBM_SOFT_RESET, SOFT_RESET_CP, 1); - grbm_soft_reset = REG_SET_FIELD(grbm_soft_reset, - GRBM_SOFT_RESET, SOFT_RESET_GFX, 1); - srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, - SRBM_SOFT_RESET, SOFT_RESET_GRBM, 1); - } - - /* GRBM_STATUS2 */ - tmp = RREG32(mmGRBM_STATUS2); - if (REG_GET_FIELD(tmp, GRBM_STATUS2, RLC_BUSY)) - grbm_soft_reset = REG_SET_FIELD(grbm_soft_reset, - GRBM_SOFT_RESET, SOFT_RESET_RLC, 1); - - if (REG_GET_FIELD(tmp, GRBM_STATUS2, CPF_BUSY) || - REG_GET_FIELD(tmp, GRBM_STATUS2, CPC_BUSY) || - REG_GET_FIELD(tmp, GRBM_STATUS2, CPG_BUSY)) { - grbm_soft_reset = REG_SET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, - SOFT_RESET_CPF, 1); - grbm_soft_reset = REG_SET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, - SOFT_RESET_CPC, 1); - grbm_soft_reset = REG_SET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, - SOFT_RESET_CPG, 1); - srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, - SOFT_RESET_GRBM, 1); - } - - /* SRBM_STATUS */ - tmp = RREG32(mmSRBM_STATUS); - if (REG_GET_FIELD(tmp, SRBM_STATUS, GRBM_RQ_PENDING)) - srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, - SRBM_SOFT_RESET, SOFT_RESET_GRBM, 1); - if (REG_GET_FIELD(tmp, 
SRBM_STATUS, SEM_BUSY)) - srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, - SRBM_SOFT_RESET, SOFT_RESET_SEM, 1); - - if (grbm_soft_reset || srbm_soft_reset) { - adev->gfx.grbm_soft_reset = grbm_soft_reset; - adev->gfx.srbm_soft_reset = srbm_soft_reset; - return true; - } else { - adev->gfx.grbm_soft_reset = 0; - adev->gfx.srbm_soft_reset = 0; - return false; - } -} - -static int gfx_v8_0_pre_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - u32 grbm_soft_reset = 0; - - if ((!adev->gfx.grbm_soft_reset) && - (!adev->gfx.srbm_soft_reset)) - return 0; - - grbm_soft_reset = adev->gfx.grbm_soft_reset; - - /* stop the rlc */ - adev->gfx.rlc.funcs->stop(adev); + grbm_soft_reset = + REG_SET_FIELD(0, GRBM_SOFT_RESET, SOFT_RESET_RLC, 1) | + REG_SET_FIELD(0, GRBM_SOFT_RESET, SOFT_RESET_GFX, 1) | + REG_SET_FIELD(0, GRBM_SOFT_RESET, SOFT_RESET_CP, 1) | + REG_SET_FIELD(0, GRBM_SOFT_RESET, SOFT_RESET_CPF, 1) | + REG_SET_FIELD(0, GRBM_SOFT_RESET, SOFT_RESET_CPC, 1) | + REG_SET_FIELD(0, GRBM_SOFT_RESET, SOFT_RESET_CPG, 1); - if (REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CP) || - REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_GFX)) - /* Disable GFX parsing/prefetching */ - gfx_v8_0_cp_gfx_enable(adev, false); + srbm_soft_reset = + REG_SET_FIELD(0, SRBM_SOFT_RESET, SOFT_RESET_GRBM, 1) | + REG_SET_FIELD(0, SRBM_SOFT_RESET, SOFT_RESET_SEM, 1); - if (REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CP) || - REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CPF) || - REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CPC) || - REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CPG)) { - int i; + for (i = 0; i < adev->gfx.num_compute_rings; i++) { + struct amdgpu_ring *ring = &adev->gfx.compute_ring[i]; - for (i = 0; i < adev->gfx.num_compute_rings; i++) { - struct amdgpu_ring *ring = &adev->gfx.compute_ring[i]; + mutex_lock(&adev->srbm_mutex); + vi_srbm_select(adev, ring->me, 
ring->pipe, ring->queue, 0); + gfx_v8_0_deactivate_hqd(adev, 2); + vi_srbm_select(adev, 0, 0, 0, 0); + mutex_unlock(&adev->srbm_mutex); - mutex_lock(&adev->srbm_mutex); - vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0); - gfx_v8_0_deactivate_hqd(adev, 2); - vi_srbm_select(adev, 0, 0, 0, 0); - mutex_unlock(&adev->srbm_mutex); - } - /* Disable MEC parsing/prefetching */ - gfx_v8_0_cp_compute_enable(adev, false); + udelay(50); } - return 0; -} - -static int gfx_v8_0_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - u32 grbm_soft_reset = 0, srbm_soft_reset = 0; - u32 tmp; - - if ((!adev->gfx.grbm_soft_reset) && - (!adev->gfx.srbm_soft_reset)) - return 0; - - grbm_soft_reset = adev->gfx.grbm_soft_reset; - srbm_soft_reset = adev->gfx.srbm_soft_reset; + ip_block->version->funcs->set_clockgating_state(ip_block, AMD_CG_STATE_UNGATE); + ip_block->version->funcs->set_powergating_state(ip_block, AMD_PG_STATE_UNGATE); + ip_block->version->funcs->suspend(ip_block); if (grbm_soft_reset || srbm_soft_reset) { tmp = RREG32(mmGMCON_DEBUG); tmp = REG_SET_FIELD(tmp, GMCON_DEBUG, GFX_STALL, 1); tmp = REG_SET_FIELD(tmp, GMCON_DEBUG, GFX_CLEAR, 1); WREG32(mmGMCON_DEBUG, tmp); - udelay(50); + + udelay(100); } if (grbm_soft_reset) { @@ -5022,11 +4955,13 @@ static int gfx_v8_0_soft_reset(struct amdgpu_ip_block *ip_block) WREG32(mmGRBM_SOFT_RESET, tmp); tmp = RREG32(mmGRBM_SOFT_RESET); - udelay(50); + udelay(100); tmp &= ~grbm_soft_reset; WREG32(mmGRBM_SOFT_RESET, tmp); tmp = RREG32(mmGRBM_SOFT_RESET); + + udelay(100); } if (srbm_soft_reset) { @@ -5036,11 +4971,13 @@ static int gfx_v8_0_soft_reset(struct amdgpu_ip_block *ip_block) WREG32(mmSRBM_SOFT_RESET, tmp); tmp = RREG32(mmSRBM_SOFT_RESET); - udelay(50); + udelay(100); tmp &= ~srbm_soft_reset; WREG32(mmSRBM_SOFT_RESET, tmp); tmp = RREG32(mmSRBM_SOFT_RESET); + + udelay(100); } if (grbm_soft_reset || srbm_soft_reset) { @@ -5051,48 +4988,15 @@ static int gfx_v8_0_soft_reset(struct 
amdgpu_ip_block *ip_block) } /* Wait a little for things to settle down */ - udelay(50); + udelay(100); - return 0; -} - -static int gfx_v8_0_post_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - u32 grbm_soft_reset = 0; - - if ((!adev->gfx.grbm_soft_reset) && - (!adev->gfx.srbm_soft_reset)) - return 0; - - grbm_soft_reset = adev->gfx.grbm_soft_reset; - - if (REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CP) || - REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CPF) || - REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CPC) || - REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CPG)) { - int i; - - for (i = 0; i < adev->gfx.num_compute_rings; i++) { - struct amdgpu_ring *ring = &adev->gfx.compute_ring[i]; - - mutex_lock(&adev->srbm_mutex); - vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0); - gfx_v8_0_deactivate_hqd(adev, 2); - vi_srbm_select(adev, 0, 0, 0, 0); - mutex_unlock(&adev->srbm_mutex); - } - gfx_v8_0_kiq_resume(adev); - gfx_v8_0_kcq_resume(adev); - } - - if (REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_CP) || - REG_GET_FIELD(grbm_soft_reset, GRBM_SOFT_RESET, SOFT_RESET_GFX)) - gfx_v8_0_cp_gfx_resume(adev); + r = ip_block->version->funcs->resume(ip_block); + r |= ip_block->version->funcs->late_init(ip_block); + if (r) + return r; - gfx_v8_0_cp_test_all_rings(adev); - - adev->gfx.rlc.funcs->start(adev); + ip_block->version->funcs->set_clockgating_state(ip_block, AMD_CG_STATE_GATE); + ip_block->version->funcs->set_powergating_state(ip_block, AMD_PG_STATE_GATE); return 0; } @@ -6859,10 +6763,7 @@ static const struct amd_ip_funcs gfx_v8_0_ip_funcs = { .resume = gfx_v8_0_resume, .is_idle = gfx_v8_0_is_idle, .wait_for_idle = gfx_v8_0_wait_for_idle, - .check_soft_reset = gfx_v8_0_check_soft_reset, - .pre_soft_reset = gfx_v8_0_pre_soft_reset, .soft_reset = gfx_v8_0_soft_reset, - .post_soft_reset = gfx_v8_0_post_soft_reset, .set_clockgating_state = 
gfx_v8_0_set_clockgating_state, .set_powergating_state = gfx_v8_0_set_powergating_state, .get_clockgating_state = gfx_v8_0_get_clockgating_state, @@ -6923,10 +6824,12 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = { .get_wptr = gfx_v8_0_ring_get_wptr_compute, .set_wptr = gfx_v8_0_ring_set_wptr_compute, .emit_frame_size = + 5 + /* gfx_v8_0_ring_emit_init_cond_exec (from amdgpu_ib_schedule) */ 20 + /* gfx_v8_0_ring_emit_gds_switch */ 7 + /* gfx_v8_0_ring_emit_hdp_flush */ 5 + /* hdp_invalidate */ 7 + /* gfx_v8_0_ring_emit_pipeline_sync */ + 5 + /* gfx_v8_0_ring_emit_init_cond_exec (from amdgpu_vm_flush) */ VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */ 7 + 7 + 7 + /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */ 7 + /* gfx_v8_0_emit_mem_sync_compute */ @@ -6947,6 +6850,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = { .soft_recovery = gfx_v8_0_ring_soft_recovery, .emit_mem_sync = gfx_v8_0_emit_mem_sync_compute, .emit_wave_limit = gfx_v8_0_emit_wave_limit, + .init_cond_exec = gfx_v8_0_ring_emit_init_cond_exec, }; static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 3370f542e990..9f81fd715418 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4875,6 +4875,8 @@ static int gfx_v9_0_early_init(struct amdgpu_ip_block *ip_block) /* init rlcg reg access ctrl */ gfx_v9_0_init_rlcg_reg_access_ctrl(adev); + amdgpu_init_rlc_reg_funcs(adev); + return gfx_v9_0_init_microcode(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 2a36647b975a..b89cbc2df951 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -39,7 +39,6 @@ #include "gfx_v9_4_3.h" #include "gfx_v9_4_3_cleaner_shader.h" #include "amdgpu_xcp.h" -#include "amdgpu_aca.h" 
MODULE_FIRMWARE("amdgpu/gc_9_4_3_mec.bin"); MODULE_FIRMWARE("amdgpu/gc_9_4_4_mec.bin"); @@ -851,73 +850,6 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_3_gfx_funcs = { .get_hdp_flush_mask = &amdgpu_gfx_get_hdp_flush_mask, }; -static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle, - struct aca_bank *bank, enum aca_smu_type type, - void *data) -{ - struct aca_bank_info info; - u64 misc0; - u32 instlo; - int ret; - - ret = aca_bank_info_decode(bank, &info); - if (ret) - return ret; - - /* NOTE: overwrite info.die_id with xcd id for gfx */ - instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]); - instlo &= GENMASK(31, 1); - info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1; - - misc0 = bank->regs[ACA_REG_IDX_MISC0]; - - switch (type) { - case ACA_SMU_TYPE_UE: - bank->aca_err_type = ACA_ERROR_TYPE_UE; - ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, 1ULL); - break; - case ACA_SMU_TYPE_CE: - bank->aca_err_type = ACA_ERROR_TYPE_CE; - ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, - ACA_REG__MISC0__ERRCNT(misc0)); - break; - default: - return -EINVAL; - } - - return ret; -} - -static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - u32 instlo; - - instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]); - instlo &= GENMASK(31, 1); - switch (instlo) { - case mmSMNAID_XCD0_MCA_SMU: - case mmSMNAID_XCD1_MCA_SMU: - case mmSMNXCD_XCD0_MCA_SMU: - return true; - default: - break; - } - - return false; -} - -static const struct aca_bank_ops gfx_v9_4_3_aca_bank_ops = { - .aca_bank_parser = gfx_v9_4_3_aca_bank_parser, - .aca_bank_is_valid = gfx_v9_4_3_aca_bank_is_valid, -}; - -static const struct aca_info gfx_v9_4_3_aca_info = { - .hwip = ACA_HWIP_TYPE_SMU, - .mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK, - .bank_ops = &gfx_v9_4_3_aca_bank_ops, -}; - static int gfx_v9_4_3_gpu_early_init(struct amdgpu_device *adev) { 
adev->gfx.funcs = &gfx_v9_4_3_gfx_funcs; @@ -1107,22 +1039,24 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block) /* set up the compute queues - allocate horizontally across pipes */ for (xcc_id = 0; xcc_id < num_xcc; xcc_id++) { ring_id = 0; - for (i = 0; i < adev->gfx.mec.num_mec; ++i) { - for (j = 0; j < adev->gfx.mec.num_queue_per_pipe; j++) { - for (k = 0; k < adev->gfx.mec.num_pipe_per_mec; - k++) { - if (!amdgpu_gfx_is_mec_queue_enabled( - adev, xcc_id, i, k, j)) - continue; - - r = gfx_v9_4_3_compute_ring_init(adev, - ring_id, - xcc_id, - i, k, j); - if (r) - return r; - - ring_id++; + if (!adev->gfx.disable_kq) { + for (i = 0; i < adev->gfx.mec.num_mec; ++i) { + for (j = 0; j < adev->gfx.mec.num_queue_per_pipe; j++) { + for (k = 0; k < adev->gfx.mec.num_pipe_per_mec; + k++) { + if (!amdgpu_gfx_is_mec_queue_enabled( + adev, xcc_id, i, k, j)) + continue; + + r = gfx_v9_4_3_compute_ring_init(adev, + ring_id, + xcc_id, + i, k, j); + if (r) + return r; + + ring_id++; + } } } } @@ -2350,6 +2284,65 @@ static void gfx_v9_4_3_xcc_fini(struct amdgpu_device *adev, int xcc_id) gfx_v9_4_3_xcc_cp_compute_enable(adev, false, xcc_id); } +static int gfx_v9_4_3_set_userq_eop_interrupts(struct amdgpu_device *adev, + bool enable) +{ + int num_xcc = NUM_XCC(adev->gfx.xcc_mask); + unsigned int irq_type; + int m, p, xcc_id, r; + + if (adev->gfx.disable_kq) { + for (xcc_id = 0; xcc_id < num_xcc; xcc_id++) { + for (m = 0; m < adev->gfx.mec.num_mec; ++m) { + for (p = 0; p < adev->gfx.mec.num_pipe_per_mec; p++) { + irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + + (m * adev->gfx.mec.num_pipe_per_mec) + + p; + + if (enable) + r = amdgpu_irq_get(adev, &adev->gfx.eop_irq, + irq_type); + else + r = amdgpu_irq_put(adev, &adev->gfx.eop_irq, + irq_type); + if (r) { + if (!enable) + return r; + goto err_compute; + } + } + } + } + } + + return 0; + +err_compute: + for (p--; p >= 0; p--) { + irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + + (m * 
adev->gfx.mec.num_pipe_per_mec) + p; + amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + } + for (m--; m >= 0; m--) { + for (p = adev->gfx.mec.num_pipe_per_mec - 1; p >= 0; p--) { + irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + + (m * adev->gfx.mec.num_pipe_per_mec) + p; + amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + } + } + for (xcc_id--; xcc_id >= 0; xcc_id--) { /* each earlier XCC took one reference per (m, p); drop them all */ + for (m = adev->gfx.mec.num_mec - 1; m >= 0; m--) { + for (p = adev->gfx.mec.num_pipe_per_mec - 1; p >= 0; p--) { + irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + + (m * adev->gfx.mec.num_pipe_per_mec) + p; + amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + } + } + } + + return r; +} + static int gfx_v9_4_3_hw_init(struct amdgpu_ip_block *ip_block) { int r; @@ -2382,9 +2375,14 @@ static int gfx_v9_4_3_hw_init(struct amdgpu_ip_block *ip_block) r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0); if (r) goto err_bad_op; + r = gfx_v9_4_3_set_userq_eop_interrupts(adev, true); + if (r) + goto err_bad_eop; return 0; +err_bad_eop: + amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); err_bad_op: amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); err_priv_inst: @@ -2467,6 +2465,7 @@ static int gfx_v9_4_3_hw_fini(struct amdgpu_ip_block *ip_block) amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); + gfx_v9_4_3_set_userq_eop_interrupts(adev, false); num_xcc = NUM_XCC(adev->gfx.xcc_mask); for (i = 0; i < num_xcc; i++) { @@ -2612,8 +2611,24 @@ static int gfx_v9_4_3_early_init(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; - adev->gfx.num_compute_rings = min(amdgpu_gfx_get_num_kcq(adev), - AMDGPU_MAX_COMPUTE_RINGS); + switch (amdgpu_user_queue) { + case -1: + case 0: + default: + adev->gfx.disable_kq = false; + adev->gfx.disable_uq = true; + break; + case 2: + adev->gfx.disable_kq = true; + adev->gfx.disable_uq = 
true; + break; + } + + if (adev->gfx.disable_kq) + 
adev->gfx.num_compute_rings = 0; + else + adev->gfx.num_compute_rings = min(amdgpu_gfx_get_num_kcq(adev), + AMDGPU_MAX_COMPUTE_RINGS); gfx_v9_4_3_set_kiq_pm4_funcs(adev); gfx_v9_4_3_set_ring_funcs(adev); gfx_v9_4_3_set_irq_funcs(adev); @@ -2623,6 +2638,8 @@ static int gfx_v9_4_3_early_init(struct amdgpu_ip_block *ip_block) /* init rlcg reg access ctrl */ gfx_v9_4_3_init_rlcg_reg_access_ctrl(adev); + amdgpu_init_rlc_reg_funcs(adev); + return gfx_v9_4_3_init_microcode(adev); } @@ -3709,872 +3726,6 @@ pipe_reset: return amdgpu_ring_reset_helper_end(ring, timedout_fence); } -enum amdgpu_gfx_cp_ras_mem_id { - AMDGPU_GFX_CP_MEM1 = 1, - AMDGPU_GFX_CP_MEM2, - AMDGPU_GFX_CP_MEM3, - AMDGPU_GFX_CP_MEM4, - AMDGPU_GFX_CP_MEM5, -}; - -enum amdgpu_gfx_gcea_ras_mem_id { - AMDGPU_GFX_GCEA_IOWR_CMDMEM = 4, - AMDGPU_GFX_GCEA_IORD_CMDMEM, - AMDGPU_GFX_GCEA_GMIWR_CMDMEM, - AMDGPU_GFX_GCEA_GMIRD_CMDMEM, - AMDGPU_GFX_GCEA_DRAMWR_CMDMEM, - AMDGPU_GFX_GCEA_DRAMRD_CMDMEM, - AMDGPU_GFX_GCEA_MAM_DMEM0, - AMDGPU_GFX_GCEA_MAM_DMEM1, - AMDGPU_GFX_GCEA_MAM_DMEM2, - AMDGPU_GFX_GCEA_MAM_DMEM3, - AMDGPU_GFX_GCEA_MAM_AMEM0, - AMDGPU_GFX_GCEA_MAM_AMEM1, - AMDGPU_GFX_GCEA_MAM_AMEM2, - AMDGPU_GFX_GCEA_MAM_AMEM3, - AMDGPU_GFX_GCEA_MAM_AFLUSH_BUFFER, - AMDGPU_GFX_GCEA_WRET_TAGMEM, - AMDGPU_GFX_GCEA_RRET_TAGMEM, - AMDGPU_GFX_GCEA_IOWR_DATAMEM, - AMDGPU_GFX_GCEA_GMIWR_DATAMEM, - AMDGPU_GFX_GCEA_DRAM_DATAMEM, -}; - -enum amdgpu_gfx_gc_cane_ras_mem_id { - AMDGPU_GFX_GC_CANE_MEM0 = 0, -}; - -enum amdgpu_gfx_gcutcl2_ras_mem_id { - AMDGPU_GFX_GCUTCL2_MEM2P512X95 = 160, -}; - -enum amdgpu_gfx_gds_ras_mem_id { - AMDGPU_GFX_GDS_MEM0 = 0, -}; - -enum amdgpu_gfx_lds_ras_mem_id { - AMDGPU_GFX_LDS_BANK0 = 0, - AMDGPU_GFX_LDS_BANK1, - AMDGPU_GFX_LDS_BANK2, - AMDGPU_GFX_LDS_BANK3, - AMDGPU_GFX_LDS_BANK4, - AMDGPU_GFX_LDS_BANK5, - AMDGPU_GFX_LDS_BANK6, - AMDGPU_GFX_LDS_BANK7, - AMDGPU_GFX_LDS_BANK8, - AMDGPU_GFX_LDS_BANK9, - AMDGPU_GFX_LDS_BANK10, - AMDGPU_GFX_LDS_BANK11, - AMDGPU_GFX_LDS_BANK12, - AMDGPU_GFX_LDS_BANK13, 
- AMDGPU_GFX_LDS_BANK14, - AMDGPU_GFX_LDS_BANK15, - AMDGPU_GFX_LDS_BANK16, - AMDGPU_GFX_LDS_BANK17, - AMDGPU_GFX_LDS_BANK18, - AMDGPU_GFX_LDS_BANK19, - AMDGPU_GFX_LDS_BANK20, - AMDGPU_GFX_LDS_BANK21, - AMDGPU_GFX_LDS_BANK22, - AMDGPU_GFX_LDS_BANK23, - AMDGPU_GFX_LDS_BANK24, - AMDGPU_GFX_LDS_BANK25, - AMDGPU_GFX_LDS_BANK26, - AMDGPU_GFX_LDS_BANK27, - AMDGPU_GFX_LDS_BANK28, - AMDGPU_GFX_LDS_BANK29, - AMDGPU_GFX_LDS_BANK30, - AMDGPU_GFX_LDS_BANK31, - AMDGPU_GFX_LDS_SP_BUFFER_A, - AMDGPU_GFX_LDS_SP_BUFFER_B, -}; - -enum amdgpu_gfx_rlc_ras_mem_id { - AMDGPU_GFX_RLC_GPMF32 = 1, - AMDGPU_GFX_RLC_RLCVF32, - AMDGPU_GFX_RLC_SCRATCH, - AMDGPU_GFX_RLC_SRM_ARAM, - AMDGPU_GFX_RLC_SRM_DRAM, - AMDGPU_GFX_RLC_TCTAG, - AMDGPU_GFX_RLC_SPM_SE, - AMDGPU_GFX_RLC_SPM_GRBMT, -}; - -enum amdgpu_gfx_sp_ras_mem_id { - AMDGPU_GFX_SP_SIMDID0 = 0, -}; - -enum amdgpu_gfx_spi_ras_mem_id { - AMDGPU_GFX_SPI_MEM0 = 0, - AMDGPU_GFX_SPI_MEM1, - AMDGPU_GFX_SPI_MEM2, - AMDGPU_GFX_SPI_MEM3, -}; - -enum amdgpu_gfx_sqc_ras_mem_id { - AMDGPU_GFX_SQC_INST_CACHE_A = 100, - AMDGPU_GFX_SQC_INST_CACHE_B = 101, - AMDGPU_GFX_SQC_INST_CACHE_TAG_A = 102, - AMDGPU_GFX_SQC_INST_CACHE_TAG_B = 103, - AMDGPU_GFX_SQC_INST_CACHE_MISS_FIFO_A = 104, - AMDGPU_GFX_SQC_INST_CACHE_MISS_FIFO_B = 105, - AMDGPU_GFX_SQC_INST_CACHE_GATCL1_MISS_FIFO_A = 106, - AMDGPU_GFX_SQC_INST_CACHE_GATCL1_MISS_FIFO_B = 107, - AMDGPU_GFX_SQC_DATA_CACHE_A = 200, - AMDGPU_GFX_SQC_DATA_CACHE_B = 201, - AMDGPU_GFX_SQC_DATA_CACHE_TAG_A = 202, - AMDGPU_GFX_SQC_DATA_CACHE_TAG_B = 203, - AMDGPU_GFX_SQC_DATA_CACHE_MISS_FIFO_A = 204, - AMDGPU_GFX_SQC_DATA_CACHE_MISS_FIFO_B = 205, - AMDGPU_GFX_SQC_DATA_CACHE_HIT_FIFO_A = 206, - AMDGPU_GFX_SQC_DATA_CACHE_HIT_FIFO_B = 207, - AMDGPU_GFX_SQC_DIRTY_BIT_A = 208, - AMDGPU_GFX_SQC_DIRTY_BIT_B = 209, - AMDGPU_GFX_SQC_WRITE_DATA_BUFFER_CU0 = 210, - AMDGPU_GFX_SQC_WRITE_DATA_BUFFER_CU1 = 211, - AMDGPU_GFX_SQC_UTCL1_MISS_LFIFO_DATA_CACHE_A = 212, - AMDGPU_GFX_SQC_UTCL1_MISS_LFIFO_DATA_CACHE_B = 213, - 
AMDGPU_GFX_SQC_UTCL1_MISS_LFIFO_INST_CACHE = 108, -}; - -enum amdgpu_gfx_sq_ras_mem_id { - AMDGPU_GFX_SQ_SGPR_MEM0 = 0, - AMDGPU_GFX_SQ_SGPR_MEM1, - AMDGPU_GFX_SQ_SGPR_MEM2, - AMDGPU_GFX_SQ_SGPR_MEM3, -}; - -enum amdgpu_gfx_ta_ras_mem_id { - AMDGPU_GFX_TA_FS_AFIFO_RAM_LO = 1, - AMDGPU_GFX_TA_FS_AFIFO_RAM_HI, - AMDGPU_GFX_TA_FS_CFIFO_RAM, - AMDGPU_GFX_TA_FSX_LFIFO, - AMDGPU_GFX_TA_FS_DFIFO_RAM, -}; - -enum amdgpu_gfx_tcc_ras_mem_id { - AMDGPU_GFX_TCC_MEM1 = 1, -}; - -enum amdgpu_gfx_tca_ras_mem_id { - AMDGPU_GFX_TCA_MEM1 = 1, -}; - -enum amdgpu_gfx_tci_ras_mem_id { - AMDGPU_GFX_TCIW_MEM = 1, -}; - -enum amdgpu_gfx_tcp_ras_mem_id { - AMDGPU_GFX_TCP_LFIFO0 = 1, - AMDGPU_GFX_TCP_SET0BANK0_RAM, - AMDGPU_GFX_TCP_SET0BANK1_RAM, - AMDGPU_GFX_TCP_SET0BANK2_RAM, - AMDGPU_GFX_TCP_SET0BANK3_RAM, - AMDGPU_GFX_TCP_SET1BANK0_RAM, - AMDGPU_GFX_TCP_SET1BANK1_RAM, - AMDGPU_GFX_TCP_SET1BANK2_RAM, - AMDGPU_GFX_TCP_SET1BANK3_RAM, - AMDGPU_GFX_TCP_SET2BANK0_RAM, - AMDGPU_GFX_TCP_SET2BANK1_RAM, - AMDGPU_GFX_TCP_SET2BANK2_RAM, - AMDGPU_GFX_TCP_SET2BANK3_RAM, - AMDGPU_GFX_TCP_SET3BANK0_RAM, - AMDGPU_GFX_TCP_SET3BANK1_RAM, - AMDGPU_GFX_TCP_SET3BANK2_RAM, - AMDGPU_GFX_TCP_SET3BANK3_RAM, - AMDGPU_GFX_TCP_VM_FIFO, - AMDGPU_GFX_TCP_DB_TAGRAM0, - AMDGPU_GFX_TCP_DB_TAGRAM1, - AMDGPU_GFX_TCP_DB_TAGRAM2, - AMDGPU_GFX_TCP_DB_TAGRAM3, - AMDGPU_GFX_TCP_UTCL1_LFIFO_PROBE0, - AMDGPU_GFX_TCP_UTCL1_LFIFO_PROBE1, - AMDGPU_GFX_TCP_CMD_FIFO, -}; - -enum amdgpu_gfx_td_ras_mem_id { - AMDGPU_GFX_TD_UTD_CS_FIFO_MEM = 1, - AMDGPU_GFX_TD_UTD_SS_FIFO_LO_MEM, - AMDGPU_GFX_TD_UTD_SS_FIFO_HI_MEM, -}; - -enum amdgpu_gfx_tcx_ras_mem_id { - AMDGPU_GFX_TCX_FIFOD0 = 0, - AMDGPU_GFX_TCX_FIFOD1, - AMDGPU_GFX_TCX_FIFOD2, - AMDGPU_GFX_TCX_FIFOD3, - AMDGPU_GFX_TCX_FIFOD4, - AMDGPU_GFX_TCX_FIFOD5, - AMDGPU_GFX_TCX_FIFOD6, - AMDGPU_GFX_TCX_FIFOD7, - AMDGPU_GFX_TCX_FIFOB0, - AMDGPU_GFX_TCX_FIFOB1, - AMDGPU_GFX_TCX_FIFOB2, - AMDGPU_GFX_TCX_FIFOB3, - AMDGPU_GFX_TCX_FIFOB4, - AMDGPU_GFX_TCX_FIFOB5, - AMDGPU_GFX_TCX_FIFOB6, - 
AMDGPU_GFX_TCX_FIFOB7, - AMDGPU_GFX_TCX_FIFOA0, - AMDGPU_GFX_TCX_FIFOA1, - AMDGPU_GFX_TCX_FIFOA2, - AMDGPU_GFX_TCX_FIFOA3, - AMDGPU_GFX_TCX_FIFOA4, - AMDGPU_GFX_TCX_FIFOA5, - AMDGPU_GFX_TCX_FIFOA6, - AMDGPU_GFX_TCX_FIFOA7, - AMDGPU_GFX_TCX_CFIFO0, - AMDGPU_GFX_TCX_CFIFO1, - AMDGPU_GFX_TCX_CFIFO2, - AMDGPU_GFX_TCX_CFIFO3, - AMDGPU_GFX_TCX_CFIFO4, - AMDGPU_GFX_TCX_CFIFO5, - AMDGPU_GFX_TCX_CFIFO6, - AMDGPU_GFX_TCX_CFIFO7, - AMDGPU_GFX_TCX_FIFO_ACKB0, - AMDGPU_GFX_TCX_FIFO_ACKB1, - AMDGPU_GFX_TCX_FIFO_ACKB2, - AMDGPU_GFX_TCX_FIFO_ACKB3, - AMDGPU_GFX_TCX_FIFO_ACKB4, - AMDGPU_GFX_TCX_FIFO_ACKB5, - AMDGPU_GFX_TCX_FIFO_ACKB6, - AMDGPU_GFX_TCX_FIFO_ACKB7, - AMDGPU_GFX_TCX_FIFO_ACKD0, - AMDGPU_GFX_TCX_FIFO_ACKD1, - AMDGPU_GFX_TCX_FIFO_ACKD2, - AMDGPU_GFX_TCX_FIFO_ACKD3, - AMDGPU_GFX_TCX_FIFO_ACKD4, - AMDGPU_GFX_TCX_FIFO_ACKD5, - AMDGPU_GFX_TCX_FIFO_ACKD6, - AMDGPU_GFX_TCX_FIFO_ACKD7, - AMDGPU_GFX_TCX_DST_FIFOA0, - AMDGPU_GFX_TCX_DST_FIFOA1, - AMDGPU_GFX_TCX_DST_FIFOA2, - AMDGPU_GFX_TCX_DST_FIFOA3, - AMDGPU_GFX_TCX_DST_FIFOA4, - AMDGPU_GFX_TCX_DST_FIFOA5, - AMDGPU_GFX_TCX_DST_FIFOA6, - AMDGPU_GFX_TCX_DST_FIFOA7, - AMDGPU_GFX_TCX_DST_FIFOB0, - AMDGPU_GFX_TCX_DST_FIFOB1, - AMDGPU_GFX_TCX_DST_FIFOB2, - AMDGPU_GFX_TCX_DST_FIFOB3, - AMDGPU_GFX_TCX_DST_FIFOB4, - AMDGPU_GFX_TCX_DST_FIFOB5, - AMDGPU_GFX_TCX_DST_FIFOB6, - AMDGPU_GFX_TCX_DST_FIFOB7, - AMDGPU_GFX_TCX_DST_FIFOD0, - AMDGPU_GFX_TCX_DST_FIFOD1, - AMDGPU_GFX_TCX_DST_FIFOD2, - AMDGPU_GFX_TCX_DST_FIFOD3, - AMDGPU_GFX_TCX_DST_FIFOD4, - AMDGPU_GFX_TCX_DST_FIFOD5, - AMDGPU_GFX_TCX_DST_FIFOD6, - AMDGPU_GFX_TCX_DST_FIFOD7, - AMDGPU_GFX_TCX_DST_FIFO_ACKB0, - AMDGPU_GFX_TCX_DST_FIFO_ACKB1, - AMDGPU_GFX_TCX_DST_FIFO_ACKB2, - AMDGPU_GFX_TCX_DST_FIFO_ACKB3, - AMDGPU_GFX_TCX_DST_FIFO_ACKB4, - AMDGPU_GFX_TCX_DST_FIFO_ACKB5, - AMDGPU_GFX_TCX_DST_FIFO_ACKB6, - AMDGPU_GFX_TCX_DST_FIFO_ACKB7, - AMDGPU_GFX_TCX_DST_FIFO_ACKD0, - AMDGPU_GFX_TCX_DST_FIFO_ACKD1, - AMDGPU_GFX_TCX_DST_FIFO_ACKD2, - AMDGPU_GFX_TCX_DST_FIFO_ACKD3, - 
AMDGPU_GFX_TCX_DST_FIFO_ACKD4, - AMDGPU_GFX_TCX_DST_FIFO_ACKD5, - AMDGPU_GFX_TCX_DST_FIFO_ACKD6, - AMDGPU_GFX_TCX_DST_FIFO_ACKD7, -}; - -enum amdgpu_gfx_atc_l2_ras_mem_id { - AMDGPU_GFX_ATC_L2_MEM0 = 0, -}; - -enum amdgpu_gfx_utcl2_ras_mem_id { - AMDGPU_GFX_UTCL2_MEM0 = 0, -}; - -enum amdgpu_gfx_vml2_ras_mem_id { - AMDGPU_GFX_VML2_MEM0 = 0, -}; - -enum amdgpu_gfx_vml2_walker_ras_mem_id { - AMDGPU_GFX_VML2_WALKER_MEM0 = 0, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_cp_mem_list[] = { - {AMDGPU_GFX_CP_MEM1, "CP_MEM1"}, - {AMDGPU_GFX_CP_MEM2, "CP_MEM2"}, - {AMDGPU_GFX_CP_MEM3, "CP_MEM3"}, - {AMDGPU_GFX_CP_MEM4, "CP_MEM4"}, - {AMDGPU_GFX_CP_MEM5, "CP_MEM5"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_gcea_mem_list[] = { - {AMDGPU_GFX_GCEA_IOWR_CMDMEM, "GCEA_IOWR_CMDMEM"}, - {AMDGPU_GFX_GCEA_IORD_CMDMEM, "GCEA_IORD_CMDMEM"}, - {AMDGPU_GFX_GCEA_GMIWR_CMDMEM, "GCEA_GMIWR_CMDMEM"}, - {AMDGPU_GFX_GCEA_GMIRD_CMDMEM, "GCEA_GMIRD_CMDMEM"}, - {AMDGPU_GFX_GCEA_DRAMWR_CMDMEM, "GCEA_DRAMWR_CMDMEM"}, - {AMDGPU_GFX_GCEA_DRAMRD_CMDMEM, "GCEA_DRAMRD_CMDMEM"}, - {AMDGPU_GFX_GCEA_MAM_DMEM0, "GCEA_MAM_DMEM0"}, - {AMDGPU_GFX_GCEA_MAM_DMEM1, "GCEA_MAM_DMEM1"}, - {AMDGPU_GFX_GCEA_MAM_DMEM2, "GCEA_MAM_DMEM2"}, - {AMDGPU_GFX_GCEA_MAM_DMEM3, "GCEA_MAM_DMEM3"}, - {AMDGPU_GFX_GCEA_MAM_AMEM0, "GCEA_MAM_AMEM0"}, - {AMDGPU_GFX_GCEA_MAM_AMEM1, "GCEA_MAM_AMEM1"}, - {AMDGPU_GFX_GCEA_MAM_AMEM2, "GCEA_MAM_AMEM2"}, - {AMDGPU_GFX_GCEA_MAM_AMEM3, "GCEA_MAM_AMEM3"}, - {AMDGPU_GFX_GCEA_MAM_AFLUSH_BUFFER, "GCEA_MAM_AFLUSH_BUFFER"}, - {AMDGPU_GFX_GCEA_WRET_TAGMEM, "GCEA_WRET_TAGMEM"}, - {AMDGPU_GFX_GCEA_RRET_TAGMEM, "GCEA_RRET_TAGMEM"}, - {AMDGPU_GFX_GCEA_IOWR_DATAMEM, "GCEA_IOWR_DATAMEM"}, - {AMDGPU_GFX_GCEA_GMIWR_DATAMEM, "GCEA_GMIWR_DATAMEM"}, - {AMDGPU_GFX_GCEA_DRAM_DATAMEM, "GCEA_DRAM_DATAMEM"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_gc_cane_mem_list[] = { - {AMDGPU_GFX_GC_CANE_MEM0, "GC_CANE_MEM0"}, -}; - -static const 
struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_gcutcl2_mem_list[] = { - {AMDGPU_GFX_GCUTCL2_MEM2P512X95, "GCUTCL2_MEM2P512X95"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_gds_mem_list[] = { - {AMDGPU_GFX_GDS_MEM0, "GDS_MEM"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_lds_mem_list[] = { - {AMDGPU_GFX_LDS_BANK0, "LDS_BANK0"}, - {AMDGPU_GFX_LDS_BANK1, "LDS_BANK1"}, - {AMDGPU_GFX_LDS_BANK2, "LDS_BANK2"}, - {AMDGPU_GFX_LDS_BANK3, "LDS_BANK3"}, - {AMDGPU_GFX_LDS_BANK4, "LDS_BANK4"}, - {AMDGPU_GFX_LDS_BANK5, "LDS_BANK5"}, - {AMDGPU_GFX_LDS_BANK6, "LDS_BANK6"}, - {AMDGPU_GFX_LDS_BANK7, "LDS_BANK7"}, - {AMDGPU_GFX_LDS_BANK8, "LDS_BANK8"}, - {AMDGPU_GFX_LDS_BANK9, "LDS_BANK9"}, - {AMDGPU_GFX_LDS_BANK10, "LDS_BANK10"}, - {AMDGPU_GFX_LDS_BANK11, "LDS_BANK11"}, - {AMDGPU_GFX_LDS_BANK12, "LDS_BANK12"}, - {AMDGPU_GFX_LDS_BANK13, "LDS_BANK13"}, - {AMDGPU_GFX_LDS_BANK14, "LDS_BANK14"}, - {AMDGPU_GFX_LDS_BANK15, "LDS_BANK15"}, - {AMDGPU_GFX_LDS_BANK16, "LDS_BANK16"}, - {AMDGPU_GFX_LDS_BANK17, "LDS_BANK17"}, - {AMDGPU_GFX_LDS_BANK18, "LDS_BANK18"}, - {AMDGPU_GFX_LDS_BANK19, "LDS_BANK19"}, - {AMDGPU_GFX_LDS_BANK20, "LDS_BANK20"}, - {AMDGPU_GFX_LDS_BANK21, "LDS_BANK21"}, - {AMDGPU_GFX_LDS_BANK22, "LDS_BANK22"}, - {AMDGPU_GFX_LDS_BANK23, "LDS_BANK23"}, - {AMDGPU_GFX_LDS_BANK24, "LDS_BANK24"}, - {AMDGPU_GFX_LDS_BANK25, "LDS_BANK25"}, - {AMDGPU_GFX_LDS_BANK26, "LDS_BANK26"}, - {AMDGPU_GFX_LDS_BANK27, "LDS_BANK27"}, - {AMDGPU_GFX_LDS_BANK28, "LDS_BANK28"}, - {AMDGPU_GFX_LDS_BANK29, "LDS_BANK29"}, - {AMDGPU_GFX_LDS_BANK30, "LDS_BANK30"}, - {AMDGPU_GFX_LDS_BANK31, "LDS_BANK31"}, - {AMDGPU_GFX_LDS_SP_BUFFER_A, "LDS_SP_BUFFER_A"}, - {AMDGPU_GFX_LDS_SP_BUFFER_B, "LDS_SP_BUFFER_B"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_rlc_mem_list[] = { - {AMDGPU_GFX_RLC_GPMF32, "RLC_GPMF32"}, - {AMDGPU_GFX_RLC_RLCVF32, "RLC_RLCVF32"}, - {AMDGPU_GFX_RLC_SCRATCH, "RLC_SCRATCH"}, - {AMDGPU_GFX_RLC_SRM_ARAM, "RLC_SRM_ARAM"}, - 
{AMDGPU_GFX_RLC_SRM_DRAM, "RLC_SRM_DRAM"}, - {AMDGPU_GFX_RLC_TCTAG, "RLC_TCTAG"}, - {AMDGPU_GFX_RLC_SPM_SE, "RLC_SPM_SE"}, - {AMDGPU_GFX_RLC_SPM_GRBMT, "RLC_SPM_GRBMT"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_sp_mem_list[] = { - {AMDGPU_GFX_SP_SIMDID0, "SP_SIMDID0"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_spi_mem_list[] = { - {AMDGPU_GFX_SPI_MEM0, "SPI_MEM0"}, - {AMDGPU_GFX_SPI_MEM1, "SPI_MEM1"}, - {AMDGPU_GFX_SPI_MEM2, "SPI_MEM2"}, - {AMDGPU_GFX_SPI_MEM3, "SPI_MEM3"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_sqc_mem_list[] = { - {AMDGPU_GFX_SQC_INST_CACHE_A, "SQC_INST_CACHE_A"}, - {AMDGPU_GFX_SQC_INST_CACHE_B, "SQC_INST_CACHE_B"}, - {AMDGPU_GFX_SQC_INST_CACHE_TAG_A, "SQC_INST_CACHE_TAG_A"}, - {AMDGPU_GFX_SQC_INST_CACHE_TAG_B, "SQC_INST_CACHE_TAG_B"}, - {AMDGPU_GFX_SQC_INST_CACHE_MISS_FIFO_A, "SQC_INST_CACHE_MISS_FIFO_A"}, - {AMDGPU_GFX_SQC_INST_CACHE_MISS_FIFO_B, "SQC_INST_CACHE_MISS_FIFO_B"}, - {AMDGPU_GFX_SQC_INST_CACHE_GATCL1_MISS_FIFO_A, "SQC_INST_CACHE_GATCL1_MISS_FIFO_A"}, - {AMDGPU_GFX_SQC_INST_CACHE_GATCL1_MISS_FIFO_B, "SQC_INST_CACHE_GATCL1_MISS_FIFO_B"}, - {AMDGPU_GFX_SQC_DATA_CACHE_A, "SQC_DATA_CACHE_A"}, - {AMDGPU_GFX_SQC_DATA_CACHE_B, "SQC_DATA_CACHE_B"}, - {AMDGPU_GFX_SQC_DATA_CACHE_TAG_A, "SQC_DATA_CACHE_TAG_A"}, - {AMDGPU_GFX_SQC_DATA_CACHE_TAG_B, "SQC_DATA_CACHE_TAG_B"}, - {AMDGPU_GFX_SQC_DATA_CACHE_MISS_FIFO_A, "SQC_DATA_CACHE_MISS_FIFO_A"}, - {AMDGPU_GFX_SQC_DATA_CACHE_MISS_FIFO_B, "SQC_DATA_CACHE_MISS_FIFO_B"}, - {AMDGPU_GFX_SQC_DATA_CACHE_HIT_FIFO_A, "SQC_DATA_CACHE_HIT_FIFO_A"}, - {AMDGPU_GFX_SQC_DATA_CACHE_HIT_FIFO_B, "SQC_DATA_CACHE_HIT_FIFO_B"}, - {AMDGPU_GFX_SQC_DIRTY_BIT_A, "SQC_DIRTY_BIT_A"}, - {AMDGPU_GFX_SQC_DIRTY_BIT_B, "SQC_DIRTY_BIT_B"}, - {AMDGPU_GFX_SQC_WRITE_DATA_BUFFER_CU0, "SQC_WRITE_DATA_BUFFER_CU0"}, - {AMDGPU_GFX_SQC_WRITE_DATA_BUFFER_CU1, "SQC_WRITE_DATA_BUFFER_CU1"}, - {AMDGPU_GFX_SQC_UTCL1_MISS_LFIFO_DATA_CACHE_A, 
"SQC_UTCL1_MISS_LFIFO_DATA_CACHE_A"}, - {AMDGPU_GFX_SQC_UTCL1_MISS_LFIFO_DATA_CACHE_B, "SQC_UTCL1_MISS_LFIFO_DATA_CACHE_B"}, - {AMDGPU_GFX_SQC_UTCL1_MISS_LFIFO_INST_CACHE, "SQC_UTCL1_MISS_LFIFO_INST_CACHE"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_sq_mem_list[] = { - {AMDGPU_GFX_SQ_SGPR_MEM0, "SQ_SGPR_MEM0"}, - {AMDGPU_GFX_SQ_SGPR_MEM1, "SQ_SGPR_MEM1"}, - {AMDGPU_GFX_SQ_SGPR_MEM2, "SQ_SGPR_MEM2"}, - {AMDGPU_GFX_SQ_SGPR_MEM3, "SQ_SGPR_MEM3"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_ta_mem_list[] = { - {AMDGPU_GFX_TA_FS_AFIFO_RAM_LO, "TA_FS_AFIFO_RAM_LO"}, - {AMDGPU_GFX_TA_FS_AFIFO_RAM_HI, "TA_FS_AFIFO_RAM_HI"}, - {AMDGPU_GFX_TA_FS_CFIFO_RAM, "TA_FS_CFIFO_RAM"}, - {AMDGPU_GFX_TA_FSX_LFIFO, "TA_FSX_LFIFO"}, - {AMDGPU_GFX_TA_FS_DFIFO_RAM, "TA_FS_DFIFO_RAM"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_tcc_mem_list[] = { - {AMDGPU_GFX_TCC_MEM1, "TCC_MEM1"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_tca_mem_list[] = { - {AMDGPU_GFX_TCA_MEM1, "TCA_MEM1"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_tci_mem_list[] = { - {AMDGPU_GFX_TCIW_MEM, "TCIW_MEM"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_tcp_mem_list[] = { - {AMDGPU_GFX_TCP_LFIFO0, "TCP_LFIFO0"}, - {AMDGPU_GFX_TCP_SET0BANK0_RAM, "TCP_SET0BANK0_RAM"}, - {AMDGPU_GFX_TCP_SET0BANK1_RAM, "TCP_SET0BANK1_RAM"}, - {AMDGPU_GFX_TCP_SET0BANK2_RAM, "TCP_SET0BANK2_RAM"}, - {AMDGPU_GFX_TCP_SET0BANK3_RAM, "TCP_SET0BANK3_RAM"}, - {AMDGPU_GFX_TCP_SET1BANK0_RAM, "TCP_SET1BANK0_RAM"}, - {AMDGPU_GFX_TCP_SET1BANK1_RAM, "TCP_SET1BANK1_RAM"}, - {AMDGPU_GFX_TCP_SET1BANK2_RAM, "TCP_SET1BANK2_RAM"}, - {AMDGPU_GFX_TCP_SET1BANK3_RAM, "TCP_SET1BANK3_RAM"}, - {AMDGPU_GFX_TCP_SET2BANK0_RAM, "TCP_SET2BANK0_RAM"}, - {AMDGPU_GFX_TCP_SET2BANK1_RAM, "TCP_SET2BANK1_RAM"}, - {AMDGPU_GFX_TCP_SET2BANK2_RAM, "TCP_SET2BANK2_RAM"}, - {AMDGPU_GFX_TCP_SET2BANK3_RAM, "TCP_SET2BANK3_RAM"}, - 
{AMDGPU_GFX_TCP_SET3BANK0_RAM, "TCP_SET3BANK0_RAM"}, - {AMDGPU_GFX_TCP_SET3BANK1_RAM, "TCP_SET3BANK1_RAM"}, - {AMDGPU_GFX_TCP_SET3BANK2_RAM, "TCP_SET3BANK2_RAM"}, - {AMDGPU_GFX_TCP_SET3BANK3_RAM, "TCP_SET3BANK3_RAM"}, - {AMDGPU_GFX_TCP_VM_FIFO, "TCP_VM_FIFO"}, - {AMDGPU_GFX_TCP_DB_TAGRAM0, "TCP_DB_TAGRAM0"}, - {AMDGPU_GFX_TCP_DB_TAGRAM1, "TCP_DB_TAGRAM1"}, - {AMDGPU_GFX_TCP_DB_TAGRAM2, "TCP_DB_TAGRAM2"}, - {AMDGPU_GFX_TCP_DB_TAGRAM3, "TCP_DB_TAGRAM3"}, - {AMDGPU_GFX_TCP_UTCL1_LFIFO_PROBE0, "TCP_UTCL1_LFIFO_PROBE0"}, - {AMDGPU_GFX_TCP_UTCL1_LFIFO_PROBE1, "TCP_UTCL1_LFIFO_PROBE1"}, - {AMDGPU_GFX_TCP_CMD_FIFO, "TCP_CMD_FIFO"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_td_mem_list[] = { - {AMDGPU_GFX_TD_UTD_CS_FIFO_MEM, "TD_UTD_CS_FIFO_MEM"}, - {AMDGPU_GFX_TD_UTD_SS_FIFO_LO_MEM, "TD_UTD_SS_FIFO_LO_MEM"}, - {AMDGPU_GFX_TD_UTD_SS_FIFO_HI_MEM, "TD_UTD_SS_FIFO_HI_MEM"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_tcx_mem_list[] = { - {AMDGPU_GFX_TCX_FIFOD0, "TCX_FIFOD0"}, - {AMDGPU_GFX_TCX_FIFOD1, "TCX_FIFOD1"}, - {AMDGPU_GFX_TCX_FIFOD2, "TCX_FIFOD2"}, - {AMDGPU_GFX_TCX_FIFOD3, "TCX_FIFOD3"}, - {AMDGPU_GFX_TCX_FIFOD4, "TCX_FIFOD4"}, - {AMDGPU_GFX_TCX_FIFOD5, "TCX_FIFOD5"}, - {AMDGPU_GFX_TCX_FIFOD6, "TCX_FIFOD6"}, - {AMDGPU_GFX_TCX_FIFOD7, "TCX_FIFOD7"}, - {AMDGPU_GFX_TCX_FIFOB0, "TCX_FIFOB0"}, - {AMDGPU_GFX_TCX_FIFOB1, "TCX_FIFOB1"}, - {AMDGPU_GFX_TCX_FIFOB2, "TCX_FIFOB2"}, - {AMDGPU_GFX_TCX_FIFOB3, "TCX_FIFOB3"}, - {AMDGPU_GFX_TCX_FIFOB4, "TCX_FIFOB4"}, - {AMDGPU_GFX_TCX_FIFOB5, "TCX_FIFOB5"}, - {AMDGPU_GFX_TCX_FIFOB6, "TCX_FIFOB6"}, - {AMDGPU_GFX_TCX_FIFOB7, "TCX_FIFOB7"}, - {AMDGPU_GFX_TCX_FIFOA0, "TCX_FIFOA0"}, - {AMDGPU_GFX_TCX_FIFOA1, "TCX_FIFOA1"}, - {AMDGPU_GFX_TCX_FIFOA2, "TCX_FIFOA2"}, - {AMDGPU_GFX_TCX_FIFOA3, "TCX_FIFOA3"}, - {AMDGPU_GFX_TCX_FIFOA4, "TCX_FIFOA4"}, - {AMDGPU_GFX_TCX_FIFOA5, "TCX_FIFOA5"}, - {AMDGPU_GFX_TCX_FIFOA6, "TCX_FIFOA6"}, - {AMDGPU_GFX_TCX_FIFOA7, "TCX_FIFOA7"}, - {AMDGPU_GFX_TCX_CFIFO0, 
"TCX_CFIFO0"}, - {AMDGPU_GFX_TCX_CFIFO1, "TCX_CFIFO1"}, - {AMDGPU_GFX_TCX_CFIFO2, "TCX_CFIFO2"}, - {AMDGPU_GFX_TCX_CFIFO3, "TCX_CFIFO3"}, - {AMDGPU_GFX_TCX_CFIFO4, "TCX_CFIFO4"}, - {AMDGPU_GFX_TCX_CFIFO5, "TCX_CFIFO5"}, - {AMDGPU_GFX_TCX_CFIFO6, "TCX_CFIFO6"}, - {AMDGPU_GFX_TCX_CFIFO7, "TCX_CFIFO7"}, - {AMDGPU_GFX_TCX_FIFO_ACKB0, "TCX_FIFO_ACKB0"}, - {AMDGPU_GFX_TCX_FIFO_ACKB1, "TCX_FIFO_ACKB1"}, - {AMDGPU_GFX_TCX_FIFO_ACKB2, "TCX_FIFO_ACKB2"}, - {AMDGPU_GFX_TCX_FIFO_ACKB3, "TCX_FIFO_ACKB3"}, - {AMDGPU_GFX_TCX_FIFO_ACKB4, "TCX_FIFO_ACKB4"}, - {AMDGPU_GFX_TCX_FIFO_ACKB5, "TCX_FIFO_ACKB5"}, - {AMDGPU_GFX_TCX_FIFO_ACKB6, "TCX_FIFO_ACKB6"}, - {AMDGPU_GFX_TCX_FIFO_ACKB7, "TCX_FIFO_ACKB7"}, - {AMDGPU_GFX_TCX_FIFO_ACKD0, "TCX_FIFO_ACKD0"}, - {AMDGPU_GFX_TCX_FIFO_ACKD1, "TCX_FIFO_ACKD1"}, - {AMDGPU_GFX_TCX_FIFO_ACKD2, "TCX_FIFO_ACKD2"}, - {AMDGPU_GFX_TCX_FIFO_ACKD3, "TCX_FIFO_ACKD3"}, - {AMDGPU_GFX_TCX_FIFO_ACKD4, "TCX_FIFO_ACKD4"}, - {AMDGPU_GFX_TCX_FIFO_ACKD5, "TCX_FIFO_ACKD5"}, - {AMDGPU_GFX_TCX_FIFO_ACKD6, "TCX_FIFO_ACKD6"}, - {AMDGPU_GFX_TCX_FIFO_ACKD7, "TCX_FIFO_ACKD7"}, - {AMDGPU_GFX_TCX_DST_FIFOA0, "TCX_DST_FIFOA0"}, - {AMDGPU_GFX_TCX_DST_FIFOA1, "TCX_DST_FIFOA1"}, - {AMDGPU_GFX_TCX_DST_FIFOA2, "TCX_DST_FIFOA2"}, - {AMDGPU_GFX_TCX_DST_FIFOA3, "TCX_DST_FIFOA3"}, - {AMDGPU_GFX_TCX_DST_FIFOA4, "TCX_DST_FIFOA4"}, - {AMDGPU_GFX_TCX_DST_FIFOA5, "TCX_DST_FIFOA5"}, - {AMDGPU_GFX_TCX_DST_FIFOA6, "TCX_DST_FIFOA6"}, - {AMDGPU_GFX_TCX_DST_FIFOA7, "TCX_DST_FIFOA7"}, - {AMDGPU_GFX_TCX_DST_FIFOB0, "TCX_DST_FIFOB0"}, - {AMDGPU_GFX_TCX_DST_FIFOB1, "TCX_DST_FIFOB1"}, - {AMDGPU_GFX_TCX_DST_FIFOB2, "TCX_DST_FIFOB2"}, - {AMDGPU_GFX_TCX_DST_FIFOB3, "TCX_DST_FIFOB3"}, - {AMDGPU_GFX_TCX_DST_FIFOB4, "TCX_DST_FIFOB4"}, - {AMDGPU_GFX_TCX_DST_FIFOB5, "TCX_DST_FIFOB5"}, - {AMDGPU_GFX_TCX_DST_FIFOB6, "TCX_DST_FIFOB6"}, - {AMDGPU_GFX_TCX_DST_FIFOB7, "TCX_DST_FIFOB7"}, - {AMDGPU_GFX_TCX_DST_FIFOD0, "TCX_DST_FIFOD0"}, - {AMDGPU_GFX_TCX_DST_FIFOD1, "TCX_DST_FIFOD1"}, - {AMDGPU_GFX_TCX_DST_FIFOD2, 
"TCX_DST_FIFOD2"}, - {AMDGPU_GFX_TCX_DST_FIFOD3, "TCX_DST_FIFOD3"}, - {AMDGPU_GFX_TCX_DST_FIFOD4, "TCX_DST_FIFOD4"}, - {AMDGPU_GFX_TCX_DST_FIFOD5, "TCX_DST_FIFOD5"}, - {AMDGPU_GFX_TCX_DST_FIFOD6, "TCX_DST_FIFOD6"}, - {AMDGPU_GFX_TCX_DST_FIFOD7, "TCX_DST_FIFOD7"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKB0, "TCX_DST_FIFO_ACKB0"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKB1, "TCX_DST_FIFO_ACKB1"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKB2, "TCX_DST_FIFO_ACKB2"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKB3, "TCX_DST_FIFO_ACKB3"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKB4, "TCX_DST_FIFO_ACKB4"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKB5, "TCX_DST_FIFO_ACKB5"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKB6, "TCX_DST_FIFO_ACKB6"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKB7, "TCX_DST_FIFO_ACKB7"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKD0, "TCX_DST_FIFO_ACKD0"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKD1, "TCX_DST_FIFO_ACKD1"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKD2, "TCX_DST_FIFO_ACKD2"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKD3, "TCX_DST_FIFO_ACKD3"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKD4, "TCX_DST_FIFO_ACKD4"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKD5, "TCX_DST_FIFO_ACKD5"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKD6, "TCX_DST_FIFO_ACKD6"}, - {AMDGPU_GFX_TCX_DST_FIFO_ACKD7, "TCX_DST_FIFO_ACKD7"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_atc_l2_mem_list[] = { - {AMDGPU_GFX_ATC_L2_MEM, "ATC_L2_MEM"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_utcl2_mem_list[] = { - {AMDGPU_GFX_UTCL2_MEM, "UTCL2_MEM"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_vml2_mem_list[] = { - {AMDGPU_GFX_VML2_MEM, "VML2_MEM"}, -}; - -static const struct amdgpu_ras_memory_id_entry gfx_v9_4_3_ras_vml2_walker_mem_list[] = { - {AMDGPU_GFX_VML2_WALKER_MEM, "VML2_WALKER_MEM"}, -}; - -static const struct amdgpu_gfx_ras_mem_id_entry gfx_v9_4_3_ras_mem_list_array[AMDGPU_GFX_MEM_TYPE_NUM] = { - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_cp_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_gcea_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_gc_cane_mem_list) - 
AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_gcutcl2_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_gds_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_lds_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_rlc_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_sp_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_spi_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_sqc_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_sq_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_ta_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_tcc_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_tca_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_tci_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_tcp_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_td_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_tcx_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_atc_l2_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_utcl2_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_vml2_mem_list) - AMDGPU_GFX_MEMID_ENT(gfx_v9_4_3_ras_vml2_walker_mem_list) -}; - -static const struct amdgpu_gfx_ras_reg_entry gfx_v9_4_3_ce_reg_list[] = { - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regRLC_CE_ERR_STATUS_LOW, regRLC_CE_ERR_STATUS_HIGH), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "RLC"}, - AMDGPU_GFX_RLC_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regCPC_CE_ERR_STATUS_LO, regCPC_CE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CPC"}, - AMDGPU_GFX_CP_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regCPF_CE_ERR_STATUS_LO, regCPF_CE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CPF"}, - AMDGPU_GFX_CP_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regCPG_CE_ERR_STATUS_LO, regCPG_CE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CPG"}, - AMDGPU_GFX_CP_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regGDS_CE_ERR_STATUS_LO, regGDS_CE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "GDS"}, - AMDGPU_GFX_GDS_MEM, 1}, - 
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regGC_CANE_CE_ERR_STATUS_LO, regGC_CANE_CE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CANE"}, - AMDGPU_GFX_GC_CANE_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSPI_CE_ERR_STATUS_LO, regSPI_CE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SPI"}, - AMDGPU_GFX_SPI_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP0_CE_ERR_STATUS_LO, regSP0_CE_ERR_STATUS_HI), - 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SP0"}, - AMDGPU_GFX_SP_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP1_CE_ERR_STATUS_LO, regSP1_CE_ERR_STATUS_HI), - 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SP1"}, - AMDGPU_GFX_SP_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQ_CE_ERR_STATUS_LO, regSQ_CE_ERR_STATUS_HI), - 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SQ"}, - AMDGPU_GFX_SQ_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQC_CE_EDC_LO, regSQC_CE_EDC_HI), - 5, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SQC"}, - AMDGPU_GFX_SQC_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCX_CE_ERR_STATUS_LO, regTCX_CE_ERR_STATUS_HI), - 2, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCX"}, - AMDGPU_GFX_TCX_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCC_CE_ERR_STATUS_LO, regTCC_CE_ERR_STATUS_HI), - 16, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCC"}, - AMDGPU_GFX_TCC_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTA_CE_EDC_LO, regTA_CE_EDC_HI), - 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TA"}, - AMDGPU_GFX_TA_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCI_CE_EDC_LO_REG, regTCI_CE_EDC_HI_REG), - 27, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCI"}, - AMDGPU_GFX_TCI_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCP_CE_EDC_LO_REG, regTCP_CE_EDC_HI_REG), - 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCP"}, - AMDGPU_GFX_TCP_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, 
regTD_CE_EDC_LO, regTD_CE_EDC_HI), - 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TD"}, - AMDGPU_GFX_TD_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regGCEA_CE_ERR_STATUS_LO, regGCEA_CE_ERR_STATUS_HI), - 16, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "GCEA"}, - AMDGPU_GFX_GCEA_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regLDS_CE_ERR_STATUS_LO, regLDS_CE_ERR_STATUS_HI), - 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "LDS"}, - AMDGPU_GFX_LDS_MEM, 4}, -}; - -static const struct amdgpu_gfx_ras_reg_entry gfx_v9_4_3_ue_reg_list[] = { - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regRLC_UE_ERR_STATUS_LOW, regRLC_UE_ERR_STATUS_HIGH), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "RLC"}, - AMDGPU_GFX_RLC_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regCPC_UE_ERR_STATUS_LO, regCPC_UE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CPC"}, - AMDGPU_GFX_CP_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regCPF_UE_ERR_STATUS_LO, regCPF_UE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CPF"}, - AMDGPU_GFX_CP_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regCPG_UE_ERR_STATUS_LO, regCPG_UE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CPG"}, - AMDGPU_GFX_CP_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regGDS_UE_ERR_STATUS_LO, regGDS_UE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "GDS"}, - AMDGPU_GFX_GDS_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regGC_CANE_UE_ERR_STATUS_LO, regGC_CANE_UE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "CANE"}, - AMDGPU_GFX_GC_CANE_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSPI_UE_ERR_STATUS_LO, regSPI_UE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SPI"}, - AMDGPU_GFX_SPI_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP0_UE_ERR_STATUS_LO, regSP0_UE_ERR_STATUS_HI), - 10, (AMDGPU_RAS_ERR_INFO_VALID | 
AMDGPU_RAS_ERR_STATUS_VALID), "SP0"}, - AMDGPU_GFX_SP_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP1_UE_ERR_STATUS_LO, regSP1_UE_ERR_STATUS_HI), - 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SP1"}, - AMDGPU_GFX_SP_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQ_UE_ERR_STATUS_LO, regSQ_UE_ERR_STATUS_HI), - 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SQ"}, - AMDGPU_GFX_SQ_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQC_UE_EDC_LO, regSQC_UE_EDC_HI), - 5, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SQC"}, - AMDGPU_GFX_SQC_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCX_UE_ERR_STATUS_LO, regTCX_UE_ERR_STATUS_HI), - 2, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCX"}, - AMDGPU_GFX_TCX_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCC_UE_ERR_STATUS_LO, regTCC_UE_ERR_STATUS_HI), - 16, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCC"}, - AMDGPU_GFX_TCC_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTA_UE_EDC_LO, regTA_UE_EDC_HI), - 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TA"}, - AMDGPU_GFX_TA_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCI_UE_EDC_LO_REG, regTCI_UE_EDC_HI_REG), - 27, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCI"}, - AMDGPU_GFX_TCI_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCP_UE_EDC_LO_REG, regTCP_UE_EDC_HI_REG), - 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCP"}, - AMDGPU_GFX_TCP_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTD_UE_EDC_LO, regTD_UE_EDC_HI), - 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TD"}, - AMDGPU_GFX_TD_MEM, 4}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCA_UE_ERR_STATUS_LO, regTCA_UE_ERR_STATUS_HI), - 2, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "TCA"}, - AMDGPU_GFX_TCA_MEM, 1}, - {{AMDGPU_RAS_REG_ENTRY(GC, 0, regGCEA_UE_ERR_STATUS_LO, regGCEA_UE_ERR_STATUS_HI), - 16, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "GCEA"}, - AMDGPU_GFX_GCEA_MEM, 1}, 
- {{AMDGPU_RAS_REG_ENTRY(GC, 0, regLDS_UE_ERR_STATUS_LO, regLDS_UE_ERR_STATUS_HI), - 10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "LDS"}, - AMDGPU_GFX_LDS_MEM, 4}, -}; - -static void gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev, - void *ras_error_status, int xcc_id) -{ - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - unsigned long ce_count = 0, ue_count = 0; - uint32_t i, j, k; - - /* NOTE: convert xcc_id to physical XCD ID (XCD0 or XCD1) */ - struct amdgpu_smuio_mcm_config_info mcm_info = { - .socket_id = adev->smuio.funcs->get_socket_id(adev), - .die_id = xcc_id & 0x01 ? 1 : 0, - }; - - mutex_lock(&adev->grbm_idx_mutex); - - for (i = 0; i < ARRAY_SIZE(gfx_v9_4_3_ce_reg_list); i++) { - for (j = 0; j < gfx_v9_4_3_ce_reg_list[i].se_num; j++) { - for (k = 0; k < gfx_v9_4_3_ce_reg_list[i].reg_entry.reg_inst; k++) { - /* no need to select if instance number is 1 */ - if (gfx_v9_4_3_ce_reg_list[i].se_num > 1 || - gfx_v9_4_3_ce_reg_list[i].reg_entry.reg_inst > 1) - gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, k, xcc_id); - - amdgpu_ras_inst_query_ras_error_count(adev, - &(gfx_v9_4_3_ce_reg_list[i].reg_entry), - 1, - gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ce_reg_list[i].mem_id_type].mem_id_ent, - gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ce_reg_list[i].mem_id_type].size, - GET_INST(GC, xcc_id), - AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, - &ce_count); - - amdgpu_ras_inst_query_ras_error_count(adev, - &(gfx_v9_4_3_ue_reg_list[i].reg_entry), - 1, - gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].mem_id_ent, - gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].size, - GET_INST(GC, xcc_id), - AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, - &ue_count); - } - } - } - - /* handle extra register entries of UE */ - for (; i < ARRAY_SIZE(gfx_v9_4_3_ue_reg_list); i++) { - for (j = 0; j < gfx_v9_4_3_ue_reg_list[i].se_num; j++) { - for (k = 0; k < gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst; k++) 
{ - /* no need to select if instance number is 1 */ - if (gfx_v9_4_3_ue_reg_list[i].se_num > 1 || - gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst > 1) - gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, k, xcc_id); - - amdgpu_ras_inst_query_ras_error_count(adev, - &(gfx_v9_4_3_ue_reg_list[i].reg_entry), - 1, - gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].mem_id_ent, - gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_type].size, - GET_INST(GC, xcc_id), - AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, - &ue_count); - } - } - } - - gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, - xcc_id); - mutex_unlock(&adev->grbm_idx_mutex); - - /* the caller should make sure initialize value of - * err_data->ue_count and err_data->ce_count - */ - amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count); - amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count); -} - -static void gfx_v9_4_3_inst_reset_ras_err_count(struct amdgpu_device *adev, - void *ras_error_status, int xcc_id) -{ - uint32_t i, j, k; - - mutex_lock(&adev->grbm_idx_mutex); - - for (i = 0; i < ARRAY_SIZE(gfx_v9_4_3_ce_reg_list); i++) { - for (j = 0; j < gfx_v9_4_3_ce_reg_list[i].se_num; j++) { - for (k = 0; k < gfx_v9_4_3_ce_reg_list[i].reg_entry.reg_inst; k++) { - /* no need to select if instance number is 1 */ - if (gfx_v9_4_3_ce_reg_list[i].se_num > 1 || - gfx_v9_4_3_ce_reg_list[i].reg_entry.reg_inst > 1) - gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, k, xcc_id); - - amdgpu_ras_inst_reset_ras_error_count(adev, - &(gfx_v9_4_3_ce_reg_list[i].reg_entry), - 1, - GET_INST(GC, xcc_id)); - - amdgpu_ras_inst_reset_ras_error_count(adev, - &(gfx_v9_4_3_ue_reg_list[i].reg_entry), - 1, - GET_INST(GC, xcc_id)); - } - } - } - - /* handle extra register entries of UE */ - for (; i < ARRAY_SIZE(gfx_v9_4_3_ue_reg_list); i++) { - for (j = 0; j < gfx_v9_4_3_ue_reg_list[i].se_num; j++) { - for (k = 0; k < gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst; k++) { - /* no need to 
select if instance number is 1 */ - if (gfx_v9_4_3_ue_reg_list[i].se_num > 1 || - gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst > 1) - gfx_v9_4_3_xcc_select_se_sh(adev, j, 0, k, xcc_id); - - amdgpu_ras_inst_reset_ras_error_count(adev, - &(gfx_v9_4_3_ue_reg_list[i].reg_entry), - 1, - GET_INST(GC, xcc_id)); - } - } - } - - gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, - xcc_id); - mutex_unlock(&adev->grbm_idx_mutex); -} - static void gfx_v9_4_3_inst_enable_watchdog_timer(struct amdgpu_device *adev, void *ras_error_status, int xcc_id) { @@ -4607,18 +3758,6 @@ static void gfx_v9_4_3_inst_enable_watchdog_timer(struct amdgpu_device *adev, mutex_unlock(&adev->grbm_idx_mutex); } -static void gfx_v9_4_3_query_ras_error_count(struct amdgpu_device *adev, - void *ras_error_status) -{ - amdgpu_gfx_ras_error_func(adev, ras_error_status, - gfx_v9_4_3_inst_query_ras_err_count); -} - -static void gfx_v9_4_3_reset_ras_error_count(struct amdgpu_device *adev) -{ - amdgpu_gfx_ras_error_func(adev, NULL, gfx_v9_4_3_inst_reset_ras_err_count); -} - static void gfx_v9_4_3_enable_watchdog_timer(struct amdgpu_device *adev) { amdgpu_gfx_ras_error_func(adev, NULL, gfx_v9_4_3_inst_enable_watchdog_timer); @@ -5099,37 +4238,9 @@ struct amdgpu_xcp_ip_funcs gfx_v9_4_3_xcp_funcs = { .resume = &gfx_v9_4_3_xcp_resume }; -struct amdgpu_ras_block_hw_ops gfx_v9_4_3_ras_ops = { - .query_ras_error_count = &gfx_v9_4_3_query_ras_error_count, - .reset_ras_error_count = &gfx_v9_4_3_reset_ras_error_count, -}; - -static int gfx_v9_4_3_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) -{ - int r; - - r = amdgpu_ras_block_late_init(adev, ras_block); - if (r) - return r; - - r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__GFX, - &gfx_v9_4_3_aca_info, - NULL); - if (r) - goto late_fini; - - return 0; - -late_fini: - amdgpu_ras_block_late_fini(adev, ras_block); - - return r; -} - struct amdgpu_gfx_ras gfx_v9_4_3_ras = { .ras_block = { - .hw_ops = &gfx_v9_4_3_ras_ops, - 
.ras_late_init = &gfx_v9_4_3_ras_late_init, + .hw_ops = NULL, }, .enable_watchdog_timer = &gfx_v9_4_3_enable_watchdog_timer, }; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index c2a41fa3a396..64ebedc595b5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -167,44 +167,6 @@ static void gmc_v8_0_init_golden_registers(struct amdgpu_device *adev) } } -static void gmc_v8_0_mc_stop(struct amdgpu_device *adev) -{ - u32 blackout; - struct amdgpu_ip_block *ip_block; - - ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GMC); - if (!ip_block) - return; - - gmc_v8_0_wait_for_idle(ip_block); - - blackout = RREG32(mmMC_SHARED_BLACKOUT_CNTL); - if (REG_GET_FIELD(blackout, MC_SHARED_BLACKOUT_CNTL, BLACKOUT_MODE) != 1) { - /* Block CPU access */ - WREG32(mmBIF_FB_EN, 0); - /* blackout the MC */ - blackout = REG_SET_FIELD(blackout, - MC_SHARED_BLACKOUT_CNTL, BLACKOUT_MODE, 1); - WREG32(mmMC_SHARED_BLACKOUT_CNTL, blackout); - } - /* wait for the MC to settle */ - udelay(100); -} - -static void gmc_v8_0_mc_resume(struct amdgpu_device *adev) -{ - u32 tmp; - - /* unblackout the MC */ - tmp = RREG32(mmMC_SHARED_BLACKOUT_CNTL); - tmp = REG_SET_FIELD(tmp, MC_SHARED_BLACKOUT_CNTL, BLACKOUT_MODE, 0); - WREG32(mmMC_SHARED_BLACKOUT_CNTL, tmp); - /* allow CPU access */ - tmp = REG_SET_FIELD(0, BIF_FB_EN, FB_READ_EN, 1); - tmp = REG_SET_FIELD(tmp, BIF_FB_EN, FB_WRITE_EN, 1); - WREG32(mmBIF_FB_EN, tmp); -} - /** * gmc_v8_0_init_microcode - load ucode images from disk * @@ -1293,89 +1255,6 @@ static int gmc_v8_0_wait_for_idle(struct amdgpu_ip_block *ip_block) } -static bool gmc_v8_0_check_soft_reset(struct amdgpu_ip_block *ip_block) -{ - u32 srbm_soft_reset = 0; - struct amdgpu_device *adev = ip_block->adev; - u32 tmp = RREG32(mmSRBM_STATUS); - - if (tmp & SRBM_STATUS__VMC_BUSY_MASK) - srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, - SRBM_SOFT_RESET, SOFT_RESET_VMC, 1); - - if (tmp & 
(SRBM_STATUS__MCB_BUSY_MASK | SRBM_STATUS__MCB_NON_DISPLAY_BUSY_MASK | - SRBM_STATUS__MCC_BUSY_MASK | SRBM_STATUS__MCD_BUSY_MASK)) { - if (!(adev->flags & AMD_IS_APU)) - srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, - SRBM_SOFT_RESET, SOFT_RESET_MC, 1); - } - - if (srbm_soft_reset) { - adev->gmc.srbm_soft_reset = srbm_soft_reset; - return true; - } - - adev->gmc.srbm_soft_reset = 0; - - return false; -} - -static int gmc_v8_0_pre_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - if (!adev->gmc.srbm_soft_reset) - return 0; - - gmc_v8_0_mc_stop(adev); - if (gmc_v8_0_wait_for_idle(ip_block)) - dev_warn(adev->dev, "Wait for GMC idle timed out !\n"); - - return 0; -} - -static int gmc_v8_0_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - u32 srbm_soft_reset; - - if (!adev->gmc.srbm_soft_reset) - return 0; - srbm_soft_reset = adev->gmc.srbm_soft_reset; - - if (srbm_soft_reset) { - u32 tmp; - - tmp = RREG32(mmSRBM_SOFT_RESET); - tmp |= srbm_soft_reset; - dev_info(adev->dev, "SRBM_SOFT_RESET=0x%08X\n", tmp); - WREG32(mmSRBM_SOFT_RESET, tmp); - tmp = RREG32(mmSRBM_SOFT_RESET); - - udelay(50); - - tmp &= ~srbm_soft_reset; - WREG32(mmSRBM_SOFT_RESET, tmp); - tmp = RREG32(mmSRBM_SOFT_RESET); - - /* Wait a little for things to settle down */ - udelay(50); - } - - return 0; -} - -static int gmc_v8_0_post_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - if (!adev->gmc.srbm_soft_reset) - return 0; - - gmc_v8_0_mc_resume(adev); - return 0; -} - static int gmc_v8_0_vm_fault_interrupt_state(struct amdgpu_device *adev, struct amdgpu_irq_src *src, unsigned int type, @@ -1715,10 +1594,6 @@ static const struct amd_ip_funcs gmc_v8_0_ip_funcs = { .resume = gmc_v8_0_resume, .is_idle = gmc_v8_0_is_idle, .wait_for_idle = gmc_v8_0_wait_for_idle, - .check_soft_reset = gmc_v8_0_check_soft_reset, - .pre_soft_reset = gmc_v8_0_pre_soft_reset, - 
.soft_reset = gmc_v8_0_soft_reset, - .post_soft_reset = gmc_v8_0_post_soft_reset, .set_clockgating_state = gmc_v8_0_set_clockgating_state, .set_powergating_state = gmc_v8_0_set_powergating_state, .get_clockgating_state = gmc_v8_0_get_clockgating_state, diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 8a5c44810ba1..1fcc0594fd0a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -57,6 +57,7 @@ #include "umc_v6_0.h" #include "umc_v6_7.h" #include "umc_v12_0.h" +#include "ras_umc_v12_0.h" #include "hdp_v4_0.h" #include "mca_v3_0.h" @@ -1382,7 +1383,7 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev) case IP_VERSION(12, 0, 0): case IP_VERSION(12, 5, 0): adev->umc.max_ras_err_cnt_per_query = - UMC_V12_0_TOTAL_CHANNEL_NUM(adev) * UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; + UMC_V12_0_TOTAL_CHANNEL_NUM * UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; adev->umc.channel_inst_num = UMC_V12_0_CHANNEL_INSTANCE_NUM; adev->umc.umc_inst_num = UMC_V12_0_UMC_INSTANCE_NUM; adev->umc.node_inst_num /= UMC_V12_0_UMC_INSTANCE_NUM; @@ -2025,11 +2026,19 @@ static int gmc_v9_0_sw_init(struct amdgpu_ip_block *ip_block) * The first KFD VMID is 8 for GPUs with graphics, 3 for * compute-only GPUs. On compute-only GPUs that leaves 2 VMIDs * for video processing. + * + * If kernel queues are disabled, allow KFD to use all vmids. */ - adev->vm_manager.first_kfd_vmid = - (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 1) || - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || - amdgpu_is_multi_aid(adev)) ? + if (adev->gfx.disable_kq && + adev->jpeg.disable_kq && + adev->vcn.disable_kq && + adev->sdma.no_user_submission) + adev->vm_manager.first_kfd_vmid = 1; + else + adev->vm_manager.first_kfd_vmid = + (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 1) || + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || + amdgpu_is_multi_aid(adev)) ? 
3 : 8; diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c index d8204fbc198d..0fdc32b3ae91 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c @@ -119,6 +119,19 @@ static int jpeg_v4_0_3_early_init(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; + switch (amdgpu_user_queue) { + case -1: + case 0: + default: + adev->jpeg.disable_kq = false; + adev->jpeg.disable_uq = true; + break; + case 2: + adev->jpeg.disable_kq = true; + adev->jpeg.disable_uq = true; + break; + } + adev->jpeg.num_jpeg_rings = AMDGPU_MAX_JPEG_RINGS_4_0_3; jpeg_v4_0_3_set_dec_ring_funcs(adev); @@ -175,6 +188,10 @@ static int jpeg_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block) for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) { ring = &adev->jpeg.inst[i].ring_dec[j]; ring->use_doorbell = true; + if (adev->jpeg.disable_kq) { + ring->no_scheduler = true; + ring->no_user_submission = true; + } ring->vm_hub = AMDGPU_MMHUB0(adev->jpeg.inst[i].aid_id); if (!amdgpu_sriov_vf(adev)) { ring->doorbell_index = @@ -1425,72 +1442,6 @@ static const struct amdgpu_ras_block_hw_ops jpeg_v4_0_3_ras_hw_ops = { .query_poison_status = jpeg_v4_0_3_query_ras_poison_status, }; -static int jpeg_v4_0_3_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - struct aca_bank_info info; - u64 misc0; - int ret; - - ret = aca_bank_info_decode(bank, &info); - if (ret) - return ret; - - misc0 = bank->regs[ACA_REG_IDX_MISC0]; - switch (type) { - case ACA_SMU_TYPE_UE: - bank->aca_err_type = ACA_ERROR_TYPE_UE; - ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, - 1ULL); - break; - case ACA_SMU_TYPE_CE: - bank->aca_err_type = ACA_ERROR_TYPE_CE; - ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, - ACA_REG__MISC0__ERRCNT(misc0)); - break; - default: - return -EINVAL; - } - - return ret; -} - -/* reference to smu 
driver if header file */ -static int jpeg_v4_0_3_err_codes[] = { - 16, 17, 18, 19, 20, 21, 22, 23, /* JPEG[0-7][S|D] */ - 24, 25, 26, 27, 28, 29, 30, 31 -}; - -static bool jpeg_v4_0_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - u32 instlo; - - instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]); - instlo &= GENMASK(31, 1); - - if (instlo != mmSMNAID_AID0_MCA_SMU) - return false; - - if (aca_bank_check_error_codes(handle->adev, bank, - jpeg_v4_0_3_err_codes, - ARRAY_SIZE(jpeg_v4_0_3_err_codes))) - return false; - - return true; -} - -static const struct aca_bank_ops jpeg_v4_0_3_aca_bank_ops = { - .aca_bank_parser = jpeg_v4_0_3_aca_bank_parser, - .aca_bank_is_valid = jpeg_v4_0_3_aca_bank_is_valid, -}; - -static const struct aca_info jpeg_v4_0_3_aca_info = { - .hwip = ACA_HWIP_TYPE_SMU, - .mask = ACA_ERROR_UE_MASK, - .bank_ops = &jpeg_v4_0_3_aca_bank_ops, -}; - static int jpeg_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { int r; @@ -1506,11 +1457,6 @@ static int jpeg_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_comm goto late_fini; } - r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__JPEG, - &jpeg_v4_0_3_aca_info, NULL); - if (r) - goto late_fini; - return 0; late_fini: diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c index ae3afc7ab326..8846cb3ed12b 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c @@ -118,6 +118,19 @@ static int jpeg_v5_0_1_early_init(struct amdgpu_ip_block *ip_block) if (!adev->jpeg.num_jpeg_inst || adev->jpeg.num_jpeg_inst > AMDGPU_MAX_JPEG_INSTANCES) return -ENOENT; + switch (amdgpu_user_queue) { + case -1: + case 0: + default: + adev->jpeg.disable_kq = false; + adev->jpeg.disable_uq = true; + break; + case 2: + adev->jpeg.disable_kq = true; + adev->jpeg.disable_uq = true; + break; + } + adev->jpeg.num_jpeg_rings = 
AMDGPU_MAX_JPEG_RINGS; jpeg_v5_0_1_set_dec_ring_funcs(adev); jpeg_v5_0_1_set_irq_funcs(adev); @@ -172,6 +185,10 @@ static int jpeg_v5_0_1_sw_init(struct amdgpu_ip_block *ip_block) for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) { ring = &adev->jpeg.inst[i].ring_dec[j]; ring->use_doorbell = true; + if (adev->jpeg.disable_kq) { + ring->no_scheduler = true; + ring->no_user_submission = true; + } ring->vm_hub = AMDGPU_MMHUB0(adev->jpeg.inst[i].aid_id); if (!amdgpu_sriov_vf(adev)) { ring->doorbell_index = @@ -871,10 +888,7 @@ static const struct amd_ip_funcs jpeg_v5_0_1_ip_funcs = { .resume = jpeg_v5_0_1_resume, .is_idle = jpeg_v5_0_1_is_idle, .wait_for_idle = jpeg_v5_0_1_wait_for_idle, - .check_soft_reset = NULL, - .pre_soft_reset = NULL, .soft_reset = NULL, - .post_soft_reset = NULL, .set_clockgating_state = jpeg_v5_0_1_set_clockgating_state, .set_powergating_state = jpeg_v5_0_1_set_powergating_state, .dump_ip_state = amdgpu_jpeg_dump_ip_state, @@ -1003,73 +1017,6 @@ static const struct amdgpu_ras_block_hw_ops jpeg_v5_0_1_ras_hw_ops = { .query_poison_status = jpeg_v5_0_1_query_ras_poison_status, }; -static int jpeg_v5_0_1_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - struct aca_bank_info info; - u64 misc0; - int ret; - - ret = aca_bank_info_decode(bank, &info); - if (ret) - return ret; - - misc0 = bank->regs[ACA_REG_IDX_MISC0]; - switch (type) { - case ACA_SMU_TYPE_UE: - bank->aca_err_type = ACA_ERROR_TYPE_UE; - ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, - 1ULL); - break; - case ACA_SMU_TYPE_CE: - bank->aca_err_type = ACA_ERROR_TYPE_CE; - ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, - ACA_REG__MISC0__ERRCNT(misc0)); - break; - default: - return -EINVAL; - } - - return ret; -} - -/* reference to smu driver if header file */ -static int jpeg_v5_0_1_err_codes[] = { - 16, 17, 18, 19, 20, 21, 22, 23, /* JPEG[0-9][S|D] */ - 24, 25, 26, 27, 28, 29, 30, 31, - 
48, 49, 50, 51, -}; - -static bool jpeg_v5_0_1_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - u32 instlo; - - instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]); - instlo &= GENMASK(31, 1); - - if (instlo != mmSMNAID_AID0_MCA_SMU) - return false; - - if (aca_bank_check_error_codes(handle->adev, bank, - jpeg_v5_0_1_err_codes, - ARRAY_SIZE(jpeg_v5_0_1_err_codes))) - return false; - - return true; -} - -static const struct aca_bank_ops jpeg_v5_0_1_aca_bank_ops = { - .aca_bank_parser = jpeg_v5_0_1_aca_bank_parser, - .aca_bank_is_valid = jpeg_v5_0_1_aca_bank_is_valid, -}; - -static const struct aca_info jpeg_v5_0_1_aca_info = { - .hwip = ACA_HWIP_TYPE_SMU, - .mask = ACA_ERROR_UE_MASK, - .bank_ops = &jpeg_v5_0_1_aca_bank_ops, -}; - static int jpeg_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { int r; @@ -1078,11 +1025,6 @@ static int jpeg_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_comm if (r) return r; - r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__JPEG, - &jpeg_v5_0_1_aca_info, NULL); - if (r) - goto late_fini; - if (amdgpu_ras_is_supported(adev, ras_block->block) && adev->jpeg.inst->ras_poison_irq.funcs) { r = amdgpu_irq_get(adev, &adev->jpeg.inst->ras_poison_irq, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_2.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_2.c index 7a4ecea6b39a..ff02f72352a8 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_2.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_2.c @@ -690,10 +690,7 @@ static const struct amd_ip_funcs jpeg_v5_0_2_ip_funcs = { .resume = jpeg_v5_0_2_resume, .is_idle = jpeg_v5_0_2_is_idle, .wait_for_idle = jpeg_v5_0_2_wait_for_idle, - .check_soft_reset = NULL, - .pre_soft_reset = NULL, .soft_reset = NULL, - .post_soft_reset = NULL, .set_clockgating_state = jpeg_v5_0_2_set_clockgating_state, .set_powergating_state = jpeg_v5_0_2_set_powergating_state, .dump_ip_state = 
amdgpu_jpeg_dump_ip_state, diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c index 16625c31bfd3..e947c16e694d 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c @@ -133,8 +133,8 @@ static int mes_userq_map(struct amdgpu_usermode_queue *queue) queue_input.gang_quantum = 10000; queue_input.paging = false; - queue_input.process_context_addr = ctx->gpu_addr; - queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ; + queue_input.process_context_addr = uq_mgr->proc_ctx_obj.gpu_addr; + queue_input.gang_context_addr = ctx->gpu_addr; queue_input.inprocess_gang_priority = AMDGPU_MES_PRIORITY_LEVEL_NORMAL; queue_input.gang_global_priority_level = convert_to_mes_priority(queue->priority); @@ -169,7 +169,8 @@ static int mes_userq_unmap(struct amdgpu_usermode_queue *queue) memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); queue_input.doorbell_offset = queue->doorbell_index; - queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ; + queue_input.gang_context_addr = ctx->gpu_addr; + queue_input.queue_type = queue->queue_type; amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input); @@ -179,6 +180,63 @@ static int mes_userq_unmap(struct amdgpu_usermode_queue *queue) return r; } +int mes_userq_reset(struct amdgpu_usermode_queue *queue) +{ + struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr; + struct amdgpu_device *adev = uq_mgr->adev; + struct mes_reset_queue_input queue_input; + int r; + + /* XXX: add a FW version check for SDMA per queue reset */ + memset(&queue_input, 0x0, sizeof(struct mes_reset_queue_input)); + queue_input.doorbell_offset = queue->doorbell_index; + queue_input.queue_type = queue->queue_type; + + amdgpu_mes_lock(&adev->mes); + r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input); + amdgpu_mes_unlock(&adev->mes); + if (r) + return r; + return 
mes_userq_unmap(queue); +} + +int mes_userq_reset_queue(struct amdgpu_device *adev, + struct amdgpu_usermode_queue *guilty_uq, + int queue_type, + unsigned int pipe, + unsigned int queue, + unsigned int db) +{ + struct amdgpu_usermode_queue *uq; + bool use_mmio = adev->gfx.mec.use_mmio_for_reset; + unsigned long uq_id; + int r; + + xa_for_each(&adev->userq_doorbell_xa, uq_id, uq) { + if (uq->queue_type == queue_type) { + if (uq == guilty_uq) + continue; + if (uq->doorbell_index == db) { + uq->state = AMDGPU_USERQ_STATE_HUNG; + if (use_mmio) + r = amdgpu_mes_reset_queue_mmio(adev, queue_type, 0, 1, pipe, queue, 0); + else + r = amdgpu_mes_reset_user_queue(adev, queue_type, db, 0); + if (r) + return r; + r = mes_userq_unmap(uq); + if (r) + return r; + atomic_inc(&adev->gpu_reset_counter); + amdgpu_userq_fence_driver_force_completion(uq); + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL); + break; + } + } + } + return 0; +} + static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_queue *queue, struct drm_amdgpu_userq_in *mqd_user) @@ -186,12 +244,8 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_userq_obj *ctx = &queue->fw_obj; int r, size; - /* - * The FW expects at least one page space allocated for - * process ctx and gang ctx each. Create an object - * for the same. - */ - size = AMDGPU_USERQ_PROC_CTX_SZ + AMDGPU_USERQ_GANG_CTX_SZ; + /* The FW expects at least one page space allocated for gang ctx. 
*/ + size = AMDGPU_USERQ_GANG_CTX_SZ; r = amdgpu_bo_create_kernel(uq_mgr->adev, size, 0, AMDGPU_GEM_DOMAIN_GTT, &ctx->obj, &ctx->gpu_addr, @@ -205,54 +259,26 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr, return 0; } -static int mes_userq_detect_and_reset(struct amdgpu_device *adev, - int queue_type) +static int mes_userq_create_proc_ctx_space(struct amdgpu_userq_mgr *uq_mgr) { - int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev); - struct mes_detect_and_reset_queue_input input; - struct amdgpu_usermode_queue *queue; - unsigned int hung_db_num = 0; - unsigned long queue_id; - u32 db_array[8]; - bool found_hung_queue = false; - int r, i; - - if (db_array_size > 8) { - dev_err(adev->dev, "DB array size (%d vs 8) too small\n", - db_array_size); - return -EINVAL; - } - - memset(&input, 0x0, sizeof(struct mes_detect_and_reset_queue_input)); + int r = 0; - input.queue_type = queue_type; - - amdgpu_mes_lock(&adev->mes); - r = amdgpu_mes_detect_and_reset_hung_queues(adev, queue_type, false, - &hung_db_num, db_array, 0); - amdgpu_mes_unlock(&adev->mes); - if (r) { - dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r); - } else if (hung_db_num) { - xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { - if (queue->queue_type == queue_type) { - for (i = 0; i < hung_db_num; i++) { - if (queue->doorbell_index == db_array[i]) { - queue->state = AMDGPU_USERQ_STATE_HUNG; - found_hung_queue = true; - atomic_inc(&adev->gpu_reset_counter); - amdgpu_userq_fence_driver_force_completion(queue); - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL); - } - } - } - } + mutex_lock(&uq_mgr->proc_ctx_lock); + /* This check is necessary because amdgpu_bo_create_kernel() + * calls helpers like amdgpu_bo_pin() and memset() unconditionally + */ + if (!uq_mgr->proc_ctx_obj.obj) { + r = amdgpu_bo_create_kernel(uq_mgr->adev, AMDGPU_USERQ_PROC_CTX_SZ, + 0, AMDGPU_GEM_DOMAIN_GTT, + &uq_mgr->proc_ctx_obj.obj, +
&uq_mgr->proc_ctx_obj.gpu_addr, + &uq_mgr->proc_ctx_obj.cpu_ptr); + + if (!r) + memset(uq_mgr->proc_ctx_obj.cpu_ptr, 0, AMDGPU_USERQ_PROC_CTX_SZ); } - if (found_hung_queue) { - /* Resume scheduling after hang recovery */ - r = amdgpu_mes_resume(adev, input.xcc_id); - } + mutex_unlock(&uq_mgr->proc_ctx_lock); return r; } @@ -429,7 +455,14 @@ static int mes_userq_mqd_create(struct amdgpu_usermode_queue *queue, goto free_mqd; } - /* Create BO for FW operations */ + /* Create per-process MES process context BO */ + r = mes_userq_create_proc_ctx_space(uq_mgr); + if (r) { + DRM_ERROR("Failed to allocate MES process context space bo, error: %d\n", r); + goto free_mqd; + } + + /* Create BO of a gang for FW operations */ r = mes_userq_create_ctx_space(uq_mgr, queue, mqd_user); if (r) { DRM_ERROR("Failed to allocate BO for userqueue (%d)", r); @@ -497,7 +530,7 @@ static int mes_userq_preempt(struct amdgpu_usermode_queue *queue) *fence_ptr = 0; memset(&queue_input, 0x0, sizeof(struct mes_suspend_gang_input)); - queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ; + queue_input.gang_context_addr = ctx->gpu_addr; queue_input.suspend_fence_addr = fence_gpu_addr; queue_input.suspend_fence_value = 1; amdgpu_mes_lock(&adev->mes); @@ -534,7 +567,7 @@ static int mes_userq_restore(struct amdgpu_usermode_queue *queue) return 0; memset(&queue_input, 0x0, sizeof(struct mes_resume_gang_input)); - queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ; + queue_input.gang_context_addr = ctx->gpu_addr; amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->resume_gang(&adev->mes, &queue_input); @@ -549,7 +582,7 @@ const struct amdgpu_userq_funcs userq_mes_funcs = { .mqd_destroy = mes_userq_mqd_destroy, .unmap = mes_userq_unmap, .map = mes_userq_map, - .detect_and_reset = mes_userq_detect_and_reset, .preempt = mes_userq_preempt, .restore = mes_userq_restore, + .reset = mes_userq_reset, }; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.h 
b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.h index 090ae8897770..a473360d6a8b 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.h +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.h @@ -27,4 +27,13 @@ #include "amdgpu_userq.h" extern const struct amdgpu_userq_funcs userq_mes_funcs; + +int mes_userq_reset(struct amdgpu_usermode_queue *queue); +int mes_userq_reset_queue(struct amdgpu_device *adev, + struct amdgpu_usermode_queue *guilty_uq, + int queue_type, + unsigned int pipe, + unsigned int queue, + unsigned int db); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 1b071a3de173..8f136ff7d96f 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -387,6 +387,8 @@ static int mes_v11_0_remove_hw_queue(struct amdgpu_mes *mes, mes_remove_queue_pkt.doorbell_offset = input->doorbell_offset; mes_remove_queue_pkt.gang_context_addr = input->gang_context_addr; + mes_remove_queue_pkt.queue_type = + convert_to_mes_queue_type(input->queue_type); if (mes_rev >= 0x60) mes_remove_queue_pkt.remove_queue_after_reset = input->remove_queue_after_reset; @@ -396,6 +398,230 @@ static int mes_v11_0_remove_hw_queue(struct amdgpu_mes *mes, offsetof(union MESAPI__REMOVE_QUEUE, api_status)); } +static bool mes_v11_0_pipe_reset_support(struct amdgpu_device *adev) +{ + /* Disable the pipe reset until the CPFW fully supports it. */ + dev_warn_once(adev->dev, "The CPFW hasn't support pipe reset yet.\n"); + return false; +} +static int mes_v11_0_reset_gfx_pipe_mmio(struct amdgpu_device *adev, + u32 me, u32 pipe, u32 queue) +{ + uint32_t reset_pipe = 0, clean_pipe = 0; + int r; + + if (!mes_v11_0_pipe_reset_support(adev)) + return -EOPNOTSUPP; + + amdgpu_gfx_rlc_enter_safe_mode(adev, 0); + mutex_lock(&adev->srbm_mutex); + soc21_grbm_select(adev, me, pipe, queue, 0); + + switch (pipe) { + case 0: + reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, + PFP_PIPE0_RESET, 1); + reset_pipe =
REG_SET_FIELD(reset_pipe, CP_ME_CNTL, + ME_PIPE0_RESET, 1); + clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, + PFP_PIPE0_RESET, 0); + clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, + ME_PIPE0_RESET, 0); + break; + case 1: + reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, + PFP_PIPE1_RESET, 1); + reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, + ME_PIPE1_RESET, 1); + clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, + PFP_PIPE1_RESET, 0); + clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, + ME_PIPE1_RESET, 0); + break; + default: + break; + } + + WREG32_SOC15(GC, 0, regCP_ME_CNTL, reset_pipe); + WREG32_SOC15(GC, 0, regCP_ME_CNTL, clean_pipe); + + r = (RREG32(SOC15_REG_OFFSET(GC, 0, regCP_GFX_RS64_INSTR_PNTR1)) << 2) - + RS64_FW_UC_START_ADDR_LO; + soc21_grbm_select(adev, 0, 0, 0, 0); + mutex_unlock(&adev->srbm_mutex); + amdgpu_gfx_rlc_exit_safe_mode(adev, 0); + + dev_info(adev->dev, "The gfx pipe reset to the ME firmware start PC: %s\n", + r == 0 ? "successfully" : "failed"); + /* FIXME: Sometimes driver can't cache the ME firmware start PC correctly, + * so the pipe reset status relies on the later gfx ring test result. + */ + return 0; +} + +/* + * With MEC pipe reset asserted, clear CP_HQD_ACTIVE / CP_HQD_DEQUEUE_REQUEST for + * every queue on (me, pipe). HQDs must be torn down while pipe reset stays + * asserted; only then clear the pipe reset bit. + * Caller must hold adev->srbm_mutex. + */ +static void mes_v11_0_clear_hqds_on_mec_pipe(struct amdgpu_device *adev, u32 me, + u32 pipe) +{ + unsigned int q; + + for (q = 0; q < adev->gfx.mec.num_queue_per_pipe; q++) { + soc21_grbm_select(adev, me, pipe, q, 0); + /* Start from a clean HQD dequeue state before forcing HQD inactive. 
*/ + WREG32_SOC15(GC, 0, regCP_HQD_ACTIVE, 0); + WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0); + } +} + +static int mes_v11_0_reset_compute_pipe_mmio(struct amdgpu_device *adev, + u32 me, u32 pipe, u32 queue) +{ + uint32_t reset_val, clean_val; + int r; + + amdgpu_gfx_rlc_enter_safe_mode(adev, 0); + mutex_lock(&adev->srbm_mutex); + soc21_grbm_select(adev, me, pipe, queue, 0); + + if (adev->gfx.rs64_enable) { + reset_val = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL); + clean_val = reset_val; + + switch (pipe) { + case 0: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL, + MEC_PIPE0_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL, + MEC_PIPE0_RESET, 0); + break; + case 1: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL, + MEC_PIPE1_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL, + MEC_PIPE1_RESET, 0); + break; + case 2: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL, + MEC_PIPE2_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL, + MEC_PIPE2_RESET, 0); + break; + case 3: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL, + MEC_PIPE3_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL, + MEC_PIPE3_RESET, 0); + break; + default: + break; + } + WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_val); + mes_v11_0_clear_hqds_on_mec_pipe(adev, me, pipe); + WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_val); + r = (RREG32_SOC15(GC, 0, regCP_MEC_RS64_INSTR_PNTR) << 2) - + RS64_FW_UC_START_ADDR_LO; + } else { + reset_val = RREG32_SOC15(GC, 0, regCP_MEC_CNTL); + clean_val = reset_val; + + if (me == 1) { + switch (pipe) { + case 0: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL, + MEC_ME1_PIPE0_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL, + MEC_ME1_PIPE0_RESET, 0); + break; + case 1: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL, + MEC_ME1_PIPE1_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL, + MEC_ME1_PIPE1_RESET, 0); + break; + case 2: + 
reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL, + MEC_ME1_PIPE2_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL, + MEC_ME1_PIPE2_RESET, 0); + break; + case 3: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL, + MEC_ME1_PIPE3_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL, + MEC_ME1_PIPE3_RESET, 0); + break; + default: + break; + } + /* mec1 fw pc: CP_MEC1_INSTR_PNTR */ + } else { + switch (pipe) { + case 0: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL, + MEC_ME2_PIPE0_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL, + MEC_ME2_PIPE0_RESET, 0); + break; + case 1: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL, + MEC_ME2_PIPE1_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL, + MEC_ME2_PIPE1_RESET, 0); + break; + case 2: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL, + MEC_ME2_PIPE2_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL, + MEC_ME2_PIPE2_RESET, 0); + break; + case 3: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL, + MEC_ME2_PIPE3_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL, + MEC_ME2_PIPE3_RESET, 0); + break; + default: + break; + } + /* mec2 fw pc: CP_MEC2_INSTR_PNTR; NOTE(review): the readback after this if/else uses regCP_MEC1_INSTR_PNTR for both MEs - confirm intended */ + } + WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_val); + mes_v11_0_clear_hqds_on_mec_pipe(adev, me, pipe); + WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_val); + r = RREG32(SOC15_REG_OFFSET(GC, 0, regCP_MEC1_INSTR_PNTR)); + } + + soc21_grbm_select(adev, 0, 0, 0, 0); + mutex_unlock(&adev->srbm_mutex); + amdgpu_gfx_rlc_exit_safe_mode(adev, 0); + + dev_dbg(adev->dev, "MEC pipe me%u pipe%u queue%u resets to MEC FW start PC: %s\n", + me, pipe, queue, r == 0 ? "successfully" : "failed"); + /*FIXME:Sometimes driver can't cache the MEC firmware start PC correctly, so the pipe + * reset status relies on the compute ring test result.
+ */ + return 0; +} + +static int mes_v11_0_reset_pipe_mmio(struct amdgpu_mes *mes, uint32_t queue_type, + uint32_t me_id, uint32_t pipe_id, + uint32_t queue_id, uint32_t vmid) +{ + struct amdgpu_device *adev = mes->adev; + + if (queue_type == AMDGPU_RING_TYPE_GFX) + return mes_v11_0_reset_gfx_pipe_mmio(adev, me_id, pipe_id, queue_id); + else if (queue_type == AMDGPU_RING_TYPE_COMPUTE) + return mes_v11_0_reset_compute_pipe_mmio(adev, me_id, pipe_id, queue_id); + else + return -EOPNOTSUPP; +} + static int mes_v11_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_type, uint32_t me_id, uint32_t pipe_id, uint32_t queue_id, uint32_t vmid) @@ -770,10 +996,16 @@ static int mes_v11_0_reset_hw_queue(struct amdgpu_mes *mes, { union MESAPI__RESET mes_reset_queue_pkt; - if (input->use_mmio) - return mes_v11_0_reset_queue_mmio(mes, input->queue_type, - input->me_id, input->pipe_id, - input->queue_id, input->vmid); + if (input->use_mmio) { + int r = mes_v11_0_reset_queue_mmio(mes, input->queue_type, + input->me_id, input->pipe_id, + input->queue_id, input->vmid); + if (r) + return mes_v11_0_reset_pipe_mmio(mes, input->queue_type, + input->me_id, input->pipe_id, + input->queue_id, input->vmid); + return 0; + } memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt)); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index b6cbc25e1ab4..ce5064200743 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -26,7 +26,7 @@ #include "amdgpu.h" #include "gfx_v12_0.h" #include "soc15_common.h" -#include "soc21.h" +#include "soc24.h" #include "gc/gc_12_0_0_offset.h" #include "gc/gc_12_0_0_sh_mask.h" #include "gc/gc_11_0_0_default.h" @@ -371,6 +371,8 @@ static int mes_v12_0_remove_hw_queue(struct amdgpu_mes *mes, mes_remove_queue_pkt.doorbell_offset = input->doorbell_offset; mes_remove_queue_pkt.gang_context_addr = input->gang_context_addr; + mes_remove_queue_pkt.queue_type = + 
convert_to_mes_queue_type(input->queue_type); if (mes_rev >= 0x5a) mes_remove_queue_pkt.remove_queue_after_reset = input->remove_queue_after_reset; @@ -413,6 +415,171 @@ int gfx_v12_0_request_gfx_index_mutex(struct amdgpu_device *adev, return 0; } +static bool mes_v12_0_pipe_reset_support(struct amdgpu_device *adev) +{ + /* Disable the pipe reset until the CPFW fully support it.*/ + dev_warn_once(adev->dev, "The CPFW hasn't support pipe reset yet.\n"); + return false; +} + +static int mes_v12_0_reset_gfx_pipe_mmio(struct amdgpu_device *adev, + u32 me, u32 pipe, u32 queue) +{ + uint32_t reset_pipe = 0, clean_pipe = 0; + int r; + + if (!mes_v12_0_pipe_reset_support(adev)) + return -EOPNOTSUPP; + + amdgpu_gfx_rlc_enter_safe_mode(adev, 0); + mutex_lock(&adev->srbm_mutex); + soc24_grbm_select(adev, me, pipe, queue, 0); + + switch (pipe) { + case 0: + reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, + PFP_PIPE0_RESET, 1); + reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, + ME_PIPE0_RESET, 1); + clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, + PFP_PIPE0_RESET, 0); + clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, + ME_PIPE0_RESET, 0); + break; + case 1: + reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, + PFP_PIPE1_RESET, 1); + reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL, + ME_PIPE1_RESET, 1); + clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, + PFP_PIPE1_RESET, 0); + clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL, + ME_PIPE1_RESET, 0); + break; + default: + break; + } + + WREG32_SOC15(GC, 0, regCP_ME_CNTL, reset_pipe); + WREG32_SOC15(GC, 0, regCP_ME_CNTL, clean_pipe); + + r = (RREG32(SOC15_REG_OFFSET(GC, 0, regCP_GFX_RS64_INSTR_PNTR1)) << 2) - + RS64_FW_UC_START_ADDR_LO; + soc24_grbm_select(adev, 0, 0, 0, 0); + mutex_unlock(&adev->srbm_mutex); + amdgpu_gfx_rlc_exit_safe_mode(adev, 0); + + dev_info(adev->dev, "The gfx pipe reset: %s\n", + r == 0 ? 
"successfully" : "failed"); + /* Sometimes the ME start pc counter can't cache correctly, so the + * PC check only as a reference and pipe reset result rely on the + * later ring test. + */ + return 0; +} + +/* + * With MEC pipe reset asserted, clear CP_HQD_ACTIVE / CP_HQD_DEQUEUE_REQUEST for + * every queue on (me, pipe). HQDs must be torn down while pipe reset stays + * asserted; only then clear the pipe reset bit. + * Caller must hold adev->srbm_mutex. + */ +static void mes_v12_0_clear_hqds_on_mec_pipe(struct amdgpu_device *adev, u32 me, + u32 pipe) +{ + unsigned int q; + + for (q = 0; q < adev->gfx.mec.num_queue_per_pipe; q++) { + soc24_grbm_select(adev, me, pipe, q, 0); + /* Start from a clean HQD dequeue state before forcing HQD inactive. */ + WREG32_SOC15(GC, 0, regCP_HQD_ACTIVE, 0); + WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0); + } +} + +static int mes_v12_0_reset_compute_pipe_mmio(struct amdgpu_device *adev, + u32 me, u32 pipe, u32 queue) +{ + uint32_t reset_val, clean_val; + int r = 0; + + amdgpu_gfx_rlc_enter_safe_mode(adev, 0); + mutex_lock(&adev->srbm_mutex); + soc24_grbm_select(adev, me, pipe, queue, 0); + if (adev->gfx.rs64_enable) { + reset_val = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL); + clean_val = reset_val; + + switch (pipe) { + case 0: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL, + MEC_PIPE0_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL, + MEC_PIPE0_RESET, 0); + break; + case 1: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL, + MEC_PIPE1_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL, + MEC_PIPE1_RESET, 0); + break; + case 2: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL, + MEC_PIPE2_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL, + MEC_PIPE2_RESET, 0); + break; + case 3: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL, + MEC_PIPE3_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL, + MEC_PIPE3_RESET, 0); + break; + 
default: + break; + } + WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_val); + mes_v12_0_clear_hqds_on_mec_pipe(adev, me, pipe); + soc24_grbm_select(adev, me, pipe, queue, 0); + WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_val); + r = (RREG32_SOC15(GC, 0, regCP_MEC_RS64_INSTR_PNTR) << 2) - + RS64_FW_UC_START_ADDR_LO; + } else { + reset_val = RREG32_SOC15(GC, 0, regCP_MEC_CNTL); + clean_val = reset_val; + + switch (pipe) { + case 0: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL, + MEC_ME1_PIPE0_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL, + MEC_ME1_PIPE0_RESET, 0); + break; + case 1: + reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL, + MEC_ME1_PIPE1_RESET, 1); + clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL, + MEC_ME1_PIPE1_RESET, 0); + break; + default: + break; + } + + WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_val); + mes_v12_0_clear_hqds_on_mec_pipe(adev, me, pipe); + soc24_grbm_select(adev, me, pipe, queue, 0); + WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_val); + } + + soc24_grbm_select(adev, 0, 0, 0, 0); + mutex_unlock(&adev->srbm_mutex); + amdgpu_gfx_rlc_exit_safe_mode(adev, 0); + + dev_dbg(adev->dev, "MEC pipe me%u pipe%u queue%u resets to MEC FW start PC: %s\n", + me, pipe, queue, r == 0 ? 
"successfully" : "failed"); + return 0; +} + static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_type, uint32_t me_id, uint32_t pipe_id, uint32_t queue_id, uint32_t vmid) @@ -442,7 +609,7 @@ static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_typ mutex_unlock(&adev->gfx.reset_sem_mutex); mutex_lock(&adev->srbm_mutex); - soc21_grbm_select(adev, me_id, pipe_id, queue_id, 0); + soc24_grbm_select(adev, me_id, pipe_id, queue_id, 0); /* wait till dequeue take effects */ for (i = 0; i < adev->usec_timeout; i++) { if (!(RREG32_SOC15(GC, 0, regCP_GFX_HQD_ACTIVE) & 1)) @@ -454,13 +621,13 @@ static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_typ r = -ETIMEDOUT; } - soc21_grbm_select(adev, 0, 0, 0, 0); + soc24_grbm_select(adev, 0, 0, 0, 0); mutex_unlock(&adev->srbm_mutex); } else if (queue_type == AMDGPU_RING_TYPE_COMPUTE) { dev_info(adev->dev, "reset compute queue (%d:%d:%d)\n", me_id, pipe_id, queue_id); mutex_lock(&adev->srbm_mutex); - soc21_grbm_select(adev, me_id, pipe_id, queue_id, 0); + soc24_grbm_select(adev, me_id, pipe_id, queue_id, 0); WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0x2); WREG32_SOC15(GC, 0, regSPI_COMPUTE_QUEUE_RESET, 0x1); @@ -474,7 +641,7 @@ static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_typ dev_err(adev->dev, "failed to wait on hqd deactivate\n"); r = -ETIMEDOUT; } - soc21_grbm_select(adev, 0, 0, 0, 0); + soc24_grbm_select(adev, 0, 0, 0, 0); mutex_unlock(&adev->srbm_mutex); } else if (queue_type == AMDGPU_RING_TYPE_SDMA) { dev_info(adev->dev, "reset sdma queue (%d:%d:%d)\n", @@ -507,6 +674,20 @@ static int mes_v12_0_reset_queue_mmio(struct amdgpu_mes *mes, uint32_t queue_typ return r; } +static int mes_v12_0_reset_pipe_mmio(struct amdgpu_mes *mes, uint32_t queue_type, + uint32_t me_id, uint32_t pipe_id, + uint32_t queue_id, uint32_t vmid) +{ + struct amdgpu_device *adev = mes->adev; + + if (queue_type == AMDGPU_RING_TYPE_GFX) + return 
mes_v12_0_reset_gfx_pipe_mmio(adev, me_id, pipe_id, queue_id); + else if (queue_type == AMDGPU_RING_TYPE_COMPUTE) + return mes_v12_0_reset_compute_pipe_mmio(adev, me_id, pipe_id, queue_id); + else + return -EOPNOTSUPP; +} + static int mes_v12_0_map_legacy_queue(struct amdgpu_mes *mes, struct mes_map_legacy_queue_input *input) { @@ -528,10 +709,15 @@ static int mes_v12_0_map_legacy_queue(struct amdgpu_mes *mes, convert_to_mes_queue_type(input->queue_type); mes_add_queue_pkt.map_legacy_kq = 1; - if (mes->adev->enable_uni_mes) - pipe = AMDGPU_MES_KIQ_PIPE; - else + if (mes->adev->enable_uni_mes) { + /* Keep scheduler queue on KIQ pipe; map all other kernel queues on sched pipe. */ + if (input->queue_type == AMDGPU_RING_TYPE_MES) + pipe = AMDGPU_MES_KIQ_PIPE; + else + pipe = AMDGPU_MES_SCHED_PIPE; + } else { pipe = AMDGPU_MES_SCHED_PIPE; + } return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_add_queue_pkt, sizeof(mes_add_queue_pkt), @@ -565,12 +751,28 @@ static int mes_v12_0_unmap_legacy_queue(struct amdgpu_mes *mes, mes_remove_queue_pkt.unmap_legacy_queue = 1; mes_remove_queue_pkt.queue_type = convert_to_mes_queue_type(input->queue_type); + /* + * A reset-time unmap: the queue was already reset via MMIO while + * gangs are suspended and it is on the MES hung/fail list. Tell + * MES to just drop its internal state for it. Without this flag + * MES asks CP to unmap the already-reset (still wedged) queue + * again, which times out and forces a GPU reset. + */ + if (input->action == RESET_QUEUES && + (mes->sched_version & AMDGPU_MES_VERSION_MASK) >= 0x5a) + mes_remove_queue_pkt.remove_queue_after_reset = 1; + } - if (mes->adev->enable_uni_mes) - pipe = AMDGPU_MES_KIQ_PIPE; - else + if (mes->adev->enable_uni_mes) { + /* Keep scheduler queue on KIQ pipe; unmap all other kernel queues on sched pipe. 
*/ + if (input->queue_type == AMDGPU_RING_TYPE_MES) + pipe = AMDGPU_MES_KIQ_PIPE; + else + pipe = AMDGPU_MES_SCHED_PIPE; + } else { pipe = AMDGPU_MES_SCHED_PIPE; + } return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_remove_queue_pkt, sizeof(mes_remove_queue_pkt), @@ -888,10 +1090,16 @@ static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes, union MESAPI__RESET mes_reset_queue_pkt; int pipe; - if (input->use_mmio) - return mes_v12_0_reset_queue_mmio(mes, input->queue_type, - input->me_id, input->pipe_id, - input->queue_id, input->vmid); + if (input->use_mmio) { + int r = mes_v12_0_reset_queue_mmio(mes, input->queue_type, + input->me_id, input->pipe_id, + input->queue_id, input->vmid); + if (r) + return mes_v12_0_reset_pipe_mmio(mes, input->queue_type, + input->me_id, input->pipe_id, + input->queue_id, input->vmid); + return 0; + } memset(&mes_reset_queue_pkt, 0, sizeof(mes_reset_queue_pkt)); @@ -915,10 +1123,7 @@ static int mes_v12_0_reset_hw_queue(struct amdgpu_mes *mes, mes_reset_queue_pkt.doorbell_offset = input->doorbell_offset; } - if (input->is_kq) - pipe = AMDGPU_MES_KIQ_PIPE; - else - pipe = AMDGPU_MES_SCHED_PIPE; + pipe = AMDGPU_MES_SCHED_PIPE; return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_reset_queue_pkt, sizeof(mes_reset_queue_pkt), @@ -1094,7 +1299,7 @@ static void mes_v12_0_enable(struct amdgpu_device *adev, bool enable) if (enable) { mutex_lock(&adev->srbm_mutex); for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { - soc21_grbm_select(adev, 3, pipe, 0, 0); + soc24_grbm_select(adev, 3, pipe, 0, 0); if (amdgpu_mes_log_enable) { u32 log_size = AMDGPU_MES_LOG_BUFFER_SIZE + AMDGPU_MES_MSCRATCH_SIZE; /* In case uni mes is not enabled, only program for pipe 0 */ @@ -1133,7 +1338,7 @@ static void mes_v12_0_enable(struct amdgpu_device *adev, bool enable) WREG32_SOC15(GC, 0, regCP_MES_CNTL, data); } - soc21_grbm_select(adev, 0, 0, 0, 0); + soc24_grbm_select(adev, 0, 0, 0, 0); mutex_unlock(&adev->srbm_mutex); if 
(amdgpu_emu_mode) @@ -1165,7 +1370,7 @@ static void mes_v12_0_set_ucode_start_addr(struct amdgpu_device *adev) mutex_lock(&adev->srbm_mutex); for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { /* me=3, queue=0 */ - soc21_grbm_select(adev, 3, pipe, 0, 0); + soc24_grbm_select(adev, 3, pipe, 0, 0); /* set ucode start address */ ucode_addr = adev->mes.uc_start_addr[pipe] >> 2; @@ -1174,7 +1379,7 @@ static void mes_v12_0_set_ucode_start_addr(struct amdgpu_device *adev) WREG32_SOC15(GC, 0, regCP_MES_PRGRM_CNTR_START_HI, upper_32_bits(ucode_addr)); - soc21_grbm_select(adev, 0, 0, 0, 0); + soc24_grbm_select(adev, 0, 0, 0, 0); } mutex_unlock(&adev->srbm_mutex); } @@ -1203,7 +1408,7 @@ static int mes_v12_0_load_microcode(struct amdgpu_device *adev, mutex_lock(&adev->srbm_mutex); /* me=3, pipe=0, queue=0 */ - soc21_grbm_select(adev, 3, pipe, 0, 0); + soc24_grbm_select(adev, 3, pipe, 0, 0); WREG32_SOC15(GC, 0, regCP_MES_IC_BASE_CNTL, 0); @@ -1238,7 +1443,7 @@ static int mes_v12_0_load_microcode(struct amdgpu_device *adev, WREG32_SOC15(GC, 0, regCP_MES_IC_OP_CNTL, data); } - soc21_grbm_select(adev, 0, 0, 0, 0); + soc24_grbm_select(adev, 0, 0, 0, 0); mutex_unlock(&adev->srbm_mutex); return 0; @@ -1385,7 +1590,7 @@ static void mes_v12_0_queue_init_register(struct amdgpu_ring *ring) uint32_t data = 0; mutex_lock(&adev->srbm_mutex); - soc21_grbm_select(adev, 3, ring->pipe, 0, 0); + soc24_grbm_select(adev, 3, ring->pipe, 0, 0); /* set CP_HQD_VMID.VMID = 0. 
*/ data = RREG32_SOC15(GC, 0, regCP_HQD_VMID); @@ -1436,7 +1641,7 @@ static void mes_v12_0_queue_init_register(struct amdgpu_ring *ring) /* set CP_HQD_ACTIVE.ACTIVE=1 */ WREG32_SOC15(GC, 0, regCP_HQD_ACTIVE, mqd->cp_hqd_active); - soc21_grbm_select(adev, 0, 0, 0, 0); + soc24_grbm_select(adev, 0, 0, 0, 0); mutex_unlock(&adev->srbm_mutex); } @@ -1502,14 +1707,14 @@ static int mes_v12_0_queue_init(struct amdgpu_device *adev, ((pipe == AMDGPU_MES_KIQ_PIPE) && !adev->mes.kiq_version)) { /* get MES scheduler/KIQ versions */ mutex_lock(&adev->srbm_mutex); - soc21_grbm_select(adev, 3, pipe, 0, 0); + soc24_grbm_select(adev, 3, pipe, 0, 0); if (pipe == AMDGPU_MES_SCHED_PIPE) adev->mes.sched_version = RREG32_SOC15(GC, 0, regCP_MES_GP3_LO); else if (pipe == AMDGPU_MES_KIQ_PIPE && adev->enable_mes_kiq) adev->mes.kiq_version = RREG32_SOC15(GC, 0, regCP_MES_GP3_LO); - soc21_grbm_select(adev, 0, 0, 0, 0); + soc24_grbm_select(adev, 0, 0, 0, 0); mutex_unlock(&adev->srbm_mutex); } @@ -1697,7 +1902,7 @@ static void mes_v12_0_kiq_dequeue_sched(struct amdgpu_device *adev) int i; mutex_lock(&adev->srbm_mutex); - soc21_grbm_select(adev, 3, AMDGPU_MES_SCHED_PIPE, 0, 0); + soc24_grbm_select(adev, 3, AMDGPU_MES_SCHED_PIPE, 0, 0); /* disable the queue if it's active */ if (RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1) { @@ -1721,7 +1926,7 @@ static void mes_v12_0_kiq_dequeue_sched(struct amdgpu_device *adev) WREG32_SOC15(GC, 0, regCP_HQD_PQ_WPTR_HI, 0); WREG32_SOC15(GC, 0, regCP_HQD_PQ_RPTR, 0); - soc21_grbm_select(adev, 0, 0, 0, 0); + soc24_grbm_select(adev, 0, 0, 0, 0); mutex_unlock(&adev->srbm_mutex); adev->mes.ring[0].sched.ready = false; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c index e13535d94c51..f7d5879c6e44 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c @@ -362,6 +362,8 @@ static int mes_v12_1_remove_hw_queue(struct amdgpu_mes *mes, mes_remove_queue_pkt.doorbell_offset = 
input->doorbell_offset; mes_remove_queue_pkt.gang_context_addr = input->gang_context_addr; + mes_remove_queue_pkt.queue_type = + convert_to_mes_queue_type(input->queue_type); return mes_v12_1_submit_pkt_and_poll_completion(mes, xcc_id, AMDGPU_MES_SCHED_PIPE, @@ -417,10 +419,15 @@ static int mes_v12_1_map_legacy_queue(struct amdgpu_mes *mes, convert_to_mes_queue_type(input->queue_type); mes_add_queue_pkt.map_legacy_kq = 1; - if (mes->adev->enable_uni_mes) - pipe = AMDGPU_MES_KIQ_PIPE; - else + if (mes->adev->enable_uni_mes) { + /* Keep scheduler queue on KIQ pipe; map all other kernel queues on sched pipe. */ + if (input->queue_type == AMDGPU_RING_TYPE_MES) + pipe = AMDGPU_MES_KIQ_PIPE; + else + pipe = AMDGPU_MES_SCHED_PIPE; + } else { pipe = AMDGPU_MES_SCHED_PIPE; + } return mes_v12_1_submit_pkt_and_poll_completion(mes, input->xcc_id, pipe, @@ -457,10 +464,15 @@ static int mes_v12_1_unmap_legacy_queue(struct amdgpu_mes *mes, convert_to_mes_queue_type(input->queue_type); } - if (mes->adev->enable_uni_mes) - pipe = AMDGPU_MES_KIQ_PIPE; - else + if (mes->adev->enable_uni_mes) { + /* Keep scheduler queue on KIQ pipe; map all other kernel queues on sched pipe. 
*/ + if (input->queue_type == AMDGPU_RING_TYPE_MES) + pipe = AMDGPU_MES_KIQ_PIPE; + else + pipe = AMDGPU_MES_SCHED_PIPE; + } else { pipe = AMDGPU_MES_SCHED_PIPE; + } return mes_v12_1_submit_pkt_and_poll_completion(mes, input->xcc_id, pipe, @@ -2262,6 +2274,7 @@ static int mes_v12_1_test_queue(struct amdgpu_device *adev, int xcc_id, remove_queue.xcc_id = xcc_id; remove_queue.doorbell_offset = doorbell_idx; remove_queue.gang_context_addr = add_queue.gang_context_addr; + remove_queue.queue_type = queue_type; r = mes_v12_1_remove_hw_queue(&adev->mes, &remove_queue); error: diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c index cc688ae79e84..47d07cd25fc4 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c @@ -29,7 +29,6 @@ #include "soc15_common.h" #include "soc15.h" -#include "amdgpu_ras.h" #include "amdgpu_psp.h" #define regVM_L2_CNTL3_DEFAULT 0x80100007 @@ -636,236 +635,8 @@ const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = { .get_clockgating = mmhub_v1_8_get_clockgating, }; -static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ce_reg_list[] = { - {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA0_CE_ERR_STATUS_LO, regMMEA0_CE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA0"}, - {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA1_CE_ERR_STATUS_LO, regMMEA1_CE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA1"}, - {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA2_CE_ERR_STATUS_LO, regMMEA2_CE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA2"}, - {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA3_CE_ERR_STATUS_LO, regMMEA3_CE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA3"}, - {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA4_CE_ERR_STATUS_LO, regMMEA4_CE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA4"}, - 
{AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMM_CANE_CE_ERR_STATUS_LO, regMM_CANE_CE_ERR_STATUS_HI), - 1, 0, "MM_CANE"}, -}; - -static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ue_reg_list[] = { - {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA0_UE_ERR_STATUS_LO, regMMEA0_UE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA0"}, - {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA1_UE_ERR_STATUS_LO, regMMEA1_UE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA1"}, - {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA2_UE_ERR_STATUS_LO, regMMEA2_UE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA2"}, - {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA3_UE_ERR_STATUS_LO, regMMEA3_UE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA3"}, - {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMMEA4_UE_ERR_STATUS_LO, regMMEA4_UE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "MMEA4"}, - {AMDGPU_RAS_REG_ENTRY(MMHUB, 0, regMM_CANE_UE_ERR_STATUS_LO, regMM_CANE_UE_ERR_STATUS_HI), - 1, 0, "MM_CANE"}, -}; - -static const struct amdgpu_ras_memory_id_entry mmhub_v1_8_ras_memory_list[] = { - {AMDGPU_MMHUB_WGMI_PAGEMEM, "MMEA_WGMI_PAGEMEM"}, - {AMDGPU_MMHUB_RGMI_PAGEMEM, "MMEA_RGMI_PAGEMEM"}, - {AMDGPU_MMHUB_WDRAM_PAGEMEM, "MMEA_WDRAM_PAGEMEM"}, - {AMDGPU_MMHUB_RDRAM_PAGEMEM, "MMEA_RDRAM_PAGEMEM"}, - {AMDGPU_MMHUB_WIO_CMDMEM, "MMEA_WIO_CMDMEM"}, - {AMDGPU_MMHUB_RIO_CMDMEM, "MMEA_RIO_CMDMEM"}, - {AMDGPU_MMHUB_WGMI_CMDMEM, "MMEA_WGMI_CMDMEM"}, - {AMDGPU_MMHUB_RGMI_CMDMEM, "MMEA_RGMI_CMDMEM"}, - {AMDGPU_MMHUB_WDRAM_CMDMEM, "MMEA_WDRAM_CMDMEM"}, - {AMDGPU_MMHUB_RDRAM_CMDMEM, "MMEA_RDRAM_CMDMEM"}, - {AMDGPU_MMHUB_MAM_DMEM0, "MMEA_MAM_DMEM0"}, - {AMDGPU_MMHUB_MAM_DMEM1, "MMEA_MAM_DMEM1"}, - {AMDGPU_MMHUB_MAM_DMEM2, "MMEA_MAM_DMEM2"}, - {AMDGPU_MMHUB_MAM_DMEM3, "MMEA_MAM_DMEM3"}, - {AMDGPU_MMHUB_WRET_TAGMEM, "MMEA_WRET_TAGMEM"}, - {AMDGPU_MMHUB_RRET_TAGMEM, 
"MMEA_RRET_TAGMEM"}, - {AMDGPU_MMHUB_WIO_DATAMEM, "MMEA_WIO_DATAMEM"}, - {AMDGPU_MMHUB_WGMI_DATAMEM, "MMEA_WGMI_DATAMEM"}, - {AMDGPU_MMHUB_WDRAM_DATAMEM, "MMEA_WDRAM_DATAMEM"}, -}; - -static void mmhub_v1_8_inst_query_ras_error_count(struct amdgpu_device *adev, - uint32_t mmhub_inst, - void *ras_err_status) -{ - struct ras_err_data *err_data = (struct ras_err_data *)ras_err_status; - unsigned long ue_count = 0, ce_count = 0; - - /* NOTE: mmhub is converted by aid_mask and the range is 0-3, - * which can be used as die ID directly */ - struct amdgpu_smuio_mcm_config_info mcm_info = { - .socket_id = adev->smuio.funcs->get_socket_id(adev), - .die_id = mmhub_inst, - }; - - amdgpu_ras_inst_query_ras_error_count(adev, - mmhub_v1_8_ce_reg_list, - ARRAY_SIZE(mmhub_v1_8_ce_reg_list), - mmhub_v1_8_ras_memory_list, - ARRAY_SIZE(mmhub_v1_8_ras_memory_list), - mmhub_inst, - AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, - &ce_count); - amdgpu_ras_inst_query_ras_error_count(adev, - mmhub_v1_8_ue_reg_list, - ARRAY_SIZE(mmhub_v1_8_ue_reg_list), - mmhub_v1_8_ras_memory_list, - ARRAY_SIZE(mmhub_v1_8_ras_memory_list), - mmhub_inst, - AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, - &ue_count); - - amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count); - amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count); -} - -static void mmhub_v1_8_query_ras_error_count(struct amdgpu_device *adev, - void *ras_err_status) -{ - uint32_t inst_mask; - uint32_t i; - - if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB)) { - dev_warn(adev->dev, "MMHUB RAS is not supported\n"); - return; - } - - inst_mask = adev->aid_mask; - for_each_inst(i, inst_mask) - mmhub_v1_8_inst_query_ras_error_count(adev, i, ras_err_status); -} - -static void mmhub_v1_8_inst_reset_ras_error_count(struct amdgpu_device *adev, - uint32_t mmhub_inst) -{ - amdgpu_ras_inst_reset_ras_error_count(adev, - mmhub_v1_8_ce_reg_list, - ARRAY_SIZE(mmhub_v1_8_ce_reg_list), - mmhub_inst); - 
amdgpu_ras_inst_reset_ras_error_count(adev, - mmhub_v1_8_ue_reg_list, - ARRAY_SIZE(mmhub_v1_8_ue_reg_list), - mmhub_inst); -} - -static void mmhub_v1_8_reset_ras_error_count(struct amdgpu_device *adev) -{ - uint32_t inst_mask; - uint32_t i; - - if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB)) { - dev_warn(adev->dev, "MMHUB RAS is not supported\n"); - return; - } - - inst_mask = adev->aid_mask; - for_each_inst(i, inst_mask) - mmhub_v1_8_inst_reset_ras_error_count(adev, i); -} - -static const struct amdgpu_ras_block_hw_ops mmhub_v1_8_ras_hw_ops = { - .query_ras_error_count = mmhub_v1_8_query_ras_error_count, - .reset_ras_error_count = mmhub_v1_8_reset_ras_error_count, -}; - -static int mmhub_v1_8_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - struct aca_bank_info info; - u64 misc0; - int ret; - - ret = aca_bank_info_decode(bank, &info); - if (ret) - return ret; - - misc0 = bank->regs[ACA_REG_IDX_MISC0]; - switch (type) { - case ACA_SMU_TYPE_UE: - bank->aca_err_type = ACA_ERROR_TYPE_UE; - ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, - 1ULL); - break; - case ACA_SMU_TYPE_CE: - bank->aca_err_type = ACA_ERROR_TYPE_CE; - ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, - ACA_REG__MISC0__ERRCNT(misc0)); - break; - default: - return -EINVAL; - } - - return ret; -} - -/* reference to smu driver if header file */ -static int mmhub_v1_8_err_codes[] = { - 0, 1, 2, 3, 4, /* CODE_DAGB0 - 4 */ - 5, 6, 7, 8, 9, /* CODE_EA0 - 4 */ - 10, /* CODE_UTCL2_ROUTER */ - 11, /* CODE_VML2 */ - 12, /* CODE_VML2_WALKER */ - 13, /* CODE_MMCANE */ -}; - -static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - u32 instlo; - - instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]); - instlo &= GENMASK(31, 1); - - if (instlo != mmSMNAID_AID0_MCA_SMU) - return false; - - if 
(aca_bank_check_error_codes(handle->adev, bank, - mmhub_v1_8_err_codes, - ARRAY_SIZE(mmhub_v1_8_err_codes))) - return false; - - return true; -} - -static const struct aca_bank_ops mmhub_v1_8_aca_bank_ops = { - .aca_bank_parser = mmhub_v1_8_aca_bank_parser, - .aca_bank_is_valid = mmhub_v1_8_aca_bank_is_valid, -}; - -static const struct aca_info mmhub_v1_8_aca_info = { - .hwip = ACA_HWIP_TYPE_SMU, - .mask = ACA_ERROR_UE_MASK, - .bank_ops = &mmhub_v1_8_aca_bank_ops, -}; - -static int mmhub_v1_8_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) -{ - int r; - - r = amdgpu_ras_block_late_init(adev, ras_block); - if (r) - return r; - - r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__MMHUB, - &mmhub_v1_8_aca_info, NULL); - if (r) - goto late_fini; - - return 0; - -late_fini: - amdgpu_ras_block_late_fini(adev, ras_block); - - return r; -} - struct amdgpu_mmhub_ras mmhub_v1_8_ras = { .ras_block = { - .hw_ops = &mmhub_v1_8_ras_hw_ops, - .ras_late_init = mmhub_v1_8_ras_late_init, + .hw_ops = NULL, }, }; diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index 72edf5326b05..77557ee3ca16 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -507,11 +507,6 @@ void nv_set_virt_ops(struct amdgpu_device *adev) adev->virt.ops = &xgpu_nv_virt_ops; } -static bool nv_need_full_reset(struct amdgpu_device *adev) -{ - return true; -} - static bool nv_need_reset_on_init(struct amdgpu_device *adev) { u32 sol_reg; @@ -595,7 +590,6 @@ static const struct amdgpu_asic_funcs nv_asic_funcs = { .set_vce_clocks = &nv_set_vce_clocks, .get_config_memsize = &nv_get_config_memsize, .init_doorbell_index = &nv_init_doorbell_index, - .need_full_reset = &nv_need_full_reset, .need_reset_on_init = &nv_need_reset_on_init, .get_pcie_replay_count = &amdgpu_nbio_get_pcie_replay_count, .supports_baco = &amdgpu_dpm_is_baco_supported, diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c index 
3fde9be74690..c2d098cd72ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c @@ -1237,65 +1237,6 @@ static int sdma_v3_0_wait_for_idle(struct amdgpu_ip_block *ip_block) return -ETIMEDOUT; } -static bool sdma_v3_0_check_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - u32 srbm_soft_reset = 0; - u32 tmp = RREG32(mmSRBM_STATUS2); - - if ((tmp & SRBM_STATUS2__SDMA_BUSY_MASK) || - (tmp & SRBM_STATUS2__SDMA1_BUSY_MASK)) { - srbm_soft_reset |= SRBM_SOFT_RESET__SOFT_RESET_SDMA_MASK; - srbm_soft_reset |= SRBM_SOFT_RESET__SOFT_RESET_SDMA1_MASK; - } - - if (srbm_soft_reset) { - adev->sdma.srbm_soft_reset = srbm_soft_reset; - return true; - } else { - adev->sdma.srbm_soft_reset = 0; - return false; - } -} - -static int sdma_v3_0_pre_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - u32 srbm_soft_reset = 0; - - if (!adev->sdma.srbm_soft_reset) - return 0; - - srbm_soft_reset = adev->sdma.srbm_soft_reset; - - if (REG_GET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_SDMA) || - REG_GET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_SDMA1)) { - sdma_v3_0_ctx_switch_enable(adev, false); - sdma_v3_0_enable(adev, false); - } - - return 0; -} - -static int sdma_v3_0_post_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - u32 srbm_soft_reset = 0; - - if (!adev->sdma.srbm_soft_reset) - return 0; - - srbm_soft_reset = adev->sdma.srbm_soft_reset; - - if (REG_GET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_SDMA) || - REG_GET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_SDMA1)) { - sdma_v3_0_gfx_resume(adev); - sdma_v3_0_rlc_resume(adev); - } - - return 0; -} - static int sdma_v3_0_soft_reset(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; @@ -1552,9 +1493,6 @@ static const struct amd_ip_funcs sdma_v3_0_ip_funcs = { .resume = sdma_v3_0_resume, .is_idle 
= sdma_v3_0_is_idle, .wait_for_idle = sdma_v3_0_wait_for_idle, - .check_soft_reset = sdma_v3_0_check_soft_reset, - .pre_soft_reset = sdma_v3_0_pre_soft_reset, - .post_soft_reset = sdma_v3_0_post_soft_reset, .soft_reset = sdma_v3_0_soft_reset, .set_clockgating_state = sdma_v3_0_set_clockgating_state, .set_powergating_state = sdma_v3_0_set_powergating_state, diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 8652928861ad..484f1a6b5fbc 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -95,8 +95,6 @@ static const struct amdgpu_hwip_reg_entry sdma_reg_list_4_4_2[] = { SOC15_REG_ENTRY_STR(GC, 0, regSDMA_VM_CNTL) }; -#define mmSMNAID_AID0_MCA_SMU 0x03b30400 - #define WREG32_SDMA(instance, offset, value) \ WREG32(sdma_v4_4_2_get_reg_offset(adev, (instance), (offset)), value) #define RREG32_SDMA(instance, offset) \ @@ -1359,6 +1357,19 @@ static int sdma_v4_4_2_early_init(struct amdgpu_ip_block *ip_block) struct amdgpu_device *adev = ip_block->adev; int r; + switch (amdgpu_user_queue) { + case -1: + case 0: + default: + adev->sdma.no_user_submission = false; + adev->sdma.disable_uq = true; + break; + case 2: + adev->sdma.no_user_submission = true; + adev->sdma.disable_uq = true; + break; + } + r = sdma_v4_4_2_init_microcode(adev); if (r) return r; @@ -1478,6 +1489,7 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block) /* doorbell size is 2 dwords, get DWORD offset */ ring->doorbell_index = adev->doorbell_index.sdma_engine[i] << 1; ring->vm_hub = AMDGPU_MMHUB0(aid_id); + ring->no_user_submission = adev->sdma.no_user_submission; sprintf(ring->name, "sdma%d.%d", aid_id, i % adev->sdma.num_inst_per_aid); @@ -2404,187 +2416,9 @@ struct amdgpu_xcp_ip_funcs sdma_v4_4_2_xcp_funcs = { .resume = &sdma_v4_4_2_xcp_resume }; -static const struct amdgpu_ras_err_status_reg_entry sdma_v4_2_2_ue_reg_list[] = { - {AMDGPU_RAS_REG_ENTRY(SDMA0, 0, regSDMA_UE_ERR_STATUS_LO, 
regSDMA_UE_ERR_STATUS_HI), - 1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), "SDMA"}, -}; - -static const struct amdgpu_ras_memory_id_entry sdma_v4_4_2_ras_memory_list[] = { - {AMDGPU_SDMA_MBANK_DATA_BUF0, "SDMA_MBANK_DATA_BUF0"}, - {AMDGPU_SDMA_MBANK_DATA_BUF1, "SDMA_MBANK_DATA_BUF1"}, - {AMDGPU_SDMA_MBANK_DATA_BUF2, "SDMA_MBANK_DATA_BUF2"}, - {AMDGPU_SDMA_MBANK_DATA_BUF3, "SDMA_MBANK_DATA_BUF3"}, - {AMDGPU_SDMA_MBANK_DATA_BUF4, "SDMA_MBANK_DATA_BUF4"}, - {AMDGPU_SDMA_MBANK_DATA_BUF5, "SDMA_MBANK_DATA_BUF5"}, - {AMDGPU_SDMA_MBANK_DATA_BUF6, "SDMA_MBANK_DATA_BUF6"}, - {AMDGPU_SDMA_MBANK_DATA_BUF7, "SDMA_MBANK_DATA_BUF7"}, - {AMDGPU_SDMA_MBANK_DATA_BUF8, "SDMA_MBANK_DATA_BUF8"}, - {AMDGPU_SDMA_MBANK_DATA_BUF9, "SDMA_MBANK_DATA_BUF9"}, - {AMDGPU_SDMA_MBANK_DATA_BUF10, "SDMA_MBANK_DATA_BUF10"}, - {AMDGPU_SDMA_MBANK_DATA_BUF11, "SDMA_MBANK_DATA_BUF11"}, - {AMDGPU_SDMA_MBANK_DATA_BUF12, "SDMA_MBANK_DATA_BUF12"}, - {AMDGPU_SDMA_MBANK_DATA_BUF13, "SDMA_MBANK_DATA_BUF13"}, - {AMDGPU_SDMA_MBANK_DATA_BUF14, "SDMA_MBANK_DATA_BUF14"}, - {AMDGPU_SDMA_MBANK_DATA_BUF15, "SDMA_MBANK_DATA_BUF15"}, - {AMDGPU_SDMA_UCODE_BUF, "SDMA_UCODE_BUF"}, - {AMDGPU_SDMA_RB_CMD_BUF, "SDMA_RB_CMD_BUF"}, - {AMDGPU_SDMA_IB_CMD_BUF, "SDMA_IB_CMD_BUF"}, - {AMDGPU_SDMA_UTCL1_RD_FIFO, "SDMA_UTCL1_RD_FIFO"}, - {AMDGPU_SDMA_UTCL1_RDBST_FIFO, "SDMA_UTCL1_RDBST_FIFO"}, - {AMDGPU_SDMA_UTCL1_WR_FIFO, "SDMA_UTCL1_WR_FIFO"}, - {AMDGPU_SDMA_DATA_LUT_FIFO, "SDMA_DATA_LUT_FIFO"}, - {AMDGPU_SDMA_SPLIT_DAT_BUF, "SDMA_SPLIT_DAT_BUF"}, -}; - -static void sdma_v4_4_2_inst_query_ras_error_count(struct amdgpu_device *adev, - uint32_t sdma_inst, - void *ras_err_status) -{ - struct ras_err_data *err_data = (struct ras_err_data *)ras_err_status; - uint32_t sdma_dev_inst = GET_INST(SDMA0, sdma_inst); - unsigned long ue_count = 0; - struct amdgpu_smuio_mcm_config_info mcm_info = { - .socket_id = adev->smuio.funcs->get_socket_id(adev), - .die_id = adev->sdma.instance[sdma_inst].aid_id, - }; - - /* sdma v4_4_2 
doesn't support query ce counts */ - amdgpu_ras_inst_query_ras_error_count(adev, - sdma_v4_2_2_ue_reg_list, - ARRAY_SIZE(sdma_v4_2_2_ue_reg_list), - sdma_v4_4_2_ras_memory_list, - ARRAY_SIZE(sdma_v4_4_2_ras_memory_list), - sdma_dev_inst, - AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, - &ue_count); - - amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count); -} - -static void sdma_v4_4_2_query_ras_error_count(struct amdgpu_device *adev, - void *ras_err_status) -{ - uint32_t inst_mask; - int i = 0; - - inst_mask = GENMASK(adev->sdma.num_instances - 1, 0); - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) { - for_each_inst(i, inst_mask) - sdma_v4_4_2_inst_query_ras_error_count(adev, i, ras_err_status); - } else { - dev_warn(adev->dev, "SDMA RAS is not supported\n"); - } -} - -static void sdma_v4_4_2_inst_reset_ras_error_count(struct amdgpu_device *adev, - uint32_t sdma_inst) -{ - uint32_t sdma_dev_inst = GET_INST(SDMA0, sdma_inst); - - amdgpu_ras_inst_reset_ras_error_count(adev, - sdma_v4_2_2_ue_reg_list, - ARRAY_SIZE(sdma_v4_2_2_ue_reg_list), - sdma_dev_inst); -} - -static void sdma_v4_4_2_reset_ras_error_count(struct amdgpu_device *adev) -{ - uint32_t inst_mask; - int i = 0; - - inst_mask = GENMASK(adev->sdma.num_instances - 1, 0); - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) { - for_each_inst(i, inst_mask) - sdma_v4_4_2_inst_reset_ras_error_count(adev, i); - } else { - dev_warn(adev->dev, "SDMA RAS is not supported\n"); - } -} - -static const struct amdgpu_ras_block_hw_ops sdma_v4_4_2_ras_hw_ops = { - .query_ras_error_count = sdma_v4_4_2_query_ras_error_count, - .reset_ras_error_count = sdma_v4_4_2_reset_ras_error_count, -}; - -static int sdma_v4_4_2_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - struct aca_bank_info info; - u64 misc0; - int ret; - - ret = aca_bank_info_decode(bank, &info); - if (ret) - return ret; - - misc0 = bank->regs[ACA_REG_IDX_MISC0]; - switch (type) 
{ - case ACA_SMU_TYPE_UE: - bank->aca_err_type = ACA_ERROR_TYPE_UE; - ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, - 1ULL); - break; - case ACA_SMU_TYPE_CE: - bank->aca_err_type = ACA_ERROR_TYPE_CE; - ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, - ACA_REG__MISC0__ERRCNT(misc0)); - break; - default: - return -EINVAL; - } - - return ret; -} - -/* CODE_SDMA0 - CODE_SDMA4, reference to smu driver if header file */ -static int sdma_v4_4_2_err_codes[] = { 33, 34, 35, 36 }; - -static bool sdma_v4_4_2_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - u32 instlo; - - instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]); - instlo &= GENMASK(31, 1); - - if (instlo != mmSMNAID_AID0_MCA_SMU) - return false; - - if (aca_bank_check_error_codes(handle->adev, bank, - sdma_v4_4_2_err_codes, - ARRAY_SIZE(sdma_v4_4_2_err_codes))) - return false; - - return true; -} - -static const struct aca_bank_ops sdma_v4_4_2_aca_bank_ops = { - .aca_bank_parser = sdma_v4_4_2_aca_bank_parser, - .aca_bank_is_valid = sdma_v4_4_2_aca_bank_is_valid, -}; - -static const struct aca_info sdma_v4_4_2_aca_info = { - .hwip = ACA_HWIP_TYPE_SMU, - .mask = ACA_ERROR_UE_MASK, - .bank_ops = &sdma_v4_4_2_aca_bank_ops, -}; - -static int sdma_v4_4_2_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) -{ - int r; - - r = amdgpu_sdma_ras_late_init(adev, ras_block); - if (r) - return r; - - return amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__SDMA, - &sdma_v4_4_2_aca_info, NULL); -} - static struct amdgpu_sdma_ras sdma_v4_4_2_ras = { .ras_block = { - .hw_ops = &sdma_v4_4_2_ras_hw_ops, - .ras_late_init = sdma_v4_4_2_ras_late_init, + .hw_ops = NULL, }, }; diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index d7537888e60c..7a3f1a60b014 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c 
@@ -793,23 +793,6 @@ static int sdma_v6_0_soft_reset(struct amdgpu_ip_block *ip_block) return sdma_v6_0_start(adev); } -static bool sdma_v6_0_check_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - struct amdgpu_ring *ring; - int i, r; - long tmo = msecs_to_jiffies(1000); - - for (i = 0; i < adev->sdma.num_instances; i++) { - ring = &adev->sdma.instance[i].ring; - r = amdgpu_ring_test_ib(ring, tmo); - if (r) - return true; - } - - return false; -} - /** * sdma_v6_0_start - setup and start the async dma engines * @@ -1747,7 +1730,6 @@ const struct amd_ip_funcs sdma_v6_0_ip_funcs = { .is_idle = sdma_v6_0_is_idle, .wait_for_idle = sdma_v6_0_wait_for_idle, .soft_reset = sdma_v6_0_soft_reset, - .check_soft_reset = sdma_v6_0_check_soft_reset, .set_clockgating_state = sdma_v6_0_set_clockgating_state, .set_powergating_state = sdma_v6_0_set_powergating_state, .get_clockgating_state = sdma_v6_0_get_clockgating_state, diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c index 49c57a38151b..84305b6800fe 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c @@ -784,23 +784,6 @@ static int sdma_v7_0_soft_reset(struct amdgpu_ip_block *ip_block) return sdma_v7_0_start(adev); } -static bool sdma_v7_0_check_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - struct amdgpu_ring *ring; - int i, r; - long tmo = msecs_to_jiffies(1000); - - for (i = 0; i < adev->sdma.num_instances; i++) { - ring = &adev->sdma.instance[i].ring; - r = amdgpu_ring_test_ib(ring, tmo); - if (r) - return true; - } - - return false; -} - static int sdma_v7_0_reset_queue(struct amdgpu_ring *ring, unsigned int vmid, struct amdgpu_fence *timedout_fence) @@ -1679,7 +1662,6 @@ const struct amd_ip_funcs sdma_v7_0_ip_funcs = { .is_idle = sdma_v7_0_is_idle, .wait_for_idle = sdma_v7_0_wait_for_idle, .soft_reset = sdma_v7_0_soft_reset, - 
.check_soft_reset = sdma_v7_0_check_soft_reset, .set_clockgating_state = sdma_v7_0_set_clockgating_state, .set_powergating_state = sdma_v7_0_set_powergating_state, .get_clockgating_state = sdma_v7_0_get_clockgating_state, diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_1.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_1.c index b06001f6b536..322e6f4dd121 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_1.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_1.c @@ -775,23 +775,6 @@ static int sdma_v7_1_soft_reset(struct amdgpu_ip_block *ip_block) return sdma_v7_1_inst_start(adev, inst_mask); } -static bool sdma_v7_1_check_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - struct amdgpu_ring *ring; - int i, r; - long tmo = msecs_to_jiffies(1000); - - for (i = 0; i < adev->sdma.num_instances; i++) { - ring = &adev->sdma.instance[i].ring; - r = amdgpu_ring_test_ib(ring, tmo); - if (r) - return true; - } - - return false; -} - static int sdma_v7_1_reset_queue(struct amdgpu_ring *ring, unsigned int vmid, struct amdgpu_fence *timedout_fence) @@ -1644,7 +1627,6 @@ const struct amd_ip_funcs sdma_v7_1_ip_funcs = { .is_idle = sdma_v7_1_is_idle, .wait_for_idle = sdma_v7_1_wait_for_idle, .soft_reset = sdma_v7_1_soft_reset, - .check_soft_reset = sdma_v7_1_check_soft_reset, .set_clockgating_state = sdma_v7_1_set_clockgating_state, .set_powergating_state = sdma_v7_1_set_powergating_state, .get_clockgating_state = sdma_v7_1_get_clockgating_state, diff --git a/drivers/gpu/drm/amd/amdgpu/si.c b/drivers/gpu/drm/amd/amdgpu/si.c index c26cb3e8bff6..b104469c38ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/si.c +++ b/drivers/gpu/drm/amd/amdgpu/si.c @@ -1509,12 +1509,6 @@ static void si_invalidate_hdp(struct amdgpu_device *adev, } } -static bool si_need_full_reset(struct amdgpu_device *adev) -{ - /* change this when we support soft reset */ - return true; -} - static bool si_need_reset_on_init(struct amdgpu_device *adev) { return false; @@ -2019,7 +2013,6 @@ static 
const struct amdgpu_asic_funcs si_asic_funcs = .get_config_memsize = &si_get_config_memsize, .flush_hdp = &si_flush_hdp, .invalidate_hdp = &si_invalidate_hdp, - .need_full_reset = &si_need_full_reset, .get_pcie_usage = &si_get_pcie_usage, .need_reset_on_init = &si_need_reset_on_init, .get_pcie_replay_count = &si_get_pcie_replay_count, diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 87b398dd0769..ed3fd58b78d0 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -721,12 +721,6 @@ void soc15_set_virt_ops(struct amdgpu_device *adev) soc15_reg_base_init(adev); } -static bool soc15_need_full_reset(struct amdgpu_device *adev) -{ - /* change this when we implement soft reset */ - return true; -} - static void soc15_get_pcie_usage(struct amdgpu_device *adev, uint64_t *count0, uint64_t *count1) { @@ -878,7 +872,6 @@ static const struct amdgpu_asic_funcs soc15_asic_funcs = .set_uvd_clocks = &soc15_set_uvd_clocks, .set_vce_clocks = &soc15_set_vce_clocks, .get_config_memsize = &soc15_get_config_memsize, - .need_full_reset = &soc15_need_full_reset, .init_doorbell_index = &vega10_doorbell_index_init, .get_pcie_usage = &soc15_get_pcie_usage, .need_reset_on_init = &soc15_need_reset_on_init, @@ -899,7 +892,6 @@ static const struct amdgpu_asic_funcs vega20_asic_funcs = .set_uvd_clocks = &soc15_set_uvd_clocks, .set_vce_clocks = &soc15_set_vce_clocks, .get_config_memsize = &soc15_get_config_memsize, - .need_full_reset = &soc15_need_full_reset, .init_doorbell_index = &vega20_doorbell_index_init, .get_pcie_usage = &vega20_get_pcie_usage, .need_reset_on_init = &soc15_need_reset_on_init, @@ -920,7 +912,6 @@ static const struct amdgpu_asic_funcs aqua_vanjaram_asic_funcs = .set_uvd_clocks = &soc15_set_uvd_clocks, .set_vce_clocks = &soc15_set_vce_clocks, .get_config_memsize = &soc15_get_config_memsize, - .need_full_reset = &soc15_need_full_reset, .init_doorbell_index = &aqua_vanjaram_doorbell_index_init, 
.need_reset_on_init = &soc15_need_reset_on_init, .get_pcie_replay_count = &amdgpu_nbio_get_pcie_replay_count, diff --git a/drivers/gpu/drm/amd/amdgpu/soc15_common.h b/drivers/gpu/drm/amd/amdgpu/soc15_common.h index a7b5a95ebebb..47e0329b6f3f 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15_common.h +++ b/drivers/gpu/drm/amd/amdgpu/soc15_common.h @@ -38,30 +38,30 @@ (adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + (reg)+(offset)) #define __WREG32_SOC15_RLC__(reg, value, flag, hwip, inst) \ - ((amdgpu_sriov_vf(adev) && adev->gfx.rlc.funcs && adev->gfx.rlc.rlcg_reg_access_supported) ? \ - amdgpu_sriov_wreg(adev, reg, value, flag, hwip, inst) : \ - WREG32(reg, value)) + adev->gfx.rlc.reg_funcs->wreg32(adev, reg, value, flag, hwip, inst) #define __RREG32_SOC15_RLC__(reg, flag, hwip, inst) \ - ((amdgpu_sriov_vf(adev) && adev->gfx.rlc.funcs && adev->gfx.rlc.rlcg_reg_access_supported) ? \ - amdgpu_sriov_rreg(adev, reg, flag, hwip, inst) : \ - RREG32(reg)) - -#define WREG32_FIELD15(ip, idx, reg, field, val) \ - __WREG32_SOC15_RLC__(adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg, \ - (__RREG32_SOC15_RLC__( \ - adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg, \ - 0, ip##_HWIP, idx) & \ - ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field), \ - 0, ip##_HWIP, idx) - -#define WREG32_FIELD15_PREREG(ip, idx, reg_name, field, val) \ - __WREG32_SOC15_RLC__(adev->reg_offset[ip##_HWIP][idx][reg##reg_name##_BASE_IDX] + reg##reg_name, \ - (__RREG32_SOC15_RLC__( \ - adev->reg_offset[ip##_HWIP][idx][reg##reg_name##_BASE_IDX] + reg##reg_name, \ - 0, ip##_HWIP, idx) & \ - ~REG_FIELD_MASK(reg_name, field)) | (val) << REG_FIELD_SHIFT(reg_name, field), \ - 0, ip##_HWIP, idx) + adev->gfx.rlc.reg_funcs->rreg32(adev, reg, flag, hwip, inst) + +#define WREG32_FIELD15(ip, idx, reg_name, field, val) \ +do { \ + u32 reg__ = adev->reg_offset[ip##_HWIP][idx][mm##reg_name##_BASE_IDX] + mm##reg_name; \ + u32 val__ = __RREG32_SOC15_RLC__(reg__, 0, ip##_HWIP, 
idx); \ +\ + val__ &= ~REG_FIELD_MASK(reg_name, field); \ + val__ |= (val) << REG_FIELD_SHIFT(reg_name, field); \ + __WREG32_SOC15_RLC__(reg__, val__, 0, ip##_HWIP, idx); \ +} while (0) + +#define WREG32_FIELD15_PREREG(ip, idx, reg_name, field, val) \ +do { \ + u32 reg__ = adev->reg_offset[ip##_HWIP][idx][reg##reg_name##_BASE_IDX] + reg##reg_name; \ + u32 val__ = __RREG32_SOC15_RLC__(reg__, 0, ip##_HWIP, idx); \ +\ + val__ &= ~REG_FIELD_MASK(reg_name, field); \ + val__ |= (val) << REG_FIELD_SHIFT(reg_name, field); \ + __WREG32_SOC15_RLC__(reg__, val__, 0, ip##_HWIP, idx); \ +} while (0) #define RREG32_SOC15(ip, inst, reg) \ __RREG32_SOC15_RLC__(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg, \ @@ -181,12 +181,15 @@ WREG32_RLC_EX(prefix, target_reg, value, inst); \ } while (0) -#define WREG32_FIELD15_RLC(ip, idx, reg, field, val) \ - __WREG32_SOC15_RLC__((adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg), \ - (__RREG32_SOC15_RLC__(adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg, \ - AMDGPU_REGS_RLC, ip##_HWIP, idx) & \ - ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field), \ - AMDGPU_REGS_RLC, ip##_HWIP, idx) +#define WREG32_FIELD15_RLC(ip, idx, reg_name, field, val) \ +do { \ + u32 reg__ = adev->reg_offset[ip##_HWIP][idx][mm##reg_name##_BASE_IDX] + mm##reg_name; \ + u32 val__ = __RREG32_SOC15_RLC__(reg__, AMDGPU_REGS_RLC, ip##_HWIP, idx); \ +\ + val__ &= ~REG_FIELD_MASK(reg_name, field); \ + val__ |= (val) << REG_FIELD_SHIFT(reg_name, field); \ + __WREG32_SOC15_RLC__(reg__, val__, AMDGPU_REGS_RLC, ip##_HWIP, idx); \ +} while (0) #define WREG32_SOC15_OFFSET_RLC(ip, inst, reg, offset, value) \ __WREG32_SOC15_RLC__((adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg) + offset, value, AMDGPU_REGS_RLC, ip##_HWIP, inst) @@ -207,10 +210,4 @@ amdgpu_reg_get_smn_base64(adev, ip##_HWIP, inst), \ value) -#define RREG64_MCA(smn_base, mca_base, idx) \ - RREG64_PCIE_EXT(smn_base + mca_base + (idx * 8)) - -#define 
WREG64_MCA(smn_base, mca_base, idx, val) \ - WREG64_PCIE_EXT(smn_base + mca_base + (idx * 8), val) - #endif diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 1677e88a4e36..09f28dbd60ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -461,17 +461,6 @@ const struct amdgpu_ip_block_version soc21_common_ip_block = { .funcs = &soc21_common_ip_funcs, }; -static bool soc21_need_full_reset(struct amdgpu_device *adev) -{ - switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { - case IP_VERSION(11, 0, 0): - case IP_VERSION(11, 0, 2): - case IP_VERSION(11, 0, 3): - default: - return true; - } -} - static bool soc21_need_reset_on_init(struct amdgpu_device *adev) { u32 sol_reg; @@ -550,7 +539,6 @@ static const struct amdgpu_asic_funcs soc21_asic_funcs = { .set_vce_clocks = &soc21_set_vce_clocks, .get_config_memsize = &soc21_get_config_memsize, .init_doorbell_index = &soc21_init_doorbell_index, - .need_full_reset = &soc21_need_full_reset, .need_reset_on_init = &soc21_need_reset_on_init, .get_pcie_replay_count = &amdgpu_nbio_get_pcie_replay_count, .supports_baco = &amdgpu_dpm_is_baco_supported, diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c b/drivers/gpu/drm/amd/amdgpu/soc24.c index 9dce30d2bb8d..e5e3a460e486 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc24.c +++ b/drivers/gpu/drm/amd/amdgpu/soc24.c @@ -238,16 +238,6 @@ const struct amdgpu_ip_block_version soc24_common_ip_block = { .funcs = &soc24_common_ip_funcs, }; -static bool soc24_need_full_reset(struct amdgpu_device *adev) -{ - switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { - case IP_VERSION(12, 0, 0): - case IP_VERSION(12, 0, 1): - default: - return true; - } -} - static bool soc24_need_reset_on_init(struct amdgpu_device *adev) { u32 sol_reg; @@ -330,7 +320,6 @@ static const struct amdgpu_asic_funcs soc24_asic_funcs = { .get_xclk = &soc24_get_xclk, .get_config_memsize = &soc24_get_config_memsize, .init_doorbell_index = &soc24_init_doorbell_index, - 
.need_full_reset = &soc24_need_full_reset, .need_reset_on_init = &soc24_need_reset_on_init, .get_pcie_replay_count = &soc24_get_pcie_replay_count, .supports_baco = &amdgpu_dpm_is_baco_supported, diff --git a/drivers/gpu/drm/amd/amdgpu/soc_v1_0.c b/drivers/gpu/drm/amd/amdgpu/soc_v1_0.c index 5f05c8e68297..a9039fb1a77b 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/soc_v1_0.c @@ -223,15 +223,6 @@ static int soc_v1_0_read_register(struct amdgpu_device *adev, return -EINVAL; } -static bool soc_v1_0_need_full_reset(struct amdgpu_device *adev) -{ - switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { - case IP_VERSION(12, 1, 0): - default: - return true; - } -} - static bool soc_v1_0_need_reset_on_init(struct amdgpu_device *adev) { @@ -271,7 +262,6 @@ static const struct amdgpu_asic_funcs soc_v1_0_asic_funcs = { .read_register = &soc_v1_0_read_register, .get_config_memsize = &soc_v1_0_get_config_memsize, .get_xclk = &soc_v1_0_get_xclk, - .need_full_reset = &soc_v1_0_need_full_reset, .init_doorbell_index = &soc_v1_0_doorbell_index_init, .need_reset_on_init = &soc_v1_0_need_reset_on_init, .encode_ext_smn_addressing = &soc_v1_0_encode_ext_smn_addressing, @@ -600,8 +590,10 @@ static int soc_v1_0_get_xcp_res_info(struct amdgpu_xcp_mgr *xcp_mgr, xcp_cfg->num_res = ARRAY_SIZE(max_res); for (i = 0; i < xcp_cfg->num_res; i++) { - res_lt_xcp = max_res[i] < num_xcp; xcp_cfg->xcp_res[i].id = i; + if (!max_res[i]) + continue; + res_lt_xcp = max_res[i] < num_xcp; xcp_cfg->xcp_res[i].num_inst = res_lt_xcp ? 
1 : max_res[i] / num_xcp; xcp_cfg->xcp_res[i].num_inst = diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c index ee8038df17e3..a3e883f6f099 100644 --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c @@ -390,43 +390,6 @@ static int tonga_ih_wait_for_idle(struct amdgpu_ip_block *ip_block) return -ETIMEDOUT; } -static bool tonga_ih_check_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - u32 srbm_soft_reset = 0; - u32 tmp = RREG32(mmSRBM_STATUS); - - if (tmp & SRBM_STATUS__IH_BUSY_MASK) - srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, - SOFT_RESET_IH, 1); - - if (srbm_soft_reset) { - adev->irq.srbm_soft_reset = srbm_soft_reset; - return true; - } else { - adev->irq.srbm_soft_reset = 0; - return false; - } -} - -static int tonga_ih_pre_soft_reset(struct amdgpu_ip_block *ip_block) -{ - if (!ip_block->adev->irq.srbm_soft_reset) - return 0; - - return tonga_ih_hw_fini(ip_block); -} - -static int tonga_ih_post_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - if (!adev->irq.srbm_soft_reset) - return 0; - - return tonga_ih_hw_init(ip_block); -} - static int tonga_ih_soft_reset(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; @@ -481,10 +444,7 @@ static const struct amd_ip_funcs tonga_ih_ip_funcs = { .resume = tonga_ih_resume, .is_idle = tonga_ih_is_idle, .wait_for_idle = tonga_ih_wait_for_idle, - .check_soft_reset = tonga_ih_check_soft_reset, - .pre_soft_reset = tonga_ih_pre_soft_reset, .soft_reset = tonga_ih_soft_reset, - .post_soft_reset = tonga_ih_post_soft_reset, .set_clockgating_state = tonga_ih_set_clockgating_state, .set_powergating_state = tonga_ih_set_powergating_state, }; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 14092150336a..67bdf7303e6b 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -28,48 +28,6 @@ #include "umc/umc_12_0_0_sh_mask.h" #include "mp/mp_13_0_6_sh_mask.h" -#define MAX_ECC_NUM_PER_RETIREMENT 32 -#define DELAYED_TIME_FOR_GPU_RESET 1000 //ms - -static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev, - uint32_t node_inst, - uint32_t umc_inst, - uint32_t ch_inst) -{ - uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst; - uint64_t cross_node_offset = (node_inst == 0) ? 0 : UMC_V12_0_CROSS_NODE_OFFSET; - - umc_inst = index / 4; - ch_inst = index % 4; - - return adev->umc.channel_offs * ch_inst + UMC_V12_0_INST_DIST * umc_inst + - UMC_V12_0_NODE_DIST * node_inst + cross_node_offset; -} - -static int umc_v12_0_reset_error_count_per_channel(struct amdgpu_device *adev, - uint32_t node_inst, uint32_t umc_inst, - uint32_t ch_inst, void *data) -{ - uint64_t odecc_err_cnt_addr; - uint64_t umc_reg_offset = - get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); - - odecc_err_cnt_addr = - SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt); - - /* clear error count */ - WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4, - UMC_V12_0_CE_CNT_INIT); - - return 0; -} - -static void umc_v12_0_reset_error_count(struct amdgpu_device *adev) -{ - amdgpu_umc_loop_channels(adev, - umc_v12_0_reset_error_count_per_channel, NULL); -} - bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status) { dev_dbg(adev->dev, @@ -115,65 +73,6 @@ bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_ !(umc_v12_0_is_uncorrectable_error(adev, mc_umc_status))))); } -static void umc_v12_0_query_error_count_per_type(struct amdgpu_device *adev, - uint64_t umc_reg_offset, - unsigned long *error_count, - check_error_type_func error_type_func) -{ - uint64_t mc_umc_status; - uint64_t mc_umc_status_addr; - - mc_umc_status_addr = - SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); - - /* Check 
MCUMC_STATUS */ - mc_umc_status = - RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4); - - if (error_type_func(adev, mc_umc_status)) - *error_count += 1; -} - -static int umc_v12_0_query_error_count(struct amdgpu_device *adev, - uint32_t node_inst, uint32_t umc_inst, - uint32_t ch_inst, void *data) -{ - struct ras_err_data *err_data = (struct ras_err_data *)data; - unsigned long ue_count = 0, ce_count = 0, de_count = 0; - - /* NOTE: node_inst is converted by adev->umc.active_mask and the range is [0-3], - * which can be used as die ID directly */ - struct amdgpu_smuio_mcm_config_info mcm_info = { - .socket_id = adev->smuio.funcs->get_socket_id(adev), - .die_id = node_inst, - }; - - uint64_t umc_reg_offset = - get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); - - umc_v12_0_query_error_count_per_type(adev, umc_reg_offset, - &ce_count, umc_v12_0_is_correctable_error); - umc_v12_0_query_error_count_per_type(adev, umc_reg_offset, - &ue_count, umc_v12_0_is_uncorrectable_error); - umc_v12_0_query_error_count_per_type(adev, umc_reg_offset, - &de_count, umc_v12_0_is_deferred_error); - - amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count); - amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count); - amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, de_count); - - return 0; -} - -static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev, - void *ras_error_status) -{ - amdgpu_umc_loop_channels(adev, - umc_v12_0_query_error_count, ras_error_status); - - umc_v12_0_reset_error_count(adev); -} - static void umc_v12_0_get_retire_flip_bits(struct amdgpu_device *adev) { enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE; @@ -279,190 +178,6 @@ static void umc_v12_0_get_retire_flip_bits(struct amdgpu_device *adev) adev->umc.retire_unit = 0x1 << flip_bits->bit_num; } -static int umc_v12_0_convert_error_address(struct amdgpu_device *adev, - struct ras_err_data *err_data, - struct ta_ras_query_address_input 
*addr_in, - struct ta_ras_query_address_output *addr_out, - bool dump_addr) -{ - uint32_t row = 0, row_lower = 0, row_high = 0; - uint32_t col = 0, col_lower = 0, bank = 0; - uint32_t channel_index = 0, umc_inst = 0; - uint32_t i, bit_num, retire_unit, *flip_bits; - uint64_t soc_pa, column, err_addr; - struct ta_ras_query_address_output addr_out_tmp; - struct ta_ras_query_address_output *paddr_out; - int ret = 0; - - if (!addr_out) - paddr_out = &addr_out_tmp; - else - paddr_out = addr_out; - - err_addr = bank = 0; - if (addr_in) { - err_addr = addr_in->ma.err_addr; - addr_in->addr_type = TA_RAS_MCA_TO_PA; - ret = psp_ras_query_address(&adev->psp, addr_in, paddr_out); - if (ret) { - dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx", - err_addr); - - goto out; - } - - bank = paddr_out->pa.bank; - /* no need to care about umc inst if addr_in is NULL */ - umc_inst = addr_in->ma.umc_inst; - } - - flip_bits = adev->umc.flip_bits.flip_bits_in_pa; - bit_num = adev->umc.flip_bits.bit_num; - retire_unit = adev->umc.retire_unit; - - soc_pa = paddr_out->pa.pa; - channel_index = paddr_out->pa.channel_idx; - /* clear loop bits in soc physical address */ - for (i = 0; i < bit_num; i++) - soc_pa &= ~BIT_ULL(flip_bits[i]); - - paddr_out->pa.pa = soc_pa; - /* get column bit 0 and 1 in mca address */ - col_lower = (err_addr >> 1) & 0x3ULL; - /* extra row bit will be handled later */ - row_lower = (err_addr >> UMC_V12_0_MA_R0_BIT) & 0x1fffULL; - row_lower &= ~BIT_ULL(adev->umc.flip_bits.flip_row_bit); - - if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 5, 0)) { - row_high = (soc_pa >> adev->umc.flip_bits.r13_in_pa) & 0x3ULL; - /* it's 2.25GB in each channel, from MCA address to PA - * [R14 R13] is converted if the two bits value are 0x3, - * get them from PA instead of MCA address. 
- */ - row_lower |= (row_high << 13); - } - - if (!err_data && !dump_addr) - goto out; - - /* loop for all possibilities of retired bits */ - for (column = 0; column < retire_unit; column++) { - soc_pa = paddr_out->pa.pa; - for (i = 0; i < bit_num; i++) - soc_pa |= (((column >> i) & 0x1ULL) << flip_bits[i]); - - col = ((column & 0x7) << 2) | col_lower; - /* handle extra row bit */ - if (bit_num == RETIRE_FLIP_BITS_NUM) - row = ((column >> 3) << adev->umc.flip_bits.flip_row_bit) | - row_lower; - - if (dump_addr) - dev_info(adev->dev, - "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", - soc_pa, row, col, bank, channel_index); - - if (err_data) - amdgpu_umc_fill_error_record(err_data, err_addr, - soc_pa, channel_index, umc_inst); - } - -out: - return ret; -} - -static int umc_v12_0_query_error_address(struct amdgpu_device *adev, - uint32_t node_inst, uint32_t umc_inst, - uint32_t ch_inst, void *data) -{ - struct ras_err_data *err_data = (struct ras_err_data *)data; - struct ta_ras_query_address_input addr_in; - uint64_t mc_umc_status_addr; - uint64_t mc_umc_status, err_addr; - uint64_t mc_umc_addrt0; - uint64_t umc_reg_offset = - get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); - - mc_umc_status_addr = - SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); - - mc_umc_status = RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4); - - if (mc_umc_status == 0) - return 0; - - if (!err_data->err_addr) { - /* clear umc status */ - WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); - - return 0; - } - - /* calculate error address if ue error is detected */ - if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) || - umc_v12_0_is_deferred_error(adev, mc_umc_status)) { - mc_umc_addrt0 = - SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0); - - err_addr = RREG64_PCIE_EXT((mc_umc_addrt0 + umc_reg_offset) * 4); - - err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); - - if 
(!adev->aid_mask && - adev->smuio.funcs && - adev->smuio.funcs->get_socket_id) - addr_in.ma.socket_id = adev->smuio.funcs->get_socket_id(adev); - else - addr_in.ma.socket_id = 0; - - addr_in.ma.err_addr = err_addr; - addr_in.ma.ch_inst = ch_inst; - addr_in.ma.umc_inst = umc_inst; - addr_in.ma.node_inst = node_inst; - - umc_v12_0_convert_error_address(adev, err_data, &addr_in, NULL, true); - } - - /* clear umc status */ - WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); - - return 0; -} - -static void umc_v12_0_query_ras_error_address(struct amdgpu_device *adev, - void *ras_error_status) -{ - amdgpu_umc_loop_channels(adev, - umc_v12_0_query_error_address, ras_error_status); -} - -static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev, - uint32_t node_inst, uint32_t umc_inst, - uint32_t ch_inst, void *data) -{ - uint32_t odecc_cnt_sel; - uint64_t odecc_cnt_sel_addr, odecc_err_cnt_addr; - uint64_t umc_reg_offset = - get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); - - odecc_cnt_sel_addr = - SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccCntSel); - odecc_err_cnt_addr = - SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt); - - odecc_cnt_sel = RREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4); - - /* set ce error interrupt type to APIC based interrupt */ - odecc_cnt_sel = REG_SET_FIELD(odecc_cnt_sel, UMCCH0_OdEccCntSel, - OdEccErrInt, 0x1); - WREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4, odecc_cnt_sel); - - /* set error count to initial value */ - WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V12_0_CE_CNT_INIT); - - return 0; -} - static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, void *ras_error_status) { @@ -482,309 +197,11 @@ static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev, return false; } -static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev) -{ - amdgpu_umc_loop_channels(adev, - 
umc_v12_0_err_cnt_init_per_channel, NULL); -} - -static bool umc_v12_0_query_ras_poison_mode(struct amdgpu_device *adev) -{ - /* - * Force return true, because regUMCCH0_EccCtrl - * is not accessible from host side - */ - return true; -} - -const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = { - .query_ras_error_count = umc_v12_0_query_ras_error_count, - .query_ras_error_address = umc_v12_0_query_ras_error_address, -}; - -static int umc_v12_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - struct amdgpu_device *adev = handle->adev; - struct aca_bank_info info; - enum aca_error_type err_type; - u64 status, count; - u32 ext_error_code; - int ret; - - status = bank->regs[ACA_REG_IDX_STATUS]; - if (umc_v12_0_is_deferred_error(adev, status)) - err_type = ACA_ERROR_TYPE_DEFERRED; - else if (umc_v12_0_is_uncorrectable_error(adev, status)) - err_type = ACA_ERROR_TYPE_UE; - else if (umc_v12_0_is_correctable_error(adev, status)) - err_type = ACA_ERROR_TYPE_CE; - else - return 0; - bank->aca_err_type = err_type; - - ret = aca_bank_info_decode(bank, &info); - if (ret) - return ret; - - amdgpu_umc_update_ecc_status(adev, - bank->regs[ACA_REG_IDX_STATUS], - bank->regs[ACA_REG_IDX_IPID], - bank->regs[ACA_REG_IDX_ADDR]); - - ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status); - if (umc_v12_0_is_deferred_error(adev, status)) - count = ext_error_code == 0 ? - adev->umc.err_addr_cnt / adev->umc.retire_unit : 1ULL; - else - count = ext_error_code == 0 ? 
- ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]) : 1ULL; - - return aca_error_cache_log_bank_error(handle, &info, err_type, count); -} - -static const struct aca_bank_ops umc_v12_0_aca_bank_ops = { - .aca_bank_parser = umc_v12_0_aca_bank_parser, -}; - -const struct aca_info umc_v12_0_aca_info = { - .hwip = ACA_HWIP_TYPE_UMC, - .mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK | ACA_ERROR_DEFERRED_MASK, - .bank_ops = &umc_v12_0_aca_bank_ops, -}; - -static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) -{ - int ret; - - ret = amdgpu_umc_ras_late_init(adev, ras_block); - if (ret) - return ret; - - ret = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__UMC, - &umc_v12_0_aca_info, NULL); - if (ret) - return ret; - - return 0; -} - -static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, - uint64_t status, uint64_t ipid, uint64_t addr) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - uint16_t hwid, mcatype; - uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL]; - uint64_t err_addr, pa_addr = 0; - struct ras_ecc_err *ecc_err; - struct ta_ras_query_address_output addr_out; - uint32_t shift_bit = adev->umc.flip_bits.flip_bits_in_pa[2]; - int count, ret, i; - - hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); - mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); - - /* The IP block decode of consumption is SMU */ - if (hwid != MCA_UMC_HWID_V12_0 || mcatype != MCA_UMC_MCATYPE_V12_0) { - con->umc_ecc_log.consumption_q_count++; - return 0; - } - - if (!status) - return 0; - - if (!umc_v12_0_is_deferred_error(adev, status)) - return 0; - - err_addr = REG_GET_FIELD(addr, - MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); - - dev_dbg(adev->dev, - "UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, err_addr:0x%llx\n", - ipid, - MCA_IPID_2_SOCKET_ID(ipid), - MCA_IPID_2_DIE_ID(ipid), - MCA_IPID_2_UMC_INST(ipid), - MCA_IPID_2_UMC_CH(ipid), - err_addr); - - ret = amdgpu_umc_mca_to_addr(adev, - err_addr, 
MCA_IPID_2_UMC_CH(ipid), - MCA_IPID_2_UMC_INST(ipid), MCA_IPID_2_DIE_ID(ipid), - MCA_IPID_2_SOCKET_ID(ipid), &addr_out, true); - if (ret) - return ret; - - ecc_err = kzalloc_obj(*ecc_err); - if (!ecc_err) - return -ENOMEM; - - pa_addr = addr_out.pa.pa; - ecc_err->status = status; - ecc_err->ipid = ipid; - ecc_err->addr = addr; - ecc_err->pa_pfn = pa_addr >> AMDGPU_GPU_PAGE_SHIFT; - ecc_err->channel_idx = addr_out.pa.channel_idx; - - /* If converted pa_pfn is 0, use pa C4 pfn. */ - if (!ecc_err->pa_pfn) - ecc_err->pa_pfn = BIT_ULL(shift_bit) >> AMDGPU_GPU_PAGE_SHIFT; - - ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err); - if (ret) { - if (ret == -EEXIST) - con->umc_ecc_log.de_queried_count++; - else - dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret); - - kfree(ecc_err); - return ret; - } - - con->umc_ecc_log.de_queried_count++; - - memset(page_pfn, 0, sizeof(page_pfn)); - count = amdgpu_umc_lookup_bad_pages_in_a_row(adev, - pa_addr, - page_pfn, ARRAY_SIZE(page_pfn)); - if (count <= 0) { - dev_warn(adev->dev, "Fail to convert error address! count:%d\n", count); - return 0; - } - - /* Reserve memory */ - for (i = 0; i < count; i++) - amdgpu_ras_reserve_page(adev, page_pfn[i]); - - /* The problem case is as follows: - * 1. GPU A triggers a gpu ras reset, and GPU A drives - * GPU B to also perform a gpu ras reset. - * 2. After gpu B ras reset started, gpu B queried a DE - * data. Since the DE data was queried in the ras reset - * thread instead of the page retirement thread, bad - * page retirement work would not be triggered. Then - * even if all gpu resets are completed, the bad pages - * will be cached in RAM until GPU B's bad page retirement - * work is triggered again and then saved to eeprom. - * Trigger delayed work to save the bad pages to eeprom in time - * after gpu ras reset is completed. 
- */ - if (amdgpu_ras_in_recovery(adev)) - schedule_delayed_work(&con->page_retirement_dwork, - msecs_to_jiffies(DELAYED_TIME_FOR_GPU_RESET)); - - return 0; -} - -static int umc_v12_0_fill_error_record(struct amdgpu_device *adev, - struct ras_ecc_err *ecc_err, void *ras_error_status) -{ - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL]; - int ret, i, count; - - if (!err_data || !ecc_err) - return -EINVAL; - - memset(page_pfn, 0, sizeof(page_pfn)); - count = amdgpu_umc_lookup_bad_pages_in_a_row(adev, - ecc_err->pa_pfn << AMDGPU_GPU_PAGE_SHIFT, - page_pfn, ARRAY_SIZE(page_pfn)); - - for (i = 0; i < count; i++) { - ret = amdgpu_umc_fill_error_record(err_data, - ecc_err->addr, - page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT, - ecc_err->channel_idx, - MCA_IPID_2_UMC_INST(ecc_err->ipid)); - if (ret) - break; - } - - err_data->de_count++; - - return ret; -} - -static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev, - void *ras_error_status) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT]; - struct radix_tree_root *ecc_tree; - int new_detected, ret, i; - - ecc_tree = &con->umc_ecc_log.de_page_tree; - - mutex_lock(&con->umc_ecc_log.lock); - new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries, - 0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG); - for (i = 0; i < new_detected; i++) { - if (!entries[i]) - continue; - - ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status); - if (ret) { - dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret); - break; - } - radix_tree_tag_clear(ecc_tree, - entries[i]->pa_pfn, UMC_ECC_NEW_DETECTED_TAG); - } - mutex_unlock(&con->umc_ecc_log.lock); -} - -static uint32_t umc_v12_0_get_die_id(struct amdgpu_device *adev, - uint64_t mca_addr, uint64_t retired_page) -{ - uint32_t die = 0; - - /* we only calculate die id for nps1 mode right now */ - 
die += ((((retired_page >> 12) & 0x1ULL)^ - ((retired_page >> 20) & 0x1ULL) ^ - ((retired_page >> 27) & 0x1ULL) ^ - ((retired_page >> 34) & 0x1ULL) ^ - ((retired_page >> 41) & 0x1ULL)) << 0); - - /* the original PA_C4 and PA_R13 may be cleared in retired_page, so - * get them from mca_addr. - */ - die += ((((retired_page >> 13) & 0x1ULL) ^ - ((mca_addr >> 5) & 0x1ULL) ^ - ((retired_page >> 28) & 0x1ULL) ^ - ((mca_addr >> 23) & 0x1ULL) ^ - ((retired_page >> 42) & 0x1ULL)) << 1); - die &= 3; - - return die; -} - -static void umc_v12_0_mca_ipid_parse(struct amdgpu_device *adev, uint64_t ipid, - uint32_t *did, uint32_t *ch, uint32_t *umc_inst, uint32_t *sid) -{ - if (did) - *did = MCA_IPID_2_DIE_ID(ipid); - if (ch) - *ch = MCA_IPID_2_UMC_CH(ipid); - if (umc_inst) - *umc_inst = MCA_IPID_2_UMC_INST(ipid); - if (sid) - *sid = MCA_IPID_2_SOCKET_ID(ipid); -} - struct amdgpu_umc_ras umc_v12_0_ras = { .ras_block = { - .hw_ops = &umc_v12_0_ras_hw_ops, - .ras_late_init = umc_v12_0_ras_late_init, + .hw_ops = NULL, }, - .err_cnt_init = umc_v12_0_err_cnt_init, - .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode, - .ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr, .check_ecc_err_status = umc_v12_0_check_ecc_err_status, - .update_ecc_status = umc_v12_0_update_ecc_status, - .convert_ras_err_addr = umc_v12_0_convert_error_address, - .get_die_id_from_pa = umc_v12_0_get_die_id, .get_retire_flip_bits = umc_v12_0_get_retire_flip_bits, - .mca_ipid_parse = umc_v12_0_mca_ipid_parse, }; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h index 63b7e7254526..9d9e84d8d3bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h @@ -26,31 +26,6 @@ #include "soc15_common.h" #include "amdgpu.h" -#define UMC_V12_0_NODE_DIST 0x40000000 -#define UMC_V12_0_INST_DIST 0x40000 - -/* UMC register per channel offset */ -#define UMC_V12_0_PER_CHANNEL_OFFSET 0x400 - -/* UMC cross node offset */ 
-#define UMC_V12_0_CROSS_NODE_OFFSET 0x100000000 - -/* OdEccErrCnt max value */ -#define UMC_V12_0_CE_CNT_MAX 0xffff -/* umc ce interrupt threshold */ -#define UMC_V12_0_CE_INT_THRESHOLD 0xffff -/* umc ce count initial value */ -#define UMC_V12_0_CE_CNT_INIT (UMC_V12_0_CE_CNT_MAX - UMC_V12_0_CE_INT_THRESHOLD) - -/* number of umc channel instance with memory map register access */ -#define UMC_V12_0_CHANNEL_INSTANCE_NUM 8 -/* number of umc instance with memory map register access */ -#define UMC_V12_0_UMC_INSTANCE_NUM 4 - -/* Total channel instances for all available umc nodes */ -#define UMC_V12_0_TOTAL_CHANNEL_NUM(adev) \ - (UMC_V12_0_CHANNEL_INSTANCE_NUM * (adev)->gmc.num_umc) - /* one piece of normalized address is mapped to 8 pieces of physical address */ #define UMC_V12_0_NA_MAP_PA_NUM 8 /* R13 bit shift should be considered, double the number */ @@ -75,9 +50,6 @@ /* row bits in MCA address */ #define UMC_V12_0_MA_R0_BIT 10 -#define MCA_UMC_HWID_V12_0 0x96 -#define MCA_UMC_MCATYPE_V12_0 0x0 - #define MCA_IPID_LO_2_UMC_CH(_ipid_lo) (((((_ipid_lo) >> 20) & 0x1) * 4) + \ (((_ipid_lo) >> 12) & 0xF)) #define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7) diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c index ecd7ead7a60b..8bb9592b0981 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c @@ -1165,36 +1165,6 @@ static int uvd_v6_0_wait_for_idle(struct amdgpu_ip_block *ip_block) } #define AMDGPU_UVD_STATUS_BUSY_MASK 0xfd -static bool uvd_v6_0_check_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - u32 srbm_soft_reset = 0; - u32 tmp = RREG32(mmSRBM_STATUS); - - if (REG_GET_FIELD(tmp, SRBM_STATUS, UVD_RQ_PENDING) || - REG_GET_FIELD(tmp, SRBM_STATUS, UVD_BUSY) || - (RREG32(mmUVD_STATUS) & AMDGPU_UVD_STATUS_BUSY_MASK)) - srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_UVD, 1); - - if (srbm_soft_reset) 
{ - adev->uvd.inst->srbm_soft_reset = srbm_soft_reset; - return true; - } else { - adev->uvd.inst->srbm_soft_reset = 0; - return false; - } -} - -static int uvd_v6_0_pre_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - if (!adev->uvd.inst->srbm_soft_reset) - return 0; - - uvd_v6_0_stop(adev); - return 0; -} static int uvd_v6_0_soft_reset(struct amdgpu_ip_block *ip_block) { @@ -1227,18 +1197,6 @@ static int uvd_v6_0_soft_reset(struct amdgpu_ip_block *ip_block) return 0; } -static int uvd_v6_0_post_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - if (!adev->uvd.inst->srbm_soft_reset) - return 0; - - mdelay(5); - - return uvd_v6_0_start(adev); -} - static int uvd_v6_0_set_interrupt_state(struct amdgpu_device *adev, struct amdgpu_irq_src *source, unsigned type, @@ -1538,10 +1496,7 @@ static const struct amd_ip_funcs uvd_v6_0_ip_funcs = { .resume = uvd_v6_0_resume, .is_idle = uvd_v6_0_is_idle, .wait_for_idle = uvd_v6_0_wait_for_idle, - .check_soft_reset = uvd_v6_0_check_soft_reset, - .pre_soft_reset = uvd_v6_0_pre_soft_reset, .soft_reset = uvd_v6_0_soft_reset, - .post_soft_reset = uvd_v6_0_post_soft_reset, .set_clockgating_state = uvd_v6_0_set_clockgating_state, .set_powergating_state = uvd_v6_0_set_powergating_state, .get_clockgating_state = uvd_v6_0_get_clockgating_state, diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c index c69f7d82060f..9f4e88440c0a 100644 --- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c @@ -631,47 +631,6 @@ static int vce_v3_0_wait_for_idle(struct amdgpu_ip_block *ip_block) #define AMDGPU_VCE_STATUS_BUSY_MASK (VCE_STATUS_VCPU_REPORT_AUTO_BUSY_MASK | \ VCE_STATUS_VCPU_REPORT_RB0_BUSY_MASK) -static bool vce_v3_0_check_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - u32 srbm_soft_reset = 0; - - /* According to VCE team , we 
should use VCE_STATUS instead - * SRBM_STATUS.VCE_BUSY bit for busy status checking. - * GRBM_GFX_INDEX.INSTANCE_INDEX is used to specify which VCE - * instance's registers are accessed - * (0 for 1st instance, 10 for 2nd instance). - * - *VCE_STATUS - *|UENC|ACPI|AUTO ACTIVE|RB1 |RB0 |RB2 | |FW_LOADED|JOB | - *|----+----+-----------+----+----+----+----------+---------+----| - *|bit8|bit7| bit6 |bit5|bit4|bit3| bit2 | bit1 |bit0| - * - * VCE team suggest use bit 3--bit 6 for busy status check - */ - mutex_lock(&adev->grbm_idx_mutex); - WREG32(mmGRBM_GFX_INDEX, GET_VCE_INSTANCE(0)); - if (RREG32(mmVCE_STATUS) & AMDGPU_VCE_STATUS_BUSY_MASK) { - srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_VCE0, 1); - srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_VCE1, 1); - } - WREG32(mmGRBM_GFX_INDEX, GET_VCE_INSTANCE(1)); - if (RREG32(mmVCE_STATUS) & AMDGPU_VCE_STATUS_BUSY_MASK) { - srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_VCE0, 1); - srbm_soft_reset = REG_SET_FIELD(srbm_soft_reset, SRBM_SOFT_RESET, SOFT_RESET_VCE1, 1); - } - WREG32(mmGRBM_GFX_INDEX, GET_VCE_INSTANCE(0)); - mutex_unlock(&adev->grbm_idx_mutex); - - if (srbm_soft_reset) { - adev->vce.srbm_soft_reset = srbm_soft_reset; - return true; - } else { - adev->vce.srbm_soft_reset = 0; - return false; - } -} - static int vce_v3_0_soft_reset(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; @@ -703,31 +662,6 @@ static int vce_v3_0_soft_reset(struct amdgpu_ip_block *ip_block) return 0; } -static int vce_v3_0_pre_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - if (!adev->vce.srbm_soft_reset) - return 0; - - mdelay(5); - - return vce_v3_0_suspend(ip_block); -} - - -static int vce_v3_0_post_soft_reset(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - if (!adev->vce.srbm_soft_reset) - return 0; - - mdelay(5); - - 
return vce_v3_0_resume(ip_block); -} - static int vce_v3_0_set_interrupt_state(struct amdgpu_device *adev, struct amdgpu_irq_src *source, unsigned type, @@ -909,10 +843,7 @@ static const struct amd_ip_funcs vce_v3_0_ip_funcs = { .resume = vce_v3_0_resume, .is_idle = vce_v3_0_is_idle, .wait_for_idle = vce_v3_0_wait_for_idle, - .check_soft_reset = vce_v3_0_check_soft_reset, - .pre_soft_reset = vce_v3_0_pre_soft_reset, .soft_reset = vce_v3_0_soft_reset, - .post_soft_reset = vce_v3_0_post_soft_reset, .set_clockgating_state = vce_v3_0_set_clockgating_state, .set_powergating_state = vce_v3_0_set_powergating_state, .get_clockgating_state = vce_v3_0_get_clockgating_state, diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c index 8b8184fe6764..0d8a3cea63ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c @@ -159,9 +159,8 @@ static void vcn_v2_5_ring_begin_use(struct amdgpu_ring *ring) struct amdgpu_device *adev = ring->adev; struct amdgpu_vcn_inst *v = &adev->vcn.inst[ring->me]; - atomic_inc(&adev->vcn.inst[0].total_submission_cnt); - - cancel_delayed_work_sync(&adev->vcn.inst[0].idle_work); + if (!atomic_fetch_inc(&adev->vcn.inst[0].total_submission_cnt)) + cancel_delayed_work_sync(&adev->vcn.inst[0].idle_work); /* We can safely return early here because we've cancelled the * the delayed work so there is no one else to set it to false @@ -207,10 +206,9 @@ static void vcn_v2_5_ring_end_use(struct amdgpu_ring *ring) !adev->vcn.inst[ring->me].using_unified_queue) atomic_dec(&adev->vcn.inst[ring->me].dpg_enc_submission_cnt); - atomic_dec(&adev->vcn.inst[0].total_submission_cnt); - - schedule_delayed_work(&adev->vcn.inst[0].idle_work, - VCN_IDLE_TIMEOUT); + if (atomic_dec_and_test(&adev->vcn.inst[0].total_submission_cnt)) + schedule_delayed_work(&adev->vcn.inst[0].idle_work, + VCN_IDLE_TIMEOUT); } /** diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c 
index 894780669f9c..0cce78b205a8 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -1995,7 +1995,7 @@ static int vcn_v4_0_ring_reset(struct amdgpu_ring *ring, return amdgpu_ring_reset_helper_end(ring, timedout_fence); } -static struct amdgpu_ring_funcs vcn_v4_0_unified_ring_vm_funcs = { +static const struct amdgpu_ring_funcs vcn_v4_0_unified_ring_vm_funcs = { .type = AMDGPU_RING_TYPE_VCN_ENC, .align_mask = 0x3f, .nop = VCN_ENC_CMD_NO_OP, @@ -2028,6 +2028,40 @@ static struct amdgpu_ring_funcs vcn_v4_0_unified_ring_vm_funcs = { .reset = vcn_v4_0_ring_reset, }; +static const struct amdgpu_ring_funcs vcn_v4_0_unified_ring_vm_funcs_secure = { + .type = AMDGPU_RING_TYPE_VCN_ENC, + .align_mask = 0x3f, + .nop = VCN_ENC_CMD_NO_OP, + .secure_submission_supported = true, + .no_user_fence = true, + .extra_bytes = sizeof(struct amdgpu_vcn_rb_metadata), + .get_rptr = vcn_v4_0_unified_ring_get_rptr, + .get_wptr = vcn_v4_0_unified_ring_get_wptr, + .set_wptr = vcn_v4_0_unified_ring_set_wptr, + .patch_cs_in_place = vcn_v4_0_ring_patch_cs_in_place, + .emit_frame_size = + SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 + + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 4 + + 4 + /* vcn_v2_0_enc_ring_emit_vm_flush */ + 5 + 5 + /* vcn_v2_0_enc_ring_emit_fence x2 vm fence */ + 1, /* vcn_v2_0_enc_ring_insert_end */ + .emit_ib_size = 5, /* vcn_v2_0_enc_ring_emit_ib */ + .emit_ib = vcn_v2_0_enc_ring_emit_ib, + .emit_fence = vcn_v2_0_enc_ring_emit_fence, + .emit_vm_flush = vcn_v2_0_enc_ring_emit_vm_flush, + .test_ring = amdgpu_vcn_enc_ring_test_ring, + .test_ib = amdgpu_vcn_unified_ring_test_ib, + .insert_nop = amdgpu_ring_insert_nop, + .insert_end = vcn_v2_0_enc_ring_insert_end, + .pad_ib = amdgpu_ring_generic_pad_ib, + .begin_use = amdgpu_vcn_ring_begin_use, + .end_use = amdgpu_vcn_ring_end_use, + .emit_wreg = vcn_v2_0_enc_ring_emit_wreg, + .emit_reg_wait = vcn_v2_0_enc_ring_emit_reg_wait, + .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, + 
.reset = vcn_v4_0_ring_reset, +}; + /** * vcn_v4_0_set_unified_ring_funcs - set unified ring functions * @@ -2044,10 +2078,11 @@ static void vcn_v4_0_set_unified_ring_funcs(struct amdgpu_device *adev) continue; if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 2)) - vcn_v4_0_unified_ring_vm_funcs.secure_submission_supported = true; - - adev->vcn.inst[i].ring_enc[0].funcs = - (const struct amdgpu_ring_funcs *)&vcn_v4_0_unified_ring_vm_funcs; + adev->vcn.inst[i].ring_enc[0].funcs = + &vcn_v4_0_unified_ring_vm_funcs_secure; + else + adev->vcn.inst[i].ring_enc[0].funcs = + &vcn_v4_0_unified_ring_vm_funcs; adev->vcn.inst[i].ring_enc[0].me = i; } } diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index 7f001c32e911..179b892fb410 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -115,6 +115,19 @@ static int vcn_v4_0_3_early_init(struct amdgpu_ip_block *ip_block) struct amdgpu_device *adev = ip_block->adev; int i, r; + switch (amdgpu_user_queue) { + case -1: + case 0: + default: + adev->vcn.disable_kq = false; + adev->vcn.disable_uq = true; + break; + case 2: + adev->vcn.disable_kq = true; + adev->vcn.disable_uq = true; + break; + } + for (i = 0; i < adev->vcn.num_vcn_inst; ++i) /* re-use enc ring as unified ring */ adev->vcn.inst[i].num_enc_rings = 1; @@ -217,6 +230,10 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block) ring = &adev->vcn.inst[i].ring_enc[0]; ring->use_doorbell = true; + if (adev->vcn.disable_kq) { + ring->no_scheduler = true; + ring->no_user_submission = true; + } if (!amdgpu_sriov_vf(adev)) ring->doorbell_index = @@ -2146,71 +2163,6 @@ static const struct amdgpu_ras_block_hw_ops vcn_v4_0_3_ras_hw_ops = { .query_poison_status = vcn_v4_0_3_query_poison_status, }; -static int vcn_v4_0_3_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - struct aca_bank_info info; - u64 misc0; - 
int ret; - - ret = aca_bank_info_decode(bank, &info); - if (ret) - return ret; - - misc0 = bank->regs[ACA_REG_IDX_MISC0]; - switch (type) { - case ACA_SMU_TYPE_UE: - bank->aca_err_type = ACA_ERROR_TYPE_UE; - ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, - 1ULL); - break; - case ACA_SMU_TYPE_CE: - bank->aca_err_type = ACA_ERROR_TYPE_CE; - ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, - ACA_REG__MISC0__ERRCNT(misc0)); - break; - default: - return -EINVAL; - } - - return ret; -} - -/* reference to smu driver if header file */ -static int vcn_v4_0_3_err_codes[] = { - 14, 15, /* VCN */ -}; - -static bool vcn_v4_0_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - u32 instlo; - - instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]); - instlo &= GENMASK(31, 1); - - if (instlo != mmSMNAID_AID0_MCA_SMU) - return false; - - if (aca_bank_check_error_codes(handle->adev, bank, - vcn_v4_0_3_err_codes, - ARRAY_SIZE(vcn_v4_0_3_err_codes))) - return false; - - return true; -} - -static const struct aca_bank_ops vcn_v4_0_3_aca_bank_ops = { - .aca_bank_parser = vcn_v4_0_3_aca_bank_parser, - .aca_bank_is_valid = vcn_v4_0_3_aca_bank_is_valid, -}; - -static const struct aca_info vcn_v4_0_3_aca_info = { - .hwip = ACA_HWIP_TYPE_SMU, - .mask = ACA_ERROR_UE_MASK, - .bank_ops = &vcn_v4_0_3_aca_bank_ops, -}; - static int vcn_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { int r; @@ -2226,11 +2178,6 @@ static int vcn_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_commo goto late_fini; } - r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__VCN, - &vcn_v4_0_3_aca_info, NULL); - if (r) - goto late_fini; - return 0; late_fini: diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c index 1571cc5a148c..c8879a6e5297 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c +++ 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c @@ -1479,10 +1479,11 @@ static int vcn_v4_0_5_ring_reset(struct amdgpu_ring *ring, return amdgpu_ring_reset_helper_end(ring, timedout_fence); } -static struct amdgpu_ring_funcs vcn_v4_0_5_unified_ring_vm_funcs = { +static const struct amdgpu_ring_funcs vcn_v4_0_5_unified_ring_vm_funcs = { .type = AMDGPU_RING_TYPE_VCN_ENC, .align_mask = 0x3f, .nop = VCN_ENC_CMD_NO_OP, + .secure_submission_supported = true, .no_user_fence = true, .get_rptr = vcn_v4_0_5_unified_ring_get_rptr, .get_wptr = vcn_v4_0_5_unified_ring_get_wptr, @@ -1525,9 +1526,6 @@ static void vcn_v4_0_5_set_unified_ring_funcs(struct amdgpu_device *adev) if (adev->vcn.harvest_config & (1 << i)) continue; - if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 5)) - vcn_v4_0_5_unified_ring_vm_funcs.secure_submission_supported = true; - adev->vcn.inst[i].ring_enc[0].funcs = &vcn_v4_0_5_unified_ring_vm_funcs; adev->vcn.inst[i].ring_enc[0].me = i; } diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c index d3db0494341e..1a07c3bf4425 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c @@ -94,6 +94,19 @@ static int vcn_v5_0_1_early_init(struct amdgpu_ip_block *ip_block) struct amdgpu_device *adev = ip_block->adev; int i, r; + switch (amdgpu_user_queue) { + case -1: + case 0: + default: + adev->vcn.disable_kq = false; + adev->vcn.disable_uq = true; + break; + case 2: + adev->vcn.disable_kq = true; + adev->vcn.disable_uq = true; + break; + } + for (i = 0; i < adev->vcn.num_vcn_inst; ++i) /* re-use enc ring as unified ring */ adev->vcn.inst[i].num_enc_rings = 1; @@ -188,6 +201,10 @@ static int vcn_v5_0_1_sw_init(struct amdgpu_ip_block *ip_block) ring = &adev->vcn.inst[i].ring_enc[0]; ring->use_doorbell = true; + if (adev->vcn.disable_kq) { + ring->no_scheduler = true; + ring->no_user_submission = true; + } if (!amdgpu_sriov_vf(adev)) ring->doorbell_index = 
(adev->doorbell_index.vcn.vcn_ring0_1 << 1) + @@ -1657,10 +1674,7 @@ static const struct amd_ip_funcs vcn_v5_0_1_ip_funcs = { .resume = vcn_v5_0_1_resume, .is_idle = vcn_v5_0_1_is_idle, .wait_for_idle = vcn_v5_0_1_wait_for_idle, - .check_soft_reset = NULL, - .pre_soft_reset = NULL, .soft_reset = NULL, - .post_soft_reset = NULL, .set_clockgating_state = vcn_v5_0_1_set_clockgating_state, .set_powergating_state = vcn_set_powergating_state, .dump_ip_state = amdgpu_vcn_dump_ip_state, @@ -1713,71 +1727,6 @@ static const struct amdgpu_ras_block_hw_ops vcn_v5_0_1_ras_hw_ops = { .query_poison_status = vcn_v5_0_1_query_poison_status, }; -static int vcn_v5_0_1_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - struct aca_bank_info info; - u64 misc0; - int ret; - - ret = aca_bank_info_decode(bank, &info); - if (ret) - return ret; - - misc0 = bank->regs[ACA_REG_IDX_MISC0]; - switch (type) { - case ACA_SMU_TYPE_UE: - bank->aca_err_type = ACA_ERROR_TYPE_UE; - ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, - 1ULL); - break; - case ACA_SMU_TYPE_CE: - bank->aca_err_type = ACA_ERROR_TYPE_CE; - ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, - ACA_REG__MISC0__ERRCNT(misc0)); - break; - default: - return -EINVAL; - } - - return ret; -} - -/* reference to smu driver if header file */ -static int vcn_v5_0_1_err_codes[] = { - 14, 15, 47, /* VCN [D|V|S] */ -}; - -static bool vcn_v5_0_1_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, - enum aca_smu_type type, void *data) -{ - u32 instlo; - - instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]); - instlo &= GENMASK(31, 1); - - if (instlo != mmSMNAID_AID0_MCA_SMU) - return false; - - if (aca_bank_check_error_codes(handle->adev, bank, - vcn_v5_0_1_err_codes, - ARRAY_SIZE(vcn_v5_0_1_err_codes))) - return false; - - return true; -} - -static const struct aca_bank_ops vcn_v5_0_1_aca_bank_ops = { - 
.aca_bank_parser = vcn_v5_0_1_aca_bank_parser, - .aca_bank_is_valid = vcn_v5_0_1_aca_bank_is_valid, -}; - -static const struct aca_info vcn_v5_0_1_aca_info = { - .hwip = ACA_HWIP_TYPE_SMU, - .mask = ACA_ERROR_UE_MASK, - .bank_ops = &vcn_v5_0_1_aca_bank_ops, -}; - static int vcn_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { int r; @@ -1786,11 +1735,6 @@ static int vcn_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_commo if (r) return r; - r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__VCN, - &vcn_v5_0_1_aca_info, NULL); - if (r) - goto late_fini; - if (amdgpu_ras_is_supported(adev, ras_block->block) && adev->vcn.inst->ras_poison_irq.funcs) { r = amdgpu_irq_get(adev, &adev->vcn.inst->ras_poison_irq, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_2.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_2.c index bbc172db91a1..b9f6ae75ea72 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_2.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_2.c @@ -1203,10 +1203,7 @@ static const struct amd_ip_funcs vcn_v5_0_2_ip_funcs = { .resume = vcn_v5_0_2_resume, .is_idle = vcn_v5_0_2_is_idle, .wait_for_idle = vcn_v5_0_2_wait_for_idle, - .check_soft_reset = NULL, - .pre_soft_reset = NULL, .soft_reset = NULL, - .post_soft_reset = NULL, .set_clockgating_state = vcn_v5_0_2_set_clockgating_state, .set_powergating_state = vcn_set_powergating_state, }; diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c index a256320b92f3..5715b6b596af 100644 --- a/drivers/gpu/drm/amd/amdgpu/vi.c +++ b/drivers/gpu/drm/amd/amdgpu/vi.c @@ -1328,27 +1328,6 @@ static void vi_invalidate_hdp(struct amdgpu_device *adev, } } -static bool vi_need_full_reset(struct amdgpu_device *adev) -{ - switch (adev->asic_type) { - case CHIP_CARRIZO: - case CHIP_STONEY: - /* CZ has hang issues with full reset at the moment */ - return false; - case CHIP_FIJI: - case CHIP_TONGA: - /* XXX: soft reset should work on fiji and tonga */ - return true; - case 
CHIP_POLARIS10: - case CHIP_POLARIS11: - case CHIP_POLARIS12: - case CHIP_TOPAZ: - default: - /* change this when we support soft reset */ - return true; - } -} - static void vi_get_pcie_usage(struct amdgpu_device *adev, uint64_t *count0, uint64_t *count1) { @@ -1437,7 +1416,6 @@ static const struct amdgpu_asic_funcs vi_asic_funcs = .get_config_memsize = &vi_get_config_memsize, .flush_hdp = &vi_flush_hdp, .invalidate_hdp = &vi_invalidate_hdp, - .need_full_reset = &vi_need_full_reset, .init_doorbell_index = &legacy_doorbell_index_init, .get_pcie_usage = &vi_get_pcie_usage, .need_reset_on_init = &vi_need_reset_on_init, |
