Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 293
1 file changed, 220 insertions(+), 73 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c17505fba988..a97946878024 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -137,14 +137,14 @@ static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 
 /**
- * amdgpu_device_is_px - Is the device is a dGPU with HG/PX power control
+ * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
  *
  * @dev: drm_device pointer
  *
  * Returns true if the device is a dGPU with HG/PX power control,
  * otherwise return false.
  */
-bool amdgpu_device_is_px(struct drm_device *dev)
+bool amdgpu_device_supports_boco(struct drm_device *dev)
 {
 	struct amdgpu_device *adev = dev->dev_private;
 
@@ -154,6 +154,21 @@ bool amdgpu_device_is_px(struct drm_device *dev)
 }
 
 /**
+ * amdgpu_device_supports_baco - Does the device support BACO
+ *
+ * @dev: drm_device pointer
+ *
+ * Returns true if the device supports BACO,
+ * otherwise return false.
+ */
+bool amdgpu_device_supports_baco(struct drm_device *dev)
+{
+	struct amdgpu_device *adev = dev->dev_private;
+
+	return amdgpu_asic_supports_baco(adev);
+}
+
+/**
  * VRAM access helper functions.
  *
  * amdgpu_device_vram_access - read/write a buffer in vram
@@ -1072,8 +1087,9 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
 {
 	struct drm_device *dev = pci_get_drvdata(pdev);
+	int r;
 
-	if (amdgpu_device_is_px(dev) && state == VGA_SWITCHEROO_OFF)
+	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
 		return;
 
 	if (state == VGA_SWITCHEROO_ON) {
@@ -1081,7 +1097,12 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
 		/* don't suspend or resume card normally */
 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
 
-		amdgpu_device_resume(dev, true, true);
+		pci_set_power_state(dev->pdev, PCI_D0);
+		pci_restore_state(dev->pdev);
+		r = pci_enable_device(dev->pdev);
+		if (r)
+			DRM_WARN("pci_enable_device failed (%d)\n", r);
+		amdgpu_device_resume(dev, true);
 
 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
 		drm_kms_helper_poll_enable(dev);
@@ -1089,7 +1110,11 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
 		pr_info("amdgpu: switched off\n");
 		drm_kms_helper_poll_disable(dev);
 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
-		amdgpu_device_suspend(dev, true, true);
+		amdgpu_device_suspend(dev, true);
+		pci_save_state(dev->pdev);
+		/* Shut down the device */
+		pci_disable_device(dev->pdev);
+		pci_set_power_state(dev->pdev, PCI_D3cold);
 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
 	}
 }
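The two hunks above move the PCI power sequencing out of amdgpu_device_suspend()/resume() and into the switcheroo callback itself, and the OFF path now targets D3cold rather than D3hot so the platform can cut power to the dGPU completely. A condensed sketch of the ordering this establishes, using the real pci_*/vga_switcheroo APIs but with the callback name and surrounding driver state invented for illustration:

static void example_set_gpu_state(struct pci_dev *pdev,
				  enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	if (state == VGA_SWITCHEROO_ON) {
		/* wake the device first, then let the driver resume */
		pci_set_power_state(pdev, PCI_D0);
		pci_restore_state(pdev);
		if (pci_enable_device(pdev))
			DRM_WARN("pci_enable_device failed\n");
		amdgpu_device_resume(dev, true);
	} else {
		/* quiesce the driver first, then cut power */
		amdgpu_device_suspend(dev, true);
		pci_save_state(pdev);
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
	}
}

The real callback also flips dev->switch_power_state around each transition and toggles KMS polling, as the hunk shows; this sketch keeps only the power sequencing.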
@@ -1527,7 +1552,6 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
 	}
 
 parse_soc_bounding_box:
-#ifdef CONFIG_DRM_AMD_DC_DCN2_0
 		/*
 		 * soc bounding box info is not integrated in discovery table,
 		 * we always need to parse it from gpu info firmware.
@@ -1538,7 +1562,6 @@ parse_soc_bounding_box:
 				le32_to_cpu(hdr->header.ucode_array_offset_bytes));
 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
 		}
-#endif
 		break;
 	}
 	default:
@@ -1854,6 +1877,9 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 		}
 	}
 
+	if (amdgpu_sriov_vf(adev))
+		amdgpu_virt_init_data_exchange(adev);
+
 	r = amdgpu_ib_pool_init(adev);
 	if (r) {
 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
@@ -1895,11 +1921,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 	amdgpu_amdkfd_device_init(adev);
 
 init_failed:
-	if (amdgpu_sriov_vf(adev)) {
-		if (!r)
-			amdgpu_virt_init_data_exchange(adev);
+	if (amdgpu_sriov_vf(adev))
 		amdgpu_virt_release_full_gpu(adev, true);
-	}
 
 	return r;
 }
@@ -1938,6 +1961,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
  *
  * @adev: amdgpu_device pointer
+ * @state: clockgating state (gate or ungate)
  *
  * The list of all the hardware IPs that make up the asic is walked and the
  * set_clockgating_state callbacks are run.
@@ -1962,6 +1986,7 @@ static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
+		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
 			/* enable clockgating to save power */
 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
@@ -1992,6 +2017,7 @@ static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
+		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
 			/* enable powergating to save power */
 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
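The two hunks above add AMD_IP_BLOCK_TYPE_JPEG to the list of blocks skipped by the blanket clockgating/powergating walks, alongside UVD, VCE and VCN, since the multimedia blocks handle gating from their own idle work. A sketch of the skip pattern (the helper name is invented, and the real walk also reverses iteration order for ungating, which is omitted here):

/* Blocks that gate themselves and must not be touched by the
 * blanket CG/PG walk. JPEG is the newly excluded type. */
static bool example_block_gates_itself(enum amd_ip_block_type type)
{
	return type == AMD_IP_BLOCK_TYPE_UVD ||
	       type == AMD_IP_BLOCK_TYPE_VCE ||
	       type == AMD_IP_BLOCK_TYPE_VCN ||
	       type == AMD_IP_BLOCK_TYPE_JPEG;
}

static int example_set_cg_state(struct amdgpu_device *adev,
				enum amd_clockgating_state state)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		struct amdgpu_ip_block *block = &adev->ip_blocks[i];

		if (!block->status.late_initialized)
			continue;
		if (example_block_gates_itself(block->version->type) ||
		    !block->version->funcs->set_clockgating_state)
			continue;
		r = block->version->funcs->set_clockgating_state(
				(void *)adev, state);
		if (r)
			return r;
	}
	return 0;
}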
@@ -2600,20 +2626,19 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
 	case CHIP_VEGA10:
 	case CHIP_VEGA12:
 	case CHIP_VEGA20:
-#if defined(CONFIG_DRM_AMD_DC_DCN1_0)
+#if defined(CONFIG_DRM_AMD_DC_DCN)
 	case CHIP_RAVEN:
-#endif
-#if defined(CONFIG_DRM_AMD_DC_DCN2_0)
 	case CHIP_NAVI10:
 	case CHIP_NAVI14:
 	case CHIP_NAVI12:
-#endif
-#if defined(CONFIG_DRM_AMD_DC_DCN2_1)
 	case CHIP_RENOIR:
 #endif
 		return amdgpu_dc != 0;
 #endif
 	default:
+		if (amdgpu_dc > 0)
+			DRM_INFO("Display Core has been requested via kernel parameter "
+				 "but isn't supported by ASIC, ignoring\n");
 		return false;
 	}
 }
@@ -2639,7 +2664,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 	struct amdgpu_device *adev =
 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
 
-	adev->asic_reset_res = amdgpu_asic_reset(adev);
+	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
+		adev->asic_reset_res = (adev->in_baco == false) ?
+				amdgpu_device_baco_enter(adev->ddev) :
+				amdgpu_device_baco_exit(adev->ddev);
+	else
+		adev->asic_reset_res = amdgpu_asic_reset(adev);
+
 	if (adev->asic_reset_res)
 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 			 adev->asic_reset_res, adev->ddev->unique);
@@ -2731,7 +2762,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 		       uint32_t flags)
 {
 	int r, i;
-	bool runtime = false;
+	bool boco = false;
 	u32 max_MBps;
 
 	adev->shutdown = false;
@@ -2794,8 +2825,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->virt.vf_errors.lock);
 	hash_init(adev->mn_hash);
 	mutex_init(&adev->lock_reset);
-	mutex_init(&adev->notifier_lock);
-	mutex_init(&adev->virt.dpm_mutex);
 	mutex_init(&adev->psp.mutex);
 
 	r = amdgpu_device_check_arguments(adev);
@@ -2902,12 +2931,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	 * ignore it */
 	vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
 
-	if (amdgpu_device_is_px(ddev))
-		runtime = true;
-	if (!pci_is_thunderbolt_attached(adev->pdev))
+	if (amdgpu_device_supports_boco(ddev))
+		boco = true;
+	if (amdgpu_has_atpx() &&
+	    (amdgpu_is_atpx_hybrid() ||
+	     amdgpu_has_atpx_dgpu_power_cntl()) &&
+	    !pci_is_thunderbolt_attached(adev->pdev))
 		vga_switcheroo_register_client(adev->pdev,
-					       &amdgpu_switcheroo_ops, runtime);
-	if (runtime)
+					       &amdgpu_switcheroo_ops, boco);
+	if (boco)
 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
 
 	if (amdgpu_emu_mode == 1) {
@@ -2994,8 +3026,6 @@ fence_driver_init:
 		}
 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
-		if (amdgpu_virt_request_full_gpu(adev, false))
-			amdgpu_virt_release_full_gpu(adev, false);
 		goto failed;
 	}
 
@@ -3013,16 +3043,19 @@ fence_driver_init:
 
 	amdgpu_fbdev_init(adev);
 
-	if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev))
-		amdgpu_pm_virt_sysfs_init(adev);
-
 	r = amdgpu_pm_sysfs_init(adev);
-	if (r)
+	if (r) {
+		adev->pm_sysfs_en = false;
 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
+	} else
+		adev->pm_sysfs_en = true;
 
 	r = amdgpu_ucode_sysfs_init(adev);
-	if (r)
+	if (r) {
+		adev->ucode_sysfs_en = false;
 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
+	} else
+		adev->ucode_sysfs_en = true;
 
 	r = amdgpu_debugfs_gem_init(adev);
 	if (r)
@@ -3091,7 +3124,7 @@ fence_driver_init:
 failed:
 	amdgpu_vf_error_trans_all(adev);
-	if (runtime)
+	if (boco)
 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
 
 	return r;
@@ -3122,7 +3155,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
 		drm_atomic_helper_shutdown(adev->ddev);
 	}
 	amdgpu_fence_driver_fini(adev);
-	amdgpu_pm_sysfs_fini(adev);
+	if (adev->pm_sysfs_en)
+		amdgpu_pm_sysfs_fini(adev);
 	amdgpu_fbdev_fini(adev);
 	r = amdgpu_device_ip_fini(adev);
 	if (adev->firmware.gpu_info_fw) {
@@ -3139,9 +3173,12 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
 	kfree(adev->bios);
 	adev->bios = NULL;
-	if (!pci_is_thunderbolt_attached(adev->pdev))
+	if (amdgpu_has_atpx() &&
+	    (amdgpu_is_atpx_hybrid() ||
+	     amdgpu_has_atpx_dgpu_power_cntl()) &&
+	    !pci_is_thunderbolt_attached(adev->pdev))
 		vga_switcheroo_unregister_client(adev->pdev);
-	if (adev->flags & AMD_IS_PX)
+	if (amdgpu_device_supports_boco(adev->ddev))
 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
 	vga_client_register(adev->pdev, NULL, NULL, NULL);
 	if (adev->rio_mem)
@@ -3150,12 +3187,11 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
 	iounmap(adev->rmmio);
 	adev->rmmio = NULL;
 	amdgpu_device_doorbell_fini(adev);
-	if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev))
-		amdgpu_pm_virt_sysfs_fini(adev);
 	amdgpu_debugfs_regs_cleanup(adev);
 	device_remove_file(adev->dev, &dev_attr_pcie_replay_count);
-	amdgpu_ucode_sysfs_fini(adev);
+	if (adev->ucode_sysfs_en)
+		amdgpu_ucode_sysfs_fini(adev);
 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
 		amdgpu_pmu_fini(adev);
 	amdgpu_debugfs_preempt_cleanup(adev);
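amdgpu_device_init() above now records whether each sysfs group actually registered (pm_sysfs_en, ucode_sysfs_en), and amdgpu_device_fini() only unwinds the ones that did, so a failed registration no longer leads to an unbalanced teardown. The pairing, condensed (function names invented for illustration):

static void example_sysfs_init(struct amdgpu_device *adev)
{
	/* remember what registered; init failures are logged, not fatal */
	adev->pm_sysfs_en = !amdgpu_pm_sysfs_init(adev);
	if (!adev->pm_sysfs_en)
		DRM_ERROR("registering pm sysfs failed\n");

	adev->ucode_sysfs_en = !amdgpu_ucode_sysfs_init(adev);
	if (!adev->ucode_sysfs_en)
		DRM_ERROR("creating firmware sysfs failed\n");
}

static void example_sysfs_fini(struct amdgpu_device *adev)
{
	/* only tear down what init actually created */
	if (adev->pm_sysfs_en)
		amdgpu_pm_sysfs_fini(adev);
	if (adev->ucode_sysfs_en)
		amdgpu_ucode_sysfs_fini(adev);
}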
@@ -3178,7 +3214,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
  * Returns 0 for success or an error on failure.
  * Called at driver suspend.
  */
-int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
+int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 {
 	struct amdgpu_device *adev;
 	struct drm_crtc *crtc;
@@ -3261,13 +3297,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
 	 */
 	amdgpu_bo_evict_vram(adev);
 
-	if (suspend) {
-		pci_save_state(dev->pdev);
-		/* Shut down the device */
-		pci_disable_device(dev->pdev);
-		pci_set_power_state(dev->pdev, PCI_D3hot);
-	}
-
 	return 0;
 }
 
@@ -3282,7 +3311,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
  * Returns 0 for success or an error on failure.
  * Called at driver resume.
  */
-int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon)
+int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 {
 	struct drm_connector *connector;
 	struct drm_connector_list_iter iter;
@@ -3293,14 +3322,6 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon)
 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
 		return 0;
 
-	if (resume) {
-		pci_set_power_state(dev->pdev, PCI_D0);
-		pci_restore_state(dev->pdev);
-		r = pci_enable_device(dev->pdev);
-		if (r)
-			return r;
-	}
-
 	/* post card */
 	if (amdgpu_device_need_post(adev)) {
 		r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
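With the bool suspend/resume parameters dropped, amdgpu_device_suspend()/resume() no longer touch PCI power state at all; every caller now owns that transition and can pick the target state itself (D3hot for system sleep, D3cold in the switcheroo path earlier in this diff). A sketch of what a system-sleep caller is now expected to do; the callback name is illustrative and the real callers live in amdgpu_drv.c, outside this diff:

static int example_pmops_suspend(struct device *dev)
{
	struct drm_device *drm_dev = dev_get_drvdata(dev);
	int r;

	/* driver-level quiesce only; no PCI state changes inside */
	r = amdgpu_device_suspend(drm_dev, true);
	if (r)
		return r;

	/* the caller performs the PCI transition it wants */
	pci_save_state(drm_dev->pdev);
	pci_disable_device(drm_dev->pdev);
	pci_set_power_state(drm_dev->pdev, PCI_D3hot);
	return 0;
}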
@@ -3646,6 +3667,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 	if (r)
 		goto error;
 
+	amdgpu_virt_init_data_exchange(adev);
 	/* we need to recover gart prior to running SMC/CP/SDMA resume */
 	amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);
 
@@ -3663,7 +3685,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 	amdgpu_amdkfd_post_reset(adev);
 
 error:
-	amdgpu_virt_init_data_exchange(adev);
 	amdgpu_virt_release_full_gpu(adev, true);
 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
 		amdgpu_inc_vram_lost(adev);
@@ -3769,13 +3790,18 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	return r;
 }
 
-static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
+				struct amdgpu_hive_info *hive,
 				struct list_head *device_list_handle,
 				bool *need_full_reset_arg)
 {
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
 	int r = 0;
+	int cpu = smp_processor_id();
+	bool use_baco =
+		(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
+		true : false;
 
 	/*
 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
@@ -3783,21 +3809,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 	 */
 	if (need_full_reset) {
 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-			/* For XGMI run all resets in parallel to speed up the process */
+			/*
+			 * For XGMI run all resets in parallel to speed up the
+			 * process by scheduling the highpri wq on different
+			 * cpus. For XGMI with baco reset, all nodes must enter
+			 * baco within close proximity before any of them exits.
+			 */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+				if (!queue_work_on(cpu, system_highpri_wq,
+						   &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
+				cpu = cpumask_next(cpu, cpu_online_mask);
 			} else
 				r = amdgpu_asic_reset(tmp_adev);
-
-			if (r) {
-				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
-					 r, tmp_adev->ddev->unique);
+			if (r)
 				break;
-			}
 		}
 
-		/* For XGMI wait for all PSP resets to complete before proceed */
+		/* For XGMI wait for all work to complete before proceeding */
 		if (!r) {
 			list_for_each_entry(tmp_adev, device_list_handle,
 					    gmc.xgmi.head) {
@@ -3806,11 +3835,57 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 					r = tmp_adev->asic_reset_res;
 					if (r)
 						break;
+					if (use_baco)
+						tmp_adev->in_baco = true;
+				}
+			}
+		}
+
+		/*
+		 * For XGMI with baco reset, the exit-baco phase is run by
+		 * scheduling xgmi_reset_work one more time. PSP reset and
+		 * sGPU skip this phase. Do not assume that PSP reset and
+		 * baco reset coexist within an XGMI hive.
+		 */
+
+		if (!r && use_baco) {
+			cpu = smp_processor_id();
+			list_for_each_entry(tmp_adev, device_list_handle,
+					    gmc.xgmi.head) {
+				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+					if (!queue_work_on(cpu,
+						system_highpri_wq,
+						&tmp_adev->xgmi_reset_work))
+						r = -EALREADY;
+					if (r)
+						break;
+					cpu = cpumask_next(cpu, cpu_online_mask);
+				}
+			}
+		}
+
+		if (!r && use_baco) {
+			list_for_each_entry(tmp_adev, device_list_handle,
+					    gmc.xgmi.head) {
+				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+					flush_work(&tmp_adev->xgmi_reset_work);
+					r = tmp_adev->asic_reset_res;
+					if (r)
+						break;
+					tmp_adev->in_baco = false;
 				}
 			}
 		}
+
+		if (r) {
+			DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+				 r, tmp_adev->ddev->unique);
+			goto end;
+		}
 	}
 
+	if (!r && amdgpu_ras_intr_triggered())
+		amdgpu_ras_intr_cleared();
+
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		if (need_full_reset) {
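For a BACO reset of an XGMI hive, amdgpu_do_asic_reset() above now runs the reset as two fan-out/fan-in phases: every node's xgmi_reset_work is queued on a distinct CPU (queue_work_on() plus cpumask_next()) so the nodes enter BACO close together, flush_work() acts as the barrier, in_baco is flipped, and the same work item is queued a second time to exit BACO (the work function itself toggles on in_baco, as the amdgpu_device_xgmi_reset_func hunk earlier shows). A stripped-down sketch of one such phase; like the hunk above, it takes the starting CPU from smp_processor_id() and does not handle cpumask wrap-around:

static int example_run_hive_phase(struct list_head *device_list)
{
	struct amdgpu_device *tmp_adev;
	int cpu = smp_processor_id();
	int r = 0;

	/* fan out: one work item per node, each on its own CPU */
	list_for_each_entry(tmp_adev, device_list, gmc.xgmi.head) {
		if (!queue_work_on(cpu, system_highpri_wq,
				   &tmp_adev->xgmi_reset_work))
			return -EALREADY;	/* work already pending */
		cpu = cpumask_next(cpu, cpu_online_mask);
	}

	/* fan in: barrier on every node before the next phase */
	list_for_each_entry(tmp_adev, device_list, gmc.xgmi.head) {
		flush_work(&tmp_adev->xgmi_reset_work);
		r = tmp_adev->asic_reset_res;
		if (r)
			break;
	}
	return r;
}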
@@ -3943,12 +4018,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	struct amdgpu_device *tmp_adev = NULL;
 	int i, r = 0;
 	bool in_ras_intr = amdgpu_ras_intr_triggered();
+	bool use_baco =
+		(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
+		true : false;
 
 	/*
 	 * Flush RAM to disk so that after reboot
 	 * the user can read log and see why the system rebooted.
 	 */
-	if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
+	if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) {
 
 		DRM_WARN("Emergency reboot.");
 
@@ -3959,7 +4037,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	need_full_reset = job_signaled = false;
 	INIT_LIST_HEAD(&device_list);
 
-	dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");
+	dev_info(adev->dev, "GPU %s begin!\n",
+		(in_ras_intr && !use_baco) ? "jobs stop":"reset");
 
 	cancel_delayed_work_sync(&adev->delayed_init_work);
 
@@ -4026,7 +4105,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		amdgpu_unregister_gpu_instance(tmp_adev);
 
 		/* disable ras on ALL IPs */
-		if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev))
+		if (!(in_ras_intr && !use_baco) &&
+		      amdgpu_device_ip_need_full_reset(tmp_adev))
 			amdgpu_ras_suspend(tmp_adev);
 
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
@@ -4037,13 +4117,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
 
-			if (in_ras_intr)
+			if (in_ras_intr && !use_baco)
 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
 		}
 	}
 
-	if (in_ras_intr)
+	if (in_ras_intr && !use_baco)
 		goto skip_sched_resume;
 
 	/*
@@ -4095,7 +4175,8 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 		if (r)
 			adev->asic_reset_res = r;
 	} else {
-		r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
+		r = amdgpu_do_asic_reset(adev, hive, device_list_handle,
+					 &need_full_reset);
 		if (r && r == -EAGAIN)
 			goto retry;
 	}
@@ -4136,7 +4217,7 @@ skip_hw_reset:
 skip_sched_resume:
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		/*unlock kfd: SRIOV would do it separately */
-		if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev))
+		if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
 			amdgpu_amdkfd_post_reset(tmp_adev);
 		amdgpu_device_unlock_adev(tmp_adev);
 	}
@@ -4285,3 +4366,69 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
 	}
 }
 
+int amdgpu_device_baco_enter(struct drm_device *dev)
+{
+	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	if (!amdgpu_device_supports_baco(adev->ddev))
+		return -ENOTSUPP;
+
+	if (ras && ras->supported)
+		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
+
+	if (is_support_sw_smu(adev)) {
+		struct smu_context *smu = &adev->smu;
+		int ret;
+
+		ret = smu_baco_enter(smu);
+		if (ret)
+			return ret;
+	} else {
+		void *pp_handle = adev->powerplay.pp_handle;
+		const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
+
+		if (!pp_funcs || !pp_funcs->get_asic_baco_state ||
+		    !pp_funcs->set_asic_baco_state)
+			return -ENOENT;
+
+		/* enter BACO state */
+		if (pp_funcs->set_asic_baco_state(pp_handle, 1))
+			return -EIO;
+	}
+
+	return 0;
+}
+
+int amdgpu_device_baco_exit(struct drm_device *dev)
+{
+	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	if (!amdgpu_device_supports_baco(adev->ddev))
+		return -ENOTSUPP;
+
+	if (is_support_sw_smu(adev)) {
+		struct smu_context *smu = &adev->smu;
+		int ret;
+
+		ret = smu_baco_exit(smu);
+		if (ret)
+			return ret;
+
+	} else {
+		void *pp_handle = adev->powerplay.pp_handle;
+		const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
+
+		if (!pp_funcs || !pp_funcs->get_asic_baco_state ||
+		    !pp_funcs->set_asic_baco_state)
+			return -ENOENT;
+
+		/* exit BACO state */
+		if (pp_funcs->set_asic_baco_state(pp_handle, 0))
+			return -EIO;
+	}
+
+	if (ras && ras->supported)
+		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
+
+	return 0;
+}
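The new amdgpu_device_baco_enter()/amdgpu_device_baco_exit() pair hides whether the part is driven through the swSMU stack (smu_baco_enter()/smu_baco_exit()) or the legacy powerplay set_asic_baco_state() hook, and gates doorbell interrupts around the transition when RAS is active. One plausible caller shape, sketched for a runtime-idle style path; the function name is invented and error unwinding is condensed:

static int example_runtime_off(struct drm_device *dev)
{
	int r;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	/* quiesce the driver, then power the chip off with the
	 * bus still active (BACO: Bus Active, Chip Off) */
	r = amdgpu_device_suspend(dev, false);
	if (r)
		return r;

	return amdgpu_device_baco_enter(dev);
}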