diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
48 files changed, 1078 insertions, 107 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 7645e498faa4..d8bc6da50016 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -299,6 +299,12 @@ extern int amdgpu_wbrf; #define AMDGPU_RESET_VCE (1 << 13) #define AMDGPU_RESET_VCE1 (1 << 14) +/* reset mask */ +#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, mode1/mode2/BACO/etc. */ +#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */ +#define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */ +#define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */ + /* max cursor sizes (in pixels) */ #define CIK_CURSOR_WIDTH 128 #define CIK_CURSOR_HEIGHT 128 @@ -1464,6 +1470,8 @@ struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev); struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, struct dma_fence *gang); bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev); +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring); +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset); /* atpx handler */ #if defined(CONFIG_VGA_SWITCHEROO) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 2ca127173135..9d6345146495 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -158,7 +158,7 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_ return -EINVAL; } - if (start + count >= max_count) + if (start + count > max_count) return -EINVAL; count = min_t(int, count, max_count); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 24343c312480..3afcd1e8aa54 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -834,6 +834,9 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off, if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; + if (!kiq_ring->sched.ready || adev->job_hang) + return 0; + ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL); if (!ring_funcs) return -ENOMEM; @@ -858,8 +861,14 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off, kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES, 0, 0); - if (kiq_ring->sched.ready && !adev->job_hang) - r = amdgpu_ring_test_helper(kiq_ring); + /* Submit unmap queue packet */ + amdgpu_ring_commit(kiq_ring); + /* + * Ring test will do a basic scratch register change check. Just run + * this to ensure that unmap queues that is submitted before got + * processed successfully before returning. + */ + r = amdgpu_ring_test_helper(kiq_ring); spin_unlock(&kiq->ring_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0450eab6ade7..0171d240fcb0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4236,7 +4236,10 @@ int amdgpu_device_init(struct amdgpu_device *adev, * for throttling interrupt) = 60 seconds. */ ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); + ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1); + ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); + ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE); /* Registers mapping */ /* TODO: block userspace mapping of io register */ @@ -5186,6 +5189,9 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) amdgpu_ras_resume(adev); + + amdgpu_virt_ras_telemetry_post_reset(adev); + return 0; } @@ -6200,6 +6206,9 @@ bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, bool p2p_access = !adev->gmc.xgmi.connected_to_cpu && !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); + if (!p2p_access) + dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", + pci_name(peer_adev->pdev)); bool is_large_bar = adev->gmc.visible_vram_size && adev->gmc.real_vram_size == adev->gmc.visible_vram_size; @@ -6715,3 +6724,47 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, } return ret; } + +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) +{ + ssize_t size = 0; + + if (!ring || !ring->adev) + return size; + + if (amdgpu_device_should_recover_gpu(ring->adev)) + size |= AMDGPU_RESET_TYPE_FULL; + + if (unlikely(!ring->adev->debug_disable_soft_recovery) && + !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) + size |= AMDGPU_RESET_TYPE_SOFT_RESET; + + return size; +} + +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) +{ + ssize_t size = 0; + + if (supported_reset == 0) { + size += sysfs_emit_at(buf, size, "unsupported"); + size += sysfs_emit_at(buf, size, "\n"); + return size; + + } + + if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) + size += sysfs_emit_at(buf, size, "soft "); + + if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) + size += sysfs_emit_at(buf, size, "queue "); + + if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) + size += sysfs_emit_at(buf, size, "pipe "); + + if (supported_reset & AMDGPU_RESET_TYPE_FULL) + size += sysfs_emit_at(buf, size, "full "); + + size += sysfs_emit_at(buf, size, "\n"); + return size; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 2f3f09dfb1fd..f57cc72c43cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -515,6 +515,9 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; + if (!kiq_ring->sched.ready || adev->job_hang || amdgpu_in_reset(adev)) + return 0; + spin_lock(&kiq->ring_lock); if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size * adev->gfx.num_compute_rings)) { @@ -528,20 +531,15 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) &adev->gfx.compute_ring[j], RESET_QUEUES, 0, 0); } - - /** - * This is workaround: only skip kiq_ring test - * during ras recovery in suspend stage for gfx9.4.3 + /* Submit unmap queue packet */ + amdgpu_ring_commit(kiq_ring); + /* + * Ring test will do a basic scratch register change check. Just run + * this to ensure that unmap queues that is submitted before got + * processed successfully before returning. */ - if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && - amdgpu_ras_in_recovery(adev)) { - spin_unlock(&kiq->ring_lock); - return 0; - } + r = amdgpu_ring_test_helper(kiq_ring); - if (kiq_ring->sched.ready && !adev->job_hang) - r = amdgpu_ring_test_helper(kiq_ring); spin_unlock(&kiq->ring_lock); return r; @@ -569,8 +567,11 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id) if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; - spin_lock(&kiq->ring_lock); + if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang) + return 0; + if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) { + spin_lock(&kiq->ring_lock); if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size * adev->gfx.num_gfx_rings)) { spin_unlock(&kiq->ring_lock); @@ -583,11 +584,17 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id) &adev->gfx.gfx_ring[j], PREEMPT_QUEUES, 0, 0); } - } + /* Submit unmap queue packet */ + amdgpu_ring_commit(kiq_ring); - if (adev->gfx.kiq[0].ring.sched.ready && !adev->job_hang) + /* + * Ring test will do a basic scratch register change check. + * Just run this to ensure that unmap queues that is submitted + * before got processed successfully before returning. + */ r = amdgpu_ring_test_helper(kiq_ring); - spin_unlock(&kiq->ring_lock); + spin_unlock(&kiq->ring_lock); + } return r; } @@ -692,7 +699,13 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id) kiq->pmf->kiq_map_queues(kiq_ring, &adev->gfx.compute_ring[j]); } - + /* Submit map queue packet */ + amdgpu_ring_commit(kiq_ring); + /* + * Ring test will do a basic scratch register change check. Just run + * this to ensure that map queues that is submitted before got + * processed successfully before returning. + */ r = amdgpu_ring_test_helper(kiq_ring); spin_unlock(&kiq->ring_lock); if (r) @@ -743,7 +756,13 @@ int amdgpu_gfx_enable_kgq(struct amdgpu_device *adev, int xcc_id) &adev->gfx.gfx_ring[j]); } } - + /* Submit map queue packet */ + amdgpu_ring_commit(kiq_ring); + /* + * Ring test will do a basic scratch register change check. Just run + * this to ensure that map queues that is submitted before got + * processed successfully before returning. + */ r = amdgpu_ring_test_helper(kiq_ring); spin_unlock(&kiq->ring_lock); if (r) @@ -885,6 +904,9 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r if (r) return r; + if (amdgpu_sriov_vf(adev)) + return r; + if (adev->gfx.cp_ecc_error_irq.funcs) { r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0); if (r) @@ -1576,9 +1598,11 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, if (adev->enforce_isolation[i] && !partition_values[i]) { /* Going from enabled to disabled */ amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(i)); + amdgpu_mes_set_enforce_isolation(adev, i, false); } else if (!adev->enforce_isolation[i] && partition_values[i]) { /* Going from disabled to enabled */ amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i)); + amdgpu_mes_set_enforce_isolation(adev, i, true); } adev->enforce_isolation[i] = partition_values[i]; } @@ -1588,6 +1612,32 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, return count; } +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(ddev); + + if (!adev) + return -ENODEV; + + return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset); +} + +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(ddev); + + if (!adev) + return -ENODEV; + + return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset); +} + static DEVICE_ATTR(run_cleaner_shader, 0200, NULL, amdgpu_gfx_set_run_cleaner_shader); @@ -1601,6 +1651,11 @@ static DEVICE_ATTR(current_compute_partition, 0644, static DEVICE_ATTR(available_compute_partition, 0444, amdgpu_gfx_get_available_compute_partition, NULL); +static DEVICE_ATTR(gfx_reset_mask, 0444, + amdgpu_gfx_get_gfx_reset_mask, NULL); + +static DEVICE_ATTR(compute_reset_mask, 0444, + amdgpu_gfx_get_compute_reset_mask, NULL); static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev) { @@ -1666,6 +1721,40 @@ static void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev) device_remove_file(adev->dev, &dev_attr_run_cleaner_shader); } +static int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev) +{ + int r = 0; + + if (!amdgpu_gpu_recovery) + return r; + + if (adev->gfx.num_gfx_rings) { + r = device_create_file(adev->dev, &dev_attr_gfx_reset_mask); + if (r) + return r; + } + + if (adev->gfx.num_compute_rings) { + r = device_create_file(adev->dev, &dev_attr_compute_reset_mask); + if (r) + return r; + } + + return r; +} + +static void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev) +{ + if (!amdgpu_gpu_recovery) + return; + + if (adev->gfx.num_gfx_rings) + device_remove_file(adev->dev, &dev_attr_gfx_reset_mask); + + if (adev->gfx.num_compute_rings) + device_remove_file(adev->dev, &dev_attr_compute_reset_mask); +} + int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) { int r; @@ -1680,6 +1769,10 @@ int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) if (r) dev_err(adev->dev, "failed to create isolation sysfs files"); + r = amdgpu_gfx_sysfs_reset_mask_init(adev); + if (r) + dev_err(adev->dev, "failed to create reset mask sysfs files"); + return r; } @@ -1687,6 +1780,7 @@ void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev) { amdgpu_gfx_sysfs_xcp_fini(adev); amdgpu_gfx_sysfs_isolation_shader_fini(adev); + amdgpu_gfx_sysfs_reset_mask_fini(adev); } int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index fd73e527f446..8b5bd63b5773 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -424,6 +424,8 @@ struct amdgpu_gfx { /* reset mask */ uint32_t grbm_soft_reset; uint32_t srbm_soft_reset; + uint32_t gfx_supported_reset; + uint32_t compute_supported_reset; /* gfx off */ bool gfx_off_state; /* true: enabled, false: disabled */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c index 95e2796919fc..04eb51674596 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c @@ -47,7 +47,7 @@ int amdgpu_jpeg_sw_init(struct amdgpu_device *adev) adev->jpeg.indirect_sram = true; for (i = 0; i < adev->jpeg.num_jpeg_inst; i++) { - if (adev->jpeg.harvest_config & (1 << i)) + if (adev->jpeg.harvest_config & (1U << i)) continue; if (adev->jpeg.indirect_sram) { @@ -73,7 +73,7 @@ int amdgpu_jpeg_sw_fini(struct amdgpu_device *adev) int i, j; for (i = 0; i < adev->jpeg.num_jpeg_inst; ++i) { - if (adev->jpeg.harvest_config & (1 << i)) + if (adev->jpeg.harvest_config & (1U << i)) continue; amdgpu_bo_free_kernel( @@ -110,7 +110,7 @@ static void amdgpu_jpeg_idle_work_handler(struct work_struct *work) unsigned int i, j; for (i = 0; i < adev->jpeg.num_jpeg_inst; ++i) { - if (adev->jpeg.harvest_config & (1 << i)) + if (adev->jpeg.harvest_config & (1U << i)) continue; for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) @@ -357,7 +357,7 @@ static int amdgpu_debugfs_jpeg_sched_mask_set(void *data, u64 val) if (!adev) return -ENODEV; - mask = (1 << (adev->jpeg.num_jpeg_inst * adev->jpeg.num_jpeg_rings)) - 1; + mask = (1ULL << (adev->jpeg.num_jpeg_inst * adev->jpeg.num_jpeg_rings)) - 1; if ((val & mask) == 0) return -EINVAL; @@ -388,7 +388,7 @@ static int amdgpu_debugfs_jpeg_sched_mask_get(void *data, u64 *val) for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) { ring = &adev->jpeg.inst[i].ring_dec[j]; if (ring->sched.ready) - mask |= 1 << ((i * adev->jpeg.num_jpeg_rings) + j); + mask |= 1ULL << ((i * adev->jpeg.num_jpeg_rings) + j); } } *val = mask; @@ -415,3 +415,38 @@ void amdgpu_debugfs_jpeg_sched_mask_init(struct amdgpu_device *adev) &amdgpu_debugfs_jpeg_sched_mask_fops); #endif } + +static ssize_t amdgpu_get_jpeg_reset_mask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(ddev); + + if (!adev) + return -ENODEV; + + return amdgpu_show_reset_mask(buf, adev->jpeg.supported_reset); +} + +static DEVICE_ATTR(jpeg_reset_mask, 0444, + amdgpu_get_jpeg_reset_mask, NULL); + +int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev) +{ + int r = 0; + + if (adev->jpeg.num_jpeg_inst) { + r = device_create_file(adev->dev, &dev_attr_jpeg_reset_mask); + if (r) + return r; + } + + return r; +} + +void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev) +{ + if (adev->jpeg.num_jpeg_inst) + device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h index 819dc7a0af99..3eb4a4653fce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h @@ -128,6 +128,7 @@ struct amdgpu_jpeg { uint16_t inst_mask; uint8_t num_inst_per_aid; bool indirect_sram; + uint32_t supported_reset; }; int amdgpu_jpeg_sw_init(struct amdgpu_device *adev); @@ -150,5 +151,7 @@ int amdgpu_jpeg_ras_sw_init(struct amdgpu_device *adev); int amdgpu_jpeg_psp_update_sram(struct amdgpu_device *adev, int inst_idx, enum AMDGPU_UCODE_ID ucode_id); void amdgpu_debugfs_jpeg_sched_mask_init(struct amdgpu_device *adev); +int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev); +void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev); #endif /*__AMDGPU_JPEG_H__*/ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index b10383f83d73..59ec20b07a6a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -104,7 +104,7 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev) return 0; r = amdgpu_bo_create_kernel(adev, adev->mes.event_log_size, PAGE_SIZE, - AMDGPU_GEM_DOMAIN_GTT, + AMDGPU_GEM_DOMAIN_VRAM, &adev->mes.event_log_gpu_obj, &adev->mes.event_log_gpu_addr, &adev->mes.event_log_cpu_addr); @@ -192,17 +192,6 @@ int amdgpu_mes_init(struct amdgpu_device *adev) (uint64_t *)&adev->wb.wb[adev->mes.query_status_fence_offs[i]]; } - r = amdgpu_device_wb_get(adev, &adev->mes.read_val_offs); - if (r) { - dev_err(adev->dev, - "(%d) read_val_offs alloc failed\n", r); - goto error; - } - adev->mes.read_val_gpu_addr = - adev->wb.gpu_addr + (adev->mes.read_val_offs * 4); - adev->mes.read_val_ptr = - (uint32_t *)&adev->wb.wb[adev->mes.read_val_offs]; - r = amdgpu_mes_doorbell_init(adev); if (r) goto error; @@ -223,8 +212,6 @@ error: amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs[i]); } - if (adev->mes.read_val_ptr) - amdgpu_device_wb_free(adev, adev->mes.read_val_offs); idr_destroy(&adev->mes.pasid_idr); idr_destroy(&adev->mes.gang_id_idr); @@ -249,8 +236,6 @@ void amdgpu_mes_fini(struct amdgpu_device *adev) amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs[i]); } - if (adev->mes.read_val_ptr) - amdgpu_device_wb_free(adev, adev->mes.read_val_offs); amdgpu_mes_doorbell_free(adev); @@ -921,10 +906,19 @@ uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg) { struct mes_misc_op_input op_input; int r, val = 0; + uint32_t addr_offset = 0; + uint64_t read_val_gpu_addr; + uint32_t *read_val_ptr; + if (amdgpu_device_wb_get(adev, &addr_offset)) { + DRM_ERROR("critical bug! too many mes readers\n"); + goto error; + } + read_val_gpu_addr = adev->wb.gpu_addr + (addr_offset * 4); + read_val_ptr = (uint32_t *)&adev->wb.wb[addr_offset]; op_input.op = MES_MISC_OP_READ_REG; op_input.read_reg.reg_offset = reg; - op_input.read_reg.buffer_addr = adev->mes.read_val_gpu_addr; + op_input.read_reg.buffer_addr = read_val_gpu_addr; if (!adev->mes.funcs->misc_op) { DRM_ERROR("mes rreg is not supported!\n"); @@ -935,9 +929,11 @@ uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg) if (r) DRM_ERROR("failed to read reg (0x%x)\n", reg); else - val = *(adev->mes.read_val_ptr); + val = *(read_val_ptr); error: + if (addr_offset) + amdgpu_device_wb_free(adev, addr_offset); return val; } @@ -1682,6 +1678,29 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev) return is_supported; } +/* Fix me -- node_id is used to identify the correct MES instances in the future */ +int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable) +{ + struct mes_misc_op_input op_input = {0}; + int r; + + op_input.op = MES_MISC_OP_CHANGE_CONFIG; + op_input.change_config.option.limit_single_process = enable ? 1 : 0; + + if (!adev->mes.funcs->misc_op) { + dev_err(adev->dev, "mes change config is not supported!\n"); + r = -EINVAL; + goto error; + } + + r = adev->mes.funcs->misc_op(&adev->mes, &op_input); + if (r) + dev_err(adev->dev, "failed to change_config.\n"); + +error: + return r; +} + #if defined(CONFIG_DEBUG_FS) static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 0684e482a204..c6f93cbd6739 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -40,6 +40,7 @@ #define AMDGPU_MES_VERSION_MASK 0x00000fff #define AMDGPU_MES_API_VERSION_MASK 0x00fff000 #define AMDGPU_MES_FEAT_VERSION_MASK 0xff000000 +#define AMDGPU_MES_MSCRATCH_SIZE 0x8000 enum amdgpu_mes_priority_level { AMDGPU_MES_PRIORITY_LEVEL_LOW = 0, @@ -120,9 +121,6 @@ struct amdgpu_mes { uint32_t query_status_fence_offs[AMDGPU_MAX_MES_PIPES]; uint64_t query_status_fence_gpu_addr[AMDGPU_MAX_MES_PIPES]; uint64_t *query_status_fence_ptr[AMDGPU_MAX_MES_PIPES]; - uint32_t read_val_offs; - uint64_t read_val_gpu_addr; - uint32_t *read_val_ptr; uint32_t saved_flags; @@ -311,6 +309,7 @@ enum mes_misc_opcode { MES_MISC_OP_WRM_REG_WAIT, MES_MISC_OP_WRM_REG_WR_WAIT, MES_MISC_OP_SET_SHADER_DEBUGGER, + MES_MISC_OP_CHANGE_CONFIG, }; struct mes_misc_op_input { @@ -349,6 +348,21 @@ struct mes_misc_op_input { uint32_t tcp_watch_cntl[4]; uint32_t trap_en; } set_shader_debugger; + + struct { + union { + struct { + uint32_t limit_single_process : 1; + uint32_t enable_hws_logging_buffer : 1; + uint32_t reserved : 30; + }; + uint32_t all; + } option; + struct { + uint32_t tdr_level; + uint32_t tdr_delay; + } tdr_config; + } change_config; }; }; @@ -519,4 +533,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes) } bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev); + +int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable); + #endif /* __AMDGPU_MES_H__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index b5f65ef1efcd..6852d50caa89 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -162,7 +162,8 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) * When GTT is just an alternative to VRAM make sure that we * only use it as fallback and still try to fill up VRAM first. */ - if (domain & abo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) + if (domain & abo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM && + !(adev->flags & AMD_IS_APU)) places[c].flags |= TTM_PL_FLAG_FALLBACK; c++; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b772299e1067..1bc95b0cdbb8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1214,6 +1214,42 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, } } +static void amdgpu_ras_virt_error_generate_report(struct amdgpu_device *adev, + struct ras_query_if *query_if, + struct ras_err_data *err_data, + struct ras_query_context *qctx) +{ + unsigned long new_ue, new_ce, new_de; + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &query_if->head); + const char *blk_name = get_ras_block_str(&query_if->head); + u64 event_id = qctx->evid.event_id; + + new_ce = err_data->ce_count - obj->err_data.ce_count; + new_ue = err_data->ue_count - obj->err_data.ue_count; + new_de = err_data->de_count - obj->err_data.de_count; + + if (new_ce) { + RAS_EVENT_LOG(adev, event_id, "%lu correctable hardware errors " + "detected in %s block\n", + new_ce, + blk_name); + } + + if (new_ue) { + RAS_EVENT_LOG(adev, event_id, "%lu uncorrectable hardware errors " + "detected in %s block\n", + new_ue, + blk_name); + } + + if (new_de) { + RAS_EVENT_LOG(adev, event_id, "%lu deferred hardware errors " + "detected in %s block\n", + new_de, + blk_name); + } +} + static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data) { struct ras_err_node *err_node; @@ -1237,6 +1273,15 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s } } +static void amdgpu_ras_mgr_virt_error_data_statistics_update(struct ras_manager *obj, + struct ras_err_data *err_data) +{ + /* Host reports absolute counts */ + obj->err_data.ue_count = err_data->ue_count; + obj->err_data.ce_count = err_data->ce_count; + obj->err_data.de_count = err_data->de_count; +} + static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk) { struct ras_common_if head; @@ -1323,7 +1368,9 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY) return -EINVAL; - if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) { + if (error_query_mode == AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) { + return amdgpu_virt_req_ras_err_count(adev, blk, err_data); + } else if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) { if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { amdgpu_ras_get_ecc_info(adev, err_data); } else { @@ -1405,14 +1452,22 @@ static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev, if (ret) goto out_fini_err_data; - amdgpu_rasmgr_error_data_statistic_update(obj, &err_data); + if (error_query_mode != AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) { + amdgpu_rasmgr_error_data_statistic_update(obj, &err_data); + amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx); + } else { + /* Host provides absolute error counts. First generate the report + * using the previous VF internal count against new host count. + * Then Update VF internal count. + */ + amdgpu_ras_virt_error_generate_report(adev, info, &err_data, &qctx); + amdgpu_ras_mgr_virt_error_data_statistics_update(obj, &err_data); + } info->ue_count = obj->err_data.ue_count; info->ce_count = obj->err_data.ce_count; info->de_count = obj->err_data.de_count; - amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx); - out_fini_err_data: amdgpu_ras_error_data_fini(&err_data); @@ -3453,6 +3508,11 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) if (!amdgpu_ras_asic_supported(adev)) return; + if (amdgpu_sriov_vf(adev)) { + if (amdgpu_virt_get_ras_capability(adev)) + goto init_ras_enabled_flag; + } + /* query ras capability from psp */ if (amdgpu_psp_get_ras_capability(&adev->psp)) goto init_ras_enabled_flag; @@ -3925,7 +3985,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) } /* Guest side doesn't need init ras feature */ - if (amdgpu_sriov_vf(adev)) + if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev)) return 0; list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { @@ -4392,11 +4452,14 @@ bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, return false; } - if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode)) + if (amdgpu_sriov_vf(adev)) { + *error_query_mode = AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY; + } else if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode)) { *error_query_mode = (con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY; - else + } else { *error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY; + } return true; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 871b2d6278e0..6db772ecfee4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -365,6 +365,7 @@ enum amdgpu_ras_error_query_mode { AMDGPU_RAS_INVALID_ERROR_QUERY = 0, AMDGPU_RAS_DIRECT_ERROR_QUERY = 1, AMDGPU_RAS_FIRMWARE_ERROR_QUERY = 2, + AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY = 3, }; /* ras error status reisger fields */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 5868b4a32ea6..8c89b69edc20 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -413,3 +413,44 @@ void amdgpu_debugfs_sdma_sched_mask_init(struct amdgpu_device *adev) &amdgpu_debugfs_sdma_sched_mask_fops); #endif } + +static ssize_t amdgpu_get_sdma_reset_mask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(ddev); + + if (!adev) + return -ENODEV; + + return amdgpu_show_reset_mask(buf, adev->sdma.supported_reset); +} + +static DEVICE_ATTR(sdma_reset_mask, 0444, + amdgpu_get_sdma_reset_mask, NULL); + +int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev) +{ + int r = 0; + + if (!amdgpu_gpu_recovery) + return r; + + if (adev->sdma.num_instances) { + r = device_create_file(adev->dev, &dev_attr_sdma_reset_mask); + if (r) + return r; + } + + return r; +} + +void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev) +{ + if (!amdgpu_gpu_recovery) + return; + + if (adev->sdma.num_instances) + device_remove_file(adev->dev, &dev_attr_sdma_reset_mask); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h index a37fcd9bb981..2db58b5812a8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h @@ -116,6 +116,7 @@ struct amdgpu_sdma { struct ras_common_if *ras_if; struct amdgpu_sdma_ras *ras; uint32_t *ip_dump; + uint32_t supported_reset; }; /* @@ -176,4 +177,6 @@ void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device *adev, bool duplicate); int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev); void amdgpu_debugfs_sdma_sched_mask_init(struct amdgpu_device *adev); +int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev); +void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index bb7b9b2eaac1..896f3609b0ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -318,6 +318,9 @@ int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r if (r) return r; + if (amdgpu_sriov_vf(adev)) + return r; + if (amdgpu_ras_is_supported(adev, ras_block->block)) { r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0); if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index b6397d3229e1..c704e9803e11 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -523,6 +523,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) adev->unique_id = ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid; + adev->virt.ras_en_caps.all = ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->ras_en_caps.all; + adev->virt.ras_telemetry_en_caps.all = + ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->ras_telemetry_en_caps.all; break; default: dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version); @@ -703,6 +706,8 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev) adev->virt.fw_reserve.p_vf2pf = (struct amd_sriov_msg_vf2pf_info_header *) (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB << 10)); + adev->virt.fw_reserve.ras_telemetry = + (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB << 10)); } else if (adev->mman.drv_vram_usage_va) { adev->virt.fw_reserve.p_pf2vf = (struct amd_sriov_msg_pf2vf_info_header *) @@ -710,6 +715,8 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev) adev->virt.fw_reserve.p_vf2pf = (struct amd_sriov_msg_vf2pf_info_header *) (adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB << 10)); + adev->virt.fw_reserve.ras_telemetry = + (adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB << 10)); } amdgpu_virt_read_pf2vf_data(adev); @@ -1144,3 +1151,185 @@ bool amdgpu_sriov_xnack_support(struct amdgpu_device *adev) return xnack_mode; } + +bool amdgpu_virt_get_ras_capability(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (!amdgpu_sriov_ras_caps_en(adev)) + return false; + + if (adev->virt.ras_en_caps.bits.block_umc) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__UMC); + if (adev->virt.ras_en_caps.bits.block_sdma) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__SDMA); + if (adev->virt.ras_en_caps.bits.block_gfx) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__GFX); + if (adev->virt.ras_en_caps.bits.block_mmhub) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MMHUB); + if (adev->virt.ras_en_caps.bits.block_athub) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__ATHUB); + if (adev->virt.ras_en_caps.bits.block_pcie_bif) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__PCIE_BIF); + if (adev->virt.ras_en_caps.bits.block_hdp) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__HDP); + if (adev->virt.ras_en_caps.bits.block_xgmi_wafl) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__XGMI_WAFL); + if (adev->virt.ras_en_caps.bits.block_df) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__DF); + if (adev->virt.ras_en_caps.bits.block_smn) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__SMN); + if (adev->virt.ras_en_caps.bits.block_sem) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__SEM); + if (adev->virt.ras_en_caps.bits.block_mp0) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MP0); + if (adev->virt.ras_en_caps.bits.block_mp1) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MP1); + if (adev->virt.ras_en_caps.bits.block_fuse) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__FUSE); + if (adev->virt.ras_en_caps.bits.block_mca) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MCA); + if (adev->virt.ras_en_caps.bits.block_vcn) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__VCN); + if (adev->virt.ras_en_caps.bits.block_jpeg) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__JPEG); + if (adev->virt.ras_en_caps.bits.block_ih) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__IH); + if (adev->virt.ras_en_caps.bits.block_mpio) + adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MPIO); + + if (adev->virt.ras_en_caps.bits.poison_propogation_mode) + con->poison_supported = true; /* Poison is handled by host */ + + return true; +} + +static inline enum amd_sriov_ras_telemetry_gpu_block +amdgpu_ras_block_to_sriov(struct amdgpu_device *adev, enum amdgpu_ras_block block) { + switch (block) { + case AMDGPU_RAS_BLOCK__UMC: + return RAS_TELEMETRY_GPU_BLOCK_UMC; + case AMDGPU_RAS_BLOCK__SDMA: + return RAS_TELEMETRY_GPU_BLOCK_SDMA; + case AMDGPU_RAS_BLOCK__GFX: + return RAS_TELEMETRY_GPU_BLOCK_GFX; + case AMDGPU_RAS_BLOCK__MMHUB: + return RAS_TELEMETRY_GPU_BLOCK_MMHUB; + case AMDGPU_RAS_BLOCK__ATHUB: + return RAS_TELEMETRY_GPU_BLOCK_ATHUB; + case AMDGPU_RAS_BLOCK__PCIE_BIF: + return RAS_TELEMETRY_GPU_BLOCK_PCIE_BIF; + case AMDGPU_RAS_BLOCK__HDP: + return RAS_TELEMETRY_GPU_BLOCK_HDP; + case AMDGPU_RAS_BLOCK__XGMI_WAFL: + return RAS_TELEMETRY_GPU_BLOCK_XGMI_WAFL; + case AMDGPU_RAS_BLOCK__DF: + return RAS_TELEMETRY_GPU_BLOCK_DF; + case AMDGPU_RAS_BLOCK__SMN: + return RAS_TELEMETRY_GPU_BLOCK_SMN; + case AMDGPU_RAS_BLOCK__SEM: + return RAS_TELEMETRY_GPU_BLOCK_SEM; + case AMDGPU_RAS_BLOCK__MP0: + return RAS_TELEMETRY_GPU_BLOCK_MP0; + case AMDGPU_RAS_BLOCK__MP1: + return RAS_TELEMETRY_GPU_BLOCK_MP1; + case AMDGPU_RAS_BLOCK__FUSE: + return RAS_TELEMETRY_GPU_BLOCK_FUSE; + case AMDGPU_RAS_BLOCK__MCA: + return RAS_TELEMETRY_GPU_BLOCK_MCA; + case AMDGPU_RAS_BLOCK__VCN: + return RAS_TELEMETRY_GPU_BLOCK_VCN; + case AMDGPU_RAS_BLOCK__JPEG: + return RAS_TELEMETRY_GPU_BLOCK_JPEG; + case AMDGPU_RAS_BLOCK__IH: + return RAS_TELEMETRY_GPU_BLOCK_IH; + case AMDGPU_RAS_BLOCK__MPIO: + return RAS_TELEMETRY_GPU_BLOCK_MPIO; + default: + dev_err(adev->dev, "Unsupported SRIOV RAS telemetry block 0x%x\n", block); + return RAS_TELEMETRY_GPU_BLOCK_COUNT; + } +} + +static int amdgpu_virt_cache_host_error_counts(struct amdgpu_device *adev, + struct amdsriov_ras_telemetry *host_telemetry) +{ + struct amd_sriov_ras_telemetry_error_count *tmp = NULL; + uint32_t checksum, used_size; + + checksum = host_telemetry->header.checksum; + used_size = host_telemetry->header.used_size; + + if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10)) + return 0; + + tmp = kmalloc(used_size, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + memcpy(tmp, &host_telemetry->body.error_count, used_size); + + if (checksum != amd_sriov_msg_checksum(tmp, used_size, 0, 0)) + goto out; + + memcpy(&adev->virt.count_cache, tmp, + min(used_size, sizeof(adev->virt.count_cache))); +out: + kfree(tmp); + + return 0; +} + +static int amdgpu_virt_req_ras_err_count_internal(struct amdgpu_device *adev, bool force_update) +{ + struct amdgpu_virt *virt = &adev->virt; + + /* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, the Host + * will ignore incoming guest messages. Ratelimit the guest messages to + * prevent guest self DOS. + */ + if (__ratelimit(&adev->virt.ras_telemetry_rs) || force_update) { + if (!virt->ops->req_ras_err_count(adev)) + amdgpu_virt_cache_host_error_counts(adev, + adev->virt.fw_reserve.ras_telemetry); + } + + return 0; +} + +/* Bypass ACA interface and query ECC counts directly from host */ +int amdgpu_virt_req_ras_err_count(struct amdgpu_device *adev, enum amdgpu_ras_block block, + struct ras_err_data *err_data) +{ + enum amd_sriov_ras_telemetry_gpu_block sriov_block; + + sriov_block = amdgpu_ras_block_to_sriov(adev, block); + + if (sriov_block >= RAS_TELEMETRY_GPU_BLOCK_COUNT || + !amdgpu_sriov_ras_telemetry_block_en(adev, sriov_block)) + return -EOPNOTSUPP; + + /* Host Access may be lost during reset, just return last cached data. */ + if (down_read_trylock(&adev->reset_domain->sem)) { + amdgpu_virt_req_ras_err_count_internal(adev, false); + up_read(&adev->reset_domain->sem); + } + + err_data->ue_count = adev->virt.count_cache.block[sriov_block].ue_count; + err_data->ce_count = adev->virt.count_cache.block[sriov_block].ce_count; + err_data->de_count = adev->virt.count_cache.block[sriov_block].de_count; + + return 0; +} + +int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev) +{ + unsigned long ue_count, ce_count; + + if (amdgpu_sriov_ras_telemetry_en(adev)) { + amdgpu_virt_req_ras_err_count_internal(adev, true); + amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL); + } + + return 0; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index b650a2032c42..5381b8d596e6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -95,6 +95,7 @@ struct amdgpu_virt_ops { void (*ras_poison_handler)(struct amdgpu_device *adev, enum amdgpu_ras_block block); bool (*rcvd_ras_intr)(struct amdgpu_device *adev); + int (*req_ras_err_count)(struct amdgpu_device *adev); }; /* @@ -103,6 +104,7 @@ struct amdgpu_virt_ops { struct amdgpu_virt_fw_reserve { struct amd_sriov_msg_pf2vf_info_header *p_pf2vf; struct amd_sriov_msg_vf2pf_info_header *p_vf2pf; + void *ras_telemetry; unsigned int checksum_key; }; @@ -136,6 +138,8 @@ enum AMDGIM_FEATURE_FLAG { AMDGIM_FEATURE_VCN_RB_DECOUPLE = (1 << 7), /* MES info */ AMDGIM_FEATURE_MES_INFO_ENABLE = (1 << 8), + AMDGIM_FEATURE_RAS_CAPS = (1 << 9), + AMDGIM_FEATURE_RAS_TELEMETRY = (1 << 10), }; enum AMDGIM_REG_ACCESS_FLAG { @@ -276,6 +280,12 @@ struct amdgpu_virt { uint32_t autoload_ucode_id; struct mutex rlcg_reg_lock; + + union amd_sriov_ras_caps ras_en_caps; + union amd_sriov_ras_caps ras_telemetry_en_caps; + + struct ratelimit_state ras_telemetry_rs; + struct amd_sriov_ras_telemetry_error_count count_cache; }; struct amdgpu_video_codec_info; @@ -320,6 +330,15 @@ struct amdgpu_video_codec_info; #define amdgpu_sriov_vf_mmio_access_protection(adev) \ ((adev)->virt.caps & AMDGPU_VF_MMIO_ACCESS_PROTECT) +#define amdgpu_sriov_ras_caps_en(adev) \ +((adev)->virt.gim_feature & AMDGIM_FEATURE_RAS_CAPS) + +#define amdgpu_sriov_ras_telemetry_en(adev) \ +(((adev)->virt.gim_feature & AMDGIM_FEATURE_RAS_TELEMETRY) && (adev)->virt.fw_reserve.ras_telemetry) + +#define amdgpu_sriov_ras_telemetry_block_en(adev, sriov_blk) \ +(amdgpu_sriov_ras_telemetry_en((adev)) && (adev)->virt.ras_telemetry_en_caps.all & BIT(sriov_blk)) + static inline bool is_virtual_machine(void) { #if defined(CONFIG_X86) @@ -383,4 +402,8 @@ bool amdgpu_virt_get_rlcg_reg_access_flag(struct amdgpu_device *adev, u32 acc_flags, u32 hwip, bool write, u32 *rlcg_flag); u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 flag, u32 xcc_id); +bool amdgpu_virt_get_ras_capability(struct amdgpu_device *adev); +int amdgpu_virt_req_ras_err_count(struct amdgpu_device *adev, enum amdgpu_ras_block block, + struct ras_err_data *err_data); +int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c index 46713a158d90..3e6f9dfb61bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c @@ -377,6 +377,13 @@ static int vpe_sw_init(struct amdgpu_ip_block *ip_block) ret = vpe_init_microcode(vpe); if (ret) goto out; + + /* TODO: Add queue reset mask when FW fully supports it */ + adev->vpe.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->vpe.ring); + ret = amdgpu_vpe_sysfs_reset_mask_init(adev); + if (ret) + goto out; out: return ret; } @@ -389,6 +396,7 @@ static int vpe_sw_fini(struct amdgpu_ip_block *ip_block) release_firmware(vpe->fw); vpe->fw = NULL; + amdgpu_vpe_sysfs_reset_mask_fini(adev); vpe_ring_fini(vpe); amdgpu_bo_free_kernel(&adev->vpe.cmdbuf_obj, @@ -865,6 +873,41 @@ static void vpe_ring_end_use(struct amdgpu_ring *ring) schedule_delayed_work(&adev->vpe.idle_work, VPE_IDLE_TIMEOUT); } +static ssize_t amdgpu_get_vpe_reset_mask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(ddev); + + if (!adev) + return -ENODEV; + + return amdgpu_show_reset_mask(buf, adev->vpe.supported_reset); +} + +static DEVICE_ATTR(vpe_reset_mask, 0444, + amdgpu_get_vpe_reset_mask, NULL); + +int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device *adev) +{ + int r = 0; + + if (adev->vpe.num_instances) { + r = device_create_file(adev->dev, &dev_attr_vpe_reset_mask); + if (r) + return r; + } + + return r; +} + +void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev) +{ + if (adev->vpe.num_instances) + device_remove_file(adev->dev, &dev_attr_vpe_reset_mask); +} + static const struct amdgpu_ring_funcs vpe_ring_funcs = { .type = AMDGPU_RING_TYPE_VPE, .align_mask = 0xf, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h index 231d86d0953e..695da740a97e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h @@ -79,6 +79,7 @@ struct amdgpu_vpe { uint32_t num_instances; bool collaborate_mode; + uint32_t supported_reset; }; int amdgpu_vpe_psp_update_sram(struct amdgpu_device *adev); @@ -86,6 +87,8 @@ int amdgpu_vpe_init_microcode(struct amdgpu_vpe *vpe); int amdgpu_vpe_ring_init(struct amdgpu_vpe *vpe); int amdgpu_vpe_ring_fini(struct amdgpu_vpe *vpe); int amdgpu_vpe_configure_dpm(struct amdgpu_vpe *vpe); +void amdgpu_vpe_sysfs_reset_mask_fini(struct amdgpu_device *adev); +int amdgpu_vpe_sysfs_reset_mask_init(struct amdgpu_device *adev); #define vpe_ring_init(vpe) ((vpe)->funcs->ring_init ? (vpe)->funcs->ring_init((vpe)) : 0) #define vpe_ring_start(vpe) ((vpe)->funcs->ring_start ? (vpe)->funcs->ring_start((vpe)) : 0) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c index 83a16918ea76..e209b5e101df 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c @@ -471,6 +471,16 @@ static const char *xcp_desc[] = { [AMDGPU_CPX_PARTITION_MODE] = "CPX", }; +static const char *nps_desc[] = { + [UNKNOWN_MEMORY_PARTITION_MODE] = "UNKNOWN", + [AMDGPU_NPS1_PARTITION_MODE] = "NPS1", + [AMDGPU_NPS2_PARTITION_MODE] = "NPS2", + [AMDGPU_NPS3_PARTITION_MODE] = "NPS3", + [AMDGPU_NPS4_PARTITION_MODE] = "NPS4", + [AMDGPU_NPS6_PARTITION_MODE] = "NPS6", + [AMDGPU_NPS8_PARTITION_MODE] = "NPS8", +}; + ATTRIBUTE_GROUPS(xcp_cfg_res_sysfs); #define to_xcp_attr(x) \ @@ -540,6 +550,26 @@ static ssize_t supported_xcp_configs_show(struct kobject *kobj, return size; } +static ssize_t supported_nps_configs_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct amdgpu_xcp_cfg *xcp_cfg = to_xcp_cfg(kobj); + int size = 0, mode; + char *sep = ""; + + if (!xcp_cfg || !xcp_cfg->compatible_nps_modes) + return sysfs_emit(buf, "Not supported\n"); + + for_each_inst(mode, xcp_cfg->compatible_nps_modes) { + size += sysfs_emit_at(buf, size, "%s%s", sep, nps_desc[mode]); + sep = ", "; + } + + size += sysfs_emit_at(buf, size, "\n"); + + return size; +} + static ssize_t xcp_config_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -596,6 +626,9 @@ static const struct kobj_type xcp_cfg_sysfs_ktype = { static struct kobj_attribute supp_part_sysfs_mode = __ATTR_RO(supported_xcp_configs); +static struct kobj_attribute supp_nps_sysfs_mode = + __ATTR_RO(supported_nps_configs); + static const struct attribute *xcp_attrs[] = { &supp_part_sysfs_mode.attr, &xcp_cfg_sysfs_mode.attr, @@ -625,13 +658,24 @@ void amdgpu_xcp_cfg_sysfs_init(struct amdgpu_device *adev) if (r) goto err1; + if (adev->gmc.supported_nps_modes != 0) { + r = sysfs_create_file(&xcp_cfg->kobj, &supp_nps_sysfs_mode.attr); + if (r) { + sysfs_remove_files(&xcp_cfg->kobj, xcp_attrs); + goto err1; + } + } + mode = (xcp_cfg->xcp_mgr->mode == AMDGPU_UNKNOWN_COMPUTE_PARTITION_MODE) ? AMDGPU_SPX_PARTITION_MODE : xcp_cfg->xcp_mgr->mode; r = amdgpu_xcp_get_res_info(xcp_cfg->xcp_mgr, mode, xcp_cfg); - if (r) + if (r) { + sysfs_remove_file(&xcp_cfg->kobj, &supp_nps_sysfs_mode.attr); + sysfs_remove_files(&xcp_cfg->kobj, xcp_attrs); goto err1; + } xcp_cfg->mode = mode; for (i = 0; i < xcp_cfg->num_res; i++) { @@ -653,6 +697,7 @@ err: kobject_put(&xcp_res->kobj); } + sysfs_remove_file(&xcp_cfg->kobj, &supp_nps_sysfs_mode.attr); sysfs_remove_files(&xcp_cfg->kobj, xcp_attrs); err1: kobject_put(&xcp_cfg->kobj); @@ -673,6 +718,7 @@ void amdgpu_xcp_cfg_sysfs_fini(struct amdgpu_device *adev) kobject_put(&xcp_res->kobj); } + sysfs_remove_file(&xcp_cfg->kobj, &supp_nps_sysfs_mode.attr); sysfs_remove_files(&xcp_cfg->kobj, xcp_attrs); kobject_put(&xcp_cfg->kobj); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h index 6e9eeaeb3de1..b4f9c2f4e92c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h @@ -28,17 +28,21 @@ #define AMD_SRIOV_MSG_VBIOS_SIZE_KB 64 #define AMD_SRIOV_MSG_DATAEXCHANGE_OFFSET_KB AMD_SRIOV_MSG_VBIOS_SIZE_KB #define AMD_SRIOV_MSG_DATAEXCHANGE_SIZE_KB 4 - +#define AMD_SRIOV_MSG_TMR_OFFSET_KB 2048 +#define AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB 2 +#define AMD_SRIOV_RAS_TELEMETRY_SIZE_KB 64 /* * layout - * 0 64KB 65KB 66KB - * | VBIOS | PF2VF | VF2PF | Bad Page | ... - * | 64KB | 1KB | 1KB | + * 0 64KB 65KB 66KB 68KB 132KB + * | VBIOS | PF2VF | VF2PF | Bad Page | RAS Telemetry Region | ... + * | 64KB | 1KB | 1KB | 2KB | 64KB | ... */ + #define AMD_SRIOV_MSG_SIZE_KB 1 #define AMD_SRIOV_MSG_PF2VF_OFFSET_KB AMD_SRIOV_MSG_DATAEXCHANGE_OFFSET_KB #define AMD_SRIOV_MSG_VF2PF_OFFSET_KB (AMD_SRIOV_MSG_PF2VF_OFFSET_KB + AMD_SRIOV_MSG_SIZE_KB) #define AMD_SRIOV_MSG_BAD_PAGE_OFFSET_KB (AMD_SRIOV_MSG_VF2PF_OFFSET_KB + AMD_SRIOV_MSG_SIZE_KB) +#define AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB (AMD_SRIOV_MSG_BAD_PAGE_OFFSET_KB + AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB) /* * PF2VF history log: @@ -86,30 +90,59 @@ enum amd_sriov_ucode_engine_id { union amd_sriov_msg_feature_flags { struct { - uint32_t error_log_collect : 1; - uint32_t host_load_ucodes : 1; - uint32_t host_flr_vramlost : 1; - uint32_t mm_bw_management : 1; - uint32_t pp_one_vf_mode : 1; - uint32_t reg_indirect_acc : 1; - uint32_t av1_support : 1; - uint32_t vcn_rb_decouple : 1; - uint32_t mes_info_enable : 1; - uint32_t reserved : 23; + uint32_t error_log_collect : 1; + uint32_t host_load_ucodes : 1; + uint32_t host_flr_vramlost : 1; + uint32_t mm_bw_management : 1; + uint32_t pp_one_vf_mode : 1; + uint32_t reg_indirect_acc : 1; + uint32_t av1_support : 1; + uint32_t vcn_rb_decouple : 1; + uint32_t mes_info_dump_enable : 1; + uint32_t ras_caps : 1; + uint32_t ras_telemetry : 1; + uint32_t reserved : 21; } flags; uint32_t all; }; union amd_sriov_reg_access_flags { struct { - uint32_t vf_reg_access_ih : 1; - uint32_t vf_reg_access_mmhub : 1; - uint32_t vf_reg_access_gc : 1; - uint32_t reserved : 29; + uint32_t vf_reg_access_ih : 1; + uint32_t vf_reg_access_mmhub : 1; + uint32_t vf_reg_access_gc : 1; + uint32_t reserved : 29; } flags; uint32_t all; }; +union amd_sriov_ras_caps { + struct { + uint64_t block_umc : 1; + uint64_t block_sdma : 1; + uint64_t block_gfx : 1; + uint64_t block_mmhub : 1; + uint64_t block_athub : 1; + uint64_t block_pcie_bif : 1; + uint64_t block_hdp : 1; + uint64_t block_xgmi_wafl : 1; + uint64_t block_df : 1; + uint64_t block_smn : 1; + uint64_t block_sem : 1; + uint64_t block_mp0 : 1; + uint64_t block_mp1 : 1; + uint64_t block_fuse : 1; + uint64_t block_mca : 1; + uint64_t block_vcn : 1; + uint64_t block_jpeg : 1; + uint64_t block_ih : 1; + uint64_t block_mpio : 1; + uint64_t poison_propogation_mode : 1; + uint64_t reserved : 44; + } bits; + uint64_t all; +}; + union amd_sriov_msg_os_info { struct { uint32_t windows : 1; @@ -158,7 +191,7 @@ struct amd_sriov_msg_pf2vf_info_header { uint32_t reserved[2]; }; -#define AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE (49) +#define AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE (55) struct amd_sriov_msg_pf2vf_info { /* header contains size and version */ struct amd_sriov_msg_pf2vf_info_header header; @@ -211,6 +244,12 @@ struct amd_sriov_msg_pf2vf_info { uint32_t pcie_atomic_ops_support_flags; /* Portion of GPU memory occupied by VF. MAX value is 65535, but set to uint32_t to maintain alignment with reserved size */ uint32_t gpu_capacity; + /* vf bdf on host pci tree for debug only */ + uint32_t bdf_on_host; + uint32_t more_bp; //Reserved for future use. + union amd_sriov_ras_caps ras_en_caps; + union amd_sriov_ras_caps ras_telemetry_en_caps; + /* reserved */ uint32_t reserved[256 - AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE]; } __packed; @@ -283,8 +322,12 @@ enum amd_sriov_mailbox_request_message { MB_REQ_MSG_REL_GPU_FINI_ACCESS, MB_REQ_MSG_REQ_GPU_RESET_ACCESS, MB_REQ_MSG_REQ_GPU_INIT_DATA, + MB_REQ_MSG_PSP_VF_CMD_RELAY, MB_REQ_MSG_LOG_VF_ERROR = 200, + MB_REQ_MSG_READY_TO_RESET = 201, + MB_REQ_MSG_RAS_POISON = 202, + MB_REQ_RAS_ERROR_COUNT = 203, }; /* mailbox message send from host to guest */ @@ -297,10 +340,60 @@ enum amd_sriov_mailbox_response_message { MB_RES_MSG_FAIL, MB_RES_MSG_QUERY_ALIVE, MB_RES_MSG_GPU_INIT_DATA_READY, + MB_RES_MSG_RAS_ERROR_COUNT_READY = 11, MB_RES_MSG_TEXT_MESSAGE = 255 }; +enum amd_sriov_ras_telemetry_gpu_block { + RAS_TELEMETRY_GPU_BLOCK_UMC = 0, + RAS_TELEMETRY_GPU_BLOCK_SDMA = 1, + RAS_TELEMETRY_GPU_BLOCK_GFX = 2, + RAS_TELEMETRY_GPU_BLOCK_MMHUB = 3, + RAS_TELEMETRY_GPU_BLOCK_ATHUB = 4, + RAS_TELEMETRY_GPU_BLOCK_PCIE_BIF = 5, + RAS_TELEMETRY_GPU_BLOCK_HDP = 6, + RAS_TELEMETRY_GPU_BLOCK_XGMI_WAFL = 7, + RAS_TELEMETRY_GPU_BLOCK_DF = 8, + RAS_TELEMETRY_GPU_BLOCK_SMN = 9, + RAS_TELEMETRY_GPU_BLOCK_SEM = 10, + RAS_TELEMETRY_GPU_BLOCK_MP0 = 11, + RAS_TELEMETRY_GPU_BLOCK_MP1 = 12, + RAS_TELEMETRY_GPU_BLOCK_FUSE = 13, + RAS_TELEMETRY_GPU_BLOCK_MCA = 14, + RAS_TELEMETRY_GPU_BLOCK_VCN = 15, + RAS_TELEMETRY_GPU_BLOCK_JPEG = 16, + RAS_TELEMETRY_GPU_BLOCK_IH = 17, + RAS_TELEMETRY_GPU_BLOCK_MPIO = 18, + RAS_TELEMETRY_GPU_BLOCK_COUNT = 19, +}; + +struct amd_sriov_ras_telemetry_header { + uint32_t checksum; + uint32_t used_size; + uint32_t reserved[2]; +}; + +struct amd_sriov_ras_telemetry_error_count { + struct { + uint32_t ce_count; + uint32_t ue_count; + uint32_t de_count; + uint32_t ce_overflow_count; + uint32_t ue_overflow_count; + uint32_t de_overflow_count; + uint32_t reserved[6]; + } block[RAS_TELEMETRY_GPU_BLOCK_COUNT]; +}; + +struct amdsriov_ras_telemetry { + struct amd_sriov_ras_telemetry_header header; + + union { + struct amd_sriov_ras_telemetry_error_count error_count; + } body; +}; + /* version data stored in MAILBOX_MSGBUF_RCV_DW1 for future expansion */ enum amd_sriov_gpu_init_data_version { GPU_INIT_DATA_READY_V1 = 1, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index d1a18ca584dd..24dce803a829 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -4825,6 +4825,11 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block) } } } + /* TODO: Add queue reset mask when FW fully supports it */ + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); r = amdgpu_gfx_kiq_init(adev, GFX10_MEC_HPD_SIZE, 0); if (r) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 62e4c446793d..2ae058a224f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1580,6 +1580,8 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) } switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { + case IP_VERSION(11, 0, 0): + case IP_VERSION(11, 0, 2): case IP_VERSION(11, 0, 3): adev->gfx.cleaner_shader_ptr = gfx_11_0_3_cleaner_shader_hex; adev->gfx.cleaner_shader_size = sizeof(gfx_11_0_3_cleaner_shader_hex); @@ -1691,6 +1693,24 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) } } + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { + case IP_VERSION(11, 0, 0): + case IP_VERSION(11, 0, 2): + case IP_VERSION(11, 0, 3): + if ((adev->gfx.me_fw_version >= 2280) && + (adev->gfx.mec_fw_version >= 2410)) { + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + } + break; + default: + break; + } + if (!adev->enable_mes_kiq) { r = amdgpu_gfx_kiq_init(adev, GFX11_MEC_HPD_SIZE, 0); if (r) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 1b99f90cd193..fe7c48f2fb2a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -1437,6 +1437,12 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) } } + /* TODO: Add queue reset mask when FW fully supports it */ + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + if (!adev->enable_mes_kiq) { r = amdgpu_gfx_kiq_init(adev, GFX12_MEC_HPD_SIZE, 0); if (r) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 480c41ee947e..b7006c41e270 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -4823,6 +4823,13 @@ static int gfx_v8_0_kcq_disable(struct amdgpu_device *adev) amdgpu_ring_write(kiq_ring, 0); amdgpu_ring_write(kiq_ring, 0); } + /* Submit unmap queue packet */ + amdgpu_ring_commit(kiq_ring); + /* + * Ring test will do a basic scratch register change check. Just run + * this to ensure that unmap queues that is submitted before got + * processed successfully before returning. + */ r = amdgpu_ring_test_helper(kiq_ring); if (r) DRM_ERROR("KCQ disable failed\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index a880dce16ae2..0b6f09f2cc9b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2374,6 +2374,12 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block) } } + /* TODO: Add queue reset mask when FW fully supports it */ + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + r = amdgpu_gfx_kiq_init(adev, GFX9_MEC_HPD_SIZE, 0); if (r) { DRM_ERROR("Failed to init KIQ BOs!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 983088805c3a..e2b3dda57030 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -1157,6 +1157,19 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block) return r; } + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { + case IP_VERSION(9, 4, 3): + case IP_VERSION(9, 4, 4): + if (adev->gfx.mec_fw_version >= 155) { + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_PIPE; + } + break; + default: + break; + } r = gfx_v9_4_3_gpu_early_init(adev); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index f43ded8a0aab..50c5da3020cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1130,8 +1130,10 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev, uint64_t *flags) { struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev); - bool is_vram = bo->tbo.resource->mem_type == TTM_PL_VRAM; - bool coherent = bo->flags & (AMDGPU_GEM_CREATE_COHERENT | AMDGPU_GEM_CREATE_EXT_COHERENT); + bool is_vram = bo->tbo.resource && + bo->tbo.resource->mem_type == TTM_PL_VRAM; + bool coherent = bo->flags & (AMDGPU_GEM_CREATE_COHERENT | + AMDGPU_GEM_CREATE_EXT_COHERENT); bool ext_coherent = bo->flags & AMDGPU_GEM_CREATE_EXT_COHERENT; bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED; struct amdgpu_vm *vm = mapping->bo_va->base.vm; @@ -1139,6 +1141,8 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev, bool snoop = false; bool is_local; + dma_resv_assert_held(bo->tbo.base.resv); + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(9, 4, 1): case IP_VERSION(9, 4, 2): @@ -1257,9 +1261,8 @@ static void gmc_v9_0_get_vm_pte(struct amdgpu_device *adev, *flags &= ~AMDGPU_PTE_VALID; } - if (bo && bo->tbo.resource) - gmc_v9_0_get_coherence_flags(adev, mapping->bo_va->base.bo, - mapping, flags); + if ((*flags & AMDGPU_PTE_VALID) && bo) + gmc_v9_0_get_coherence_flags(adev, bo, mapping, flags); } static void gmc_v9_0_override_vm_pte_flags(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c index 89953c0f5f1f..193dfac5dc76 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c @@ -123,6 +123,12 @@ static int jpeg_v4_0_sw_init(struct amdgpu_ip_block *ip_block) r = amdgpu_jpeg_ras_sw_init(adev); if (r) return r; + /* TODO: Add queue reset mask when FW fully supports it */ + adev->jpeg.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->jpeg.inst[0].ring_dec[0]); + r = amdgpu_jpeg_sysfs_reset_mask_init(adev); + if (r) + return r; return 0; } @@ -143,6 +149,7 @@ static int jpeg_v4_0_sw_fini(struct amdgpu_ip_block *ip_block) if (r) return r; + amdgpu_jpeg_sysfs_reset_mask_fini(adev); r = amdgpu_jpeg_sw_fini(adev); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c index 6917e4a8e96a..67b51bcbacd1 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c @@ -159,6 +159,13 @@ static int jpeg_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block) } } + /* TODO: Add queue reset mask when FW fully supports it */ + adev->jpeg.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->jpeg.inst[0].ring_dec[0]); + r = amdgpu_jpeg_sysfs_reset_mask_init(adev); + if (r) + return r; + return 0; } @@ -178,6 +185,7 @@ static int jpeg_v4_0_3_sw_fini(struct amdgpu_ip_block *ip_block) if (r) return r; + amdgpu_jpeg_sysfs_reset_mask_fini(adev); r = amdgpu_jpeg_sw_fini(adev); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c index f3cce523f3cb..b48e2412e6cc 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c @@ -153,6 +153,13 @@ static int jpeg_v4_0_5_sw_init(struct amdgpu_ip_block *ip_block) adev->jpeg.inst[i].external.jpeg_pitch[0] = SOC15_REG_OFFSET(JPEG, i, regUVD_JPEG_PITCH); } + /* TODO: Add queue reset mask when FW fully supports it */ + adev->jpeg.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->jpeg.inst[0].ring_dec[0]); + r = amdgpu_jpeg_sysfs_reset_mask_init(adev); + if (r) + return r; + return 0; } @@ -172,6 +179,7 @@ static int jpeg_v4_0_5_sw_fini(struct amdgpu_ip_block *ip_block) if (r) return r; + amdgpu_jpeg_sysfs_reset_mask_fini(adev); r = amdgpu_jpeg_sw_fini(adev); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c index 06840d1dae79..686f9605239d 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c @@ -100,6 +100,12 @@ static int jpeg_v5_0_0_sw_init(struct amdgpu_ip_block *ip_block) adev->jpeg.internal.jpeg_pitch[0] = regUVD_JPEG_PITCH_INTERNAL_OFFSET; adev->jpeg.inst->external.jpeg_pitch[0] = SOC15_REG_OFFSET(JPEG, 0, regUVD_JPEG_PITCH); + /* TODO: Add queue reset mask when FW fully supports it */ + adev->jpeg.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->jpeg.inst[0].ring_dec[0]); + r = amdgpu_jpeg_sysfs_reset_mask_init(adev); + if (r) + return r; return 0; } @@ -119,6 +125,7 @@ static int jpeg_v5_0_0_sw_fini(struct amdgpu_ip_block *ip_block) if (r) return r; + amdgpu_jpeg_sysfs_reset_mask_fini(adev); r = amdgpu_jpeg_sw_fini(adev); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 0e758ebf2372..9c905b9e9376 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -644,6 +644,18 @@ static int mes_v11_0_misc_op(struct amdgpu_mes *mes, sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl)); misc_pkt.set_shader_debugger.trap_en = input->set_shader_debugger.trap_en; break; + case MES_MISC_OP_CHANGE_CONFIG: + if ((mes->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) < 0x63) { + dev_err(mes->adev->dev, "MES FW versoin must be larger than 0x63 to support limit single process feature.\n"); + return -EINVAL; + } + misc_pkt.opcode = MESAPI_MISC__CHANGE_CONFIG; + misc_pkt.change_config.opcode = + MESAPI_MISC__CHANGE_CONFIG_OPTION_LIMIT_SINGLE_PROCESS; + misc_pkt.change_config.option.bits.limit_single_process = + input->change_config.option.limit_single_process; + break; + default: DRM_ERROR("unsupported misc op (%d) \n", input->op); return -EINVAL; @@ -708,6 +720,9 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes) mes->event_log_gpu_addr; } + if (enforce_isolation) + mes_set_hw_res_pkt.limit_single_process = 1; + return mes_v11_0_submit_pkt_and_poll_completion(mes, &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt), offsetof(union MESAPI_SET_HW_RESOURCES, api_status)); @@ -908,6 +923,16 @@ static void mes_v11_0_enable(struct amdgpu_device *adev, bool enable) uint32_t pipe, data = 0; if (enable) { + if (amdgpu_mes_log_enable) { + WREG32_SOC15(GC, 0, regCP_MES_MSCRATCH_LO, + lower_32_bits(adev->mes.event_log_gpu_addr + AMDGPU_MES_LOG_BUFFER_SIZE)); + WREG32_SOC15(GC, 0, regCP_MES_MSCRATCH_HI, + upper_32_bits(adev->mes.event_log_gpu_addr + AMDGPU_MES_LOG_BUFFER_SIZE)); + dev_info(adev->dev, "Setup CP MES MSCRATCH address : 0x%x. 0x%x\n", + RREG32_SOC15(GC, 0, regCP_MES_MSCRATCH_HI), + RREG32_SOC15(GC, 0, regCP_MES_MSCRATCH_LO)); + } + data = RREG32_SOC15(GC, 0, regCP_MES_CNTL); data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE0_RESET, 1); data = REG_SET_FIELD(data, CP_MES_CNTL, @@ -1370,7 +1395,7 @@ static int mes_v11_0_sw_init(struct amdgpu_ip_block *ip_block) adev->mes.kiq_hw_init = &mes_v11_0_kiq_hw_init; adev->mes.kiq_hw_fini = &mes_v11_0_kiq_hw_fini; - adev->mes.event_log_size = AMDGPU_MES_LOG_BUFFER_SIZE; + adev->mes.event_log_size = AMDGPU_MES_LOG_BUFFER_SIZE + AMDGPU_MES_MSCRATCH_SIZE; r = amdgpu_mes_init(adev); if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 3daa8862e622..9ecc5d61e49b 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -531,6 +531,14 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes, sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl)); misc_pkt.set_shader_debugger.trap_en = input->set_shader_debugger.trap_en; break; + case MES_MISC_OP_CHANGE_CONFIG: + misc_pkt.opcode = MESAPI_MISC__CHANGE_CONFIG; + misc_pkt.change_config.opcode = + MESAPI_MISC__CHANGE_CONFIG_OPTION_LIMIT_SINGLE_PROCESS; + misc_pkt.change_config.option.bits.limit_single_process = + input->change_config.option.limit_single_process; + break; + default: DRM_ERROR("unsupported misc op (%d) \n", input->op); return -EINVAL; @@ -550,7 +558,7 @@ static int mes_v12_0_set_hw_resources_1(struct amdgpu_mes *mes, int pipe) mes_set_hw_res_1_pkt.header.type = MES_API_TYPE_SCHEDULER; mes_set_hw_res_1_pkt.header.opcode = MES_SCH_API_SET_HW_RSRC_1; mes_set_hw_res_1_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; - mes_set_hw_res_1_pkt.mes_kiq_unmap_timeout = 100; + mes_set_hw_res_1_pkt.mes_kiq_unmap_timeout = 0xa; return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_set_hw_res_1_pkt, sizeof(mes_set_hw_res_1_pkt), @@ -624,6 +632,9 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe) mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr + pipe * AMDGPU_MES_LOG_BUFFER_SIZE; } + if (enforce_isolation) + mes_set_hw_res_pkt.limit_single_process = 1; + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt), offsetof(union MESAPI_SET_HW_RESOURCES, api_status)); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index f47bd7ada4d7..4dcb72d1bdda 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -61,15 +61,18 @@ static enum idh_event xgpu_nv_mailbox_peek_msg(struct amdgpu_device *adev) static int xgpu_nv_mailbox_rcv_msg(struct amdgpu_device *adev, enum idh_event event) { + int r = 0; u32 reg; reg = RREG32_NO_KIQ(mmMAILBOX_MSGBUF_RCV_DW0); - if (reg != event) + if (reg == IDH_FAIL) + r = -EINVAL; + else if (reg != event) return -ENOENT; xgpu_nv_mailbox_send_ack(adev); - return 0; + return r; } static uint8_t xgpu_nv_peek_ack(struct amdgpu_device *adev) @@ -178,6 +181,9 @@ send_request: if (data1 != 0) event = IDH_RAS_POISON_READY; break; + case IDH_REQ_RAS_ERROR_COUNT: + event = IDH_RAS_ERROR_COUNT_READY; + break; default: break; } @@ -456,6 +462,11 @@ static bool xgpu_nv_rcvd_ras_intr(struct amdgpu_device *adev) return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF); } +static int xgpu_nv_req_ras_err_count(struct amdgpu_device *adev) +{ + return xgpu_nv_send_access_requests(adev, IDH_REQ_RAS_ERROR_COUNT); +} + const struct amdgpu_virt_ops xgpu_nv_virt_ops = { .req_full_gpu = xgpu_nv_request_full_gpu_access, .rel_full_gpu = xgpu_nv_release_full_gpu_access, @@ -466,4 +477,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = { .trans_msg = xgpu_nv_mailbox_trans_msg, .ras_poison_handler = xgpu_nv_ras_poison_handler, .rcvd_ras_intr = xgpu_nv_rcvd_ras_intr, + .req_ras_err_count = xgpu_nv_req_ras_err_count, }; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h index 1d099ffb3a5a..9d61d76e1bf9 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h @@ -40,6 +40,7 @@ enum idh_request { IDH_LOG_VF_ERROR = 200, IDH_READY_TO_RESET = 201, IDH_RAS_POISON = 202, + IDH_REQ_RAS_ERROR_COUNT = 203, }; enum idh_event { @@ -54,6 +55,8 @@ enum idh_event { IDH_RAS_POISON_READY, IDH_PF_SOFT_FLR_NOTIFICATION, IDH_RAS_ERROR_DETECTED, + IDH_RAS_ERROR_COUNT_READY = 11, + IDH_TEXT_MESSAGE = 255, }; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c index fb37e354a9d5..1ac730328516 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c @@ -247,6 +247,12 @@ static void nbio_v7_7_init_registers(struct amdgpu_device *adev) if (def != data) WREG32_SOC15(NBIO, 0, regBIF0_PCIE_MST_CTRL_3, data); + switch (adev->ip_versions[NBIO_HWIP][0]) { + case IP_VERSION(7, 7, 0): + data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4) & ~BIT(23); + WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4, data); + break; + } } static void nbio_v7_7_update_medium_grain_clock_gating(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index 6b72169be8f8..3bad565ded73 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -67,8 +67,8 @@ static const struct amd_ip_funcs nv_common_ip_funcs; /* Navi */ static const struct amdgpu_video_codec_info nv_video_codecs_encode_array[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 2304, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 4096, 0)}, }; static const struct amdgpu_video_codecs nv_video_codecs_encode = { @@ -94,8 +94,8 @@ static const struct amdgpu_video_codecs nv_video_codecs_decode = { /* Sienna Cichlid */ static const struct amdgpu_video_codec_info sc_video_codecs_encode_array[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2160, 0)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 7680, 4352, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)}, }; static const struct amdgpu_video_codecs sc_video_codecs_encode = { @@ -136,8 +136,8 @@ static const struct amdgpu_video_codecs sc_video_codecs_decode_vcn1 = { /* SRIOV Sienna Cichlid, not const since data is controlled by host */ static struct amdgpu_video_codec_info sriov_sc_video_codecs_encode_array[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2160, 0)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 7680, 4352, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)}, }; static struct amdgpu_video_codec_info sriov_sc_video_codecs_decode_array_vcn0[] = { diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 9c7cea0890c9..a38553f38fdc 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1430,6 +1430,10 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block) } } + /* TODO: Add queue reset mask when FW fully supports it */ + adev->sdma.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring); + if (amdgpu_sdma_ras_sw_init(adev)) { dev_err(adev->dev, "fail to initialize sdma ras block\n"); return -EINVAL; @@ -1442,6 +1446,10 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block) else DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n"); + r = amdgpu_sdma_sysfs_reset_mask_init(adev); + if (r) + return r; + return r; } @@ -1456,6 +1464,7 @@ static int sdma_v4_4_2_sw_fini(struct amdgpu_ip_block *ip_block) amdgpu_ring_fini(&adev->sdma.instance[i].page); } + amdgpu_sdma_sysfs_reset_mask_fini(adev); if (amdgpu_ip_version(adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 2) || amdgpu_ip_version(adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 5)) amdgpu_sdma_destroy_inst_ctx(adev, true); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c index d31c4860933f..fa9b40934957 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c @@ -1452,6 +1452,19 @@ static int sdma_v5_0_sw_init(struct amdgpu_ip_block *ip_block) return r; } + adev->sdma.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring); + switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) { + case IP_VERSION(5, 0, 0): + case IP_VERSION(5, 0, 2): + case IP_VERSION(5, 0, 5): + if (adev->sdma.instance[0].fw_version >= 35) + adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + break; + default: + break; + } + /* Allocate memory for SDMA IP Dump buffer */ ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t), GFP_KERNEL); if (ptr) @@ -1459,6 +1472,10 @@ static int sdma_v5_0_sw_init(struct amdgpu_ip_block *ip_block) else DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n"); + r = amdgpu_sdma_sysfs_reset_mask_init(adev); + if (r) + return r; + return r; } @@ -1470,6 +1487,7 @@ static int sdma_v5_0_sw_fini(struct amdgpu_ip_block *ip_block) for (i = 0; i < adev->sdma.num_instances; i++) amdgpu_ring_fini(&adev->sdma.instance[i].ring); + amdgpu_sdma_sysfs_reset_mask_fini(adev); amdgpu_sdma_destroy_inst_ctx(adev, false); kfree(adev->sdma.ip_dump); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index ffa8c62ac101..ba5160399ab2 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -1357,6 +1357,24 @@ static int sdma_v5_2_sw_init(struct amdgpu_ip_block *ip_block) return r; } + adev->sdma.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring); + switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) { + case IP_VERSION(5, 2, 0): + case IP_VERSION(5, 2, 2): + case IP_VERSION(5, 2, 3): + case IP_VERSION(5, 2, 4): + if (adev->sdma.instance[0].fw_version >= 76) + adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + break; + case IP_VERSION(5, 2, 5): + if (adev->sdma.instance[0].fw_version >= 34) + adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + break; + default: + break; + } + /* Allocate memory for SDMA IP Dump buffer */ ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t), GFP_KERNEL); if (ptr) @@ -1364,6 +1382,10 @@ static int sdma_v5_2_sw_init(struct amdgpu_ip_block *ip_block) else DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n"); + r = amdgpu_sdma_sysfs_reset_mask_init(adev); + if (r) + return r; + return r; } @@ -1375,6 +1397,7 @@ static int sdma_v5_2_sw_fini(struct amdgpu_ip_block *ip_block) for (i = 0; i < adev->sdma.num_instances; i++) amdgpu_ring_fini(&adev->sdma.instance[i].ring); + amdgpu_sdma_sysfs_reset_mask_fini(adev); amdgpu_sdma_destroy_inst_ctx(adev, true); kfree(adev->sdma.ip_dump); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index 5635f2d84090..d46128b0ec92 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -1350,6 +1350,19 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block) return r; } + adev->sdma.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring); + switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) { + case IP_VERSION(6, 0, 0): + case IP_VERSION(6, 0, 2): + case IP_VERSION(6, 0, 3): + if (adev->sdma.instance[0].fw_version >= 21) + adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + break; + default: + break; + } + if (amdgpu_sdma_ras_sw_init(adev)) { dev_err(adev->dev, "Failed to initialize sdma ras block!\n"); return -EINVAL; @@ -1362,6 +1375,10 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block) else DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n"); + r = amdgpu_sdma_sysfs_reset_mask_init(adev); + if (r) + return r; + return r; } @@ -1373,6 +1390,7 @@ static int sdma_v6_0_sw_fini(struct amdgpu_ip_block *ip_block) for (i = 0; i < adev->sdma.num_instances; i++) amdgpu_ring_fini(&adev->sdma.instance[i].ring); + amdgpu_sdma_sysfs_reset_mask_fini(adev); amdgpu_sdma_destroy_inst_ctx(adev, true); kfree(adev->sdma.ip_dump); diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 533b4b2b432d..ede072758dab 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -90,8 +90,8 @@ static const struct amd_ip_funcs soc15_common_ip_funcs; /* Vega, Raven, Arcturus */ static const struct amdgpu_video_codec_info vega_video_codecs_encode_array[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 2304, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 4096, 0)}, }; static const struct amdgpu_video_codecs vega_video_codecs_encode = diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 93fbb3354720..d6999835918f 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -49,13 +49,13 @@ static const struct amd_ip_funcs soc21_common_ip_funcs; /* SOC21 */ static const struct amdgpu_video_codec_info vcn_4_0_0_video_codecs_encode_array_vcn0[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1, 8192, 4352, 0)}, }; static const struct amdgpu_video_codec_info vcn_4_0_0_video_codecs_encode_array_vcn1[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)}, }; @@ -96,14 +96,14 @@ static const struct amdgpu_video_codecs vcn_4_0_0_video_codecs_decode_vcn1 = { /* SRIOV SOC21, not const since data is controlled by host */ static struct amdgpu_video_codec_info sriov_vcn_4_0_0_video_codecs_encode_array_vcn0[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 2304, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1, 8192, 4352, 0)}, }; static struct amdgpu_video_codec_info sriov_vcn_4_0_0_video_codecs_encode_array_vcn1[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 2304, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)}, }; static struct amdgpu_video_codecs sriov_vcn_4_0_0_video_codecs_encode_vcn0 = { diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c b/drivers/gpu/drm/amd/amdgpu/soc24.c index 3af10ef4b793..be96de92b2f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc24.c +++ b/drivers/gpu/drm/amd/amdgpu/soc24.c @@ -48,7 +48,7 @@ static const struct amd_ip_funcs soc24_common_ip_funcs; static const struct amdgpu_video_codec_info vcn_5_0_0_video_codecs_encode_array_vcn0[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1, 8192, 4352, 0)}, }; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index 0d5c94bfc0ef..cf808a153fce 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -95,6 +95,13 @@ static void vcn_v4_0_3_unified_ring_set_wptr(struct amdgpu_ring *ring); static void vcn_v4_0_3_set_ras_funcs(struct amdgpu_device *adev); static void vcn_v4_0_3_enable_ras(struct amdgpu_device *adev, int inst_idx, bool indirect); + +static inline bool vcn_v4_0_3_normalizn_reqd(struct amdgpu_device *adev) +{ + return (amdgpu_sriov_vf(adev) || + (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4))); +} + /** * vcn_v4_0_3_early_init - set function pointers * @@ -1428,8 +1435,8 @@ static uint64_t vcn_v4_0_3_unified_ring_get_wptr(struct amdgpu_ring *ring) static void vcn_v4_0_3_enc_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg, uint32_t val, uint32_t mask) { - /* For VF, only local offsets should be used */ - if (amdgpu_sriov_vf(ring->adev)) + /* Use normalized offsets when required */ + if (vcn_v4_0_3_normalizn_reqd(ring->adev)) reg = NORMALIZE_VCN_REG_OFFSET(reg); amdgpu_ring_write(ring, VCN_ENC_CMD_REG_WAIT); @@ -1440,8 +1447,8 @@ static void vcn_v4_0_3_enc_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t static void vcn_v4_0_3_enc_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, uint32_t val) { - /* For VF, only local offsets should be used */ - if (amdgpu_sriov_vf(ring->adev)) + /* Use normalized offsets when required */ + if (vcn_v4_0_3_normalizn_reqd(ring->adev)) reg = NORMALIZE_VCN_REG_OFFSET(reg); amdgpu_ring_write(ring, VCN_ENC_CMD_REG_WRITE); diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c index b3fa54c0514e..a83505815d39 100644 --- a/drivers/gpu/drm/amd/amdgpu/vi.c +++ b/drivers/gpu/drm/amd/amdgpu/vi.c @@ -136,15 +136,15 @@ static const struct amdgpu_video_codec_info polaris_video_codecs_encode_array[] { .codec_type = AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, .max_width = 4096, - .max_height = 2304, - .max_pixels_per_frame = 4096 * 2304, + .max_height = 4096, + .max_pixels_per_frame = 4096 * 4096, .max_level = 0, }, { .codec_type = AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, .max_width = 4096, - .max_height = 2304, - .max_pixels_per_frame = 4096 * 2304, + .max_height = 4096, + .max_pixels_per_frame = 4096 * 4096, .max_level = 0, }, }; |
