summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-09-19 16:24:24 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-09-19 16:24:24 -0700
commit574cc4539762561d96b456dbc0544d8898bd4c6e (patch)
tree07d84db8cf9fd30cbde6f539ce3a3f6116593e41 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent3c2edc36a77420d8be05d656019dbc8c31535992 (diff)
parent945b584c94f8c665b2df3834a8a6a8faf256cd5f (diff)
downloadlwn-574cc4539762561d96b456dbc0544d8898bd4c6e.tar.gz
lwn-574cc4539762561d96b456dbc0544d8898bd4c6e.zip
Merge tag 'drm-next-2019-09-18' of git://anongit.freedesktop.org/drm/drm
Pull drm updates from Dave Airlie: "This is the main pull request for 5.4-rc1 merge window. I don't think there is anything outstanding so next week should just be fixes, but we'll see if I missed anything. I landed some fixes earlier in the week but got delayed writing summary and sending it out, due to a mix of sick kid and jetlag! There are some fixes pending, but I'd rather get the main merge out of the way instead of delaying it longer. It's also pretty large in commit count and new amd header file size. The largest thing is four new amdgpu products (navi12/14, arcturus and renoir APU support). Otherwise it's pretty much lots of work across the board, i915 has started landing tigerlake support, lots of icelake fixes and lots of locking reworking for future gpu support, lots of header file rework (drmP.h is nearly gone), some old legacy hacks (DRM_WAIT_ON) have been put into the places they are needed. uapi: - content protection type property for HDCP core: - rework include dependencies - lots of drmP.h removals - link rate calculation robustness fix - make fb helper map only when required - add connector->DDC adapter link - DRM_WAIT_ON removed - drop DRM_AUTH usage from drivers dma-buf: - reservation object fence helper dma-fence: - shrink dma_fence struct - merge signal functions - store timestamps in dma_fence - selftests ttm: - embed drm_get_object struct into ttm_buffer_object - release_notify callback bridges: - sii902x - audio graph card support - tc358767 - aux data handling rework - ti-snd64dsi86 - debugfs support, DSI mode flags support panels: - Support for GiantPlus GPM940B0, Sharp LQ070Y3DG3B, Ortustech COM37H3M, Novatek NT39016, Sharp LS020B1DD01D, Raydium RM67191, Boe Himax8279d, Sharp LD-D5116Z01B - TI nspire, NEC NL8048HL11, LG Philips LB035Q02, Sharp LS037V7DW01, Sony ACX565AKM, Toppoly TD028TTEC1 Toppoly TD043MTEA1 i915: - Initial tigerlake platform support - Locking simplification work, general all over refactoring. - Selftests - HDCP debug info improvements - DSI properties - Icelake display PLL fixes, colorspace fixes, bandwidth fixes, DSI suspend/resume - GuC fixes - Perf fixes - ElkhartLake enablement - DP MST fixes - GVT - command parser enhancements amdgpu: - add wipe memory on release flag for buffer creation - Navi12/14 support (may be marked experimental) - Arcturus support - Renoir APU support - mclk DPM for Navi - DC display fixes - Raven scatter/gather support - RAS support for GFX - Navi12 + Arcturus power features - GPU reset for Picasso - smu11 i2c controller support amdkfd: - navi12/14 support - Arcturus support radeon: - kexec fix nouveau: - improved display color management - detect lack of GPU power cables vmwgfx: - evicition priority support - remove unused security feature msm: - msm8998 display support - better async commit support for cursor updates etnaviv: - per-process address space support - performance counter fixes - softpin support mcde: - DCS transfers fix exynos: - drmP.h cleanup lima: - reduce logging kirin: - misc clenaups komeda: - dual-link support - DT memory regions hisilicon: - misc fixes imx: - IPUv3 image converter fixes - 32-bit RGB V4L2 pixel format support ingenic: - more support for panel related cases mgag200: - cursor support fix panfrost: - export GPU features register to userspace - gpu heap allocations - per-fd address space support pl111: - CLD pads wiring support removed from DT rockchip: - rework to use DRM PSR helpers - fix bug in VOP_WIN_GET macro - DSI DT binding rework sun4i: - improve support for color encoding and range - DDC enabled GPIO tinydrm: - rework SPI support - improve MIPI-DBI support - moved to drm/tiny vkms: - rework CRC tracking dw-hdmi: - get_eld and i2s improvements gm12u320: - misc fixes meson: - global code cleanup - vpu feature detect omap: - alpha/pixel blend mode properties rcar-du: - misc fixes" * tag 'drm-next-2019-09-18' of git://anongit.freedesktop.org/drm/drm: (2112 commits) drm/nouveau/bar/gm20b: Avoid BAR1 teardown during init drm/nouveau: Fix ordering between TTM and GEM release drm/nouveau/prime: Extend DMA reservation object lock drm/nouveau: Fix fallout from reservation object rework drm/nouveau/kms/nv50-: Don't create MSTMs for eDP connectors drm/i915: Use NOEVICT for first pass on attemping to pin a GGTT mmap drm/i915: to make vgpu ppgtt notificaiton as atomic operation drm/i915: Flush the existing fence before GGTT read/write drm/i915: Hold irq-off for the entire fake lock period drm/i915/gvt: update RING_START reg of vGPU when the context is submitted to i915 drm/i915/gvt: update vgpu workload head pointer correctly drm/mcde: Fix DSI transfers drm/msm: Use the correct dma_sync calls harder drm/msm: remove unlikely() from WARN_ON() conditions drm/msm/dsi: Fix return value check for clk_get_parent drm/msm: add atomic traces drm/msm/dpu: async commit support drm/msm: async commit support drm/msm: split power control from prepare/complete_commit drm/msm: add kms->flush_commit() ...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c218
1 files changed, 90 insertions, 128 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fac7aa2c244f..016ea274b955 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -30,74 +30,6 @@
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"
-struct ras_ih_data {
- /* interrupt bottom half */
- struct work_struct ih_work;
- int inuse;
- /* IP callback */
- ras_ih_cb cb;
- /* full of entries */
- unsigned char *ring;
- unsigned int ring_size;
- unsigned int element_size;
- unsigned int aligned_element_size;
- unsigned int rptr;
- unsigned int wptr;
-};
-
-struct ras_fs_data {
- char sysfs_name[32];
- char debugfs_name[32];
-};
-
-struct ras_err_data {
- unsigned long ue_count;
- unsigned long ce_count;
-};
-
-struct ras_err_handler_data {
- /* point to bad pages array */
- struct {
- unsigned long bp;
- struct amdgpu_bo *bo;
- } *bps;
- /* the count of entries */
- int count;
- /* the space can place new entries */
- int space_left;
- /* last reserved entry's index + 1 */
- int last_reserved;
-};
-
-struct ras_manager {
- struct ras_common_if head;
- /* reference count */
- int use;
- /* ras block link */
- struct list_head node;
- /* the device */
- struct amdgpu_device *adev;
- /* debugfs */
- struct dentry *ent;
- /* sysfs */
- struct device_attribute sysfs_attr;
- int attr_inuse;
-
- /* fs node name */
- struct ras_fs_data fs_data;
-
- /* IH data */
- struct ras_ih_data ih_data;
-
- struct ras_err_data err_data;
-};
-
-struct ras_badpage {
- unsigned int bp;
- unsigned int size;
- unsigned int flags;
-};
-
const char *ras_error_string[] = {
"none",
"parity",
@@ -130,6 +62,9 @@ const char *ras_block_string[] = {
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET 2
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
+/* inject address is 52 bits */
+#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
+
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
uint64_t offset, uint64_t size,
struct amdgpu_bo **bo_ptr);
@@ -196,6 +131,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
char err[9] = "ue";
int op = -1;
int block_id;
+ uint32_t sub_block;
u64 address, value;
if (*pos)
@@ -223,17 +159,23 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
return -EINVAL;
data->head.block = block_id;
- data->head.type = memcmp("ue", err, 2) == 0 ?
- AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
- AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
+ /* only ue and ce errors are supported */
+ if (!memcmp("ue", err, 2))
+ data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+ else if (!memcmp("ce", err, 2))
+ data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
+ else
+ return -EINVAL;
+
data->op = op;
if (op == 2) {
- if (sscanf(str, "%*s %*s %*s %llu %llu",
- &address, &value) != 2)
- if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx",
- &address, &value) != 2)
+ if (sscanf(str, "%*s %*s %*s %u %llu %llu",
+ &sub_block, &address, &value) != 3)
+ if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
+ &sub_block, &address, &value) != 3)
return -EINVAL;
+ data->head.sub_block_index = sub_block;
data->inject.address = address;
data->inject.value = value;
}
@@ -278,7 +220,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
* write the struct to the control node.
*
* bash:
- * echo op block [error [address value]] > .../ras/ras_ctrl
+ * echo op block [error [sub_blcok address value]] > .../ras/ras_ctrl
* op: disable, enable, inject
* disable: only block is needed
* enable: block and error are needed
@@ -288,10 +230,11 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
* error: ue, ce
* ue: multi_uncorrectable
* ce: single_correctable
+ * sub_block: sub block index, pass 0 if there is no sub block
*
* here are some examples for bash commands,
- * echo inject umc ue 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
- * echo inject umc ce 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
+ * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
+ * echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
* echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
*
* How to check the result?
@@ -310,7 +253,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
{
struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
struct ras_debug_if data;
- struct amdgpu_bo *bo;
int ret = 0;
ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
@@ -328,17 +270,14 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
break;
case 2:
- ret = amdgpu_ras_reserve_vram(adev,
- data.inject.address, PAGE_SIZE, &bo);
- if (ret) {
- /* address was offset, now it is absolute.*/
- data.inject.address += adev->gmc.vram_start;
- if (data.inject.address > adev->gmc.vram_end)
- break;
- } else
- data.inject.address = amdgpu_bo_gpu_offset(bo);
+ if ((data.inject.address >= adev->gmc.mc_vram_size) ||
+ (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* data.inject.address is offset instead of absolute gpu address */
ret = amdgpu_ras_error_inject(adev, &data.inject);
- amdgpu_ras_release_vram(adev, &bo);
break;
default:
ret = -EINVAL;
@@ -656,14 +595,46 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
struct ras_query_if *info)
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
+ struct ras_err_data err_data = {0, 0, 0, NULL};
if (!obj)
return -EINVAL;
- /* TODO might read the register to read the count */
+
+ switch (info->head.block) {
+ case AMDGPU_RAS_BLOCK__UMC:
+ if (adev->umc.funcs->query_ras_error_count)
+ adev->umc.funcs->query_ras_error_count(adev, &err_data);
+ /* umc query_ras_error_address is also responsible for clearing
+ * error status
+ */
+ if (adev->umc.funcs->query_ras_error_address)
+ adev->umc.funcs->query_ras_error_address(adev, &err_data);
+ break;
+ case AMDGPU_RAS_BLOCK__GFX:
+ if (adev->gfx.funcs->query_ras_error_count)
+ adev->gfx.funcs->query_ras_error_count(adev, &err_data);
+ break;
+ case AMDGPU_RAS_BLOCK__MMHUB:
+ if (adev->mmhub_funcs->query_ras_error_count)
+ adev->mmhub_funcs->query_ras_error_count(adev, &err_data);
+ break;
+ default:
+ break;
+ }
+
+ obj->err_data.ue_count += err_data.ue_count;
+ obj->err_data.ce_count += err_data.ce_count;
info->ue_count = obj->err_data.ue_count;
info->ce_count = obj->err_data.ce_count;
+ if (err_data.ce_count)
+ dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
+ obj->err_data.ce_count, ras_block_str(info->head.block));
+ if (err_data.ue_count)
+ dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
+ obj->err_data.ue_count, ras_block_str(info->head.block));
+
return 0;
}
@@ -684,13 +655,23 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
if (!obj)
return -EINVAL;
- if (block_info.block_id != TA_RAS_BLOCK__UMC) {
+ switch (info->head.block) {
+ case AMDGPU_RAS_BLOCK__GFX:
+ if (adev->gfx.funcs->ras_error_inject)
+ ret = adev->gfx.funcs->ras_error_inject(adev, info);
+ else
+ ret = -EINVAL;
+ break;
+ case AMDGPU_RAS_BLOCK__UMC:
+ case AMDGPU_RAS_BLOCK__MMHUB:
+ ret = psp_ras_trigger_error(&adev->psp, &block_info);
+ break;
+ default:
DRM_INFO("%s error injection is not supported yet\n",
ras_block_str(info->head.block));
- return -EINVAL;
+ ret = -EINVAL;
}
- ret = psp_ras_trigger_error(&adev->psp, &block_info);
if (ret)
DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
ras_block_str(info->head.block),
@@ -707,7 +688,7 @@ int amdgpu_ras_error_cure(struct amdgpu_device *adev,
}
/* get the total error counts on all IPs */
-int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
bool is_ce)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -715,7 +696,7 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
struct ras_err_data data = {0, 0};
if (!con)
- return -EINVAL;
+ return 0;
list_for_each_entry(obj, &con->head, node) {
struct ras_query_if info = {
@@ -723,7 +704,7 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
};
if (amdgpu_ras_error_query(adev, &info))
- return -EINVAL;
+ return 0;
data.ce_count += info.ce_count;
data.ue_count += info.ue_count;
@@ -812,32 +793,8 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
{
struct amdgpu_ras *con =
container_of(attr, struct amdgpu_ras, features_attr);
- struct drm_device *ddev = dev_get_drvdata(dev);
- struct amdgpu_device *adev = ddev->dev_private;
- struct ras_common_if head;
- int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
- int i;
- ssize_t s;
- struct ras_manager *obj;
-
- s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
- for (i = 0; i < ras_block_count; i++) {
- head.block = i;
-
- if (amdgpu_ras_is_feature_enabled(adev, &head)) {
- obj = amdgpu_ras_find_obj(adev, &head);
- s += scnprintf(&buf[s], PAGE_SIZE - s,
- "%s: %s\n",
- ras_block_str(i),
- ras_err_str(obj->head.type));
- } else
- s += scnprintf(&buf[s], PAGE_SIZE - s,
- "%s: disabled\n",
- ras_block_str(i));
- }
-
- return s;
+ return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
}
static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
@@ -1054,6 +1011,7 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
struct ras_ih_data *data = &obj->ih_data;
struct amdgpu_iv_entry entry;
int ret;
+ struct ras_err_data err_data = {0, 0, 0, NULL};
while (data->rptr != data->wptr) {
rmb();
@@ -1068,19 +1026,19 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
* from the callback to udpate the error type/count, etc
*/
if (data->cb) {
- ret = data->cb(obj->adev, &entry);
+ ret = data->cb(obj->adev, &err_data, &entry);
/* ue will trigger an interrupt, and in that case
* we need do a reset to recovery the whole system.
* But leave IP do that recovery, here we just dispatch
* the error.
*/
- if (ret == AMDGPU_RAS_UE) {
- obj->err_data.ue_count++;
+ if (ret == AMDGPU_RAS_SUCCESS) {
+ /* these counts could be left as 0 if
+ * some blocks do not count error number
+ */
+ obj->err_data.ue_count += err_data.ue_count;
+ obj->err_data.ce_count += err_data.ce_count;
}
- /* Might need get ce count by register, but not all IP
- * saves ce count, some IP just use one bit or two bits
- * to indicate ce happened.
- */
}
}
}
@@ -1577,6 +1535,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
if (amdgpu_ras_fs_init(adev))
goto fs_out;
+ /* ras init for each ras block */
+ if (adev->umc.funcs->ras_init)
+ adev->umc.funcs->ras_init(adev);
+
DRM_INFO("RAS INFO: ras initialized successfully, "
"hardware ability[%x] ras_mask[%x]\n",
con->hw_supported, con->supported);