diff options
author | Mukul Joshi <mukul.joshi@amd.com> | 2023-04-11 16:32:38 -0400 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2023-04-13 00:14:24 -0400 |
commit | dd299441654fd8209056c7985ddf2373ebaba6ed (patch) | |
tree | de0f5e5597e9deda68821a0c623c7bdc499d8ab9 /drivers/gpu | |
parent | 318e431b306e966d2ee99e900a11bdc9a701ee83 (diff) | |
download | lwn-dd299441654fd8209056c7985ddf2373ebaba6ed.tar.gz lwn-dd299441654fd8209056c7985ddf2373ebaba6ed.zip |
drm/amdgpu: Rework retry fault removal
Rework retry fault removal from the software filter by
storing an expired timestamp for a fault that is being removed.
When a new fault comes, and it matches an entry in the sw filter,
it will be added as a new fault only when its timestamp is greater
than the timestamp expiry of the fault in the sw filter.
This helps in avoiding stale faults being added back into the
filter and preventing legitimate faults from being handled.
Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 36 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 1 |
2 files changed, 34 insertions, 3 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index 88bc7f5f46e6..9b0ccb1b84c6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -395,8 +395,21 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, while (fault->timestamp >= stamp) { uint64_t tmp; - if (atomic64_read(&fault->key) == key) - return true; + if (atomic64_read(&fault->key) == key) { + /* + * if we get a fault which is already present in + * the fault_ring and the timestamp of + * the fault is after the expired timestamp, + * then this is a new fault that needs to be added + * into the fault ring. + */ + if (fault->timestamp_expiry != 0 && + amdgpu_ih_ts_after(fault->timestamp_expiry, + timestamp)) + break; + else + return true; + } tmp = fault->timestamp; fault = &gmc->fault_ring[fault->next]; @@ -432,15 +445,32 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr, { struct amdgpu_gmc *gmc = &adev->gmc; uint64_t key = amdgpu_gmc_fault_key(addr, pasid); + struct amdgpu_ih_ring *ih; struct amdgpu_gmc_fault *fault; + uint32_t last_wptr; + uint64_t last_ts; uint32_t hash; uint64_t tmp; + ih = adev->irq.retry_cam_enabled ? &adev->irq.ih_soft : &adev->irq.ih1; + /* Get the WPTR of the last entry in IH ring */ + last_wptr = amdgpu_ih_get_wptr(adev, ih); + /* Order wptr with ring data. */ + rmb(); + /* Get the timetamp of the last entry in IH ring */ + last_ts = amdgpu_ih_decode_iv_ts(adev, ih, last_wptr, -1); + hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER); fault = &gmc->fault_ring[gmc->fault_hash[hash].idx]; do { - if (atomic64_cmpxchg(&fault->key, key, 0) == key) + if (atomic64_read(&fault->key) == key) { + /* + * Update the timestamp when this fault + * expired. + */ + fault->timestamp_expiry = last_ts; break; + } tmp = fault->timestamp; fault = &gmc->fault_ring[fault->next]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index 232523e3e270..6d105d7fb98b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -70,6 +70,7 @@ struct amdgpu_gmc_fault { uint64_t timestamp:48; uint64_t next:AMDGPU_GMC_FAULT_RING_ORDER; atomic64_t key; + uint64_t timestamp_expiry:48; }; /* |