diff options
author | Jerome Glisse <jglisse@redhat.com> | 2010-03-09 14:45:10 +0000 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2010-04-06 10:42:45 +1000 |
commit | 225758d8ba4fdcc1e8c9cf617fd89529bd4a9596 (patch) | |
tree | a9ac2f23435d4a6db5aa33774ba94d9f0aeb5c4c /drivers/gpu/drm/radeon/radeon_fence.c | |
parent | 95beb690170e6ce918fe53c73a0fcc7cf64d704a (diff) | |
download | lwn-225758d8ba4fdcc1e8c9cf617fd89529bd4a9596.tar.gz lwn-225758d8ba4fdcc1e8c9cf617fd89529bd4a9596.zip |
drm/radeon/kms: fence cleanup + more reliable GPU lockup detection V4
This patch cleanup the fence code, it drops the timeout field of
fence as the time to complete each IB is unpredictable and shouldn't
be bound.
The fence cleanup lead to GPU lockup detection improvement, this
patch introduce a callback, allowing to do asic specific test for
lockup detection. In this patch the CP is use as a first indicator
of GPU lockup. If CP doesn't make progress during 1second we assume
we are facing a GPU lockup.
To avoid overhead of testing GPU lockup frequently due to fence
taking time to be signaled we query the lockup callback every
500msec. There is plenty code comment explaining the design & choise
inside the code.
This have been tested mostly on R3XX/R5XX hw, in normal running
destkop (compiz firefox, quake3 running) the lockup callback wasn't
call once (1 hour session). Also tested with forcing GPU lockup and
lockup was reported after the 1s CP activity timeout.
V2 switch to 500ms timeout so GPU lockup get call at least 2 times
in less than 2sec.
V3 store last jiffies in fence struct so on ERESTART, EBUSY we keep
track of how long we already wait for a given fence
V4 make sure we got up to date cp read pointer so we don't have
false positive
Signed-off-by: Jerome Glisse <jglisse@redhat.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
Diffstat (limited to 'drivers/gpu/drm/radeon/radeon_fence.c')
-rw-r--r-- | drivers/gpu/drm/radeon/radeon_fence.c | 102 |
1 files changed, 55 insertions, 47 deletions
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index 8495d4e32e18..393154268dea 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -57,7 +57,6 @@ int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence *fence) radeon_fence_ring_emit(rdev, fence); fence->emited = true; - fence->timeout = jiffies + ((2000 * HZ) / 1000); list_del(&fence->list); list_add_tail(&fence->list, &rdev->fence_drv.emited); write_unlock_irqrestore(&rdev->fence_drv.lock, irq_flags); @@ -70,15 +69,34 @@ static bool radeon_fence_poll_locked(struct radeon_device *rdev) struct list_head *i, *n; uint32_t seq; bool wake = false; + unsigned long cjiffies; - if (rdev == NULL) { - return true; - } - if (rdev->shutdown) { - return true; - } seq = RREG32(rdev->fence_drv.scratch_reg); - rdev->fence_drv.last_seq = seq; + if (seq != rdev->fence_drv.last_seq) { + rdev->fence_drv.last_seq = seq; + rdev->fence_drv.last_jiffies = jiffies; + rdev->fence_drv.last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT; + } else { + cjiffies = jiffies; + if (time_after(cjiffies, rdev->fence_drv.last_jiffies)) { + cjiffies -= rdev->fence_drv.last_jiffies; + if (time_after(rdev->fence_drv.last_timeout, cjiffies)) { + /* update the timeout */ + rdev->fence_drv.last_timeout -= cjiffies; + } else { + /* the 500ms timeout is elapsed we should test + * for GPU lockup + */ + rdev->fence_drv.last_timeout = 1; + } + } else { + /* wrap around update last jiffies, we will just wait + * a little longer + */ + rdev->fence_drv.last_jiffies = cjiffies; + } + return false; + } n = NULL; list_for_each(i, &rdev->fence_drv.emited) { fence = list_entry(i, struct radeon_fence, list); @@ -170,9 +188,8 @@ bool radeon_fence_signaled(struct radeon_fence *fence) int radeon_fence_wait(struct radeon_fence *fence, bool intr) { struct radeon_device *rdev; - unsigned long cur_jiffies; - unsigned long timeout; - bool expired = false; + unsigned long irq_flags, timeout; + u32 seq; int r; if (fence == NULL) { @@ -183,14 +200,10 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr) if (radeon_fence_signaled(fence)) { return 0; } - + timeout = rdev->fence_drv.last_timeout; retry: - cur_jiffies = jiffies; - timeout = HZ / 100; - if (time_after(fence->timeout, cur_jiffies)) { - timeout = fence->timeout - cur_jiffies; - } - + /* save current sequence used to check for GPU lockup */ + seq = rdev->fence_drv.last_seq; if (intr) { radeon_irq_kms_sw_irq_get(rdev); r = wait_event_interruptible_timeout(rdev->fence_drv.queue, @@ -205,38 +218,34 @@ retry: radeon_irq_kms_sw_irq_put(rdev); } if (unlikely(!radeon_fence_signaled(fence))) { - if (unlikely(r == 0)) { - expired = true; + /* we were interrupted for some reason and fence isn't + * isn't signaled yet, resume wait + */ + if (r) { + timeout = r; + goto retry; } - if (unlikely(expired)) { - timeout = 1; - if (time_after(cur_jiffies, fence->timeout)) { - timeout = cur_jiffies - fence->timeout; - } - timeout = jiffies_to_msecs(timeout); - if (timeout > 500) { - DRM_ERROR("fence(%p:0x%08X) %lums timeout " - "going to reset GPU\n", - fence, fence->seq, timeout); - radeon_gpu_reset(rdev); - WREG32(rdev->fence_drv.scratch_reg, fence->seq); - } + /* don't protect read access to rdev->fence_drv.last_seq + * if we experiencing a lockup the value doesn't change + */ + if (seq == rdev->fence_drv.last_seq && radeon_gpu_is_lockup(rdev)) { + /* good news we believe it's a lockup */ + dev_warn(rdev->dev, "GPU lockup (last fence id 0x%08X)\n", seq); + r = radeon_gpu_reset(rdev); + if (r) + return r; + /* FIXME: what should we do ? marking everyone + * as signaled for now + */ + WREG32(rdev->fence_drv.scratch_reg, fence->seq); } + timeout = RADEON_FENCE_JIFFIES_TIMEOUT; + write_lock_irqsave(&rdev->fence_drv.lock, irq_flags); + rdev->fence_drv.last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT; + rdev->fence_drv.last_jiffies = jiffies; + write_unlock_irqrestore(&rdev->fence_drv.lock, irq_flags); goto retry; } - if (unlikely(expired)) { - rdev->fence_drv.count_timeout++; - cur_jiffies = jiffies; - timeout = 1; - if (time_after(cur_jiffies, fence->timeout)) { - timeout = cur_jiffies - fence->timeout; - } - timeout = jiffies_to_msecs(timeout); - DRM_ERROR("fence(%p:0x%08X) %lums timeout\n", - fence, fence->seq, timeout); - DRM_ERROR("last signaled fence(0x%08X)\n", - rdev->fence_drv.last_seq); - } return 0; } @@ -332,7 +341,6 @@ int radeon_fence_driver_init(struct radeon_device *rdev) INIT_LIST_HEAD(&rdev->fence_drv.created); INIT_LIST_HEAD(&rdev->fence_drv.emited); INIT_LIST_HEAD(&rdev->fence_drv.signaled); - rdev->fence_drv.count_timeout = 0; init_waitqueue_head(&rdev->fence_drv.queue); rdev->fence_drv.initialized = true; write_unlock_irqrestore(&rdev->fence_drv.lock, irq_flags); |