diff options
author | Ben Widawsky <ben@bwidawsk.net> | 2012-05-25 16:56:22 -0700 |
---|---|---|
committer | Daniel Vetter <daniel.vetter@ffwll.ch> | 2012-05-31 11:53:51 +0200 |
commit | e36891900855b3bed5d9cc9209655e6dfa435a5f (patch) | |
tree | a1eeec4c5dba5bbe26e0942c65b312783ce24a0f /drivers/gpu/drm/i915/i915_irq.c | |
parent | 6c982376de3f81e046e380d27079b823acadf6ad (diff) | |
download | lwn-e36891900855b3bed5d9cc9209655e6dfa435a5f.tar.gz lwn-e36891900855b3bed5d9cc9209655e6dfa435a5f.zip |
drm/i915: Dynamic Parity Detection handling
On IVB hardware we are given an interrupt whenever a L3 parity error
occurs in the L3 cache. The L3 cache is used by internal GPU clients
only. This is a very rare occurrence (in fact to test this I need to
use specially instrumented silicon).
When a row in the L3 cache detects a parity error the HW generates an
interrupt. The interrupt is masked in GTIMR until we get a chance to
read some registers and alert userspace via a uevent. With this
information userspace can use a sysfs interface (follow-up patch) to
remap those rows.
Way above my level of understanding, but if a given row fails, it is
statistically more likely to fail again than a row which has not failed.
Therefore it is desirable for an operating system to maintain a lifelong
list of failing rows and always remap any bad rows on driver load.
Hardware limits the number of rows that are remappable per bank/subbank,
and should more than that many rows detect parity errors, software
should maintain a list of the most frequent errors, and remap those
rows.
V2: Drop WARN_ON(IS_GEN6) (Jesse)
DRM_DEBUG row/bank/subbank on errror (Jesse)
Comment updates (Jesse)
Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Diffstat (limited to 'drivers/gpu/drm/i915/i915_irq.c')
-rw-r--r-- | drivers/gpu/drm/i915/i915_irq.c | 86 |
1 files changed, 86 insertions, 0 deletions
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 5134a62ff82f..c5266355973b 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -398,6 +398,86 @@ static void gen6_pm_rps_work(struct work_struct *work) mutex_unlock(&dev_priv->dev->struct_mutex); } + +/** + * ivybridge_parity_work - Workqueue called when a parity error interrupt + * occurred. + * @work: workqueue struct + * + * Doesn't actually do anything except notify userspace. As a consequence of + * this event, userspace should try to remap the bad rows since statistically + * it is likely the same row is more likely to go bad again. + */ +static void ivybridge_parity_work(struct work_struct *work) +{ + drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t, + parity_error_work); + u32 error_status, row, bank, subbank; + char *parity_event[5]; + uint32_t misccpctl; + unsigned long flags; + + /* We must turn off DOP level clock gating to access the L3 registers. + * In order to prevent a get/put style interface, acquire struct mutex + * any time we access those registers. + */ + mutex_lock(&dev_priv->dev->struct_mutex); + + misccpctl = I915_READ(GEN7_MISCCPCTL); + I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE); + POSTING_READ(GEN7_MISCCPCTL); + + error_status = I915_READ(GEN7_L3CDERRST1); + row = GEN7_PARITY_ERROR_ROW(error_status); + bank = GEN7_PARITY_ERROR_BANK(error_status); + subbank = GEN7_PARITY_ERROR_SUBBANK(error_status); + + I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID | + GEN7_L3CDERRST1_ENABLE); + POSTING_READ(GEN7_L3CDERRST1); + + I915_WRITE(GEN7_MISCCPCTL, misccpctl); + + spin_lock_irqsave(&dev_priv->irq_lock, flags); + dev_priv->gt_irq_mask &= ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT; + I915_WRITE(GTIMR, dev_priv->gt_irq_mask); + spin_unlock_irqrestore(&dev_priv->irq_lock, flags); + + mutex_unlock(&dev_priv->dev->struct_mutex); + + parity_event[0] = "L3_PARITY_ERROR=1"; + parity_event[1] = kasprintf(GFP_KERNEL, "ROW=%d", row); + parity_event[2] = kasprintf(GFP_KERNEL, "BANK=%d", bank); + parity_event[3] = kasprintf(GFP_KERNEL, "SUBBANK=%d", subbank); + parity_event[4] = NULL; + + kobject_uevent_env(&dev_priv->dev->primary->kdev.kobj, + KOBJ_CHANGE, parity_event); + + DRM_DEBUG("Parity error: Row = %d, Bank = %d, Sub bank = %d.\n", + row, bank, subbank); + + kfree(parity_event[3]); + kfree(parity_event[2]); + kfree(parity_event[1]); +} + +void ivybridge_handle_parity_error(struct drm_device *dev) +{ + drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private; + unsigned long flags; + + if (!IS_IVYBRIDGE(dev)) + return; + + spin_lock_irqsave(&dev_priv->irq_lock, flags); + dev_priv->gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT; + I915_WRITE(GTIMR, dev_priv->gt_irq_mask); + spin_unlock_irqrestore(&dev_priv->irq_lock, flags); + + queue_work(dev_priv->wq, &dev_priv->parity_error_work); +} + static void snb_gt_irq_handler(struct drm_device *dev, struct drm_i915_private *dev_priv, u32 gt_iir) @@ -417,6 +497,9 @@ static void snb_gt_irq_handler(struct drm_device *dev, DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir); i915_handle_error(dev, false); } + + if (gt_iir & GT_GEN7_L3_PARITY_ERROR_INTERRUPT) + ivybridge_handle_parity_error(dev); } static void gen6_queue_rps_work(struct drm_i915_private *dev_priv, @@ -1641,6 +1724,9 @@ static void ironlake_irq_preinstall(struct drm_device *dev) atomic_set(&dev_priv->irq_received, 0); + if (IS_IVYBRIDGE(dev)) + INIT_WORK(&dev_priv->parity_error_work, ivybridge_parity_work); + I915_WRITE(HWSTAM, 0xeffe); /* XXX hotplug from PCH */ |