drm/i915: Dynamic Parity Detection handling

On IVB hardware we are given an interrupt whenever a L3 parity error occurs in the L3 cache. The L3 cache is used by internal GPU clients only. This is a very rare occurrence (in fact to test this I need to use specially instrumented silicon). When a row in the L3 cache detects a parity error the HW generates an interrupt. The interrupt is masked in GTIMR until we get a chance to read some registers and alert userspace via a uevent. With this information userspace can use a sysfs interface (follow-up patch) to remap those rows. Way above my level of understanding, but if a given row fails, it is statistically more likely to fail again than a row which has not failed. Therefore it is desirable for an operating system to maintain a lifelong list of failing rows and always remap any bad rows on driver load. Hardware limits the number of rows that are remappable per bank/subbank, and should more than that many rows detect parity errors, software should maintain a list of the most frequent errors, and remap those rows. V2: Drop WARN_ON(IS_GEN6) (Jesse) DRM_DEBUG row/bank/subbank on errror (Jesse) Comment updates (Jesse) Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org> Signed-off-by: Ben Widawsky <ben@bwidawsk.net> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
author: Ben Widawsky <ben@bwidawsk.net> 2012-05-25 16:56:22 -0700
committer: Daniel Vetter <daniel.vetter@ffwll.ch> 2012-05-31 11:53:51 +0200
commit: e36891900855b3bed5d9cc9209655e6dfa435a5f (patch)
tree: a1eeec4c5dba5bbe26e0942c65b312783ce24a0f /drivers/gpu/drm/i915/i915_irq.c
parent: 6c982376de3f81e046e380d27079b823acadf6ad (diff)
download: lwn-e36891900855b3bed5d9cc9209655e6dfa435a5f.tar.gz
lwn-e36891900855b3bed5d9cc9209655e6dfa435a5f.zip
1 files changed, 86 insertions, 0 deletions
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 5134a62ff82f..c5266355973b 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -398,6 +398,86 @@ static void gen6_pm_rps_work(struct work_struct *work)
 	mutex_unlock(&dev_priv->dev->struct_mutex);
 }
 
+
+/**
+ * ivybridge_parity_work - Workqueue called when a parity error interrupt
+ * occurred.
+ * @work: workqueue struct
+ *
+ * Doesn't actually do anything except notify userspace. As a consequence of
+ * this event, userspace should try to remap the bad rows since statistically
+ * it is likely the same row is more likely to go bad again.
+ */
+static void ivybridge_parity_work(struct work_struct *work)
+{
+	drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
+						    parity_error_work);
+	u32 error_status, row, bank, subbank;
+	char *parity_event[5];
+	uint32_t misccpctl;
+	unsigned long flags;
+
+	/* We must turn off DOP level clock gating to access the L3 registers.
+	 * In order to prevent a get/put style interface, acquire struct mutex
+	 * any time we access those registers.
+	 */
+	mutex_lock(&dev_priv->dev->struct_mutex);
+
+	misccpctl = I915_READ(GEN7_MISCCPCTL);
+	I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE);
+	POSTING_READ(GEN7_MISCCPCTL);
+
+	error_status = I915_READ(GEN7_L3CDERRST1);
+	row = GEN7_PARITY_ERROR_ROW(error_status);
+	bank = GEN7_PARITY_ERROR_BANK(error_status);
+	subbank = GEN7_PARITY_ERROR_SUBBANK(error_status);
+
+	I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID |
+				    GEN7_L3CDERRST1_ENABLE);
+	POSTING_READ(GEN7_L3CDERRST1);
+
+	I915_WRITE(GEN7_MISCCPCTL, misccpctl);
+
+	spin_lock_irqsave(&dev_priv->irq_lock, flags);
+	dev_priv->gt_irq_mask &= ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
+	I915_WRITE(GTIMR, dev_priv->gt_irq_mask);
+	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
+
+	mutex_unlock(&dev_priv->dev->struct_mutex);
+
+	parity_event[0] = "L3_PARITY_ERROR=1";
+	parity_event[1] = kasprintf(GFP_KERNEL, "ROW=%d", row);
+	parity_event[2] = kasprintf(GFP_KERNEL, "BANK=%d", bank);
+	parity_event[3] = kasprintf(GFP_KERNEL, "SUBBANK=%d", subbank);
+	parity_event[4] = NULL;
+
+	kobject_uevent_env(&dev_priv->dev->primary->kdev.kobj,
+			   KOBJ_CHANGE, parity_event);
+
+	DRM_DEBUG("Parity error: Row = %d, Bank = %d, Sub bank = %d.\n",
+		  row, bank, subbank);
+
+	kfree(parity_event[3]);
+	kfree(parity_event[2]);
+	kfree(parity_event[1]);
+}
+
+void ivybridge_handle_parity_error(struct drm_device *dev)
+{
+	drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private;
+	unsigned long flags;
+
+	if (!IS_IVYBRIDGE(dev))
+		return;
+
+	spin_lock_irqsave(&dev_priv->irq_lock, flags);
+	dev_priv->gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
+	I915_WRITE(GTIMR, dev_priv->gt_irq_mask);
+	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
+
+	queue_work(dev_priv->wq, &dev_priv->parity_error_work);
+}
+
 static void snb_gt_irq_handler(struct drm_device *dev,
 			       struct drm_i915_private *dev_priv,
 			       u32 gt_iir)
@@ -417,6 +497,9 @@ static void snb_gt_irq_handler(struct drm_device *dev,
 		DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
 		i915_handle_error(dev, false);
 	}
+
+	if (gt_iir & GT_GEN7_L3_PARITY_ERROR_INTERRUPT)
+		ivybridge_handle_parity_error(dev);
 }
 
 static void gen6_queue_rps_work(struct drm_i915_private *dev_priv,
@@ -1641,6 +1724,9 @@ static void ironlake_irq_preinstall(struct drm_device *dev)
 	atomic_set(&dev_priv->irq_received, 0);
 
 
+	if (IS_IVYBRIDGE(dev))
+		INIT_WORK(&dev_priv->parity_error_work, ivybridge_parity_work);
+
 	I915_WRITE(HWSTAM, 0xeffe);
 
 	/* XXX hotplug from PCH */
author	Ben Widawsky <ben@bwidawsk.net>	2012-05-25 16:56:22 -0700
committer	Daniel Vetter <daniel.vetter@ffwll.ch>	2012-05-31 11:53:51 +0200
commit	e36891900855b3bed5d9cc9209655e6dfa435a5f (patch)
tree	a1eeec4c5dba5bbe26e0942c65b312783ce24a0f /drivers/gpu/drm/i915/i915_irq.c
parent	6c982376de3f81e046e380d27079b823acadf6ad (diff)
download	lwn-e36891900855b3bed5d9cc9209655e6dfa435a5f.tar.gz lwn-e36891900855b3bed5d9cc9209655e6dfa435a5f.zip