summaryrefslogtreecommitdiff
path: root/drivers/misc/habanalabs/command_submission.c
diff options
context:
space:
mode:
author Oded Gabbay <oded.gabbay@gmail.com> 2019-02-28 10:46:21 +0200
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2019-02-28 13:07:52 +0100
commit af5f7eea45e1b177db961c4706625f4cf545c063 (patch)
tree a59b3f71bb2aa0dbd1f7e39129f5e30b75a11a4e /drivers/misc/habanalabs/command_submission.c
parent efaa281219fd37cb1ee5cdef483aa67a16b0a087 (diff)
downloadlwn-af5f7eea45e1b177db961c4706625f4cf545c063.tar.gz
lwn-af5f7eea45e1b177db961c4706625f4cf545c063.zip
habanalabs: soft-reset device if context-switch fails
This patch fixes a bug in the driver: if the TPC or MME remains in a non-IDLE state even after all the command submissions are done (due to a user bug or a malicious user), then future command submissions will fail in the context-switch stage and the driver will remain in "stuck" mode. The fix is to do a soft-reset of the device in case the context-switch fails, because the device should be IDLE during context-switch. If it is not IDLE, then something is wrong and we should reset the compute engines. Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'drivers/misc/habanalabs/command_submission.c')
-rw-r--r-- drivers/misc/habanalabs/command_submission.c | 16
1 files changed, 9 insertions, 7 deletions
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index 25ad9d805cfa..3525236ed8d9 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -622,13 +622,15 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
"Failed to switch to context %d, rejecting CS! %d\n",
ctx->asid, rc);
/*
- * If we timedout, we need to soft-reset because
- * QMAN is probably stuck. However, we can't
- * call to reset here directly because of
- * deadlock, so need to do it at the very end
- * of this function
+ * If we timedout, or if the device is not IDLE
+ * while we want to do context-switch (-EBUSY),
+ * we need to soft-reset because QMAN is
+ * probably stuck. However, we can't call to
+ * reset here directly because of deadlock, so
+ * need to do it at the very end of this
+ * function
*/
- if (rc == -ETIMEDOUT)
+ if ((rc == -ETIMEDOUT) || (rc == -EBUSY))
need_soft_reset = true;
mutex_unlock(&hpriv->restore_phase_mutex);
goto out;
@@ -706,7 +708,7 @@ out:
args->out.seq = cs_seq;
}
- if ((rc == -ETIMEDOUT) && (need_soft_reset))
+ if (((rc == -ETIMEDOUT) || (rc == -EBUSY)) && (need_soft_reset))
hl_device_reset(hdev, false, false);
return rc;