summaryrefslogtreecommitdiff
path: root/drivers/accel/habanalabs/gaudi2
diff options
context:
space:
mode:
authorTomer Tayar <ttayar@habana.ai>2023-11-06 18:41:35 +0200
committerOded Gabbay <ogabbay@kernel.org>2023-12-19 11:09:43 +0200
commitae303d885d4a0fcea65330de9327d28edfebd206 (patch)
tree564cfc67a3e9b256e2dcd7c405b39d15212b617c /drivers/accel/habanalabs/gaudi2
parent4b0b1fbc7757169b6d304545a321c7a88f13f8f0 (diff)
downloadlwn-ae303d885d4a0fcea65330de9327d28edfebd206.tar.gz
lwn-ae303d885d4a0fcea65330de9327d28edfebd206.zip
accel/habanalabs/gaudi2: get the correct QM CQ info upon an error
Upon a QM error, the address/size from both the CQ and the ARC_CQ are printed, although the instruction that led to the error was received from only one of them. Moreover, in case of a QM undefined opcode, only one of these address/size sets will be captured based on the value of ARC_CQ_PTR. However, this value can be non-zero even if currently the CQ is used, in case the CQ/ARC_CQ are alternately used. Under the assumption of having a stop-on-error configuration, modify to use CP_STS.CUR_CQ field to get the relevant CQ for the QM error. Signed-off-by: Tomer Tayar <ttayar@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/accel/habanalabs/gaudi2')
-rw-r--r--drivers/accel/habanalabs/gaudi2/gaudi2.c44
1 files changed, 22 insertions, 22 deletions
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 5075f92d15cc..77c480725a84 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -7860,36 +7860,36 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u64 event_mask)
{
- u32 lo, hi, cq_ptr_size, arc_cq_ptr_size;
- u64 cq_ptr, arc_cq_ptr, cp_current_inst;
-
- lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
- hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET);
- cq_ptr = ((u64) hi) << 32 | lo;
- cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
-
- lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET);
- hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET);
- arc_cq_ptr = ((u64) hi) << 32 | lo;
- arc_cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET);
+ u32 lo, hi, cq_ptr_size, cp_sts;
+ u64 cq_ptr, cp_current_inst;
+ bool is_arc_cq;
+
+ cp_sts = RREG32(qman_base + QM_CP_STS_4_OFFSET);
+ is_arc_cq = FIELD_GET(PDMA0_QM_CP_STS_CUR_CQ_MASK, cp_sts); /* 0 - legacy CQ, 1 - ARC_CQ */
+
+ if (is_arc_cq) {
+ lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET);
+ hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET);
+ cq_ptr = ((u64) hi) << 32 | lo;
+ cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET);
+ } else {
+ lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
+ hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET);
+ cq_ptr = ((u64) hi) << 32 | lo;
+ cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
+ }
lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET);
hi = RREG32(qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET);
cp_current_inst = ((u64) hi) << 32 | lo;
dev_info(hdev->dev,
- "LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n",
- cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, cp_current_inst);
+ "LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n",
+ is_arc_cq ? "ARC_" : "", cq_ptr, cq_ptr_size, cp_current_inst);
if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
- if (arc_cq_ptr) {
- hdev->captured_err_info.undef_opcode.cq_addr = arc_cq_ptr;
- hdev->captured_err_info.undef_opcode.cq_size = arc_cq_ptr_size;
- } else {
- hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
- hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size;
- }
-
+ hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
+ hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size;
hdev->captured_err_info.undef_opcode.stream_id = QMAN_STREAMS;
}
}