diff options
author | Oded Gabbay <oded.gabbay@gmail.com> | 2019-02-16 00:39:20 +0200 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2019-02-18 09:46:45 +0100 |
commit | f8c8c7d5f1b0ea85fe6b4fe2dc63d72774a29184 (patch) | |
tree | c04933e9efcdcf80b3f57ddd22bb31d17c76eacb /drivers/misc/habanalabs/goya/goya.c | |
parent | d91389bc839d724cd8df7ca308dde97beca9b0c5 (diff) | |
download | lwn-f8c8c7d5f1b0ea85fe6b4fe2dc63d72774a29184.tar.gz lwn-f8c8c7d5f1b0ea85fe6b4fe2dc63d72774a29184.zip |
habanalabs: add device reset support
This patch adds support for doing various on-the-fly reset of Goya.
The driver supports two types of resets:
1. soft-reset
2. hard-reset
Soft-reset is done when the device detects a timeout of a command
submission that was given to the device. The soft-reset process only resets
the engines that are relevant for the submission of compute jobs, i.e. the
DMA channels, the TPCs and the MME. The purpose is to bring the device as
fast as possible to a working state.
Hard-reset is done in several cases:
1. After soft-reset is done but the device is not responding
2. When fatal errors occur inside the device, e.g. ECC error
3. When the driver is removed
Hard-reset performs a reset of the entire chip except for the PCI
controller and the PLLs. It is a much longer process then soft-reset but it
helps to recover the device without the need to reboot the Host.
After hard-reset, the driver will restore the max power attribute and in
case of manual power management, the frequencies that were set.
This patch also adds two entries to the sysfs, which allows the root user
to initiate a soft or hard reset.
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'drivers/misc/habanalabs/goya/goya.c')
-rw-r--r-- | drivers/misc/habanalabs/goya/goya.c | 226 |
1 files changed, 225 insertions, 1 deletions
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index d46925d921a3..1fe1d6a1ff9e 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -120,6 +120,130 @@ static const char *goya_axi_name[GOYA_MAX_INITIATORS] = { #define GOYA_ASYC_EVENT_GROUP_NON_FATAL_SIZE 121 +static u32 goya_non_fatal_events[GOYA_ASYC_EVENT_GROUP_NON_FATAL_SIZE] = { + GOYA_ASYNC_EVENT_ID_PCIE_IF, + GOYA_ASYNC_EVENT_ID_TPC0_ECC, + GOYA_ASYNC_EVENT_ID_TPC1_ECC, + GOYA_ASYNC_EVENT_ID_TPC2_ECC, + GOYA_ASYNC_EVENT_ID_TPC3_ECC, + GOYA_ASYNC_EVENT_ID_TPC4_ECC, + GOYA_ASYNC_EVENT_ID_TPC5_ECC, + GOYA_ASYNC_EVENT_ID_TPC6_ECC, + GOYA_ASYNC_EVENT_ID_TPC7_ECC, + GOYA_ASYNC_EVENT_ID_MME_ECC, + GOYA_ASYNC_EVENT_ID_MME_ECC_EXT, + GOYA_ASYNC_EVENT_ID_MMU_ECC, + GOYA_ASYNC_EVENT_ID_DMA_MACRO, + GOYA_ASYNC_EVENT_ID_DMA_ECC, + GOYA_ASYNC_EVENT_ID_CPU_IF_ECC, + GOYA_ASYNC_EVENT_ID_PSOC_MEM, + GOYA_ASYNC_EVENT_ID_PSOC_CORESIGHT, + GOYA_ASYNC_EVENT_ID_SRAM0, + GOYA_ASYNC_EVENT_ID_SRAM1, + GOYA_ASYNC_EVENT_ID_SRAM2, + GOYA_ASYNC_EVENT_ID_SRAM3, + GOYA_ASYNC_EVENT_ID_SRAM4, + GOYA_ASYNC_EVENT_ID_SRAM5, + GOYA_ASYNC_EVENT_ID_SRAM6, + GOYA_ASYNC_EVENT_ID_SRAM7, + GOYA_ASYNC_EVENT_ID_SRAM8, + GOYA_ASYNC_EVENT_ID_SRAM9, + GOYA_ASYNC_EVENT_ID_SRAM10, + GOYA_ASYNC_EVENT_ID_SRAM11, + GOYA_ASYNC_EVENT_ID_SRAM12, + GOYA_ASYNC_EVENT_ID_SRAM13, + GOYA_ASYNC_EVENT_ID_SRAM14, + GOYA_ASYNC_EVENT_ID_SRAM15, + GOYA_ASYNC_EVENT_ID_SRAM16, + GOYA_ASYNC_EVENT_ID_SRAM17, + GOYA_ASYNC_EVENT_ID_SRAM18, + GOYA_ASYNC_EVENT_ID_SRAM19, + GOYA_ASYNC_EVENT_ID_SRAM20, + GOYA_ASYNC_EVENT_ID_SRAM21, + GOYA_ASYNC_EVENT_ID_SRAM22, + GOYA_ASYNC_EVENT_ID_SRAM23, + GOYA_ASYNC_EVENT_ID_SRAM24, + GOYA_ASYNC_EVENT_ID_SRAM25, + GOYA_ASYNC_EVENT_ID_SRAM26, + GOYA_ASYNC_EVENT_ID_SRAM27, + GOYA_ASYNC_EVENT_ID_SRAM28, + GOYA_ASYNC_EVENT_ID_SRAM29, + GOYA_ASYNC_EVENT_ID_GIC500, + GOYA_ASYNC_EVENT_ID_PLL0, + GOYA_ASYNC_EVENT_ID_PLL1, + GOYA_ASYNC_EVENT_ID_PLL3, + GOYA_ASYNC_EVENT_ID_PLL4, + GOYA_ASYNC_EVENT_ID_PLL5, + GOYA_ASYNC_EVENT_ID_PLL6, + GOYA_ASYNC_EVENT_ID_AXI_ECC, + GOYA_ASYNC_EVENT_ID_L2_RAM_ECC, + GOYA_ASYNC_EVENT_ID_PSOC_GPIO_05_SW_RESET, + GOYA_ASYNC_EVENT_ID_PSOC_GPIO_10_VRHOT_ICRIT, + GOYA_ASYNC_EVENT_ID_PCIE_DEC, + GOYA_ASYNC_EVENT_ID_TPC0_DEC, + GOYA_ASYNC_EVENT_ID_TPC1_DEC, + GOYA_ASYNC_EVENT_ID_TPC2_DEC, + GOYA_ASYNC_EVENT_ID_TPC3_DEC, + GOYA_ASYNC_EVENT_ID_TPC4_DEC, + GOYA_ASYNC_EVENT_ID_TPC5_DEC, + GOYA_ASYNC_EVENT_ID_TPC6_DEC, + GOYA_ASYNC_EVENT_ID_TPC7_DEC, + GOYA_ASYNC_EVENT_ID_MME_WACS, + GOYA_ASYNC_EVENT_ID_MME_WACSD, + GOYA_ASYNC_EVENT_ID_CPU_AXI_SPLITTER, + GOYA_ASYNC_EVENT_ID_PSOC_AXI_DEC, + GOYA_ASYNC_EVENT_ID_PSOC, + GOYA_ASYNC_EVENT_ID_TPC0_KRN_ERR, + GOYA_ASYNC_EVENT_ID_TPC1_KRN_ERR, + GOYA_ASYNC_EVENT_ID_TPC2_KRN_ERR, + GOYA_ASYNC_EVENT_ID_TPC3_KRN_ERR, + GOYA_ASYNC_EVENT_ID_TPC4_KRN_ERR, + GOYA_ASYNC_EVENT_ID_TPC5_KRN_ERR, + GOYA_ASYNC_EVENT_ID_TPC6_KRN_ERR, + GOYA_ASYNC_EVENT_ID_TPC7_KRN_ERR, + GOYA_ASYNC_EVENT_ID_TPC0_CMDQ, + GOYA_ASYNC_EVENT_ID_TPC1_CMDQ, + GOYA_ASYNC_EVENT_ID_TPC2_CMDQ, + GOYA_ASYNC_EVENT_ID_TPC3_CMDQ, + GOYA_ASYNC_EVENT_ID_TPC4_CMDQ, + GOYA_ASYNC_EVENT_ID_TPC5_CMDQ, + GOYA_ASYNC_EVENT_ID_TPC6_CMDQ, + GOYA_ASYNC_EVENT_ID_TPC7_CMDQ, + GOYA_ASYNC_EVENT_ID_TPC0_QM, + GOYA_ASYNC_EVENT_ID_TPC1_QM, + GOYA_ASYNC_EVENT_ID_TPC2_QM, + GOYA_ASYNC_EVENT_ID_TPC3_QM, + GOYA_ASYNC_EVENT_ID_TPC4_QM, + GOYA_ASYNC_EVENT_ID_TPC5_QM, + GOYA_ASYNC_EVENT_ID_TPC6_QM, + GOYA_ASYNC_EVENT_ID_TPC7_QM, + GOYA_ASYNC_EVENT_ID_MME_QM, + GOYA_ASYNC_EVENT_ID_MME_CMDQ, + GOYA_ASYNC_EVENT_ID_DMA0_QM, + GOYA_ASYNC_EVENT_ID_DMA1_QM, + GOYA_ASYNC_EVENT_ID_DMA2_QM, + GOYA_ASYNC_EVENT_ID_DMA3_QM, + GOYA_ASYNC_EVENT_ID_DMA4_QM, + GOYA_ASYNC_EVENT_ID_DMA0_CH, + GOYA_ASYNC_EVENT_ID_DMA1_CH, + GOYA_ASYNC_EVENT_ID_DMA2_CH, + GOYA_ASYNC_EVENT_ID_DMA3_CH, + GOYA_ASYNC_EVENT_ID_DMA4_CH, + GOYA_ASYNC_EVENT_ID_TPC0_BMON_SPMU, + GOYA_ASYNC_EVENT_ID_TPC1_BMON_SPMU, + GOYA_ASYNC_EVENT_ID_TPC2_BMON_SPMU, + GOYA_ASYNC_EVENT_ID_TPC3_BMON_SPMU, + GOYA_ASYNC_EVENT_ID_TPC4_BMON_SPMU, + GOYA_ASYNC_EVENT_ID_TPC5_BMON_SPMU, + GOYA_ASYNC_EVENT_ID_TPC6_BMON_SPMU, + GOYA_ASYNC_EVENT_ID_TPC7_BMON_SPMU, + GOYA_ASYNC_EVENT_ID_DMA_BM_CH0, + GOYA_ASYNC_EVENT_ID_DMA_BM_CH1, + GOYA_ASYNC_EVENT_ID_DMA_BM_CH2, + GOYA_ASYNC_EVENT_ID_DMA_BM_CH3, + GOYA_ASYNC_EVENT_ID_DMA_BM_CH4 +}; + static int goya_armcp_info_get(struct hl_device *hdev); static void goya_get_fixed_properties(struct hl_device *hdev) @@ -2447,6 +2571,14 @@ static int goya_hw_init(struct hl_device *hdev) /* Perform read from the device to make sure device is up */ val = RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG); + /* + * Let's mark in the H/W that we have reached this point. We check + * this value in the reset_before_init function to understand whether + * we need to reset the chip before doing H/W init. This register is + * cleared by the H/W upon H/W reset + */ + WREG32(mmPSOC_GLOBAL_CONF_APP_STATUS, HL_DEVICE_HW_STATE_DIRTY); + rc = goya_init_cpu(hdev, GOYA_CPU_TIMEOUT_USEC); if (rc) { dev_err(hdev->dev, "failed to initialize CPU\n"); @@ -2575,6 +2707,14 @@ static void goya_hw_fini(struct hl_device *hdev, bool hard_reset) "Timeout while waiting for device to reset 0x%x\n", status); + if (!hard_reset) { + goya->hw_cap_initialized &= ~(HW_CAP_DMA | HW_CAP_MME | + HW_CAP_GOLDEN | HW_CAP_TPC); + WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, + GOYA_ASYNC_EVENT_ID_SOFT_RESET); + return; + } + /* Chicken bit to re-initiate boot sequencer flow */ WREG32(mmPSOC_GLOBAL_CONF_BOOT_SEQ_RE_START, 1 << PSOC_GLOBAL_CONF_BOOT_SEQ_RE_START_IND_SHIFT); @@ -3184,6 +3324,57 @@ static void goya_print_irq_info(struct hl_device *hdev, u16 event_type) } } +static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr, + size_t irq_arr_size) +{ + struct armcp_unmask_irq_arr_packet *pkt; + size_t total_pkt_size; + long result; + int rc; + + total_pkt_size = sizeof(struct armcp_unmask_irq_arr_packet) + + irq_arr_size; + + /* data should be aligned to 8 bytes in order to ArmCP to copy it */ + total_pkt_size = (total_pkt_size + 0x7) & ~0x7; + + /* total_pkt_size is casted to u16 later on */ + if (total_pkt_size > USHRT_MAX) { + dev_err(hdev->dev, "too many elements in IRQ array\n"); + return -EINVAL; + } + + pkt = kzalloc(total_pkt_size, GFP_KERNEL); + if (!pkt) + return -ENOMEM; + + pkt->length = irq_arr_size / sizeof(irq_arr[0]); + memcpy(&pkt->irqs, irq_arr, irq_arr_size); + + pkt->armcp_pkt.ctl = ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY << + ARMCP_PKT_CTL_OPCODE_SHIFT; + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt, + total_pkt_size, HL_DEVICE_TIMEOUT_USEC, &result); + + if (rc) + dev_err(hdev->dev, "failed to unmask IRQ array\n"); + + kfree(pkt); + + return rc; +} + +static int goya_soft_reset_late_init(struct hl_device *hdev) +{ + /* + * Unmask all IRQs since some could have been received + * during the soft reset + */ + return goya_unmask_irq_arr(hdev, goya_non_fatal_events, + sizeof(goya_non_fatal_events)); +} + static int goya_unmask_irq(struct hl_device *hdev, u16 event_type) { struct armcp_packet pkt; @@ -3245,6 +3436,7 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) dev_err(hdev->dev, "Received H/W interrupt %d, reset the chip\n", event_type); + hl_device_reset(hdev, true, false); break; case GOYA_ASYNC_EVENT_ID_PCIE_DEC: @@ -3310,6 +3502,30 @@ void *goya_get_events_stat(struct hl_device *hdev, u32 *size) return goya->events_stat; } +int goya_send_heartbeat(struct hl_device *hdev) +{ + struct goya_device *goya = hdev->asic_specific; + struct armcp_packet hb_pkt; + long result; + int rc; + + if (!(goya->hw_cap_initialized & HW_CAP_CPU_Q)) + return 0; + + memset(&hb_pkt, 0, sizeof(hb_pkt)); + + hb_pkt.ctl = ARMCP_PACKET_TEST << ARMCP_PKT_CTL_OPCODE_SHIFT; + hb_pkt.value = ARMCP_PACKET_FENCE_VAL; + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt, + sizeof(hb_pkt), HL_DEVICE_TIMEOUT_USEC, &result); + + if ((rc) || (result != ARMCP_PACKET_FENCE_VAL)) + rc = -EIO; + + return rc; +} + static int goya_armcp_info_get(struct hl_device *hdev) { struct goya_device *goya = hdev->asic_specific; @@ -3455,6 +3671,11 @@ out: return rc; } +static enum hl_device_hw_state goya_get_hw_state(struct hl_device *hdev) +{ + return RREG32(mmPSOC_GLOBAL_CONF_APP_STATUS); +} + static const struct hl_asic_funcs goya_funcs = { .early_init = goya_early_init, .early_fini = goya_early_fini, @@ -3484,12 +3705,15 @@ static const struct hl_asic_funcs goya_funcs = { .handle_eqe = goya_handle_eqe, .set_pll_profile = goya_set_pll_profile, .get_events_stat = goya_get_events_stat, + .send_heartbeat = goya_send_heartbeat, .enable_clock_gating = goya_init_clock_gating, .disable_clock_gating = goya_disable_clock_gating, + .soft_reset_late_init = goya_soft_reset_late_init, .hw_queues_lock = goya_hw_queues_lock, .hw_queues_unlock = goya_hw_queues_unlock, .get_eeprom_data = goya_get_eeprom_data, - .send_cpu_message = goya_send_cpu_message + .send_cpu_message = goya_send_cpu_message, + .get_hw_state = goya_get_hw_state }; /* |