diff options
author | Ofir Bitton <obitton@habana.ai> | 2020-07-21 10:49:51 +0300 |
---|---|---|
committer | Oded Gabbay <oded.gabbay@gmail.com> | 2020-09-22 18:49:49 +0300 |
commit | 0a068adde505a90ece23caaf19b77567e1d18298 (patch) | |
tree | bb4a8b951eef94143747d05ac8ef68c19d1549ba /drivers/misc | |
parent | a98d73c7fae486f7fea83bdec8599d4fccb6807d (diff) | |
download | lwn-0a068adde505a90ece23caaf19b77567e1d18298.tar.gz lwn-0a068adde505a90ece23caaf19b77567e1d18298.zip |
habanalabs: add information about PCIe controller
Update firmware header with new API for getting pcie info
such as tx/rx throughput and replay counter.
These counters are needed by customers for monitor and maintenance
of multiple devices.
Add new opcodes to the INFO ioctl to retrieve these counters.
Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Diffstat (limited to 'drivers/misc')
-rw-r--r-- | drivers/misc/habanalabs/common/firmware_if.c | 48 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs.h | 4 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs_ioctl.c | 41 | ||||
-rw-r--r-- | drivers/misc/habanalabs/gaudi/gaudi.c | 4 | ||||
-rw-r--r-- | drivers/misc/habanalabs/goya/goya.c | 4 | ||||
-rw-r--r-- | drivers/misc/habanalabs/include/common/armcp_if.h | 10 |
6 files changed, 111 insertions, 0 deletions
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index f52bc690dfc5..61f5edc96e16 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -363,6 +363,54 @@ out: return rc; } +int hl_fw_armcp_pci_counters_get(struct hl_device *hdev, + struct hl_info_pci_counters *counters) +{ + struct armcp_packet pkt = {}; + long result; + int rc; + + pkt.ctl = cpu_to_le32(ARMCP_PACKET_PCIE_THROUGHPUT_GET << + ARMCP_PKT_CTL_OPCODE_SHIFT); + + /* Fetch PCI rx counter */ + pkt.index = cpu_to_le32(armcp_pcie_throughput_rx); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_ARMCP_INFO_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle ArmCP PCI info pkt, error %d\n", rc); + return rc; + } + counters->rx_throughput = result; + + /* Fetch PCI tx counter */ + pkt.index = cpu_to_le32(armcp_pcie_throughput_tx); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_ARMCP_INFO_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle ArmCP PCI info pkt, error %d\n", rc); + return rc; + } + counters->tx_throughput = result; + + /* Fetch PCI replay counter */ + pkt.ctl = cpu_to_le32(ARMCP_PACKET_PCIE_REPLAY_CNT_GET << + ARMCP_PKT_CTL_OPCODE_SHIFT); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_ARMCP_INFO_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle ArmCP PCI info pkt, error %d\n", rc); + return rc; + } + counters->replay_cnt = (u32) result; + + return rc; +} + static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg) { u32 err_val; diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index f97eebc64979..2c9fcb513215 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -1483,6 +1483,7 @@ struct hl_device_idle_busy_ts { * @soft_reset_cnt: number of soft reset since the driver was loaded. * @hard_reset_cnt: number of hard reset since the driver was loaded. * @idle_busy_ts_idx: index of current entry in idle_busy_ts_arr + * @clk_throttling_reason: bitmask represents the current clk throttling reasons * @id: device minor. * @id_control: minor of the control device * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit @@ -1587,6 +1588,7 @@ struct hl_device { u32 soft_reset_cnt; u32 hard_reset_cnt; u32 idle_busy_ts_idx; + u32 clk_throttling_reason; u16 id; u16 id_control; u16 cpu_pci_msb_addr; @@ -1841,6 +1843,8 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, int hl_fw_send_heartbeat(struct hl_device *hdev); int hl_fw_armcp_info_get(struct hl_device *hdev); int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size); +int hl_fw_armcp_pci_counters_get(struct hl_device *hdev, + struct hl_info_pci_counters *counters); int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, u32 msg_to_cpu_reg, u32 cpu_msg_status_reg, u32 boot_err0_reg, bool skip_bmc, diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 5af1c03da473..4d838b1a3bbe 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -276,6 +276,41 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args) min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0; } +static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_info_pci_counters pci_counters = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + rc = hl_fw_armcp_pci_counters_get(hdev, &pci_counters); + if (rc) + return rc; + + return copy_to_user(out, &pci_counters, + min((size_t) max_size, sizeof(pci_counters))) ? -EFAULT : 0; +} + +static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_info_clk_throttle clk_throttle = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason; + + return copy_to_user(out, &clk_throttle, + min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0; +} + static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args) { struct hl_device *hdev = hpriv->hdev; @@ -360,6 +395,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_CS_COUNTERS: return cs_counters_info(hpriv, args); + case HL_INFO_PCI_COUNTERS: + return pci_counters_info(hpriv, args); + + case HL_INFO_CLK_THROTTLE_REASON: + return clk_throttle_info(hpriv, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -ENOTTY; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 4009b7df4caf..adb5c5594ac1 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -5653,21 +5653,25 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev, { switch (event_type) { case GAUDI_EVENT_FIX_POWER_ENV_S: + hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER; dev_info_ratelimited(hdev->dev, "Clock throttling due to power consumption\n"); break; case GAUDI_EVENT_FIX_POWER_ENV_E: + hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER; dev_info_ratelimited(hdev->dev, "Power envelop is safe, back to optimal clock\n"); break; case GAUDI_EVENT_FIX_THERMAL_ENV_S: + hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL; dev_info_ratelimited(hdev->dev, "Clock throttling due to overheating\n"); break; case GAUDI_EVENT_FIX_THERMAL_ENV_E: + hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL; dev_info_ratelimited(hdev->dev, "Thermal envelop is safe, back to optimal clock\n"); break; diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 33cd2ae653d2..954f2c022d33 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -4580,18 +4580,22 @@ static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type) { switch (event_type) { case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S: + hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER; dev_info_ratelimited(hdev->dev, "Clock throttling due to power consumption\n"); break; case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E: + hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER; dev_info_ratelimited(hdev->dev, "Power envelop is safe, back to optimal clock\n"); break; case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S: + hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL; dev_info_ratelimited(hdev->dev, "Clock throttling due to overheating\n"); break; case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E: + hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL; dev_info_ratelimited(hdev->dev, "Thermal envelop is safe, back to optimal clock\n"); break; diff --git a/drivers/misc/habanalabs/include/common/armcp_if.h b/drivers/misc/habanalabs/include/common/armcp_if.h index 07f9972db28d..1403c937253c 100644 --- a/drivers/misc/habanalabs/include/common/armcp_if.h +++ b/drivers/misc/habanalabs/include/common/armcp_if.h @@ -243,6 +243,8 @@ enum armcp_packet_id { ARMCP_PACKET_TEMPERATURE_SET, /* sysfs */ ARMCP_PACKET_VOLTAGE_SET, /* sysfs */ ARMCP_PACKET_CURRENT_SET, /* sysfs */ + ARMCP_PACKET_PCIE_THROUGHPUT_GET, /* internal */ + ARMCP_PACKET_PCIE_REPLAY_CNT_GET, /* internal */ }; #define ARMCP_PACKET_FENCE_VAL 0xFE8CE7A5 @@ -277,6 +279,9 @@ struct armcp_packet { __u8 pad; /* unused */ }; + /* For any general request */ + __le32 index; + /* For frequency get/set */ __le32 pll_index; @@ -344,6 +349,11 @@ enum armcp_pwm_attributes { armcp_pwm_enable }; +enum armcp_pcie_throughput_attributes { + armcp_pcie_throughput_tx, + armcp_pcie_throughput_rx +}; + /* Event Queue Packets */ struct eq_generic_event { |