Diffstat (limited to 'drivers/infiniband/hw')
64 files changed, 2921 insertions(+), 321 deletions(-)
diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 3721446c6ba4..6df5a2738c95 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -53,12 +53,6 @@ #define BNXT_RE_MAX_MR_SIZE_HIGH BIT_ULL(39) #define BNXT_RE_MAX_MR_SIZE BNXT_RE_MAX_MR_SIZE_HIGH -#define BNXT_RE_MAX_QPC_COUNT (64 * 1024) -#define BNXT_RE_MAX_MRW_COUNT (64 * 1024) -#define BNXT_RE_MAX_SRQC_COUNT (64 * 1024) -#define BNXT_RE_MAX_CQ_COUNT (64 * 1024) -#define BNXT_RE_MAX_MRW_COUNT_64K (64 * 1024) -#define BNXT_RE_MAX_MRW_COUNT_256K (256 * 1024) /* Number of MRs to reserve for PF, leaving remainder for VFs */ #define BNXT_RE_RESVD_MR_FOR_PF (32 * 1024) @@ -231,6 +225,8 @@ struct bnxt_re_dev { unsigned long event_bitmap; struct bnxt_qplib_cc_param cc_param; struct workqueue_struct *dcb_wq; + struct dentry *cc_config; + struct bnxt_re_dbg_cc_config_params *cc_config_params; }; #define to_bnxt_re_dev(ptr, member) \ @@ -243,6 +239,10 @@ struct bnxt_re_dev { #define BNXT_RE_CHECK_RC(x) ((x) && ((x) != -ETIMEDOUT)) void bnxt_re_pacing_alert(struct bnxt_re_dev *rdev); +int bnxt_re_assign_pma_port_counters(struct bnxt_re_dev *rdev, struct ib_mad *out_mad); +int bnxt_re_assign_pma_port_ext_counters(struct bnxt_re_dev *rdev, + struct ib_mad *out_mad); + static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) { if (rdev) diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c index 7c47039044ef..af91d16c3c77 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.c +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -22,6 +22,23 @@ static struct dentry *bnxt_re_debugfs_root; +static const char * const bnxt_re_cc_gen0_name[] = { + "enable_cc", + "run_avg_weight_g", + "num_phase_per_state", + "init_cr", + "init_tr", + "tos_ecn", + "tos_dscp", + "alt_vlan_pcp", + "alt_vlan_dscp", + "rtt", + "cc_mode", + "tcp_cp", + "tx_queue", + "inactivity_cp", +}; + static inline const char *bnxt_re_qp_state_str(u8 state) { switch (state) { @@ -110,19 +127,215 @@ void bnxt_re_debug_rem_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp) debugfs_remove(qp->dentry); } +static int map_cc_config_offset_gen0_ext0(u32 offset, struct bnxt_qplib_cc_param *ccparam, u32 *val) +{ + u64 map_offset; + + map_offset = BIT(offset); + + switch (map_offset) { + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC: + *val = ccparam->enable; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_G: + *val = ccparam->g; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_NUMPHASEPERSTATE: + *val = ccparam->nph_per_state; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INIT_CR: + *val = ccparam->init_cr; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INIT_TR: + *val = ccparam->init_tr; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_ECN: + *val = ccparam->tos_ecn; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_DSCP: + *val = ccparam->tos_dscp; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ALT_VLAN_PCP: + *val = ccparam->alt_vlan_pcp; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ALT_TOS_DSCP: + *val = ccparam->alt_tos_dscp; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_RTT: + *val = ccparam->rtt; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_CC_MODE: + *val = ccparam->cc_mode; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TCP_CP: + *val = ccparam->tcp_cp; + break; + default: + return -EINVAL; + } + + return 0; +} + +static ssize_t bnxt_re_cc_config_get(struct file *filp, char __user *buffer, + size_t usr_buf_len, loff_t *ppos) +{ + 
struct bnxt_re_cc_param *dbg_cc_param = filp->private_data; + struct bnxt_re_dev *rdev = dbg_cc_param->rdev; + struct bnxt_qplib_cc_param ccparam = {}; + u32 offset = dbg_cc_param->offset; + char buf[16]; + u32 val; + int rc; + + rc = bnxt_qplib_query_cc_param(&rdev->qplib_res, &ccparam); + if (rc) + return rc; + + rc = map_cc_config_offset_gen0_ext0(offset, &ccparam, &val); + if (rc) + return rc; + + rc = snprintf(buf, sizeof(buf), "%d\n", val); + if (rc < 0) + return rc; + + return simple_read_from_buffer(buffer, usr_buf_len, ppos, (u8 *)(buf), rc); +} + +static void bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offset, u32 val) +{ + u32 modify_mask; + + modify_mask = BIT(offset); + + switch (modify_mask) { + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC: + ccparam->enable = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_G: + ccparam->g = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_NUMPHASEPERSTATE: + ccparam->nph_per_state = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INIT_CR: + ccparam->init_cr = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INIT_TR: + ccparam->init_tr = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_ECN: + ccparam->tos_ecn = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_DSCP: + ccparam->tos_dscp = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ALT_VLAN_PCP: + ccparam->alt_vlan_pcp = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ALT_TOS_DSCP: + ccparam->alt_tos_dscp = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_RTT: + ccparam->rtt = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_CC_MODE: + ccparam->cc_mode = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TCP_CP: + ccparam->tcp_cp = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TX_QUEUE: + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INACTIVITY_CP: + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TIME_PER_PHASE: + ccparam->time_pph = val; + break; + case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_PKTS_PER_PHASE: + ccparam->pkts_pph = val; + break; + } + + ccparam->mask = modify_mask; +} + +static int bnxt_re_configure_cc(struct bnxt_re_dev *rdev, u32 gen_ext, u32 offset, u32 val) +{ + struct bnxt_qplib_cc_param ccparam = { }; + + /* Supporting only Gen 0 now */ + if (gen_ext == CC_CONFIG_GEN0_EXT0) + bnxt_re_fill_gen0_ext0(&ccparam, offset, val); + else + return -EINVAL; + + bnxt_qplib_modify_cc(&rdev->qplib_res, &ccparam); + return 0; +} + +static ssize_t bnxt_re_cc_config_set(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct bnxt_re_cc_param *dbg_cc_param = filp->private_data; + struct bnxt_re_dev *rdev = dbg_cc_param->rdev; + u32 offset = dbg_cc_param->offset; + u8 cc_gen = dbg_cc_param->cc_gen; + char buf[16]; + u32 val; + int rc; + + if (count >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, buffer, count)) + return -EFAULT; + + buf[count] = '\0'; + if (kstrtou32(buf, 0, &val)) + return -EINVAL; + + rc = bnxt_re_configure_cc(rdev, cc_gen, offset, val); + return rc ? 
rc : count; +} + +static const struct file_operations bnxt_re_cc_config_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = bnxt_re_cc_config_get, + .write = bnxt_re_cc_config_set, +}; + void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) { struct pci_dev *pdev = rdev->en_dev->pdev; + struct bnxt_re_dbg_cc_config_params *cc_params; + int i; rdev->dbg_root = debugfs_create_dir(dev_name(&pdev->dev), bnxt_re_debugfs_root); rdev->qp_debugfs = debugfs_create_dir("QPs", rdev->dbg_root); + rdev->cc_config = debugfs_create_dir("cc_config", rdev->dbg_root); + + rdev->cc_config_params = kzalloc(sizeof(*cc_params), GFP_KERNEL); + + for (i = 0; i < BNXT_RE_CC_PARAM_GEN0; i++) { + struct bnxt_re_cc_param *tmp_params = &rdev->cc_config_params->gen0_parms[i]; + + tmp_params->rdev = rdev; + tmp_params->offset = i; + tmp_params->cc_gen = CC_CONFIG_GEN0_EXT0; + tmp_params->dentry = debugfs_create_file(bnxt_re_cc_gen0_name[i], 0400, + rdev->cc_config, tmp_params, + &bnxt_re_cc_config_ops); + } } void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev) { debugfs_remove_recursive(rdev->qp_debugfs); - + debugfs_remove_recursive(rdev->cc_config); + kfree(rdev->cc_config_params); debugfs_remove_recursive(rdev->dbg_root); rdev->dbg_root = NULL; } diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.h b/drivers/infiniband/hw/bnxt_re/debugfs.h index cd3be0a9ec7e..8f101df4e838 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.h +++ b/drivers/infiniband/hw/bnxt_re/debugfs.h @@ -18,4 +18,19 @@ void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev); void bnxt_re_register_debugfs(void); void bnxt_re_unregister_debugfs(void); +#define CC_CONFIG_GEN_EXT(x, y) (((x) << 16) | (y)) +#define CC_CONFIG_GEN0_EXT0 CC_CONFIG_GEN_EXT(0, 0) + +#define BNXT_RE_CC_PARAM_GEN0 14 + +struct bnxt_re_cc_param { + struct bnxt_re_dev *rdev; + struct dentry *dentry; + u32 offset; + u8 cc_gen; +}; + +struct bnxt_re_dbg_cc_config_params { + struct bnxt_re_cc_param gen0_parms[BNXT_RE_CC_PARAM_GEN0]; +}; #endif diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index f039aefcaf67..44bb082e0a60 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -39,6 +39,8 @@ #include <linux/types.h> #include <linux/pci.h> +#include <rdma/ib_mad.h> +#include <rdma/ib_pma.h> #include "roce_hsi.h" #include "qplib_res.h" @@ -285,6 +287,96 @@ static void bnxt_re_copy_db_pacing_stats(struct bnxt_re_dev *rdev, readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); } +int bnxt_re_assign_pma_port_ext_counters(struct bnxt_re_dev *rdev, struct ib_mad *out_mad) +{ + struct ib_pma_portcounters_ext *pma_cnt_ext; + struct bnxt_qplib_ext_stat *estat = &rdev->stats.rstat.ext_stat; + struct ctx_hw_stats *hw_stats = NULL; + int rc; + + hw_stats = rdev->qplib_ctx.stats.dma; + + pma_cnt_ext = (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + if (_is_ext_stats_supported(rdev->dev_attr->dev_cap_flags)) { + u32 fid = PCI_FUNC(rdev->en_dev->pdev->devfn); + + rc = bnxt_qplib_qext_stat(&rdev->rcfw, fid, estat); + if (rc) + return rc; + } + + pma_cnt_ext = (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + if ((bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) && rdev->is_virtfn) || + !bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) { + pma_cnt_ext->port_xmit_data = + cpu_to_be64(le64_to_cpu(hw_stats->tx_ucast_bytes) / 4); + pma_cnt_ext->port_rcv_data = + cpu_to_be64(le64_to_cpu(hw_stats->rx_ucast_bytes) / 4); + pma_cnt_ext->port_xmit_packets = + 
cpu_to_be64(le64_to_cpu(hw_stats->tx_ucast_pkts)); + pma_cnt_ext->port_rcv_packets = + cpu_to_be64(le64_to_cpu(hw_stats->rx_ucast_pkts)); + pma_cnt_ext->port_unicast_rcv_packets = + cpu_to_be64(le64_to_cpu(hw_stats->rx_ucast_pkts)); + pma_cnt_ext->port_unicast_xmit_packets = + cpu_to_be64(le64_to_cpu(hw_stats->tx_ucast_pkts)); + + } else { + pma_cnt_ext->port_rcv_packets = cpu_to_be64(estat->rx_roce_good_pkts); + pma_cnt_ext->port_rcv_data = cpu_to_be64(estat->rx_roce_good_bytes / 4); + pma_cnt_ext->port_xmit_packets = cpu_to_be64(estat->tx_roce_pkts); + pma_cnt_ext->port_xmit_data = cpu_to_be64(estat->tx_roce_bytes / 4); + pma_cnt_ext->port_unicast_rcv_packets = cpu_to_be64(estat->rx_roce_good_pkts); + pma_cnt_ext->port_unicast_xmit_packets = cpu_to_be64(estat->tx_roce_pkts); + } + return 0; +} + +int bnxt_re_assign_pma_port_counters(struct bnxt_re_dev *rdev, struct ib_mad *out_mad) +{ + struct bnxt_qplib_ext_stat *estat = &rdev->stats.rstat.ext_stat; + struct ib_pma_portcounters *pma_cnt; + struct ctx_hw_stats *hw_stats = NULL; + int rc; + + hw_stats = rdev->qplib_ctx.stats.dma; + + pma_cnt = (struct ib_pma_portcounters *)(out_mad->data + 40); + if (_is_ext_stats_supported(rdev->dev_attr->dev_cap_flags)) { + u32 fid = PCI_FUNC(rdev->en_dev->pdev->devfn); + + rc = bnxt_qplib_qext_stat(&rdev->rcfw, fid, estat); + if (rc) + return rc; + } + if ((bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) && rdev->is_virtfn) || + !bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) { + pma_cnt->port_rcv_packets = + cpu_to_be32((u32)(le64_to_cpu(hw_stats->rx_ucast_pkts)) & 0xFFFFFFFF); + pma_cnt->port_rcv_data = + cpu_to_be32((u32)((le64_to_cpu(hw_stats->rx_ucast_bytes) & + 0xFFFFFFFF) / 4)); + pma_cnt->port_xmit_packets = + cpu_to_be32((u32)(le64_to_cpu(hw_stats->tx_ucast_pkts)) & 0xFFFFFFFF); + pma_cnt->port_xmit_data = + cpu_to_be32((u32)((le64_to_cpu(hw_stats->tx_ucast_bytes) + & 0xFFFFFFFF) / 4)); + } else { + pma_cnt->port_rcv_packets = cpu_to_be32(estat->rx_roce_good_pkts); + pma_cnt->port_rcv_data = cpu_to_be32((estat->rx_roce_good_bytes / 4)); + pma_cnt->port_xmit_packets = cpu_to_be32(estat->tx_roce_pkts); + pma_cnt->port_xmit_data = cpu_to_be32((estat->tx_roce_bytes / 4)); + } + pma_cnt->port_rcv_constraint_errors = (u8)(le64_to_cpu(hw_stats->rx_discard_pkts) & 0xFF); + pma_cnt->port_rcv_errors = cpu_to_be16((u16)(le64_to_cpu(hw_stats->rx_error_pkts) + & 0xFFFF)); + pma_cnt->port_xmit_constraint_errors = (u8)(le64_to_cpu(hw_stats->tx_error_pkts) & 0xFF); + pma_cnt->port_xmit_discards = cpu_to_be16((u16)(le64_to_cpu(hw_stats->tx_discard_pkts) + & 0xFFFF)); + + return 0; +} + int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port, int index) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 6f5db32082dd..9082b3fd2b47 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -49,6 +49,7 @@ #include <rdma/ib_addr.h> #include <rdma/ib_mad.h> #include <rdma/ib_cache.h> +#include <rdma/ib_pma.h> #include <rdma/uverbs_ioctl.h> #include <linux/hashtable.h> @@ -4491,6 +4492,41 @@ void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry) kfree(bnxt_entry); } +int bnxt_re_process_mad(struct ib_device *ibdev, int mad_flags, + u32 port_num, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad, + size_t *out_mad_size, u16 *out_mad_pkey_index) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct 
ib_class_port_info cpi = {}; + int ret = IB_MAD_RESULT_SUCCESS; + int rc = 0; + + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) + return ret; + + switch (in_mad->mad_hdr.attr_id) { + case IB_PMA_CLASS_PORT_INFO: + cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + memcpy((out_mad->data + 40), &cpi, sizeof(cpi)); + break; + case IB_PMA_PORT_COUNTERS_EXT: + rc = bnxt_re_assign_pma_port_ext_counters(rdev, out_mad); + break; + case IB_PMA_PORT_COUNTERS: + rc = bnxt_re_assign_pma_port_counters(rdev, out_mad); + break; + default: + rc = -EINVAL; + break; + } + if (rc) + return IB_MAD_RESULT_FAILURE; + ret |= IB_MAD_RESULT_REPLY; + return ret; +} + static int UVERBS_HANDLER(BNXT_RE_METHOD_NOTIFY_DRV)(struct uverbs_attr_bundle *attrs) { struct bnxt_re_ucontext *uctx; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index fbb16a411d6a..22c9eb8e9cfc 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -268,6 +268,12 @@ void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +int bnxt_re_process_mad(struct ib_device *device, int process_mad_flags, + u32 port_num, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad, + size_t *out_mad_size, u16 *out_mad_pkey_index); + static inline u32 __to_ib_port_num(u16 port_id) { return (u32)port_id + 1; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index a94c8c5387d9..293b0a96c8e3 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1285,6 +1285,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .post_recv = bnxt_re_post_recv, .post_send = bnxt_re_post_send, .post_srq_recv = bnxt_re_post_srq_recv, + .process_mad = bnxt_re_process_mad, .query_ah = bnxt_re_query_ah, .query_device = bnxt_re_query_device, .modify_device = bnxt_re_modify_device, @@ -2130,8 +2131,7 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) * memory for the function and all child VFs */ rc = bnxt_qplib_alloc_rcfw_channel(&rdev->qplib_res, &rdev->rcfw, - &rdev->qplib_ctx, - BNXT_RE_MAX_QPC_COUNT); + &rdev->qplib_ctx); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate RCFW Channel: %#x\n", rc); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 5336f74297f8..457eecb99f96 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1217,8 +1217,6 @@ static void __modify_flags_from_init_state(struct bnxt_qplib_qp *qp) qp->path_mtu = CMDQ_MODIFY_QP_PATH_MTU_MTU_2048; } - qp->modify_flags &= - ~CMDQ_MODIFY_QP_MODIFY_MASK_VLAN_ID; /* Bono FW require the max_dest_rd_atomic to be >= 1 */ if (qp->max_dest_rd_atomic < 1) qp->max_dest_rd_atomic = 1; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 17e62f22683b..d23074383428 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -915,7 +915,6 @@ skip_ctx_setup: void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) { - kfree(rcfw->qp_tbl); kfree(rcfw->crsqe_tbl); bnxt_qplib_free_hwq(rcfw->res, &rcfw->cmdq.hwq); bnxt_qplib_free_hwq(rcfw->res, &rcfw->creq.hwq); @@ -924,8 +923,7 @@ void bnxt_qplib_free_rcfw_channel(struct 
bnxt_qplib_rcfw *rcfw) int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_ctx *ctx, - int qp_tbl_sz) + struct bnxt_qplib_ctx *ctx) { struct bnxt_qplib_hwq_attr hwq_attr = {}; struct bnxt_qplib_sg_info sginfo = {}; @@ -969,12 +967,6 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, if (!rcfw->crsqe_tbl) goto fail; - /* Allocate one extra to hold the QP1 entries */ - rcfw->qp_tbl_size = qp_tbl_sz + 1; - rcfw->qp_tbl = kcalloc(rcfw->qp_tbl_size, sizeof(struct bnxt_qplib_qp_node), - GFP_KERNEL); - if (!rcfw->qp_tbl) - goto fail; spin_lock_init(&rcfw->tbl_lock); rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 88814cb3aa74..ff873c5f1b25 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -262,8 +262,7 @@ static inline void bnxt_qplib_fill_cmdqmsg(struct bnxt_qplib_cmdqmsg *msg, void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_ctx *ctx, - int qp_tbl_sz); + struct bnxt_qplib_ctx *ctx); void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill); void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, @@ -285,9 +284,10 @@ int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_ctx *ctx, int is_virtfn); void bnxt_qplib_mark_qp_error(void *qp_handle); + static inline u32 map_qp_id_to_tbl_indx(u32 qid, struct bnxt_qplib_rcfw *rcfw) { /* Last index of the qp_tbl is for QP1 ie. qp_tbl_size - 1*/ - return (qid == 1) ? rcfw->qp_tbl_size - 1 : qid % rcfw->qp_tbl_size - 2; + return (qid == 1) ? 
rcfw->qp_tbl_size - 1 : (qid % (rcfw->qp_tbl_size - 2)); } #endif /* __BNXT_QPLIB_RCFW_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 02922a0987ad..6cd05207ffed 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -871,6 +871,7 @@ int bnxt_qplib_init_res(struct bnxt_qplib_res *res) void bnxt_qplib_free_res(struct bnxt_qplib_res *res) { + kfree(res->rcfw->qp_tbl); bnxt_qplib_free_sgid_tbl(res, &res->sgid_tbl); bnxt_qplib_free_pd_tbl(&res->pd_tbl); bnxt_qplib_free_dpi_tbl(res, &res->dpi_tbl); @@ -878,12 +879,20 @@ void bnxt_qplib_free_res(struct bnxt_qplib_res *res) int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct net_device *netdev) { + struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct bnxt_qplib_dev_attr *dev_attr; int rc; res->netdev = netdev; dev_attr = res->dattr; + /* Allocate one extra to hold the QP1 entries */ + rcfw->qp_tbl_size = max_t(u32, BNXT_RE_MAX_QPC_COUNT + 1, dev_attr->max_qp); + rcfw->qp_tbl = kcalloc(rcfw->qp_tbl_size, sizeof(struct bnxt_qplib_qp_node), + GFP_KERNEL); + if (!rcfw->qp_tbl) + return -ENOMEM; + rc = bnxt_qplib_alloc_sgid_tbl(res, &res->sgid_tbl, dev_attr->max_sgid); if (rc) goto fail; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 711990232de1..6a13927674b4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -49,6 +49,13 @@ extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero; #define CHIP_NUM_58818 0xd818 #define CHIP_NUM_57608 0x1760 +#define BNXT_RE_MAX_QPC_COUNT (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT (64 * 1024) +#define BNXT_RE_MAX_SRQC_COUNT (64 * 1024) +#define BNXT_RE_MAX_CQ_COUNT (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_64K (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_256K (256 * 1024) + #define BNXT_QPLIB_DBR_VALID (0x1UL << 26) #define BNXT_QPLIB_DBR_EPOCH_SHIFT 24 #define BNXT_QPLIB_DBR_TOGGLE_SHIFT 25 @@ -600,4 +607,9 @@ static inline bool _is_cq_coalescing_supported(u16 dev_cap_ext_flags2) return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_CQ_COALESCING_SUPPORTED; } +static inline bool _is_max_srq_ext_supported(u16 dev_cap_ext_flags_2) +{ + return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED); +} + #endif /* __BNXT_QPLIB_RES_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 4ccd4405355a..f231e886ad9d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -176,6 +176,9 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw) attr->dev_cap_flags = le16_to_cpu(sb->dev_cap_flags); attr->dev_cap_flags2 = le16_to_cpu(sb->dev_cap_ext_flags_2); + if (_is_max_srq_ext_supported(attr->dev_cap_flags2)) + attr->max_srq += le16_to_cpu(sb->max_srq_ext); + bnxt_qplib_query_version(rcfw, attr->fw_ver); for (i = 0; i < MAX_TQM_ALLOC_REQ / 4; i++) { diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 0ee60fdc18b3..7eceb3e9f4ce 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -2215,11 +2215,12 @@ struct creq_query_func_resp_sb { #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE (0x2UL << 4) #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_LAST \ CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE + #define 
CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED 0x40UL #define CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED 0x1000UL __le16 max_xp_qp_size; __le16 create_qp_batch_size; __le16 destroy_qp_batch_size; - __le16 reserved16; + __le16 max_srq_ext; __le64 reserved64; }; diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c index 1b23c698ec25..e0acc185e719 100644 --- a/drivers/infiniband/hw/erdma/erdma_cm.c +++ b/drivers/infiniband/hw/erdma/erdma_cm.c @@ -709,7 +709,6 @@ error: erdma_cancel_mpatimer(new_cep); erdma_cep_put(new_cep); - new_cep->sock = NULL; } if (new_s) { diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index a442eca498b8..368b6be3226f 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -12882,22 +12882,6 @@ u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate) } } -/* return the OPA port logical state name */ -const char *opa_lstate_name(u32 lstate) -{ - static const char * const port_logical_names[] = { - "PORT_NOP", - "PORT_DOWN", - "PORT_INIT", - "PORT_ARMED", - "PORT_ACTIVE", - "PORT_ACTIVE_DEFER", - }; - if (lstate < ARRAY_SIZE(port_logical_names)) - return port_logical_names[lstate]; - return "unknown"; -} - /* return the OPA port physical state name */ const char *opa_pstate_name(u32 pstate) { @@ -12956,8 +12940,6 @@ static void update_statusp(struct hfi1_pportdata *ppd, u32 state) break; } } - dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n", - opa_lstate_name(state), state); } /** diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 8841db16bde7..6992f6d40255 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -771,7 +771,6 @@ int is_bx(struct hfi1_devdata *dd); bool is_urg_masked(struct hfi1_ctxtdata *rcd); u32 read_physical_state(struct hfi1_devdata *dd); u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate); -const char *opa_lstate_name(u32 lstate); const char *opa_pstate_name(u32 pstate); u32 driver_pstate(struct hfi1_pportdata *ppd); u32 driver_lstate(struct hfi1_pportdata *ppd); diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 37a6794885d3..50826e7cdb7e 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -968,7 +968,7 @@ static bool __set_armed_to_active(struct hfi1_packet *packet) if (hwstate != IB_PORT_ACTIVE) { dd_dev_info(packet->rcd->dd, "Unexpected link state %s\n", - opa_lstate_name(hwstate)); + ib_port_state_to_str(hwstate)); return false; } diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index cbac4a442d9e..d6fbd9c2b8b4 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -635,12 +635,11 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, spin_lock_init(&ppd->cca_timer_lock); for (i = 0; i < OPA_MAX_SLS; i++) { - hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); ppd->cca_timer[i].ppd = ppd; ppd->cca_timer[i].sl = i; ppd->cca_timer[i].ccti = 0; - ppd->cca_timer[i].hrtimer.function = cca_timer_fn; + hrtimer_setup(&ppd->cca_timer[i].hrtimer, cca_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT; diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index a9883295f4af..b39f63ce6dfc 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ 
b/drivers/infiniband/hw/hfi1/mad.c @@ -1160,8 +1160,8 @@ static int port_states_transition_allowed(struct hfi1_pportdata *ppd, if (ret == HFI_TRANSITION_DISALLOWED || ret == HFI_TRANSITION_UNDEFINED) { pr_warn("invalid logical state transition %s -> %s\n", - opa_lstate_name(logical_old), - opa_lstate_name(logical_new)); + ib_port_state_to_str(logical_old), + ib_port_state_to_str(logical_new)); return ret; } diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c index 52cce1c8b76a..3b7842a7f634 100644 --- a/drivers/infiniband/hw/hfi1/qsfp.c +++ b/drivers/infiniband/hw/hfi1/qsfp.c @@ -405,26 +405,6 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, } /* - * Perform a stand-alone single QSFP write. Acquire the resource, do the - * write, then release the resource. - */ -int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, - int len) -{ - struct hfi1_devdata *dd = ppd->dd; - u32 resource = qsfp_resource(dd); - int ret; - - ret = acquire_chip_resource(dd, resource, QSFP_WAIT); - if (ret) - return ret; - ret = qsfp_write(ppd, target, addr, bp, len); - release_chip_resource(dd, resource); - - return ret; -} - -/* * Access page n, offset m of QSFP memory as defined by SFF 8636 * by reading @addr = ((256 * n) + m) * diff --git a/drivers/infiniband/hw/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h index df1389bad86b..5c59d53fcb63 100644 --- a/drivers/infiniband/hw/hfi1/qsfp.h +++ b/drivers/infiniband/hw/hfi1/qsfp.h @@ -195,8 +195,6 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len); int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len); -int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, - int len); int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len); struct hfi1_asic_data; diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index 950c133d4220..6ee911f6885b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -175,8 +175,10 @@ void hns_roce_cleanup_bitmap(struct hns_roce_dev *hr_dev) if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_XRC) ida_destroy(&hr_dev->xrcd_ida.ida); - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) { ida_destroy(&hr_dev->srq_table.srq_ida.ida); + xa_destroy(&hr_dev->srq_table.xa); + } hns_roce_cleanup_qp_table(hr_dev); hns_roce_cleanup_cq_table(hr_dev); ida_destroy(&hr_dev->mr_table.mtpt_ida.ida); diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 4106423a1b39..3a5c93c9fb3e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -537,5 +537,6 @@ void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev) for (i = 0; i < HNS_ROCE_CQ_BANK_NUM; i++) ida_destroy(&hr_dev->cq_table.bank[i].ida); + xa_destroy(&hr_dev->cq_table.array); mutex_destroy(&hr_dev->cq_table.bank_mutex); } diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index 605562122ecc..ca0798224e56 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -1361,6 +1361,11 @@ static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev, return ret; } +/* This is the bottom bt pages number of a 100G MR on 4K OS, assuming + * the bt page size is not expanded by cal_best_bt_pg_sz() + */ 
+#define RESCHED_LOOP_CNT_THRESHOLD_ON_4K 12800 + /* construct the base address table and link them by address hop config */ int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, @@ -1369,6 +1374,7 @@ int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, { const struct hns_roce_buf_region *r; int ofs, end; + int loop; int unit; int ret; int i; @@ -1386,7 +1392,10 @@ int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, continue; end = r->offset + r->count; - for (ofs = r->offset; ofs < end; ofs += unit) { + for (ofs = r->offset, loop = 1; ofs < end; ofs += unit, loop++) { + if (!(loop % RESCHED_LOOP_CNT_THRESHOLD_ON_4K)) + cond_resched(); + ret = hem_list_alloc_mid_bt(hr_dev, r, unit, ofs, hem_list->mid_bt[i], &hem_list->btm_bt); @@ -1443,9 +1452,14 @@ void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev, struct list_head *head = &hem_list->btm_bt; struct hns_roce_hem_item *hem, *temp_hem; void *cpu_base = NULL; + int loop = 1; int nr = 0; list_for_each_entry_safe(hem, temp_hem, head, sibling) { + if (!(loop % RESCHED_LOOP_CNT_THRESHOLD_ON_4K)) + cond_resched(); + loop++; + if (hem_list_page_is_in_range(hem, offset)) { nr = offset - hem->start; cpu_base = hem->addr + nr * BA_BYTE_LEN; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index ae24c81c9812..cf89a8db4f64 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -183,7 +183,7 @@ static int hns_roce_query_device(struct ib_device *ib_dev, IB_DEVICE_RC_RNR_NAK_GEN; props->max_send_sge = hr_dev->caps.max_sq_sg; props->max_recv_sge = hr_dev->caps.max_rq_sg; - props->max_sge_rd = 1; + props->max_sge_rd = hr_dev->caps.max_sq_sg; props->max_cq = hr_dev->caps.num_cqs; props->max_cqe = hr_dev->caps.max_cqes; props->max_mr = hr_dev->caps.num_mtpts; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 55b9283bfc6f..09da3496843b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -998,7 +998,7 @@ static bool is_buf_attr_valid(struct hns_roce_dev *hr_dev, if (attr->region_count > ARRAY_SIZE(attr->region) || attr->region_count < 1 || attr->page_shift < HNS_HW_PAGE_SHIFT) { ibdev_err(ibdev, - "invalid buf attr, region count %d, page shift %u.\n", + "invalid buf attr, region count %u, page shift %u.\n", attr->region_count, attr->page_shift); return false; } diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 9e2e76c59406..9f376a2232b0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -868,12 +868,14 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_ib_create_qp *ucmd, struct hns_roce_ib_create_qp_resp *resp) { + bool has_sdb = user_qp_has_sdb(hr_dev, init_attr, udata, resp, ucmd); struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata, struct hns_roce_ucontext, ibucontext); + bool has_rdb = user_qp_has_rdb(hr_dev, init_attr, udata, resp); struct ib_device *ibdev = &hr_dev->ib_dev; int ret; - if (user_qp_has_sdb(hr_dev, init_attr, udata, resp, ucmd)) { + if (has_sdb) { ret = hns_roce_db_map_user(uctx, ucmd->sdb_addr, &hr_qp->sdb); if (ret) { ibdev_err(ibdev, @@ -884,7 +886,7 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev, hr_qp->en_flags |= HNS_ROCE_QP_CAP_SQ_RECORD_DB; } - if (user_qp_has_rdb(hr_dev, init_attr, udata, resp)) { + if (has_rdb) { ret = 
hns_roce_db_map_user(uctx, ucmd->db_addr, &hr_qp->rdb); if (ret) { ibdev_err(ibdev, @@ -898,7 +900,7 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev, return 0; err_sdb: - if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB) + if (has_sdb) hns_roce_db_unmap_user(uctx, &hr_qp->sdb); err_out: return ret; @@ -1119,24 +1121,23 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, ibucontext); hr_qp->config = uctx->config; ret = set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, ucmd); - if (ret) + if (ret) { ibdev_err(ibdev, "failed to set user SQ size, ret = %d.\n", ret); + return ret; + } ret = set_congest_param(hr_dev, hr_qp, ucmd); - if (ret) - return ret; } else { if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) hr_qp->config = HNS_ROCE_EXSGE_FLAGS; + default_congest_type(hr_dev, hr_qp); ret = set_kernel_sq_size(hr_dev, &init_attr->cap, hr_qp); if (ret) ibdev_err(ibdev, "failed to set kernel SQ size, ret = %d.\n", ret); - - default_congest_type(hr_dev, hr_qp); } return ret; @@ -1219,7 +1220,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, min(udata->outlen, sizeof(resp))); if (ret) { ibdev_err(ibdev, "copy qp resp failed!\n"); - goto err_store; + goto err_flow_ctrl; } } @@ -1319,7 +1320,7 @@ int hns_roce_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *init_attr, ret = hns_roce_create_qp_common(hr_dev, init_attr, udata, hr_qp); if (ret) - ibdev_err(ibdev, "create QP type 0x%x failed(%d)\n", + ibdev_err(ibdev, "create QP type %d failed(%d)\n", init_attr->qp_type, ret); err_out: @@ -1602,6 +1603,7 @@ void hns_roce_cleanup_qp_table(struct hns_roce_dev *hr_dev) for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) ida_destroy(&hr_dev->qp_table.bank[i].ida); xa_destroy(&hr_dev->qp_table.dip_xa); + xa_destroy(&hr_dev->qp_table_xa); mutex_destroy(&hr_dev->qp_table.bank_mutex); mutex_destroy(&hr_dev->qp_table.scc_mutex); } diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 70c06ef65603..1090051f493b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -51,7 +51,7 @@ static void hns_roce_ib_srq_event(struct hns_roce_srq *srq, break; default: dev_err(hr_dev->dev, - "hns_roce:Unexpected event type 0x%x on SRQ %06lx\n", + "hns_roce:Unexpected event type %d on SRQ %06lx\n", event_type, srq->srqn); return; } diff --git a/drivers/infiniband/hw/irdma/Kconfig b/drivers/infiniband/hw/irdma/Kconfig index b6f9c41bca51..5f49a58590ed 100644 --- a/drivers/infiniband/hw/irdma/Kconfig +++ b/drivers/infiniband/hw/irdma/Kconfig @@ -7,6 +7,7 @@ config INFINIBAND_IRDMA depends on ICE && I40E select GENERIC_ALLOCATOR select AUXILIARY_BUS + select CRC32 help This is an Intel(R) Ethernet Protocol Driver for RDMA driver that support E810 (iWARP/RoCE) and X722 (iWARP) network devices. 
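Note: the `select CRC32` added to the irdma Kconfig above pairs with the utils.c hunk later in this series, where the crypto_shash-based MPA digest is dropped in favour of the crc32c() library helper (and the irdma_init_hash_desc()/irdma_free_hash_desc() setup removed from puda.c). The sketch below is illustrative only, not driver code: `mpa_crc_ok` is a made-up name, and it assumes a kernel build context where crc32c() and cpu_to_le32() are available; it simply restates the check that the reworked irdma_ieq_check_mpacrc() performs.

```c
/* Illustrative sketch (assumed kernel context, hypothetical helper name):
 * validate an MPA FPDU trailer CRC using the CRC32C library helper, the
 * same way the reworked irdma_ieq_check_mpacrc() does.  The wire value is
 * the little-endian CRC32C of the FPDU payload (seed ~0, final inversion).
 */
#include <linux/crc32.h>
#include <linux/errno.h>

static int mpa_crc_ok(const void *addr, u32 len, u32 wire_val)
{
	u32 crc = ~crc32c(~0, addr, len);	/* computed CRC32C, CPU byte order */

	/* compare against the little-endian value carried on the wire */
	if ((__force u32)cpu_to_le32(crc) != wire_val)
		return -EINVAL;

	return 0;
}
```

Because the shash transform and its per-IEQ descriptor allocation go away, CRC32C support can no longer be pulled in implicitly through the crypto API, which is why the Kconfig entry now selects CRC32 explicitly.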
diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index ad50b77282f8..69ce1862eabe 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -498,8 +498,6 @@ static int irdma_save_msix_info(struct irdma_pci_f *rf) iw_qvlist->num_vectors = rf->msix_count; if (rf->msix_count <= num_online_cpus()) rf->msix_shared = true; - else if (rf->msix_count > num_online_cpus() + 1) - rf->msix_count = num_online_cpus() + 1; pmsix = rf->msix_entries; for (i = 0, ceq_idx = 0; i < rf->msix_count; i++, iw_qvinfo++) { diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 3f13200ff71b..1ee8969595d3 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -206,6 +206,43 @@ static void irdma_lan_unregister_qset(struct irdma_sc_vsi *vsi, ibdev_dbg(&iwdev->ibdev, "WS: LAN free_res for rdma qset failed.\n"); } +static int irdma_init_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) +{ + int i; + + rf->msix_count = num_online_cpus() + IRDMA_NUM_AEQ_MSIX; + rf->msix_entries = kcalloc(rf->msix_count, sizeof(*rf->msix_entries), + GFP_KERNEL); + if (!rf->msix_entries) + return -ENOMEM; + + for (i = 0; i < rf->msix_count; i++) + if (ice_alloc_rdma_qvector(pf, &rf->msix_entries[i])) + break; + + if (i < IRDMA_MIN_MSIX) { + for (; i > 0; i--) + ice_free_rdma_qvector(pf, &rf->msix_entries[i]); + + kfree(rf->msix_entries); + return -ENOMEM; + } + + rf->msix_count = i; + + return 0; +} + +static void irdma_deinit_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) +{ + int i; + + for (i = 0; i < rf->msix_count; i++) + ice_free_rdma_qvector(pf, &rf->msix_entries[i]); + + kfree(rf->msix_entries); +} + static void irdma_remove(struct auxiliary_device *aux_dev) { struct iidc_auxiliary_dev *iidc_adev = container_of(aux_dev, @@ -216,6 +253,7 @@ static void irdma_remove(struct auxiliary_device *aux_dev) irdma_ib_unregister_device(iwdev); ice_rdma_update_vsi_filter(pf, iwdev->vsi_num, false); + irdma_deinit_interrupts(iwdev->rf, pf); pr_debug("INIT: Gen2 PF[%d] device remove success\n", PCI_FUNC(pf->pdev->devfn)); } @@ -230,9 +268,7 @@ static void irdma_fill_device_info(struct irdma_device *iwdev, struct ice_pf *pf rf->gen_ops.unregister_qset = irdma_lan_unregister_qset; rf->hw.hw_addr = pf->hw.hw_addr; rf->pcidev = pf->pdev; - rf->msix_count = pf->num_rdma_msix; rf->pf_id = pf->hw.pf_id; - rf->msix_entries = &pf->msix_entries[pf->rdma_base_vector]; rf->default_vsi.vsi_idx = vsi->vsi_num; rf->protocol_used = pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2 ? 
IRDMA_ROCE_PROTOCOL_ONLY : IRDMA_IWARP_PROTOCOL_ONLY; @@ -281,6 +317,10 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ irdma_fill_device_info(iwdev, pf, vsi); rf = iwdev->rf; + err = irdma_init_interrupts(rf, pf); + if (err) + goto err_init_interrupts; + err = irdma_ctrl_init_hw(rf); if (err) goto err_ctrl_init; @@ -311,6 +351,8 @@ err_ibreg: err_rt_init: irdma_ctrl_deinit_hw(rf); err_ctrl_init: + irdma_deinit_interrupts(rf, pf); +err_init_interrupts: kfree(iwdev->rf); ib_dealloc_device(&iwdev->ibdev); diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 9f0ed6e84471..bb0b6494ccb2 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -30,7 +30,6 @@ #endif #include <linux/auxiliary_bus.h> #include <linux/net/intel/iidc.h> -#include <crypto/hash.h> #include <rdma/ib_smi.h> #include <rdma/ib_verbs.h> #include <rdma/ib_pack.h> @@ -117,6 +116,9 @@ extern struct auxiliary_driver i40iw_auxiliary_drv; #define IRDMA_IRQ_NAME_STR_LEN (64) +#define IRDMA_NUM_AEQ_MSIX 1 +#define IRDMA_MIN_MSIX 2 + enum init_completion_state { INVALID_STATE = 0, INITIAL_STATE, diff --git a/drivers/infiniband/hw/irdma/osdep.h b/drivers/infiniband/hw/irdma/osdep.h index ddf02a462efa..4b4f78288d12 100644 --- a/drivers/infiniband/hw/irdma/osdep.h +++ b/drivers/infiniband/hw/irdma/osdep.h @@ -6,7 +6,6 @@ #include <linux/pci.h> #include <linux/bitfield.h> #include <linux/net/intel/iidc.h> -#include <crypto/hash.h> #include <rdma/ib_verbs.h> #define STATS_TIMER_DELAY 60000 @@ -43,15 +42,12 @@ enum irdma_status_code irdma_vf_wait_vchnl_resp(struct irdma_sc_dev *dev); bool irdma_vf_clear_to_send(struct irdma_sc_dev *dev); void irdma_add_dev_ref(struct irdma_sc_dev *dev); void irdma_put_dev_ref(struct irdma_sc_dev *dev); -int irdma_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 len, - u32 val); +int irdma_ieq_check_mpacrc(const void *addr, u32 len, u32 val); struct irdma_sc_qp *irdma_ieq_get_qp(struct irdma_sc_dev *dev, struct irdma_puda_buf *buf); void irdma_send_ieq_ack(struct irdma_sc_qp *qp); void irdma_ieq_update_tcpip_info(struct irdma_puda_buf *buf, u16 len, u32 seqnum); -void irdma_free_hash_desc(struct shash_desc *hash_desc); -int irdma_init_hash_desc(struct shash_desc **hash_desc); int irdma_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, struct irdma_puda_buf *buf); int irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c index 7e3f9bca2c23..694e5a9ed15d 100644 --- a/drivers/infiniband/hw/irdma/puda.c +++ b/drivers/infiniband/hw/irdma/puda.c @@ -923,8 +923,6 @@ void irdma_puda_dele_rsrc(struct irdma_sc_vsi *vsi, enum puda_rsrc_type type, switch (rsrc->cmpl) { case PUDA_HASH_CRC_COMPLETE: - irdma_free_hash_desc(rsrc->hash_desc); - fallthrough; case PUDA_QP_CREATED: irdma_qp_rem_qos(&rsrc->qp); @@ -1095,15 +1093,12 @@ int irdma_puda_create_rsrc(struct irdma_sc_vsi *vsi, goto error; if (info->type == IRDMA_PUDA_RSRC_TYPE_IEQ) { - if (!irdma_init_hash_desc(&rsrc->hash_desc)) { - rsrc->check_crc = true; - rsrc->cmpl = PUDA_HASH_CRC_COMPLETE; - ret = 0; - } + rsrc->check_crc = true; + rsrc->cmpl = PUDA_HASH_CRC_COMPLETE; } irdma_sc_ccq_arm(&rsrc->cq); - return ret; + return 0; error: irdma_puda_dele_rsrc(vsi, info->type, false); @@ -1396,8 +1391,8 @@ static int irdma_ieq_handle_partial(struct irdma_puda_rsrc *ieq, crcptr = txbuf->data + fpdu_len - 4; mpacrc = *(u32 *)crcptr; if (ieq->check_crc) { - status = 
irdma_ieq_check_mpacrc(ieq->hash_desc, txbuf->data, - (fpdu_len - 4), mpacrc); + status = irdma_ieq_check_mpacrc(txbuf->data, fpdu_len - 4, + mpacrc); if (status) { ibdev_dbg(to_ibdev(ieq->dev), "IEQ: error bad crc\n"); goto error; @@ -1465,8 +1460,8 @@ static int irdma_ieq_process_buf(struct irdma_puda_rsrc *ieq, crcptr = datap + fpdu_len - 4; mpacrc = *(u32 *)crcptr; if (ieq->check_crc) - ret = irdma_ieq_check_mpacrc(ieq->hash_desc, datap, - fpdu_len - 4, mpacrc); + ret = irdma_ieq_check_mpacrc(datap, fpdu_len - 4, + mpacrc); if (ret) { list_add(&buf->list, rxlist); ibdev_dbg(to_ibdev(ieq->dev), diff --git a/drivers/infiniband/hw/irdma/puda.h b/drivers/infiniband/hw/irdma/puda.h index bc6d9514c9c1..2fc638f2b143 100644 --- a/drivers/infiniband/hw/irdma/puda.h +++ b/drivers/infiniband/hw/irdma/puda.h @@ -119,7 +119,6 @@ struct irdma_puda_rsrc { u32 rx_wqe_idx; u32 rxq_invalid_cnt; u32 tx_wqe_avail_cnt; - struct shash_desc *hash_desc; struct list_head txpend; struct list_head bufpool; /* free buffers pool list for recv and xmit */ u32 alloc_buf_count; @@ -163,10 +162,8 @@ struct irdma_sc_qp *irdma_ieq_get_qp(struct irdma_sc_dev *dev, struct irdma_puda_buf *buf); int irdma_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, struct irdma_puda_buf *buf); -int irdma_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 len, u32 val); -int irdma_init_hash_desc(struct shash_desc **desc); +int irdma_ieq_check_mpacrc(const void *addr, u32 len, u32 val); void irdma_ieq_mpa_crc_ae(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp); -void irdma_free_hash_desc(struct shash_desc *desc); void irdma_ieq_update_tcpip_info(struct irdma_puda_buf *buf, u16 len, u32 seqnum); int irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp); int irdma_cqp_cq_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq); diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 0e594122baa7..e73b14fd95ef 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -1274,57 +1274,14 @@ void irdma_ieq_mpa_crc_ae(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) } /** - * irdma_init_hash_desc - initialize hash for crc calculation - * @desc: cryption type - */ -int irdma_init_hash_desc(struct shash_desc **desc) -{ - struct crypto_shash *tfm; - struct shash_desc *tdesc; - - tfm = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(tfm)) - return -EINVAL; - - tdesc = kzalloc(sizeof(*tdesc) + crypto_shash_descsize(tfm), - GFP_KERNEL); - if (!tdesc) { - crypto_free_shash(tfm); - return -EINVAL; - } - - tdesc->tfm = tfm; - *desc = tdesc; - - return 0; -} - -/** - * irdma_free_hash_desc - free hash desc - * @desc: to be freed - */ -void irdma_free_hash_desc(struct shash_desc *desc) -{ - if (desc) { - crypto_free_shash(desc->tfm); - kfree(desc); - } -} - -/** * irdma_ieq_check_mpacrc - check if mpa crc is OK - * @desc: desc for hash * @addr: address of buffer for crc * @len: length of buffer * @val: value to be compared */ -int irdma_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 len, - u32 val) +int irdma_ieq_check_mpacrc(const void *addr, u32 len, u32 val) { - u32 crc = 0; - - crypto_shash_digest(desc, addr, len, (u8 *)&crc); - if (crc != val) + if ((__force u32)cpu_to_le32(~crc32c(~0, addr, len)) != val) return -EINVAL; return 0; diff --git a/drivers/infiniband/hw/mana/Makefile b/drivers/infiniband/hw/mana/Makefile index 88655fe5e398..921c05e08b11 100644 --- a/drivers/infiniband/hw/mana/Makefile +++ 
b/drivers/infiniband/hw/mana/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_MANA_INFINIBAND) += mana_ib.o -mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o +mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o ah.o wr.o counters.o diff --git a/drivers/infiniband/hw/mana/ah.c b/drivers/infiniband/hw/mana/ah.c new file mode 100644 index 000000000000..f56952eebbaa --- /dev/null +++ b/drivers/infiniband/hw/mana/ah.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +int mana_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = container_of(ibah->device, struct mana_ib_dev, ib_dev); + struct mana_ib_ah *ah = container_of(ibah, struct mana_ib_ah, ibah); + struct rdma_ah_attr *ah_attr = attr->ah_attr; + const struct ib_global_route *grh; + enum rdma_network_type ntype; + + if (ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE || + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return -EINVAL; + + if (udata) + return -EINVAL; + + ah->av = dma_pool_zalloc(mdev->av_pool, GFP_ATOMIC, &ah->dma_handle); + if (!ah->av) + return -ENOMEM; + + grh = rdma_ah_read_grh(ah_attr); + ntype = rdma_gid_attr_network_type(grh->sgid_attr); + + copy_in_reverse(ah->av->dest_mac, ah_attr->roce.dmac, ETH_ALEN); + ah->av->udp_src_port = rdma_flow_label_to_udp_sport(grh->flow_label); + ah->av->hop_limit = grh->hop_limit; + ah->av->dscp = (grh->traffic_class >> 2) & 0x3f; + ah->av->is_ipv6 = (ntype == RDMA_NETWORK_IPV6); + + if (ah->av->is_ipv6) { + copy_in_reverse(ah->av->dest_ip, grh->dgid.raw, 16); + copy_in_reverse(ah->av->src_ip, grh->sgid_attr->gid.raw, 16); + } else { + ah->av->dest_ip[10] = 0xFF; + ah->av->dest_ip[11] = 0xFF; + copy_in_reverse(&ah->av->dest_ip[12], &grh->dgid.raw[12], 4); + copy_in_reverse(&ah->av->src_ip[12], &grh->sgid_attr->gid.raw[12], 4); + } + + return 0; +} + +int mana_ib_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct mana_ib_dev *mdev = container_of(ibah->device, struct mana_ib_dev, ib_dev); + struct mana_ib_ah *ah = container_of(ibah, struct mana_ib_ah, ibah); + + dma_pool_free(mdev->av_pool, ah->av, ah->dma_handle); + + return 0; +} diff --git a/drivers/infiniband/hw/mana/counters.c b/drivers/infiniband/hw/mana/counters.c new file mode 100644 index 000000000000..e533ce21013d --- /dev/null +++ b/drivers/infiniband/hw/mana/counters.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. 
+ */ + +#include "counters.h" + +static const struct rdma_stat_desc mana_ib_port_stats_desc[] = { + [MANA_IB_REQUESTER_TIMEOUT].name = "requester_timeout", + [MANA_IB_REQUESTER_OOS_NAK].name = "requester_oos_nak", + [MANA_IB_REQUESTER_RNR_NAK].name = "requester_rnr_nak", + [MANA_IB_RESPONDER_RNR_NAK].name = "responder_rnr_nak", + [MANA_IB_RESPONDER_OOS].name = "responder_oos", + [MANA_IB_RESPONDER_DUP_REQUEST].name = "responder_dup_request", + [MANA_IB_REQUESTER_IMPLICIT_NAK].name = "requester_implicit_nak", + [MANA_IB_REQUESTER_READRESP_PSN_MISMATCH].name = "requester_readresp_psn_mismatch", + [MANA_IB_NAK_INV_REQ].name = "nak_inv_req", + [MANA_IB_NAK_ACCESS_ERR].name = "nak_access_error", + [MANA_IB_NAK_OPP_ERR].name = "nak_opp_error", + [MANA_IB_NAK_INV_READ].name = "nak_inv_read", + [MANA_IB_RESPONDER_LOCAL_LEN_ERR].name = "responder_local_len_error", + [MANA_IB_REQUESTOR_LOCAL_PROT_ERR].name = "requestor_local_prot_error", + [MANA_IB_RESPONDER_REM_ACCESS_ERR].name = "responder_rem_access_error", + [MANA_IB_RESPONDER_LOCAL_QP_ERR].name = "responder_local_qp_error", + [MANA_IB_RESPONDER_MALFORMED_WQE].name = "responder_malformed_wqe", + [MANA_IB_GENERAL_HW_ERR].name = "general_hw_error", + [MANA_IB_REQUESTER_RNR_NAK_RETRIES_EXCEEDED].name = "requester_rnr_nak_retries_exceeded", + [MANA_IB_REQUESTER_RETRIES_EXCEEDED].name = "requester_retries_exceeded", + [MANA_IB_TOTAL_FATAL_ERR].name = "total_fatal_error", + [MANA_IB_RECEIVED_CNPS].name = "received_cnps", + [MANA_IB_NUM_QPS_CONGESTED].name = "num_qps_congested", + [MANA_IB_RATE_INC_EVENTS].name = "rate_inc_events", + [MANA_IB_NUM_QPS_RECOVERED].name = "num_qps_recovered", + [MANA_IB_CURRENT_RATE].name = "current_rate", +}; + +struct rdma_hw_stats *mana_ib_alloc_hw_port_stats(struct ib_device *ibdev, + u32 port_num) +{ + return rdma_alloc_hw_stats_struct(mana_ib_port_stats_desc, + ARRAY_SIZE(mana_ib_port_stats_desc), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +int mana_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u32 port_num, int index) +{ + struct mana_ib_dev *mdev = container_of(ibdev, struct mana_ib_dev, + ib_dev); + struct mana_rnic_query_vf_cntrs_resp resp = {}; + struct mana_rnic_query_vf_cntrs_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_QUERY_VF_COUNTERS, + sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + + err = mana_gd_send_request(mdev_to_gc(mdev), sizeof(req), &req, + sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to query vf counters err %d", + err); + return err; + } + + stats->value[MANA_IB_REQUESTER_TIMEOUT] = resp.requester_timeout; + stats->value[MANA_IB_REQUESTER_OOS_NAK] = resp.requester_oos_nak; + stats->value[MANA_IB_REQUESTER_RNR_NAK] = resp.requester_rnr_nak; + stats->value[MANA_IB_RESPONDER_RNR_NAK] = resp.responder_rnr_nak; + stats->value[MANA_IB_RESPONDER_OOS] = resp.responder_oos; + stats->value[MANA_IB_RESPONDER_DUP_REQUEST] = resp.responder_dup_request; + stats->value[MANA_IB_REQUESTER_IMPLICIT_NAK] = + resp.requester_implicit_nak; + stats->value[MANA_IB_REQUESTER_READRESP_PSN_MISMATCH] = + resp.requester_readresp_psn_mismatch; + stats->value[MANA_IB_NAK_INV_REQ] = resp.nak_inv_req; + stats->value[MANA_IB_NAK_ACCESS_ERR] = resp.nak_access_err; + stats->value[MANA_IB_NAK_OPP_ERR] = resp.nak_opp_err; + stats->value[MANA_IB_NAK_INV_READ] = resp.nak_inv_read; + stats->value[MANA_IB_RESPONDER_LOCAL_LEN_ERR] = + resp.responder_local_len_err; + 
stats->value[MANA_IB_REQUESTOR_LOCAL_PROT_ERR] = + resp.requestor_local_prot_err; + stats->value[MANA_IB_RESPONDER_REM_ACCESS_ERR] = + resp.responder_rem_access_err; + stats->value[MANA_IB_RESPONDER_LOCAL_QP_ERR] = + resp.responder_local_qp_err; + stats->value[MANA_IB_RESPONDER_MALFORMED_WQE] = + resp.responder_malformed_wqe; + stats->value[MANA_IB_GENERAL_HW_ERR] = resp.general_hw_err; + stats->value[MANA_IB_REQUESTER_RNR_NAK_RETRIES_EXCEEDED] = + resp.requester_rnr_nak_retries_exceeded; + stats->value[MANA_IB_REQUESTER_RETRIES_EXCEEDED] = + resp.requester_retries_exceeded; + stats->value[MANA_IB_TOTAL_FATAL_ERR] = resp.total_fatal_err; + + stats->value[MANA_IB_RECEIVED_CNPS] = resp.received_cnps; + stats->value[MANA_IB_NUM_QPS_CONGESTED] = resp.num_qps_congested; + stats->value[MANA_IB_RATE_INC_EVENTS] = resp.rate_inc_events; + stats->value[MANA_IB_NUM_QPS_RECOVERED] = resp.num_qps_recovered; + stats->value[MANA_IB_CURRENT_RATE] = resp.current_rate; + + return ARRAY_SIZE(mana_ib_port_stats_desc); +} diff --git a/drivers/infiniband/hw/mana/counters.h b/drivers/infiniband/hw/mana/counters.h new file mode 100644 index 000000000000..7ff92d27f6c3 --- /dev/null +++ b/drivers/infiniband/hw/mana/counters.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024 Microsoft Corporation. All rights reserved. + */ + +#ifndef _COUNTERS_H_ +#define _COUNTERS_H_ + +#include "mana_ib.h" + +enum mana_ib_port_counters { + MANA_IB_REQUESTER_TIMEOUT, + MANA_IB_REQUESTER_OOS_NAK, + MANA_IB_REQUESTER_RNR_NAK, + MANA_IB_RESPONDER_RNR_NAK, + MANA_IB_RESPONDER_OOS, + MANA_IB_RESPONDER_DUP_REQUEST, + MANA_IB_REQUESTER_IMPLICIT_NAK, + MANA_IB_REQUESTER_READRESP_PSN_MISMATCH, + MANA_IB_NAK_INV_REQ, + MANA_IB_NAK_ACCESS_ERR, + MANA_IB_NAK_OPP_ERR, + MANA_IB_NAK_INV_READ, + MANA_IB_RESPONDER_LOCAL_LEN_ERR, + MANA_IB_REQUESTOR_LOCAL_PROT_ERR, + MANA_IB_RESPONDER_REM_ACCESS_ERR, + MANA_IB_RESPONDER_LOCAL_QP_ERR, + MANA_IB_RESPONDER_MALFORMED_WQE, + MANA_IB_GENERAL_HW_ERR, + MANA_IB_REQUESTER_RNR_NAK_RETRIES_EXCEEDED, + MANA_IB_REQUESTER_RETRIES_EXCEEDED, + MANA_IB_TOTAL_FATAL_ERR, + MANA_IB_RECEIVED_CNPS, + MANA_IB_NUM_QPS_CONGESTED, + MANA_IB_RATE_INC_EVENTS, + MANA_IB_NUM_QPS_RECOVERED, + MANA_IB_CURRENT_RATE, +}; + +struct rdma_hw_stats *mana_ib_alloc_hw_port_stats(struct ib_device *ibdev, + u32 port_num); +int mana_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u32 port_num, int index); +#endif /* _COUNTERS_H_ */ diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index f04a679d2871..0fc4e2679218 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -15,42 +15,58 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_device *ibdev = ibcq->device; struct mana_ib_create_cq ucmd = {}; struct mana_ib_dev *mdev; + struct gdma_context *gc; bool is_rnic_cq; u32 doorbell; + u32 buf_size; int err; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + gc = mdev_to_gc(mdev); cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors; cq->cq_handle = INVALID_MANA_HANDLE; - if (udata->inlen < offsetof(struct mana_ib_create_cq, flags)) - return -EINVAL; + if (udata) { + if (udata->inlen < offsetof(struct mana_ib_create_cq, flags)) + return -EINVAL; - err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); - if (err) { - ibdev_dbg(ibdev, - "Failed to copy from udata for create cq, %d\n", err); - return err; - } + err = ib_copy_from_udata(&ucmd, udata, 
min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(ibdev, "Failed to copy from udata for create cq, %d\n", err); + return err; + } - is_rnic_cq = !!(ucmd.flags & MANA_IB_CREATE_RNIC_CQ); + is_rnic_cq = !!(ucmd.flags & MANA_IB_CREATE_RNIC_CQ); - if (!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) { - ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); - return -EINVAL; - } + if ((!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) || + attr->cqe > U32_MAX / COMP_ENTRY_SIZE) { + ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); + return -EINVAL; + } - cq->cqe = attr->cqe; - err = mana_ib_create_queue(mdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE, &cq->queue); - if (err) { - ibdev_dbg(ibdev, "Failed to create queue for create cq, %d\n", err); - return err; - } + cq->cqe = attr->cqe; + err = mana_ib_create_queue(mdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE, + &cq->queue); + if (err) { + ibdev_dbg(ibdev, "Failed to create queue for create cq, %d\n", err); + return err; + } - mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, - ibucontext); - doorbell = mana_ucontext->doorbell; + mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, + ibucontext); + doorbell = mana_ucontext->doorbell; + } else { + is_rnic_cq = true; + buf_size = MANA_PAGE_ALIGN(roundup_pow_of_two(attr->cqe * COMP_ENTRY_SIZE)); + cq->cqe = buf_size / COMP_ENTRY_SIZE; + err = mana_ib_create_kernel_queue(mdev, buf_size, GDMA_CQ, &cq->queue); + if (err) { + ibdev_dbg(ibdev, "Failed to create kernel queue for create cq, %d\n", err); + return err; + } + doorbell = gc->mana_ib.doorbell; + } if (is_rnic_cq) { err = mana_ib_gd_create_cq(mdev, cq, doorbell); @@ -66,13 +82,19 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, } } - resp.cqid = cq->queue.id; - err = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); - if (err) { - ibdev_dbg(&mdev->ib_dev, "Failed to copy to udata, %d\n", err); - goto err_remove_cq_cb; + if (udata) { + resp.cqid = cq->queue.id; + err = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to copy to udata, %d\n", err); + goto err_remove_cq_cb; + } } + spin_lock_init(&cq->cq_lock); + INIT_LIST_HEAD(&cq->list_send_qp); + INIT_LIST_HEAD(&cq->list_recv_qp); + return 0; err_remove_cq_cb: @@ -122,7 +144,10 @@ int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) return -EINVAL; /* Create CQ table entry */ WARN_ON(gc->cq_table[cq->queue.id]); - gdma_cq = kzalloc(sizeof(*gdma_cq), GFP_KERNEL); + if (cq->queue.kmem) + gdma_cq = cq->queue.kmem; + else + gdma_cq = kzalloc(sizeof(*gdma_cq), GFP_KERNEL); if (!gdma_cq) return -ENOMEM; @@ -141,6 +166,153 @@ void mana_ib_remove_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) if (cq->queue.id >= gc->max_num_cqs || cq->queue.id == INVALID_QUEUE_ID) return; + if (cq->queue.kmem) + /* Then it will be cleaned and removed by the mana */ + return; + kfree(gc->cq_table[cq->queue.id]); gc->cq_table[cq->queue.id] = NULL; } + +int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct gdma_queue *gdma_cq = cq->queue.kmem; + + if (!gdma_cq) + return -EINVAL; + + mana_gd_ring_cq(gdma_cq, SET_ARM_BIT); + return 0; +} + +static inline void handle_ud_sq_cqe(struct mana_ib_qp *qp, struct gdma_comp *cqe) +{ + struct mana_rdma_cqe *rdma_cqe = (struct mana_rdma_cqe *)cqe->cqe_data; + struct 
gdma_queue *wq = qp->ud_qp.queues[MANA_UD_SEND_QUEUE].kmem; + struct ud_sq_shadow_wqe *shadow_wqe; + + shadow_wqe = shadow_queue_get_next_to_complete(&qp->shadow_sq); + if (!shadow_wqe) + return; + + shadow_wqe->header.error_code = rdma_cqe->ud_send.vendor_error; + + wq->tail += shadow_wqe->header.posted_wqe_size; + shadow_queue_advance_next_to_complete(&qp->shadow_sq); +} + +static inline void handle_ud_rq_cqe(struct mana_ib_qp *qp, struct gdma_comp *cqe) +{ + struct mana_rdma_cqe *rdma_cqe = (struct mana_rdma_cqe *)cqe->cqe_data; + struct gdma_queue *wq = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].kmem; + struct ud_rq_shadow_wqe *shadow_wqe; + + shadow_wqe = shadow_queue_get_next_to_complete(&qp->shadow_rq); + if (!shadow_wqe) + return; + + shadow_wqe->byte_len = rdma_cqe->ud_recv.msg_len; + shadow_wqe->src_qpn = rdma_cqe->ud_recv.src_qpn; + shadow_wqe->header.error_code = IB_WC_SUCCESS; + + wq->tail += shadow_wqe->header.posted_wqe_size; + shadow_queue_advance_next_to_complete(&qp->shadow_rq); +} + +static void mana_handle_cqe(struct mana_ib_dev *mdev, struct gdma_comp *cqe) +{ + struct mana_ib_qp *qp = mana_get_qp_ref(mdev, cqe->wq_num, cqe->is_sq); + + if (!qp) + return; + + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) { + if (cqe->is_sq) + handle_ud_sq_cqe(qp, cqe); + else + handle_ud_rq_cqe(qp, cqe); + } + + mana_put_qp_ref(qp); +} + +static void fill_verbs_from_shadow_wqe(struct mana_ib_qp *qp, struct ib_wc *wc, + const struct shadow_wqe_header *shadow_wqe) +{ + const struct ud_rq_shadow_wqe *ud_wqe = (const struct ud_rq_shadow_wqe *)shadow_wqe; + + wc->wr_id = shadow_wqe->wr_id; + wc->status = shadow_wqe->error_code; + wc->opcode = shadow_wqe->opcode; + wc->vendor_err = shadow_wqe->error_code; + wc->wc_flags = 0; + wc->qp = &qp->ibqp; + wc->pkey_index = 0; + + if (shadow_wqe->opcode == IB_WC_RECV) { + wc->byte_len = ud_wqe->byte_len; + wc->src_qp = ud_wqe->src_qpn; + wc->wc_flags |= IB_WC_GRH; + } +} + +static int mana_process_completions(struct mana_ib_cq *cq, int nwc, struct ib_wc *wc) +{ + struct shadow_wqe_header *shadow_wqe; + struct mana_ib_qp *qp; + int wc_index = 0; + + /* process send shadow queue completions */ + list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) { + while ((shadow_wqe = shadow_queue_get_next_to_consume(&qp->shadow_sq)) + != NULL) { + if (wc_index >= nwc) + goto out; + + fill_verbs_from_shadow_wqe(qp, &wc[wc_index], shadow_wqe); + shadow_queue_advance_consumer(&qp->shadow_sq); + wc_index++; + } + } + + /* process recv shadow queue completions */ + list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) { + while ((shadow_wqe = shadow_queue_get_next_to_consume(&qp->shadow_rq)) + != NULL) { + if (wc_index >= nwc) + goto out; + + fill_verbs_from_shadow_wqe(qp, &wc[wc_index], shadow_wqe); + shadow_queue_advance_consumer(&qp->shadow_rq); + wc_index++; + } + } + +out: + return wc_index; +} + +int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct mana_ib_dev *mdev = container_of(ibcq->device, struct mana_ib_dev, ib_dev); + struct gdma_queue *queue = cq->queue.kmem; + struct gdma_comp gdma_cqe; + unsigned long flags; + int num_polled = 0; + int comp_read, i; + + spin_lock_irqsave(&cq->cq_lock, flags); + for (i = 0; i < num_entries; i++) { + comp_read = mana_gd_poll_cq(queue, &gdma_cqe, 1); + if (comp_read < 1) + break; + mana_handle_cqe(mdev, &gdma_cqe); + } + + num_polled = mana_process_completions(cq, num_entries, wc); + 
spin_unlock_irqrestore(&cq->cq_lock, flags); + + return num_polled; +} diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 3416a85f8738..b31089320aa5 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -19,6 +19,7 @@ static const struct ib_device_ops mana_ib_dev_ops = { .add_gid = mana_ib_gd_add_gid, .alloc_pd = mana_ib_alloc_pd, .alloc_ucontext = mana_ib_alloc_ucontext, + .create_ah = mana_ib_create_ah, .create_cq = mana_ib_create_cq, .create_qp = mana_ib_create_qp, .create_rwq_ind_table = mana_ib_create_rwq_ind_table, @@ -27,22 +28,30 @@ static const struct ib_device_ops mana_ib_dev_ops = { .dealloc_ucontext = mana_ib_dealloc_ucontext, .del_gid = mana_ib_gd_del_gid, .dereg_mr = mana_ib_dereg_mr, + .destroy_ah = mana_ib_destroy_ah, .destroy_cq = mana_ib_destroy_cq, .destroy_qp = mana_ib_destroy_qp, .destroy_rwq_ind_table = mana_ib_destroy_rwq_ind_table, .destroy_wq = mana_ib_destroy_wq, .disassociate_ucontext = mana_ib_disassociate_ucontext, + .get_dma_mr = mana_ib_get_dma_mr, .get_link_layer = mana_ib_get_link_layer, .get_port_immutable = mana_ib_get_port_immutable, .mmap = mana_ib_mmap, .modify_qp = mana_ib_modify_qp, .modify_wq = mana_ib_modify_wq, + .poll_cq = mana_ib_poll_cq, + .post_recv = mana_ib_post_recv, + .post_send = mana_ib_post_send, .query_device = mana_ib_query_device, .query_gid = mana_ib_query_gid, .query_pkey = mana_ib_query_pkey, .query_port = mana_ib_query_port, .reg_user_mr = mana_ib_reg_user_mr, + .reg_user_mr_dmabuf = mana_ib_reg_user_mr_dmabuf, + .req_notify_cq = mana_ib_arm_cq, + INIT_RDMA_OBJ_SIZE(ib_ah, mana_ib_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, mana_ib_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, mana_ib_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_qp, mana_ib_qp, ibqp), @@ -51,6 +60,43 @@ static const struct ib_device_ops mana_ib_dev_ops = { ib_ind_table), }; +static const struct ib_device_ops mana_ib_stats_ops = { + .alloc_hw_port_stats = mana_ib_alloc_hw_port_stats, + .get_hw_stats = mana_ib_get_hw_stats, +}; + +static int mana_ib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct mana_ib_dev *dev = container_of(this, struct mana_ib_dev, nb); + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + struct gdma_context *gc = dev->gdma_dev->gdma_context; + struct mana_context *mc = gc->mana.driver_data; + struct net_device *ndev; + + /* Only process events from our parent device */ + if (event_dev != mc->ports[0]) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_CHANGEUPPER: + ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker); + /* + * RDMA core will setup GID based on updated netdev. + * It's not possible to race with the core as rtnl lock is being + * held. 
+ */ + ib_device_set_netdev(&dev->ib_dev, ndev, 1); + + /* mana_get_primary_netdev() returns ndev with refcount held */ + netdev_put(ndev, &dev->dev_tracker); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + static int mana_ib_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { @@ -84,10 +130,8 @@ static int mana_ib_probe(struct auxiliary_device *adev, dev->ib_dev.num_comp_vectors = mdev->gdma_context->max_num_queues; dev->ib_dev.dev.parent = mdev->gdma_context->dev; - rcu_read_lock(); /* required to get primary netdev */ - ndev = mana_get_primary_netdev_rcu(mc, 0); + ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker); if (!ndev) { - rcu_read_unlock(); ret = -ENODEV; ibdev_err(&dev->ib_dev, "Failed to get netdev for IB port 1"); goto free_ib_device; @@ -95,7 +139,8 @@ static int mana_ib_probe(struct auxiliary_device *adev, ether_addr_copy(mac_addr, ndev->dev_addr); addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr); ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1); - rcu_read_unlock(); + /* mana_get_primary_netdev() returns ndev with refcount held */ + netdev_put(ndev, &dev->dev_tracker); if (ret) { ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret); goto free_ib_device; @@ -109,17 +154,27 @@ static int mana_ib_probe(struct auxiliary_device *adev, } dev->gdma_dev = &mdev->gdma_context->mana_ib; + dev->nb.notifier_call = mana_ib_netdev_event; + ret = register_netdevice_notifier(&dev->nb); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to register net notifier, %d", + ret); + goto deregister_device; + } + ret = mana_ib_gd_query_adapter_caps(dev); if (ret) { ibdev_err(&dev->ib_dev, "Failed to query device caps, ret %d", ret); - goto deregister_device; + goto deregister_net_notifier; } + ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops); + ret = mana_ib_create_eqs(dev); if (ret) { ibdev_err(&dev->ib_dev, "Failed to create EQs, ret %d", ret); - goto deregister_device; + goto deregister_net_notifier; } ret = mana_ib_gd_create_rnic_adapter(dev); @@ -134,20 +189,31 @@ static int mana_ib_probe(struct auxiliary_device *adev, goto destroy_rnic; } + dev->av_pool = dma_pool_create("mana_ib_av", mdev->gdma_context->dev, + MANA_AV_BUFFER_SIZE, MANA_AV_BUFFER_SIZE, 0); + if (!dev->av_pool) { + ret = -ENOMEM; + goto destroy_rnic; + } + ret = ib_register_device(&dev->ib_dev, "mana_%d", mdev->gdma_context->dev); if (ret) - goto destroy_rnic; + goto deallocate_pool; dev_set_drvdata(&adev->dev, dev); return 0; +deallocate_pool: + dma_pool_destroy(dev->av_pool); destroy_rnic: xa_destroy(&dev->qp_table_wq); mana_ib_gd_destroy_rnic_adapter(dev); destroy_eqs: mana_ib_destroy_eqs(dev); +deregister_net_notifier: + unregister_netdevice_notifier(&dev->nb); deregister_device: mana_gd_deregister_device(dev->gdma_dev); free_ib_device: @@ -160,9 +226,11 @@ static void mana_ib_remove(struct auxiliary_device *adev) struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); ib_unregister_device(&dev->ib_dev); + dma_pool_destroy(dev->av_pool); xa_destroy(&dev->qp_table_wq); mana_ib_gd_destroy_rnic_adapter(dev); mana_ib_destroy_eqs(dev); + unregister_netdevice_notifier(&dev->nb); mana_gd_deregister_device(dev->gdma_dev); ib_dealloc_device(&dev->ib_dev); } diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 457cea6d9909..eda9c5b971de 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -82,6 +82,9 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) 
mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req), sizeof(resp)); + if (!udata) + flags |= GDMA_PD_FLAG_ALLOW_GPA_MR; + req.flags = flags; err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); @@ -237,6 +240,27 @@ void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) ibdev_dbg(ibdev, "Failed to destroy doorbell page %d\n", ret); } +int mana_ib_create_kernel_queue(struct mana_ib_dev *mdev, u32 size, enum gdma_queue_type type, + struct mana_ib_queue *queue) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct gdma_queue_spec spec = {}; + int err; + + queue->id = INVALID_QUEUE_ID; + queue->gdma_region = GDMA_INVALID_DMA_REGION; + spec.type = type; + spec.monitor_avl_buf = false; + spec.queue_size = size; + err = mana_gd_create_mana_wq_cq(&gc->mana_ib, &spec, &queue->kmem); + if (err) + return err; + /* take ownership into mana_ib from mana */ + queue->gdma_region = queue->kmem->mem_info.dma_region_handle; + queue->kmem->mem_info.dma_region_handle = GDMA_INVALID_DMA_REGION; + return 0; +} + int mana_ib_create_queue(struct mana_ib_dev *mdev, u64 addr, u32 size, struct mana_ib_queue *queue) { @@ -276,6 +300,8 @@ void mana_ib_destroy_queue(struct mana_ib_dev *mdev, struct mana_ib_queue *queue */ mana_ib_gd_destroy_dma_region(mdev, queue->gdma_region); ib_umem_release(queue->umem); + if (queue->kmem) + mana_gd_destroy_queue(mdev_to_gc(mdev), queue->kmem); } static int @@ -358,7 +384,7 @@ static int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem unsigned int tail = 0; u64 *page_addr_list; void *request_buf; - int err; + int err = 0; gc = mdev_to_gc(dev); hwc = gc->hwc.driver_data; @@ -535,8 +561,10 @@ int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; - if (port_num == 1) + if (port_num == 1) { immutable->core_cap_flags |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + } return 0; } @@ -595,8 +623,11 @@ int mana_ib_query_port(struct ib_device *ibdev, u32 port, props->active_width = IB_WIDTH_4X; props->active_speed = IB_SPEED_EDR; props->pkey_tbl_len = 1; - if (port == 1) + if (port == 1) { props->gid_tbl_len = 16; + props->port_cap_flags = IB_PORT_CM_SUP; + props->ip_gids = true; + } return 0; } @@ -634,7 +665,7 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *dev) mana_gd_init_req_hdr(&req.hdr, MANA_IB_GET_ADAPTER_CAP, sizeof(req), sizeof(resp)); - req.hdr.resp.msg_version = GDMA_MESSAGE_V3; + req.hdr.resp.msg_version = GDMA_MESSAGE_V4; req.hdr.dev_id = dev->gdma_dev->dev_id; err = mana_gd_send_request(mdev_to_gc(dev), sizeof(req), @@ -663,6 +694,7 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *dev) caps->max_inline_data_size = resp.max_inline_data_size; caps->max_send_sge_count = resp.max_send_sge_count; caps->max_recv_sge_count = resp.max_recv_sge_count; + caps->feature_flags = resp.feature_flags; return 0; } @@ -678,7 +710,7 @@ mana_ib_event_handler(void *ctx, struct gdma_queue *q, struct gdma_event *event) switch (event->type) { case GDMA_EQE_RNIC_QP_FATAL: qpn = event->details[0]; - qp = mana_get_qp_ref(mdev, qpn); + qp = mana_get_qp_ref(mdev, qpn, false); if (!qp) break; if (qp->ibqp.event_handler) { @@ -762,6 +794,9 @@ int mana_ib_gd_create_rnic_adapter(struct mana_ib_dev *mdev) req.hdr.dev_id = gc->mana_ib.dev_id; req.notify_eq_id = mdev->fatal_err_eq->id; + if (mdev->adapter_caps.feature_flags & 
MANA_IB_FEATURE_CLIENT_ERROR_CQE_SUPPORT) + req.feature_flags |= MANA_IB_FEATURE_CLIENT_ERROR_CQE_REQUEST; + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err) { ibdev_err(&mdev->ib_dev, "Failed to create RNIC adapter err %d", err); @@ -987,3 +1022,61 @@ int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) } return 0; } + +int mana_ib_gd_create_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, + struct ib_qp_init_attr *attr, u32 doorbell, u32 type) +{ + struct mana_ib_cq *send_cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_cq *recv_cq = container_of(qp->ibqp.recv_cq, struct mana_ib_cq, ibcq); + struct mana_ib_pd *pd = container_of(qp->ibqp.pd, struct mana_ib_pd, ibpd); + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_create_udqp_resp resp = {}; + struct mana_rnic_create_udqp_req req = {}; + int err, i; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_UD_QP, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.pd_handle = pd->pd_handle; + req.send_cq_handle = send_cq->cq_handle; + req.recv_cq_handle = recv_cq->cq_handle; + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; i++) + req.dma_region[i] = qp->ud_qp.queues[i].gdma_region; + req.doorbell_page = doorbell; + req.max_send_wr = attr->cap.max_send_wr; + req.max_recv_wr = attr->cap.max_recv_wr; + req.max_send_sge = attr->cap.max_send_sge; + req.max_recv_sge = attr->cap.max_recv_sge; + req.qp_type = type; + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create ud qp err %d", err); + return err; + } + qp->qp_handle = resp.qp_handle; + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; i++) { + qp->ud_qp.queues[i].id = resp.queue_ids[i]; + /* The GDMA regions are now owned by the RNIC QP handle */ + qp->ud_qp.queues[i].gdma_region = GDMA_INVALID_DMA_REGION; + } + return 0; +} + +int mana_ib_gd_destroy_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + struct mana_rnic_destroy_udqp_resp resp = {0}; + struct mana_rnic_destroy_udqp_req req = {0}; + struct gdma_context *gc = mdev_to_gc(mdev); + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_UD_QP, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.qp_handle = qp->qp_handle; + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to destroy ud qp err %d", err); + return err; + } + return 0; +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index b53a5b4de908..6903946677e5 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -11,8 +11,11 @@ #include <rdma/ib_umem.h> #include <rdma/mana-abi.h> #include <rdma/uverbs_ioctl.h> +#include <linux/dmapool.h> #include <net/mana/mana.h> +#include "shadow_queue.h" +#include "counters.h" #define PAGE_SZ_BM \ (SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K | \ @@ -21,6 +24,9 @@ /* MANA doesn't have any limit for MR size */ #define MANA_IB_MAX_MR_SIZE U64_MAX +/* Send queue ID mask */ +#define MANA_SENDQ_MASK BIT(31) + /* * The hardware limit of number of MRs is greater than maximum number of MRs * that can possibly represent in 24 bits @@ -32,6 +38,11 @@ */ #define MANA_CA_ACK_DELAY 16 +/* + * The buffer used for writing AV + */ +#define MANA_AV_BUFFER_SIZE 64 + struct mana_ib_adapter_caps { u32 
max_sq_id; u32 max_rq_id; @@ -48,10 +59,12 @@ struct mana_ib_adapter_caps { u32 max_send_sge_count; u32 max_recv_sge_count; u32 max_inline_data_size; + u64 feature_flags; }; struct mana_ib_queue { struct ib_umem *umem; + struct gdma_queue *kmem; u64 gdma_region; u64 id; }; @@ -64,6 +77,9 @@ struct mana_ib_dev { struct gdma_queue **eqs; struct xarray qp_table_wq; struct mana_ib_adapter_caps adapter_caps; + struct dma_pool *av_pool; + netdevice_tracker dev_tracker; + struct notifier_block nb; }; struct mana_ib_wq { @@ -87,6 +103,25 @@ struct mana_ib_pd { u32 tx_vp_offset; }; +struct mana_ib_av { + u8 dest_ip[16]; + u8 dest_mac[ETH_ALEN]; + u16 udp_src_port; + u8 src_ip[16]; + u32 hop_limit : 8; + u32 reserved1 : 12; + u32 dscp : 6; + u32 reserved2 : 5; + u32 is_ipv6 : 1; + u32 reserved3 : 32; +}; + +struct mana_ib_ah { + struct ib_ah ibah; + struct mana_ib_av *av; + dma_addr_t dma_handle; +}; + struct mana_ib_mr { struct ib_mr ibmr; struct ib_umem *umem; @@ -96,6 +131,10 @@ struct mana_ib_mr { struct mana_ib_cq { struct ib_cq ibcq; struct mana_ib_queue queue; + /* protects CQ polling */ + spinlock_t cq_lock; + struct list_head list_send_qp; + struct list_head list_recv_qp; int cqe; u32 comp_vector; mana_handle_t cq_handle; @@ -114,6 +153,17 @@ struct mana_ib_rc_qp { struct mana_ib_queue queues[MANA_RC_QUEUE_TYPE_MAX]; }; +enum mana_ud_queue_type { + MANA_UD_SEND_QUEUE = 0, + MANA_UD_RECV_QUEUE, + MANA_UD_QUEUE_TYPE_MAX, +}; + +struct mana_ib_ud_qp { + struct mana_ib_queue queues[MANA_UD_QUEUE_TYPE_MAX]; + u32 sq_psn; +}; + struct mana_ib_qp { struct ib_qp ibqp; @@ -121,11 +171,17 @@ struct mana_ib_qp { union { struct mana_ib_queue raw_sq; struct mana_ib_rc_qp rc_qp; + struct mana_ib_ud_qp ud_qp; }; /* The port on the IB device, starting with 1 */ u32 port; + struct list_head cq_send_list; + struct list_head cq_recv_list; + struct shadow_queue shadow_rq; + struct shadow_queue shadow_sq; + refcount_t refcount; struct completion free; }; @@ -145,17 +201,24 @@ enum mana_ib_command_code { MANA_IB_DESTROY_ADAPTER = 0x30003, MANA_IB_CONFIG_IP_ADDR = 0x30004, MANA_IB_CONFIG_MAC_ADDR = 0x30005, + MANA_IB_CREATE_UD_QP = 0x30006, + MANA_IB_DESTROY_UD_QP = 0x30007, MANA_IB_CREATE_CQ = 0x30008, MANA_IB_DESTROY_CQ = 0x30009, MANA_IB_CREATE_RC_QP = 0x3000a, MANA_IB_DESTROY_RC_QP = 0x3000b, MANA_IB_SET_QP_STATE = 0x3000d, + MANA_IB_QUERY_VF_COUNTERS = 0x30022, }; struct mana_ib_query_adapter_caps_req { struct gdma_req_hdr hdr; }; /*HW Data */ +enum mana_ib_adapter_features { + MANA_IB_FEATURE_CLIENT_ERROR_CQE_SUPPORT = BIT(4), +}; + struct mana_ib_query_adapter_caps_resp { struct gdma_resp_hdr hdr; u32 max_sq_id; @@ -176,8 +239,13 @@ struct mana_ib_query_adapter_caps_resp { u32 max_send_sge_count; u32 max_recv_sge_count; u32 max_inline_data_size; + u64 feature_flags; }; /* HW Data */ +enum mana_ib_adapter_features_request { + MANA_IB_FEATURE_CLIENT_ERROR_CQE_REQUEST = BIT(1), +}; /*HW Data */ + struct mana_rnic_create_adapter_req { struct gdma_req_hdr hdr; u32 notify_eq_id; @@ -296,6 +364,37 @@ struct mana_rnic_destroy_rc_qp_resp { struct gdma_resp_hdr hdr; }; /* HW Data */ +struct mana_rnic_create_udqp_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t pd_handle; + mana_handle_t send_cq_handle; + mana_handle_t recv_cq_handle; + u64 dma_region[MANA_UD_QUEUE_TYPE_MAX]; + u32 qp_type; + u32 doorbell_page; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; +}; /* HW Data */ + +struct mana_rnic_create_udqp_resp { + struct gdma_resp_hdr hdr; + mana_handle_t 
qp_handle; + u32 queue_ids[MANA_UD_QUEUE_TYPE_MAX]; +}; /* HW Data*/ + +struct mana_rnic_destroy_udqp_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t qp_handle; +}; /* HW Data */ + +struct mana_rnic_destroy_udqp_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + struct mana_ib_ah_attr { u8 src_addr[16]; u8 dest_addr[16]; @@ -332,17 +431,104 @@ struct mana_rnic_set_qp_state_resp { struct gdma_resp_hdr hdr; }; /* HW Data */ +enum WQE_OPCODE_TYPES { + WQE_TYPE_UD_SEND = 0, + WQE_TYPE_UD_RECV = 8, +}; /* HW DATA */ + +struct rdma_send_oob { + u32 wqe_type : 5; + u32 fence : 1; + u32 signaled : 1; + u32 solicited : 1; + u32 psn : 24; + + u32 ssn_or_rqpn : 24; + u32 reserved1 : 8; + union { + struct { + u32 remote_qkey; + u32 immediate; + u32 reserved1; + u32 reserved2; + } ud_send; + }; +}; /* HW DATA */ + +struct mana_rdma_cqe { + union { + struct { + u8 cqe_type; + u8 data[GDMA_COMP_DATA_SIZE - 1]; + }; + struct { + u32 cqe_type : 8; + u32 vendor_error : 9; + u32 reserved1 : 15; + u32 sge_offset : 5; + u32 tx_wqe_offset : 27; + } ud_send; + struct { + u32 cqe_type : 8; + u32 reserved1 : 24; + u32 msg_len; + u32 src_qpn : 24; + u32 reserved2 : 8; + u32 imm_data; + u32 rx_wqe_offset; + } ud_recv; + }; +}; /* HW DATA */ + +struct mana_rnic_query_vf_cntrs_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; +}; /* HW Data */ + +struct mana_rnic_query_vf_cntrs_resp { + struct gdma_resp_hdr hdr; + u64 requester_timeout; + u64 requester_oos_nak; + u64 requester_rnr_nak; + u64 responder_rnr_nak; + u64 responder_oos; + u64 responder_dup_request; + u64 requester_implicit_nak; + u64 requester_readresp_psn_mismatch; + u64 nak_inv_req; + u64 nak_access_err; + u64 nak_opp_err; + u64 nak_inv_read; + u64 responder_local_len_err; + u64 requestor_local_prot_err; + u64 responder_rem_access_err; + u64 responder_local_qp_err; + u64 responder_malformed_wqe; + u64 general_hw_err; + u64 requester_rnr_nak_retries_exceeded; + u64 requester_retries_exceeded; + u64 total_fatal_err; + u64 received_cnps; + u64 num_qps_congested; + u64 rate_inc_events; + u64 num_qps_recovered; + u64 current_rate; +}; /* HW Data */ + static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) { return mdev->gdma_dev->gdma_context; } static inline struct mana_ib_qp *mana_get_qp_ref(struct mana_ib_dev *mdev, - uint32_t qid) + u32 qid, bool is_sq) { struct mana_ib_qp *qp; unsigned long flag; + if (is_sq) + qid |= MANA_SENDQ_MASK; + xa_lock_irqsave(&mdev->qp_table_wq, flag); qp = xa_load(&mdev->qp_table_wq, qid); if (qp) @@ -388,6 +574,8 @@ int mana_ib_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, mana_handle_t gdma_region); +int mana_ib_create_kernel_queue(struct mana_ib_dev *mdev, u32 size, enum gdma_queue_type type, + struct mana_ib_queue *queue); int mana_ib_create_queue(struct mana_ib_dev *mdev, u64 addr, u32 size, struct mana_ib_queue *queue); void mana_ib_destroy_queue(struct mana_ib_dev *mdev, struct mana_ib_queue *queue); @@ -480,4 +668,24 @@ int mana_ib_gd_destroy_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq); int mana_ib_gd_create_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, struct ib_qp_init_attr *attr, u32 doorbell, u64 flags); int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp); + +int mana_ib_gd_create_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, + struct ib_qp_init_attr *attr, u32 doorbell, u32 type); +int mana_ib_gd_destroy_ud_qp(struct mana_ib_dev 
*mdev, struct mana_ib_qp *qp); + +int mana_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata); +int mana_ib_destroy_ah(struct ib_ah *ah, u32 flags); + +int mana_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int mana_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); + +int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); + +struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, + u64 iova, int fd, int mr_access_flags, + struct uverbs_attr_bundle *attrs); #endif diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index 887b09dd86e7..f99557ec7767 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -8,6 +8,8 @@ #define VALID_MR_FLAGS \ (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ) +#define VALID_DMA_MR_FLAGS (IB_ACCESS_LOCAL_WRITE) + static enum gdma_mr_access_flags mana_ib_verbs_to_gdma_access_flags(int access_flags) { @@ -39,6 +41,8 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, req.mr_type = mr_params->mr_type; switch (mr_params->mr_type) { + case GDMA_MR_TYPE_GPA: + break; case GDMA_MR_TYPE_GVA: req.gva.dma_region_handle = mr_params->gva.dma_region_handle; req.gva.virtual_address = mr_params->gva.virtual_address; @@ -169,6 +173,107 @@ err_free: return ERR_PTR(err); } +struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, + u64 iova, int fd, int access_flags, + struct uverbs_attr_bundle *attrs) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct gdma_create_mr_params mr_params = {}; + struct ib_device *ibdev = ibpd->device; + struct ib_umem_dmabuf *umem_dmabuf; + struct mana_ib_dev *dev; + struct mana_ib_mr *mr; + u64 dma_region_handle; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + access_flags &= ~IB_ACCESS_OPTIONAL; + if (access_flags & ~VALID_MR_FLAGS) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + umem_dmabuf = ib_umem_dmabuf_get_pinned(ibdev, start, length, fd, access_flags); + if (IS_ERR(umem_dmabuf)) { + err = PTR_ERR(umem_dmabuf); + ibdev_dbg(ibdev, "Failed to get dmabuf umem, %d\n", err); + goto err_free; + } + + mr->umem = &umem_dmabuf->umem; + + err = mana_ib_create_dma_region(dev, mr->umem, &dma_region_handle, iova); + if (err) { + ibdev_dbg(ibdev, "Failed create dma region for user-mr, %d\n", + err); + goto err_umem; + } + + mr_params.pd_handle = pd->pd_handle; + mr_params.mr_type = GDMA_MR_TYPE_GVA; + mr_params.gva.dma_region_handle = dma_region_handle; + mr_params.gva.virtual_address = iova; + mr_params.gva.access_flags = + mana_ib_verbs_to_gdma_access_flags(access_flags); + + err = mana_ib_gd_create_mr(dev, mr, &mr_params); + if (err) + goto err_dma_region; + + /* + * There is no need to keep track of dma_region_handle after MR is + * successfully created. The dma_region_handle is tracked in the PF + * as part of the lifecycle of this MR. 
+ */ + + return &mr->ibmr; + +err_dma_region: + mana_gd_destroy_dma_region(mdev_to_gc(dev), dma_region_handle); + +err_umem: + ib_umem_release(mr->umem); + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_mr *mana_ib_get_dma_mr(struct ib_pd *ibpd, int access_flags) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct gdma_create_mr_params mr_params = {}; + struct ib_device *ibdev = ibpd->device; + struct mana_ib_dev *dev; + struct mana_ib_mr *mr; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + if (access_flags & ~VALID_DMA_MR_FLAGS) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr_params.pd_handle = pd->pd_handle; + mr_params.mr_type = GDMA_MR_TYPE_GPA; + + err = mana_ib_gd_create_mr(dev, mr, &mr_params); + if (err) + goto err_free; + + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { struct mana_ib_mr *mr = container_of(ibmr, struct mana_ib_mr, ibmr); diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 73d67c853b6f..c928af58f38b 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -398,18 +398,128 @@ err_free_vport: return err; } +static u32 mana_ib_wqe_size(u32 sge, u32 oob_size) +{ + u32 wqe_size = sge * sizeof(struct gdma_sge) + sizeof(struct gdma_wqe) + oob_size; + + return ALIGN(wqe_size, GDMA_WQE_BU_SIZE); +} + +static u32 mana_ib_queue_size(struct ib_qp_init_attr *attr, u32 queue_type) +{ + u32 queue_size; + + switch (attr->qp_type) { + case IB_QPT_UD: + case IB_QPT_GSI: + if (queue_type == MANA_UD_SEND_QUEUE) + queue_size = attr->cap.max_send_wr * + mana_ib_wqe_size(attr->cap.max_send_sge, INLINE_OOB_LARGE_SIZE); + else + queue_size = attr->cap.max_recv_wr * + mana_ib_wqe_size(attr->cap.max_recv_sge, INLINE_OOB_SMALL_SIZE); + break; + default: + return 0; + } + + return MANA_PAGE_ALIGN(roundup_pow_of_two(queue_size)); +} + +static enum gdma_queue_type mana_ib_queue_type(struct ib_qp_init_attr *attr, u32 queue_type) +{ + enum gdma_queue_type type; + + switch (attr->qp_type) { + case IB_QPT_UD: + case IB_QPT_GSI: + if (queue_type == MANA_UD_SEND_QUEUE) + type = GDMA_SQ; + else + type = GDMA_RQ; + break; + default: + type = GDMA_INVALID_QUEUE; + } + return type; +} + +static int mana_table_store_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + return xa_insert_irq(&mdev->qp_table_wq, qp->ibqp.qp_num, qp, + GFP_KERNEL); +} + +static void mana_table_remove_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + xa_erase_irq(&mdev->qp_table_wq, qp->ibqp.qp_num); +} + +static int mana_table_store_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + u32 qids = qp->ud_qp.queues[MANA_UD_SEND_QUEUE].id | MANA_SENDQ_MASK; + u32 qidr = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].id; + int err; + + err = xa_insert_irq(&mdev->qp_table_wq, qids, qp, GFP_KERNEL); + if (err) + return err; + + err = xa_insert_irq(&mdev->qp_table_wq, qidr, qp, GFP_KERNEL); + if (err) + goto remove_sq; + + return 0; + +remove_sq: + xa_erase_irq(&mdev->qp_table_wq, qids); + return err; +} + +static void mana_table_remove_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + u32 qids = qp->ud_qp.queues[MANA_UD_SEND_QUEUE].id | MANA_SENDQ_MASK; + u32 qidr = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].id; + + xa_erase_irq(&mdev->qp_table_wq, qids); + xa_erase_irq(&mdev->qp_table_wq, qidr); +} + static int 
mana_table_store_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) { refcount_set(&qp->refcount, 1); init_completion(&qp->free); - return xa_insert_irq(&mdev->qp_table_wq, qp->ibqp.qp_num, qp, - GFP_KERNEL); + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + return mana_table_store_rc_qp(mdev, qp); + case IB_QPT_UD: + case IB_QPT_GSI: + return mana_table_store_ud_qp(mdev, qp); + default: + ibdev_dbg(&mdev->ib_dev, "Unknown QP type for storing in mana table, %d\n", + qp->ibqp.qp_type); + } + + return -EINVAL; } static void mana_table_remove_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) { - xa_erase_irq(&mdev->qp_table_wq, qp->ibqp.qp_num); + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + mana_table_remove_rc_qp(mdev, qp); + break; + case IB_QPT_UD: + case IB_QPT_GSI: + mana_table_remove_ud_qp(mdev, qp); + break; + default: + ibdev_dbg(&mdev->ib_dev, "Unknown QP type for removing from mana table, %d\n", + qp->ibqp.qp_type); + return; + } mana_put_qp_ref(qp); wait_for_completion(&qp->free); } @@ -490,6 +600,105 @@ destroy_queues: return err; } +static void mana_add_qp_to_cqs(struct mana_ib_qp *qp) +{ + struct mana_ib_cq *send_cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_cq *recv_cq = container_of(qp->ibqp.recv_cq, struct mana_ib_cq, ibcq); + unsigned long flags; + + spin_lock_irqsave(&send_cq->cq_lock, flags); + list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); + spin_unlock_irqrestore(&send_cq->cq_lock, flags); + + spin_lock_irqsave(&recv_cq->cq_lock, flags); + list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); + spin_unlock_irqrestore(&recv_cq->cq_lock, flags); +} + +static void mana_remove_qp_from_cqs(struct mana_ib_qp *qp) +{ + struct mana_ib_cq *send_cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_cq *recv_cq = container_of(qp->ibqp.recv_cq, struct mana_ib_cq, ibcq); + unsigned long flags; + + spin_lock_irqsave(&send_cq->cq_lock, flags); + list_del(&qp->cq_send_list); + spin_unlock_irqrestore(&send_cq->cq_lock, flags); + + spin_lock_irqsave(&recv_cq->cq_lock, flags); + list_del(&qp->cq_recv_list); + spin_unlock_irqrestore(&recv_cq->cq_lock, flags); +} + +static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, + struct ib_qp_init_attr *attr, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = container_of(ibpd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct gdma_context *gc = mdev_to_gc(mdev); + u32 doorbell, queue_size; + int i, err; + + if (udata) { + ibdev_dbg(&mdev->ib_dev, "User-level UD QPs are not supported\n"); + return -EOPNOTSUPP; + } + + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; ++i) { + queue_size = mana_ib_queue_size(attr, i); + err = mana_ib_create_kernel_queue(mdev, queue_size, mana_ib_queue_type(attr, i), + &qp->ud_qp.queues[i]); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create queue %d, err %d\n", + i, err); + goto destroy_queues; + } + } + doorbell = gc->mana_ib.doorbell; + + err = create_shadow_queue(&qp->shadow_rq, attr->cap.max_recv_wr, + sizeof(struct ud_rq_shadow_wqe)); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create shadow rq err %d\n", err); + goto destroy_queues; + } + err = create_shadow_queue(&qp->shadow_sq, attr->cap.max_send_wr, + sizeof(struct ud_sq_shadow_wqe)); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create shadow sq err %d\n", err); + goto destroy_shadow_queues; + } + + err = mana_ib_gd_create_ud_qp(mdev, qp, attr, doorbell, attr->qp_type); + if (err) 
{ + ibdev_err(&mdev->ib_dev, "Failed to create ud qp %d\n", err); + goto destroy_shadow_queues; + } + qp->ibqp.qp_num = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].id; + qp->port = attr->port_num; + + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; ++i) + qp->ud_qp.queues[i].kmem->id = qp->ud_qp.queues[i].id; + + err = mana_table_store_qp(mdev, qp); + if (err) + goto destroy_qp; + + mana_add_qp_to_cqs(qp); + + return 0; + +destroy_qp: + mana_ib_gd_destroy_ud_qp(mdev, qp); +destroy_shadow_queues: + destroy_shadow_queue(&qp->shadow_rq); + destroy_shadow_queue(&qp->shadow_sq); +destroy_queues: + while (i-- > 0) + mana_ib_destroy_queue(mdev, &qp->ud_qp.queues[i]); + return err; +} + int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, struct ib_udata *udata) { @@ -503,6 +712,9 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, return mana_ib_create_qp_raw(ibqp, ibqp->pd, attr, udata); case IB_QPT_RC: return mana_ib_create_rc_qp(ibqp, ibqp->pd, attr, udata); + case IB_QPT_UD: + case IB_QPT_GSI: + return mana_ib_create_ud_qp(ibqp, ibqp->pd, attr, udata); default: ibdev_dbg(ibqp->device, "Creating QP type %u not supported\n", attr->qp_type); @@ -579,6 +791,8 @@ int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, { switch (ibqp->qp_type) { case IB_QPT_RC: + case IB_QPT_UD: + case IB_QPT_GSI: return mana_ib_gd_modify_qp(ibqp, attr, attr_mask, udata); default: ibdev_dbg(ibqp->device, "Modify QP type %u not supported", ibqp->qp_type); @@ -652,6 +866,28 @@ static int mana_ib_destroy_rc_qp(struct mana_ib_qp *qp, struct ib_udata *udata) return 0; } +static int mana_ib_destroy_ud_qp(struct mana_ib_qp *qp, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + int i; + + mana_remove_qp_from_cqs(qp); + mana_table_remove_qp(mdev, qp); + + destroy_shadow_queue(&qp->shadow_rq); + destroy_shadow_queue(&qp->shadow_sq); + + /* Ignore return code as there is not much we can do about it. + * The error message is printed inside. + */ + mana_ib_gd_destroy_ud_qp(mdev, qp); + for (i = 0; i < MANA_UD_QUEUE_TYPE_MAX; ++i) + mana_ib_destroy_queue(mdev, &qp->ud_qp.queues[i]); + + return 0; +} + int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); @@ -665,6 +901,9 @@ int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) return mana_ib_destroy_qp_raw(qp, udata); case IB_QPT_RC: return mana_ib_destroy_rc_qp(qp, udata); + case IB_QPT_UD: + case IB_QPT_GSI: + return mana_ib_destroy_ud_qp(qp, udata); default: ibdev_dbg(ibqp->device, "Unexpected QP type %u\n", ibqp->qp_type); diff --git a/drivers/infiniband/hw/mana/shadow_queue.h b/drivers/infiniband/hw/mana/shadow_queue.h new file mode 100644 index 000000000000..a4b3818f9c39 --- /dev/null +++ b/drivers/infiniband/hw/mana/shadow_queue.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. 
+ */ + +#ifndef _MANA_SHADOW_QUEUE_H_ +#define _MANA_SHADOW_QUEUE_H_ + +struct shadow_wqe_header { + u16 opcode; + u16 error_code; + u32 posted_wqe_size; + u64 wr_id; +}; + +struct ud_rq_shadow_wqe { + struct shadow_wqe_header header; + u32 byte_len; + u32 src_qpn; +}; + +struct ud_sq_shadow_wqe { + struct shadow_wqe_header header; +}; + +struct shadow_queue { + /* Unmasked producer index, Incremented on wqe posting */ + u64 prod_idx; + /* Unmasked consumer index, Incremented on cq polling */ + u64 cons_idx; + /* Unmasked index of next-to-complete (from HW) shadow WQE */ + u64 next_to_complete_idx; + /* queue size in wqes */ + u32 length; + /* distance between elements in bytes */ + u32 stride; + /* ring buffer holding wqes */ + void *buffer; +}; + +static inline int create_shadow_queue(struct shadow_queue *queue, uint32_t length, uint32_t stride) +{ + queue->buffer = kvmalloc_array(length, stride, GFP_KERNEL); + if (!queue->buffer) + return -ENOMEM; + + queue->length = length; + queue->stride = stride; + + return 0; +} + +static inline void destroy_shadow_queue(struct shadow_queue *queue) +{ + kvfree(queue->buffer); +} + +static inline bool shadow_queue_full(struct shadow_queue *queue) +{ + return (queue->prod_idx - queue->cons_idx) >= queue->length; +} + +static inline bool shadow_queue_empty(struct shadow_queue *queue) +{ + return queue->prod_idx == queue->cons_idx; +} + +static inline void * +shadow_queue_get_element(const struct shadow_queue *queue, u64 unmasked_index) +{ + u32 index = unmasked_index % queue->length; + + return ((u8 *)queue->buffer + index * queue->stride); +} + +static inline void * +shadow_queue_producer_entry(struct shadow_queue *queue) +{ + return shadow_queue_get_element(queue, queue->prod_idx); +} + +static inline void * +shadow_queue_get_next_to_consume(const struct shadow_queue *queue) +{ + if (queue->cons_idx == queue->next_to_complete_idx) + return NULL; + + return shadow_queue_get_element(queue, queue->cons_idx); +} + +static inline void * +shadow_queue_get_next_to_complete(struct shadow_queue *queue) +{ + if (queue->next_to_complete_idx == queue->prod_idx) + return NULL; + + return shadow_queue_get_element(queue, queue->next_to_complete_idx); +} + +static inline void shadow_queue_advance_producer(struct shadow_queue *queue) +{ + queue->prod_idx++; +} + +static inline void shadow_queue_advance_consumer(struct shadow_queue *queue) +{ + queue->cons_idx++; +} + +static inline void shadow_queue_advance_next_to_complete(struct shadow_queue *queue) +{ + queue->next_to_complete_idx++; +} + +#endif diff --git a/drivers/infiniband/hw/mana/wr.c b/drivers/infiniband/hw/mana/wr.c new file mode 100644 index 000000000000..1813567d3b16 --- /dev/null +++ b/drivers/infiniband/hw/mana/wr.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. 
+ */ + +#include "mana_ib.h" + +#define MAX_WR_SGL_NUM (2) + +static int mana_ib_post_recv_ud(struct mana_ib_qp *qp, const struct ib_recv_wr *wr) +{ + struct mana_ib_dev *mdev = container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct gdma_queue *queue = qp->ud_qp.queues[MANA_UD_RECV_QUEUE].kmem; + struct gdma_posted_wqe_info wqe_info = {0}; + struct gdma_sge gdma_sgl[MAX_WR_SGL_NUM]; + struct gdma_wqe_request wqe_req = {0}; + struct ud_rq_shadow_wqe *shadow_wqe; + int err, i; + + if (shadow_queue_full(&qp->shadow_rq)) + return -EINVAL; + + if (wr->num_sge > MAX_WR_SGL_NUM) + return -EINVAL; + + for (i = 0; i < wr->num_sge; ++i) { + gdma_sgl[i].address = wr->sg_list[i].addr; + gdma_sgl[i].mem_key = wr->sg_list[i].lkey; + gdma_sgl[i].size = wr->sg_list[i].length; + } + wqe_req.num_sge = wr->num_sge; + wqe_req.sgl = gdma_sgl; + + err = mana_gd_post_work_request(queue, &wqe_req, &wqe_info); + if (err) + return err; + + shadow_wqe = shadow_queue_producer_entry(&qp->shadow_rq); + memset(shadow_wqe, 0, sizeof(*shadow_wqe)); + shadow_wqe->header.opcode = IB_WC_RECV; + shadow_wqe->header.wr_id = wr->wr_id; + shadow_wqe->header.posted_wqe_size = wqe_info.wqe_size_in_bu; + shadow_queue_advance_producer(&qp->shadow_rq); + + mana_gd_wq_ring_doorbell(mdev_to_gc(mdev), queue); + return 0; +} + +int mana_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + int err = 0; + + for (; wr; wr = wr->next) { + switch (ibqp->qp_type) { + case IB_QPT_UD: + case IB_QPT_GSI: + err = mana_ib_post_recv_ud(qp, wr); + if (unlikely(err)) { + *bad_wr = wr; + return err; + } + break; + default: + ibdev_dbg(ibqp->device, "Posting recv wr on qp type %u is not supported\n", + ibqp->qp_type); + return -EINVAL; + } + } + + return err; +} + +static int mana_ib_post_send_ud(struct mana_ib_qp *qp, const struct ib_ud_wr *wr) +{ + struct mana_ib_dev *mdev = container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct mana_ib_ah *ah = container_of(wr->ah, struct mana_ib_ah, ibah); + struct net_device *ndev = mana_ib_get_netdev(&mdev->ib_dev, qp->port); + struct gdma_queue *queue = qp->ud_qp.queues[MANA_UD_SEND_QUEUE].kmem; + struct gdma_sge gdma_sgl[MAX_WR_SGL_NUM + 1]; + struct gdma_posted_wqe_info wqe_info = {0}; + struct gdma_wqe_request wqe_req = {0}; + struct rdma_send_oob send_oob = {0}; + struct ud_sq_shadow_wqe *shadow_wqe; + int err, i; + + if (!ndev) { + ibdev_dbg(&mdev->ib_dev, "Invalid port %u in QP %u\n", + qp->port, qp->ibqp.qp_num); + return -EINVAL; + } + + if (wr->wr.opcode != IB_WR_SEND) + return -EINVAL; + + if (shadow_queue_full(&qp->shadow_sq)) + return -EINVAL; + + if (wr->wr.num_sge > MAX_WR_SGL_NUM) + return -EINVAL; + + gdma_sgl[0].address = ah->dma_handle; + gdma_sgl[0].mem_key = qp->ibqp.pd->local_dma_lkey; + gdma_sgl[0].size = sizeof(struct mana_ib_av); + for (i = 0; i < wr->wr.num_sge; ++i) { + gdma_sgl[i + 1].address = wr->wr.sg_list[i].addr; + gdma_sgl[i + 1].mem_key = wr->wr.sg_list[i].lkey; + gdma_sgl[i + 1].size = wr->wr.sg_list[i].length; + } + + wqe_req.num_sge = wr->wr.num_sge + 1; + wqe_req.sgl = gdma_sgl; + wqe_req.inline_oob_size = sizeof(struct rdma_send_oob); + wqe_req.inline_oob_data = &send_oob; + wqe_req.flags = GDMA_WR_OOB_IN_SGL; + wqe_req.client_data_unit = ib_mtu_enum_to_int(ib_mtu_int_to_enum(ndev->mtu)); + + send_oob.wqe_type = WQE_TYPE_UD_SEND; + send_oob.fence = !!(wr->wr.send_flags & IB_SEND_FENCE); + send_oob.signaled = !!(wr->wr.send_flags & 
IB_SEND_SIGNALED); + send_oob.solicited = !!(wr->wr.send_flags & IB_SEND_SOLICITED); + send_oob.psn = qp->ud_qp.sq_psn; + send_oob.ssn_or_rqpn = wr->remote_qpn; + send_oob.ud_send.remote_qkey = + qp->ibqp.qp_type == IB_QPT_GSI ? IB_QP1_QKEY : wr->remote_qkey; + + err = mana_gd_post_work_request(queue, &wqe_req, &wqe_info); + if (err) + return err; + + qp->ud_qp.sq_psn++; + shadow_wqe = shadow_queue_producer_entry(&qp->shadow_sq); + memset(shadow_wqe, 0, sizeof(*shadow_wqe)); + shadow_wqe->header.opcode = IB_WC_SEND; + shadow_wqe->header.wr_id = wr->wr.wr_id; + shadow_wqe->header.posted_wqe_size = wqe_info.wqe_size_in_bu; + shadow_queue_advance_producer(&qp->shadow_sq); + + mana_gd_wq_ring_doorbell(mdev_to_gc(mdev), queue); + return 0; +} + +int mana_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + int err; + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + + for (; wr; wr = wr->next) { + switch (ibqp->qp_type) { + case IB_QPT_UD: + case IB_QPT_GSI: + err = mana_ib_post_send_ud(qp, ud_wr(wr)); + if (unlikely(err)) { + *bad_wr = wr; + return err; + } + break; + default: + ibdev_dbg(ibqp->device, "Posting send wr on qp type %u is not supported\n", + ibqp->qp_type); + return -EINVAL; + } + } + + return err; +} diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index b38961f5058e..11878ddf7cc7 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -9,6 +9,7 @@ mlx5_ib-y := ah.o \ data_direct.o \ dm.o \ doorbell.o \ + fs.o \ gsi.o \ ib_virt.o \ mad.o \ @@ -26,7 +27,6 @@ mlx5_ib-y := ah.o \ mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o \ - fs.o \ qos.o \ std_types.o mlx5_ib-$(CONFIG_MLX5_MACSEC) += macsec.o diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index 99036afb3aef..531a57f9ee7e 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -50,11 +50,12 @@ static __be16 mlx5_ah_get_udp_sport(const struct mlx5_ib_dev *dev, return sport; } -static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, +static int create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, struct rdma_ah_init_attr *init_attr) { struct rdma_ah_attr *ah_attr = init_attr->ah_attr; enum ib_gid_type gid_type; + int rate_val; if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); @@ -67,8 +68,10 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, ah->av.tclass = grh->traffic_class; } - ah->av.stat_rate_sl = - (mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr)) << 4); + rate_val = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr)); + if (rate_val < 0) + return rate_val; + ah->av.stat_rate_sl = rate_val << 4; if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { if (init_attr->xmit_slave) @@ -89,6 +92,8 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, ah->av.fl_mlid = rdma_ah_get_path_bits(ah_attr) & 0x7f; ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0xf); } + + return 0; } int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, @@ -121,8 +126,7 @@ int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, return err; } - create_ib_ah(dev, ah, init_attr); - return 0; + return create_ib_ah(dev, ah, init_attr); } int mlx5_ib_query_ah(struct ib_ah *ibah, 
struct rdma_ah_attr *ah_attr) diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 81cfa74147a1..b847084dcd99 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -140,6 +140,13 @@ static const struct mlx5_ib_counter rdmatx_cnp_op_cnts[] = { INIT_OP_COUNTER(cc_tx_cnp_pkts, CC_TX_CNP_PKTS), }; +static const struct mlx5_ib_counter packets_op_cnts[] = { + INIT_OP_COUNTER(rdma_tx_packets, RDMA_TX_PACKETS), + INIT_OP_COUNTER(rdma_tx_bytes, RDMA_TX_BYTES), + INIT_OP_COUNTER(rdma_rx_packets, RDMA_RX_PACKETS), + INIT_OP_COUNTER(rdma_rx_bytes, RDMA_RX_BYTES), +}; + static int mlx5_ib_read_counters(struct ib_counters *counters, struct ib_counters_read_attr *read_attr, struct uverbs_attr_bundle *attrs) @@ -427,6 +434,52 @@ done: return num_counters; } +static bool is_rdma_bytes_counter(u32 type) +{ + if (type == MLX5_IB_OPCOUNTER_RDMA_TX_BYTES || + type == MLX5_IB_OPCOUNTER_RDMA_RX_BYTES || + type == MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP || + type == MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP) + return true; + + return false; +} + +static int do_per_qp_get_op_stat(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + const struct mlx5_ib_counters *cnts = get_counters(dev, counter->port); + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + int i, ret, index, num_hw_counters; + u64 packets = 0, bytes = 0; + + for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) { + if (!mcounter->fc[i]) + continue; + + ret = mlx5_fc_query(dev->mdev, mcounter->fc[i], + &packets, &bytes); + if (ret) + return ret; + + num_hw_counters = cnts->num_q_counters + + cnts->num_cong_counters + + cnts->num_ext_ppcnt_counters; + + index = i - MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP + + num_hw_counters; + + if (is_rdma_bytes_counter(i)) + counter->stats->value[index] = bytes; + else + counter->stats->value[index] = packets; + + clear_bit(index, counter->stats->is_disabled); + } + return 0; +} + static int do_get_op_stat(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port_num, int index) @@ -434,7 +487,7 @@ static int do_get_op_stat(struct ib_device *ibdev, struct mlx5_ib_dev *dev = to_mdev(ibdev); const struct mlx5_ib_counters *cnts; const struct mlx5_ib_op_fc *opfcs; - u64 packets = 0, bytes; + u64 packets, bytes; u32 type; int ret; @@ -453,8 +506,11 @@ static int do_get_op_stat(struct ib_device *ibdev, if (ret) return ret; + if (is_rdma_bytes_counter(type)) + stats->value[index] = bytes; + else + stats->value[index] = packets; out: - stats->value[index] = packets; return index; } @@ -523,19 +579,30 @@ static int mlx5_ib_counter_update_stats(struct rdma_counter *counter) { struct mlx5_ib_dev *dev = to_mdev(counter->device); const struct mlx5_ib_counters *cnts = get_counters(dev, counter->port); + int ret; + + ret = mlx5_ib_query_q_counters(dev->mdev, cnts, counter->stats, + counter->id); + if (ret) + return ret; + + if (!counter->mode.bind_opcnt) + return 0; - return mlx5_ib_query_q_counters(dev->mdev, cnts, - counter->stats, counter->id); + return do_per_qp_get_op_stat(counter); } static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) { + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); struct mlx5_ib_dev *dev = to_mdev(counter->device); u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; if (!counter->id) return 0; + WARN_ON(!xa_empty(&mcounter->qpn_opfc_xa)); + mlx5r_fs_destroy_fcs(dev, counter); 
MLX5_SET(dealloc_q_counter_in, in, opcode, MLX5_CMD_OP_DEALLOC_Q_COUNTER); MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id); @@ -543,7 +610,7 @@ static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) } static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, - struct ib_qp *qp) + struct ib_qp *qp, u32 port) { struct mlx5_ib_dev *dev = to_mdev(qp->device); bool new = false; @@ -568,8 +635,14 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, if (err) goto fail_set_counter; + err = mlx5r_fs_bind_op_fc(qp, counter, port); + if (err) + goto fail_bind_op_fc; + return 0; +fail_bind_op_fc: + mlx5_ib_qp_set_counter(qp, NULL); fail_set_counter: if (new) { mlx5_ib_counter_dealloc(counter); @@ -579,9 +652,22 @@ fail_set_counter: return err; } -static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp) +static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp, u32 port) { - return mlx5_ib_qp_set_counter(qp, NULL); + struct rdma_counter *counter = qp->counter; + int err; + + mlx5r_fs_unbind_op_fc(qp, counter); + + err = mlx5_ib_qp_set_counter(qp, NULL); + if (err) + goto fail_set_counter; + + return 0; + +fail_set_counter: + mlx5r_fs_bind_op_fc(qp, counter, port); + return err; } static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, @@ -681,6 +767,12 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, descs[j].priv = &rdmatx_cnp_op_cnts[i].type; } } + + for (i = 0; i < ARRAY_SIZE(packets_op_cnts); i++, j++) { + descs[j].name = packets_op_cnts[i].name; + descs[j].flags |= IB_STAT_FLAG_OPTIONAL; + descs[j].priv = &packets_op_cnts[i].type; + } } @@ -731,6 +823,8 @@ static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, num_op_counters = ARRAY_SIZE(basic_op_cnts); + num_op_counters += ARRAY_SIZE(packets_op_cnts); + if (MLX5_CAP_FLOWTABLE(dev->mdev, ft_field_support_2_nic_receive_rdma.bth_opcode)) num_op_counters += ARRAY_SIZE(rdmarx_cnp_op_cnts); @@ -760,10 +854,58 @@ err: return -ENOMEM; } +/* + * Checks if the given flow counter type should be sharing the same flow counter + * with another type and if it should, checks if that other type flow counter + * was already created, if both conditions are met return true and the counter + * else return false. 
+ */ +bool mlx5r_is_opfc_shared_and_in_use(struct mlx5_ib_op_fc *opfcs, u32 type, + struct mlx5_ib_op_fc **opfc) +{ + u32 shared_fc_type; + + switch (type) { + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; + default: + return false; + } + + *opfc = &opfcs[shared_fc_type]; + if (!(*opfc)->fc) + return false; + + return true; +} + static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; int num_cnt_ports = dev->num_ports; + struct mlx5_ib_op_fc *in_use_opfc; int i, j; if (is_mdev_switchdev_mode(dev->mdev)) @@ -785,11 +927,15 @@ static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) if (!dev->port[i].cnts.opfcs[j].fc) continue; - if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) - mlx5_ib_fs_remove_op_fc(dev, - &dev->port[i].cnts.opfcs[j], j); + if (mlx5r_is_opfc_shared_and_in_use( + dev->port[i].cnts.opfcs, j, &in_use_opfc)) + goto skip; + + mlx5_ib_fs_remove_op_fc(dev, + &dev->port[i].cnts.opfcs[j], j); mlx5_fc_destroy(dev->mdev, dev->port[i].cnts.opfcs[j].fc); +skip: dev->port[i].cnts.opfcs[j].fc = NULL; } } @@ -983,8 +1129,8 @@ static int mlx5_ib_modify_stat(struct ib_device *device, u32 port, unsigned int index, bool enable) { struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_ib_op_fc *opfc, *in_use_opfc; struct mlx5_ib_counters *cnts; - struct mlx5_ib_op_fc *opfc; u32 num_hw_counters, type; int ret; @@ -1008,6 +1154,13 @@ static int mlx5_ib_modify_stat(struct ib_device *device, u32 port, if (opfc->fc) return -EEXIST; + if (mlx5r_is_opfc_shared_and_in_use(cnts->opfcs, type, + &in_use_opfc)) { + opfc->fc = in_use_opfc->fc; + opfc->rule[0] = in_use_opfc->rule[0]; + return 0; + } + opfc->fc = mlx5_fc_create(dev->mdev, false); if (IS_ERR(opfc->fc)) return PTR_ERR(opfc->fc); @@ -1023,12 +1176,23 @@ static int mlx5_ib_modify_stat(struct ib_device *device, u32 port, if (!opfc->fc) return -EINVAL; + if (mlx5r_is_opfc_shared_and_in_use(cnts->opfcs, type, &in_use_opfc)) + goto out; + mlx5_ib_fs_remove_op_fc(dev, opfc, type); mlx5_fc_destroy(dev->mdev, opfc->fc); +out: opfc->fc = NULL; return 0; } +static void mlx5_ib_counter_init(struct rdma_counter *counter) +{ + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + + xa_init(&mcounter->qpn_opfc_xa); +} + static const struct ib_device_ops hw_stats_ops = { .alloc_hw_port_stats = mlx5_ib_alloc_hw_port_stats, .get_hw_stats = mlx5_ib_get_hw_stats, @@ -1037,8 +1201,10 @@ static const struct ib_device_ops hw_stats_ops = { .counter_dealloc = mlx5_ib_counter_dealloc, .counter_alloc_stats = mlx5_ib_counter_alloc_stats, .counter_update_stats = mlx5_ib_counter_update_stats, - .modify_hw_stat = IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) ? 
- mlx5_ib_modify_stat : NULL, + .modify_hw_stat = mlx5_ib_modify_stat, + .counter_init = mlx5_ib_counter_init, + + INIT_RDMA_OBJ_SIZE(rdma_counter, mlx5_rdma_counter, rdma_counter), }; static const struct ib_device_ops hw_switchdev_vport_op = { @@ -1053,6 +1219,9 @@ static const struct ib_device_ops hw_switchdev_stats_ops = { .counter_dealloc = mlx5_ib_counter_dealloc, .counter_alloc_stats = mlx5_ib_counter_alloc_stats, .counter_update_stats = mlx5_ib_counter_update_stats, + .counter_init = mlx5_ib_counter_init, + + INIT_RDMA_OBJ_SIZE(rdma_counter, mlx5_rdma_counter, rdma_counter), }; static const struct ib_device_ops counters_ops = { diff --git a/drivers/infiniband/hw/mlx5/counters.h b/drivers/infiniband/hw/mlx5/counters.h index 6bcaaa52e2b2..bd03cee42014 100644 --- a/drivers/infiniband/hw/mlx5/counters.h +++ b/drivers/infiniband/hw/mlx5/counters.h @@ -8,10 +8,25 @@ #include "mlx5_ib.h" +struct mlx5_rdma_counter { + struct rdma_counter rdma_counter; + + struct mlx5_fc *fc[MLX5_IB_OPCOUNTER_MAX]; + struct xarray qpn_opfc_xa; +}; + +static inline struct mlx5_rdma_counter * +to_mcounter(struct rdma_counter *counter) +{ + return container_of(counter, struct mlx5_rdma_counter, rdma_counter); +} + int mlx5_ib_counters_init(struct mlx5_ib_dev *dev); void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev); void mlx5_ib_counters_clear_description(struct ib_counters *counters); int mlx5_ib_flow_counters_set_data(struct ib_counters *ibcounters, struct mlx5_ib_create_flow *ucmd); u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u32 port_num); +bool mlx5r_is_opfc_shared_and_in_use(struct mlx5_ib_op_fc *opfcs, u32 type, + struct mlx5_ib_op_fc **opfc); #endif /* _MLX5_IB_COUNTERS_H */ diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 4c54dc578069..1aa5311b03e9 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -490,7 +490,7 @@ repoll: } qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; - if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { + if (!*cur_qp || (qpn != (*cur_qp)->trans_qp.base.mqp.qpn)) { /* We do not have to take the QP table lock here, * because CQs will be locked while QPs are removed * from the table. 
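
Editor's note: the mlx5/cq.c hunk above changes the per-CQE QP lookup during polling to compare against the cached hardware QPN (trans_qp.base.mqp.qpn) instead of the IB core QP number. The standalone sketch below is illustrative only, not driver code; the function name and the sample value are invented. It shows the one detail that comparison relies on: the QP number occupies the low 24 bits of the big-endian sop_drop_qpn field, so it must be byte-swapped and masked before it can be matched.

	#include <stdint.h>
	#include <stdio.h>
	#include <arpa/inet.h>	/* ntohl(), htonl() */

	/* Low 24 bits of sop_drop_qpn carry the QP number; the top byte holds
	 * opcode/drop information and is masked off after the byte swap. */
	static uint32_t cqe_to_qpn(uint32_t sop_drop_qpn_be)
	{
		return ntohl(sop_drop_qpn_be) & 0xffffff;
	}

	int main(void)
	{
		/* hypothetical CQE field: opcode byte 0x12, QPN 0xabc */
		uint32_t sample = htonl(0x12000abcU);

		printf("qpn = 0x%x\n", cqe_to_qpn(sample));	/* prints 0xabc */
		return 0;
	}

The masked value is what gets compared against the hardware QPN of the QP found on the previous poll iteration, which is why the hunk switches the right-hand side of that comparison to the mqp.qpn field rather than ibqp.qp_num.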
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 4186884c66e1..2479da8620ca 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -13,6 +13,7 @@ #include <rdma/uverbs_std_types.h> #include <linux/mlx5/driver.h> #include <linux/mlx5/fs.h> +#include <rdma/ib_ucaps.h> #include "mlx5_ib.h" #include "devx.h" #include "qp.h" @@ -122,7 +123,27 @@ devx_ufile2uctx(const struct uverbs_attr_bundle *attrs) return to_mucontext(ib_uverbs_get_ucontext(attrs)); } -int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) +static int set_uctx_ucaps(struct mlx5_ib_dev *dev, u64 req_ucaps, u32 *cap) +{ + if (UCAP_ENABLED(req_ucaps, RDMA_UCAP_MLX5_CTRL_LOCAL)) { + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) + *cap |= MLX5_UCTX_CAP_RDMA_CTRL; + else + return -EOPNOTSUPP; + } + + if (UCAP_ENABLED(req_ucaps, RDMA_UCAP_MLX5_CTRL_OTHER_VHCA)) { + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & + MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA) + *cap |= MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA; + else + return -EOPNOTSUPP; + } + + return 0; +} + +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user, u64 req_ucaps) { u32 in[MLX5_ST_SZ_DW(create_uctx_in)] = {}; u32 out[MLX5_ST_SZ_DW(create_uctx_out)] = {}; @@ -136,14 +157,22 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) return -EINVAL; uctx = MLX5_ADDR_OF(create_uctx_in, in, uctx); - if (is_user && capable(CAP_NET_RAW) && - (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX)) + if (is_user && + (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX) && + capable(CAP_NET_RAW)) cap |= MLX5_UCTX_CAP_RAW_TX; - if (is_user && capable(CAP_SYS_RAWIO) && + if (is_user && (MLX5_CAP_GEN(dev->mdev, uctx_cap) & - MLX5_UCTX_CAP_INTERNAL_DEV_RES)) + MLX5_UCTX_CAP_INTERNAL_DEV_RES) && + capable(CAP_SYS_RAWIO)) cap |= MLX5_UCTX_CAP_INTERNAL_DEV_RES; + if (req_ucaps) { + err = set_uctx_ucaps(dev, req_ucaps, &cap); + if (err) + return err; + } + MLX5_SET(create_uctx_in, in, opcode, MLX5_CMD_OP_CREATE_UCTX); MLX5_SET(uctx, uctx, cap, cap); @@ -2573,7 +2602,7 @@ int mlx5_ib_devx_init(struct mlx5_ib_dev *dev) struct mlx5_devx_event_table *table = &dev->devx_event_table; int uid; - uid = mlx5_ib_devx_create(dev, false); + uid = mlx5_ib_devx_create(dev, false, 0); if (uid > 0) { dev->devx_whitelist_uid = uid; xa_init(&table->event_xa); diff --git a/drivers/infiniband/hw/mlx5/devx.h b/drivers/infiniband/hw/mlx5/devx.h index 1344bf4c9d21..ee9e7d3af93f 100644 --- a/drivers/infiniband/hw/mlx5/devx.h +++ b/drivers/infiniband/hw/mlx5/devx.h @@ -24,13 +24,14 @@ struct devx_obj { struct list_head event_sub; /* holds devx_event_subscription entries */ }; #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) -int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user); +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user, u64 req_ucaps); void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid); int mlx5_ib_devx_init(struct mlx5_ib_dev *dev); void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev); void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile); #else -static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) +static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user, + u64 req_ucaps) { return -EOPNOTSUPP; } diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 162814ae8cb4..251246c73b33 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -12,6 +12,7 @@ #include 
<rdma/mlx5_user_ioctl_verbs.h> #include <rdma/ib_hdrs.h> #include <rdma/ib_umem.h> +#include <rdma/ib_ucaps.h> #include <linux/mlx5/driver.h> #include <linux/mlx5/fs.h> #include <linux/mlx5/fs_helpers.h> @@ -32,6 +33,11 @@ enum { MATCH_CRITERIA_ENABLE_MISC2_BIT }; + +struct mlx5_per_qp_opfc { + struct mlx5_ib_op_fc opfcs[MLX5_IB_OPCOUNTER_MAX]; +}; + #define HEADER_IS_ZERO(match_criteria, headers) \ !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ 0, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ @@ -678,7 +684,7 @@ enum flow_table_type { #define MLX5_FS_MAX_TYPES 6 #define MLX5_FS_MAX_ENTRIES BIT(16) -static bool mlx5_ib_shared_ft_allowed(struct ib_device *device) +static bool __maybe_unused mlx5_ib_shared_ft_allowed(struct ib_device *device) { struct mlx5_ib_dev *dev = to_mdev(device); @@ -690,7 +696,7 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *prio, int priority, int num_entries, int num_groups, - u32 flags) + u32 flags, u16 vport) { struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; @@ -698,6 +704,7 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, ft_attr.prio = priority; ft_attr.max_fte = num_entries; ft_attr.flags = flags; + ft_attr.vport = vport; ft_attr.autogroup.max_num_groups = num_groups; ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); if (IS_ERR(ft)) @@ -792,18 +799,25 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, ft = prio->flow_table; if (!ft) return _get_prio(dev, ns, prio, priority, max_table_size, - num_groups, flags); + num_groups, flags, 0); return prio; } enum { + RDMA_RX_ECN_OPCOUNTER_PER_QP_PRIO, + RDMA_RX_CNP_OPCOUNTER_PER_QP_PRIO, + RDMA_RX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO, RDMA_RX_ECN_OPCOUNTER_PRIO, RDMA_RX_CNP_OPCOUNTER_PRIO, + RDMA_RX_PKTS_BYTES_OPCOUNTER_PRIO, }; enum { + RDMA_TX_CNP_OPCOUNTER_PER_QP_PRIO, + RDMA_TX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO, RDMA_TX_CNP_OPCOUNTER_PRIO, + RDMA_TX_PKTS_BYTES_OPCOUNTER_PRIO, }; static int set_vhca_port_spec(struct mlx5_ib_dev *dev, u32 port_num, @@ -867,6 +881,344 @@ static int set_cnp_spec(struct mlx5_ib_dev *dev, u32 port_num, return 0; } +/* Returns the prio we should use for the given optional counter type, + * whereas for bytes type we use the packet type, since they share the same + * resources. 
+ */ +static struct mlx5_ib_flow_prio *get_opfc_prio(struct mlx5_ib_dev *dev, + u32 type) +{ + u32 prio_type; + + switch (type) { + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + prio_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + prio_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP: + prio_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP: + prio_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; + default: + prio_type = type; + } + + return &dev->flow_db->opfcs[prio_type]; +} + +static void put_per_qp_prio(struct mlx5_ib_dev *dev, + enum mlx5_ib_optional_counter_type type) +{ + enum mlx5_ib_optional_counter_type per_qp_type; + struct mlx5_ib_flow_prio *prio; + + switch (type) { + case MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS: + per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS: + per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS: + per_qp_type = MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS: + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS: + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; + break; + default: + return; + } + + prio = get_opfc_prio(dev, per_qp_type); + put_flow_table(dev, prio, true); +} + +static int get_per_qp_prio(struct mlx5_ib_dev *dev, + enum mlx5_ib_optional_counter_type type) +{ + enum mlx5_ib_optional_counter_type per_qp_type; + enum mlx5_flow_namespace_type fn_type; + struct mlx5_flow_namespace *ns; + struct mlx5_ib_flow_prio *prio; + int priority; + + switch (type) { + case MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_ECN_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_CNP_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS; + priority = RDMA_TX_CNP_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS; + priority = RDMA_TX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS; + priority = RDMA_TX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO; + per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; + break; + default: + return -EINVAL; + } + + ns = mlx5_get_flow_namespace(dev->mdev, fn_type); + if (!ns) + 
return -EOPNOTSUPP; + + prio = get_opfc_prio(dev, per_qp_type); + if (prio->flow_table) + return 0; + + prio = _get_prio(dev, ns, prio, priority, MLX5_FS_MAX_POOL_SIZE, 1, 0, 0); + if (IS_ERR(prio)) + return PTR_ERR(prio); + + prio->refcount = 1; + + return 0; +} + +static struct mlx5_per_qp_opfc * +get_per_qp_opfc(struct mlx5_rdma_counter *mcounter, u32 qp_num, bool *new) +{ + struct mlx5_per_qp_opfc *per_qp_opfc; + + *new = false; + + per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp_num); + if (per_qp_opfc) + return per_qp_opfc; + per_qp_opfc = kzalloc(sizeof(*per_qp_opfc), GFP_KERNEL); + + if (!per_qp_opfc) + return NULL; + + *new = true; + return per_qp_opfc; +} + +static int add_op_fc_rules(struct mlx5_ib_dev *dev, + struct mlx5_rdma_counter *mcounter, + struct mlx5_per_qp_opfc *per_qp_opfc, + struct mlx5_ib_flow_prio *prio, + enum mlx5_ib_optional_counter_type type, + u32 qp_num, u32 port_num) +{ + struct mlx5_ib_op_fc *opfc = &per_qp_opfc->opfcs[type], *in_use_opfc; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_destination dst; + struct mlx5_flow_spec *spec; + int i, err, spec_num; + bool is_tx; + + if (opfc->fc) + return -EEXIST; + + if (mlx5r_is_opfc_shared_and_in_use(per_qp_opfc->opfcs, type, + &in_use_opfc)) { + opfc->fc = in_use_opfc->fc; + opfc->rule[0] = in_use_opfc->rule[0]; + return 0; + } + + opfc->fc = mcounter->fc[type]; + + spec = kcalloc(MAX_OPFC_RULES, sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto null_fc; + } + + switch (type) { + case MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP: + if (set_ecn_ce_spec(dev, port_num, &spec[0], + MLX5_FS_IPV4_VERSION) || + set_ecn_ce_spec(dev, port_num, &spec[1], + MLX5_FS_IPV6_VERSION)) { + err = -EOPNOTSUPP; + goto free_spec; + } + spec_num = 2; + is_tx = false; + + MLX5_SET_TO_ONES(fte_match_param, spec[1].match_criteria, + misc_parameters.bth_dst_qp); + MLX5_SET(fte_match_param, spec[1].match_value, + misc_parameters.bth_dst_qp, qp_num); + spec[1].match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS; + break; + case MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP: + if (!MLX5_CAP_FLOWTABLE( + dev->mdev, + ft_field_support_2_nic_receive_rdma.bth_opcode) || + set_cnp_spec(dev, port_num, &spec[0])) { + err = -EOPNOTSUPP; + goto free_spec; + } + spec_num = 1; + is_tx = false; + break; + case MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP: + if (!MLX5_CAP_FLOWTABLE( + dev->mdev, + ft_field_support_2_nic_transmit_rdma.bth_opcode) || + set_cnp_spec(dev, port_num, &spec[0])) { + err = -EOPNOTSUPP; + goto free_spec; + } + spec_num = 1; + is_tx = true; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP: + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP: + spec_num = 1; + is_tx = true; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP: + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP: + spec_num = 1; + is_tx = false; + break; + default: + err = -EINVAL; + goto free_spec; + } + + if (is_tx) { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + misc_parameters.source_sqn); + MLX5_SET(fte_match_param, spec->match_value, + misc_parameters.source_sqn, qp_num); + } else { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + misc_parameters.bth_dst_qp); + MLX5_SET(fte_match_param, spec->match_value, + misc_parameters.bth_dst_qp, qp_num); + } + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS; + + dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dst.counter = opfc->fc; + + flow_act.action = + MLX5_FLOW_CONTEXT_ACTION_COUNT | MLX5_FLOW_CONTEXT_ACTION_ALLOW; + + for (i = 0; i < spec_num; i++) { + opfc->rule[i] 
= mlx5_add_flow_rules(prio->flow_table, &spec[i], + &flow_act, &dst, 1); + if (IS_ERR(opfc->rule[i])) { + err = PTR_ERR(opfc->rule[i]); + goto del_rules; + } + } + prio->refcount += spec_num; + + err = xa_err(xa_store(&mcounter->qpn_opfc_xa, qp_num, per_qp_opfc, + GFP_KERNEL)); + if (err) + goto del_rules; + + kfree(spec); + + return 0; + +del_rules: + while (i--) + mlx5_del_flow_rules(opfc->rule[i]); + put_flow_table(dev, prio, false); +free_spec: + kfree(spec); +null_fc: + opfc->fc = NULL; + return err; +} + +static bool is_fc_shared_and_in_use(struct mlx5_rdma_counter *mcounter, + u32 type, struct mlx5_fc **fc) +{ + u32 shared_fc_type; + + switch (type) { + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; + break; + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP: + shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP; + break; + default: + return false; + } + + *fc = mcounter->fc[shared_fc_type]; + if (!(*fc)) + return false; + + return true; +} + +void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev, + struct rdma_counter *counter) +{ + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + struct mlx5_fc *in_use_fc; + int i; + + for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) { + if (!mcounter->fc[i]) + continue; + + if (is_fc_shared_and_in_use(mcounter, i, &in_use_fc)) { + mcounter->fc[i] = NULL; + continue; + } + + mlx5_fc_destroy(dev->mdev, mcounter->fc[i]); + mcounter->fc[i] = NULL; + } +} + int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type) @@ -921,6 +1273,20 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, priority = RDMA_TX_CNP_OPCOUNTER_PRIO; break; + case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS: + case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES: + spec_num = 1; + fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS; + priority = RDMA_TX_PKTS_BYTES_OPCOUNTER_PRIO; + break; + + case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS: + case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES: + spec_num = 1; + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_PKTS_BYTES_OPCOUNTER_PRIO; + break; + default: err = -EOPNOTSUPP; goto free; @@ -932,13 +1298,17 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, goto free; } - prio = &dev->flow_db->opfcs[type]; + prio = get_opfc_prio(dev, type); if (!prio->flow_table) { + err = get_per_qp_prio(dev, type); + if (err) + goto free; + prio = _get_prio(dev, ns, prio, priority, - dev->num_ports * MAX_OPFC_RULES, 1, 0); + dev->num_ports * MAX_OPFC_RULES, 1, 0, 0); if (IS_ERR(prio)) { err = PTR_ERR(prio); - goto free; + goto put_prio; } } @@ -965,6 +1335,8 @@ del_rules: for (i -= 1; i >= 0; i--) mlx5_del_flow_rules(opfc->rule[i]); put_flow_table(dev, prio, false); +put_prio: + put_per_qp_prio(dev, type); free: kfree(spec); return err; @@ -974,12 +1346,115 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type) { + struct mlx5_ib_flow_prio *prio; int i; + prio = get_opfc_prio(dev, type); + for (i = 0; i < MAX_OPFC_RULES && opfc->rule[i]; i++) { mlx5_del_flow_rules(opfc->rule[i]); - put_flow_table(dev, &dev->flow_db->opfcs[type], true); + put_flow_table(dev, 
prio, true); } + + put_per_qp_prio(dev, type); +} + +void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter) +{ + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + struct mlx5_ib_dev *dev = to_mdev(counter->device); + struct mlx5_per_qp_opfc *per_qp_opfc; + struct mlx5_ib_op_fc *in_use_opfc; + struct mlx5_ib_flow_prio *prio; + int i, j; + + per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp->qp_num); + if (!per_qp_opfc) + return; + + for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) { + if (!per_qp_opfc->opfcs[i].fc) + continue; + + if (mlx5r_is_opfc_shared_and_in_use(per_qp_opfc->opfcs, i, + &in_use_opfc)) { + per_qp_opfc->opfcs[i].fc = NULL; + continue; + } + + for (j = 0; j < MAX_OPFC_RULES; j++) { + if (!per_qp_opfc->opfcs[i].rule[j]) + continue; + mlx5_del_flow_rules(per_qp_opfc->opfcs[i].rule[j]); + prio = get_opfc_prio(dev, i); + put_flow_table(dev, prio, true); + } + per_qp_opfc->opfcs[i].fc = NULL; + } + + kfree(per_qp_opfc); + xa_erase(&mcounter->qpn_opfc_xa, qp->qp_num); +} + +int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, + u32 port) +{ + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_per_qp_opfc *per_qp_opfc; + struct mlx5_ib_flow_prio *prio; + struct mlx5_ib_counters *cnts; + struct mlx5_ib_op_fc *opfc; + struct mlx5_fc *in_use_fc; + int i, err, per_qp_type; + bool new; + + if (!counter->mode.bind_opcnt) + return 0; + + cnts = &dev->port[port - 1].cnts; + + for (i = 0; i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES; i++) { + opfc = &cnts->opfcs[i]; + if (!opfc->fc) + continue; + + per_qp_type = i + MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; + prio = get_opfc_prio(dev, per_qp_type); + WARN_ON(!prio->flow_table); + + if (is_fc_shared_and_in_use(mcounter, per_qp_type, &in_use_fc)) + mcounter->fc[per_qp_type] = in_use_fc; + + if (!mcounter->fc[per_qp_type]) { + mcounter->fc[per_qp_type] = mlx5_fc_create(dev->mdev, + false); + if (IS_ERR(mcounter->fc[per_qp_type])) + return PTR_ERR(mcounter->fc[per_qp_type]); + } + + per_qp_opfc = get_per_qp_opfc(mcounter, qp->qp_num, &new); + if (!per_qp_opfc) { + err = -ENOMEM; + goto free_fc; + } + err = add_op_fc_rules(dev, mcounter, per_qp_opfc, prio, + per_qp_type, qp->qp_num, port); + if (err) + goto del_rules; + } + + return 0; + +del_rules: + mlx5r_fs_unbind_op_fc(qp, counter); + if (new) + kfree(per_qp_opfc); +free_fc: + if (xa_empty(&mcounter->qpn_opfc_xa)) + mlx5r_fs_destroy_fcs(dev, counter); + return err; } static void set_underlay_qp(struct mlx5_ib_dev *dev, @@ -1413,17 +1888,51 @@ free_ucmd: return ERR_PTR(err); } +static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev, + enum mlx5_flow_namespace_type type, + u32 *flags, u16 *vport_idx, + u16 *vport, + struct mlx5_core_dev **ft_mdev, + u32 ib_port) +{ + struct mlx5_core_dev *esw_mdev; + + if (!is_mdev_switchdev_mode(dev->mdev)) + return 0; + + if (!MLX5_CAP_ADV_RDMA(dev->mdev, rdma_transport_manager)) + return -EOPNOTSUPP; + + if (!dev->port[ib_port - 1].rep) + return -EINVAL; + + esw_mdev = mlx5_eswitch_get_core_dev(dev->port[ib_port - 1].rep->esw); + if (esw_mdev != dev->mdev) + return -EOPNOTSUPP; + + *flags |= MLX5_FLOW_TABLE_OTHER_VPORT; + *ft_mdev = esw_mdev; + *vport = dev->port[ib_port - 1].rep->vport; + *vport_idx = dev->port[ib_port - 1].rep->vport_index; + + return 0; +} + static struct mlx5_ib_flow_prio * _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, enum mlx5_flow_namespace_type 
ns_type, - bool mcast) + bool mcast, u32 ib_port) { + struct mlx5_core_dev *ft_mdev = dev->mdev; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio = NULL; int max_table_size = 0; + u16 vport_idx = 0; bool esw_encap; u32 flags = 0; + u16 vport = 0; int priority; + int ret; if (mcast) priority = MLX5_IB_FLOW_MCAST_PRIO; @@ -1471,13 +1980,38 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, log_max_ft_size)); priority = user_priority; break; + case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX: + case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX: + if (ib_port == 0 || user_priority > MLX5_RDMA_TRANSPORT_BYPASS_PRIO) + return ERR_PTR(-EINVAL); + ret = mlx5_ib_fill_transport_ns_info(dev, ns_type, &flags, + &vport_idx, &vport, + &ft_mdev, ib_port); + if (ret) + return ERR_PTR(ret); + + if (ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX) + max_table_size = + BIT(MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_RX( + ft_mdev, log_max_ft_size)); + else + max_table_size = + BIT(MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_TX( + ft_mdev, log_max_ft_size)); + priority = user_priority; + break; default: break; } max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES); - ns = mlx5_get_flow_namespace(dev->mdev, ns_type); + if (ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX || + ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX) + ns = mlx5_get_flow_vport_namespace(ft_mdev, ns_type, vport_idx); + else + ns = mlx5_get_flow_namespace(ft_mdev, ns_type); + if (!ns) return ERR_PTR(-EOPNOTSUPP); @@ -1497,6 +2031,12 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, case MLX5_FLOW_NAMESPACE_RDMA_TX: prio = &dev->flow_db->rdma_tx[priority]; break; + case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX: + prio = &dev->flow_db->rdma_transport_rx[ib_port - 1]; + break; + case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX: + prio = &dev->flow_db->rdma_transport_tx[ib_port - 1]; + break; default: return ERR_PTR(-EINVAL); } @@ -1507,7 +2047,7 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, return prio; return _get_prio(dev, ns, prio, priority, max_table_size, - MLX5_FS_MAX_TYPES, flags); + MLX5_FS_MAX_TYPES, flags, vport); } static struct mlx5_ib_flow_handler * @@ -1626,7 +2166,8 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add( mutex_lock(&dev->flow_db->lock); ft_prio = _get_flow_table(dev, fs_matcher->priority, - fs_matcher->ns_type, mcast); + fs_matcher->ns_type, mcast, + fs_matcher->ib_port); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto unlock; @@ -1742,6 +2283,12 @@ mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type, case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX: *namespace = MLX5_FLOW_NAMESPACE_RDMA_TX; break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TRANSPORT_RX: + *namespace = MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX; + break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TRANSPORT_TX: + *namespace = MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX; + break; default: return -EINVAL; } @@ -1831,7 +2378,8 @@ static int get_dests(struct uverbs_attr_bundle *attrs, return -EINVAL; /* Allow only DEVX object or QP as dest when inserting to RDMA_RX */ - if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) && + if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX) && ((!dest_devx && !dest_qp) || (dest_devx && dest_qp))) return -EINVAL; @@ -1848,7 +2396,8 @@ static int get_dests(struct uverbs_attr_bundle *attrs, return -EINVAL; /* Allow only flow table as dest when inserting to FDB or 
RDMA_RX */ if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB_BYPASS || - fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) && + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX) && *dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) return -EINVAL; } else if (dest_qp) { @@ -1869,14 +2418,16 @@ static int get_dests(struct uverbs_attr_bundle *attrs, *dest_id = mqp->raw_packet_qp.rq.tirn; *dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; } else if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS || - fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) && + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX) && !(*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)) { *dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT; } if (*dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR && (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS || - fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX)) + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX)) return -EINVAL; return 0; @@ -2353,6 +2904,15 @@ static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs, return 0; } +static bool verify_context_caps(struct mlx5_ib_dev *dev, u64 enabled_caps) +{ + if (is_mdev_switchdev_mode(dev->mdev)) + return UCAP_ENABLED(enabled_caps, + RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); + + return UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_LOCAL); +} + static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( struct uverbs_attr_bundle *attrs) { @@ -2401,6 +2961,26 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( goto end; } + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_IB_PORT)) { + err = uverbs_copy_from(&obj->ib_port, attrs, + MLX5_IB_ATTR_FLOW_MATCHER_IB_PORT); + if (err) + goto end; + if (!rdma_is_port_valid(&dev->ib_dev, obj->ib_port)) { + err = -EINVAL; + goto end; + } + if (obj->ns_type != MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX && + obj->ns_type != MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX) { + err = -EINVAL; + goto end; + } + if (!verify_context_caps(dev, uobj->context->enabled_caps)) { + err = -EOPNOTSUPP; + goto end; + } + } + uobj->object = obj; obj->mdev = dev->mdev; atomic_set(&obj->usecnt, 0); @@ -2448,7 +3028,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( mutex_lock(&dev->flow_db->lock); - ft_prio = _get_flow_table(dev, priority, ns_type, 0); + ft_prio = _get_flow_table(dev, priority, ns_type, 0, 0); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto free_obj; @@ -2834,7 +3414,10 @@ DECLARE_UVERBS_NAMED_METHOD( UA_OPTIONAL), UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE, enum mlx5_ib_uapi_flow_table_type, - UA_OPTIONAL)); + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_IB_PORT, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, @@ -2878,6 +3461,7 @@ DECLARE_UVERBS_NAMED_OBJECT( &UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY)); const struct uapi_definition mlx5_ib_flow_defs[] = { +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) UAPI_DEF_CHAIN_OBJ_TREE_NAMED( MLX5_IB_OBJECT_FLOW_MATCHER), UAPI_DEF_CHAIN_OBJ_TREE( @@ -2888,6 +3472,7 @@ const struct uapi_definition mlx5_ib_flow_defs[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED( MLX5_IB_OBJECT_STEERING_ANCHOR, UAPI_DEF_IS_OBJ_SUPPORTED(mlx5_ib_shared_ft_allowed)), +#endif {}, }; @@ -2904,8 +3489,26 @@ int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) if 
(!dev->flow_db) return -ENOMEM; + dev->flow_db->rdma_transport_rx = kcalloc(dev->num_ports, + sizeof(struct mlx5_ib_flow_prio), + GFP_KERNEL); + if (!dev->flow_db->rdma_transport_rx) + goto free_flow_db; + + dev->flow_db->rdma_transport_tx = kcalloc(dev->num_ports, + sizeof(struct mlx5_ib_flow_prio), + GFP_KERNEL); + if (!dev->flow_db->rdma_transport_tx) + goto free_rdma_transport_rx; + mutex_init(&dev->flow_db->lock); ib_set_device_ops(&dev->ib_dev, &flow_ops); return 0; + +free_rdma_transport_rx: + kfree(dev->flow_db->rdma_transport_rx); +free_flow_db: + kfree(dev->flow_db); + return -ENOMEM; } diff --git a/drivers/infiniband/hw/mlx5/fs.h b/drivers/infiniband/hw/mlx5/fs.h index b9734904f5f0..2ebe86e5be10 100644 --- a/drivers/infiniband/hw/mlx5/fs.h +++ b/drivers/infiniband/hw/mlx5/fs.h @@ -8,23 +8,8 @@ #include "mlx5_ib.h" -#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int mlx5_ib_fs_init(struct mlx5_ib_dev *dev); void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev); -#else -static inline int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) -{ - dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL); - - if (!dev->flow_db) - return -ENOMEM; - - mutex_init(&dev->flow_db->lock); - return 0; -} - -inline void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev) {} -#endif static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) { @@ -40,6 +25,8 @@ static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) * is a safe assumption that all references are gone. */ mlx5_ib_fs_cleanup_anchor(dev); + kfree(dev->flow_db->rdma_transport_tx); + kfree(dev->flow_db->rdma_transport_rx); kfree(dev->flow_db); } #endif /* _MLX5_IB_FS_H */ diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 81849eb671a1..d07cacaa0abd 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -47,6 +47,7 @@ #include <rdma/uverbs_ioctl.h> #include <rdma/mlx5_user_ioctl_verbs.h> #include <rdma/mlx5_user_ioctl_cmds.h> +#include <rdma/ib_ucaps.h> #include "macsec.h" #include "data_direct.h" @@ -1934,6 +1935,12 @@ static int set_ucontext_resp(struct ib_ucontext *uctx, return 0; } +static bool uctx_rdma_ctrl_is_enabled(u64 enabled_caps) +{ + return UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_LOCAL) || + UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); +} + static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { @@ -1976,10 +1983,17 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, return -EINVAL; if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { - err = mlx5_ib_devx_create(dev, true); + err = mlx5_ib_devx_create(dev, true, uctx->enabled_caps); if (err < 0) goto out_ctx; context->devx_uid = err; + + if (uctx_rdma_ctrl_is_enabled(uctx->enabled_caps)) { + err = mlx5_cmd_add_privileged_uid(dev->mdev, + context->devx_uid); + if (err) + goto out_devx; + } } lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR; @@ -1994,7 +2008,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, /* updates req->total_num_bfregs */ err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi); if (err) - goto out_devx; + goto out_ucap; mutex_init(&bfregi->lock); bfregi->lib_uar_4k = lib_uar_4k; @@ -2002,7 +2016,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, GFP_KERNEL); if (!bfregi->count) { err = -ENOMEM; - goto out_devx; + goto out_ucap; } bfregi->sys_pages = kcalloc(bfregi->num_sys_pages, @@ -2066,6 +2080,11 @@ out_sys_pages: out_count: kfree(bfregi->count); +out_ucap: + if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX 
&& + uctx_rdma_ctrl_is_enabled(uctx->enabled_caps)) + mlx5_cmd_remove_privileged_uid(dev->mdev, context->devx_uid); + out_devx: if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) mlx5_ib_devx_destroy(dev, context->devx_uid); @@ -2110,8 +2129,12 @@ static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) kfree(bfregi->sys_pages); kfree(bfregi->count); - if (context->devx_uid) + if (context->devx_uid) { + if (uctx_rdma_ctrl_is_enabled(ibcontext->enabled_caps)) + mlx5_cmd_remove_privileged_uid(dev->mdev, + context->devx_uid); mlx5_ib_devx_destroy(dev, context->devx_uid); + } } static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, @@ -4201,8 +4224,47 @@ static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev) return (var_table->bitmap) ? 0 : -ENOMEM; } +static void mlx5_ib_cleanup_ucaps(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) + ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL); + + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & + MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA) + ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); +} + +static int mlx5_ib_init_ucaps(struct mlx5_ib_dev *dev) +{ + int ret; + + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) { + ret = ib_create_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL); + if (ret) + return ret; + } + + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & + MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA) { + ret = ib_create_ucap(RDMA_UCAP_MLX5_CTRL_OTHER_VHCA); + if (ret) + goto remove_local; + } + + return 0; + +remove_local: + if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) + ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL); + return ret; +} + static void mlx5_ib_stage_caps_cleanup(struct mlx5_ib_dev *dev) { + if (MLX5_CAP_GEN_2_64(dev->mdev, general_obj_types_127_64) & + MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) + mlx5_ib_cleanup_ucaps(dev); + bitmap_free(dev->var_table.bitmap); } @@ -4253,6 +4315,13 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) return err; } + if (MLX5_CAP_GEN_2_64(dev->mdev, general_obj_types_127_64) & + MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) { + err = mlx5_ib_init_ucaps(dev); + if (err) + return err; + } + dev->ib_dev.use_cq_dim = true; return 0; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 974a45c92fbb..ace2df3e1d9f 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -276,6 +276,7 @@ struct mlx5_ib_flow_matcher { struct mlx5_core_dev *mdev; atomic_t usecnt; u8 match_criteria_enable; + u32 ib_port; }; struct mlx5_ib_steering_anchor { @@ -293,6 +294,18 @@ enum mlx5_ib_optional_counter_type { MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS, MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS, MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS, + MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS, + MLX5_IB_OPCOUNTER_RDMA_TX_BYTES, + MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS, + MLX5_IB_OPCOUNTER_RDMA_RX_BYTES, + + MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP, + MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP, + MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP, + MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP, + MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP, + MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP, + MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP, MLX5_IB_OPCOUNTER_MAX, }; @@ -307,6 +320,8 @@ struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio rdma_tx[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio opfcs[MLX5_IB_OPCOUNTER_MAX]; struct mlx5_flow_table *lag_demux_ft; + struct mlx5_ib_flow_prio *rdma_transport_rx; + struct mlx5_ib_flow_prio *rdma_transport_tx; /* Protect flow steering bypass flow tables * 
when add/del flow rules. * only single add/removal of flow steering rule could be done @@ -883,6 +898,14 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type); +int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, + u32 port); + +void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter); + +void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev, + struct rdma_counter *counter); + struct mlx5_ib_multiport_info; struct mlx5_ib_multiport { diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 753faa9ad06a..b7c8c926c578 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -56,7 +56,7 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context); static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate, + unsigned long page_size, bool populate, int access_mode); static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr); @@ -718,8 +718,7 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, } static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, - struct mlx5_cache_ent *ent, - int access_flags) + struct mlx5_cache_ent *ent) { struct mlx5_ib_mr *mr; int err; @@ -794,7 +793,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, if (!ent) return ERR_PTR(-EOPNOTSUPP); - return _mlx5_mr_cache_alloc(dev, ent, access_flags); + return _mlx5_mr_cache_alloc(dev, ent); } static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) @@ -919,6 +918,25 @@ mkeys_err: return ERR_PTR(ret); } +static void mlx5r_destroy_cache_entries(struct mlx5_ib_dev *dev) +{ + struct rb_root *root = &dev->cache.rb_root; + struct mlx5_cache_ent *ent; + struct rb_node *node; + + mutex_lock(&dev->cache.rb_lock); + node = rb_first(root); + while (node) { + ent = rb_entry(node, struct mlx5_cache_ent, node); + node = rb_next(node); + clean_keys(dev, ent); + rb_erase(&ent->node, root); + mlx5r_mkeys_uninit(ent); + kfree(ent); + } + mutex_unlock(&dev->cache.rb_lock); +} + int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mkey_cache *cache = &dev->cache; @@ -970,6 +988,8 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) err: mutex_unlock(&cache->rb_lock); mlx5_mkey_cache_debugfs_cleanup(dev); + mlx5r_destroy_cache_entries(dev); + destroy_workqueue(cache->wq); mlx5_ib_warn(dev, "failed to create mkey cache entry\n"); return ret; } @@ -1003,17 +1023,7 @@ void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); /* At this point all entries are disabled and have no concurrent work. 
*/ - mutex_lock(&dev->cache.rb_lock); - node = rb_first(root); - while (node) { - ent = rb_entry(node, struct mlx5_cache_ent, node); - node = rb_next(node); - clean_keys(dev, ent); - rb_erase(&ent->node, root); - mlx5r_mkeys_uninit(ent); - kfree(ent); - } - mutex_unlock(&dev->cache.rb_lock); + mlx5r_destroy_cache_entries(dev); destroy_workqueue(dev->cache.wq); del_timer_sync(&dev->delay_timer); @@ -1115,7 +1125,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, struct mlx5r_cache_rb_key rb_key = {}; struct mlx5_cache_ent *ent; struct mlx5_ib_mr *mr; - unsigned int page_size; + unsigned long page_size; if (umem->is_dmabuf) page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); @@ -1144,7 +1154,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, return mr; } - mr = _mlx5_mr_cache_alloc(dev, ent, access_flags); + mr = _mlx5_mr_cache_alloc(dev, ent); if (IS_ERR(mr)) return mr; @@ -1219,7 +1229,7 @@ err_1: */ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate, + unsigned long page_size, bool populate, int access_mode) { struct mlx5_ib_dev *dev = to_mdev(pd->device); @@ -1425,7 +1435,7 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, mr = alloc_cacheable_mr(pd, umem, iova, access_flags, MLX5_MKC_ACCESS_MODE_MTT); } else { - unsigned int page_size = + unsigned long page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); mutex_lock(&dev->slow_path_mutex); @@ -1957,7 +1967,6 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, if (mr->mmkey.cache_ent) { spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock); - mr->mmkey.cache_ent->in_use--; goto end; } @@ -2025,6 +2034,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) bool is_odp = is_odp_mr(mr); bool is_odp_dma_buf = is_dmabuf_mr(mr) && !to_ib_umem_dmabuf(mr->umem)->pinned; + bool from_cache = !!ent; int ret = 0; if (is_odp) @@ -2037,6 +2047,8 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) ent = mr->mmkey.cache_ent; /* upon storing to a clean temp entry - schedule its cleanup */ spin_lock_irq(&ent->mkeys_queue.lock); + if (from_cache) + ent->in_use--; if (ent->is_tmp && !ent->tmp_cleanup_scheduled) { mod_delayed_work(ent->dev->cache.wq, &ent->dwork, msecs_to_jiffies(30 * 1000)); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index e77c9280c07e..86d8fa63bf69 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -309,9 +309,6 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, blk_start_idx = idx; in_block = 1; } - - /* Count page invalidations */ - invalidations += idx - blk_start_idx + 1; } else { u64 umr_offset = idx & umr_block_mask; @@ -321,14 +318,19 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC); in_block = 0; + /* Count page invalidations */ + invalidations += idx - blk_start_idx + 1; } } } - if (in_block) + if (in_block) { mlx5r_umr_update_xlt(mr, blk_start_idx, idx - blk_start_idx + 1, 0, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC); + /* Count page invalidations */ + invalidations += idx - blk_start_idx + 1; + } mlx5_update_odp_stats_with_handled(mr, invalidations, invalidations); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index 9f54aa90a35a..bcd43dc30e21 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ 
b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -237,34 +237,6 @@ enum rdma_link_layer pvrdma_port_link_layer(struct ib_device *ibdev, return IB_LINK_LAYER_ETHERNET; } -int pvrdma_modify_device(struct ib_device *ibdev, int mask, - struct ib_device_modify *props) -{ - unsigned long flags; - - if (mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | - IB_DEVICE_MODIFY_NODE_DESC)) { - dev_warn(&to_vdev(ibdev)->pdev->dev, - "unsupported device modify mask %#x\n", mask); - return -EOPNOTSUPP; - } - - if (mask & IB_DEVICE_MODIFY_NODE_DESC) { - spin_lock_irqsave(&to_vdev(ibdev)->desc_lock, flags); - memcpy(ibdev->node_desc, props->node_desc, 64); - spin_unlock_irqrestore(&to_vdev(ibdev)->desc_lock, flags); - } - - if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { - mutex_lock(&to_vdev(ibdev)->port_mutex); - to_vdev(ibdev)->sys_image_guid = - cpu_to_be64(props->sys_image_guid); - mutex_unlock(&to_vdev(ibdev)->port_mutex); - } - - return 0; -} - /** * pvrdma_modify_port - modify device port attributes * @ibdev: the device to modify diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index 4b9edc03d73d..fd47b0b1df5c 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -356,8 +356,6 @@ int pvrdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey); enum rdma_link_layer pvrdma_port_link_layer(struct ib_device *ibdev, u32 port); -int pvrdma_modify_device(struct ib_device *ibdev, int mask, - struct ib_device_modify *props); int pvrdma_modify_port(struct ib_device *ibdev, u32 port, int mask, struct ib_port_modify *props); int pvrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); |
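One more aside, on the per-QP optional-counter hunks in fs.c above: the binding path keys its per-QP state off the QP number with a plain xarray (qpn_opfc_xa), looking an entry up on bind, allocating and publishing it once the flow rules are in place, and erasing it on unbind. A reduced, hypothetical version of that lookup-or-allocate / publish / drop life cycle (names invented for the sketch, not the driver's):

#include <linux/slab.h>
#include <linux/types.h>
#include <linux/xarray.h>

struct per_qp_state {
	int nrules;			/* placeholder for per-QP counter state */
};

/* Return existing state for this QPN, or a fresh (unpublished) allocation. */
static struct per_qp_state *qp_state_get(struct xarray *xa, u32 qpn, bool *is_new)
{
	struct per_qp_state *st;

	*is_new = false;
	st = xa_load(xa, qpn);
	if (st)
		return st;

	st = kzalloc(sizeof(*st), GFP_KERNEL);
	if (!st)
		return NULL;

	*is_new = true;
	return st;			/* caller publishes it only on success */
}

/* Publish the state under the QPN; xa_err() unwraps a failed xa_store(). */
static int qp_state_publish(struct xarray *xa, u32 qpn, struct per_qp_state *st)
{
	return xa_err(xa_store(xa, qpn, st, GFP_KERNEL));
}

/* Unbind path: remove the entry and free it. */
static void qp_state_drop(struct xarray *xa, u32 qpn)
{
	kfree(xa_erase(xa, qpn));
}

Deferring the xa_store() until the rules are installed keeps the error path to a plain kfree() of the unpublished entry, which appears to be the pattern the fs.c bind/unbind hunks follow.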