From 2ef5612391f0a7a631c42a8afc867095b49a1992 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 4 Jun 2020 15:39:02 +0100 Subject: RDMA/mlx5: Remove duplicated assignment to resp.response_length The value assigned to resp.response_length is never read, since it is updated again by the next statement. The assignment is redundant, so remove it. Fixes: a645a89d9a78 ("RDMA/mlx5: Return ECE DC support") Link: https://lore.kernel.org/r/20200604143902.56021-1-colin.king@canonical.com Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Acked-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 81bf6b975e0e..d61ca85033de 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4162,8 +4162,6 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (udata->outlen < min_resp_len) return -EINVAL; - resp.response_length = min_resp_len; - /* * If we don't have enough space for the ECE options, * simply indicate it with resp.response_length. -- cgit v1.2.3 From 4f5747cf8e5947479a27a3597829e45d6d8d73e0 Mon Sep 17 00:00:00 2001 From: Tom Seewald Date: Thu, 4 Jun 2020 21:30:12 -0500 Subject: RDMA/mlx5: Fix -Wformat warning in check_ucmd_data() Variables of type size_t should use %zu rather than %lu [1]. The variables "inlen", "ucmd", "last", and "size" are all size_t, so use the correct format specifiers. [1] https://www.kernel.org/doc/html/latest/core-api/printk-formats.html Fixes: e383085c2425 ("RDMA/mlx5: Set ECE options during QP create") Link: https://lore.kernel.org/r/20200605023012.9527-1-tseewald@gmail.com Signed-off-by: Tom Seewald Acked-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index d61ca85033de..dbe82cdb8d2c 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2907,7 +2907,7 @@ static int check_ucmd_data(struct mlx5_ib_dev *dev, if (!ret) mlx5_ib_dbg( dev, - "udata is not cleared, inlen = %lu, ucmd = %lu, last = %lu, size = %lu\n", + "udata is not cleared, inlen = %zu, ucmd = %zu, last = %zu, size = %zu\n", udata->inlen, params->ucmd_size, last, size); return ret ? 0 : -EINVAL; } -- cgit v1.2.3 From 0dc63bbee0fa6da20283ae6e22e99c6fde25ed8e Mon Sep 17 00:00:00 2001 From: Kieran Bingham Date: Tue, 9 Jun 2020 13:45:55 +0100 Subject: RDMA/hfi1: Fix trivial mis-spelling of 'descriptor' The word 'descriptor' is misspelled throughout the tree.
Fix it up accordingly: decriptors -> descriptors Link: https://lore.kernel.org/r/20200609124610.3445662-3-kieran.bingham+renesas@ideasonboard.com Link: https://lore.kernel.org/r/20200609124610.3445662-12-kieran.bingham+renesas@ideasonboard.com Signed-off-by: Kieran Bingham Acked-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/iowait.h | 2 +- drivers/infiniband/hw/hfi1/ipoib_tx.c | 2 +- drivers/infiniband/hw/hfi1/verbs_txreq.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index 07847cb72169..d580aa17ae37 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -399,7 +399,7 @@ static inline void iowait_get_priority(struct iowait *w) * @wait_head: the wait queue * * This function is called to insert an iowait struct into a - * wait queue after a resource (eg, sdma decriptor or pio + * wait queue after a resource (eg, sdma descriptor or pio * buffer) is run out. */ static inline void iowait_queue(bool pkts_sent, struct iowait *w, diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index 883cb9d48022..175290c56db9 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -364,7 +364,7 @@ static struct ipoib_txreq *hfi1_ipoib_send_dma_common(struct net_device *dev, if (unlikely(!tx)) return ERR_PTR(-ENOMEM); - /* so that we can test if the sdma decriptors are there */ + /* so that we can test if the sdma descriptors are there */ tx->txreq.num_desc = 0; tx->priv = priv; tx->txq = txp->txq; diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h index bfa6e081cb56..d2d526c5a756 100644 --- a/drivers/infiniband/hw/hfi1/verbs_txreq.h +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -91,7 +91,7 @@ static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev, tx->mr = NULL; tx->sde = priv->s_sde; tx->psc = priv->s_sendcontext; - /* so that we can test if the sdma decriptors are there */ + /* so that we can test if the sdma descriptors are there */ tx->txreq.num_desc = 0; /* Set the header type */ tx->phdr.hdr.hdr_type = priv->hdr_type; -- cgit v1.2.3 From 0133654d8eb8607eacc96badfe49bf992155f4cb Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 14 Jun 2020 13:35:34 +0300 Subject: RDMA/efa: Set maximum pkeys device attribute The max_pkeys device attribute was not set in the query device verb; set it to one in order to account for the default pkey (0xffff).
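As a rough sketch of what a single-entry pkey table means for consumers, the hypothetical helper below accepts only index 0 and reports the default full-membership pkey. The function name and error convention are illustrative assumptions, not part of the EFA driver; kernel-style u16 from <linux/types.h> is assumed.

#include <linux/types.h>
#include <linux/errno.h>

/* illustrative only: a pkey query helper for a device with max_pkeys == 1 */
static int query_default_pkey(unsigned int index, u16 *pkey)
{
	if (index >= 1)		/* index 0 is the only valid pkey index */
		return -EINVAL;

	*pkey = 0xffff;		/* the default full-membership pkey */
	return 0;
}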
This information is exposed to userspace and can cause malfunction. Fixes: 40909f664d27 ("RDMA/efa: Add EFA verbs implementation") Link: https://lore.kernel.org/r/20200614103534.88060-1-galpress@amazon.com Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_verbs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 08313f7c73bc..7dd082441333 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -212,6 +212,7 @@ int efa_query_device(struct ib_device *ibdev, props->max_send_sge = dev_attr->max_sq_sge; props->max_recv_sge = dev_attr->max_rq_sge; props->max_sge_rd = dev_attr->max_wr_rdma_sge; + props->max_pkeys = 1; if (udata && udata->outlen) { resp.max_sq_sge = dev_attr->max_sq_sge; -- cgit v1.2.3 From 0dfbd5ecf28cbcb81674c49d34ee97366db1be44 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Tue, 16 Jun 2020 12:34:08 +0300 Subject: RDMA/qedr: Fix KASAN: use-after-free in ucma_event_handler+0x532 Private data passed to iwarp_cm_handler is copied for connection request / response, but ignored otherwise. If junk is passed, it is stored in the event and used later in the event processing. The driver passes an old junk pointer during connection close, which leads to a use-after-free on event processing. Set private data to NULL for events that don't have private data. BUG: KASAN: use-after-free in ucma_event_handler+0x532/0x560 [rdma_ucm] kernel: Read of size 4 at addr ffff8886caa71200 by task kworker/u128:1/5250 kernel: kernel: Workqueue: iw_cm_wq cm_work_handler [iw_cm] kernel: Call Trace: kernel: dump_stack+0x8c/0xc0 kernel: print_address_description.constprop.0+0x1b/0x210 kernel: ? ucma_event_handler+0x532/0x560 [rdma_ucm] kernel: ? ucma_event_handler+0x532/0x560 [rdma_ucm] kernel: __kasan_report.cold+0x1a/0x33 kernel: ? ucma_event_handler+0x532/0x560 [rdma_ucm] kernel: kasan_report+0xe/0x20 kernel: check_memory_region+0x130/0x1a0 kernel: memcpy+0x20/0x50 kernel: ucma_event_handler+0x532/0x560 [rdma_ucm] kernel: ? __rpc_execute+0x608/0x620 [sunrpc] kernel: cma_iw_handler+0x212/0x330 [rdma_cm] kernel: ? iw_conn_req_handler+0x6e0/0x6e0 [rdma_cm] kernel: ? enqueue_timer+0x86/0x140 kernel: ? _raw_write_lock_irq+0xd0/0xd0 kernel: cm_work_handler+0xd3d/0x1070 [iw_cm] Fixes: e411e0587e0d ("RDMA/qedr: Add iWARP connection management functions") Link: https://lore.kernel.org/r/20200616093408.17827-1-michal.kalderon@marvell.com Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/qedr_iw_cm.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c b/drivers/infiniband/hw/qedr/qedr_iw_cm.c index 792eecd206b6..97fc7dd353b0 100644 --- a/drivers/infiniband/hw/qedr/qedr_iw_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c @@ -150,8 +150,17 @@ qedr_iw_issue_event(void *context, if (params->cm_info) { event.ird = params->cm_info->ird; event.ord = params->cm_info->ord; - event.private_data_len = params->cm_info->private_data_len; - event.private_data = (void *)params->cm_info->private_data; + /* Only connect_request and reply have valid private data + * the rest of the events this may be left overs from + * connection establishment.
CONNECT_REQUEST is issued via + * qedr_iw_mpa_request */ + if (event_type == IW_CM_EVENT_CONNECT_REPLY) { + event.private_data_len = + params->cm_info->private_data_len; + event.private_data = + (void *)params->cm_info->private_data; + } } if (ep->cm_id) -- cgit v1.2.3 From ab183d460daac6292cb0cfd989d88b37b2437844 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 16 Jun 2020 13:45:36 +0300 Subject: RDMA/mlx5: Add missed RST2INIT and INIT2INIT steps during ECE handshake Missed steps during the ECE handshake left the userspace application with fewer options for the ECE handshake. Pass ECE options in the additional transitions. Fixes: 50aec2c3135e ("RDMA/mlx5: Return ECE data after modify QP") Link: https://lore.kernel.org/r/20200616104536.2426384-1-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qpc.c | 8 ++++++++ include/linux/mlx5/mlx5_ifc.h | 10 ++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index c19d91d6dce8..7c3968ef9cd1 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -346,6 +346,9 @@ static int get_ece_from_mbox(void *out, u16 opcode) int ece = 0; switch (opcode) { + case MLX5_CMD_OP_INIT2INIT_QP: + ece = MLX5_GET(init2init_qp_out, out, ece); + break; case MLX5_CMD_OP_INIT2RTR_QP: ece = MLX5_GET(init2rtr_qp_out, out, ece); break; @@ -355,6 +358,9 @@ static int get_ece_from_mbox(void *out, u16 opcode) case MLX5_CMD_OP_RTS2RTS_QP: ece = MLX5_GET(rts2rts_qp_out, out, ece); break; + case MLX5_CMD_OP_RST2INIT_QP: + ece = MLX5_GET(rst2init_qp_out, out, ece); + break; default: break; } @@ -406,6 +412,7 @@ static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, return -ENOMEM; MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn, opt_param_mask, qpc, uid); + MLX5_SET(rst2init_qp_in, mbox->in, ece, ece); break; case MLX5_CMD_OP_INIT2RTR_QP: if (MBOX_ALLOC(mbox, init2rtr_qp)) @@ -439,6 +446,7 @@ static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, return -ENOMEM; MOD_QP_IN_SET_QPC(init2init_qp, mbox->in, opcode, qpn, opt_param_mask, qpc, uid); + MLX5_SET(init2init_qp_in, mbox->in, ece, ece); break; default: return -EINVAL; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 116bd9bb347f..ca1887dd0423 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4283,7 +4283,8 @@ struct mlx5_ifc_rst2init_qp_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_rst2init_qp_in_bits { @@ -4300,7 +4301,7 @@ struct mlx5_ifc_rst2init_qp_in_bits { u8 opt_param_mask[0x20]; - u8 reserved_at_a0[0x20]; + u8 ece[0x20]; struct mlx5_ifc_qpc_bits qpc; @@ -6619,7 +6620,8 @@ struct mlx5_ifc_init2init_qp_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_init2init_qp_in_bits { @@ -6636,7 +6638,7 @@ struct mlx5_ifc_init2init_qp_in_bits { u8 opt_param_mask[0x20]; - u8 reserved_at_a0[0x20]; + u8 ece[0x20]; struct mlx5_ifc_qpc_bits qpc; -- cgit v1.2.3 From 98a6151907cb5512eb6ba8b90644e3ace2d2fc46 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Tue, 16 Jun 2020 21:37:09 +0800 Subject: RDMA/hns: Fix a calltrace when registering MR from userspace ibmr.device is assigned after the MR is successfully registered, but both write_mtpt() and frmr_write_mtpt() access
it during the mr registration process, which may cause the following error when trying to register an MR from userspace with pbl_hop_num set to 0. pc : hns_roce_mtr_find+0xa0/0x200 [hns_roce] lr : set_mtpt_pbl+0x54/0x118 [hns_roce_hw_v2] sp : ffff00023e73ba20 x29: ffff00023e73ba20 x28: ffff00023e73bad8 x27: 0000000000000000 x26: 0000000000000000 x25: 0000000000000002 x24: 0000000000000000 x23: ffff00023e73bad0 x22: 0000000000000000 x21: ffff0000094d9000 x20: 0000000000000000 x19: ffff8020a6bdb2c0 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000000 x13: 0140000000000000 x12: 0040000000000041 x11: ffff000240000000 x10: 0000000000001000 x9 : 0000000000000000 x8 : ffff802fb7558480 x7 : ffff802fb7558480 x6 : 000000000003483d x5 : ffff00023e73bad0 x4 : 0000000000000002 x3 : ffff00023e73bad8 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0000094d9708 Call trace: hns_roce_mtr_find+0xa0/0x200 [hns_roce] set_mtpt_pbl+0x54/0x118 [hns_roce_hw_v2] hns_roce_v2_write_mtpt+0x14c/0x168 [hns_roce_hw_v2] hns_roce_mr_enable+0x6c/0x148 [hns_roce] hns_roce_reg_user_mr+0xd8/0x130 [hns_roce] ib_uverbs_reg_mr+0x14c/0x2e0 [ib_uverbs] ib_uverbs_write+0x27c/0x3e8 [ib_uverbs] __vfs_write+0x60/0x190 vfs_write+0xac/0x1c0 ksys_write+0x6c/0xd8 __arm64_sys_write+0x24/0x30 el0_svc_common+0x78/0x130 el0_svc_handler+0x38/0x78 el0_svc+0x8/0xc Solve the above issue by adding a pointer to struct hns_roce_dev as a parameter of write_mtpt() and frmr_write_mtpt(), so that both of these functions can access it before the MR registration finishes. Fixes: 9b2cf76c9f05 ("RDMA/hns: Optimize PBL buffer allocation process") Link: https://lore.kernel.org/r/1592314629-51715-1-git-send-email-liweihang@huawei.com Signed-off-by: Yangyang Li Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 7 ++++--- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 4 ++-- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 15 ++++++++------- drivers/infiniband/hw/hns/hns_roce_mr.c | 5 +++-- 4 files changed, 17 insertions(+), 14 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index a77fa6730b2d..479fa557993e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -898,13 +898,14 @@ struct hns_roce_hw { int (*set_mac)(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr); void (*set_mtu)(struct hns_roce_dev *hr_dev, u8 phy_port, enum ib_mtu mtu); - int (*write_mtpt)(void *mb_buf, struct hns_roce_mr *mr, - unsigned long mtpt_idx); + int (*write_mtpt)(struct hns_roce_dev *hr_dev, void *mb_buf, + struct hns_roce_mr *mr, unsigned long mtpt_idx); int (*rereg_write_mtpt)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, int flags, u32 pdn, int mr_access_flags, u64 iova, u64 size, void *mb_buf); - int (*frmr_write_mtpt)(void *mb_buf, struct hns_roce_mr *mr); + int (*frmr_write_mtpt)(struct hns_roce_dev *hr_dev, void *mb_buf, + struct hns_roce_mr *mr); int (*mw_write_mtpt)(void *mb_buf, struct hns_roce_mw *mw); void (*write_cqc)(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index d02207cd30df..cf39f560b800 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -1756,10 +1756,10 @@ static void hns_roce_v1_set_mtu(struct
hns_roce_dev *hr_dev, u8 phy_port, val); } -static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, +static int hns_roce_v1_write_mtpt(struct hns_roce_dev *hr_dev, void *mb_buf, + struct hns_roce_mr *mr, unsigned long mtpt_idx) { - struct hns_roce_dev *hr_dev = to_hr_dev(mr->ibmr.device); u64 pages[HNS_ROCE_MAX_INNER_MTPT_NUM] = { 0 }; struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_v1_mpt_entry *mpt_entry; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index c597d7281629..bb86754cb94d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2529,10 +2529,10 @@ static int hns_roce_v2_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port, return hns_roce_cmq_send(hr_dev, &desc, 1); } -static int set_mtpt_pbl(struct hns_roce_v2_mpt_entry *mpt_entry, +static int set_mtpt_pbl(struct hns_roce_dev *hr_dev, + struct hns_roce_v2_mpt_entry *mpt_entry, struct hns_roce_mr *mr) { - struct hns_roce_dev *hr_dev = to_hr_dev(mr->ibmr.device); u64 pages[HNS_ROCE_V2_MAX_INNER_MTPT_NUM] = { 0 }; struct ib_device *ibdev = &hr_dev->ib_dev; dma_addr_t pbl_ba; @@ -2571,7 +2571,8 @@ static int set_mtpt_pbl(struct hns_roce_v2_mpt_entry *mpt_entry, return 0; } -static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, +static int hns_roce_v2_write_mtpt(struct hns_roce_dev *hr_dev, + void *mb_buf, struct hns_roce_mr *mr, unsigned long mtpt_idx) { struct hns_roce_v2_mpt_entry *mpt_entry; @@ -2620,7 +2621,7 @@ static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, if (mr->type == MR_TYPE_DMA) return 0; - ret = set_mtpt_pbl(mpt_entry, mr); + ret = set_mtpt_pbl(hr_dev, mpt_entry, mr); return ret; } @@ -2666,15 +2667,15 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev, mr->iova = iova; mr->size = size; - ret = set_mtpt_pbl(mpt_entry, mr); + ret = set_mtpt_pbl(hr_dev, mpt_entry, mr); } return ret; } -static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr) +static int hns_roce_v2_frmr_write_mtpt(struct hns_roce_dev *hr_dev, + void *mb_buf, struct hns_roce_mr *mr) { - struct hns_roce_dev *hr_dev = to_hr_dev(mr->ibmr.device); struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_v2_mpt_entry *mpt_entry; dma_addr_t pbl_ba = 0; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 4c0bbb12770d..0e71ebee9e52 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -180,9 +180,10 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, } if (mr->type != MR_TYPE_FRMR) - ret = hr_dev->hw->write_mtpt(mailbox->buf, mr, mtpt_idx); + ret = hr_dev->hw->write_mtpt(hr_dev, mailbox->buf, mr, + mtpt_idx); else - ret = hr_dev->hw->frmr_write_mtpt(mailbox->buf, mr); + ret = hr_dev->hw->frmr_write_mtpt(hr_dev, mailbox->buf, mr); if (ret) { dev_err(dev, "Write mtpt fail!\n"); goto err_page; -- cgit v1.2.3 From 3ec5f54f7a0fdf706951f379d3148798325e516f Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Tue, 16 Jun 2020 21:39:38 +0800 Subject: RDMA/hns: Fix a cmd queue issue when resetting If an IMP reset caused by some hardware error and an hns RoCE driver reset occur at the same time, there is a possibility that the IMP will stop handling commands and users can't use the hardware.
The logs are as follows: hns3 0000:fd:00.1: cleaned 0, need to clean 1 hns3 0000:fd:00.1: firmware version query failed -11 hns3 0000:fd:00.1: Cmd queue init failed hns3 0000:fd:00.1: Upgrade reset level hns3 0000:fd:00.1: global reset interrupt The hns NIC driver divides the reset process into 3 stages: initialization, hardware resetting and software resetting. The RoCE driver gets the reset status through interfaces provided by the NIC driver, and commands will not be sent to the IMP if the driver is in any of the above stages. The main reason for this issue is that there is a time gap between stages 1 and 2; if the RoCE driver sends commands to the IMP during this gap, the IMP will stop working because it is not ready. To eliminate the time gap, the hns NIC driver has added a new interface in commit a4de02287abb9 ("net: hns3: provide .get_cmdq_stat interface for the client"), so the RoCE driver can ensure that no commands will be sent during resetting. Link: https://lore.kernel.org/r/1592314778-52822-1-git-send-email-liweihang@huawei.com Signed-off-by: Yangyang Li Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index bb86754cb94d..dd01a51816cc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -910,7 +910,7 @@ static int hns_roce_v2_rst_process_cmd(struct hns_roce_dev *hr_dev) instance_stage = handle->rinfo.instance_state; reset_stage = handle->rinfo.reset_state; reset_cnt = ops->ae_dev_reset_cnt(handle); - hw_resetting = ops->get_hw_reset_stat(handle); + hw_resetting = ops->get_cmdq_stat(handle); sw_resetting = ops->ae_dev_resetting(handle); if (reset_cnt != hr_dev->reset_cnt) -- cgit v1.2.3 From 6c41965d647a97b51ff665c7406ec9435aab7fc1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 17 Jun 2020 16:01:48 +0300 Subject: RDMA/mlx5: Don't access ib_qp fields in internal destroy QP path destroy_qp_common() is called for flows where the QP has already been created by HW. When it is called from IB/core, the ibqp.* fields will be fully initialized, but that is not the case if this function is called during QP creation. Avoid relying on ibqp fields as much as possible, and initialize send_cq/recv_cq as a temporary solution until all drivers are converted to the IB/core QP allocation scheme. refcount_t: underflow; use-after-free. WARNING: CPU: 1 PID: 5372 at lib/refcount.c:28 refcount_warn_saturate+0xfe/0x1a0 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 5372 Comm: syz-executor.2 Not tainted 5.5.0-rc5 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 Call Trace: mlx5_core_put_rsc+0x70/0x80 destroy_resource_common+0x8e/0xb0 mlx5_core_destroy_qp+0xaf/0x1d0 mlx5_ib_destroy_qp+0xeb0/0x1460 ib_destroy_qp_user+0x2d5/0x7d0 create_qp+0xed3/0x2130 ib_uverbs_create_qp+0x13e/0x190 ?
ib_uverbs_ex_create_qp ib_uverbs_write+0xaa5/0xdf0 __vfs_write+0x7c/0x100 ksys_write+0xc8/0x200 do_syscall_64+0x9c/0x390 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 08d53976609a ("RDMA/mlx5: Copy response to the user in one place") Link: https://lore.kernel.org/r/20200617130148.2846643-1-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index dbe82cdb8d2c..1e567fe3a527 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2341,18 +2341,18 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, unsigned long flags; int err; - if (qp->ibqp.rwq_ind_tbl) { + if (qp->is_rss) { destroy_rss_raw_qp_tir(dev, qp); return; } - base = (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + base = (qp->type == IB_QPT_RAW_PACKET || qp->flags & IB_QP_CREATE_SOURCE_QPN) ? - &qp->raw_packet_qp.rq.base : - &qp->trans_qp.base; + &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; if (qp->state != IB_QPS_RESET) { - if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET && + if (qp->type != IB_QPT_RAW_PACKET && !(qp->flags & IB_QP_CREATE_SOURCE_QPN)) { err = mlx5_core_qp_modify(dev, MLX5_CMD_OP_2RST_QP, 0, NULL, &base->mqp, NULL); @@ -2368,8 +2368,8 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, base->mqp.qpn); } - get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, - &send_cq, &recv_cq); + get_cqs(qp->type, qp->ibqp.send_cq, qp->ibqp.recv_cq, &send_cq, + &recv_cq); spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); mlx5_ib_lock_cqs(send_cq, recv_cq); @@ -2391,7 +2391,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, mlx5_ib_unlock_cqs(send_cq, recv_cq); spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); - if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + if (qp->type == IB_QPT_RAW_PACKET || qp->flags & IB_QP_CREATE_SOURCE_QPN) { destroy_raw_packet_qp(dev, qp); } else { @@ -3002,10 +3002,18 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr, return &qp->ibqp; destroy_qp: - if (qp->type == MLX5_IB_QPT_DCT) + if (qp->type == MLX5_IB_QPT_DCT) { mlx5_ib_destroy_dct(qp); - else + } else { + /* + * The two lines below are temp solution till QP allocation + * will be moved to be under IB/core responsiblity. + */ + qp->ibqp.send_cq = attr->send_cq; + qp->ibqp.recv_cq = attr->recv_cq; destroy_qp_common(dev, qp, udata); + } + qp = NULL; free_qp: kfree(qp); -- cgit v1.2.3 From d44335572f76020ef473d7977d9fd424e6fc0021 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 18 Jun 2020 14:25:07 +0300 Subject: RDMA/mlx5: Fix remote gid value in query QP Remote gid is not copied to the right address. Fix it by using rdma_ah_set_dgid_raw to copy the remote gid value from the QP context on query QP. 
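To illustrate why the accessor matters, a minimal sketch: a memcpy() whose destination is the attribute pointer itself writes at offset 0 and clobbers whatever fields live there, while an accessor in the spirit of rdma_ah_set_dgid_raw() targets the nested GRH dgid field. The struct layout below is a simplified assumption that only mimics rdma_ah_attr.

#include <string.h>

struct demo_grh { unsigned char dgid[16]; };
struct demo_ah_attr { unsigned int dlid; struct demo_grh grh; };

static void demo_set_dgid_raw(struct demo_ah_attr *attr, const void *dgid)
{
	/* write into the nested field, not the start of the struct */
	memcpy(attr->grh.dgid, dgid, sizeof(attr->grh.dgid));
}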
Fixes: 70bd7fb87625 ("RDMA/mlx5: Remove manually crafted QP context the query call") Link: https://lore.kernel.org/r/20200618112507.3453496-3-leon@kernel.org Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 1e567fe3a527..264e6d228803 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4390,8 +4390,7 @@ static void to_rdma_ah_attr(struct mlx5_ib_dev *ibdev, MLX5_GET(ads, path, src_addr_index), MLX5_GET(ads, path, hop_limit), MLX5_GET(ads, path, tclass)); - memcpy(ah_attr, MLX5_ADDR_OF(ads, path, rgid_rip), - MLX5_FLD_SZ_BYTES(ads, rgid_rip)); + rdma_ah_set_dgid_raw(ah_attr, MLX5_ADDR_OF(ads, path, rgid_rip)); } } -- cgit v1.2.3 From 2c0f5292d5358c2c5576146071d641110c3c1612 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 18 Jun 2020 14:25:06 +0300 Subject: RDMA/mlx5: Remove ECE limitation from the RAW_PACKET QPs Like any other QP type, rely on FW for the RAW_PACKET QPs to decide if ECE is supported or not. This fixes an inability to create RAW_PACKET QPs with the latest rdma-core, which has ECE support. Fixes: e383085c2425 ("RDMA/mlx5: Set ECE options during QP create") Link: https://lore.kernel.org/r/20200618112507.3453496-2-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 264e6d228803..eb5b724b121b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2873,7 +2873,6 @@ static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp) static int check_ucmd_data(struct mlx5_ib_dev *dev, struct mlx5_create_qp_params *params) { - struct ib_qp_init_attr *attr = params->attr; struct ib_udata *udata = params->udata; size_t size, last; int ret; @@ -2885,14 +2884,7 @@ static int check_ucmd_data(struct mlx5_ib_dev *dev, */ last = sizeof(struct mlx5_ib_create_qp_rss); else - /* IB_QPT_RAW_PACKET doesn't have ECE data */ - switch (attr->qp_type) { - case IB_QPT_RAW_PACKET: - last = offsetof(struct mlx5_ib_create_qp, ece_options); - break; - default: - last = offsetof(struct mlx5_ib_create_qp, reserved); - } + last = offsetof(struct mlx5_ib_create_qp, reserved); if (udata->inlen <= last) return 0; -- cgit v1.2.3 From 9e0dc7b9e1cbc9cd0dc4ab2377c0536d4b18bedc Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 17 Jun 2020 16:02:30 +0300 Subject: RDMA/mlx5: Fix integrity enabled QP creation The create_flags checks were refactored, which broke the creation of integrity enabled QPs and in turn broke the NVMe/RDMA and iSER ULPs when using mlx5-driven devices.
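For context, a condensed sketch of the flag-vetting pattern that process_create_flags() relies on: each flag the device can honor is cleared from the request mask, and any bit still set afterwards fails the create. The helper and usage below are illustrative assumptions; the real mlx5 code also threads the QP object through and logs the offending flag.

static void vet_create_flag(int *flags, int flag, bool dev_supports)
{
	if ((*flags & flag) && dev_supports)
		*flags &= ~flag;	/* honored: drop it from the mask */
}

/* usage sketch:
 *	vet_create_flag(&flags, IB_QP_CREATE_INTEGRITY_EN, cap_sho);
 *	...vet every other supported flag...
 *	if (flags)
 *		return -EINVAL;		anything left is unsupported
 */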
Fixes: 2978975ce7f1 ("RDMA/mlx5: Process create QP flags in one place") Link: https://lore.kernel.org/r/20200617130230.2846915-1-leon@kernel.org Signed-off-by: Max Gurtovoy Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index eb5b724b121b..a7fcb00e37a5 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2668,6 +2668,9 @@ static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (qp_type == IB_QPT_RAW_PACKET && attr->rwq_ind_tbl) return (create_flags) ? -EINVAL : 0; + process_create_flag(dev, &create_flags, + IB_QP_CREATE_INTEGRITY_EN, + MLX5_CAP_GEN(mdev, sho), qp); process_create_flag(dev, &create_flags, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, MLX5_CAP_GEN(mdev, block_lb_mc), qp); -- cgit v1.2.3 From 6eefa839c4dddf2149e9f5f6f1aa3e1191c8db9c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 21 Jun 2020 14:59:59 +0300 Subject: RDMA/mlx5: Protect from kernel crash if XRC_TGT doesn't have udata Don't deref udata if it is NULL BUG: kernel NULL pointer dereference, address: 0000000000000030 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 SMP PTI CPU: 2 PID: 1592 Comm: python3 Not tainted 5.7.0-rc6+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 RIP: 0010:create_qp+0x39e/0xae0 [mlx5_ib] Code: c0 0d 00 00 bf 10 01 00 00 e8 be a9 e4 e0 48 85 c0 49 89 c2 0f 84 0c 07 00 00 41 8b 85 74 63 01 00 0f c8 a9 00 00 00 10 74 0a <41> 8b 46 30 0f c8 41 89 42 14 41 8b 52 18 41 0f b6 4a 1c 0f ca 89 RSP: 0018:ffffc9000067f8b0 EFLAGS: 00010206 RAX: 0000000010170000 RBX: ffff888441313000 RCX: 0000000000000000 RDX: 0000000000000200 RSI: 0000000000000000 RDI: ffff88845b1d4400 RBP: ffffc9000067fa60 R08: 0000000000000200 R09: ffff88845b1d4200 R10: ffff88845b1d4200 R11: ffff888441313000 R12: ffffc9000067f950 R13: ffff88846ac00140 R14: 0000000000000000 R15: ffff88846c2bc000 FS: 00007faa1a3c0540(0000) GS:ffff88846fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000030 CR3: 0000000446dca003 CR4: 0000000000760ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __switch_to_asm+0x40/0x70 ? __switch_to_asm+0x34/0x70 mlx5_ib_create_qp+0x897/0xfa0 [mlx5_ib] ib_create_qp+0x9e/0x300 [ib_core] create_qp+0x92d/0xb20 [ib_uverbs] ? ib_uverbs_cq_event_handler+0x30/0x30 [ib_uverbs] ? release_resource+0x30/0x30 ib_uverbs_create_qp+0xc4/0xe0 [ib_uverbs] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0xc8/0xf0 [ib_uverbs] ib_uverbs_run_method+0x223/0x770 [ib_uverbs] ? track_pfn_remap+0xa7/0x100 ? uverbs_disassociate_api+0xd0/0xd0 [ib_uverbs] ? remap_pfn_range+0x358/0x490 ib_uverbs_cmd_verbs.isra.6+0x19b/0x370 [ib_uverbs] ? rdma_umap_priv_init+0x82/0xe0 [ib_core] ? 
vm_mmap_pgoff+0xec/0x120 ib_uverbs_ioctl+0xc0/0x120 [ib_uverbs] ksys_ioctl+0x92/0xb0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x48/0x130 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: e383085c2425 ("RDMA/mlx5: Set ECE options during QP create") Link: https://lore.kernel.org/r/20200621115959.60126-1-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a7fcb00e37a5..f939c9b769f0 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1862,7 +1862,7 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (!in) return -ENOMEM; - if (MLX5_CAP_GEN(mdev, ece_support)) + if (MLX5_CAP_GEN(mdev, ece_support) && ucmd) MLX5_SET(create_qp_in, in, ece, ucmd->ece_options); qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); -- cgit v1.2.3 From b46925a24a9ca7db03655657565e03d2de3027c8 Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Tue, 23 Jun 2020 16:32:24 -0400 Subject: IB/hfi1: Restore kfree in dummy_netdev cleanup We need to do some rework on the dummy netdev. Calling free_netdev() would normally make sense, and that will be addressed in an upcoming patch. For now just revert the behavior to what it was before, keeping the unused-variable removal part of the patch. The dd->dummy_netdev is mainly used for packet receiving through alloc_netdev_mqs() for typical net devices. As a result, it should be freed with kfree() instead of free_netdev(); the latter leads to a crash when unloading the hfi1 module: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000855b54067 P4D 8000000855b54067 PUD 84a4f5067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 73 PID: 10299 Comm: modprobe Not tainted 5.6.0-rc5+ #1 Hardware name: Intel Corporation S2600WT2R/S2600WT2R, BIOS SE5C610.86B.01.01.0016.033120161139 03/31/2016 RIP: 0010:__hw_addr_flush+0x12/0x80 Code: 40 00 48 83 c4 08 4c 89 e7 5b 5d 41 5c e9 76 77 18 00 66 0f 1f 44 00 00 0f 1f 44 00 00 41 54 49 89 fc 55 53 48 8b 1f 48 39 df <48> 8b 2b 75 08 eb 4a 48 89 eb 48 89 c5 48 89 df e8 99 bf d0 ff 84 RSP: 0018:ffffb40e08783db8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000002 RDX: ffffb40e00000000 RSI: 0000000000000246 RDI: ffff88ab13662298 RBP: ffff88ab13662000 R08: 0000000000001549 R09: 0000000000001549 R10: 0000000000000001 R11: 000000000aaaaaa R12: ffff88ab13662298 R13: ffff88ab1b259e20 R14: ffff88ab1b259e42 R15: 0000000000000000 FS: 00007fb39b534740(0000) GS:ffff88b31f940000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000084d3ea004 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: dev_addr_flush+0x15/0x30 free_netdev+0x7e/0x130 hfi1_netdev_free+0x59/0x70 [hfi1] remove_one+0x65/0x110 [hfi1] pci_device_remove+0x3b/0xc0 device_release_driver_internal+0xec/0x1b0 driver_detach+0x46/0x90 bus_remove_driver+0x58/0xd0 pci_unregister_driver+0x26/0xa0 hfi1_mod_cleanup+0xc/0xd54 [hfi1] __x64_sys_delete_module+0x16c/0x260 ?
exit_to_usermode_loop+0xa4/0xc0 do_syscall_64+0x5b/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 193ba03141bb ("IB/hfi1: Use free_netdev() in hfi1_netdev_free()") Link: https://lore.kernel.org/r/20200623203224.106975.16926.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/netdev_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c index 63688e85e8da..6d263c9749b3 100644 --- a/drivers/infiniband/hw/hfi1/netdev_rx.c +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -373,7 +373,7 @@ void hfi1_netdev_free(struct hfi1_devdata *dd) { if (dd->dummy_netdev) { dd_dev_info(dd, "hfi1 netdev freed\n"); - free_netdev(dd->dummy_netdev); + kfree(dd->dummy_netdev); dd->dummy_netdev = NULL; } } -- cgit v1.2.3 From 822fbd37410639acdae368ea55477ddd3498651d Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Tue, 23 Jun 2020 16:32:30 -0400 Subject: IB/hfi1: Fix module use count flaw due to leftover module put calls When the try_module_get calls were removed from opening and closing of the i2c debugfs file, the corresponding module_put calls were missed. This results in an inaccurate module use count that requires a power cycle to fix. Fixes: 09fbca8e6240 ("IB/hfi1: No need to use try_module_get for debugfs") Link: https://lore.kernel.org/r/20200623203230.106975.76240.stgit@awfm-01.aw.intel.com Cc: Reviewed-by: Kaike Wan Reviewed-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/debugfs.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c index 4633a0ce1a8c..2ced236e1553 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.c +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -985,15 +985,10 @@ static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf, static int __i2c_debugfs_open(struct inode *in, struct file *fp, u32 target) { struct hfi1_pportdata *ppd; - int ret; ppd = private2ppd(fp); - ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0); - if (ret) /* failed - release the module */ - module_put(THIS_MODULE); - - return ret; + return acquire_chip_resource(ppd->dd, i2c_target(target), 0); } static int i2c1_debugfs_open(struct inode *in, struct file *fp) @@ -1013,7 +1008,6 @@ static int __i2c_debugfs_release(struct inode *in, struct file *fp, u32 target) ppd = private2ppd(fp); release_chip_resource(ppd->dd, i2c_target(target)); - module_put(THIS_MODULE); return 0; } @@ -1031,18 +1025,10 @@ static int i2c2_debugfs_release(struct inode *in, struct file *fp) static int __qsfp_debugfs_open(struct inode *in, struct file *fp, u32 target) { struct hfi1_pportdata *ppd; - int ret; - - if (!try_module_get(THIS_MODULE)) - return -ENODEV; ppd = private2ppd(fp); - ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0); - if (ret) /* failed - release the module */ - module_put(THIS_MODULE); - - return ret; + return acquire_chip_resource(ppd->dd, i2c_target(target), 0); } static int qsfp1_debugfs_open(struct inode *in, struct file *fp) @@ -1062,7 +1048,6 @@ static int __qsfp_debugfs_release(struct inode *in, struct file *fp, u32 target) ppd = private2ppd(fp); release_chip_resource(ppd->dd, i2c_target(target)); - 
module_put(THIS_MODULE); return 0; } -- cgit v1.2.3 From 82172b765530f84b4b9da929f2dcf46f2b7b232b Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 23 Jun 2020 16:43:22 -0400 Subject: IB/hfi1: Correct -EBUSY handling in tx code The current code mishandles -EBUSY in two ways: - The flow change doesn't test the return from the flush and runs on to process the current packet, racing with the wakeup processing - The -EBUSY handling for a single packet inserts the tx into the txlist after the submit call, racing with the same wakeup processing Fix the first by dropping the skb and returning NETDEV_TX_OK. Fix the second by ensuring the list entry within the txreq is initialized when allocated. This enables the sleep routine to detect that the txreq has used the non-list API and queue the packet to the txlist. Both flaws can lead to the flushing thread executing concurrently with the send path, causing two threads to manipulate the txlist. Fixes: d99dc602e2a5 ("IB/hfi1: Add functions to transmit datagram ipoib packets") Link: https://lore.kernel.org/r/20200623204321.108092.83898.stgit@awfm-01.aw.intel.com Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib_tx.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index 175290c56db9..76dcb5624aa6 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -369,6 +369,7 @@ static struct ipoib_txreq *hfi1_ipoib_send_dma_common(struct net_device *dev, tx->priv = priv; tx->txq = txp->txq; tx->skb = skb; + INIT_LIST_HEAD(&tx->txreq.list); hfi1_ipoib_build_ib_tx_headers(tx, txp); @@ -469,6 +470,7 @@ static int hfi1_ipoib_send_dma_single(struct net_device *dev, ret = hfi1_ipoib_submit_tx(txq, tx); if (likely(!ret)) { +tx_ok: trace_sdma_output_ibhdr(tx->priv->dd, &tx->sdma_hdr.hdr, ib_is_sc5(txp->flow.sc5)); @@ -478,20 +480,8 @@ static int hfi1_ipoib_send_dma_single(struct net_device *dev, txq->pkts_sent = false; - if (ret == -EBUSY) { - list_add_tail(&tx->txreq.list, &txq->tx_list); - - trace_sdma_output_ibhdr(tx->priv->dd, - &tx->sdma_hdr.hdr, - ib_is_sc5(txp->flow.sc5)); - hfi1_ipoib_check_queue_depth(txq); - return NETDEV_TX_OK; - } - - if (ret == -ECOMM) { - hfi1_ipoib_check_queue_depth(txq); - return NETDEV_TX_OK; - } + if (ret == -EBUSY || ret == -ECOMM) + goto tx_ok; sdma_txclean(priv->dd, &tx->txreq); dev_kfree_skb_any(skb); @@ -509,9 +499,17 @@ static int hfi1_ipoib_send_dma_list(struct net_device *dev, struct ipoib_txreq *tx; /* Has the flow change ?
*/ - if (txq->flow.as_int != txp->flow.as_int) - (void)hfi1_ipoib_flush_tx_list(dev, txq); - + if (txq->flow.as_int != txp->flow.as_int) { + int ret; + + ret = hfi1_ipoib_flush_tx_list(dev, txq); + if (unlikely(ret)) { + if (ret == -EBUSY) + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + } tx = hfi1_ipoib_send_dma_common(dev, skb, txp); if (IS_ERR(tx)) { int ret = PTR_ERR(tx); @@ -612,6 +610,9 @@ static int hfi1_ipoib_sdma_sleep(struct sdma_engine *sde, netif_stop_subqueue(txq->priv->netdev, txq->q_idx); + if (list_empty(&txreq->list)) + /* came from non-list submit */ + list_add_tail(&txreq->list, &txq->tx_list); if (list_empty(&txq->wait.list)) iowait_queue(pkts_sent, wait->iow, &sde->dmawait); -- cgit v1.2.3 From 38fd98afeeb79d3b148db49f81f2ec6a37a4ee00 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 23 Jun 2020 16:43:28 -0400 Subject: IB/hfi1: Add atomic triggered sleep/wakeup When running iperf in a two-host configuration the following trace can occur: [ 319.728730] NETDEV WATCHDOG: ib0 (hfi1): transmit queue 0 timed out The issue happens because the current implementation relies on the netif txq being stopped to control the flushing of the tx list. There are two resources that the transmit logic can wait on and stop the txq: - SDMA descriptors - Ring space to hold completions The ring space is tested on the sending side and relieved when the ring is consumed in the napi tx reaping. Unfortunately, that reaping can run concurrently with the workqueue flushing of the txlist. If the txq is started just before the workitem executes, the txlist will never be flushed, leading to the txq being stuck. Fix by: - Adding sleep/wakeup wrappers * Use an atomic to control the call to the netif routines inside the wrappers - Use another atomic to record ring space exhaustion * Only wake up when a ring space exhaustion has happened and is relieved Add additional wrappers to clarify the ring space resource handling.
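The heart of the fix is the exchange-based handoff: atomic_xchg() returns the previous value, so exactly one of the racing threads (the send path or the napi reaper) performs the stop or the wake. A stripped-down sketch, condensed from the hunks that follow; the standalone helper names are illustrative:

static void txq_stop_on_full(struct hfi1_ipoib_txq *txq)
{
	/* only the thread that flips ring_full 0 -> 1 stops the queue */
	if (!atomic_xchg(&txq->ring_full, 1))
		hfi1_ipoib_stop_txq(txq);
}

static void txq_wake_on_relief(struct hfi1_ipoib_txq *txq)
{
	/* only wake if a matching stop was recorded (flips 1 -> 0) */
	if (atomic_xchg(&txq->ring_full, 0))
		hfi1_ipoib_wake_txq(txq);
}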
Fixes: d99dc602e2a5 ("IB/hfi1: Add functions to transmit datagram ipoib packets") Link: https://lore.kernel.org/r/20200623204327.108092.4024.stgit@awfm-01.aw.intel.com Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib.h | 6 ++++ drivers/infiniband/hw/hfi1/ipoib_tx.c | 67 ++++++++++++++++++++++++----------- 2 files changed, 53 insertions(+), 20 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/ipoib.h b/drivers/infiniband/hw/hfi1/ipoib.h index 185c9b02c974..b8c9d0a003fb 100644 --- a/drivers/infiniband/hw/hfi1/ipoib.h +++ b/drivers/infiniband/hw/hfi1/ipoib.h @@ -67,6 +67,9 @@ struct hfi1_ipoib_circ_buf { * @sde: sdma engine * @tx_list: tx request list * @sent_txreqs: count of txreqs posted to sdma + * @stops: count of stops of queue + * @ring_full: ring has been filled + * @no_desc: descriptor shortage seen * @flow: tracks when list needs to be flushed for a flow change * @q_idx: ipoib Tx queue index * @pkts_sent: indicator packets have been sent from this queue @@ -80,6 +83,9 @@ struct hfi1_ipoib_txq { struct sdma_engine *sde; struct list_head tx_list; u64 sent_txreqs; + atomic_t stops; + atomic_t ring_full; + atomic_t no_desc; union hfi1_ipoib_flow flow; u8 q_idx; bool pkts_sent; diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index 76dcb5624aa6..9df292b51a05 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -55,23 +55,48 @@ static u64 hfi1_ipoib_txreqs(const u64 sent, const u64 completed) return sent - completed; } -static void hfi1_ipoib_check_queue_depth(struct hfi1_ipoib_txq *txq) +static u64 hfi1_ipoib_used(struct hfi1_ipoib_txq *txq) +{ + return hfi1_ipoib_txreqs(txq->sent_txreqs, + atomic64_read(&txq->complete_txreqs)); +} + +static void hfi1_ipoib_stop_txq(struct hfi1_ipoib_txq *txq) { - if (unlikely(hfi1_ipoib_txreqs(++txq->sent_txreqs, - atomic64_read(&txq->complete_txreqs)) >= - min_t(unsigned int, txq->priv->netdev->tx_queue_len, - txq->tx_ring.max_items - 1))) + if (atomic_inc_return(&txq->stops) == 1) netif_stop_subqueue(txq->priv->netdev, txq->q_idx); } +static void hfi1_ipoib_wake_txq(struct hfi1_ipoib_txq *txq) +{ + if (atomic_dec_and_test(&txq->stops)) + netif_wake_subqueue(txq->priv->netdev, txq->q_idx); +} + +static uint hfi1_ipoib_ring_hwat(struct hfi1_ipoib_txq *txq) +{ + return min_t(uint, txq->priv->netdev->tx_queue_len, + txq->tx_ring.max_items - 1); +} + +static uint hfi1_ipoib_ring_lwat(struct hfi1_ipoib_txq *txq) +{ + return min_t(uint, txq->priv->netdev->tx_queue_len, + txq->tx_ring.max_items) >> 1; +} + +static void hfi1_ipoib_check_queue_depth(struct hfi1_ipoib_txq *txq) +{ + ++txq->sent_txreqs; + if (hfi1_ipoib_used(txq) >= hfi1_ipoib_ring_hwat(txq) && + !atomic_xchg(&txq->ring_full, 1)) + hfi1_ipoib_stop_txq(txq); +} + static void hfi1_ipoib_check_queue_stopped(struct hfi1_ipoib_txq *txq) { struct net_device *dev = txq->priv->netdev; - /* If the queue is already running just return */ - if (likely(!__netif_subqueue_stopped(dev, txq->q_idx))) - return; - /* If shutting down just return as queue state is irrelevant */ if (unlikely(dev->reg_state != NETREG_REGISTERED)) return; @@ -86,11 +111,9 @@ static void hfi1_ipoib_check_queue_stopped(struct hfi1_ipoib_txq *txq) * Use the minimum of the current tx_queue_len or the rings max txreqs * to protect against ring overflow. 
*/ - if (hfi1_ipoib_txreqs(txq->sent_txreqs, - atomic64_read(&txq->complete_txreqs)) - < min_t(unsigned int, dev->tx_queue_len, - txq->tx_ring.max_items) >> 1) - netif_wake_subqueue(dev, txq->q_idx); + if (hfi1_ipoib_used(txq) < hfi1_ipoib_ring_lwat(txq) && + atomic_xchg(&txq->ring_full, 0)) + hfi1_ipoib_wake_txq(txq); } static void hfi1_ipoib_free_tx(struct ipoib_txreq *tx, int budget) @@ -608,13 +631,14 @@ static int hfi1_ipoib_sdma_sleep(struct sdma_engine *sde, return -EAGAIN; } - netif_stop_subqueue(txq->priv->netdev, txq->q_idx); - if (list_empty(&txreq->list)) /* came from non-list submit */ list_add_tail(&txreq->list, &txq->tx_list); - if (list_empty(&txq->wait.list)) + if (list_empty(&txq->wait.list)) { + if (!atomic_xchg(&txq->no_desc, 1)) + hfi1_ipoib_stop_txq(txq); iowait_queue(pkts_sent, wait->iow, &sde->dmawait); + } write_sequnlock(&sde->waitlock); return -EBUSY; @@ -649,9 +673,9 @@ static void hfi1_ipoib_flush_txq(struct work_struct *work) struct net_device *dev = txq->priv->netdev; if (likely(dev->reg_state == NETREG_REGISTERED) && - likely(__netif_subqueue_stopped(dev, txq->q_idx)) && likely(!hfi1_ipoib_flush_tx_list(dev, txq))) - netif_wake_subqueue(dev, txq->q_idx); + if (atomic_xchg(&txq->no_desc, 0)) + hfi1_ipoib_wake_txq(txq); } int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) @@ -705,6 +729,9 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) txq->sde = NULL; INIT_LIST_HEAD(&txq->tx_list); atomic64_set(&txq->complete_txreqs, 0); + atomic_set(&txq->stops, 0); + atomic_set(&txq->ring_full, 0); + atomic_set(&txq->no_desc, 0); txq->q_idx = i; txq->flow.tx_queue = 0xff; txq->flow.sc5 = 0xff; @@ -770,7 +797,7 @@ static void hfi1_ipoib_drain_tx_list(struct hfi1_ipoib_txq *txq) atomic64_inc(complete_txreqs); } - if (hfi1_ipoib_txreqs(txq->sent_txreqs, atomic64_read(complete_txreqs))) + if (hfi1_ipoib_used(txq)) dd_dev_warn(txq->priv->dd, "txq %d not empty found %llu requests\n", txq->q_idx, -- cgit v1.2.3 From e18321acfb9f14d01c03578e1c498e3f815d20a3 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 22 Jun 2020 17:52:24 -0700 Subject: IB/hfi1: Add explicit cast OPA_MTU_8192 to 'enum ib_mtu' Clang warns: drivers/infiniband/hw/hfi1/qp.c:198:9: warning: implicit conversion from enumeration type 'enum opa_mtu' to different enumeration type 'enum ib_mtu' [-Wenum-conversion] mtu = OPA_MTU_8192; ~ ^~~~~~~~~~~~ enum opa_mtu extends enum ib_mtu. There are typically two ways to deal with this: * Remove the expected types and just use 'int' for all parameters and types. * Explicitly cast the enums between each other. This driver chooses to do the latter, so do the same thing here.
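A self-contained reproduction of the warning and the chosen fix; the demo_ enums and their values are assumptions that merely mimic how enum opa_mtu extends enum ib_mtu:

enum demo_ib_mtu  { DEMO_IB_MTU_4096 = 5 };
enum demo_opa_mtu { DEMO_OPA_MTU_8192 = 6, DEMO_OPA_MTU_10240 = 7 };

static enum demo_ib_mtu clamp_mtu(enum demo_ib_mtu mtu)
{
	/* mtu = DEMO_OPA_MTU_8192;  -- warns under clang -Wenum-conversion */
	if (mtu == (enum demo_ib_mtu)DEMO_OPA_MTU_10240)
		mtu = (enum demo_ib_mtu)DEMO_OPA_MTU_8192; /* cast silences it */
	return mtu;
}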
Fixes: 6d72344cf6c4 ("IB/ipoib: Increase ipoib Datagram mode MTU's upper limit") Link: https://lore.kernel.org/r/20200623005224.492239-1-natechancellor@gmail.com Link: https://github.com/ClangBuiltLinux/linux/issues/1062 Link: https://lore.kernel.org/linux-rdma/20200527040350.GA3118979@ubuntu-s3-xlarge-x86/ Signed-off-by: Nathan Chancellor Acked-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 0c2ae9f7b3e8..2f3d9ce077d3 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -195,7 +195,7 @@ static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu) { /* Constraining 10KB packets to 8KB packets */ if (mtu == (enum ib_mtu)OPA_MTU_10240) - mtu = OPA_MTU_8192; + mtu = (enum ib_mtu)OPA_MTU_8192; return opa_mtu_enum_to_int((enum opa_mtu)mtu); } -- cgit v1.2.3 From 28b70cd9236563e1a88a6094673fef3c08db0d51 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Tue, 23 Jun 2020 16:40:47 -0400 Subject: IB/hfi1: Do not destroy hfi1_wq when the device is shut down The workqueue hfi1_wq is destroyed in the function shutdown_device(), which is called by either shutdown_one() or remove_one(). The function shutdown_one() is called when the kernel is rebooted, while remove_one() is called when the hfi1 driver is unloaded.
When the kernel is rebooted, hfi1_wq is destroyed while all qps are still active, leading to a kernel crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000102 IP: [] __queue_work+0x32/0x3e0 PGD 0 Oops: 0000 [#1] SMP Modules linked in: dm_round_robin nvme_rdma(OE) nvme_fabrics(OE) nvme_core(OE) ib_isert iscsi_target_mod target_core_mod ib_ucm mlx4_ib iTCO_wdt iTCO_vendor_support mxm_wmi sb_edac intel_powerclamp coretemp intel_rapl iosf_mbi kvm rpcrdma sunrpc irqbypass crc32_pclmul ghash_clmulni_intel rdma_ucm aesni_intel ib_uverbs lrw gf128mul opa_vnic glue_helper ablk_helper ib_iser cryptd ib_umad rdma_cm iw_cm ses enclosure libiscsi scsi_transport_sas pcspkr joydev ib_ipoib(OE) scsi_transport_iscsi ib_cm sg ipmi_ssif mei_me lpc_ich i2c_i801 mei ioatdma ipmi_si dm_multipath ipmi_devintf ipmi_msghandler wmi acpi_pad acpi_power_meter hangcheck_timer ip_tables ext4 mbcache jbd2 mlx4_en sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm hfi1(OE) crct10dif_pclmul crct10dif_common crc32c_intel drm ahci mlx4_core libahci rdmavt(OE) igb megaraid_sas ib_core libata drm_panel_orientation_quirks ptp pps_core devlink dca i2c_algo_bit dm_mirror dm_region_hash dm_log dm_mod CPU: 19 PID: 0 Comm: swapper/19 Kdump: loaded Tainted: G OE ------------ 3.10.0-957.el7.x86_64 #1 Hardware name: Phegda X2226A/S2600CW, BIOS SE5C610.86B.01.01.0024.021320181901 02/13/2018 task: ffff8a799ba0d140 ti: ffff8a799bad8000 task.ti: ffff8a799bad8000 RIP: 0010:[] [] __queue_work+0x32/0x3e0 RSP: 0018:ffff8a90dde43d80 EFLAGS: 00010046 RAX: 0000000000000082 RBX: 0000000000000086 RCX: 0000000000000000 RDX: ffff8a90b924fcb8 RSI: 0000000000000000 RDI: 000000000000001b RBP: ffff8a90dde43db8 R08: ffff8a799ba0d6d8 R09: ffff8a90dde53900 R10: 0000000000000002 R11: ffff8a90dde43de8 R12: ffff8a90b924fcb8 R13: 000000000000001b R14: 0000000000000000 R15: ffff8a90d2890000 FS: 0000000000000000(0000) GS:ffff8a90dde40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000102 CR3: 0000001a70410000 CR4: 00000000001607e0 Call Trace: [] queue_work_on+0x45/0x50 [] _hfi1_schedule_send+0x6e/0xc0 [hfi1] [] hfi1_schedule_send+0x32/0x70 [hfi1] [] rvt_rc_timeout+0xe9/0x130 [rdmavt] [] ? trigger_load_balance+0x6a/0x280 [] ? rvt_free_qpn+0x40/0x40 [rdmavt] [] call_timer_fn+0x38/0x110 [] ? rvt_free_qpn+0x40/0x40 [rdmavt] [] run_timer_softirq+0x24d/0x300 [] __do_softirq+0xf5/0x280 [] call_softirq+0x1c/0x30 [] do_softirq+0x65/0xa0 [] irq_exit+0x105/0x110 [] smp_apic_timer_interrupt+0x48/0x60 [] apic_timer_interrupt+0x162/0x170 [] ? cpuidle_enter_state+0x57/0xd0 [] cpuidle_idle_call+0xde/0x230 [] arch_cpu_idle+0xe/0xc0 [] cpu_startup_entry+0x14a/0x1e0 [] start_secondary+0x1f7/0x270 [] start_cpu+0x5/0x14 The solution is to destroy the workqueue only when the hfi1 driver is unloaded, not when the device is shut down. In addition, when the device is shut down, no more work should be scheduled on the workqueues and the workqueues are flushed. 
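A condensed sketch of the resulting split between the two paths (illustrative helper names; the real code lives in shutdown_device() and the new destroy_workqueues() in the diff below):

#include <linux/workqueue.h>

/* reboot/shutdown path: quiesce, but keep the workqueue alive so a
 * late-firing timer can still schedule work safely */
static void quiesce_port_wq(struct workqueue_struct *wq)
{
	if (wq)
		flush_workqueue(wq);
}

/* module-unload path: all users are gone, tear the workqueue down */
static void teardown_port_wq(struct workqueue_struct **wq)
{
	if (*wq) {
		destroy_workqueue(*wq);
		*wq = NULL;
	}
}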
Fixes: 8d3e71136a08 ("IB/{hfi1, qib}: Add handling of kernel restart") Link: https://lore.kernel.org/r/20200623204047.107638.77646.stgit@awfm-01.aw.intel.com Cc: Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/init.c | 27 +++++++++++++++++++++++---- drivers/infiniband/hw/hfi1/qp.c | 5 ++++- drivers/infiniband/hw/hfi1/tid_rdma.c | 5 ++++- 3 files changed, 31 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 5eed4360695f..16d6788075f3 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -830,6 +830,25 @@ wq_error: return -ENOMEM; } +/** + * destroy_workqueues - destroy per port workqueues + * @dd: the hfi1_ib device + */ +static void destroy_workqueues(struct hfi1_devdata *dd) +{ + int pidx; + struct hfi1_pportdata *ppd; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + if (ppd->hfi1_wq) { + destroy_workqueue(ppd->hfi1_wq); + ppd->hfi1_wq = NULL; + } + } +} + /** * enable_general_intr() - Enable the IRQs that will be handled by the * general interrupt handler. @@ -1104,11 +1123,10 @@ static void shutdown_device(struct hfi1_devdata *dd) */ hfi1_quiet_serdes(ppd); - if (ppd->hfi1_wq) { - destroy_workqueue(ppd->hfi1_wq); - ppd->hfi1_wq = NULL; - } + if (ppd->hfi1_wq) + flush_workqueue(ppd->hfi1_wq); if (ppd->link_wq) { + flush_workqueue(ppd->link_wq); destroy_workqueue(ppd->link_wq); ppd->link_wq = NULL; } @@ -1756,6 +1774,7 @@ static void remove_one(struct pci_dev *pdev) * clear dma engines, etc. */ shutdown_device(dd); + destroy_workqueues(dd); stop_timers(dd); diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 2f3d9ce077d3..be62284e42d9 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -367,7 +367,10 @@ bool _hfi1_schedule_send(struct rvt_qp *qp) struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_devdata *dd = ppd->dd; + + if (dd->flags & HFI1_SHUTDOWN) + return true; return iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, priv->s_sde ? diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 243b4ba0b6f6..facff133139a 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -5406,7 +5406,10 @@ static bool _hfi1_schedule_tid_send(struct rvt_qp *qp) struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_devdata *dd = ppd->dd; + + if ((dd->flags & HFI1_SHUTDOWN)) + return true; return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq, priv->s_sde ? -- cgit v1.2.3 From 2315ec12ee8e8257bb335654c62e0cae71dc278d Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Tue, 23 Jun 2020 16:40:53 -0400 Subject: IB/hfi1: Do not destroy link_wq when the device is shut down The workqueue link_wq should only be destroyed when the hfi1 driver is unloaded, not when the device is shut down. 
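Taken together with the previous patch, the shutdown path now only quiesces both per-port workqueues. A paraphrase of what the shutdown_device() per-port loop body reduces to after the diff below (condensed, not a verbatim quote):

hfi1_quiet_serdes(ppd);
if (ppd->hfi1_wq)
	flush_workqueue(ppd->hfi1_wq);
if (ppd->link_wq)
	flush_workqueue(ppd->link_wq);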
Fixes: 71d47008ca1b ("IB/hfi1: Create workqueue for link events")
Link: https://lore.kernel.org/r/20200623204053.107638.70315.stgit@awfm-01.aw.intel.com
Cc:
Reviewed-by: Mike Marciniszyn
Signed-off-by: Kaike Wan
Signed-off-by: Dennis Dalessandro
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hfi1/init.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'drivers/infiniband/hw')

diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 16d6788075f3..cb7ad1288821 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -846,6 +846,10 @@ static void destroy_workqueues(struct hfi1_devdata *dd)
 			destroy_workqueue(ppd->hfi1_wq);
 			ppd->hfi1_wq = NULL;
 		}
+		if (ppd->link_wq) {
+			destroy_workqueue(ppd->link_wq);
+			ppd->link_wq = NULL;
+		}
 	}
 }
 
@@ -1122,14 +1126,10 @@ static void shutdown_device(struct hfi1_devdata *dd)
 	 * We can't count on interrupts since we are stopping.
 	 */
 	hfi1_quiet_serdes(ppd);
-
 	if (ppd->hfi1_wq)
 		flush_workqueue(ppd->hfi1_wq);
-	if (ppd->link_wq) {
+	if (ppd->link_wq)
 		flush_workqueue(ppd->link_wq);
-		destroy_workqueue(ppd->link_wq);
-		ppd->link_wq = NULL;
-	}
 	}
 	sdma_exit(dd);
 }
--
cgit v1.2.3

From 530c8632b547ff72f11ff83654b22462a73f1f7b Mon Sep 17 00:00:00 2001
From: Aya Levin
Date: Tue, 7 Jul 2020 14:06:11 +0300
Subject: IB/mlx5: Fix 50G per lane indication

Some released FW versions mistakenly don't set the capability that 50G per lane link-modes are supported for VFs (the ptys_extended_ethernet capability bit). Use PTYS.ext_eth_proto_capability instead, as this indication is always accurate: if PTYS.ext_eth_proto_capability is valid (has a non-zero value), conclude that the HCA supports 50G per lane; otherwise, conclude that it does not.

Fixes: 08e8676f1607 ("IB/mlx5: Add support for 50Gbps per lane link modes")
Link: https://lore.kernel.org/r/20200707110612.882962-3-leon@kernel.org
Signed-off-by: Aya Levin
Reviewed-by: Eran Ben Elisha
Reviewed-by: Saeed Mahameed
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/infiniband/hw')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 343a8b8361e7..6f99ed03d88e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -511,7 +511,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
 			   mdev_port_num);
 	if (err)
 		goto out;
-	ext = MLX5_CAP_PCAM_FEATURE(dev->mdev, ptys_extended_ethernet);
+	ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability);
 	eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
 
 	props->active_width = IB_WIDTH_4X;
--
cgit v1.2.3

From 0a03715068794e4b524f66ebbf412ab1f2933f3f Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Tue, 7 Jul 2020 14:06:12 +0300
Subject: RDMA/mlx5: Set PD pointers for the error flow unwind

ib_pd is accessed internally during destroy of the TIR/TIS, but the PD may not be set yet. This leads to the following kernel panic:
BUG: kernel NULL pointer dereference, address: 0000000000000074
PGD 8000000079eaa067 P4D 8000000079eaa067 PUD 7ae81067 PMD 0
Oops: 0000 [#1] SMP PTI
CPU: 1 PID: 709 Comm: syz-executor.0 Not tainted 5.8.0-rc3 #41
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
RIP: 0010:destroy_raw_packet_qp_tis drivers/infiniband/hw/mlx5/qp.c:1189 [inline]
RIP: 0010:destroy_raw_packet_qp drivers/infiniband/hw/mlx5/qp.c:1527 [inline]
RIP: 0010:destroy_qp_common+0x2ca/0x4f0 drivers/infiniband/hw/mlx5/qp.c:2397
Code: 00 85 c0 74 2e e8 56 18 55 ff 48 8d b3 28 01 00 00 48 89 ef e8 d7 d3 ff ff 48 8b 43 08 8b b3 c0 01 00 00 48 8b bd a8 0a 00 00 <0f> b7 50 74 e8 0d 6a fe ff e8 28 18 55 ff 49 8d 55 50 4c 89 f1 48
RSP: 0018:ffffc900007bbac8 EFLAGS: 00010293
RAX: 0000000000000000 RBX: ffff88807949e800 RCX: 0000000000000998
RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff88807c180140
RBP: ffff88807b50c000 R08: 000000000002d379 R09: ffffc900007bba00
R10: 0000000000000001 R11: 000000000002d358 R12: ffff888076f37000
R13: ffff88807949e9c8 R14: ffffc900007bbe08 R15: ffff888076f37000
FS: 00000000019bf940(0000) GS:ffff88807dd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000074 CR3: 0000000076d68004 CR4: 0000000000360ee0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 mlx5_ib_create_qp+0xf36/0xf90 drivers/infiniband/hw/mlx5/qp.c:3014
 _ib_create_qp drivers/infiniband/core/core_priv.h:333 [inline]
 create_qp+0x57f/0xd20 drivers/infiniband/core/uverbs_cmd.c:1443
 ib_uverbs_create_qp+0xcf/0x100 drivers/infiniband/core/uverbs_cmd.c:1564
 ib_uverbs_write+0x5fa/0x780 drivers/infiniband/core/uverbs_main.c:664
 __vfs_write+0x3f/0x90 fs/read_write.c:495
 vfs_write+0xc7/0x1f0 fs/read_write.c:559
 ksys_write+0x5e/0x110 fs/read_write.c:612
 do_syscall_64+0x3e/0x70 arch/x86/entry/common.c:359
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x466479
Code: Bad RIP value.
RSP: 002b:00007ffd057b62b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 000000000073bf00 RCX: 0000000000466479
RDX: 0000000000000070 RSI: 0000000020000240 RDI: 0000000000000003
RBP: 00000000019bf8fc R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
R13: 0000000000000bf6 R14: 00000000004cb859 R15: 00000000006fefc0

Fixes: 6c41965d647a ("RDMA/mlx5: Don't access ib_qp fields in internal destroy QP path")
Link: https://lore.kernel.org/r/20200707110612.882962-4-leon@kernel.org
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/qp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/infiniband/hw')

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index b316c9cafbc5..e050eade97a1 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -3005,11 +3005,12 @@ destroy_qp:
 		mlx5_ib_destroy_dct(qp);
 	} else {
 		/*
-		 * The two lines below are temp solution till QP allocation
+		 * These lines below are temp solution till QP allocation
 		 * will be moved to be under IB/core responsiblity.
		 */
 		qp->ibqp.send_cq = attr->send_cq;
 		qp->ibqp.recv_cq = attr->recv_cq;
+		qp->ibqp.pd = pd;
 		destroy_qp_common(dev, qp, udata);
 	}
--
cgit v1.2.3

From c3d6057e07a5d15be7c69ea545b3f91877808c96 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb
Date: Sun, 12 Jul 2020 13:26:41 +0300
Subject: RDMA/mlx5: Use xa_lock_irq when access to SRQ table

The SRQ table is accessed from both interrupt and process context, therefore we must use xa_lock_irq:

inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
kworker/u17:9/8573 takes:
ffff8883e3503d30 (&xa->xa_lock#13){?...}-{2:2}, at: mlx5_cmd_get_srq+0x18/0x70 [mlx5_ib]
{IN-HARDIRQ-W} state was registered at:
  lock_acquire+0xb9/0x3a0
  _raw_spin_lock+0x25/0x30
  srq_event_notifier+0x2b/0xc0 [mlx5_ib]
  notifier_call_chain+0x45/0x70
  __atomic_notifier_call_chain+0x69/0x100
  forward_event+0x36/0xc0 [mlx5_core]
  notifier_call_chain+0x45/0x70
  __atomic_notifier_call_chain+0x69/0x100
  mlx5_eq_async_int+0xc5/0x160 [mlx5_core]
  notifier_call_chain+0x45/0x70
  __atomic_notifier_call_chain+0x69/0x100
  mlx5_irq_int_handler+0x19/0x30 [mlx5_core]
  __handle_irq_event_percpu+0x43/0x2a0
  handle_irq_event_percpu+0x30/0x70
  handle_irq_event+0x34/0x60
  handle_edge_irq+0x7c/0x1b0
  do_IRQ+0x60/0x110
  ret_from_intr+0x0/0x2a
  default_idle+0x34/0x160
  do_idle+0x1ec/0x220
  cpu_startup_entry+0x19/0x20
  start_secondary+0x153/0x1a0
  secondary_startup_64+0xa4/0xb0
irq event stamp: 20907
hardirqs last enabled at (20907): _raw_spin_unlock_irq+0x24/0x30
hardirqs last disabled at (20906): _raw_spin_lock_irq+0xf/0x40
softirqs last enabled at (20746): __do_softirq+0x2c9/0x436
softirqs last disabled at (20681): irq_exit+0xb3/0xc0

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(&xa->xa_lock#13);
  lock(&xa->xa_lock#13);

 *** DEADLOCK ***

2 locks held by kworker/u17:9/8573:
 #0: ffff888295218d38 ((wq_completion)mlx5_ib_page_fault){+.+.}-{0:0}, at: process_one_work+0x1f1/0x5f0
 #1: ffff888401647e78 ((work_completion)(&pfault->work)){+.+.}-{0:0}, at: process_one_work+0x1f1/0x5f0

stack backtrace:
CPU: 0 PID: 8573 Comm: kworker/u17:9 Tainted: G O 5.7.0_for_upstream_min_debug_2020_06_14_11_31_46_41 #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
Workqueue: mlx5_ib_page_fault mlx5_ib_eqe_pf_action [mlx5_ib]
Call Trace:
 dump_stack+0x71/0x9b
 mark_lock+0x4f2/0x590
 ? print_shortest_lock_dependencies+0x200/0x200
 __lock_acquire+0xa00/0x1eb0
 lock_acquire+0xb9/0x3a0
 ? mlx5_cmd_get_srq+0x18/0x70 [mlx5_ib]
 _raw_spin_lock+0x25/0x30
 ? mlx5_cmd_get_srq+0x18/0x70 [mlx5_ib]
 mlx5_cmd_get_srq+0x18/0x70 [mlx5_ib]
 mlx5_ib_eqe_pf_action+0x257/0xa30 [mlx5_ib]
 ? process_one_work+0x209/0x5f0
 process_one_work+0x27b/0x5f0
 ? __schedule+0x280/0x7e0
 worker_thread+0x2d/0x3c0
 ? process_one_work+0x5f0/0x5f0
 kthread+0x111/0x130
 ? kthread_park+0x90/0x90
 ret_from_fork+0x24/0x30

Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Link: https://lore.kernel.org/r/20200712102641.15210-1-leon@kernel.org
Signed-off-by: Maor Gottlieb
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/srq_cmd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/infiniband/hw')

diff --git a/drivers/infiniband/hw/mlx5/srq_cmd.c b/drivers/infiniband/hw/mlx5/srq_cmd.c
index 6f5eadc4d183..37aaacebd3f2 100644
--- a/drivers/infiniband/hw/mlx5/srq_cmd.c
+++ b/drivers/infiniband/hw/mlx5/srq_cmd.c
@@ -83,11 +83,11 @@ struct mlx5_core_srq *mlx5_cmd_get_srq(struct mlx5_ib_dev *dev, u32 srqn)
 	struct mlx5_srq_table *table = &dev->srq_table;
 	struct mlx5_core_srq *srq;
 
-	xa_lock(&table->array);
+	xa_lock_irq(&table->array);
 	srq = xa_load(&table->array, srqn);
 	if (srq)
 		refcount_inc(&srq->common.refcount);
-	xa_unlock(&table->array);
+	xa_unlock_irq(&table->array);
 
 	return srq;
 }
--
cgit v1.2.3

From 7b9bd73ed13d3c399f76bb7578cbe1b9e5a8e8b0 Mon Sep 17 00:00:00 2001
From: Weihang Li
Date: Tue, 14 Jul 2020 19:28:58 +0800
Subject: RDMA/hns: Fix wrong assignment of lp_pktn_ini in QPC

The RoCE Engine will schedule to another QP after one QP has sent (2 ^ lp_pktn_ini) packets. lp_pktn_ini is set in the QPC and should be calculated from two factors:

1. the current MTU as an integer
2. the RoCE Engine's maximum slice length of 64KB

But the driver used the MTU as an enum ib_mtu together with the maximum inline capability, so lp_pktn_ini could be much bigger than expected, which may cause traffic of some QPs to never get scheduled. For example, with a path MTU of 1024 bytes the corrected code sets lp_pktn_ini to ilog2(65536 / 1024) = 6, i.e. the engine moves on after 64 packets.

Fixes: b713128de7a1 ("RDMA/hns: Adjust lp_pktn_ini dynamically")
Link: https://lore.kernel.org/r/1594726138-49294-1-git-send-email-liweihang@huawei.com
Signed-off-by: Weihang Li
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 34 +++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 12 deletions(-)

(limited to 'drivers/infiniband/hw')

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index dd01a51816cc..0618ced45bf8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -3954,6 +3954,15 @@ static int config_qp_sq_buf(struct hns_roce_dev *hr_dev,
 	return 0;
 }
 
+static inline enum ib_mtu get_mtu(struct ib_qp *ibqp,
+				  const struct ib_qp_attr *attr)
+{
+	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_UD)
+		return IB_MTU_4096;
+
+	return attr->path_mtu;
+}
+
 static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
 				 const struct ib_qp_attr *attr, int attr_mask,
 				 struct hns_roce_v2_qp_context *context,
@@ -3965,6 +3974,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
 	struct ib_device *ibdev = &hr_dev->ib_dev;
 	dma_addr_t trrl_ba;
 	dma_addr_t irrl_ba;
+	enum ib_mtu mtu;
 	u8 port_num;
 	u64 *mtts;
 	u8 *dmac;
@@ -4062,23 +4072,23 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
 	roce_set_field(qpc_mask->byte_52_udpspn_dmac, V2_QPC_BYTE_52_DMAC_M,
 		       V2_QPC_BYTE_52_DMAC_S, 0);
 
-	/* mtu*(2^LP_PKTN_INI) should not bigger than 1 message length 64kb */
+	mtu = get_mtu(ibqp, attr);
+
+	if (attr_mask & IB_QP_PATH_MTU) {
+		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
+			       V2_QPC_BYTE_24_MTU_S, mtu);
+		roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
+			       V2_QPC_BYTE_24_MTU_S, 0);
+	}
+
+#define MAX_LP_MSG_LEN 65536
+	/* MTU*(2^LP_PKTN_INI) shouldn't be bigger than 64kb */
 	roce_set_field(context->byte_56_dqpn_err,
		       V2_QPC_BYTE_56_LP_PKTN_INI_M,
 		       V2_QPC_BYTE_56_LP_PKTN_INI_S,
-		       ilog2(hr_dev->caps.max_sq_inline / IB_MTU_4096));
+		       ilog2(MAX_LP_MSG_LEN / ib_mtu_enum_to_int(mtu)));
 	roce_set_field(qpc_mask->byte_56_dqpn_err,
 		       V2_QPC_BYTE_56_LP_PKTN_INI_M,
 		       V2_QPC_BYTE_56_LP_PKTN_INI_S, 0);
 
-	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_UD)
-		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
-			       V2_QPC_BYTE_24_MTU_S, IB_MTU_4096);
-	else if (attr_mask & IB_QP_PATH_MTU)
-		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
-			       V2_QPC_BYTE_24_MTU_S, attr->path_mtu);
-
-	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
-		       V2_QPC_BYTE_24_MTU_S, 0);
-
 	roce_set_bit(qpc_mask->byte_108_rx_reqepsn,
 		     V2_QPC_BYTE_108_RX_REQ_PSN_ERR_S, 0);
 	roce_set_field(qpc_mask->byte_96_rx_reqmsn, V2_QPC_BYTE_96_RX_REQ_MSN_M,
--
cgit v1.2.3

From 79d5208386523183ae5cd359a9d1c024f46b1202 Mon Sep 17 00:00:00 2001
From: Xi Wang
Date: Tue, 14 Jul 2020 19:42:15 +0800
Subject: RDMA/hns: Fix wrong PBL offset when VA is not aligned to PAGE_SIZE

RoCE uses "VA % buf_page_size" to calculate the offset into the PBL's first page; the actual PA corresponding to the MR's VA is equal to the MR's PA plus this offset. The first PA in the PBL has already been aligned to PAGE_SIZE after calling ib_umem_get(), but the MR's VA may not be. If buf_page_size is smaller than PAGE_SIZE, this leads the HW to access the wrong memory because the offset is smaller than expected.

Fixes: 9b2cf76c9f05 ("RDMA/hns: Optimize PBL buffer allocation process")
Link: https://lore.kernel.org/r/1594726935-45666-1-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang
Signed-off-by: Weihang Li
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hns/hns_roce_mr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/infiniband/hw')

diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 0e71ebee9e52..6b226a5eb7db 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -120,7 +120,7 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
 	mr->pbl_hop_num = is_fast ? 1 : hr_dev->caps.pbl_hop_num;
 	buf_attr.page_shift = is_fast ? PAGE_SHIFT :
-			      hr_dev->caps.pbl_buf_pg_sz + HNS_HW_PAGE_SHIFT;
+			      hr_dev->caps.pbl_buf_pg_sz + PAGE_SHIFT;
 	buf_attr.region[0].size = length;
 	buf_attr.region[0].hopnum = mr->pbl_hop_num;
 	buf_attr.region_count = 1;
--
cgit v1.2.3

From a862192e9227ad46e0447fd0a03869ba1b30d16f Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Sun, 19 Jul 2020 09:54:35 +0300
Subject: RDMA/mlx5: Prevent prefetch from racing with implicit destruction

Prefetch work in mlx5_ib_prefetch_mr_work can be queued and able to run concurrently with destruction of the implicit MR. The num_deferred_work was intended to serialize this, but there is a race:

       CPU0                                    CPU1
    mlx5_ib_free_implicit_mr()
      xa_erase(odp_mkeys)
      synchronize_srcu()
      __xa_erase(implicit_children)
                                        mlx5_ib_prefetch_mr_work()
                                          pagefault_mr()
                                            pagefault_implicit_mr()
                                              implicit_get_child_mr()
                                                xa_cmpxchg()
                                          atomic_dec_and_test(num_deferred_mr)
      wait_event(imr->q_deferred_work)
      ib_umem_odp_release(odp_imr)
        kfree(odp_imr)

At this point in mlx5_ib_free_implicit_mr() the implicit_children list is supposed to be empty forever so that destroy_unused_implicit_child_mr() and related are not and will not be running.
Since it is not empty, the destroy_unused_implicit_child_mr() flow ends up touching deallocated memory as mlx5_ib_free_implicit_mr() already tore down the imr parent.

The solution is to flush out the prefetch wq by driving num_deferred_work to zero after creation of new prefetch work is blocked.

Fixes: 5256edcb98a1 ("RDMA/mlx5: Rework implicit ODP destroy")
Link: https://lore.kernel.org/r/20200719065435.130722-1-leon@kernel.org
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/odp.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'drivers/infiniband/hw')

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 7d2ec9ee5097..1ab676b66894 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -601,6 +601,23 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 	 */
 	synchronize_srcu(&dev->odp_srcu);
 
+	/*
+	 * All work on the prefetch list must be completed, xa_erase() prevented
+	 * new work from being created.
+	 */
+	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
+
+	/*
+	 * At this point it is forbidden for any other thread to enter
+	 * pagefault_mr() on this imr. It is already forbidden to call
+	 * pagefault_mr() on an implicit child. Due to this additions to
+	 * implicit_children are prevented.
+	 */
+
+	/*
+	 * Block destroy_unused_implicit_child_mr() from incrementing
+	 * num_deferred_work.
+	 */
 	xa_lock(&imr->implicit_children);
 	xa_for_each (&imr->implicit_children, idx, mtt) {
 		__xa_erase(&imr->implicit_children, idx);
@@ -609,9 +626,8 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 	xa_unlock(&imr->implicit_children);
 
 	/*
-	 * num_deferred_work can only be incremented inside the odp_srcu, or
-	 * under xa_lock while the child is in the xarray. Thus at this point
-	 * it is only decreasing, and all work holding it is now on the wq.
+	 * Wait for any concurrent destroy_unused_implicit_child_mr() to
+	 * complete.
 	 */
 	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
--
cgit v1.2.3
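More generally, this fix is an instance of the "block new work, then drain" idiom: first make it impossible for the deferred-work counter to be incremented again, then sleep until it reaches zero. A minimal sketch of that idiom with hypothetical my_* names (the actual logic is in the diff above; assume the wait queue head was set up with init_waitqueue_head()):

	#include <linux/wait.h>
	#include <linux/atomic.h>

	struct my_imr {
		atomic_t num_deferred_work;		/* outstanding deferred items */
		wait_queue_head_t q_deferred_work;	/* woken when the count drops */
	};

	/* Each work item drops its reference and wakes any waiter. */
	static void my_work_done(struct my_imr *imr)
	{
		if (atomic_dec_and_test(&imr->num_deferred_work))
			wake_up(&imr->q_deferred_work);
	}

	/*
	 * Destroy path: callers must already have blocked creation of new
	 * deferred work (the xa_erase()/xa_lock steps in the patch); only
	 * then is the counter guaranteed to be monotonically decreasing and
	 * the wait below race-free.
	 */
	static void my_drain_deferred(struct my_imr *imr)
	{
		wait_event(imr->q_deferred_work,
			   !atomic_read(&imr->num_deferred_work));
	}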