Diffstat (limited to 'drivers/infiniband')
-rw-r--r--  drivers/infiniband/core/uverbs_cmd.c          |   2
-rw-r--r--  drivers/infiniband/core/uverbs_main.c         |   5
-rw-r--r--  drivers/infiniband/hw/cxgb4/cm.c              |  32
-rw-r--r--  drivers/infiniband/hw/cxgb4/device.c          |   2
-rw-r--r--  drivers/infiniband/hw/mlx5/main.c             |   8
-rw-r--r--  drivers/infiniband/hw/mlx5/mem.c              |  18
-rw-r--r--  drivers/infiniband/hw/mlx5/mr.c               |   6
-rw-r--r--  drivers/infiniband/hw/mlx5/qp.c               | 149
-rw-r--r--  drivers/infiniband/ulp/iser/iscsi_iser.c      | 313
-rw-r--r--  drivers/infiniband/ulp/iser/iscsi_iser.h      | 408
-rw-r--r--  drivers/infiniband/ulp/iser/iser_initiator.c  | 198
-rw-r--r--  drivers/infiniband/ulp/iser/iser_memory.c     |  99
-rw-r--r--  drivers/infiniband/ulp/iser/iser_verbs.c      | 667
-rw-r--r--  drivers/infiniband/ulp/isert/ib_isert.c       |  65
14 files changed, 1185 insertions(+), 787 deletions(-)
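Several cxgb4 hunks below replace a hard-coded IPv4 header size with an address-family-aware one and swap the magic constant 12 for round_up(TCPOLEN_TIMESTAMP, 4) when the TCP timestamp option is negotiated. A minimal userspace sketch of that MSS arithmetic follows; the macro values and the effective_mss() helper are illustrative stand-ins for the kernel's sizeof(struct iphdr)/sizeof(struct ipv6hdr) and the driver's set_emss()/best_mtu(), not driver code:

#include <stdio.h>

/* On-wire header sizes; the kernel takes these from
 * sizeof(struct iphdr), sizeof(struct ipv6hdr) and sizeof(struct tcphdr). */
#define IPV4_HDR_LEN      20
#define IPV6_HDR_LEN      40
#define TCP_HDR_LEN       20
#define TCPOLEN_TIMESTAMP 10	/* TCP timestamp option length, as in <net/tcp.h> */

/* Same rounding the kernel's round_up() performs. */
#define ROUND_UP(x, y) ((((x) + (y) - 1) / (y)) * (y))

/* Payload bytes left once IP + TCP headers (and the padded timestamp
 * option, if negotiated) are subtracted from the path MTU. */
static unsigned short effective_mss(unsigned short mtu, int ipv6, int use_ts)
{
	unsigned short hdr = (ipv6 ? IPV6_HDR_LEN : IPV4_HDR_LEN) + TCP_HDR_LEN;

	if (use_ts)
		hdr += ROUND_UP(TCPOLEN_TIMESTAMP, 4);	/* 10 -> 12 bytes */
	return mtu - hdr;
}

int main(void)
{
	/* 1500-byte Ethernet MTU with timestamps: 1448 (IPv4), 1428 (IPv6) */
	printf("%u %u\n", effective_mss(1500, 0, 1), effective_mss(1500, 1, 1));
	return 0;
}

This is why the old code, which always assumed an IPv4 header, overstated the usable segment size by 20 bytes on IPv6 connections.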
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 0600c50e6215..5ba2a86aab6a 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2518,6 +2518,8 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, attr.grh.sgid_index = cmd.attr.grh.sgid_index; attr.grh.hop_limit = cmd.attr.grh.hop_limit; attr.grh.traffic_class = cmd.attr.grh.traffic_class; + attr.vlan_id = 0; + memset(&attr.dmac, 0, sizeof(attr.dmac)); memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16); ah = ib_create_ah(pd, &attr); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index c73b22a257fe..71ab83fde472 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -477,6 +477,7 @@ static void ib_uverbs_async_handler(struct ib_uverbs_file *file, entry->desc.async.element = element; entry->desc.async.event_type = event; + entry->desc.async.reserved = 0; entry->counter = counter; list_add_tail(&entry->list, &file->async_file->event_list); @@ -502,6 +503,10 @@ void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr) { struct ib_uevent_object *uobj; + /* for XRC target qp's, check that qp is live */ + if (!event->element.qp->uobject || !event->element.qp->uobject->live) + return; + uobj = container_of(event->element.qp->uobject, struct ib_uevent_object, uobject); diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index c2fb71c182a8..fb61f6685809 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -236,10 +236,12 @@ static void release_tid(struct c4iw_rdev *rdev, u32 hwtid, struct sk_buff *skb) static void set_emss(struct c4iw_ep *ep, u16 opt) { ep->emss = ep->com.dev->rdev.lldi.mtus[GET_TCPOPT_MSS(opt)] - - sizeof(struct iphdr) - sizeof(struct tcphdr); + ((AF_INET == ep->com.remote_addr.ss_family) ? + sizeof(struct iphdr) : sizeof(struct ipv6hdr)) - + sizeof(struct tcphdr); ep->mss = ep->emss; if (GET_TCPOPT_TSTAMP(opt)) - ep->emss -= 12; + ep->emss -= round_up(TCPOLEN_TIMESTAMP, 4); if (ep->emss < 128) ep->emss = 128; if (ep->emss & 7) @@ -415,6 +417,7 @@ static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip, return NULL; if (!our_interface(dev, n->dev) && !(n->dev->flags & IFF_LOOPBACK)) { + neigh_release(n); dst_release(&rt->dst); return NULL; } @@ -581,11 +584,14 @@ static void c4iw_record_pm_msg(struct c4iw_ep *ep, } static void best_mtu(const unsigned short *mtus, unsigned short mtu, - unsigned int *idx, int use_ts) + unsigned int *idx, int use_ts, int ipv6) { - unsigned short hdr_size = sizeof(struct iphdr) + + unsigned short hdr_size = (ipv6 ? + sizeof(struct ipv6hdr) : + sizeof(struct iphdr)) + sizeof(struct tcphdr) + - (use_ts ? 12 : 0); + (use_ts ? + round_up(TCPOLEN_TIMESTAMP, 4) : 0); unsigned short data_size = mtu - hdr_size; cxgb4_best_aligned_mtu(mtus, hdr_size, data_size, 8, idx); @@ -634,7 +640,8 @@ static int send_connect(struct c4iw_ep *ep) set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, - enable_tcp_timestamps); + enable_tcp_timestamps, + (AF_INET == ep->com.remote_addr.ss_family) ? 
0 : 1); wscale = compute_wscale(rcv_win); /* @@ -668,6 +675,7 @@ static int send_connect(struct c4iw_ep *ep) if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { opt2 |= T5_OPT_2_VALID; opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); + opt2 |= CONG_CNTRL_VALID; /* OPT_2_ISS for T5 */ } t4_set_arp_err_handler(skb, ep, act_open_req_arp_failure); @@ -713,8 +721,6 @@ static int send_connect(struct c4iw_ep *ep) } else { u32 isn = (prandom_u32() & ~7UL) - 1; - opt2 |= T5_OPT_2_VALID; - opt2 |= CONG_CNTRL_VALID; /* OPT_2_ISS for T5 */ if (peer2peer) isn += 4; @@ -756,10 +762,10 @@ static int send_connect(struct c4iw_ep *ep) t5_req6->peer_ip_lo = *((__be64 *) (ra6->sin6_addr.s6_addr + 8)); t5_req6->opt0 = cpu_to_be64(opt0); - t5_req6->params = (__force __be64)cpu_to_be32( + t5_req6->params = cpu_to_be64(V_FILTER_TUPLE( cxgb4_select_ntuple( ep->com.dev->rdev.lldi.ports[0], - ep->l2t)); + ep->l2t))); t5_req6->rsvd = cpu_to_be32(isn); PDBG("%s snd_isn %u\n", __func__, be32_to_cpu(t5_req6->rsvd)); @@ -1763,7 +1769,8 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) req->tcb.tx_max = (__force __be32) jiffies; req->tcb.rcv_adv = htons(1); best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, - enable_tcp_timestamps); + enable_tcp_timestamps, + (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1); wscale = compute_wscale(rcv_win); /* @@ -2162,7 +2169,8 @@ static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, ep->hwtid)); best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, - enable_tcp_timestamps && req->tcpopt.tstamp); + enable_tcp_timestamps && req->tcpopt.tstamp, + (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1); wscale = compute_wscale(rcv_win); /* diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index f25df5276c22..72f1f052e88c 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -60,7 +60,7 @@ int c4iw_wr_log = 0; module_param(c4iw_wr_log, int, 0444); MODULE_PARM_DESC(c4iw_wr_log, "Enables logging of work request timing data."); -int c4iw_wr_log_size_order = 12; +static int c4iw_wr_log_size_order = 12; module_param(c4iw_wr_log_size_order, int, 0444); MODULE_PARM_DESC(c4iw_wr_log_size_order, "Number of entries (log2) in the work request timing log."); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d8907b20522a..a24431746377 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -650,13 +650,13 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm return -EINVAL; idx = get_index(vma->vm_pgoff); + if (idx >= uuari->num_uars) + return -EINVAL; + pfn = uar_index2pfn(dev, uuari->uars[idx].index); mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx, (unsigned long long)pfn); - if (idx >= uuari->num_uars) - return -EINVAL; - vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, vma->vm_page_prot)) @@ -1414,8 +1414,8 @@ err_dealloc: static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { struct mlx5_ib_dev *dev = context; - destroy_umrc_res(dev); ib_unregister_device(&dev->ib_dev); + destroy_umrc_res(dev); destroy_dev_resources(&dev->devr); free_comp_eqs(dev); ib_dealloc_device(&dev->ib_dev); diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index a3e81444c825..dae07eae9507 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -55,16 +55,17 @@ 
void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, u64 pfn; struct scatterlist *sg; int entry; + unsigned long page_shift = ilog2(umem->page_size); - addr = addr >> PAGE_SHIFT; + addr = addr >> page_shift; tmp = (unsigned long)addr; m = find_first_bit(&tmp, sizeof(tmp)); skip = 1 << m; mask = skip - 1; i = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - len = sg_dma_len(sg) >> PAGE_SHIFT; - pfn = sg_dma_address(sg) >> PAGE_SHIFT; + len = sg_dma_len(sg) >> page_shift; + pfn = sg_dma_address(sg) >> page_shift; for (k = 0; k < len; k++) { if (!(i & mask)) { tmp = (unsigned long)pfn; @@ -103,14 +104,15 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, *ncont = 0; } - *shift = PAGE_SHIFT + m; + *shift = page_shift + m; *count = i; } void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int page_shift, __be64 *pas, int umr) { - int shift = page_shift - PAGE_SHIFT; + unsigned long umem_page_shift = ilog2(umem->page_size); + int shift = page_shift - umem_page_shift; int mask = (1 << shift) - 1; int i, k; u64 cur = 0; @@ -121,11 +123,11 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, i = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - len = sg_dma_len(sg) >> PAGE_SHIFT; + len = sg_dma_len(sg) >> umem_page_shift; base = sg_dma_address(sg); for (k = 0; k < len; k++) { if (!(i & mask)) { - cur = base + (k << PAGE_SHIFT); + cur = base + (k << umem_page_shift); if (umr) cur |= 3; @@ -134,7 +136,7 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, i >> shift, be64_to_cpu(pas[i >> shift])); } else mlx5_ib_dbg(dev, "=====> 0x%llx\n", - base + (k << PAGE_SHIFT)); + base + (k << umem_page_shift)); i++; } } diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 80b3c63eab5d..8ee7cb46e059 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -881,12 +881,12 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int order; int err; - mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx\n", - start, virt_addr, length); + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, virt_addr, length, access_flags); umem = ib_umem_get(pd->uobject->context, start, length, access_flags, 0); if (IS_ERR(umem)) { - mlx5_ib_dbg(dev, "umem get failed\n"); + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); return (void *)umem; } diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8c574b63d77b..f1b49e024664 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1302,6 +1302,11 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, path->rlid = cpu_to_be16(ah->dlid); if (ah->ah_flags & IB_AH_GRH) { + if (ah->grh.sgid_index >= dev->mdev->caps.port[port - 1].gid_table_len) { + pr_err(KERN_ERR "sgid_index (%u) too large. max is %d\n", + ah->grh.sgid_index, dev->mdev->caps.port[port - 1].gid_table_len); + return -EINVAL; + } path->grh_mlid |= 1 << 7; path->mgid_index = ah->grh.sgid_index; path->hop_limit = ah->grh.hop_limit; @@ -1317,22 +1322,6 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, path->static_rate = err; path->port = port; - if (ah->ah_flags & IB_AH_GRH) { - if (ah->grh.sgid_index >= dev->mdev->caps.port[port - 1].gid_table_len) { - pr_err(KERN_ERR "sgid_index (%u) too large. 
max is %d\n", - ah->grh.sgid_index, dev->mdev->caps.port[port - 1].gid_table_len); - return -EINVAL; - } - - path->grh_mlid |= 1 << 7; - path->mgid_index = ah->grh.sgid_index; - path->hop_limit = ah->grh.hop_limit; - path->tclass_flowlabel = - cpu_to_be32((ah->grh.traffic_class << 20) | - (ah->grh.flow_label)); - memcpy(path->rgid, ah->grh.dgid.raw, 16); - } - if (attr_mask & IB_QP_TIMEOUT) path->ackto_lt = attr->timeout << 3; @@ -2020,56 +2009,31 @@ static u8 bs_selector(int block_size) } } -static int format_selector(struct ib_sig_attrs *attr, - struct ib_sig_domain *domain, - int *selector) +static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain, + struct mlx5_bsf_inl *inl) { + /* Valid inline section and allow BSF refresh */ + inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID | + MLX5_BSF_REFRESH_DIF); + inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag); + inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag); + /* repeating block */ + inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK; + inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ? + MLX5_DIF_CRC : MLX5_DIF_IPCS; -#define FORMAT_DIF_NONE 0 -#define FORMAT_DIF_CRC_INC 8 -#define FORMAT_DIF_CRC_NO_INC 12 -#define FORMAT_DIF_CSUM_INC 13 -#define FORMAT_DIF_CSUM_NO_INC 14 + if (domain->sig.dif.ref_remap) + inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG; - switch (domain->sig.dif.type) { - case IB_T10DIF_NONE: - /* No DIF */ - *selector = FORMAT_DIF_NONE; - break; - case IB_T10DIF_TYPE1: /* Fall through */ - case IB_T10DIF_TYPE2: - switch (domain->sig.dif.bg_type) { - case IB_T10DIF_CRC: - *selector = FORMAT_DIF_CRC_INC; - break; - case IB_T10DIF_CSUM: - *selector = FORMAT_DIF_CSUM_INC; - break; - default: - return 1; - } - break; - case IB_T10DIF_TYPE3: - switch (domain->sig.dif.bg_type) { - case IB_T10DIF_CRC: - *selector = domain->sig.dif.type3_inc_reftag ? - FORMAT_DIF_CRC_INC : - FORMAT_DIF_CRC_NO_INC; - break; - case IB_T10DIF_CSUM: - *selector = domain->sig.dif.type3_inc_reftag ? 
- FORMAT_DIF_CSUM_INC : - FORMAT_DIF_CSUM_NO_INC; - break; - default: - return 1; - } - break; - default: - return 1; + if (domain->sig.dif.app_escape) { + if (domain->sig.dif.ref_escape) + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE; + else + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE; } - return 0; + inl->dif_app_bitmask_check = + cpu_to_be16(domain->sig.dif.apptag_check_mask); } static int mlx5_set_bsf(struct ib_mr *sig_mr, @@ -2080,45 +2044,49 @@ static int mlx5_set_bsf(struct ib_mr *sig_mr, struct mlx5_bsf_basic *basic = &bsf->basic; struct ib_sig_domain *mem = &sig_attrs->mem; struct ib_sig_domain *wire = &sig_attrs->wire; - int ret, selector; memset(bsf, 0, sizeof(*bsf)); + + /* Basic + Extended + Inline */ + basic->bsf_size_sbs = 1 << 7; + /* Input domain check byte mask */ + basic->check_byte_mask = sig_attrs->check_mask; + basic->raw_data_size = cpu_to_be32(data_size); + + /* Memory domain */ switch (sig_attrs->mem.sig_type) { + case IB_SIG_TYPE_NONE: + break; case IB_SIG_TYPE_T10_DIF: - if (sig_attrs->wire.sig_type != IB_SIG_TYPE_T10_DIF) - return -EINVAL; + basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); + basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx); + mlx5_fill_inl_bsf(mem, &bsf->m_inl); + break; + default: + return -EINVAL; + } - /* Input domain check byte mask */ - basic->check_byte_mask = sig_attrs->check_mask; + /* Wire domain */ + switch (sig_attrs->wire.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && - mem->sig.dif.type == wire->sig.dif.type) { + mem->sig_type == wire->sig_type) { /* Same block structure */ - basic->bsf_size_sbs = 1 << 4; + basic->bsf_size_sbs |= 1 << 4; if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) - basic->wire.copy_byte_mask |= 0xc0; + basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK; if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) - basic->wire.copy_byte_mask |= 0x30; + basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK; if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) - basic->wire.copy_byte_mask |= 0x0f; + basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK; } else basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); - basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); - basic->raw_data_size = cpu_to_be32(data_size); - - ret = format_selector(sig_attrs, mem, &selector); - if (ret) - return -EINVAL; - basic->m_bfs_psv = cpu_to_be32(selector << 24 | - msig->psv_memory.psv_idx); - - ret = format_selector(sig_attrs, wire, &selector); - if (ret) - return -EINVAL; - basic->w_bfs_psv = cpu_to_be32(selector << 24 | - msig->psv_wire.psv_idx); + basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx); + mlx5_fill_inl_bsf(wire, &bsf->w_inl); break; - default: return -EINVAL; } @@ -2317,20 +2285,21 @@ static int set_psv_wr(struct ib_sig_domain *domain, memset(psv_seg, 0, sizeof(*psv_seg)); psv_seg->psv_num = cpu_to_be32(psv_idx); switch (domain->sig_type) { + case IB_SIG_TYPE_NONE: + break; case IB_SIG_TYPE_T10_DIF: psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | domain->sig.dif.app_tag); psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); - - *seg += sizeof(*psv_seg); - *size += sizeof(*psv_seg) / 16; break; - default: pr_err("Bad signature type given.\n"); return 1; } + *seg += sizeof(*psv_seg); + *size += sizeof(*psv_seg) / 16; + return 0; } diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 93ce62fe1594..f42ab14105ac 
100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -83,7 +83,7 @@ module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO); int iser_debug_level = 0; bool iser_pi_enable = false; -int iser_pi_guard = 0; +int iser_pi_guard = 1; MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover"); MODULE_LICENSE("Dual BSD/GPL"); @@ -97,14 +97,24 @@ module_param_named(pi_enable, iser_pi_enable, bool, 0644); MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); module_param_named(pi_guard, iser_pi_guard, int, 0644); -MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:CRC)"); +MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:IP_CSUM)"); static struct workqueue_struct *release_wq; struct iser_global ig; +/* + * iscsi_iser_recv() - Process a successful recv completion + * @conn: iscsi connection + * @hdr: iscsi header + * @rx_data: buffer containing receive data payload + * @rx_data_len: length of rx_data + * + * Notes: In case of data length errors or iscsi PDU completion failures + * this routine will signal the iscsi layer of connection failure. + */ void -iscsi_iser_recv(struct iscsi_conn *conn, - struct iscsi_hdr *hdr, char *rx_data, int rx_data_len) +iscsi_iser_recv(struct iscsi_conn *conn, struct iscsi_hdr *hdr, + char *rx_data, int rx_data_len) { int rc = 0; int datalen; @@ -135,20 +145,30 @@ error: iscsi_conn_failure(conn, rc); } -static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode) +/** + * iscsi_iser_pdu_alloc() - allocate an iscsi-iser PDU + * @task: iscsi task + * @opcode: iscsi command opcode + * + * Notes: This routine can't fail, it just assigns the iscsi task + * hdr and max hdr size. + */ +static int +iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode) { struct iscsi_iser_task *iser_task = task->dd_data; task->hdr = (struct iscsi_hdr *)&iser_task->desc.iscsi_header; task->hdr_max = sizeof(iser_task->desc.iscsi_header); + return 0; } int iser_initialize_task_headers(struct iscsi_task *task, struct iser_tx_desc *tx_desc) { - struct iser_conn *ib_conn = task->conn->dd_data; - struct iser_device *device = ib_conn->device; + struct iser_conn *iser_conn = task->conn->dd_data; + struct iser_device *device = iser_conn->ib_conn.device; struct iscsi_iser_task *iser_task = task->dd_data; u64 dma_addr; @@ -162,14 +182,18 @@ int iser_initialize_task_headers(struct iscsi_task *task, tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; tx_desc->tx_sg[0].lkey = device->mr->lkey; - iser_task->ib_conn = ib_conn; + iser_task->iser_conn = iser_conn; return 0; } + /** - * iscsi_iser_task_init - Initialize task + * iscsi_iser_task_init() - Initialize iscsi-iser task + * @task: iscsi task * * Initialize the task for the scsi command or mgmt command. + * + * Return: zero on success or -ENOMEM when failing + * to init task headers (dma mapping error). */ static int iscsi_iser_task_init(struct iscsi_task *task) @@ -191,7 +215,7 @@ iscsi_iser_task_init(struct iscsi_task *task) } /** - * iscsi_iser_mtask_xmit - xmit management(immediate) task + * iscsi_iser_mtask_xmit() - xmit management (immediate) task * @conn: iscsi connection * @task: task management task * @@ -249,6 +273,12 @@ iscsi_iser_task_xmit_unsol_data_exit: return error; } +/** + * iscsi_iser_task_xmit() - xmit iscsi-iser task + * @task: iscsi task + * + * Return: zero on success or escalates $error on failure. 
+ */ static int iscsi_iser_task_xmit(struct iscsi_task *task) { @@ -286,12 +316,24 @@ iscsi_iser_task_xmit(struct iscsi_task *task) return error; } +/** + * iscsi_iser_cleanup_task() - cleanup an iscsi-iser task + * @task: iscsi task + * + * Notes: In case the RDMA device is already NULL (might have + * been removed in a DEVICE_REMOVAL CM event) it will bail out + * without doing dma unmapping. + */ static void iscsi_iser_cleanup_task(struct iscsi_task *task) { struct iscsi_iser_task *iser_task = task->dd_data; struct iser_tx_desc *tx_desc = &iser_task->desc; - struct iser_conn *ib_conn = task->conn->dd_data; - struct iser_device *device = ib_conn->device; + struct iser_conn *iser_conn = task->conn->dd_data; + struct iser_device *device = iser_conn->ib_conn.device; + + /* DEVICE_REMOVAL event might have already released the device */ + if (!device) + return; ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); @@ -306,7 +348,20 @@ static void iscsi_iser_cleanup_task(struct iscsi_task *task) { } } -static u8 iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector) +/** + * iscsi_iser_check_protection() - check protection information status of task. + * @task: iscsi task + * @sector: error sector if one exists (output) + * + * Return: zero if no data-integrity errors have occurred + * 0x1: data-integrity error occurred in the guard-block + * 0x2: data-integrity error occurred in the reference tag + * 0x3: data-integrity error occurred in the application tag + * + * In addition the error sector is marked. + */ +static u8 +iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector) { struct iscsi_iser_task *iser_task = task->dd_data; @@ -318,8 +373,17 @@ static u8 iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector) sector); } +/** + * iscsi_iser_conn_create() - create a new iscsi-iser connection + * @cls_session: iscsi class session + * @conn_idx: connection index within the session (for MCS) + * + * Return: iscsi_cls_conn when iscsi_conn_setup succeeds or NULL + * otherwise. + */ static struct iscsi_cls_conn * -iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx) +iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, + uint32_t conn_idx) { struct iscsi_conn *conn; struct iscsi_cls_conn *cls_conn; @@ -338,13 +402,25 @@ iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx) return cls_conn; } +/** + * iscsi_iser_conn_bind() - bind iscsi and iser connection structures + * @cls_session: iscsi class session + * @cls_conn: iscsi class connection + * @transport_eph: transport end-point handle + * @is_leading: indicate if this is the session leading connection (MCS) + * + * Return: zero on success, $error if iscsi_conn_bind fails and + * -EINVAL in case the end-point doesn't exist anymore or the iser connection + * state is not UP (teardown already started). 
+ */ static int iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, - struct iscsi_cls_conn *cls_conn, uint64_t transport_eph, + struct iscsi_cls_conn *cls_conn, + uint64_t transport_eph, int is_leading) { struct iscsi_conn *conn = cls_conn->dd_data; - struct iser_conn *ib_conn; + struct iser_conn *iser_conn; struct iscsi_endpoint *ep; int error; @@ -360,66 +436,100 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, (unsigned long long)transport_eph); return -EINVAL; } - ib_conn = ep->dd_data; + iser_conn = ep->dd_data; - mutex_lock(&ib_conn->state_mutex); - if (ib_conn->state != ISER_CONN_UP) { + mutex_lock(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_UP) { error = -EINVAL; iser_err("iser_conn %p state is %d, teardown started\n", - ib_conn, ib_conn->state); + iser_conn, iser_conn->state); goto out; } - error = iser_alloc_rx_descriptors(ib_conn, conn->session); + error = iser_alloc_rx_descriptors(iser_conn, conn->session); if (error) goto out; /* binds the iSER connection retrieved from the previously * connected ep_handle to the iSCSI layer connection. exchanges * connection pointers */ - iser_info("binding iscsi conn %p to ib_conn %p\n", conn, ib_conn); + iser_info("binding iscsi conn %p to iser_conn %p\n", conn, iser_conn); - conn->dd_data = ib_conn; - ib_conn->iscsi_conn = conn; + conn->dd_data = iser_conn; + iser_conn->iscsi_conn = conn; out: - mutex_unlock(&ib_conn->state_mutex); + mutex_unlock(&iser_conn->state_mutex); return error; } +/** + * iscsi_iser_conn_start() - start iscsi-iser connection + * @cls_conn: iscsi class connection + * + * Notes: Here iser initializes (or re-initializes) stop_completion as + * from this point iscsi must call conn_stop in session/connection + * teardown so iser transport must wait for it. + */ static int iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn) { struct iscsi_conn *iscsi_conn; - struct iser_conn *ib_conn; + struct iser_conn *iser_conn; iscsi_conn = cls_conn->dd_data; - ib_conn = iscsi_conn->dd_data; - reinit_completion(&ib_conn->stop_completion); + iser_conn = iscsi_conn->dd_data; + reinit_completion(&iser_conn->stop_completion); return iscsi_conn_start(cls_conn); } +/** + * iscsi_iser_conn_stop() - stop iscsi-iser connection + * @cls_conn: iscsi class connection + * @flag: indicate if recover or terminate (passed as is) + * + * Notes: Calling iscsi_conn_stop might theoretically race with + * DEVICE_REMOVAL event and dereference a previously freed RDMA device + * handle, so we call it under the iser state lock to protect against + * this kind of race. + */ static void iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) { struct iscsi_conn *conn = cls_conn->dd_data; - struct iser_conn *ib_conn = conn->dd_data; + struct iser_conn *iser_conn = conn->dd_data; - iser_dbg("stopping iscsi_conn: %p, ib_conn: %p\n", conn, ib_conn); - iscsi_conn_stop(cls_conn, flag); + iser_info("stopping iscsi_conn: %p, iser_conn: %p\n", conn, iser_conn); /* * Userspace may have goofed up and not bound the connection or * might have only partially setup the connection. 
*/ - if (ib_conn) { + if (iser_conn) { + mutex_lock(&iser_conn->state_mutex); + iscsi_conn_stop(cls_conn, flag); + iser_conn_terminate(iser_conn); + + /* unbind */ + iser_conn->iscsi_conn = NULL; conn->dd_data = NULL; - complete(&ib_conn->stop_completion); + + complete(&iser_conn->stop_completion); + mutex_unlock(&iser_conn->state_mutex); + } else { + iscsi_conn_stop(cls_conn, flag); } } -static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) +/** + * iscsi_iser_session_destroy() - destroy iscsi-iser session + * @cls_session: iscsi class session + * + * Removes and frees the iscsi host. + */ +static void +iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) { struct Scsi_Host *shost = iscsi_session_to_shost(cls_session); @@ -439,6 +549,16 @@ iser_dif_prot_caps(int prot_caps) SHOST_DIX_TYPE3_PROTECTION : 0); } +/** + * iscsi_iser_session_create() - create an iscsi-iser session + * @ep: iscsi end-point handle + * @cmds_max: maximum commands in this session + * @qdepth: session command queue depth + * @initial_cmdsn: initiator command sequence number + * + * Allocates and adds a scsi host, exposes DIF support if + * it exists, and sets up an iscsi session. + */ static struct iscsi_cls_session * iscsi_iser_session_create(struct iscsi_endpoint *ep, uint16_t cmds_max, uint16_t qdepth, @@ -447,7 +567,8 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, struct iscsi_cls_session *cls_session; struct iscsi_session *session; struct Scsi_Host *shost; - struct iser_conn *ib_conn = NULL; + struct iser_conn *iser_conn = NULL; + struct ib_conn *ib_conn; shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); if (!shost) @@ -464,7 +585,8 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, * the leading conn's ep so this will be NULL; */ if (ep) { - ib_conn = ep->dd_data; + iser_conn = ep->dd_data; + ib_conn = &iser_conn->ib_conn; if (ib_conn->pi_support) { u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap; @@ -476,8 +598,8 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, } } - if (iscsi_host_add(shost, - ep ? ib_conn->device->ib_device->dma_device : NULL)) + if (iscsi_host_add(shost, ep ? + ib_conn->device->ib_device->dma_device : NULL)) goto free_host; if (cmds_max > ISER_DEF_XMIT_CMDS_MAX) { @@ -549,6 +671,13 @@ iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn, return 0; } +/** + * iscsi_iser_conn_get_stats() - get iscsi connection statistics + * @cls_conn: iscsi class connection + * @stats: iscsi stats to output + * + * Output connection statistics. 
+ */ static void iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *stats) { @@ -577,18 +706,18 @@ iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *s static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep, enum iscsi_param param, char *buf) { - struct iser_conn *ib_conn = ep->dd_data; + struct iser_conn *iser_conn = ep->dd_data; int len; switch (param) { case ISCSI_PARAM_CONN_PORT: case ISCSI_PARAM_CONN_ADDRESS: - if (!ib_conn || !ib_conn->cma_id) + if (!iser_conn || !iser_conn->ib_conn.cma_id) return -ENOTCONN; return iscsi_conn_get_addr_param((struct sockaddr_storage *) - &ib_conn->cma_id->route.addr.dst_addr, - param, buf); + &iser_conn->ib_conn.cma_id->route.addr.dst_addr, + param, buf); break; default: return -ENOSYS; @@ -597,29 +726,44 @@ static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep, return len; } +/** + * iscsi_iser_ep_connect() - Initiate iSER connection establishment + * @shost: scsi_host + * @dst_addr: destination address + * @non_blocking: indicate if routine can block + * + * Allocate an iscsi endpoint, an iser_conn structure and bind them. + * After that start RDMA connection establishment via rdma_cm. We + * don't allocate iser_conn embedded in iscsi_endpoint since in teardown + * the endpoint will be destroyed at ep_disconnect while iser_conn will + * cleanup its resources asynchronously. + * + * Return: iscsi_endpoint created by iscsi layer or ERR_PTR(error) + * if it fails. + */ static struct iscsi_endpoint * iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, int non_blocking) { int err; - struct iser_conn *ib_conn; + struct iser_conn *iser_conn; struct iscsi_endpoint *ep; ep = iscsi_create_endpoint(0); if (!ep) return ERR_PTR(-ENOMEM); - ib_conn = kzalloc(sizeof(*ib_conn), GFP_KERNEL); - if (!ib_conn) { + iser_conn = kzalloc(sizeof(*iser_conn), GFP_KERNEL); + if (!iser_conn) { err = -ENOMEM; goto failure; } - ep->dd_data = ib_conn; - ib_conn->ep = ep; - iser_conn_init(ib_conn); + ep->dd_data = iser_conn; + iser_conn->ep = ep; + iser_conn_init(iser_conn); - err = iser_connect(ib_conn, NULL, dst_addr, non_blocking); + err = iser_connect(iser_conn, NULL, dst_addr, non_blocking); if (err) goto failure; @@ -629,25 +773,38 @@ failure: return ERR_PTR(err); } +/** + * iscsi_iser_ep_poll() - poll for iser connection establishment to complete + * @ep: iscsi endpoint (created at ep_connect) + * @timeout_ms: polling timeout allowed in ms. + * + * This routine boils down to waiting for up_completion signaling + * that cma_id got CONNECTED event. + * + * Return: 1 if succeeded in connection establishment, 0 if timeout expired + * (libiscsi retry will kick in) or -1 if interrupted by signal + * or more likely iser connection state transitioned to TERMINATING or + * DOWN during the wait period. 
+ */ static int iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) { - struct iser_conn *ib_conn; + struct iser_conn *iser_conn; int rc; - ib_conn = ep->dd_data; - rc = wait_for_completion_interruptible_timeout(&ib_conn->up_completion, + iser_conn = ep->dd_data; + rc = wait_for_completion_interruptible_timeout(&iser_conn->up_completion, msecs_to_jiffies(timeout_ms)); /* if conn establishment failed, return error code to iscsi */ if (rc == 0) { - mutex_lock(&ib_conn->state_mutex); - if (ib_conn->state == ISER_CONN_TERMINATING || - ib_conn->state == ISER_CONN_DOWN) + mutex_lock(&iser_conn->state_mutex); + if (iser_conn->state == ISER_CONN_TERMINATING || + iser_conn->state == ISER_CONN_DOWN) rc = -1; - mutex_unlock(&ib_conn->state_mutex); + mutex_unlock(&iser_conn->state_mutex); } - iser_info("ib conn %p rc = %d\n", ib_conn, rc); + iser_info("ib conn %p rc = %d\n", iser_conn, rc); if (rc > 0) return 1; /* success, this is the equivalent of POLLOUT */ @@ -657,15 +814,26 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) return rc; /* signal */ } +/** + * iscsi_iser_ep_disconnect() - Initiate connection teardown process + * @ep: iscsi endpoint handle + * + * This routine is not blocked by iser and RDMA termination process + * completion as we queue deferred work for iser/RDMA destruction + * and cleanup or actually call it immediately in case we didn't pass + * iscsi conn bind/start stage, thus it is safe. + */ static void iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) { - struct iser_conn *ib_conn; + struct iser_conn *iser_conn; + + iser_conn = ep->dd_data; + iser_info("ep %p iser conn %p state %d\n", + ep, iser_conn, iser_conn->state); - ib_conn = ep->dd_data; - iser_info("ep %p ib conn %p state %d\n", ep, ib_conn, ib_conn->state); - mutex_lock(&ib_conn->state_mutex); - iser_conn_terminate(ib_conn); + mutex_lock(&iser_conn->state_mutex); + iser_conn_terminate(iser_conn); /* * if iser_conn and iscsi_conn are bound, we must wait for @@ -673,14 +841,14 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) * the iser resources. Otherwise we are safe to free resources * immediately. 
*/ - if (ib_conn->iscsi_conn) { - INIT_WORK(&ib_conn->release_work, iser_release_work); - queue_work(release_wq, &ib_conn->release_work); - mutex_unlock(&ib_conn->state_mutex); + if (iser_conn->iscsi_conn) { + INIT_WORK(&iser_conn->release_work, iser_release_work); + queue_work(release_wq, &iser_conn->release_work); + mutex_unlock(&iser_conn->state_mutex); } else { - ib_conn->state = ISER_CONN_DOWN; - mutex_unlock(&ib_conn->state_mutex); - iser_conn_release(ib_conn); + iser_conn->state = ISER_CONN_DOWN; + mutex_unlock(&iser_conn->state_mutex); + iser_conn_release(iser_conn); } iscsi_destroy_endpoint(ep); } @@ -843,7 +1011,7 @@ register_transport_failure: static void __exit iser_exit(void) { - struct iser_conn *ib_conn, *n; + struct iser_conn *iser_conn, *n; int connlist_empty; iser_dbg("Removing iSER datamover...\n"); @@ -856,8 +1024,9 @@ static void __exit iser_exit(void) if (!connlist_empty) { iser_err("Error cleanup stage completed but we still have iser " "connections, destroying them anyway.\n"); - list_for_each_entry_safe(ib_conn, n, &ig.connlist, conn_list) { - iser_conn_release(ib_conn); + list_for_each_entry_safe(iser_conn, n, &ig.connlist, + conn_list) { + iser_conn_release(iser_conn); } } diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 9f0e0e34d6ca..cd4174ca9a76 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -69,39 +69,38 @@ #define DRV_NAME "iser" #define PFX DRV_NAME ": " -#define DRV_VER "1.4.1" +#define DRV_VER "1.4.8" -#define iser_dbg(fmt, arg...) \ - do { \ - if (iser_debug_level > 2) \ - printk(KERN_DEBUG PFX "%s:" fmt,\ - __func__ , ## arg); \ +#define iser_dbg(fmt, arg...) \ + do { \ + if (iser_debug_level > 2) \ + printk(KERN_DEBUG PFX "%s: " fmt,\ + __func__ , ## arg); \ } while (0) #define iser_warn(fmt, arg...) \ do { \ if (iser_debug_level > 0) \ - pr_warn(PFX "%s:" fmt, \ + pr_warn(PFX "%s: " fmt, \ __func__ , ## arg); \ } while (0) #define iser_info(fmt, arg...) \ do { \ if (iser_debug_level > 1) \ - pr_info(PFX "%s:" fmt, \ + pr_info(PFX "%s: " fmt, \ __func__ , ## arg); \ } while (0) #define iser_err(fmt, arg...) 
\ do { \ - printk(KERN_ERR PFX "%s:" fmt, \ + printk(KERN_ERR PFX "%s: " fmt, \ __func__ , ## arg); \ } while (0) #define SHIFT_4K 12 #define SIZE_4K (1ULL << SHIFT_4K) #define MASK_4K (~(SIZE_4K-1)) - /* support up to 512KB in one RDMA */ #define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K) #define ISER_DEF_XMIT_CMDS_DEFAULT 512 @@ -145,18 +144,32 @@ ISER_MAX_TX_MISC_PDUS + \ ISER_MAX_RX_MISC_PDUS) +#define ISER_WC_BATCH_COUNT 16 +#define ISER_SIGNAL_CMD_COUNT 32 + #define ISER_VER 0x10 #define ISER_WSV 0x08 #define ISER_RSV 0x04 #define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL +#define ISER_BEACON_WRID 0xfffffffffffffffeULL +/** + * struct iser_hdr - iSER header + * + * @flags: flags support (zbva, remote_inv) + * @rsvd: reserved + * @write_stag: write rkey + * @write_va: write virtual address + * @read_stag: read rkey + * @read_va: read virtual address + */ struct iser_hdr { u8 flags; u8 rsvd[3]; - __be32 write_stag; /* write rkey */ + __be32 write_stag; __be64 write_va; - __be32 read_stag; /* read rkey */ + __be32 read_stag; __be64 read_va; } __attribute__((packed)); @@ -179,7 +192,7 @@ struct iser_cm_hdr { /* Length of an object name string */ #define ISER_OBJECT_NAME_SIZE 64 -enum iser_ib_conn_state { +enum iser_conn_state { ISER_CONN_INIT, /* descriptor allocd, no conn */ ISER_CONN_PENDING, /* in the process of being established */ ISER_CONN_UP, /* up and running */ @@ -200,23 +213,42 @@ enum iser_data_dir { ISER_DIRS_NUM }; +/** + * struct iser_data_buf - iSER data buffer + * + * @buf: pointer to the sg list + * @size: num entries of this sg + * @data_len: total buffer byte len + * @dma_nents: returned by dma_map_sg + * @copy_buf: allocated copy buf for SGs unaligned + * for rdma which are copied + * @sg_single: SG-ified clone of a non SG SC or + * unaligned SG + */ struct iser_data_buf { - void *buf; /* pointer to the sg list */ - unsigned int size; /* num entries of this sg */ - unsigned long data_len; /* total data len */ - unsigned int dma_nents; /* returned by dma_map_sg */ - char *copy_buf; /* allocated copy buf for SGs unaligned * - * for rdma which are copied */ - struct scatterlist sg_single; /* SG-ified clone of a non SG SC or * - * unaligned SG */ + void *buf; + unsigned int size; + unsigned long data_len; + unsigned int dma_nents; + char *copy_buf; + struct scatterlist sg_single; }; /* fwd declarations */ struct iser_device; -struct iser_cq_desc; struct iscsi_iser_task; struct iscsi_endpoint; +/** + * struct iser_mem_reg - iSER memory registration info + * + * @lkey: MR local key + * @rkey: MR remote key + * @va: MR start address (buffer va) + * @len: MR length + * @mem_h: pointer to registration context (FMR/Fastreg) + * @is_mr: indicates whether we registered the buffer + */ struct iser_mem_reg { u32 lkey; u32 rkey; @@ -226,11 +258,20 @@ struct iser_mem_reg { int is_mr; }; +/** + * struct iser_regd_buf - iSER buffer registration desc + * + * @reg: memory registration info + * @virt_addr: virtual address of buffer + * @device: reference to iser device + * @direction: dma direction (for dma_unmap) + * @data_size: data buffer size in bytes + */ struct iser_regd_buf { - struct iser_mem_reg reg; /* memory registration info */ + struct iser_mem_reg reg; void *virt_addr; - struct iser_device *device; /* device->device for dma_unmap */ - enum dma_data_direction direction; /* direction for dma_unmap */ + struct iser_device *device; + enum dma_data_direction direction; unsigned int data_size; }; @@ -240,19 +281,39 @@ enum iser_desc_type { ISCSI_TX_DATAOUT }; +/** + * struct 
iser_tx_desc - iSER TX descriptor (for send wr_id) + * + * @iser_header: iser header + * @iscsi_header: iscsi header + * @type: command/control/dataout + * @dma_addr: header buffer dma_address + * @tx_sg: sg[0] points to iser/iscsi headers + * sg[1] optionally points to either of immediate data + * unsolicited data-out or control + * @num_sge: number of sges used on this TX task + */ struct iser_tx_desc { struct iser_hdr iser_header; struct iscsi_hdr iscsi_header; enum iser_desc_type type; u64 dma_addr; - /* sg[0] points to iser/iscsi headers, sg[1] optionally points to either - of immediate data, unsolicited data-out or control (login,text) */ struct ib_sge tx_sg[2]; int num_sge; }; #define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ sizeof(u64) + sizeof(struct ib_sge))) +/** + * struct iser_rx_desc - iSER RX descriptor (for recv wr_id) + * + * @iser_header: iser header + * @iscsi_header: iscsi header + * @data: received data segment + * @dma_addr: receive buffer dma address + * @rx_sg: ib_sge of receive buffer + * @pad: for sense data TODO: Modify to maximum sense length supported + */ struct iser_rx_desc { struct iser_hdr iser_header; struct iscsi_hdr iscsi_header; @@ -265,25 +326,59 @@ struct iser_rx_desc { #define ISER_MAX_CQ 4 struct iser_conn; +struct ib_conn; struct iscsi_iser_task; +/** + * struct iser_comp - iSER completion context + * + * @device: pointer to device handle + * @cq: completion queue + * @wcs: work completion array + * @tasklet: Tasklet handle + * @active_qps: Number of active QPs attached + * to completion context + */ +struct iser_comp { + struct iser_device *device; + struct ib_cq *cq; + struct ib_wc wcs[ISER_WC_BATCH_COUNT]; + struct tasklet_struct tasklet; + int active_qps; +}; + +/** + * struct iser_device - iSER device handle + * + * @ib_device: RDMA device + * @pd: Protection Domain for this device + * @dev_attr: Device attributes container + * @mr: Global DMA memory region + * @event_handler: IB events handle routine + * @ig_list: entry in devices list + * @refcount: Reference counter, dominated by open iser connections + * @comps_used: Number of completion contexts used, Min between online + * cpus and device max completion vectors + * @comps: Dynamically allocated array of completion handlers + * Memory registration pool Function pointers (FMR or Fastreg): + * @iser_alloc_rdma_reg_res: Allocation of memory regions pool + * @iser_free_rdma_reg_res: Free of memory regions pool + * @iser_reg_rdma_mem: Memory registration routine + * @iser_unreg_rdma_mem: Memory deregistration routine + */ struct iser_device { struct ib_device *ib_device; struct ib_pd *pd; struct ib_device_attr dev_attr; - struct ib_cq *rx_cq[ISER_MAX_CQ]; - struct ib_cq *tx_cq[ISER_MAX_CQ]; struct ib_mr *mr; - struct tasklet_struct cq_tasklet[ISER_MAX_CQ]; struct ib_event_handler event_handler; - struct list_head ig_list; /* entry in ig devices list */ + struct list_head ig_list; int refcount; - int cq_active_qps[ISER_MAX_CQ]; - int cqs_used; - struct iser_cq_desc *cq_desc; - int (*iser_alloc_rdma_reg_res)(struct iser_conn *ib_conn, + int comps_used; + struct iser_comp comps[ISER_MAX_CQ]; + int (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn, unsigned cmds_max); - void (*iser_free_rdma_reg_res)(struct iser_conn *ib_conn); + void (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn); int (*iser_reg_rdma_mem)(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir); void (*iser_unreg_rdma_mem)(struct iscsi_iser_task *iser_task, @@ -301,78 +396,160 @@ enum iser_reg_indicator { 
ISER_FASTREG_PROTECTED = 1 << 3, }; +/** + * struct iser_pi_context - Protection information context + * + * @prot_mr: protection memory region + * @prot_frpl: protection fastreg page list + * @sig_mr: signature feature enabled memory region + */ struct iser_pi_context { struct ib_mr *prot_mr; struct ib_fast_reg_page_list *prot_frpl; struct ib_mr *sig_mr; }; +/** + * struct fast_reg_descriptor - Fast registration descriptor + * + * @list: entry in connection fastreg pool + * @data_mr: data memory region + * @data_frpl: data fastreg page list + * @pi_ctx: protection information context + * @reg_indicators: fast registration indicators + */ struct fast_reg_descriptor { struct list_head list; - /* For fast registration - FRWR */ struct ib_mr *data_mr; struct ib_fast_reg_page_list *data_frpl; struct iser_pi_context *pi_ctx; - /* registration indicators container */ u8 reg_indicators; }; +/** + * struct ib_conn - Infiniband related objects + * + * @cma_id: rdma_cm connection manager handle + * @qp: Connection Queue-pair + * @post_recv_buf_count: post receive counter + * @rx_wr: receive work request for batch posts + * @device: reference to iser device + * @comp: iser completion context + * @pi_support: Indicate device T10-PI support + * @beacon: beacon send wr to signal all flush errors were drained + * @flush_comp: completes when all connection completions consumed + * @lock: protects fmr/fastreg pool + * @union.fmr: + * @pool: FMR pool for fast registrations + * @page_vec: page vector to hold mapped commands pages + * used for registration + * @union.fastreg: + * @pool: Fast registration descriptors pool for fast + * registrations + * @pool_size: Size of pool + */ +struct ib_conn { + struct rdma_cm_id *cma_id; + struct ib_qp *qp; + int post_recv_buf_count; + struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; + struct iser_device *device; + struct iser_comp *comp; + bool pi_support; + struct ib_send_wr beacon; + struct completion flush_comp; + spinlock_t lock; + union { + struct { + struct ib_fmr_pool *pool; + struct iser_page_vec *page_vec; + } fmr; + struct { + struct list_head pool; + int pool_size; + } fastreg; + }; +}; + +/** + * struct iser_conn - iSER connection context + * + * @ib_conn: connection RDMA resources + * @iscsi_conn: link to matching iscsi connection + * @ep: transport handle + * @state: connection logical state + * @qp_max_recv_dtos: maximum number of data outs, corresponds + * to max number of post recvs + * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1) + * @min_posted_rx: (qp_max_recv_dtos >> 2) + * @name: connection peer portal + * @release_work: deferred work for release job + * @state_mutex: protects iser connection state + * @stop_completion: conn_stop completion + * @ib_completion: RDMA cleanup completion + * @up_completion: connection establishment completed + * (state is ISER_CONN_UP) + * @conn_list: entry in ig conn list + * @login_buf: login data buffer (stores login parameters) + * @login_req_buf: login request buffer + * @login_req_dma: login request buffer dma address + * @login_resp_buf: login response buffer + * @login_resp_dma: login response buffer dma address + * @rx_desc_head: head of rx_descs cyclic buffer + * @rx_descs: rx buffers array (cyclic buffer) + * @num_rx_descs: number of rx descriptors + */ struct iser_conn { + struct ib_conn ib_conn; struct iscsi_conn *iscsi_conn; struct iscsi_endpoint *ep; - enum iser_ib_conn_state state; /* rdma connection state */ - atomic_t refcount; - spinlock_t lock; /* used for state changes */ - struct iser_device 
*device; /* device context */ - struct rdma_cm_id *cma_id; /* CMA ID */ - struct ib_qp *qp; /* QP */ - unsigned qp_max_recv_dtos; /* num of rx buffers */ - unsigned qp_max_recv_dtos_mask; /* above minus 1 */ - unsigned min_posted_rx; /* qp_max_recv_dtos >> 2 */ - int post_recv_buf_count; /* posted rx count */ - atomic_t post_send_buf_count; /* posted tx count */ + enum iser_conn_state state; + unsigned qp_max_recv_dtos; + unsigned qp_max_recv_dtos_mask; + unsigned min_posted_rx; char name[ISER_OBJECT_NAME_SIZE]; struct work_struct release_work; - struct completion stop_completion; struct mutex state_mutex; - struct completion flush_completion; + struct completion stop_completion; + struct completion ib_completion; struct completion up_completion; - struct list_head conn_list; /* entry in ig conn list */ + struct list_head conn_list; char *login_buf; char *login_req_buf, *login_resp_buf; u64 login_req_dma, login_resp_dma; unsigned int rx_desc_head; struct iser_rx_desc *rx_descs; - struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; - bool pi_support; - - /* Connection memory registration pool */ - union { - struct { - struct ib_fmr_pool *pool; /* pool of IB FMRs */ - struct iser_page_vec *page_vec; /* represents SG to fmr maps* - * maps serialized as tx is*/ - } fmr; - struct { - struct list_head pool; - int pool_size; - } fastreg; - }; + u32 num_rx_descs; }; +/** + * struct iscsi_iser_task - iser task context + * + * @desc: TX descriptor + * @iser_conn: link to iser connection + * @status: current task status + * @sc: link to scsi command + * @command_sent: indicate if command was sent + * @dir: iser data direction + * @rdma_regd: task rdma registration desc + * @data: iser data buffer desc + * @data_copy: iser data copy buffer desc (bounce buffer) + * @prot: iser protection buffer desc + * @prot_copy: iser protection copy buffer desc (bounce buffer) + */ struct iscsi_iser_task { struct iser_tx_desc desc; - struct iser_conn *ib_conn; + struct iser_conn *iser_conn; enum iser_task_status status; struct scsi_cmnd *sc; - int command_sent; /* set if command sent */ - int dir[ISER_DIRS_NUM]; /* set if dir use*/ - struct iser_regd_buf rdma_regd[ISER_DIRS_NUM];/* regd rdma buf */ - struct iser_data_buf data[ISER_DIRS_NUM]; /* orig. data des*/ - struct iser_data_buf data_copy[ISER_DIRS_NUM];/* contig. 
copy */ - struct iser_data_buf prot[ISER_DIRS_NUM]; /* prot desc */ - struct iser_data_buf prot_copy[ISER_DIRS_NUM];/* prot copy */ + int command_sent; + int dir[ISER_DIRS_NUM]; + struct iser_regd_buf rdma_regd[ISER_DIRS_NUM]; + struct iser_data_buf data[ISER_DIRS_NUM]; + struct iser_data_buf data_copy[ISER_DIRS_NUM]; + struct iser_data_buf prot[ISER_DIRS_NUM]; + struct iser_data_buf prot_copy[ISER_DIRS_NUM]; }; struct iser_page_vec { @@ -382,17 +559,20 @@ struct iser_page_vec { int data_size; }; -struct iser_cq_desc { - struct iser_device *device; - int cq_index; -}; - +/** + * struct iser_global: iSER global context + * + * @device_list_mutex: protects device_list + * @device_list: iser devices global list + * @connlist_mutex: protects connlist + * @connlist: iser connections global list + * @desc_cache: kmem cache for tx dataout + */ struct iser_global { - struct mutex device_list_mutex;/* */ - struct list_head device_list; /* all iSER devices */ + struct mutex device_list_mutex; + struct list_head device_list; struct mutex connlist_mutex; - struct list_head connlist; /* all iSER IB connections */ - + struct list_head connlist; struct kmem_cache *desc_cache; }; @@ -401,9 +581,6 @@ extern int iser_debug_level; extern bool iser_pi_enable; extern int iser_pi_guard; -/* allocate connection resources needed for rdma functionality */ -int iser_conn_set_full_featured_mode(struct iscsi_conn *conn); - int iser_send_control(struct iscsi_conn *conn, struct iscsi_task *task); @@ -415,29 +592,30 @@ int iser_send_data_out(struct iscsi_conn *conn, struct iscsi_data *hdr); void iscsi_iser_recv(struct iscsi_conn *conn, - struct iscsi_hdr *hdr, - char *rx_data, - int rx_data_len); + struct iscsi_hdr *hdr, + char *rx_data, + int rx_data_len); -void iser_conn_init(struct iser_conn *ib_conn); +void iser_conn_init(struct iser_conn *iser_conn); -void iser_conn_release(struct iser_conn *ib_conn); +void iser_conn_release(struct iser_conn *iser_conn); -void iser_conn_terminate(struct iser_conn *ib_conn); +int iser_conn_terminate(struct iser_conn *iser_conn); void iser_release_work(struct work_struct *work); void iser_rcv_completion(struct iser_rx_desc *desc, - unsigned long dto_xfer_len, - struct iser_conn *ib_conn); + unsigned long dto_xfer_len, + struct ib_conn *ib_conn); -void iser_snd_completion(struct iser_tx_desc *desc, struct iser_conn *ib_conn); +void iser_snd_completion(struct iser_tx_desc *desc, + struct ib_conn *ib_conn); void iser_task_rdma_init(struct iscsi_iser_task *task); void iser_task_rdma_finalize(struct iscsi_iser_task *task); -void iser_free_rx_descriptors(struct iser_conn *ib_conn); +void iser_free_rx_descriptors(struct iser_conn *iser_conn); void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, struct iser_data_buf *mem, @@ -449,38 +627,40 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task, int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task, enum iser_data_dir cmd_dir); -int iser_connect(struct iser_conn *ib_conn, - struct sockaddr *src_addr, - struct sockaddr *dst_addr, - int non_blocking); +int iser_connect(struct iser_conn *iser_conn, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, + int non_blocking); -int iser_reg_page_vec(struct iser_conn *ib_conn, +int iser_reg_page_vec(struct ib_conn *ib_conn, struct iser_page_vec *page_vec, - struct iser_mem_reg *mem_reg); + struct iser_mem_reg *mem_reg); void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir); void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, 
enum iser_data_dir cmd_dir); -int iser_post_recvl(struct iser_conn *ib_conn); -int iser_post_recvm(struct iser_conn *ib_conn, int count); -int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc); +int iser_post_recvl(struct iser_conn *iser_conn); +int iser_post_recvm(struct iser_conn *iser_conn, int count); +int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, + bool signal); int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, - struct iser_data_buf *data, - enum iser_data_dir iser_dir, - enum dma_data_direction dma_dir); + struct iser_data_buf *data, + enum iser_data_dir iser_dir, + enum dma_data_direction dma_dir); void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, struct iser_data_buf *data); int iser_initialize_task_headers(struct iscsi_task *task, struct iser_tx_desc *tx_desc); -int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session); -int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max); -void iser_free_fmr_pool(struct iser_conn *ib_conn); -int iser_create_fastreg_pool(struct iser_conn *ib_conn, unsigned cmds_max); -void iser_free_fastreg_pool(struct iser_conn *ib_conn); +int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, + struct iscsi_session *session); +int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max); +void iser_free_fmr_pool(struct ib_conn *ib_conn); +int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max); +void iser_free_fastreg_pool(struct ib_conn *ib_conn); u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir, sector_t *sector); #endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 8d44a4060634..5a489ea63732 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -49,7 +49,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) { struct iscsi_iser_task *iser_task = task->dd_data; - struct iser_device *device = iser_task->ib_conn->device; + struct iser_device *device = iser_task->iser_conn->ib_conn.device; struct iser_regd_buf *regd_buf; int err; struct iser_hdr *hdr = &iser_task->desc.iser_header; @@ -103,7 +103,7 @@ iser_prepare_write_cmd(struct iscsi_task *task, unsigned int edtl) { struct iscsi_iser_task *iser_task = task->dd_data; - struct iser_device *device = iser_task->ib_conn->device; + struct iser_device *device = iser_task->iser_conn->ib_conn.device; struct iser_regd_buf *regd_buf; int err; struct iser_hdr *hdr = &iser_task->desc.iser_header; @@ -160,10 +160,10 @@ iser_prepare_write_cmd(struct iscsi_task *task, } /* creates a new tx descriptor and adds header regd buffer */ -static void iser_create_send_desc(struct iser_conn *ib_conn, +static void iser_create_send_desc(struct iser_conn *iser_conn, struct iser_tx_desc *tx_desc) { - struct iser_device *device = ib_conn->device; + struct iser_device *device = iser_conn->ib_conn.device; ib_dma_sync_single_for_cpu(device->ib_device, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); @@ -179,103 +179,108 @@ static void iser_create_send_desc(struct iser_conn *ib_conn, } } -static void iser_free_login_buf(struct iser_conn *ib_conn) +static void iser_free_login_buf(struct iser_conn *iser_conn) { - if (!ib_conn->login_buf) + struct iser_device *device = iser_conn->ib_conn.device; + + if (!iser_conn->login_buf) return; - if (ib_conn->login_req_dma) - ib_dma_unmap_single(ib_conn->device->ib_device, - 
ib_conn->login_req_dma, + if (iser_conn->login_req_dma) + ib_dma_unmap_single(device->ib_device, + iser_conn->login_req_dma, ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); - if (ib_conn->login_resp_dma) - ib_dma_unmap_single(ib_conn->device->ib_device, - ib_conn->login_resp_dma, + if (iser_conn->login_resp_dma) + ib_dma_unmap_single(device->ib_device, + iser_conn->login_resp_dma, ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); - kfree(ib_conn->login_buf); + kfree(iser_conn->login_buf); /* make sure we never redo any unmapping */ - ib_conn->login_req_dma = 0; - ib_conn->login_resp_dma = 0; - ib_conn->login_buf = NULL; + iser_conn->login_req_dma = 0; + iser_conn->login_resp_dma = 0; + iser_conn->login_buf = NULL; } -static int iser_alloc_login_buf(struct iser_conn *ib_conn) +static int iser_alloc_login_buf(struct iser_conn *iser_conn) { - struct iser_device *device; + struct iser_device *device = iser_conn->ib_conn.device; int req_err, resp_err; - BUG_ON(ib_conn->device == NULL); + BUG_ON(device == NULL); - device = ib_conn->device; - - ib_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + + iser_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + ISER_RX_LOGIN_SIZE, GFP_KERNEL); - if (!ib_conn->login_buf) + if (!iser_conn->login_buf) goto out_err; - ib_conn->login_req_buf = ib_conn->login_buf; - ib_conn->login_resp_buf = ib_conn->login_buf + + iser_conn->login_req_buf = iser_conn->login_buf; + iser_conn->login_resp_buf = iser_conn->login_buf + ISCSI_DEF_MAX_RECV_SEG_LEN; - ib_conn->login_req_dma = ib_dma_map_single(ib_conn->device->ib_device, - (void *)ib_conn->login_req_buf, - ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + iser_conn->login_req_dma = ib_dma_map_single(device->ib_device, + iser_conn->login_req_buf, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_TO_DEVICE); - ib_conn->login_resp_dma = ib_dma_map_single(ib_conn->device->ib_device, - (void *)ib_conn->login_resp_buf, - ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); + iser_conn->login_resp_dma = ib_dma_map_single(device->ib_device, + iser_conn->login_resp_buf, + ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); req_err = ib_dma_mapping_error(device->ib_device, - ib_conn->login_req_dma); + iser_conn->login_req_dma); resp_err = ib_dma_mapping_error(device->ib_device, - ib_conn->login_resp_dma); + iser_conn->login_resp_dma); if (req_err || resp_err) { if (req_err) - ib_conn->login_req_dma = 0; + iser_conn->login_req_dma = 0; if (resp_err) - ib_conn->login_resp_dma = 0; + iser_conn->login_resp_dma = 0; goto free_login_buf; } return 0; free_login_buf: - iser_free_login_buf(ib_conn); + iser_free_login_buf(iser_conn); out_err: iser_err("unable to alloc or map login buf\n"); return -ENOMEM; } -int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session) +int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, + struct iscsi_session *session) { int i, j; u64 dma_addr; struct iser_rx_desc *rx_desc; struct ib_sge *rx_sg; - struct iser_device *device = ib_conn->device; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; - ib_conn->qp_max_recv_dtos = session->cmds_max; - ib_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ - ib_conn->min_posted_rx = ib_conn->qp_max_recv_dtos >> 2; + iser_conn->qp_max_recv_dtos = session->cmds_max; + iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ + iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2; if (device->iser_alloc_rdma_reg_res(ib_conn, session->scsi_cmds_max)) goto create_rdma_reg_res_failed; - if 
(iser_alloc_login_buf(ib_conn)) + if (iser_alloc_login_buf(iser_conn)) goto alloc_login_buf_fail; - ib_conn->rx_descs = kmalloc(session->cmds_max * + iser_conn->num_rx_descs = session->cmds_max; + iser_conn->rx_descs = kmalloc(iser_conn->num_rx_descs * sizeof(struct iser_rx_desc), GFP_KERNEL); - if (!ib_conn->rx_descs) + if (!iser_conn->rx_descs) goto rx_desc_alloc_fail; - rx_desc = ib_conn->rx_descs; + rx_desc = iser_conn->rx_descs; - for (i = 0; i < ib_conn->qp_max_recv_dtos; i++, rx_desc++) { + for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) { dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc, ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); if (ib_dma_mapping_error(device->ib_device, dma_addr)) @@ -289,18 +294,18 @@ int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *s rx_sg->lkey = device->mr->lkey; } - ib_conn->rx_desc_head = 0; + iser_conn->rx_desc_head = 0; return 0; rx_desc_dma_map_failed: - rx_desc = ib_conn->rx_descs; + rx_desc = iser_conn->rx_descs; for (j = 0; j < i; j++, rx_desc++) ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); - kfree(ib_conn->rx_descs); - ib_conn->rx_descs = NULL; + kfree(iser_conn->rx_descs); + iser_conn->rx_descs = NULL; rx_desc_alloc_fail: - iser_free_login_buf(ib_conn); + iser_free_login_buf(iser_conn); alloc_login_buf_fail: device->iser_free_rdma_reg_res(ib_conn); create_rdma_reg_res_failed: @@ -308,33 +313,35 @@ create_rdma_reg_res_failed: return -ENOMEM; } -void iser_free_rx_descriptors(struct iser_conn *ib_conn) +void iser_free_rx_descriptors(struct iser_conn *iser_conn) { int i; struct iser_rx_desc *rx_desc; + struct ib_conn *ib_conn = &iser_conn->ib_conn; struct iser_device *device = ib_conn->device; - if (!ib_conn->rx_descs) + if (!iser_conn->rx_descs) goto free_login_buf; if (device->iser_free_rdma_reg_res) device->iser_free_rdma_reg_res(ib_conn); - rx_desc = ib_conn->rx_descs; - for (i = 0; i < ib_conn->qp_max_recv_dtos; i++, rx_desc++) + rx_desc = iser_conn->rx_descs; + for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); - kfree(ib_conn->rx_descs); + kfree(iser_conn->rx_descs); /* make sure we never redo any unmapping */ - ib_conn->rx_descs = NULL; + iser_conn->rx_descs = NULL; free_login_buf: - iser_free_login_buf(ib_conn); + iser_free_login_buf(iser_conn); } static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) { - struct iser_conn *ib_conn = conn->dd_data; + struct iser_conn *iser_conn = conn->dd_data; + struct ib_conn *ib_conn = &iser_conn->ib_conn; struct iscsi_session *session = conn->session; iser_dbg("req op %x flags %x\n", req->opcode, req->flags); @@ -343,34 +350,37 @@ static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) return 0; /* - * Check that there is one posted recv buffer (for the last login - * response) and no posted send buffers left - they must have been - * consumed during previous login phases. + * Check that there is one posted recv buffer + * (for the last login response). 
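+ *
+ * For illustration: past the login phase the ring is re-filled in
+ * batches rather than one buffer per completion; a sketch of the
+ * re-post policy, mirroring iser_rcv_completion() later in this file
+ * (field prefixes dropped for brevity):
+ *
+ *	outstanding = post_recv_buf_count;
+ *	if (outstanding + min_posted_rx <= qp_max_recv_dtos)
+ *		iser_post_recvm(iser_conn,
+ *				min(qp_max_recv_dtos - outstanding,
+ *				    min_posted_rx));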
*/ WARN_ON(ib_conn->post_recv_buf_count != 1); - WARN_ON(atomic_read(&ib_conn->post_send_buf_count) != 0); if (session->discovery_sess) { iser_info("Discovery session, re-using login RX buffer\n"); return 0; } else iser_info("Normal session, posting batch of RX %d buffers\n", - ib_conn->min_posted_rx); + iser_conn->min_posted_rx); /* Initial post receive buffers */ - if (iser_post_recvm(ib_conn, ib_conn->min_posted_rx)) + if (iser_post_recvm(iser_conn, iser_conn->min_posted_rx)) return -ENOMEM; return 0; } +static inline bool iser_signal_comp(int sig_count) +{ + return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0); +} + /** * iser_send_command - send command PDU */ int iser_send_command(struct iscsi_conn *conn, struct iscsi_task *task) { - struct iser_conn *ib_conn = conn->dd_data; + struct iser_conn *iser_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; unsigned long edtl; int err; @@ -378,12 +388,13 @@ int iser_send_command(struct iscsi_conn *conn, struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; struct scsi_cmnd *sc = task->sc; struct iser_tx_desc *tx_desc = &iser_task->desc; + static unsigned sig_count; edtl = ntohl(hdr->data_length); /* build the tx desc regd header and add it to the tx desc dto */ tx_desc->type = ISCSI_TX_SCSI_COMMAND; - iser_create_send_desc(ib_conn, tx_desc); + iser_create_send_desc(iser_conn, tx_desc); if (hdr->flags & ISCSI_FLAG_CMD_READ) { data_buf = &iser_task->data[ISER_DIR_IN]; @@ -423,7 +434,8 @@ int iser_send_command(struct iscsi_conn *conn, iser_task->status = ISER_TASK_STATUS_STARTED; - err = iser_post_send(ib_conn, tx_desc); + err = iser_post_send(&iser_conn->ib_conn, tx_desc, + iser_signal_comp(++sig_count)); if (!err) return 0; @@ -439,7 +451,7 @@ int iser_send_data_out(struct iscsi_conn *conn, struct iscsi_task *task, struct iscsi_data *hdr) { - struct iser_conn *ib_conn = conn->dd_data; + struct iser_conn *iser_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; struct iser_tx_desc *tx_desc = NULL; struct iser_regd_buf *regd_buf; @@ -488,7 +500,7 @@ int iser_send_data_out(struct iscsi_conn *conn, itt, buf_offset, data_seg_len); - err = iser_post_send(ib_conn, tx_desc); + err = iser_post_send(&iser_conn->ib_conn, tx_desc, true); if (!err) return 0; @@ -501,7 +513,7 @@ send_data_out_error: int iser_send_control(struct iscsi_conn *conn, struct iscsi_task *task) { - struct iser_conn *ib_conn = conn->dd_data; + struct iser_conn *iser_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; struct iser_tx_desc *mdesc = &iser_task->desc; unsigned long data_seg_len; @@ -510,9 +522,9 @@ int iser_send_control(struct iscsi_conn *conn, /* build the tx desc regd header and add it to the tx desc dto */ mdesc->type = ISCSI_TX_CONTROL; - iser_create_send_desc(ib_conn, mdesc); + iser_create_send_desc(iser_conn, mdesc); - device = ib_conn->device; + device = iser_conn->ib_conn.device; data_seg_len = ntoh24(task->hdr->dlength); @@ -524,16 +536,16 @@ int iser_send_control(struct iscsi_conn *conn, } ib_dma_sync_single_for_cpu(device->ib_device, - ib_conn->login_req_dma, task->data_count, + iser_conn->login_req_dma, task->data_count, DMA_TO_DEVICE); - memcpy(ib_conn->login_req_buf, task->data, task->data_count); + memcpy(iser_conn->login_req_buf, task->data, task->data_count); ib_dma_sync_single_for_device(device->ib_device, - ib_conn->login_req_dma, task->data_count, + iser_conn->login_req_dma, task->data_count, DMA_TO_DEVICE); - tx_dsg->addr = ib_conn->login_req_dma; + tx_dsg->addr = 
iser_conn->login_req_dma; tx_dsg->length = task->data_count; tx_dsg->lkey = device->mr->lkey; mdesc->num_sge = 2; @@ -542,7 +554,7 @@ int iser_send_control(struct iscsi_conn *conn, if (task == conn->login_task) { iser_dbg("op %x dsl %lx, posting login rx buffer\n", task->hdr->opcode, data_seg_len); - err = iser_post_recvl(ib_conn); + err = iser_post_recvl(iser_conn); if (err) goto send_control_error; err = iser_post_rx_bufs(conn, task->hdr); @@ -550,7 +562,7 @@ int iser_send_control(struct iscsi_conn *conn, goto send_control_error; } - err = iser_post_send(ib_conn, mdesc); + err = iser_post_send(&iser_conn->ib_conn, mdesc, true); if (!err) return 0; @@ -564,15 +576,17 @@ send_control_error: */ void iser_rcv_completion(struct iser_rx_desc *rx_desc, unsigned long rx_xfer_len, - struct iser_conn *ib_conn) + struct ib_conn *ib_conn) { + struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, + ib_conn); struct iscsi_hdr *hdr; u64 rx_dma; int rx_buflen, outstanding, count, err; /* differentiate between login to all other PDUs */ - if ((char *)rx_desc == ib_conn->login_resp_buf) { - rx_dma = ib_conn->login_resp_dma; + if ((char *)rx_desc == iser_conn->login_resp_buf) { + rx_dma = iser_conn->login_resp_dma; rx_buflen = ISER_RX_LOGIN_SIZE; } else { rx_dma = rx_desc->dma_addr; @@ -580,14 +594,14 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc, } ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma, - rx_buflen, DMA_FROM_DEVICE); + rx_buflen, DMA_FROM_DEVICE); hdr = &rx_desc->iscsi_header; iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN)); - iscsi_iser_recv(ib_conn->iscsi_conn, hdr, rx_desc->data, + iscsi_iser_recv(iser_conn->iscsi_conn, hdr, rx_desc->data, rx_xfer_len - ISER_HEADERS_LEN); ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma, @@ -599,21 +613,21 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc, * for the posted rx bufs refcount to become zero handles everything */ ib_conn->post_recv_buf_count--; - if (rx_dma == ib_conn->login_resp_dma) + if (rx_dma == iser_conn->login_resp_dma) return; outstanding = ib_conn->post_recv_buf_count; - if (outstanding + ib_conn->min_posted_rx <= ib_conn->qp_max_recv_dtos) { - count = min(ib_conn->qp_max_recv_dtos - outstanding, - ib_conn->min_posted_rx); - err = iser_post_recvm(ib_conn, count); + if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) { + count = min(iser_conn->qp_max_recv_dtos - outstanding, + iser_conn->min_posted_rx); + err = iser_post_recvm(iser_conn, count); if (err) iser_err("posting %d rx bufs err %d\n", count, err); } } void iser_snd_completion(struct iser_tx_desc *tx_desc, - struct iser_conn *ib_conn) + struct ib_conn *ib_conn) { struct iscsi_task *task; struct iser_device *device = ib_conn->device; @@ -625,8 +639,6 @@ void iser_snd_completion(struct iser_tx_desc *tx_desc, tx_desc = NULL; } - atomic_dec(&ib_conn->post_send_buf_count); - if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) { /* this arithmetic is legal by libiscsi dd_data allocation */ task = (void *) ((long)(void *)tx_desc - @@ -658,7 +670,7 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task) void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) { - struct iser_device *device = iser_task->ib_conn->device; + struct iser_device *device = iser_task->iser_conn->ib_conn.device; int is_rdma_data_aligned = 1; int is_rdma_prot_aligned = 1; int prot_count = scsi_prot_sg_count(iser_task->sc); diff --git 
a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 47acd3ad3a17..6c5ce357fba6 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -49,7 +49,7 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, struct iser_data_buf *data_copy, enum iser_data_dir cmd_dir) { - struct ib_device *dev = iser_task->ib_conn->device->ib_device; + struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device; struct scatterlist *sgl = (struct scatterlist *)data->buf; struct scatterlist *sg; char *mem = NULL; @@ -116,7 +116,7 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, struct ib_device *dev; unsigned long cmd_data_len; - dev = iser_task->ib_conn->device->ib_device; + dev = iser_task->iser_conn->ib_conn.device->ib_device; ib_dma_unmap_sg(dev, &data_copy->sg_single, 1, (cmd_dir == ISER_DIR_OUT) ? @@ -322,7 +322,7 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, struct ib_device *dev; iser_task->dir[iser_dir] = 1; - dev = iser_task->ib_conn->device->ib_device; + dev = iser_task->iser_conn->ib_conn.device->ib_device; data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir); if (data->dma_nents == 0) { @@ -337,7 +337,7 @@ void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, { struct ib_device *dev; - dev = iser_task->ib_conn->device->ib_device; + dev = iser_task->iser_conn->ib_conn.device->ib_device; ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE); } @@ -348,7 +348,7 @@ static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir, int aligned_len) { - struct iscsi_conn *iscsi_conn = iser_task->ib_conn->iscsi_conn; + struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn; iscsi_conn->fmr_unalign_cnt++; iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n", @@ -377,7 +377,7 @@ static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir) { - struct iser_conn *ib_conn = iser_task->ib_conn; + struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; struct ib_device *ibdev = device->ib_device; struct iser_data_buf *mem = &iser_task->data[cmd_dir]; @@ -432,7 +432,7 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, ib_conn->fmr.page_vec->offset); for (i = 0; i < ib_conn->fmr.page_vec->length; i++) iser_err("page_vec[%d] = 0x%llx\n", i, - (unsigned long long) ib_conn->fmr.page_vec->pages[i]); + (unsigned long long)ib_conn->fmr.page_vec->pages[i]); } if (err) return err; @@ -440,77 +440,74 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, return 0; } -static inline enum ib_t10_dif_type -scsi2ib_prot_type(unsigned char prot_type) +static inline void +iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, + struct ib_sig_domain *domain) { - switch (prot_type) { - case SCSI_PROT_DIF_TYPE0: - return IB_T10DIF_NONE; - case SCSI_PROT_DIF_TYPE1: - return IB_T10DIF_TYPE1; - case SCSI_PROT_DIF_TYPE2: - return IB_T10DIF_TYPE2; - case SCSI_PROT_DIF_TYPE3: - return IB_T10DIF_TYPE3; - default: - return IB_T10DIF_NONE; - } -} - + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.pi_interval = sc->device->sector_size; + domain->sig.dif.ref_tag = scsi_get_lba(sc) & 0xffffffff; + /* + * At the moment we hard code those, but in the future + * we will take them from sc. 
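+ *
+ * For illustration of what this helper produces: a WRITE_INSERT on a
+ * Type-1 protected device (see iser_set_sig_attrs() below) ends up
+ * with roughly:
+ *
+ *	mem.sig_type = IB_SIG_TYPE_NONE;	no PI in memory
+ *	wire.sig_type = IB_SIG_TYPE_T10_DIF;	HCA generates PI on the wire
+ *	wire.sig.dif.bg_type = IB_T10DIF_CRC;	guard is the T10 CRC
+ *	wire.sig.dif.ref_remap = true;		ref_tag increments per block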
+ */ + domain->sig.dif.apptag_check_mask = 0xffff; + domain->sig.dif.app_escape = true; + domain->sig.dif.ref_escape = true; + if (scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE1 || + scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE2) + domain->sig.dif.ref_remap = true; +}; static int iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs) { - unsigned char scsi_ptype = scsi_get_prot_type(sc); - - sig_attrs->mem.sig_type = IB_SIG_TYPE_T10_DIF; - sig_attrs->wire.sig_type = IB_SIG_TYPE_T10_DIF; - sig_attrs->mem.sig.dif.pi_interval = sc->device->sector_size; - sig_attrs->wire.sig.dif.pi_interval = sc->device->sector_size; - switch (scsi_get_prot_op(sc)) { case SCSI_PROT_WRITE_INSERT: case SCSI_PROT_READ_STRIP: - sig_attrs->mem.sig.dif.type = IB_T10DIF_NONE; - sig_attrs->wire.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE; + iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire); sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; - sig_attrs->wire.sig.dif.ref_tag = scsi_get_lba(sc) & - 0xffffffff; break; case SCSI_PROT_READ_INSERT: case SCSI_PROT_WRITE_STRIP: - sig_attrs->mem.sig.dif.type = scsi2ib_prot_type(scsi_ptype); - sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; - sig_attrs->mem.sig.dif.ref_tag = scsi_get_lba(sc) & - 0xffffffff; - sig_attrs->wire.sig.dif.type = IB_T10DIF_NONE; + sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; + iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); + /* + * At the moment we use this modparam to tell what is + * the memory bg_type, in the future we will take it + * from sc. + */ + sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM : + IB_T10DIF_CRC; break; case SCSI_PROT_READ_PASS: case SCSI_PROT_WRITE_PASS: - sig_attrs->mem.sig.dif.type = scsi2ib_prot_type(scsi_ptype); - sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; - sig_attrs->mem.sig.dif.ref_tag = scsi_get_lba(sc) & - 0xffffffff; - sig_attrs->wire.sig.dif.type = scsi2ib_prot_type(scsi_ptype); + iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire); sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; - sig_attrs->wire.sig.dif.ref_tag = scsi_get_lba(sc) & - 0xffffffff; + iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); + /* + * At the moment we use this modparam to tell what is + * the memory bg_type, in the future we will take it + * from sc. + */ + sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? 
IB_T10DIF_CSUM : + IB_T10DIF_CRC; break; default: iser_err("Unsupported PI operation %d\n", scsi_get_prot_op(sc)); return -EINVAL; } + return 0; } - static int iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) { switch (scsi_get_prot_type(sc)) { case SCSI_PROT_DIF_TYPE0: - *mask = 0x0; break; case SCSI_PROT_DIF_TYPE1: case SCSI_PROT_DIF_TYPE2: @@ -533,7 +530,7 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, struct fast_reg_descriptor *desc, struct ib_sge *data_sge, struct ib_sge *prot_sge, struct ib_sge *sig_sge) { - struct iser_conn *ib_conn = iser_task->ib_conn; + struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; struct iser_pi_context *pi_ctx = desc->pi_ctx; struct ib_send_wr sig_wr, inv_wr; struct ib_send_wr *bad_wr, *wr = NULL; @@ -609,7 +606,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, struct ib_sge *sge) { struct fast_reg_descriptor *desc = regd_buf->reg.mem_h; - struct iser_conn *ib_conn = iser_task->ib_conn; + struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; struct ib_device *ibdev = device->ib_device; struct ib_mr *mr; @@ -700,7 +697,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir) { - struct iser_conn *ib_conn = iser_task->ib_conn; + struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; struct ib_device *ibdev = device->ib_device; struct iser_data_buf *mem = &iser_task->data[cmd_dir]; diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 3bfec4bbda52..67225bb82bb5 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -39,8 +39,12 @@ #include "iscsi_iser.h" #define ISCSI_ISER_MAX_CONN 8 -#define ISER_MAX_RX_CQ_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN) -#define ISER_MAX_TX_CQ_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_RX_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_TX_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \ + ISCSI_ISER_MAX_CONN) + +static int iser_cq_poll_limit = 512; static void iser_cq_tasklet_fn(unsigned long data); static void iser_cq_callback(struct ib_cq *cq, void *cq_context); @@ -71,7 +75,6 @@ static void iser_event_handler(struct ib_event_handler *handler, */ static int iser_create_device_ib_res(struct iser_device *device) { - struct iser_cq_desc *cq_desc; struct ib_device_attr *dev_attr = &device->dev_attr; int ret, i; @@ -101,51 +104,35 @@ static int iser_create_device_ib_res(struct iser_device *device) return -1; } - device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors); + device->comps_used = min(ISER_MAX_CQ, + device->ib_device->num_comp_vectors); iser_info("using %d CQs, device %s supports %d vectors\n", - device->cqs_used, device->ib_device->name, + device->comps_used, device->ib_device->name, device->ib_device->num_comp_vectors); - device->cq_desc = kmalloc(sizeof(struct iser_cq_desc) * device->cqs_used, - GFP_KERNEL); - if (device->cq_desc == NULL) - goto cq_desc_err; - cq_desc = device->cq_desc; - device->pd = ib_alloc_pd(device->ib_device); if (IS_ERR(device->pd)) goto pd_err; - for (i = 0; i < device->cqs_used; i++) { - cq_desc[i].device = device; - cq_desc[i].cq_index = i; - - device->rx_cq[i] = ib_create_cq(device->ib_device, - iser_cq_callback, - 
iser_cq_event_callback, - (void *)&cq_desc[i], - ISER_MAX_RX_CQ_LEN, i); - if (IS_ERR(device->rx_cq[i])) { - device->rx_cq[i] = NULL; + for (i = 0; i < device->comps_used; i++) { + struct iser_comp *comp = &device->comps[i]; + + comp->device = device; + comp->cq = ib_create_cq(device->ib_device, + iser_cq_callback, + iser_cq_event_callback, + (void *)comp, + ISER_MAX_CQ_LEN, i); + if (IS_ERR(comp->cq)) { + comp->cq = NULL; goto cq_err; } - device->tx_cq[i] = ib_create_cq(device->ib_device, - NULL, iser_cq_event_callback, - (void *)&cq_desc[i], - ISER_MAX_TX_CQ_LEN, i); - - if (IS_ERR(device->tx_cq[i])) { - device->tx_cq[i] = NULL; + if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP)) goto cq_err; - } - if (ib_req_notify_cq(device->rx_cq[i], IB_CQ_NEXT_COMP)) - goto cq_err; - - tasklet_init(&device->cq_tasklet[i], - iser_cq_tasklet_fn, - (unsigned long)&cq_desc[i]); + tasklet_init(&comp->tasklet, iser_cq_tasklet_fn, + (unsigned long)comp); } device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE | @@ -164,19 +151,17 @@ static int iser_create_device_ib_res(struct iser_device *device) handler_err: ib_dereg_mr(device->mr); dma_mr_err: - for (i = 0; i < device->cqs_used; i++) - tasklet_kill(&device->cq_tasklet[i]); + for (i = 0; i < device->comps_used; i++) + tasklet_kill(&device->comps[i].tasklet); cq_err: - for (i = 0; i < device->cqs_used; i++) { - if (device->tx_cq[i]) - ib_destroy_cq(device->tx_cq[i]); - if (device->rx_cq[i]) - ib_destroy_cq(device->rx_cq[i]); + for (i = 0; i < device->comps_used; i++) { + struct iser_comp *comp = &device->comps[i]; + + if (comp->cq) + ib_destroy_cq(comp->cq); } ib_dealloc_pd(device->pd); pd_err: - kfree(device->cq_desc); -cq_desc_err: iser_err("failed to allocate an IB resource\n"); return -1; } @@ -190,20 +175,18 @@ static void iser_free_device_ib_res(struct iser_device *device) int i; BUG_ON(device->mr == NULL); - for (i = 0; i < device->cqs_used; i++) { - tasklet_kill(&device->cq_tasklet[i]); - (void)ib_destroy_cq(device->tx_cq[i]); - (void)ib_destroy_cq(device->rx_cq[i]); - device->tx_cq[i] = NULL; - device->rx_cq[i] = NULL; + for (i = 0; i < device->comps_used; i++) { + struct iser_comp *comp = &device->comps[i]; + + tasklet_kill(&comp->tasklet); + ib_destroy_cq(comp->cq); + comp->cq = NULL; } (void)ib_unregister_event_handler(&device->event_handler); (void)ib_dereg_mr(device->mr); (void)ib_dealloc_pd(device->pd); - kfree(device->cq_desc); - device->mr = NULL; device->pd = NULL; } @@ -213,7 +196,7 @@ static void iser_free_device_ib_res(struct iser_device *device) * * returns 0 on success, or errno code on failure */ -int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max) +int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max) { struct iser_device *device = ib_conn->device; struct ib_fmr_pool_param params; @@ -263,7 +246,7 @@ int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max) /** * iser_free_fmr_pool - releases the FMR pool and page vec */ -void iser_free_fmr_pool(struct iser_conn *ib_conn) +void iser_free_fmr_pool(struct ib_conn *ib_conn) { iser_info("freeing conn %p fmr pool %p\n", ib_conn, ib_conn->fmr.pool); @@ -367,10 +350,10 @@ fast_reg_mr_failure: * for fast registration work requests. 
* returns 0 on success, or errno code on failure */ -int iser_create_fastreg_pool(struct iser_conn *ib_conn, unsigned cmds_max) +int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max) { - struct iser_device *device = ib_conn->device; - struct fast_reg_descriptor *desc; + struct iser_device *device = ib_conn->device; + struct fast_reg_descriptor *desc; int i, ret; INIT_LIST_HEAD(&ib_conn->fastreg.pool); @@ -406,7 +389,7 @@ err: /** * iser_free_fastreg_pool - releases the pool of fast_reg descriptors */ -void iser_free_fastreg_pool(struct iser_conn *ib_conn) +void iser_free_fastreg_pool(struct ib_conn *ib_conn) { struct fast_reg_descriptor *desc, *tmp; int i = 0; @@ -440,7 +423,7 @@ void iser_free_fastreg_pool(struct iser_conn *ib_conn) * * returns 0 on success, -1 on failure */ -static int iser_create_ib_conn_res(struct iser_conn *ib_conn) +static int iser_create_ib_conn_res(struct ib_conn *ib_conn) { struct iser_device *device; struct ib_qp_init_attr init_attr; @@ -455,28 +438,30 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn) mutex_lock(&ig.connlist_mutex); /* select the CQ with the minimal number of usages */ - for (index = 0; index < device->cqs_used; index++) - if (device->cq_active_qps[index] < - device->cq_active_qps[min_index]) + for (index = 0; index < device->comps_used; index++) { + if (device->comps[index].active_qps < + device->comps[min_index].active_qps) min_index = index; - device->cq_active_qps[min_index]++; + } + ib_conn->comp = &device->comps[min_index]; + ib_conn->comp->active_qps++; mutex_unlock(&ig.connlist_mutex); iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn); init_attr.event_handler = iser_qp_event_callback; init_attr.qp_context = (void *)ib_conn; - init_attr.send_cq = device->tx_cq[min_index]; - init_attr.recv_cq = device->rx_cq[min_index]; + init_attr.send_cq = ib_conn->comp->cq; + init_attr.recv_cq = ib_conn->comp->cq; init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; init_attr.cap.max_send_sge = 2; init_attr.cap.max_recv_sge = 1; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; init_attr.qp_type = IB_QPT_RC; if (ib_conn->pi_support) { - init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS; + init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1; init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; } else { - init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS; + init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1; } ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); @@ -495,30 +480,6 @@ out_err: } /** - * releases the QP object - */ -static void iser_free_ib_conn_res(struct iser_conn *ib_conn) -{ - int cq_index; - BUG_ON(ib_conn == NULL); - - iser_info("freeing conn %p cma_id %p qp %p\n", - ib_conn, ib_conn->cma_id, - ib_conn->qp); - - /* qp is created only once both addr & route are resolved */ - - if (ib_conn->qp != NULL) { - cq_index = ((struct iser_cq_desc *)ib_conn->qp->recv_cq->cq_context)->cq_index; - ib_conn->device->cq_active_qps[cq_index]--; - - rdma_destroy_qp(ib_conn->cma_id); - } - - ib_conn->qp = NULL; -} - -/** * based on the resolved device node GUID see if there already allocated * device for this device. If there's no such, create one. 
*/ @@ -572,88 +533,142 @@ static void iser_device_try_release(struct iser_device *device) /** * Called with state mutex held **/ -static int iser_conn_state_comp_exch(struct iser_conn *ib_conn, - enum iser_ib_conn_state comp, - enum iser_ib_conn_state exch) +static int iser_conn_state_comp_exch(struct iser_conn *iser_conn, + enum iser_conn_state comp, + enum iser_conn_state exch) { int ret; - if ((ret = (ib_conn->state == comp))) - ib_conn->state = exch; + ret = (iser_conn->state == comp); + if (ret) + iser_conn->state = exch; + return ret; } void iser_release_work(struct work_struct *work) { - struct iser_conn *ib_conn; - int rc; + struct iser_conn *iser_conn; - ib_conn = container_of(work, struct iser_conn, release_work); + iser_conn = container_of(work, struct iser_conn, release_work); - /* wait for .conn_stop callback */ - rc = wait_for_completion_timeout(&ib_conn->stop_completion, 30 * HZ); - WARN_ON(rc == 0); + /* Wait for conn_stop to complete */ + wait_for_completion(&iser_conn->stop_completion); + /* Wait for IB resources cleanup to complete */ + wait_for_completion(&iser_conn->ib_completion); - /* wait for the qp`s post send and post receive buffers to empty */ - rc = wait_for_completion_timeout(&ib_conn->flush_completion, 30 * HZ); - WARN_ON(rc == 0); + mutex_lock(&iser_conn->state_mutex); + iser_conn->state = ISER_CONN_DOWN; + mutex_unlock(&iser_conn->state_mutex); - ib_conn->state = ISER_CONN_DOWN; + iser_conn_release(iser_conn); +} + +/** + * iser_free_ib_conn_res - release IB related resources + * @iser_conn: iser connection struct + * @destroy_device: indicator if we need to try to release + * the iser device (only iscsi shutdown and DEVICE_REMOVAL + * will use this). + * + * This routine is called with the iser state mutex held + * so the cm_id removal is handled outside of it. It is safe to + * be invoked multiple times. 
+ */ +static void iser_free_ib_conn_res(struct iser_conn *iser_conn, + bool destroy_device) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; - mutex_lock(&ib_conn->state_mutex); - ib_conn->state = ISER_CONN_DOWN; - mutex_unlock(&ib_conn->state_mutex); + iser_info("freeing conn %p cma_id %p qp %p\n", + iser_conn, ib_conn->cma_id, ib_conn->qp); + + iser_free_rx_descriptors(iser_conn); - iser_conn_release(ib_conn); + if (ib_conn->qp != NULL) { + ib_conn->comp->active_qps--; + rdma_destroy_qp(ib_conn->cma_id); + ib_conn->qp = NULL; + } + + if (destroy_device && device != NULL) { + iser_device_try_release(device); + ib_conn->device = NULL; + } } /** * Frees all conn objects and deallocs conn descriptor */ -void iser_conn_release(struct iser_conn *ib_conn) +void iser_conn_release(struct iser_conn *iser_conn) { - struct iser_device *device = ib_conn->device; + struct ib_conn *ib_conn = &iser_conn->ib_conn; mutex_lock(&ig.connlist_mutex); - list_del(&ib_conn->conn_list); + list_del(&iser_conn->conn_list); mutex_unlock(&ig.connlist_mutex); - mutex_lock(&ib_conn->state_mutex); - BUG_ON(ib_conn->state != ISER_CONN_DOWN); - - iser_free_rx_descriptors(ib_conn); - iser_free_ib_conn_res(ib_conn); - ib_conn->device = NULL; - /* on EVENT_ADDR_ERROR there's no device yet for this conn */ - if (device != NULL) - iser_device_try_release(device); - mutex_unlock(&ib_conn->state_mutex); + mutex_lock(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_DOWN) + iser_warn("iser conn %p state %d, expected state down.\n", + iser_conn, iser_conn->state); + /* + * In case we never got to bind stage, we still need to + * release IB resources (which is safe to call more than once). + */ + iser_free_ib_conn_res(iser_conn, true); + mutex_unlock(&iser_conn->state_mutex); - /* if cma handler context, the caller actually destroy the id */ if (ib_conn->cma_id != NULL) { rdma_destroy_id(ib_conn->cma_id); ib_conn->cma_id = NULL; } + + kfree(iser_conn); } /** * triggers start of the disconnect procedures and wait for them to be done + * Called with state mutex held */ -void iser_conn_terminate(struct iser_conn *ib_conn) +int iser_conn_terminate(struct iser_conn *iser_conn) { + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct ib_send_wr *bad_wr; int err = 0; - /* change the ib conn state only if the conn is UP, however always call - * rdma_disconnect since this is the only way to cause the CMA to change - * the QP state to ERROR + /* terminate the iser conn only if the conn state is UP */ + if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, + ISER_CONN_TERMINATING)) + return 0; + + iser_info("iser_conn %p state %d\n", iser_conn, iser_conn->state); + + /* suspend queuing of new iscsi commands */ + if (iser_conn->iscsi_conn) + iscsi_suspend_queue(iser_conn->iscsi_conn); + + /* + * In case we didn't already clean up the cma_id (peer initiated + * a disconnection), we need to cause the CMA to change the QP + * state to ERROR. 
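+ *
+ * The resulting drain sequence, as a sketch (the beacon WR is set up
+ * in iser_connect(), flush_comp is completed in iser_handle_wc()):
+ *
+ *	rdma_disconnect(cma_id);		QP moves to ERROR, WRs flush
+ *	ib_post_send(qp, &beacon);		wr_id == ISER_BEACON_WRID
+ *	wait_for_completion(&flush_comp);	fires once the flushed beacon
+ *						is reaped, i.e. no earlier
+ *						WR can still be in flight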
*/ + if (ib_conn->cma_id) { + err = rdma_disconnect(ib_conn->cma_id); + if (err) + iser_err("Failed to disconnect, conn: 0x%p err %d\n", + iser_conn, err); + + /* post an indication that all flush errors were consumed */ + err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr); + if (err) + iser_err("conn %p failed to post beacon", ib_conn); + + wait_for_completion(&ib_conn->flush_comp); + } - iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, ISER_CONN_TERMINATING); - err = rdma_disconnect(ib_conn->cma_id); - if (err) - iser_err("Failed to disconnect, conn: 0x%p err %d\n", - ib_conn,err); + return 1; } /** @@ -661,10 +676,10 @@ void iser_conn_terminate(struct iser_conn *ib_conn) **/ static void iser_connect_error(struct rdma_cm_id *cma_id) { - struct iser_conn *ib_conn; + struct iser_conn *iser_conn; - ib_conn = (struct iser_conn *)cma_id->context; - ib_conn->state = ISER_CONN_DOWN; + iser_conn = (struct iser_conn *)cma_id->context; + iser_conn->state = ISER_CONN_DOWN; } /** @@ -673,14 +688,16 @@ static void iser_connect_error(struct rdma_cm_id *cma_id) static void iser_addr_handler(struct rdma_cm_id *cma_id) { struct iser_device *device; - struct iser_conn *ib_conn; + struct iser_conn *iser_conn; + struct ib_conn *ib_conn; int ret; - ib_conn = (struct iser_conn *)cma_id->context; - if (ib_conn->state != ISER_CONN_PENDING) + iser_conn = (struct iser_conn *)cma_id->context; + if (iser_conn->state != ISER_CONN_PENDING) /* bailout */ return; + ib_conn = &iser_conn->ib_conn; device = iser_device_find_by_ib_device(cma_id); if (!device) { iser_err("device lookup/creation failed\n"); @@ -719,14 +736,15 @@ static void iser_route_handler(struct rdma_cm_id *cma_id) struct rdma_conn_param conn_param; int ret; struct iser_cm_hdr req_hdr; - struct iser_conn *ib_conn = (struct iser_conn *)cma_id->context; + struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; + struct ib_conn *ib_conn = &iser_conn->ib_conn; struct iser_device *device = ib_conn->device; - if (ib_conn->state != ISER_CONN_PENDING) + if (iser_conn->state != ISER_CONN_PENDING) /* bailout */ return; - ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context); + ret = iser_create_ib_conn_res(ib_conn); if (ret) goto failure; @@ -755,57 +773,60 @@ failure: static void iser_connected_handler(struct rdma_cm_id *cma_id) { - struct iser_conn *ib_conn; + struct iser_conn *iser_conn; struct ib_qp_attr attr; struct ib_qp_init_attr init_attr; - ib_conn = (struct iser_conn *)cma_id->context; - if (ib_conn->state != ISER_CONN_PENDING) + iser_conn = (struct iser_conn *)cma_id->context; + if (iser_conn->state != ISER_CONN_PENDING) /* bailout */ return; (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num); - ib_conn->state = ISER_CONN_UP; - complete(&ib_conn->up_completion); + iser_conn->state = ISER_CONN_UP; + complete(&iser_conn->up_completion); } static void iser_disconnected_handler(struct rdma_cm_id *cma_id) { - struct iser_conn *ib_conn; - - ib_conn = (struct iser_conn *)cma_id->context; + struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; - /* getting here when the state is UP means that the conn is being * - * terminated asynchronously from the iSCSI layer's perspective. 
*/ - if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, - ISER_CONN_TERMINATING)){ - if (ib_conn->iscsi_conn) - iscsi_conn_failure(ib_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); + if (iser_conn_terminate(iser_conn)) { + if (iser_conn->iscsi_conn) + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); else iser_err("iscsi_iser connection isn't bound\n"); } +} + +static void iser_cleanup_handler(struct rdma_cm_id *cma_id, + bool destroy_device) +{ + struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; - /* Complete the termination process if no posts are pending. This code - * block also exists in iser_handle_comp_error(), but it is needed here - * for cases of no flushes at all, e.g. discovery over rdma. + /* + * We are not guaranteed that we visited disconnected_handler + * by now, call it here to be safe that we handle CM drep + * and flush errors. */ - if (ib_conn->post_recv_buf_count == 0 && - (atomic_read(&ib_conn->post_send_buf_count) == 0)) { - complete(&ib_conn->flush_completion); - } -} + iser_disconnected_handler(cma_id); + iser_free_ib_conn_res(iser_conn, destroy_device); + complete(&iser_conn->ib_completion); +}; static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { - struct iser_conn *ib_conn; + struct iser_conn *iser_conn; + int ret = 0; - ib_conn = (struct iser_conn *)cma_id->context; + iser_conn = (struct iser_conn *)cma_id->context; iser_info("event %d status %d conn %p id %p\n", event->event, event->status, cma_id->context, cma_id); - mutex_lock(&ib_conn->state_mutex); + mutex_lock(&iser_conn->state_mutex); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: iser_addr_handler(cma_id); @@ -824,57 +845,73 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve iser_connect_error(cma_id); break; case RDMA_CM_EVENT_DISCONNECTED: - case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_ADDR_CHANGE: - case RDMA_CM_EVENT_TIMEWAIT_EXIT: iser_disconnected_handler(cma_id); break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* + * we *must* destroy the device as we cannot rely + * on iscsid to be around to initiate error handling. + * also implicitly destroy the cma_id. 
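+ *
+ * Note that returning a non-zero value from an rdma_cm event
+ * handler tells the RDMA CM to destroy the cma_id on our behalf,
+ * which is why ret is set to 1 and cma_id is cleared below.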
+ */ + iser_cleanup_handler(cma_id, true); + iser_conn->ib_conn.cma_id = NULL; + ret = 1; + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + iser_cleanup_handler(cma_id, false); + break; default: iser_err("Unexpected RDMA CM event (%d)\n", event->event); break; } - mutex_unlock(&ib_conn->state_mutex); - return 0; + mutex_unlock(&iser_conn->state_mutex); + + return ret; } -void iser_conn_init(struct iser_conn *ib_conn) +void iser_conn_init(struct iser_conn *iser_conn) { - ib_conn->state = ISER_CONN_INIT; - ib_conn->post_recv_buf_count = 0; - atomic_set(&ib_conn->post_send_buf_count, 0); - init_completion(&ib_conn->stop_completion); - init_completion(&ib_conn->flush_completion); - init_completion(&ib_conn->up_completion); - INIT_LIST_HEAD(&ib_conn->conn_list); - spin_lock_init(&ib_conn->lock); - mutex_init(&ib_conn->state_mutex); + iser_conn->state = ISER_CONN_INIT; + iser_conn->ib_conn.post_recv_buf_count = 0; + init_completion(&iser_conn->ib_conn.flush_comp); + init_completion(&iser_conn->stop_completion); + init_completion(&iser_conn->ib_completion); + init_completion(&iser_conn->up_completion); + INIT_LIST_HEAD(&iser_conn->conn_list); + spin_lock_init(&iser_conn->ib_conn.lock); + mutex_init(&iser_conn->state_mutex); } /** * starts the process of connecting to the target * sleeps until the connection is established or rejected */ -int iser_connect(struct iser_conn *ib_conn, +int iser_connect(struct iser_conn *iser_conn, struct sockaddr *src_addr, struct sockaddr *dst_addr, int non_blocking) { + struct ib_conn *ib_conn = &iser_conn->ib_conn; int err = 0; - mutex_lock(&ib_conn->state_mutex); + mutex_lock(&iser_conn->state_mutex); - sprintf(ib_conn->name, "%pISp", dst_addr); + sprintf(iser_conn->name, "%pISp", dst_addr); - iser_info("connecting to: %s\n", ib_conn->name); + iser_info("connecting to: %s\n", iser_conn->name); /* the device is known only --after-- address resolution */ ib_conn->device = NULL; - ib_conn->state = ISER_CONN_PENDING; + iser_conn->state = ISER_CONN_PENDING; + + ib_conn->beacon.wr_id = ISER_BEACON_WRID; + ib_conn->beacon.opcode = IB_WR_SEND; ib_conn->cma_id = rdma_create_id(iser_cma_handler, - (void *)ib_conn, - RDMA_PS_TCP, IB_QPT_RC); + (void *)iser_conn, + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ib_conn->cma_id)) { err = PTR_ERR(ib_conn->cma_id); iser_err("rdma_create_id failed: %d\n", err); @@ -888,27 +925,27 @@ int iser_connect(struct iser_conn *ib_conn, } if (!non_blocking) { - wait_for_completion_interruptible(&ib_conn->up_completion); + wait_for_completion_interruptible(&iser_conn->up_completion); - if (ib_conn->state != ISER_CONN_UP) { + if (iser_conn->state != ISER_CONN_UP) { err = -EIO; goto connect_failure; } } - mutex_unlock(&ib_conn->state_mutex); + mutex_unlock(&iser_conn->state_mutex); mutex_lock(&ig.connlist_mutex); - list_add(&ib_conn->conn_list, &ig.connlist); + list_add(&iser_conn->conn_list, &ig.connlist); mutex_unlock(&ig.connlist_mutex); return 0; id_failure: ib_conn->cma_id = NULL; addr_failure: - ib_conn->state = ISER_CONN_DOWN; + iser_conn->state = ISER_CONN_DOWN; connect_failure: - mutex_unlock(&ib_conn->state_mutex); - iser_conn_release(ib_conn); + mutex_unlock(&iser_conn->state_mutex); + iser_conn_release(iser_conn); return err; } @@ -917,7 +954,7 @@ connect_failure: * * returns: 0 on success, errno code on failure */ -int iser_reg_page_vec(struct iser_conn *ib_conn, +int iser_reg_page_vec(struct ib_conn *ib_conn, struct iser_page_vec *page_vec, struct iser_mem_reg *mem_reg) { @@ -987,7 +1024,8 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task 
*iser_task, enum iser_data_dir cmd_dir) { struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; - struct iser_conn *ib_conn = iser_task->ib_conn; + struct iser_conn *iser_conn = iser_task->iser_conn; + struct ib_conn *ib_conn = &iser_conn->ib_conn; struct fast_reg_descriptor *desc = reg->mem_h; if (!reg->is_mr) @@ -1000,17 +1038,18 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, spin_unlock_bh(&ib_conn->lock); } -int iser_post_recvl(struct iser_conn *ib_conn) +int iser_post_recvl(struct iser_conn *iser_conn) { struct ib_recv_wr rx_wr, *rx_wr_failed; + struct ib_conn *ib_conn = &iser_conn->ib_conn; struct ib_sge sge; int ib_ret; - sge.addr = ib_conn->login_resp_dma; + sge.addr = iser_conn->login_resp_dma; sge.length = ISER_RX_LOGIN_SIZE; sge.lkey = ib_conn->device->mr->lkey; - rx_wr.wr_id = (unsigned long)ib_conn->login_resp_buf; + rx_wr.wr_id = (unsigned long)iser_conn->login_resp_buf; rx_wr.sg_list = &sge; rx_wr.num_sge = 1; rx_wr.next = NULL; @@ -1024,20 +1063,21 @@ int iser_post_recvl(struct iser_conn *ib_conn) return ib_ret; } -int iser_post_recvm(struct iser_conn *ib_conn, int count) +int iser_post_recvm(struct iser_conn *iser_conn, int count) { struct ib_recv_wr *rx_wr, *rx_wr_failed; int i, ib_ret; - unsigned int my_rx_head = ib_conn->rx_desc_head; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + unsigned int my_rx_head = iser_conn->rx_desc_head; struct iser_rx_desc *rx_desc; for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { - rx_desc = &ib_conn->rx_descs[my_rx_head]; + rx_desc = &iser_conn->rx_descs[my_rx_head]; rx_wr->wr_id = (unsigned long)rx_desc; rx_wr->sg_list = &rx_desc->rx_sg; rx_wr->num_sge = 1; rx_wr->next = rx_wr + 1; - my_rx_head = (my_rx_head + 1) & ib_conn->qp_max_recv_dtos_mask; + my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask; } rx_wr--; @@ -1049,7 +1089,7 @@ int iser_post_recvm(struct iser_conn *ib_conn, int count) iser_err("ib_post_recv failed ret=%d\n", ib_ret); ib_conn->post_recv_buf_count -= count; } else - ib_conn->rx_desc_head = my_rx_head; + iser_conn->rx_desc_head = my_rx_head; return ib_ret; } @@ -1059,139 +1099,166 @@ int iser_post_recvm(struct iser_conn *ib_conn, int count) * * returns 0 on success, -1 on failure */ -int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc) +int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, + bool signal) { int ib_ret; struct ib_send_wr send_wr, *send_wr_failed; ib_dma_sync_single_for_device(ib_conn->device->ib_device, - tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); + tx_desc->dma_addr, ISER_HEADERS_LEN, + DMA_TO_DEVICE); send_wr.next = NULL; send_wr.wr_id = (unsigned long)tx_desc; send_wr.sg_list = tx_desc->tx_sg; send_wr.num_sge = tx_desc->num_sge; send_wr.opcode = IB_WR_SEND; - send_wr.send_flags = IB_SEND_SIGNALED; - - atomic_inc(&ib_conn->post_send_buf_count); + send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0; ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed); - if (ib_ret) { + if (ib_ret) iser_err("ib_post_send failed, ret:%d\n", ib_ret); - atomic_dec(&ib_conn->post_send_buf_count); - } + return ib_ret; } -static void iser_handle_comp_error(struct iser_tx_desc *desc, - struct iser_conn *ib_conn) +/** + * is_iser_tx_desc - Indicate if the completion wr_id + * is a TX descriptor or not. 
+ * @iser_conn: iser connection + * @wr_id: completion WR identifier + * + * Since we cannot rely on the wc opcode in FLUSH errors, + * we work around it by checking whether the wr_id address + * falls in the iser connection rx_descs buffer. If so + * it is an RX descriptor, otherwise it is a TX. + */ +static inline bool +is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id) +{ + void *start = iser_conn->rx_descs; + int len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs); + + if (wr_id >= start && wr_id < start + len) + return false; + + return true; +} + +/** + * iser_handle_comp_error() - Handle error completion + * @ib_conn: connection RDMA resources + * @wc: work completion + * + * Notes: We may handle a FLUSH error completion and in this case + * we only clean up when the TX type was DATAOUT. For non-FLUSH + * error completions we should also notify the iscsi layer that + * the connection failed (in case we passed the bind stage). + */ +static void +iser_handle_comp_error(struct ib_conn *ib_conn, + struct ib_wc *wc) { - if (desc && desc->type == ISCSI_TX_DATAOUT) - kmem_cache_free(ig.desc_cache, desc); - - if (ib_conn->post_recv_buf_count == 0 && - atomic_read(&ib_conn->post_send_buf_count) == 0) { - /** - * getting here when the state is UP means that the conn is - * being terminated asynchronously from the iSCSI layer's - * perspective. It is safe to peek at the connection state - * since iscsi_conn_failure is allowed to be called twice. - **/ - if (ib_conn->state == ISER_CONN_UP) - iscsi_conn_failure(ib_conn->iscsi_conn, + struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, + ib_conn); + + if (wc->status != IB_WC_WR_FLUSH_ERR) + if (iser_conn->iscsi_conn) + iscsi_conn_failure(iser_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); - /* no more non completed posts to the QP, complete the - * termination process w.o worrying on disconnect event */ - complete(&ib_conn->flush_completion); + if (is_iser_tx_desc(iser_conn, (void *)wc->wr_id)) { + struct iser_tx_desc *desc = (struct iser_tx_desc *)wc->wr_id; + + if (desc->type == ISCSI_TX_DATAOUT) + kmem_cache_free(ig.desc_cache, desc); + } else { + ib_conn->post_recv_buf_count--; + } } -static int iser_drain_tx_cq(struct iser_device *device, int cq_index) +/** + * iser_handle_wc - handle a single work completion + * @wc: work completion + * + * Soft-IRQ context, work completion can be either + * SEND or RECV, and can turn out successful or + * with error (or flush error). 
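+ *
+ * Dispatch summary of the handler below:
+ *
+ *	success, IB_WC_RECV		-> iser_rcv_completion()
+ *	success, IB_WC_SEND		-> iser_snd_completion()
+ *	error, wr_id not LI/beacon	-> iser_handle_comp_error()
+ *	flush error of the beacon	-> complete(&ib_conn->flush_comp)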
+ */ +static void iser_handle_wc(struct ib_wc *wc) { - struct ib_cq *cq = device->tx_cq[cq_index]; - struct ib_wc wc; + struct ib_conn *ib_conn; struct iser_tx_desc *tx_desc; - struct iser_conn *ib_conn; - int completed_tx = 0; - - while (ib_poll_cq(cq, 1, &wc) == 1) { - tx_desc = (struct iser_tx_desc *) (unsigned long) wc.wr_id; - ib_conn = wc.qp->qp_context; - if (wc.status == IB_WC_SUCCESS) { - if (wc.opcode == IB_WC_SEND) - iser_snd_completion(tx_desc, ib_conn); - else - iser_err("expected opcode %d got %d\n", - IB_WC_SEND, wc.opcode); + struct iser_rx_desc *rx_desc; + + ib_conn = wc->qp->qp_context; + if (wc->status == IB_WC_SUCCESS) { + if (wc->opcode == IB_WC_RECV) { + rx_desc = (struct iser_rx_desc *)wc->wr_id; + iser_rcv_completion(rx_desc, wc->byte_len, + ib_conn); + } else + if (wc->opcode == IB_WC_SEND) { + tx_desc = (struct iser_tx_desc *)wc->wr_id; + iser_snd_completion(tx_desc, ib_conn); } else { - iser_err("tx id %llx status %d vend_err %x\n", - wc.wr_id, wc.status, wc.vendor_err); - if (wc.wr_id != ISER_FASTREG_LI_WRID) { - atomic_dec(&ib_conn->post_send_buf_count); - iser_handle_comp_error(tx_desc, ib_conn); - } + iser_err("Unknown wc opcode %d\n", wc->opcode); } - completed_tx++; + } else { + if (wc->status != IB_WC_WR_FLUSH_ERR) + iser_err("wr id %llx status %d vend_err %x\n", + wc->wr_id, wc->status, wc->vendor_err); + else + iser_dbg("flush error: wr id %llx\n", wc->wr_id); + + if (wc->wr_id != ISER_FASTREG_LI_WRID && + wc->wr_id != ISER_BEACON_WRID) + iser_handle_comp_error(ib_conn, wc); + + /* complete in case all flush errors were consumed */ + if (wc->wr_id == ISER_BEACON_WRID) + complete(&ib_conn->flush_comp); } - return completed_tx; } - +/** + * iser_cq_tasklet_fn - iSER completion polling loop + * @data: iSER completion context + * + * Soft-IRQ context, polling connection CQ until + * either the CQ is empty or we exhausted the polling budget + */ static void iser_cq_tasklet_fn(unsigned long data) { - struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)data; - struct iser_device *device = cq_desc->device; - int cq_index = cq_desc->cq_index; - struct ib_cq *cq = device->rx_cq[cq_index]; - struct ib_wc wc; - struct iser_rx_desc *desc; - unsigned long xfer_len; - struct iser_conn *ib_conn; - int completed_tx, completed_rx = 0; - - /* First do tx drain, so in a case where we have rx flushes and a successful - * tx completion we will still go through completion error handling. 
- */ - completed_tx = iser_drain_tx_cq(device, cq_index); - - while (ib_poll_cq(cq, 1, &wc) == 1) { - desc = (struct iser_rx_desc *) (unsigned long) wc.wr_id; - BUG_ON(desc == NULL); - ib_conn = wc.qp->qp_context; - if (wc.status == IB_WC_SUCCESS) { - if (wc.opcode == IB_WC_RECV) { - xfer_len = (unsigned long)wc.byte_len; - iser_rcv_completion(desc, xfer_len, ib_conn); - } else - iser_err("expected opcode %d got %d\n", - IB_WC_RECV, wc.opcode); - } else { - if (wc.status != IB_WC_WR_FLUSH_ERR) - iser_err("rx id %llx status %d vend_err %x\n", - wc.wr_id, wc.status, wc.vendor_err); - ib_conn->post_recv_buf_count--; - iser_handle_comp_error(NULL, ib_conn); - } - completed_rx++; - if (!(completed_rx & 63)) - completed_tx += iser_drain_tx_cq(device, cq_index); + struct iser_comp *comp = (struct iser_comp *)data; + struct ib_cq *cq = comp->cq; + struct ib_wc *const wcs = comp->wcs; + int i, n, completed = 0; + + while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) { + for (i = 0; i < n; i++) + iser_handle_wc(&wcs[i]); + + completed += n; + if (completed >= iser_cq_poll_limit) + break; } - /* #warning "it is assumed here that arming CQ only once its empty" * - * " would not cause interrupts to be missed" */ + + /* + * It is assumed here that arming the CQ only once it's empty + * would not cause interrupts to be missed. + */ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx); + iser_dbg("got %d completions\n", completed); } static void iser_cq_callback(struct ib_cq *cq, void *cq_context) { - struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)cq_context; - struct iser_device *device = cq_desc->device; - int cq_index = cq_desc->cq_index; + struct iser_comp *comp = cq_context; - tasklet_schedule(&device->cq_tasklet[cq_index]); + tasklet_schedule(&comp->tasklet); } u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index da8ff124762a..0bea5776bcbc 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2609,58 +2609,45 @@ isert_fast_reg_mr(struct isert_conn *isert_conn, return ret; } -static inline enum ib_t10_dif_type -se2ib_prot_type(enum target_prot_type prot_type) -{ - switch (prot_type) { - case TARGET_DIF_TYPE0_PROT: - return IB_T10DIF_NONE; - case TARGET_DIF_TYPE1_PROT: - return IB_T10DIF_TYPE1; - case TARGET_DIF_TYPE2_PROT: - return IB_T10DIF_TYPE2; - case TARGET_DIF_TYPE3_PROT: - return IB_T10DIF_TYPE3; - default: - return IB_T10DIF_NONE; - } -} +static inline void +isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs, + struct ib_sig_domain *domain) +{ + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.bg_type = IB_T10DIF_CRC; + domain->sig.dif.pi_interval = se_cmd->se_dev->dev_attrib.block_size; + domain->sig.dif.ref_tag = se_cmd->reftag_seed; + /* + * At the moment we hard code those, but if in the future + * the target core would like to use them, we will take them + * from se_cmd. 
+ */ + domain->sig.dif.apptag_check_mask = 0xffff; + domain->sig.dif.app_escape = true; + domain->sig.dif.ref_escape = true; + if (se_cmd->prot_type == TARGET_DIF_TYPE1_PROT || + se_cmd->prot_type == TARGET_DIF_TYPE2_PROT) + domain->sig.dif.ref_remap = true; +} static int isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) { - enum ib_t10_dif_type ib_prot_type = se2ib_prot_type(se_cmd->prot_type); - - sig_attrs->mem.sig_type = IB_SIG_TYPE_T10_DIF; - sig_attrs->wire.sig_type = IB_SIG_TYPE_T10_DIF; - sig_attrs->mem.sig.dif.pi_interval = - se_cmd->se_dev->dev_attrib.block_size; - sig_attrs->wire.sig.dif.pi_interval = - se_cmd->se_dev->dev_attrib.block_size; - switch (se_cmd->prot_op) { case TARGET_PROT_DIN_INSERT: case TARGET_PROT_DOUT_STRIP: - sig_attrs->mem.sig.dif.type = IB_T10DIF_NONE; - sig_attrs->wire.sig.dif.type = ib_prot_type; - sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; - sig_attrs->wire.sig.dif.ref_tag = se_cmd->reftag_seed; + sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE; + isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire); break; case TARGET_PROT_DOUT_INSERT: case TARGET_PROT_DIN_STRIP: - sig_attrs->mem.sig.dif.type = ib_prot_type; - sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; - sig_attrs->mem.sig.dif.ref_tag = se_cmd->reftag_seed; - sig_attrs->wire.sig.dif.type = IB_T10DIF_NONE; + sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; + isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem); break; case TARGET_PROT_DIN_PASS: case TARGET_PROT_DOUT_PASS: - sig_attrs->mem.sig.dif.type = ib_prot_type; - sig_attrs->mem.sig.dif.bg_type = IB_T10DIF_CRC; - sig_attrs->mem.sig.dif.ref_tag = se_cmd->reftag_seed; - sig_attrs->wire.sig.dif.type = ib_prot_type; - sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; - sig_attrs->wire.sig.dif.ref_tag = se_cmd->reftag_seed; + isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire); + isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem); break; default: pr_err("Unsupported PI operation %d\n", se_cmd->prot_op);
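A note on the initiator-side selective signaling these patches introduce: iser_send_command() now requests a signaled completion only for every ISER_SIGNAL_CMD_COUNT-th SCSI command, while data-outs and control PDUs stay signaled (see iser_send_data_out() and iser_send_control() above). Below is a minimal stand-alone sketch of just the counting scheme; the real constant lives in iscsi_iser.h and is assumed to be 32 here purely for illustration:

	#include <stdbool.h>
	#include <stdio.h>

	/* assumed value, for illustration only; see iscsi_iser.h */
	#define ISER_SIGNAL_CMD_COUNT 32

	/* same test as iser_signal_comp() in iser_initiator.c */
	static bool iser_signal_comp(int sig_count)
	{
		return (sig_count % ISER_SIGNAL_CMD_COUNT) == 0;
	}

	int main(void)
	{
		static unsigned sig_count;	/* one shared counter, as in iser_send_command() */
		int i, signaled = 0;

		for (i = 0; i < 128; i++)	/* 128 commands */
			if (iser_signal_comp(++sig_count))
				signaled++;

		printf("signaled %d of %d sends\n", signaled, i);	/* 4 of 128 */
		return 0;
	}

Unsignaled SENDs still occupy send-queue slots until a later signaled completion retires them, which is consistent with the queue being sized ISER_QP_MAX_REQ_DTOS + 1 above; the extra slot keeps room for the beacon posted during connection teardown.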