diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-20 11:20:35 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-20 11:20:35 -0700 |
| commit | 4b0b946019e7376752456380b67e54eea2f10a7c (patch) | |
| tree | 813f922af517b8624d6c780f9be0d1d041734e92 /drivers/infiniband/sw | |
| parent | a5d1079c28a5bc6caa30ef4099ef04ed17d2c6aa (diff) | |
| parent | 9091e3b59f2bef11c0a841096327565ae0ca220b (diff) | |
| download | lwn-4b0b946019e7376752456380b67e54eea2f10a7c.tar.gz lwn-4b0b946019e7376752456380b67e54eea2f10a7c.zip | |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe:
"The usual collection of driver changes, more core infrastructure
updates than typical this cycle:
- Minor cleanups and kernel-doc fixes in bnxt_re, hns, rdmavt, efa,
ocrdma, erdma, rtrs, hfi1, ionic, and pvrdma
- New udata validation framework and driver updates
- Modernize CQ creation interface in mlx4 and mlx5, manage CQ umem in
core
- Promote UMEM to a core component, split out DMA block iterator
logic
- Introduce FRMR pools with aging, statistics, pinned handles, and
netlink control and use it in mlx5
- Add PCIe TLP emulation support in mlx5
- Extend umem to work with revocable pinned dmabufs and use it in
irdma
- More net namespace improvements for rxe
- GEN4 hardware support in irdma
- First steps to MW and UC support in mana_ib
- Support for CQ umem and doorbells in bnxt_re
- Drop opa_vnic driver from hfi1
Fixes:
- IB/core zero dmac neighbor resolution race
- GID table memory free
- rxe pad/ICRC validation and r_key async errors
- mlx4 external umem for CQ
- umem DMA attributes on unmap
- mana_ib RX steering on RSS QP destroy"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (116 commits)
RDMA/core: Fix user CQ creation for drivers without create_cq
RDMA/ionic: bound node_desc sysfs read with %.64s
IB/core: Fix zero dmac race in neighbor resolution
RDMA/mana_ib: Support memory windows
RDMA/rxe: Validate pad and ICRC before payload_size() in rxe_rcv
RDMA/core: Prefer NLA_NUL_STRING
RDMA/core: Fix memory free for GID table
RDMA/hns: Remove the duplicate calls to ib_copy_validate_udata_in()
RDMA: Remove redundant = {} for udata req structs
RDMA/irdma: Add missing comp_mask check in alloc_ucontext
RDMA/hns: Add missing comp_mask check in create_qp
RDMA/mlx5: Pull comp_mask validation into ib_copy_validate_udata_in_cm()
RDMA: Use ib_copy_validate_udata_in_cm() for zero comp_mask
RDMA/hns: Use ib_copy_validate_udata_in()
RDMA/mlx4: Use ib_copy_validate_udata_in() for QP
RDMA/mlx4: Use ib_copy_validate_udata_in()
RDMA/mlx5: Use ib_copy_validate_udata_in() for MW
RDMA/mlx5: Use ib_copy_validate_udata_in() for SRQ
RDMA/pvrdma: Use ib_copy_validate_udata_in() for srq
RDMA: Use ib_copy_validate_udata_in() for implicit full structs
...
Diffstat (limited to 'drivers/infiniband/sw')
22 files changed, 388 insertions, 136 deletions
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index e7835ca70e2b..30904c6ae852 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -337,7 +337,7 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) * * Return: 0 for success. */ -int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +int rvt_resize_cq(struct ib_cq *ibcq, unsigned int cqe, struct ib_udata *udata) { struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); u32 head, tail, n; @@ -349,7 +349,7 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) struct rvt_k_cq_wc *k_wc = NULL; struct rvt_k_cq_wc *old_k_wc = NULL; - if (cqe < 1 || cqe > rdi->dparms.props.max_cqe) + if (cqe > rdi->dparms.props.max_cqe) return -EINVAL; /* diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h index 4028702a7b2f..82c902c98c8e 100644 --- a/drivers/infiniband/sw/rdmavt/cq.h +++ b/drivers/infiniband/sw/rdmavt/cq.h @@ -13,7 +13,7 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs); int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); -int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); +int rvt_resize_cq(struct ib_cq *ibcq, unsigned int cqe, struct ib_udata *udata); int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); int rvt_driver_cq_init(void); void rvt_cq_exit(void); diff --git a/drivers/infiniband/sw/rdmavt/mcast.c b/drivers/infiniband/sw/rdmavt/mcast.c index 1fda344d2056..b41fe4c069dd 100644 --- a/drivers/infiniband/sw/rdmavt/mcast.c +++ b/drivers/infiniband/sw/rdmavt/mcast.c @@ -49,7 +49,6 @@ static void rvt_mcast_qp_free(struct rvt_mcast_qp *mqp) { struct rvt_qp *qp = mqp->qp; - /* Notify hfi1_destroy_qp() if it is waiting. 
*/ rvt_put_qp(qp); kfree(mqp); diff --git a/drivers/infiniband/sw/rdmavt/mmap.c b/drivers/infiniband/sw/rdmavt/mmap.c index 46e3b3e0643a..473f464f33fa 100644 --- a/drivers/infiniband/sw/rdmavt/mmap.c +++ b/drivers/infiniband/sw/rdmavt/mmap.c @@ -9,6 +9,11 @@ #include <rdma/uverbs_ioctl.h> #include "mmap.h" +/* number of reserved mmaps for the driver */ +#define MMAP_RESERVED 256 +/* start point for dynamic offsets */ +#define MMAP_OFFSET_START (MMAP_RESERVED * PAGE_SIZE) + /** * rvt_mmap_init - init link list and lock for mem map * @rdi: rvt dev struct @@ -17,7 +22,7 @@ void rvt_mmap_init(struct rvt_dev_info *rdi) { INIT_LIST_HEAD(&rdi->pending_mmaps); spin_lock_init(&rdi->pending_lock); - rdi->mmap_offset = PAGE_SIZE; + rdi->mmap_offset = MMAP_OFFSET_START; spin_lock_init(&rdi->mmap_offset_lock); } @@ -73,6 +78,13 @@ int rvt_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) struct rvt_mmap_info *ip, *pp; int ret = -EINVAL; + /* call driver if in reserved range */ + if (offset < MMAP_OFFSET_START) { + if (rdi->driver_f.mmap) + return rdi->driver_f.mmap(context, vma); + return -EINVAL; + } + /* * Search the device's list of objects waiting for a mmap call. 
* Normally, this list is very short since a call to create a @@ -129,9 +141,9 @@ struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi, u32 size, spin_lock_irq(&rdi->mmap_offset_lock); if (rdi->mmap_offset == 0) - rdi->mmap_offset = ALIGN(PAGE_SIZE, SHMLBA); + rdi->mmap_offset = MMAP_OFFSET_START; ip->offset = rdi->mmap_offset; - rdi->mmap_offset += ALIGN(size, SHMLBA); + rdi->mmap_offset += PAGE_SIZE; spin_unlock_irq(&rdi->mmap_offset_lock); INIT_LIST_HEAD(&ip->pending_mmaps); @@ -159,9 +171,9 @@ void rvt_update_mmap_info(struct rvt_dev_info *rdi, struct rvt_mmap_info *ip, spin_lock_irq(&rdi->mmap_offset_lock); if (rdi->mmap_offset == 0) - rdi->mmap_offset = PAGE_SIZE; + rdi->mmap_offset = MMAP_OFFSET_START; ip->offset = rdi->mmap_offset; - rdi->mmap_offset += size; + rdi->mmap_offset += PAGE_SIZE; spin_unlock_irq(&rdi->mmap_offset_lock); ip->size = size; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 3c7ee7ddc5dd..816624e0991a 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2705,7 +2705,7 @@ int rvt_qp_iter_next(struct rvt_qp_iter *iter) struct rvt_ibport *rvp; int pidx; - pidx = n % rdi->ibdev.phys_port_cnt; + pidx = n / 2; /* QP0 and QP1 */ rvp = rdi->ports[pidx]; qp = rcu_dereference(rvp->qp[n & 1]); } else { diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 0c28b412d81a..40aa64208364 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -244,6 +244,10 @@ static int rvt_query_gid(struct ib_device *ibdev, u32 port_num, */ static int rvt_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { + struct rvt_dev_info *rdi = ib_to_rvt(uctx->device); + + if (rdi->driver_f.alloc_ucontext) + return rdi->driver_f.alloc_ucontext(uctx, udata); return 0; } @@ -253,6 +257,10 @@ static int rvt_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) */ static void 
rvt_dealloc_ucontext(struct ib_ucontext *context) { + struct rvt_dev_info *rdi = ib_to_rvt(context->device); + + if (rdi->driver_f.dealloc_ucontext) + rdi->driver_f.dealloc_ucontext(context); return; } @@ -367,7 +375,7 @@ static const struct ib_device_ops rvt_dev_ops = { .query_srq = rvt_query_srq, .reg_user_mr = rvt_reg_user_mr, .req_notify_cq = rvt_req_notify_cq, - .resize_cq = rvt_resize_cq, + .resize_user_cq = rvt_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, rvt_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, rvt_cq, ibcq), diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile index 93134f1d1d0c..3977f4f13258 100644 --- a/drivers/infiniband/sw/rxe/Makefile +++ b/drivers/infiniband/sw/rxe/Makefile @@ -22,6 +22,7 @@ rdma_rxe-y := \ rxe_mcast.o \ rxe_task.o \ rxe_net.o \ - rxe_hw_counters.o + rxe_hw_counters.o \ + rxe_ns.o rdma_rxe-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += rxe_odp.o diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index e891199cbdef..b0714f9abe3d 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -8,6 +8,8 @@ #include <net/addrconf.h> #include "rxe.h" #include "rxe_loc.h" +#include "rxe_net.h" +#include "rxe_ns.h" MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib"); MODULE_DESCRIPTION("Soft RDMA transport"); @@ -200,6 +202,8 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) port->mtu_cap = ib_mtu_enum_to_int(mtu); } +static struct rdma_link_ops rxe_link_ops; + /* called by ifc layer to create new rxe device. * The caller should allocate memory for rxe by calling ib_alloc_device. 
*/ @@ -208,6 +212,7 @@ int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, { rxe_init(rxe, ndev); rxe_set_mtu(rxe, mtu); + rxe->ib_dev.link_ops = &rxe_link_ops; return rxe_register_device(rxe, ibdev_name, ndev); } @@ -231,6 +236,10 @@ static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) goto err; } + err = rxe_net_init(ndev); + if (err) + return err; + err = rxe_net_add(ibdev_name, ndev); if (err) { rxe_err("failed to add %s\n", ndev->name); @@ -240,9 +249,17 @@ err: return err; } +static int rxe_dellink(struct ib_device *dev) +{ + rxe_net_del(dev); + + return 0; +} + static struct rdma_link_ops rxe_link_ops = { .type = "rxe", .newlink = rxe_newlink, + .dellink = rxe_dellink, }; static int __init rxe_module_init(void) @@ -253,15 +270,24 @@ static int __init rxe_module_init(void) if (err) return err; - err = rxe_net_init(); - if (err) { - rxe_destroy_wq(); - return err; - } + err = rxe_namespace_init(); + if (err) + goto err_destroy_wq; + + err = rxe_register_notifier(); + if (err) + goto err_namespace_exit; rdma_link_register(&rxe_link_ops); + pr_info("loaded\n"); return 0; + +err_namespace_exit: + rxe_namespace_exit(); +err_destroy_wq: + rxe_destroy_wq(); + return err; } static void __exit rxe_module_exit(void) @@ -271,6 +297,8 @@ static void __exit rxe_module_exit(void) rxe_net_exit(); rxe_destroy_wq(); + rxe_namespace_exit(); + pr_info("unloaded\n"); } diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index ff8cd53f5f28..c56bae376c7f 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -121,4 +121,6 @@ void rxe_port_up(struct rxe_dev *rxe); void rxe_port_down(struct rxe_dev *rxe); void rxe_set_port_state(struct rxe_dev *rxe); +extern struct workqueue_struct *rxe_wq; + #endif /* RXE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index fffd144d509e..eaf7802a5cbe 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ 
b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -8,37 +8,6 @@ #include "rxe_loc.h" #include "rxe_queue.h" -int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, - int cqe, int comp_vector) -{ - int count; - - if (cqe <= 0) { - rxe_dbg_dev(rxe, "cqe(%d) <= 0\n", cqe); - goto err1; - } - - if (cqe > rxe->attr.max_cqe) { - rxe_dbg_dev(rxe, "cqe(%d) > max_cqe(%d)\n", - cqe, rxe->attr.max_cqe); - goto err1; - } - - if (cq) { - count = queue_count(cq->queue, QUEUE_TYPE_TO_CLIENT); - if (cqe < count) { - rxe_dbg_cq(cq, "cqe(%d) < current # elements in queue (%d)\n", - cqe, count); - goto err1; - } - } - - return 0; - -err1: - return -EINVAL; -} - int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, int comp_vector, struct ib_udata *udata, struct rxe_create_cq_resp __user *uresp) diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 7992290886e1..e095c12699cb 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -18,9 +18,6 @@ void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr); struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt, struct rxe_ah **ahp); /* rxe_cq.c */ -int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, - int cqe, int comp_vector); - int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, int comp_vector, struct ib_udata *udata, struct rxe_create_cq_resp __user *uresp); diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index cbc646a30003..50a2cb5405e2 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -17,8 +17,11 @@ #include "rxe.h" #include "rxe_net.h" #include "rxe_loc.h" +#include "rxe_ns.h" -static struct rxe_recv_sockets recv_sockets; +#ifndef SK_REF_FOR_TUNNEL +#define SK_REF_FOR_TUNNEL 2 +#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -101,20 +104,20 @@ static inline void rxe_reclassify_recv_socket(struct socket *sock) } static struct 
dst_entry *rxe_find_route4(struct rxe_qp *qp, + struct net *net, struct net_device *ndev, struct in_addr *saddr, struct in_addr *daddr) { struct rtable *rt; - struct flowi4 fl = { { 0 } }; + struct flowi4 fl = {}; - memset(&fl, 0, sizeof(fl)); fl.flowi4_oif = ndev->ifindex; memcpy(&fl.saddr, saddr, sizeof(*saddr)); memcpy(&fl.daddr, daddr, sizeof(*daddr)); fl.flowi4_proto = IPPROTO_UDP; - rt = ip_route_output_key(&init_net, &fl); + rt = ip_route_output_key(net, &fl); if (IS_ERR(rt)) { rxe_dbg_qp(qp, "no route to %pI4\n", &daddr->s_addr); return NULL; @@ -125,22 +128,20 @@ static struct dst_entry *rxe_find_route4(struct rxe_qp *qp, #if IS_ENABLED(CONFIG_IPV6) static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, + struct net *net, struct net_device *ndev, struct in6_addr *saddr, struct in6_addr *daddr) { struct dst_entry *ndst; - struct flowi6 fl6 = { { 0 } }; + struct flowi6 fl6 = {}; - memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = ndev->ifindex; memcpy(&fl6.saddr, saddr, sizeof(*saddr)); memcpy(&fl6.daddr, daddr, sizeof(*daddr)); fl6.flowi6_proto = IPPROTO_UDP; - ndst = ip6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk), - recv_sockets.sk6->sk, &fl6, - NULL); + ndst = ip6_dst_lookup_flow(net, rxe_ns_pernet_sk6(net), &fl6, NULL); if (IS_ERR(ndst)) { rxe_dbg_qp(qp, "no route to %pI6\n", daddr); return NULL; @@ -160,6 +161,7 @@ put: #else static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, + struct net *net, struct net_device *ndev, struct in6_addr *saddr, struct in6_addr *daddr) @@ -174,6 +176,7 @@ static struct dst_entry *rxe_find_route(struct net_device *ndev, struct rxe_av *av) { struct dst_entry *dst = NULL; + struct net *net; if (qp_type(qp) == IB_QPT_RC) dst = sk_dst_get(qp->sk->sk); @@ -182,20 +185,22 @@ static struct dst_entry *rxe_find_route(struct net_device *ndev, if (dst) dst_release(dst); + net = dev_net(ndev); + if (av->network_type == RXE_NETWORK_TYPE_IPV4) { struct in_addr *saddr; struct in_addr *daddr; saddr = 
&av->sgid_addr._sockaddr_in.sin_addr; daddr = &av->dgid_addr._sockaddr_in.sin_addr; - dst = rxe_find_route4(qp, ndev, saddr, daddr); + dst = rxe_find_route4(qp, net, ndev, saddr, daddr); } else if (av->network_type == RXE_NETWORK_TYPE_IPV6) { struct in6_addr *saddr6; struct in6_addr *daddr6; saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; - dst = rxe_find_route6(qp, ndev, saddr6, daddr6); + dst = rxe_find_route6(qp, net, ndev, saddr6, daddr6); #if IS_ENABLED(CONFIG_IPV6) if (dst) qp->dst_cookie = @@ -624,6 +629,43 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) return 0; } +static void rxe_sock_put(struct sock *sk, + void (*set_sk)(struct net *, struct sock *), + struct net *net) +{ + if (refcount_read(&sk->sk_refcnt) > SK_REF_FOR_TUNNEL) { + __sock_put(sk); + } else { + rxe_release_udp_tunnel(sk->sk_socket); + sk = NULL; + set_sk(net, sk); + } +} + +void rxe_net_del(struct ib_device *dev) +{ + struct rxe_dev *rxe = container_of(dev, struct rxe_dev, ib_dev); + struct net_device *ndev; + struct sock *sk; + struct net *net; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return; + + net = dev_net(ndev); + + sk = rxe_ns_pernet_sk4(net); + if (sk) + rxe_sock_put(sk, rxe_ns_pernet_set_sk4, net); + + sk = rxe_ns_pernet_sk6(net); + if (sk) + rxe_sock_put(sk, rxe_ns_pernet_set_sk6, net); + + dev_put(ndev); +} + static void rxe_port_event(struct rxe_dev *rxe, enum ib_event_type event) { @@ -680,6 +722,7 @@ static int rxe_notify(struct notifier_block *not_blk, switch (event) { case NETDEV_UNREGISTER: ib_unregister_device_queued(&rxe->ib_dev); + rxe_net_del(&rxe->ib_dev); break; case NETDEV_CHANGEMTU: rxe_dbg_dev(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu); @@ -709,66 +752,97 @@ static struct notifier_block rxe_net_notifier = { .notifier_call = rxe_notify, }; -static int rxe_net_ipv4_init(void) +static int rxe_net_ipv4_init(struct net *net) { - recv_sockets.sk4 = 
rxe_setup_udp_tunnel(&init_net, - htons(ROCE_V2_UDP_DPORT), false); - if (IS_ERR(recv_sockets.sk4)) { - recv_sockets.sk4 = NULL; + struct sock *sk; + struct socket *sock; + + sk = rxe_ns_pernet_sk4(net); + if (sk) { + sock_hold(sk); + return 0; + } + + sock = rxe_setup_udp_tunnel(net, htons(ROCE_V2_UDP_DPORT), false); + if (IS_ERR(sock)) { pr_err("Failed to create IPv4 UDP tunnel\n"); return -1; } + rxe_ns_pernet_set_sk4(net, sock->sk); return 0; } -static int rxe_net_ipv6_init(void) +static int rxe_net_ipv6_init(struct net *net) { #if IS_ENABLED(CONFIG_IPV6) + struct sock *sk; + struct socket *sock; - recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net, - htons(ROCE_V2_UDP_DPORT), true); - if (PTR_ERR(recv_sockets.sk6) == -EAFNOSUPPORT) { - recv_sockets.sk6 = NULL; + sk = rxe_ns_pernet_sk6(net); + if (sk) { + sock_hold(sk); + return 0; + } + + sock = rxe_setup_udp_tunnel(net, htons(ROCE_V2_UDP_DPORT), true); + if (PTR_ERR(sock) == -EAFNOSUPPORT) { pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n"); return 0; } - if (IS_ERR(recv_sockets.sk6)) { - recv_sockets.sk6 = NULL; + if (IS_ERR(sock)) { pr_err("Failed to create IPv6 UDP tunnel\n"); return -1; } + + rxe_ns_pernet_set_sk6(net, sock->sk); + #endif return 0; } +int rxe_register_notifier(void) +{ + int err; + + err = register_netdevice_notifier(&rxe_net_notifier); + if (err) { + pr_err("Failed to register netdev notifier\n"); + return -1; + } + + return 0; +} + void rxe_net_exit(void) { - rxe_release_udp_tunnel(recv_sockets.sk6); - rxe_release_udp_tunnel(recv_sockets.sk4); unregister_netdevice_notifier(&rxe_net_notifier); } -int rxe_net_init(void) +int rxe_net_init(struct net_device *ndev) { + struct net *net; + struct sock *sk; int err; - recv_sockets.sk6 = NULL; + net = dev_net(ndev); - err = rxe_net_ipv4_init(); + err = rxe_net_ipv4_init(net); if (err) return err; - err = rxe_net_ipv6_init(); + + err = rxe_net_ipv6_init(net); if (err) goto err_out; - err = 
register_netdevice_notifier(&rxe_net_notifier); - if (err) { - pr_err("Failed to register netdev notifier\n"); - goto err_out; - } + return 0; + err_out: - rxe_net_exit(); + /* If ipv6 error, release ipv4 resource */ + sk = rxe_ns_pernet_sk4(net); + if (sk) + rxe_sock_put(sk, rxe_ns_pernet_set_sk4, net); + return err; } diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h index 45d80d00f86b..56249677d692 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.h +++ b/drivers/infiniband/sw/rxe/rxe_net.h @@ -11,14 +11,11 @@ #include <net/if_inet6.h> #include <linux/module.h> -struct rxe_recv_sockets { - struct socket *sk4; - struct socket *sk6; -}; - int rxe_net_add(const char *ibdev_name, struct net_device *ndev); +void rxe_net_del(struct ib_device *dev); -int rxe_net_init(void); +int rxe_register_notifier(void); +int rxe_net_init(struct net_device *ndev); void rxe_net_exit(void); #endif /* RXE_NET_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_ns.c b/drivers/infiniband/sw/rxe/rxe_ns.c new file mode 100644 index 000000000000..8b9d734229b2 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_ns.c @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ + +#include <net/sock.h> +#include <net/netns/generic.h> +#include <net/net_namespace.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/pid_namespace.h> +#include <net/udp_tunnel.h> + +#include "rxe_ns.h" + +/* + * Per network namespace data + */ +struct rxe_ns_sock { + struct sock __rcu *rxe_sk4; + struct sock __rcu *rxe_sk6; +}; + +/* + * Index to store custom data for each network namespace. + */ +static unsigned int rxe_pernet_id; + +/* + * Called for every existing and added network namespaces + */ +static int rxe_ns_init(struct net *net) +{ + /* defer socket create in the namespace to the first + * device create. 
+ */ + + return 0; +} + +static void rxe_ns_exit(struct net *net) +{ + /* called when the network namespace is removed + */ + struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id); + struct sock *sk; + + rcu_read_lock(); + sk = rcu_dereference(ns_sk->rxe_sk4); + rcu_read_unlock(); + if (sk) { + rcu_assign_pointer(ns_sk->rxe_sk4, NULL); + udp_tunnel_sock_release(sk->sk_socket); + } + +#if IS_ENABLED(CONFIG_IPV6) + rcu_read_lock(); + sk = rcu_dereference(ns_sk->rxe_sk6); + rcu_read_unlock(); + if (sk) { + rcu_assign_pointer(ns_sk->rxe_sk6, NULL); + udp_tunnel_sock_release(sk->sk_socket); + } +#endif +} + +/* + * callback to make the module network namespace aware + */ +static struct pernet_operations rxe_net_ops = { + .init = rxe_ns_init, + .exit = rxe_ns_exit, + .id = &rxe_pernet_id, + .size = sizeof(struct rxe_ns_sock), +}; + +struct sock *rxe_ns_pernet_sk4(struct net *net) +{ + struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id); + struct sock *sk; + + rcu_read_lock(); + sk = rcu_dereference(ns_sk->rxe_sk4); + rcu_read_unlock(); + + return sk; +} + +void rxe_ns_pernet_set_sk4(struct net *net, struct sock *sk) +{ + struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id); + + rcu_assign_pointer(ns_sk->rxe_sk4, sk); + synchronize_rcu(); +} + +#if IS_ENABLED(CONFIG_IPV6) +struct sock *rxe_ns_pernet_sk6(struct net *net) +{ + struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id); + struct sock *sk; + + rcu_read_lock(); + sk = rcu_dereference(ns_sk->rxe_sk6); + rcu_read_unlock(); + + return sk; +} + +void rxe_ns_pernet_set_sk6(struct net *net, struct sock *sk) +{ + struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id); + + rcu_assign_pointer(ns_sk->rxe_sk6, sk); + synchronize_rcu(); +} +#endif /* IPV6 */ + +int rxe_namespace_init(void) +{ + return register_pernet_subsys(&rxe_net_ops); +} + +void rxe_namespace_exit(void) +{ + unregister_pernet_subsys(&rxe_net_ops); +} diff --git a/drivers/infiniband/sw/rxe/rxe_ns.h 
b/drivers/infiniband/sw/rxe/rxe_ns.h new file mode 100644 index 000000000000..4da2709e6b71 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_ns.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ + +#ifndef RXE_NS_H +#define RXE_NS_H + +struct sock *rxe_ns_pernet_sk4(struct net *net); +void rxe_ns_pernet_set_sk4(struct net *net, struct sock *sk); + +#if IS_ENABLED(CONFIG_IPV6) +void rxe_ns_pernet_set_sk6(struct net *net, struct sock *sk); +struct sock *rxe_ns_pernet_sk6(struct net *net); +#else /* IPv6 */ +static inline struct sock *rxe_ns_pernet_sk6(struct net *net) +{ + return NULL; +} + +static inline void rxe_ns_pernet_set_sk6(struct net *net, struct sock *sk) +{ +} +#endif /* IPv6 */ + +int rxe_namespace_init(void); +void rxe_namespace_exit(void); + +#endif /* RXE_NS_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c index bc11b1ec59ac..ff904d5e54a7 100644 --- a/drivers/infiniband/sw/rxe/rxe_odp.c +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -545,7 +545,7 @@ static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd, work->frags[i].mr = mr; } - queue_work(system_unbound_wq, &work->work); + queue_work(rxe_wq, &work->work); return 0; diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index 5861e4244049..f79214738c2b 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -330,7 +330,8 @@ void rxe_rcv(struct sk_buff *skb) pkt->qp = NULL; pkt->mask |= rxe_opcode[pkt->opcode].mask; - if (unlikely(skb->len < header_size(pkt))) + if (unlikely(pkt->paylen < header_size(pkt) + bth_pad(pkt) + + RXE_ICRC_SIZE)) goto drop; err = hdr_check(pkt); diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 711f73e0bbb1..9faf8c09aa8e 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -37,6 +37,7 @@ static char *resp_state_name[] = { 
[RESPST_ERR_MISSING_OPCODE_LAST_D1E] = "ERR_MISSING_OPCODE_LAST_D1E", [RESPST_ERR_TOO_MANY_RDMA_ATM_REQ] = "ERR_TOO_MANY_RDMA_ATM_REQ", [RESPST_ERR_RNR] = "ERR_RNR", + [RESPST_ERR_RKEY_VIOLATION_EVENT] = "ERR_RKEY_VIOLATION_EVENT", [RESPST_ERR_RKEY_VIOLATION] = "ERR_RKEY_VIOLATION", [RESPST_ERR_INVALIDATE_RKEY] = "ERR_INVALIDATE_RKEY_VIOLATION", [RESPST_ERR_LENGTH] = "ERR_LENGTH", @@ -423,6 +424,19 @@ static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) qp->resp.resid = sizeof(u64); } +/* Transition to an rkey violation state. C9-222.1 requires an async event + * at the responder, but only if the error cannot be attached to an RX WQE. + * WRITE_WITH_IMM is the only op that might have that more precise RX WQE + * to pin the error on. + */ +static enum resp_states get_rkey_violation_state(struct rxe_pkt_info *pkt) +{ + if (pkt->mask & RXE_IMMDT_MASK) + return RESPST_ERR_RKEY_VIOLATION; + + return RESPST_ERR_RKEY_VIOLATION_EVENT; +} + /* resolve the packet rkey to qp->resp.mr or set qp->resp.mr to NULL * if an invalid rkey is received or the rdma length is zero. For middle * or last packets use the stored value of mr. 
@@ -486,14 +500,14 @@ static enum resp_states check_rkey(struct rxe_qp *qp, mw = rxe_lookup_mw(qp, access, rkey); if (!mw) { rxe_dbg_qp(qp, "no MW matches rkey %#x\n", rkey); - state = RESPST_ERR_RKEY_VIOLATION; + state = get_rkey_violation_state(pkt); goto err; } mr = mw->mr; if (!mr) { rxe_dbg_qp(qp, "MW doesn't have an MR\n"); - state = RESPST_ERR_RKEY_VIOLATION; + state = get_rkey_violation_state(pkt); goto err; } @@ -507,7 +521,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE); if (!mr) { rxe_dbg_qp(qp, "no MR matches rkey %#x\n", rkey); - state = RESPST_ERR_RKEY_VIOLATION; + state = get_rkey_violation_state(pkt); goto err; } } @@ -521,7 +535,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, } if (mr_check_range(mr, va + qp->resp.offset, resid)) { - state = RESPST_ERR_RKEY_VIOLATION; + state = get_rkey_violation_state(pkt); goto err; } @@ -586,7 +600,7 @@ static enum resp_states write_data_in(struct rxe_qp *qp, err = rxe_mr_copy(qp->resp.mr, qp->resp.va + qp->resp.offset, payload_addr(pkt), data_len, RXE_TO_MR_OBJ); if (err) { - rc = RESPST_ERR_RKEY_VIOLATION; + rc = get_rkey_violation_state(pkt); goto out; } @@ -667,7 +681,7 @@ static enum resp_states process_flush(struct rxe_qp *qp, if (res->flush.type & IB_FLUSH_PERSISTENT) { if (rxe_flush_pmem_iova(mr, start, length)) - return RESPST_ERR_RKEY_VIOLATION; + return get_rkey_violation_state(pkt); /* Make data persistent. */ wmb(); } else if (res->flush.type & IB_FLUSH_GLOBAL) { @@ -1383,6 +1397,20 @@ out: return rc; } +static void do_qp_event(struct rxe_qp *qp, enum ib_event_type etype) +{ + struct ib_event event; + struct ib_qp *ibqp = &qp->ibqp; + + event.event = etype; + event.device = ibqp->device; + event.element.qp = ibqp; + if (ibqp->event_handler) { + rxe_dbg_qp(qp, "reporting QP event %d\n", etype); + ibqp->event_handler(&event, ibqp->qp_context); + } +} + /* Process a class A or C. 
Both are treated the same in this implementation. */ static void do_class_ac_error(struct rxe_qp *qp, u8 syndrome, enum ib_wc_status status) @@ -1476,14 +1504,9 @@ static void flush_recv_queue(struct rxe_qp *qp, bool notify) int err; if (qp->srq) { - if (notify && qp->ibqp.event_handler) { - struct ib_event ev; + if (notify && qp->ibqp.event_handler) + do_qp_event(qp, IB_EVENT_QP_LAST_WQE_REACHED); - ev.device = qp->ibqp.device; - ev.element.qp = &qp->ibqp; - ev.event = IB_EVENT_QP_LAST_WQE_REACHED; - qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); - } return; } @@ -1613,6 +1636,13 @@ int rxe_receiver(struct rxe_qp *qp) state = RESPST_CLEANUP; break; + case RESPST_ERR_RKEY_VIOLATION_EVENT: + if (qp_type(qp) == IB_QPT_RC) + do_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); + + state = RESPST_ERR_RKEY_VIOLATION; + break; + case RESPST_ERR_RKEY_VIOLATION: if (qp_type(qp) == IB_QPT_RC) { /* Class C */ diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index f522820b950c..801d06c969c9 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -6,7 +6,7 @@ #include "rxe.h" -static struct workqueue_struct *rxe_wq; +struct workqueue_struct *rxe_wq; int rxe_alloc_wq(void) { diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index fe41362c5144..4d4891dc2884 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -452,18 +452,9 @@ static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, int err; if (udata) { - if (udata->inlen < sizeof(cmd)) { - err = -EINVAL; - rxe_dbg_srq(srq, "malformed udata\n"); - goto err_out; - } - - err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); - if (err) { - err = -EFAULT; - rxe_dbg_srq(srq, "unable to read udata\n"); + err = ib_copy_validate_udata_in(udata, cmd, mmap_info_addr); + if (err) goto err_out; - } } err = rxe_srq_chk_attr(rxe, srq, attr, mask); @@ -1097,11 +1088,8 @@ 
static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_out; } - err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector); - if (err) { - rxe_dbg_dev(rxe, "bad init attributes, err = %d\n", err); - goto err_out; - } + if (attr->cqe > rxe->attr.max_cqe) + return -EINVAL; err = rxe_add_to_pool(&rxe->cq_pool, cq); if (err) { @@ -1127,7 +1115,8 @@ err_out: return err; } -static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +static int rxe_resize_cq(struct ib_cq *ibcq, unsigned int cqe, + struct ib_udata *udata) { struct rxe_cq *cq = to_rcq(ibcq); struct rxe_dev *rxe = to_rdev(ibcq->device); @@ -1143,11 +1132,9 @@ static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) uresp = udata->outbuf; } - err = rxe_cq_chk_attr(rxe, cq, cqe, 0); - if (err) { - rxe_dbg_cq(cq, "bad attr, err = %d\n", err); - goto err_out; - } + if (cqe > rxe->attr.max_cqe || + cqe < queue_count(cq->queue, QUEUE_TYPE_TO_CLIENT)) + return -EINVAL; err = rxe_cq_resize_queue(cq, cqe, uresp, udata); if (err) { @@ -1519,7 +1506,7 @@ static const struct ib_device_ops rxe_dev_ops = { .reg_user_mr = rxe_reg_user_mr, .req_notify_cq = rxe_req_notify_cq, .rereg_user_mr = rxe_rereg_user_mr, - .resize_cq = rxe_resize_cq, + .resize_user_cq = rxe_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq), diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index fb149f37e91d..d92f80d16f78 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -154,6 +154,7 @@ enum resp_states { RESPST_ERR_MISSING_OPCODE_LAST_D1E, RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, RESPST_ERR_RNR, + RESPST_ERR_RKEY_VIOLATION_EVENT, RESPST_ERR_RKEY_VIOLATION, RESPST_ERR_INVALIDATE_RKEY, RESPST_ERR_LENGTH, diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index ef504db8f2b4..1e1d262a4ae2 100644 --- 
a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -1373,11 +1373,7 @@ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, struct siw_uresp_reg_mr uresp = {}; struct siw_mem *mem = mr->mem; - if (udata->inlen < sizeof(ureq)) { - rv = -EINVAL; - goto err_out; - } - rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); + rv = ib_copy_validate_udata_in(udata, ureq, pad); if (rv) goto err_out; |
