diff options
Diffstat (limited to 'include/rdma')
| -rw-r--r-- | include/rdma/frmr_pools.h | 39 | ||||
| -rw-r--r-- | include/rdma/ib_cache.h | 20 | ||||
| -rw-r--r-- | include/rdma/ib_cm.h | 19 | ||||
| -rw-r--r-- | include/rdma/ib_mad.h | 1 | ||||
| -rw-r--r-- | include/rdma/ib_marshall.h | 3 | ||||
| -rw-r--r-- | include/rdma/ib_pack.h | 3 | ||||
| -rw-r--r-- | include/rdma/ib_sa.h | 37 | ||||
| -rw-r--r-- | include/rdma/ib_ucaps.h | 30 | ||||
| -rw-r--r-- | include/rdma/ib_umem.h | 91 | ||||
| -rw-r--r-- | include/rdma/ib_umem_odp.h | 25 | ||||
| -rw-r--r-- | include/rdma/ib_verbs.h | 496 | ||||
| -rw-r--r-- | include/rdma/iter.h | 88 | ||||
| -rw-r--r-- | include/rdma/iw_cm.h | 14 | ||||
| -rw-r--r-- | include/rdma/opa_port_info.h | 8 | ||||
| -rw-r--r-- | include/rdma/opa_vnic.h | 96 | ||||
| -rw-r--r-- | include/rdma/rdma_cm.h | 39 | ||||
| -rw-r--r-- | include/rdma/rdma_counter.h | 7 | ||||
| -rw-r--r-- | include/rdma/rdma_netlink.h | 2 | ||||
| -rw-r--r-- | include/rdma/rdma_vt.h | 10 | ||||
| -rw-r--r-- | include/rdma/rdmavt_qp.h | 70 | ||||
| -rw-r--r-- | include/rdma/restrack.h | 8 | ||||
| -rw-r--r-- | include/rdma/rw.h | 22 | ||||
| -rw-r--r-- | include/rdma/uverbs_ioctl.h | 101 | ||||
| -rw-r--r-- | include/rdma/uverbs_std_types.h | 2 | ||||
| -rw-r--r-- | include/rdma/uverbs_types.h | 1 |
25 files changed, 834 insertions, 398 deletions
diff --git a/include/rdma/frmr_pools.h b/include/rdma/frmr_pools.h new file mode 100644 index 000000000000..af1b88801fa4 --- /dev/null +++ b/include/rdma/frmr_pools.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + * + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ + +#ifndef FRMR_POOLS_H +#define FRMR_POOLS_H + +#include <linux/types.h> +#include <asm/page.h> + +struct ib_device; +struct ib_mr; + +struct ib_frmr_key { + u64 vendor_key; + /* A pool with non-zero kernel_vendor_key is a kernel-only pool. */ + u64 kernel_vendor_key; + size_t num_dma_blocks; + int access_flags; + u8 ats:1; +}; + +struct ib_frmr_pool_ops { + int (*create_frmrs)(struct ib_device *device, struct ib_frmr_key *key, + u32 *handles, u32 count); + void (*destroy_frmrs)(struct ib_device *device, u32 *handles, + u32 count); + int (*build_key)(struct ib_device *device, const struct ib_frmr_key *in, + struct ib_frmr_key *out); +}; + +int ib_frmr_pools_init(struct ib_device *device, + const struct ib_frmr_pool_ops *pool_ops); +void ib_frmr_pools_cleanup(struct ib_device *device); +int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr); +int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr); + +#endif /* FRMR_POOLS_H */ diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h index 226ae3702d8a..eed46d966e40 100644 --- a/include/rdma/ib_cache.h +++ b/include/rdma/ib_cache.h @@ -34,7 +34,7 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); /** * ib_get_cached_pkey - Returns a cached PKey table entry - * @device: The device to query. + * @device_handle: The device to query. * @port_num: The port number of the device to query. * @index: The index into the cached PKey table to query. * @pkey: The PKey value found at the specified index. 
@@ -64,22 +64,6 @@ int ib_find_cached_pkey(struct ib_device *device, u16 *index); /** - * ib_find_exact_cached_pkey - Returns the PKey table index where a specified - * PKey value occurs. Comparison uses the FULL 16 bits (incl membership bit) - * @device: The device to query. - * @port_num: The port number of the device to search for the PKey. - * @pkey: The PKey value to search for. - * @index: The index into the cached PKey table where the PKey was found. - * - * ib_find_exact_cached_pkey() searches the specified PKey table in - * the local software cache. - */ -int ib_find_exact_cached_pkey(struct ib_device *device, - u32 port_num, - u16 pkey, - u16 *index); - -/** * ib_get_cached_lmc - Returns a cached lmc table entry * @device: The device to query. * @port_num: The port number of the device to query. @@ -96,7 +80,7 @@ int ib_get_cached_lmc(struct ib_device *device, * ib_get_cached_port_state - Returns a cached port state table entry * @device: The device to query. * @port_num: The port number of the device to query. - * @port_state: port_state for the specified port for that device. + * @port_active: port_state for the specified port for that device. * * ib_get_cached_port_state() fetches the specified port_state table entry stored in * the local software cache. diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index a2ac62b4a6cf..4808a355de41 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -271,7 +271,7 @@ struct ib_cm_event { #define CM_APR_ATTR_ID cpu_to_be16(0x001A) /** - * ib_cm_handler - User-defined callback to process communication events. + * typedef ib_cm_handler - User-defined callback to process communication events. * @cm_id: Communication identifier associated with the reported event. * @event: Information about the communication event. 
* @@ -480,23 +480,12 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len); -#define IB_CM_MRA_FLAG_DELAY 0x80 /* Send MRA only after a duplicate msg */ - /** - * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection - * message. + * ib_prepare_cm_mra - Prepares to send a message receipt acknowledgment to a + * connection message in case duplicates are received. * @cm_id: Connection identifier associated with the connection message. - * @service_timeout: The lower 5-bits specify the maximum time required for - * the sender to reply to the connection message. The upper 3-bits - * specify additional control flags. - * @private_data: Optional user-defined private data sent with the - * message receipt acknowledgement. - * @private_data_len: Size of the private data buffer, in bytes. */ -int ib_send_cm_mra(struct ib_cm_id *cm_id, - u8 service_timeout, - const void *private_data, - u8 private_data_len); +int ib_prepare_cm_mra(struct ib_cm_id *cm_id); /** * ib_cm_init_qp_attr - Initializes the QP attributes for use in transitioning diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 3f1b58d8b4bf..8bd0e1eb393b 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -48,6 +48,7 @@ #define IB_MGMT_METHOD_REPORT 0x06 #define IB_MGMT_METHOD_REPORT_RESP 0x86 #define IB_MGMT_METHOD_TRAP_REPRESS 0x07 +#define IB_MGMT_METHOD_GET_TABLE 0x12 #define IB_MGMT_METHOD_RESP 0x80 #define IB_BM_ATTR_MOD_RESP cpu_to_be32(1) diff --git a/include/rdma/ib_marshall.h b/include/rdma/ib_marshall.h index 1838869aad28..b179e464e3d1 100644 --- a/include/rdma/ib_marshall.h +++ b/include/rdma/ib_marshall.h @@ -22,7 +22,4 @@ void ib_copy_ah_attr_to_user(struct ib_device *device, void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, struct sa_path_rec *src); -void ib_copy_path_rec_from_user(struct sa_path_rec *dst, - struct ib_user_path_rec *src); - #endif /* IB_USER_MARSHALL_H */ diff --git a/include/rdma/ib_pack.h 
b/include/rdma/ib_pack.h index b8c56d7dc35d..8266fab826a7 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -283,7 +283,4 @@ int ib_ud_header_init(int payload_bytes, int ib_ud_header_pack(struct ib_ud_header *header, void *buf); -int ib_ud_header_unpack(void *buf, - struct ib_ud_header *header); - #endif /* IB_PACK_H */ diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index b46353fc53bf..95e8924ad563 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -189,6 +189,20 @@ struct sa_path_rec { u32 flags; }; +struct sa_service_rec { + __be64 id; + __u8 gid[16]; + __be16 pkey; + __u8 reserved[2]; + __be32 lease; + __u8 key[16]; + __u8 name[64]; + __u8 data_8[16]; + __be16 data_16[8]; + __be32 data_32[4]; + __be64 data_64[2]; +}; + static inline enum ib_gid_type sa_conv_pathrec_to_gid_type(struct sa_path_rec *rec) { @@ -417,6 +431,17 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, unsigned int num_prs, void *context), void *context, struct ib_sa_query **query); +int ib_sa_service_rec_get(struct ib_sa_client *client, + struct ib_device *device, u32 port_num, + struct sa_service_rec *rec, + ib_sa_comp_mask comp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct sa_service_rec *resp, + unsigned int num_services, + void *context), + void *context, struct ib_sa_query **sa_query); + struct ib_sa_multicast { struct ib_sa_mcmember_rec rec; ib_sa_comp_mask comp_mask; @@ -509,6 +534,18 @@ int ib_init_ah_attr_from_path(struct ib_device *device, u32 port_num, void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute); /** + * ib_sa_pack_service - Convert a service record from struct ib_sa_service_rec + * to IB MAD wire format. + */ +void ib_sa_pack_service(struct sa_service_rec *rec, void *attribute); + +/** + * ib_sa_unpack_service - Convert a service record from MAD format to struct + * ib_sa_service_rec. 
+ */ +void ib_sa_unpack_service(void *attribute, struct sa_service_rec *rec); + +/** * ib_sa_unpack_path - Convert a path record from MAD format to struct * ib_sa_path_rec. */ diff --git a/include/rdma/ib_ucaps.h b/include/rdma/ib_ucaps.h new file mode 100644 index 000000000000..d9f96be3a553 --- /dev/null +++ b/include/rdma/ib_ucaps.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef _IB_UCAPS_H_ +#define _IB_UCAPS_H_ + +#define UCAP_ENABLED(ucaps, type) (!!((ucaps) & (1U << (type)))) + +enum rdma_user_cap { + RDMA_UCAP_MLX5_CTRL_LOCAL, + RDMA_UCAP_MLX5_CTRL_OTHER_VHCA, + RDMA_UCAP_MAX +}; + +void ib_cleanup_ucaps(void); +int ib_get_ucaps(int *fds, int fd_count, uint64_t *idx_mask); +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +int ib_create_ucap(enum rdma_user_cap type); +void ib_remove_ucap(enum rdma_user_cap type); +#else +static inline int ib_create_ucap(enum rdma_user_cap type) +{ + return -EOPNOTSUPP; +} +static inline void ib_remove_ucap(enum rdma_user_cap type) {} +#endif /* CONFIG_INFINIBAND_USER_ACCESS */ + +#endif /* _IB_UCAPS_H_ */ diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 7dc7b1cc71b5..2ad52cc1d52b 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -7,13 +7,9 @@ #ifndef IB_UMEM_H #define IB_UMEM_H -#include <linux/list.h> #include <linux/scatterlist.h> -#include <linux/workqueue.h> -#include <rdma/ib_verbs.h> -struct ib_ucontext; -struct ib_umem_odp; +struct ib_device; struct dma_buf_attach_ops; struct ib_umem { @@ -22,6 +18,7 @@ struct ib_umem { u64 iova; size_t length; unsigned long address; + unsigned long dma_attrs; u32 writable : 1; u32 is_odp : 1; u32 is_dmabuf : 1; @@ -36,6 +33,7 @@ struct ib_umem_dmabuf { struct scatterlist *last_sg; unsigned long first_sg_offset; unsigned long last_sg_trim; + void (*pinned_revoke)(void *priv); void *private; u8 pinned : 1; u8 revoked : 1; @@ -52,11 
+50,15 @@ static inline int ib_umem_offset(struct ib_umem *umem) return umem->address & ~PAGE_MASK; } +static inline dma_addr_t ib_umem_start_dma_addr(struct ib_umem *umem) +{ + return sg_dma_address(umem->sgt_append.sgt.sgl) + ib_umem_offset(umem); +} + static inline unsigned long ib_umem_dma_offset(struct ib_umem *umem, unsigned long pgsz) { - return (sg_dma_address(umem->sgt_append.sgt.sgl) + ib_umem_offset(umem)) & - (pgsz - 1); + return ib_umem_start_dma_addr(umem) & (pgsz - 1); } static inline size_t ib_umem_num_dma_blocks(struct ib_umem *umem, @@ -71,37 +73,6 @@ static inline size_t ib_umem_num_pages(struct ib_umem *umem) { return ib_umem_num_dma_blocks(umem, PAGE_SIZE); } - -static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter, - struct ib_umem *umem, - unsigned long pgsz) -{ - __rdma_block_iter_start(biter, umem->sgt_append.sgt.sgl, - umem->sgt_append.sgt.nents, pgsz); - biter->__sg_advance = ib_umem_offset(umem) & ~(pgsz - 1); - biter->__sg_numblocks = ib_umem_num_dma_blocks(umem, pgsz); -} - -static inline bool __rdma_umem_block_iter_next(struct ib_block_iter *biter) -{ - return __rdma_block_iter_next(biter) && biter->__sg_numblocks--; -} - -/** - * rdma_umem_for_each_dma_block - iterate over contiguous DMA blocks of the umem - * @umem: umem to iterate over - * @pgsz: Page size to split the list into - * - * pgsz must be <= PAGE_SIZE or computed by ib_umem_find_best_pgsz(). The - * returned DMA blocks will be aligned to pgsz and span the range: - * ALIGN_DOWN(umem->address, pgsz) to ALIGN(umem->address + umem->length, pgsz) - * - * Performs exactly ib_umem_num_dma_blocks() iterations. 
- */ -#define rdma_umem_for_each_dma_block(umem, biter, pgsz) \ - for (__rdma_umem_block_iter_start(biter, umem, pgsz); \ - __rdma_umem_block_iter_next(biter);) - #ifdef CONFIG_INFINIBAND_USER_MEM struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, @@ -117,7 +88,7 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, * ib_umem_find_best_pgoff - Find best HW page size * * @umem: umem struct - * @pgsz_bitmap bitmap of HW supported page sizes + * @pgsz_bitmap: bitmap of HW supported page sizes * @pgoff_bitmask: Mask of bits that can be represented with an offset * * This is very similar to ib_umem_find_best_pgsz() except instead of accepting @@ -130,19 +101,35 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, * * If the pgoff_bitmask requires either alignment in the low bit or an * unavailable page size for the high bits, this function returns 0. + * + * Returns: best HW page size for the parameters or 0 if none available + * for the given parameters. */ static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem, unsigned long pgsz_bitmap, u64 pgoff_bitmask) { - struct scatterlist *sg = umem->sgt_append.sgt.sgl; dma_addr_t dma_addr; - dma_addr = sg_dma_address(sg) + (umem->address & ~PAGE_MASK); + dma_addr = ib_umem_start_dma_addr(umem); return ib_umem_find_best_pgsz(umem, pgsz_bitmap, dma_addr & pgoff_bitmask); } +static inline bool ib_umem_is_contiguous(struct ib_umem *umem) +{ + dma_addr_t dma_addr; + unsigned long pgsz; + + /* + * Select the smallest aligned page that can contain the whole umem if + * it was contiguous. 
+ */ + dma_addr = ib_umem_start_dma_addr(umem); + pgsz = roundup_pow_of_two((dma_addr ^ (umem->length - 1 + dma_addr)) + 1); + return !!ib_umem_find_best_pgoff(umem, pgsz, U64_MAX); +} + struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, unsigned long offset, size_t size, int fd, int access, @@ -152,6 +139,12 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, size_t size, int fd, int access); struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_revocable_and_lock(struct ib_device *device, + unsigned long offset, size_t size, + int fd, int access); +void ib_umem_dmabuf_set_revoke_locked(struct ib_umem_dmabuf *umem_dmabuf, + void (*revoke)(void *priv), void *priv); +struct ib_umem_dmabuf * ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, struct device *dma_device, unsigned long offset, size_t size, @@ -159,6 +152,8 @@ ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf); +void ib_umem_dmabuf_revoke_lock(struct ib_umem_dmabuf *umem_dmabuf); +void ib_umem_dmabuf_revoke_unlock(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf); #else /* CONFIG_INFINIBAND_USER_MEM */ @@ -205,6 +200,18 @@ ib_umem_dmabuf_get_pinned(struct ib_device *device, unsigned long offset, } static inline struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_revocable_and_lock(struct ib_device *device, + unsigned long offset, size_t size, + int fd, int access) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void +ib_umem_dmabuf_set_revoke_locked(struct ib_umem_dmabuf *umem_dmabuf, + void (*revoke)(void *priv), void *priv) {} + +static inline struct ib_umem_dmabuf * ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, struct device *dma_device, unsigned long offset, 
size_t size, @@ -219,6 +226,8 @@ static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) } static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) { } static inline void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) { } +static inline void ib_umem_dmabuf_revoke_lock(struct ib_umem_dmabuf *umem_dmabuf) {} +static inline void ib_umem_dmabuf_revoke_unlock(struct ib_umem_dmabuf *umem_dmabuf) {} static inline void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf) {} #endif /* CONFIG_INFINIBAND_USER_MEM */ diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 0844c1d05ac6..2a24bf791c10 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -8,23 +8,17 @@ #include <rdma/ib_umem.h> #include <rdma/ib_verbs.h> +#include <linux/hmm-dma.h> struct ib_umem_odp { struct ib_umem umem; struct mmu_interval_notifier notifier; struct pid *tgid; - /* An array of the pfns included in the on-demand paging umem. */ - unsigned long *pfn_list; + struct hmm_dma_map map; /* - * An array with DMA addresses mapped for pfns in pfn_list. - * The lower two bits designate access permissions. - * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT. - */ - dma_addr_t *dma_list; - /* - * The umem_mutex protects the page_list and dma_list fields of an ODP + * The umem_mutex protects the page_list field of an ODP * umem, allowing only a single thread to map/unmap pages. The mutex * also protects access to the mmu notifier counters. */ @@ -67,19 +61,6 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) umem_odp->page_shift; } -/* - * The lower 2 bits of the DMA address signal the R/W permissions for - * the entry. To upgrade the permissions, provide the appropriate - * bitmask to the map_dma_pages function. - * - * Be aware that upgrading a mapped address might result in change of - * the DMA address for the page. 
- */ -#define ODP_READ_ALLOWED_BIT (1<<0ULL) -#define ODP_WRITE_ALLOWED_BIT (1<<1ULL) - -#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) - #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_umem_odp * diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3417636da960..9dd76f489a0b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -15,6 +15,7 @@ #include <linux/ethtool.h> #include <linux/types.h> #include <linux/device.h> +#include <linux/bvec.h> #include <linux/dma-mapping.h> #include <linux/kref.h> #include <linux/list.h> @@ -42,6 +43,9 @@ #include <rdma/signature.h> #include <uapi/rdma/rdma_user_ioctl.h> #include <uapi/rdma/ib_user_ioctl_verbs.h> +#include <linux/pci-tph.h> +#include <rdma/frmr_pools.h> +#include <linux/dma-buf.h> #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN @@ -59,9 +63,6 @@ extern struct workqueue_struct *ib_comp_unbound_wq; struct ib_ucq_object; -__printf(3, 4) __cold -void ibdev_printk(const char *level, const struct ib_device *ibdev, - const char *format, ...); __printf(2, 3) __cold void ibdev_emerg(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold @@ -317,17 +318,19 @@ enum ib_atomic_cap { }; enum ib_odp_general_cap_bits { - IB_ODP_SUPPORT = 1 << 0, - IB_ODP_SUPPORT_IMPLICIT = 1 << 1, + IB_ODP_SUPPORT = IB_UVERBS_ODP_SUPPORT, + IB_ODP_SUPPORT_IMPLICIT = IB_UVERBS_ODP_SUPPORT_IMPLICIT, }; enum ib_odp_transport_cap_bits { - IB_ODP_SUPPORT_SEND = 1 << 0, - IB_ODP_SUPPORT_RECV = 1 << 1, - IB_ODP_SUPPORT_WRITE = 1 << 2, - IB_ODP_SUPPORT_READ = 1 << 3, - IB_ODP_SUPPORT_ATOMIC = 1 << 4, - IB_ODP_SUPPORT_SRQ_RECV = 1 << 5, + IB_ODP_SUPPORT_SEND = IB_UVERBS_ODP_SUPPORT_SEND, + IB_ODP_SUPPORT_RECV = IB_UVERBS_ODP_SUPPORT_RECV, + IB_ODP_SUPPORT_WRITE = IB_UVERBS_ODP_SUPPORT_WRITE, + IB_ODP_SUPPORT_READ = IB_UVERBS_ODP_SUPPORT_READ, + IB_ODP_SUPPORT_ATOMIC = IB_UVERBS_ODP_SUPPORT_ATOMIC, + IB_ODP_SUPPORT_SRQ_RECV = IB_UVERBS_ODP_SUPPORT_SRQ_RECV, + 
IB_ODP_SUPPORT_FLUSH = IB_UVERBS_ODP_SUPPORT_FLUSH, + IB_ODP_SUPPORT_ATOMIC_WRITE = IB_UVERBS_ODP_SUPPORT_ATOMIC_WRITE, }; struct ib_odp_caps { @@ -522,6 +525,23 @@ enum ib_port_state { IB_PORT_ACTIVE_DEFER = 5 }; +static inline const char *__attribute_const__ +ib_port_state_to_str(enum ib_port_state state) +{ + const char * const states[] = { + [IB_PORT_NOP] = "NOP", + [IB_PORT_DOWN] = "DOWN", + [IB_PORT_INIT] = "INIT", + [IB_PORT_ARMED] = "ARMED", + [IB_PORT_ACTIVE] = "ACTIVE", + [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER", + }; + + if (state < ARRAY_SIZE(states)) + return states[state]; + return "UNKNOWN"; +} + enum ib_port_phys_state { IB_PORT_PHYS_STATE_SLEEP = 1, IB_PORT_PHYS_STATE_POLLING = 2, @@ -569,10 +589,10 @@ enum ib_stat_flag { }; /** - * struct rdma_stat_desc - * @name - The name of the counter - * @flags - Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL - * @priv - Driver private information; Core code should not use + * struct rdma_stat_desc - description of one rdma stat/counter + * @name: The name of the counter + * @flags: Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL + * @priv: Driver private information; Core code should not use */ struct rdma_stat_desc { const char *name; @@ -581,24 +601,24 @@ struct rdma_stat_desc { }; /** - * struct rdma_hw_stats - * @lock - Mutex to protect parallel write access to lifespan and values + * struct rdma_hw_stats - collection of hardware stats and their management + * @lock: Mutex to protect parallel write access to lifespan and values * of counters, which are 64bits and not guaranteed to be written * atomicaly on 32bits systems. - * @timestamp - Used by the core code to track when the last update was - * @lifespan - Used by the core code to determine how old the counters + * @timestamp: Used by the core code to track when the last update was + * @lifespan: Used by the core code to determine how old the counters * should be before being updated again. 
Stored in jiffies, defaults * to 10 milliseconds, drivers can override the default be specifying * their own value during their allocation routine. - * @descs - Array of pointers to static descriptors used for the counters + * @descs: Array of pointers to static descriptors used for the counters * in directory. - * @is_disabled - A bitmap to indicate each counter is currently disabled + * @is_disabled: A bitmap to indicate each counter is currently disabled * or not. - * @num_counters - How many hardware counters there are. If name is + * @num_counters: How many hardware counters there are. If name is * shorter than this number, a kernel oops will result. Driver authors * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters) * in their code to prevent this. - * @value - Array of u64 counters that are accessed by the sysfs code and + * @value: Array of u64 counters that are accessed by the sysfs code and * filled in by the drivers get_stats routine */ struct rdma_hw_stats { @@ -747,6 +767,7 @@ enum ib_event_type { IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, IB_EVENT_WQ_FATAL, + IB_EVENT_DEVICE_SPEED_CHANGE, }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event); @@ -842,6 +863,7 @@ enum ib_rate { IB_RATE_400_GBPS = 21, IB_RATE_600_GBPS = 22, IB_RATE_800_GBPS = 23, + IB_RATE_1600_GBPS = 25, }; /** @@ -859,6 +881,20 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate); */ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); +struct ib_port_speed_info { + const char *str; + int rate; /* in deci-Gb/sec (100 MBps units) */ +}; + +/** + * ib_port_attr_to_speed_info - Convert port attributes to speed information + * @attr: Port attributes containing active_speed and active_width + * @speed_info: Speed information to return + * + * Returns 0 on success, -EINVAL on error. 
+ */ +int ib_port_attr_to_speed_info(struct ib_port_attr *attr, + struct ib_port_speed_info *speed_info); /** * enum ib_mr_type - memory region type @@ -1516,6 +1552,7 @@ struct ib_ucontext { struct ib_uverbs_file *ufile; struct ib_rdmacg_object cg_obj; + u64 enabled_caps; /* * Implementation details of the RDMA core, don't use in drivers: */ @@ -1540,6 +1577,93 @@ struct ib_uobject { const struct uverbs_api_object *uapi_object; }; +/** + * struct ib_udata - Driver request/response data from userspace + * @inbuf: Pointer to request data from userspace + * @outbuf: Pointer to response buffer in userspace + * @inlen: Length of request data + * @outlen: Length of response buffer + * + * struct ib_udata is used to hold the driver data request and response + * structures defined in the uapi. They follow these rules for forwards and + * backwards compatibility: + * + * 1) Userspace can provide a longer request so long as the trailing part the + * kernel doesn't understand is all zeros. + * + * This provides a degree of safety if userspace wrongly tries to use a new + * feature the kernel does not understand with some non-zero value. + * + * It allows a simpler rdma-core implementation because the library can + * simply always use the latest structs for the request, even if they are + * bigger. It simply has to avoid using the new members if they are not + * supported/required. + * + * 2) Userspace can provide a shorter request; the kernel will zero-pad it out + * to fill the storage. The newer kernel should understand that older + * userspace will provide 0 to new fields. The kernel has three options to + * enable new request fields: + * + * - Input comp_mask that says the field is supported + * - Look for non-zero values + * - Check if the udata->inlen size covers the field + * + * This also corrects any bugs related to not filling in request structures + * as the new helper always fully writes to the struct. 
+ * + * 3) Userspace can provide a shorter or longer response struct. If shorter, + * the kernel reply is truncated. The kernel should be designed to not write + * to new reply fields unless userspace has affirmatively requested them. + * + * If the user buffer is longer, the kernel will zero-fill it. + * + * Userspace has three options to enable new response fields: + * + * - Output comp_mask that says the field is supported + * - Look for non-zero values + * - Infer the output must be valid because the request contents demand it + * and old kernels will fail the request + * + * The following helper functions implement these semantics: + * + * ib_copy_validate_udata_in() - Checks the minimum length, and zero trailing:: + * + * struct driver_create_cq_req req; + * int err; + * + * err = ib_copy_validate_udata_in(udata, req, end_member); + * if (err) + * return err; + * + * The third argument specifies the last member of the struct in the first + * kernel version that introduced it, establishing the minimum required size. 
+ * + * ib_copy_validate_udata_in_cm() - The above but also validate a + * comp_mask member only has supported bits set:: + * + * err = ib_copy_validate_udata_in_cm(udata, req, first_version_last_member, + * DRIVER_CREATE_CQ_MASK_FEATURE_A | + * DRIVER_CREATE_CQ_MASK_FEATURE_B); + * + * ib_respond_udata() - Implements the response rules:: + * + * struct driver_create_cq_resp resp = {}; + * + * resp.some_field = value; + * return ib_respond_udata(udata, resp); + * + * ib_is_udata_in_empty() - Used instead of ib_copy_validate_udata_in() if the + * driver does not have a request structure:: + * + * ret = ib_is_udata_in_empty(udata); + * if (ret) + * return ret; + * + * Similarly ib_respond_empty_udata() is used instead of ib_respond_udata() if + * the driver does not have a response structure:: + * + * return ib_respond_empty_udata(udata); + */ struct ib_udata { const void __user *inbuf; void __user *outbuf; @@ -1614,6 +1738,7 @@ struct ib_cq { u8 interrupt:1; u8 shared:1; unsigned int comp_vector; + struct ib_umem *umem; /* * Implementation details of the RDMA core, don't use in drivers: @@ -1829,6 +1954,27 @@ struct ib_dm { atomic_t usecnt; }; +/* bit values to mark existence of ib_dmah fields */ +enum { + IB_DMAH_CPU_ID_EXISTS, + IB_DMAH_MEM_TYPE_EXISTS, + IB_DMAH_PH_EXISTS, +}; + +struct ib_dmah { + struct ib_device *device; + struct ib_uobject *uobject; + /* + * Implementation details of the RDMA core, don't use in drivers: + */ + struct rdma_restrack_entry res; + u32 cpu_id; + enum tph_mem_type mem_type; + atomic_t usecnt; + u8 ph; + u8 valid_fields; /* use IB_DMAH_XXX_EXISTS */ +}; + struct ib_mr { struct ib_device *device; struct ib_pd *pd; @@ -1846,6 +1992,12 @@ struct ib_mr { struct ib_dm *dm; struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */ + struct ib_dmah *dmah; + struct { + struct ib_frmr_pool *pool; + struct ib_frmr_key key; + u32 handle; + } frmr; /* * Implementation details of the RDMA core, don't use in drivers: */ @@ -2177,6 
+2329,7 @@ struct ib_port_cache { struct ib_gid_table *gid; u8 lmc; enum ib_port_state port_state; + enum ib_port_state last_port_state; }; struct ib_port_immutable { @@ -2208,7 +2361,6 @@ struct ib_port_data { /* rdma netdev type - specifies protocol type */ enum rdma_netdev_t { - RDMA_NETDEV_OPA_VNIC, RDMA_NETDEV_IPOIB, }; @@ -2222,11 +2374,6 @@ struct rdma_netdev { u32 port_num; int mtu; - /* - * cleanup function must be specified. - * FIXME: This is only used for OPA_VNIC and that usage should be - * removed too. - */ void (*free_rdma_netdev)(struct net_device *netdev); /* control functions */ @@ -2256,7 +2403,9 @@ struct rdma_netdev_alloc_params { struct ib_odp_counters { atomic64_t faults; + atomic64_t faults_handled; atomic64_t invalidations; + atomic64_t invalidations_handled; atomic64_t prefetch; }; @@ -2304,6 +2453,9 @@ struct rdma_user_mmap_entry { unsigned long start_pgoff; size_t npages; bool driver_removed; + /* protects access to dmabufs */ + struct mutex dmabufs_lock; + struct list_head dmabufs; }; /* Return the offset (in bytes) the user should pass to libc's mmap() */ @@ -2323,6 +2475,12 @@ struct ib_device_ops { enum rdma_driver_id driver_id; u32 uverbs_abi_ver; unsigned int uverbs_no_driver_id_binding:1; + /* + * Indicates the driver checks every op accepting a udata for the + * correct size on input and always handles the output using the udata + * helpers. 
+ */ + unsigned int uverbs_robust_udata:1; /* * NOTE: New drivers should not make use of device_group; instead new @@ -2355,14 +2513,14 @@ struct ib_device_ops { int (*modify_device)(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); void (*get_dev_fw_str)(struct ib_device *device, char *str); - const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, - int comp_vector); int (*query_port)(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); + int (*query_port_speed)(struct ib_device *device, u32 port_num, + u64 *speed); int (*modify_port)(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); - /** + /* * The following mandatory functions are used only at device * registration. Keep functions such as these at the end of this * structure to avoid cache line misses when accessing struct ib_device @@ -2372,7 +2530,7 @@ struct ib_device_ops { struct ib_port_immutable *immutable); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, u32 port_num); - /** + /* * When calling get_netdev, the HW vendor's driver should return the * net device of device @device at port @port_num or NULL if such * a net device doesn't exist. The vendor driver should call dev_hold @@ -2382,7 +2540,7 @@ struct ib_device_ops { */ struct net_device *(*get_netdev)(struct ib_device *device, u32 port_num); - /** + /* * rdma netdev operation * * Driver implementing alloc_rdma_netdev or rdma_netdev_get_params @@ -2396,14 +2554,14 @@ struct ib_device_ops { int (*rdma_netdev_get_params)(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, struct rdma_netdev_alloc_params *params); - /** + /* * query_gid should be return GID value for @device, when @port_num * link layer is either IB or iWarp. It is no-op if @port_num port * is RoCE link layer. 
*/ int (*query_gid)(struct ib_device *device, u32 port_num, int index, union ib_gid *gid); - /** + /* * When calling add_gid, the HW vendor's driver should add the gid * of device of port at gid index available at @attr. Meta-info of * that gid (for example, the network device related to this gid) is @@ -2417,7 +2575,7 @@ struct ib_device_ops { * roce_gid_table is used. */ int (*add_gid)(const struct ib_gid_attr *attr, void **context); - /** + /* * When calling del_gid, the HW vendor's driver should delete the * gid of device @device at gid index gid_index of port port_num * available in @attr. @@ -2432,13 +2590,18 @@ struct ib_device_ops { struct ib_udata *udata); void (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); - /** + /* * This will be called once refcount of an entry in mmap_xa reaches * zero. The type of the memory that was mapped may differ between * entries and is opaque to the rdma_user_mmap interface. * Therefore needs to be implemented by the driver in mmap_free. 
*/ void (*mmap_free)(struct rdma_user_mmap_entry *entry); + int (*mmap_get_pfns)(struct rdma_user_mmap_entry *entry, + struct phys_vec *phys_vec, + struct p2pdma_provider **provider); + struct rdma_user_mmap_entry *(*pgoff_to_mmap_entry)(struct ib_ucontext *ucontext, + off_t pg_off); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata); int (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata); @@ -2466,16 +2629,31 @@ struct ib_device_ops { int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata); int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs); + int (*create_user_cq)(struct ib_cq *cq, + const struct ib_cq_init_attr *attr, + struct uverbs_attr_bundle *attrs); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); - int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); + int (*resize_user_cq)(struct ib_cq *cq, unsigned int cqe, + struct ib_udata *udata); + /* + * pre_destroy_cq - Prevent a cq from generating any new work + * completions, but not free any kernel resources + */ + int (*pre_destroy_cq)(struct ib_cq *cq); + /* + * post_destroy_cq - Free all kernel resources + */ + void (*post_destroy_cq)(struct ib_cq *cq); struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, + struct ib_dmah *dmah, struct ib_udata *udata); struct ib_mr *(*reg_user_mr_dmabuf)(struct ib_pd *pd, u64 offset, u64 length, u64 virt_addr, int fd, int mr_access_flags, + struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); struct ib_mr *(*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, @@ -2540,6 +2718,9 @@ struct ib_device_ops { struct ib_dm_alloc_attr *attr, struct uverbs_attr_bundle *attrs); int (*dealloc_dm)(struct ib_dm 
*dm, struct uverbs_attr_bundle *attrs); + int (*alloc_dmah)(struct ib_dmah *ibdmah, + struct uverbs_attr_bundle *attrs); + int (*dealloc_dmah)(struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); struct ib_mr *(*reg_dm_mr)(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, struct uverbs_attr_bundle *attrs); @@ -2554,7 +2735,7 @@ struct ib_device_ops { struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset); - /** + /* * alloc_hw_[device,port]_stats - Allocate a struct rdma_hw_stats and * fill in the driver initialized data. The struct is kfree()'ed by * the sysfs core when the device is removed. A lifespan of -1 in the @@ -2563,7 +2744,7 @@ struct ib_device_ops { struct rdma_hw_stats *(*alloc_hw_device_stats)(struct ib_device *device); struct rdma_hw_stats *(*alloc_hw_port_stats)(struct ib_device *device, u32 port_num); - /** + /* * get_hw_stats - Fill in the counter value(s) in the stats struct. * @index - The index in the value array we wish to have updated, or * num_counters if we want all stats updated @@ -2578,14 +2759,14 @@ struct ib_device_ops { int (*get_hw_stats)(struct ib_device *device, struct rdma_hw_stats *stats, u32 port, int index); - /** + /* * modify_hw_stat - Modify the counter configuration * @enable: true/false when enable/disable a counter * Return codes - 0 on success or error code otherwise. */ int (*modify_hw_stat)(struct ib_device *device, u32 port, unsigned int counter_index, bool enable); - /** + /* * Allows rdma drivers to add their own restrack attributes. */ int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); @@ -2621,33 +2802,39 @@ struct ib_device_ops { u8 pdata_len); int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog); int (*iw_destroy_listen)(struct iw_cm_id *cm_id); - /** + /* * counter_bind_qp - Bind a QP to a counter. * @counter - The counter to be bound. 
If counter->id is zero then * the driver needs to allocate a new counter and set counter->id */ - int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp); - /** + int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp, + u32 port); + /* * counter_unbind_qp - Unbind the qp from the dynamically-allocated * counter and bind it onto the default one */ - int (*counter_unbind_qp)(struct ib_qp *qp); - /** + int (*counter_unbind_qp)(struct ib_qp *qp, u32 port); + /* * counter_dealloc -De-allocate the hw counter */ int (*counter_dealloc)(struct rdma_counter *counter); - /** + /* * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in * the driver initialized data. */ struct rdma_hw_stats *(*counter_alloc_stats)( struct rdma_counter *counter); - /** + /* * counter_update_stats - Query the stats value of this counter */ int (*counter_update_stats)(struct rdma_counter *counter); - /** + /* + * counter_init - Initialize the driver specific rdma counter struct. + */ + void (*counter_init)(struct rdma_counter *counter); + + /* * Allows rdma drivers to add their own restrack attributes * dumped via 'rdma stat' iproute2 command. */ @@ -2663,27 +2850,35 @@ struct ib_device_ops { */ int (*get_numa_node)(struct ib_device *dev); - /** + /* * add_sub_dev - Add a sub IB device */ struct ib_device *(*add_sub_dev)(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name); - /** + /* * del_sub_dev - Delete a sub IB device */ void (*del_sub_dev)(struct ib_device *sub_dev); - /** + /* * ufile_cleanup - Attempt to cleanup ubojects HW resources inside * the ufile. */ void (*ufile_hw_cleanup)(struct ib_uverbs_file *ufile); + /* + * report_port_event - Drivers need to implement this if they have + * some private stuff to handle when link status changes. 
+ */ + void (*report_port_event)(struct ib_device *ibdev, + struct net_device *ndev, unsigned long event); + DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_counters); DECLARE_RDMA_OBJ_SIZE(ib_cq); + DECLARE_RDMA_OBJ_SIZE(ib_dmah); DECLARE_RDMA_OBJ_SIZE(ib_mw); DECLARE_RDMA_OBJ_SIZE(ib_pd); DECLARE_RDMA_OBJ_SIZE(ib_qp); @@ -2691,6 +2886,7 @@ struct ib_device_ops { DECLARE_RDMA_OBJ_SIZE(ib_srq); DECLARE_RDMA_OBJ_SIZE(ib_ucontext); DECLARE_RDMA_OBJ_SIZE(ib_xrcd); + DECLARE_RDMA_OBJ_SIZE(rdma_counter); }; struct ib_core_device { @@ -2743,6 +2939,7 @@ struct ib_device { * It is a NULL terminated array. */ const struct attribute_group *groups[4]; + u8 hw_stats_attr_index; u64 uverbs_cmd_mask; @@ -2802,6 +2999,8 @@ struct ib_device { struct list_head subdev_list; enum rdma_nl_name_assign_type name_assign_type; + + struct ib_frmr_pools *frmr_pools; }; static inline void *rdma_zalloc_obj(struct ib_device *dev, size_t size, @@ -2854,27 +3053,18 @@ struct ib_client { u8 no_kverbs_req:1; }; -/* - * IB block DMA iterator - * - * Iterates the DMA-mapped SGL in contiguous memory blocks aligned - * to a HW supported page size. 
- */ -struct ib_block_iter { - /* internal states */ - struct scatterlist *__sg; /* sg holding the current aligned block */ - dma_addr_t __dma_addr; /* unaligned DMA address of this block */ - size_t __sg_numblocks; /* ib_umem_num_dma_blocks() */ - unsigned int __sg_nents; /* number of SG entries */ - unsigned int __sg_advance; /* number of bytes to advance in sg in next step */ - unsigned int __pg_bit; /* alignment of current block */ -}; - -struct ib_device *_ib_alloc_device(size_t size); +struct ib_device *_ib_alloc_device(size_t size, struct net *net); #define ib_alloc_device(drv_struct, member) \ container_of(_ib_alloc_device(sizeof(struct drv_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ - struct drv_struct, member))), \ + struct drv_struct, member)), \ + &init_net), \ + struct drv_struct, member) + +#define ib_alloc_device_with_net(drv_struct, member, net) \ + container_of(_ib_alloc_device(sizeof(struct drv_struct) + \ + BUILD_BUG_ON_ZERO(offsetof( \ + struct drv_struct, member)), net), \ struct drv_struct, member) void ib_dealloc_device(struct ib_device *device); @@ -2891,38 +3081,6 @@ void ib_unregister_device_queued(struct ib_device *ib_dev); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); -void __rdma_block_iter_start(struct ib_block_iter *biter, - struct scatterlist *sglist, - unsigned int nents, - unsigned long pgsz); -bool __rdma_block_iter_next(struct ib_block_iter *biter); - -/** - * rdma_block_iter_dma_address - get the aligned dma address of the current - * block held by the block iterator. 
- * @biter: block iterator holding the memory block - */ -static inline dma_addr_t -rdma_block_iter_dma_address(struct ib_block_iter *biter) -{ - return biter->__dma_addr & ~(BIT_ULL(biter->__pg_bit) - 1); -} - -/** - * rdma_for_each_block - iterate over contiguous memory blocks of the sg list - * @sglist: sglist to iterate over - * @biter: block iterator holding the memory block - * @nents: maximum number of sg entries to iterate over - * @pgsz: best HW supported page size to use - * - * Callers may use rdma_block_iter_dma_address() to get each - * blocks aligned DMA address. - */ -#define rdma_for_each_block(sglist, biter, nents, pgsz) \ - for (__rdma_block_iter_start(biter, sglist, nents, \ - pgsz); \ - __rdma_block_iter_next(biter);) - /** * ib_get_client_data - Get IB client context * @device:Device to get context for @@ -3073,8 +3231,8 @@ static inline u32 rdma_start_port(const struct ib_device *device) /** * rdma_for_each_port - Iterate over all valid port numbers of the IB device - * @device - The struct ib_device * to iterate over - * @iter - The unsigned int to store the port number + * @device: The struct ib_device * to iterate over + * @iter: The unsigned int to store the port number */ #define rdma_for_each_port(device, iter) \ for (iter = rdma_start_port(device + \ @@ -3440,7 +3598,7 @@ static inline bool rdma_core_cap_opa_port(struct ib_device *device, /** * rdma_mtu_enum_to_int - Return the mtu of the port as an integer value. * @device: Device - * @port_num: Port number + * @port: Port number * @mtu: enum value of MTU * * Return the MTU size supported by the port as an integer value. Will return @@ -3458,7 +3616,7 @@ static inline int rdma_mtu_enum_to_int(struct ib_device *device, u32 port, /** * rdma_mtu_from_attr - Return the mtu of the port from the port attribute. * @device: Device - * @port_num: Port number + * @port: Port number * @attr: port attribute * * Return the MTU size supported by the port as an integer value. 
@@ -3835,7 +3993,7 @@ static inline int ib_destroy_qp(struct ib_qp *qp) /** * ib_open_qp - Obtain a reference to an existing sharable QP. - * @xrcd - XRC domain + * @xrcd: XRC domain * @qp_open_attr: Attributes identifying the QP to open. * * Returns a reference to a sharable QP. @@ -3947,15 +4105,6 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, __ib_create_cq((device), (cmp_hndlr), (evt_hndlr), (cq_ctxt), (cq_attr), KBUILD_MODNAME) /** - * ib_resize_cq - Modifies the capacity of the CQ. - * @cq: The CQ to resize. - * @cqe: The minimum size of the CQ. - * - * Users can examine the cq structure to determine the actual CQ size. - */ -int ib_resize_cq(struct ib_cq *cq, int cqe); - -/** * rdma_set_cq_moderation - Modifies moderation params of the CQ * @cq: The CQ to modify. * @cq_count: number of CQEs that will trigger an event @@ -4164,6 +4313,47 @@ static inline void ib_dma_unmap_page(struct ib_device *dev, dma_unmap_page(dev->dma_device, addr, size, direction); } +/** + * ib_dma_map_bvec - Map a bio_vec to DMA address + * @dev: The device for which the dma_addr is to be created + * @bvec: The bio_vec to map + * @direction: The direction of the DMA + * + * Returns a DMA address for the bio_vec. The caller must check the + * result with ib_dma_mapping_error() before use; a failed mapping + * must not be passed to ib_dma_unmap_bvec(). + * + * For software RDMA devices (rxe, siw), returns a virtual address + * and no actual DMA mapping occurs. 
+ */ +static inline u64 ib_dma_map_bvec(struct ib_device *dev, + struct bio_vec *bvec, + enum dma_data_direction direction) +{ + if (ib_uses_virt_dma(dev)) + return (uintptr_t)bvec_virt(bvec); + return dma_map_phys(dev->dma_device, bvec_phys(bvec), + bvec->bv_len, direction, 0); +} + +/** + * ib_dma_unmap_bvec - Unmap a bio_vec DMA mapping + * @dev: The device for which the DMA address was created + * @addr: The DMA address returned by ib_dma_map_bvec() + * @size: The size of the region in bytes + * @direction: The direction of the DMA + * + * Releases a DMA mapping created by ib_dma_map_bvec(). For software + * RDMA devices this is a no-op since no actual mapping occurred. + */ +static inline void ib_dma_unmap_bvec(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + if (!ib_uses_virt_dma(dev)) + dma_unmap_phys(dev->dma_device, addr, size, direction, 0); +} + int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents); static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, @@ -4189,9 +4379,9 @@ static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, /** * ib_dma_map_sgtable_attrs - Map a scatter/gather table to DMA addresses * @dev: The device for which the DMA addresses are to be created - * @sg: The sg_table object describing the buffer + * @sgt: The sg_table object describing the buffer * @direction: The direction of the DMA - * @attrs: Optional DMA attributes for the map operation + * @dma_attrs: Optional DMA attributes for the map operation */ static inline int ib_dma_map_sgtable_attrs(struct ib_device *dev, struct sg_table *sgt, @@ -4335,8 +4525,8 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, /** * ib_update_fast_reg_key - updates the key portion of the fast_reg MR * R_Key and L_Key. - * @mr - struct ib_mr pointer to be updated. - * @newkey - new key to be used. + * @mr: struct ib_mr pointer to be updated. + * @newkey: new key to be used. 
*/ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) { @@ -4347,7 +4537,7 @@ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) /** * ib_inc_rkey - increments the key portion of the given rkey. Can be used * for calculating a new rkey for type 2 memory windows. - * @rkey - the rkey to increment. + * @rkey: the rkey to increment. */ static inline u32 ib_inc_rkey(u32 rkey) { @@ -4441,7 +4631,7 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, /** * ib_device_try_get: Hold a registration lock - * device: The device to lock + * @dev: The device to lock * * A device under an active registration lock cannot become unregistered. It * is only possible to obtain a registration lock on a device that is fully @@ -4460,8 +4650,6 @@ static inline bool ib_device_try_get(struct ib_device *dev) void ib_device_put(struct ib_device *device); struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id); -struct ib_device *ib_device_get_by_name(const char *name, - enum rdma_driver_id driver_id); struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); @@ -4469,6 +4657,17 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, unsigned int port); struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port); +int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, + u32 *port); + +static inline enum ib_port_state ib_get_curr_port_state(struct net_device *net_dev) +{ + return (netif_running(net_dev) && netif_carrier_ok(net_dev)) ? 
+ IB_PORT_ACTIVE : IB_PORT_DOWN; +} + +void ib_dispatch_port_state_event(struct ib_device *ibdev, + struct net_device *ndev); struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata); @@ -4713,31 +4912,10 @@ static inline __be16 ib_lid_be16(u32 lid) } /** - * ib_get_vector_affinity - Get the affinity mappings of a given completion - * vector - * @device: the rdma device - * @comp_vector: index of completion vector - * - * Returns NULL on failure, otherwise a corresponding cpu map of the - * completion vector (returns all-cpus map if the device driver doesn't - * implement get_vector_affinity). - */ -static inline const struct cpumask * -ib_get_vector_affinity(struct ib_device *device, int comp_vector) -{ - if (comp_vector < 0 || comp_vector >= device->num_comp_vectors || - !device->ops.get_vector_affinity) - return NULL; - - return device->ops.get_vector_affinity(device, comp_vector); - -} - -/** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. 
* - * @device: the rdma device + * @ibdev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ibdev); void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port); @@ -4746,7 +4924,20 @@ void roce_del_all_netdev_gids(struct ib_device *ib_dev, struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile); +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs); +bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs); +#else +static inline int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs) +{ + return 0; +} +static inline bool +rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs) +{ + return false; +} +#endif struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, @@ -4777,7 +4968,7 @@ static inline struct ib_device *rdma_device_to_ibdev(struct device *device) /** * ibdev_to_node - return the NUMA node for a given ib_device - * @dev: device to get the NUMA node for. + * @ibdev: device to get the NUMA node for. */ static inline int ibdev_to_node(struct ib_device *ibdev) { @@ -4802,6 +4993,12 @@ static inline int ibdev_to_node(struct ib_device *ibdev) bool rdma_dev_access_netns(const struct ib_device *device, const struct net *net); +bool rdma_dev_has_raw_cap(const struct ib_device *dev); +static inline struct net *rdma_dev_net(struct ib_device *device) +{ + return read_pnet(&device->coredev.rdma_net); +} + #define IB_ROCE_UDP_ENCAP_VALID_PORT_MIN (0xC000) #define IB_ROCE_UDP_ENCAP_VALID_PORT_MAX (0xFFFF) #define IB_GRH_FLOWLABEL_MASK (0x000FFFFF) @@ -4809,6 +5006,7 @@ bool rdma_dev_access_netns(const struct ib_device *device, /** * rdma_flow_label_to_udp_sport - generate a RoCE v2 UDP src port value based * on the flow_label + * @fl: flow_label value * * This function will convert the 20 bit flow_label input to a valid RoCE v2 * UDP src port 14 bit value. 
All RoCE V2 drivers should use this same diff --git a/include/rdma/iter.h b/include/rdma/iter.h new file mode 100644 index 000000000000..19d64ef04ba9 --- /dev/null +++ b/include/rdma/iter.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. */ + +#ifndef _RDMA_ITER_H_ +#define _RDMA_ITER_H_ + +#include <linux/scatterlist.h> +#include <rdma/ib_umem.h> + +/** + * IB block DMA iterator + * + * Iterates the DMA-mapped SGL in contiguous memory blocks aligned + * to a HW supported page size. + */ +struct ib_block_iter { + /* internal states */ + struct scatterlist *__sg; /* sg holding the current aligned block */ + dma_addr_t __dma_addr; /* unaligned DMA address of this block */ + size_t __sg_numblocks; /* ib_umem_num_dma_blocks() */ + unsigned int __sg_nents; /* number of SG entries */ + unsigned int __sg_advance; /* number of bytes to advance in sg in next step */ + unsigned int __pg_bit; /* alignment of current block */ +}; + +void __rdma_block_iter_start(struct ib_block_iter *biter, + struct scatterlist *sglist, + unsigned int nents, + unsigned long pgsz); +bool __rdma_block_iter_next(struct ib_block_iter *biter); + +/** + * rdma_block_iter_dma_address - get the aligned dma address of the current + * block held by the block iterator. + * @biter: block iterator holding the memory block + */ +static inline dma_addr_t +rdma_block_iter_dma_address(struct ib_block_iter *biter) +{ + return biter->__dma_addr & ~(BIT_ULL(biter->__pg_bit) - 1); +} + +/** + * rdma_for_each_block - iterate over contiguous memory blocks of the sg list + * @sglist: sglist to iterate over + * @biter: block iterator holding the memory block + * @nents: maximum number of sg entries to iterate over + * @pgsz: best HW supported page size to use + * + * Callers may use rdma_block_iter_dma_address() to get each + * blocks aligned DMA address. 
+ */ +#define rdma_for_each_block(sglist, biter, nents, pgsz) \ + for (__rdma_block_iter_start(biter, sglist, nents, \ + pgsz); \ + __rdma_block_iter_next(biter);) + +static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter, + struct ib_umem *umem, + unsigned long pgsz) +{ + __rdma_block_iter_start(biter, umem->sgt_append.sgt.sgl, + umem->sgt_append.sgt.nents, pgsz); + biter->__sg_advance = ib_umem_offset(umem) & ~(pgsz - 1); + biter->__sg_numblocks = ib_umem_num_dma_blocks(umem, pgsz); +} + +static inline bool __rdma_umem_block_iter_next(struct ib_block_iter *biter) +{ + return __rdma_block_iter_next(biter) && biter->__sg_numblocks--; +} + +/** + * rdma_umem_for_each_dma_block - iterate over contiguous DMA blocks of the umem + * @umem: umem to iterate over + * @pgsz: Page size to split the list into + * + * pgsz must be <= PAGE_SIZE or computed by ib_umem_find_best_pgsz(). The + * returned DMA blocks will be aligned to pgsz and span the range: + * ALIGN_DOWN(umem->address, pgsz) to ALIGN(umem->address + umem->length, pgsz) + * + * Performs exactly ib_umem_num_dma_blocks() iterations. + */ +#define rdma_umem_for_each_dma_block(umem, biter, pgsz) \ + for (__rdma_umem_block_iter_start(biter, umem, pgsz); \ + __rdma_umem_block_iter_next(biter);) + +#endif /* _RDMA_ITER_H_ */ diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index 2b22f153ef63..57b33edd9ce7 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -33,8 +33,8 @@ struct iw_cm_event { }; /** - * iw_cm_handler - Function to be called by the IW CM when delivering events - * to the client. + * typedef iw_cm_handler - Function to be called by the IW CM when delivering + * events to the client. * * @cm_id: The IW CM identifier associated with the event. * @event: Pointer to the event structure. 
@@ -43,9 +43,9 @@ typedef int (*iw_cm_handler)(struct iw_cm_id *cm_id, struct iw_cm_event *event); /** - * iw_event_handler - Function called by the provider when delivering provider - * events to the IW CM. Returns either 0 indicating the event was processed - * or -errno if the event could not be processed. + * typedef iw_event_handler - Function called by the provider when delivering + * provider events to the IW CM. Returns either 0 indicating the event was + * processed or -errno if the event could not be processed. * * @cm_id: The IW CM identifier associated with the event. * @event: Pointer to the event structure. @@ -97,7 +97,7 @@ enum iw_flags { * iw_create_cm_id - Create an IW CM identifier. * * @device: The IB device on which to create the IW CM identier. - * @event_handler: User callback invoked to report events associated with the + * @cm_handler: User callback invoked to report events associated with the * returned IW CM identifier. * @context: User specified context associated with the id. */ @@ -147,7 +147,7 @@ int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param); * iw_cm_reject - Reject an incoming connection request. * * @cm_id: Connection identifier associated with the request. - * @private_daa: Pointer to data to deliver to the remote peer as part of the + * @private_data: Pointer to data to deliver to the remote peer as part of the * reject message. * @private_data_len: The number of bytes in the private_data parameter. * diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h index 73bcac90a048..fb66d3a1dfa9 100644 --- a/include/rdma/opa_port_info.h +++ b/include/rdma/opa_port_info.h @@ -93,9 +93,11 @@ #define OPA_LINKINIT_QUARANTINED (9 << 4) #define OPA_LINKINIT_INSUFIC_CAPABILITY (10 << 4) -#define OPA_LINK_SPEED_NOP 0x0000 /* Reserved (1-5 Gbps) */ -#define OPA_LINK_SPEED_12_5G 0x0001 /* 12.5 Gbps */ -#define OPA_LINK_SPEED_25G 0x0002 /* 25.78125? 
Gbps (EDR) */ +#define OPA_LINK_SPEED_NOP 0x0000 /* no change */ +#define OPA_LINK_SPEED_12_5G 0x0001 /* 12.5 Gbps */ +#define OPA_LINK_SPEED_25G 0x0002 /* 25.78125 Gbps */ +#define OPA_LINK_SPEED_50G 0x0004 /* 53.125 Gbps */ +#define OPA_LINK_SPEED_100G 0x0008 /* 106.25 Gbps */ #define OPA_LINK_WIDTH_1X 0x0001 #define OPA_LINK_WIDTH_2X 0x0002 diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h deleted file mode 100644 index d297f084001a..000000000000 --- a/include/rdma/opa_vnic.h +++ /dev/null @@ -1,96 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * Copyright(c) 2017 - 2020 Intel Corporation. - */ - -#ifndef _OPA_VNIC_H -#define _OPA_VNIC_H - -/* - * This file contains Intel Omni-Path (OPA) Virtual Network Interface - * Controller (VNIC) specific declarations. - */ - -#include <rdma/ib_verbs.h> - -/* 16 header bytes + 2 reserved bytes */ -#define OPA_VNIC_L2_HDR_LEN (16 + 2) - -#define OPA_VNIC_L4_HDR_LEN 2 - -#define OPA_VNIC_HDR_LEN (OPA_VNIC_L2_HDR_LEN + \ - OPA_VNIC_L4_HDR_LEN) - -#define OPA_VNIC_L4_ETHR 0x78 - -#define OPA_VNIC_ICRC_LEN 4 -#define OPA_VNIC_TAIL_LEN 1 -#define OPA_VNIC_ICRC_TAIL_LEN (OPA_VNIC_ICRC_LEN + OPA_VNIC_TAIL_LEN) - -#define OPA_VNIC_SKB_MDATA_LEN 4 -#define OPA_VNIC_SKB_MDATA_ENCAP_ERR 0x1 - -/* opa vnic rdma netdev's private data structure */ -struct opa_vnic_rdma_netdev { - struct rdma_netdev rn; /* keep this first */ - /* followed by device private data */ - char *dev_priv[]; -}; - -static inline void *opa_vnic_priv(const struct net_device *dev) -{ - struct rdma_netdev *rn = netdev_priv(dev); - - return rn->clnt_priv; -} - -static inline void *opa_vnic_dev_priv(const struct net_device *dev) -{ - struct opa_vnic_rdma_netdev *oparn = netdev_priv(dev); - - return oparn->dev_priv; -} - -/* opa_vnic skb meta data structure */ -struct opa_vnic_skb_mdata { - u8 vl; - u8 entropy; - u8 flags; - u8 rsvd; -} __packed; - -/* OPA VNIC group statistics */ -struct opa_vnic_grp_stats { - u64 unicast; - u64 
mcastbcast; - u64 untagged; - u64 vlan; - u64 s_64; - u64 s_65_127; - u64 s_128_255; - u64 s_256_511; - u64 s_512_1023; - u64 s_1024_1518; - u64 s_1519_max; -}; - -struct opa_vnic_stats { - /* standard netdev statistics */ - struct rtnl_link_stats64 netstats; - - /* OPA VNIC statistics */ - struct opa_vnic_grp_stats tx_grp; - struct opa_vnic_grp_stats rx_grp; - u64 tx_dlid_zero; - u64 tx_drop_state; - u64 rx_drop_state; - u64 rx_runt; - u64 rx_oversize; -}; - -static inline bool rdma_cap_opa_vnic(struct ib_device *device) -{ - return !!(device->attrs.kernel_cap_flags & IBK_RDMA_NETDEV_OPA); -} - -#endif /* _OPA_VNIC_H */ diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 8a8ab2f793ab..d639ff889e64 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -33,7 +33,11 @@ enum rdma_cm_event_type { RDMA_CM_EVENT_MULTICAST_JOIN, RDMA_CM_EVENT_MULTICAST_ERROR, RDMA_CM_EVENT_ADDR_CHANGE, - RDMA_CM_EVENT_TIMEWAIT_EXIT + RDMA_CM_EVENT_TIMEWAIT_EXIT, + RDMA_CM_EVENT_ADDRINFO_RESOLVED, + RDMA_CM_EVENT_ADDRINFO_ERROR, + RDMA_CM_EVENT_USER, + RDMA_CM_EVENT_INTERNAL, }; const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event); @@ -63,6 +67,9 @@ struct rdma_route { * 2 - Both primary and alternate path are available */ int num_pri_alt_paths; + + unsigned int num_service_recs; + struct sa_service_rec *service_recs; }; struct rdma_conn_param { @@ -93,6 +100,7 @@ struct rdma_cm_event { union { struct rdma_conn_param conn; struct rdma_ud_param ud; + u64 arg; } param; struct rdma_ucm_ece ece; }; @@ -161,6 +169,23 @@ struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler, void rdma_destroy_id(struct rdma_cm_id *id); /** + * rdma_restrict_node_type - Restrict an RDMA identifier to specific + * RDMA device node type. + * + * @id: RDMA identifier. + * @node_type: The device node type. 
Only RDMA_NODE_UNSPECIFIED (default), + * RDMA_NODE_RNIC and RDMA_NODE_IB_CA are allowed + * + * This allows the caller to restrict the possible devices + * used to iWarp (RDMA_NODE_RNIC) or InfiniBand/RoCEv1/RoCEv2 (RDMA_NODE_IB_CA). + * + * It needs to be called before the RDMA identifier is bound + * to an device, which mean it should be called before + * rdma_bind_addr(), rdma_resolve_addr() and rdma_listen(). + */ +int rdma_restrict_node_type(struct rdma_cm_id *id, u8 node_type); + +/** * rdma_bind_addr - Bind an RDMA identifier to a source address and * associated RDMA device, if needed. * @@ -198,6 +223,17 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms); /** + * rdma_resolve_ib_service - Resolve the IB service record of the + * service with the given service ID or name. + * + * This function is optional in the rdma cm flow. It is called on the client + * side of a connection, before calling rdma_resolve_route. The resolution + * can be done once per rdma_cm_id. + */ +int rdma_resolve_ib_service(struct rdma_cm_id *id, + struct rdma_ucm_ib_service *ibs); + +/** * rdma_create_qp - Allocate a QP and associate it with the specified RDMA * identifier. 
* @@ -388,6 +424,5 @@ void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, union ib_gid *dgid); struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *cm_id); -struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res); #endif /* RDMA_CM_H */ diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h index 45d5481a7846..4204d08a010a 100644 --- a/include/rdma/rdma_counter.h +++ b/include/rdma/rdma_counter.h @@ -23,6 +23,7 @@ struct rdma_counter_mode { enum rdma_nl_counter_mode mode; enum rdma_nl_counter_mask mask; struct auto_mode_param param; + bool bind_opcnt; }; struct rdma_port_counter { @@ -47,9 +48,10 @@ void rdma_counter_init(struct ib_device *dev); void rdma_counter_release(struct ib_device *dev); int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mask mask, + bool bind_opcnt, struct netlink_ext_ack *extack); int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port); -int rdma_counter_unbind_qp(struct ib_qp *qp, bool force); +int rdma_counter_unbind_qp(struct ib_qp *qp, u32 port, bool force); int rdma_counter_query_stats(struct rdma_counter *counter); u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index); @@ -61,7 +63,8 @@ int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id); int rdma_counter_get_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mode *mode, - enum rdma_nl_counter_mask *mask); + enum rdma_nl_counter_mask *mask, + bool *opcnt); int rdma_counter_modify(struct ib_device *dev, u32 port, unsigned int index, bool enable); diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 326deaf56d5d..2fd1358ea57d 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -5,6 +5,7 @@ #include <linux/netlink.h> #include <uapi/rdma/rdma_netlink.h> +#include <rdma/ib_verbs.h> struct ib_device; @@ -126,6 +127,7 @@ struct rdma_link_ops { struct list_head list; const char *type; int 
(*newlink)(const char *ibdev_name, struct net_device *ndev); + int (*dellink)(struct ib_device *dev); }; void rdma_link_register(struct rdma_link_ops *ops); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index c429d6ddb129..7d8de561f71b 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -149,6 +149,7 @@ struct rvt_driver_params { /* User context */ struct rvt_ucontext { struct ib_ucontext ibucontext; + void *priv; }; /* Protection domain */ @@ -359,6 +360,15 @@ struct rvt_driver_provided { /* Get and return CPU to pin CQ processing thread */ int (*comp_vect_cpu_lookup)(struct rvt_dev_info *rdi, int comp_vect); + + /* allocate a ucontext */ + int (*alloc_ucontext)(struct ib_ucontext *uctx, struct ib_udata *udata); + + /* deallocate a ucontext */ + void (*dealloc_ucontext)(struct ib_ucontext *context); + + /* driver mmap */ + int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); }; struct rvt_dev_info { diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index d67892944193..71140ea0aeb2 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -144,7 +144,7 @@ #define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1) /** - * rvt_ud_wr - IB UD work plus AH cache + * struct rvt_ud_wr - IB UD work plus AH cache * @wr: valid IB work request * @attr: pointer to an allocated AH attribute * @@ -184,10 +184,10 @@ struct rvt_swqe { * struct rvt_krwq - kernel struct receive work request * @p_lock: lock to protect producer of the kernel buffer * @head: index of next entry to fill - * @c_lock:lock to protect consumer of the kernel buffer + * @c_lock: lock to protect consumer of the kernel buffer * @tail: index of next entry to pull - * @count: count is aproximate of total receive enteries posted - * @rvt_rwqe: struct of receive work request queue entry + * @count: count is approximate of total receive entries posted + * @curr_wq: struct of receive work request queue entry * * This structure is used to 
contain the head pointer, * tail pointer and receive work queue entries for kernel @@ -309,10 +309,10 @@ struct rvt_ack_entry { #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) /** - * rvt_operation_params - op table entry - * @length - the length to copy into the swqe entry - * @qpt_support - a bit mask indicating QP type support - * @flags - RVT_OPERATION flags (see above) + * struct rvt_operation_params - op table entry + * @length: the length to copy into the swqe entry + * @qpt_support: a bit mask indicating QP type support + * @flags: RVT_OPERATION flags (see above) * * This supports table driven post send so that * the driver can have differing an potentially @@ -552,7 +552,7 @@ static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n) /** * rvt_is_user_qp - return if this is user mode QP - * @qp - the target QP + * @qp: the target QP */ static inline bool rvt_is_user_qp(struct rvt_qp *qp) { @@ -561,7 +561,7 @@ static inline bool rvt_is_user_qp(struct rvt_qp *qp) /** * rvt_get_qp - get a QP reference - * @qp - the QP to hold + * @qp: the QP to hold */ static inline void rvt_get_qp(struct rvt_qp *qp) { @@ -570,7 +570,7 @@ static inline void rvt_get_qp(struct rvt_qp *qp) /** * rvt_put_qp - release a QP reference - * @qp - the QP to release + * @qp: the QP to release */ static inline void rvt_put_qp(struct rvt_qp *qp) { @@ -580,7 +580,7 @@ static inline void rvt_put_qp(struct rvt_qp *qp) /** * rvt_put_swqe - drop mr refs held by swqe - * @wqe - the send wqe + * @wqe: the send wqe * * This drops any mr references held by the swqe */ @@ -597,8 +597,8 @@ static inline void rvt_put_swqe(struct rvt_swqe *wqe) /** * rvt_qp_wqe_reserve - reserve operation - * @qp - the rvt qp - * @wqe - the send wqe + * @qp: the rvt qp + * @wqe: the send wqe * * This routine used in post send to record * a wqe relative reserved operation use. 
@@ -612,8 +612,8 @@ static inline void rvt_qp_wqe_reserve( /** * rvt_qp_wqe_unreserve - clean reserved operation - * @qp - the rvt qp - * @flags - send wqe flags + * @qp: the rvt qp + * @flags: send wqe flags * * This decrements the reserve use count. * @@ -653,8 +653,8 @@ u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len); /** * rvt_div_round_up_mtu - round up divide - * @qp - the qp pair - * @len - the length + * @qp: the qp pair + * @len: the length * * Perform a shift based mtu round up divide */ @@ -664,8 +664,9 @@ static inline u32 rvt_div_round_up_mtu(struct rvt_qp *qp, u32 len) } /** - * @qp - the qp pair - * @len - the length + * rvt_div_mtu - shift-based divide + * @qp: the qp pair + * @len: the length * * Perform a shift based mtu divide */ @@ -676,7 +677,7 @@ static inline u32 rvt_div_mtu(struct rvt_qp *qp, u32 len) /** * rvt_timeout_to_jiffies - Convert a ULP timeout input into jiffies - * @timeout - timeout input(0 - 31). + * @timeout: timeout input(0 - 31). * * Return a timeout value in jiffies. 
*/ @@ -690,7 +691,8 @@ static inline unsigned long rvt_timeout_to_jiffies(u8 timeout) /** * rvt_lookup_qpn - return the QP with the given QPN - * @ibp: the ibport + * @rdi: rvt device info structure + * @rvp: the ibport * @qpn: the QP number to look up * * The caller must hold the rcu_read_lock(), and keep the lock until @@ -716,9 +718,9 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi, } /** - * rvt_mod_retry_timer - mod a retry timer - * @qp - the QP - * @shift - timeout shift to wait for multiple packets + * rvt_mod_retry_timer_ext - mod a retry timer + * @qp: the QP + * @shift: timeout shift to wait for multiple packets * Modify a potentially already running retry timer */ static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift) @@ -753,7 +755,7 @@ static inline void rvt_put_qp_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) } /** - * rvt_qp_sqwe_incr - increment ring index + * rvt_qp_swqe_incr - increment ring index * @qp: the qp * @val: the starting value * @@ -811,10 +813,10 @@ static inline void rvt_send_cq(struct rvt_qp *qp, struct ib_wc *wc, /** * rvt_qp_complete_swqe - insert send completion - * @qp - the qp - * @wqe - the send wqe - * @opcode - wc operation (driver dependent) - * @status - completion status + * @qp: the qp + * @wqe: the send wqe + * @opcode: wc operation (driver dependent) + * @status: completion status * * Update the s_last information, and then insert a send * completion into the completion @@ -891,7 +893,7 @@ void rvt_ruc_loopback(struct rvt_qp *qp); /** * struct rvt_qp_iter - the iterator for QPs - * @qp - the current QP + * @qp: the current QP * * This structure defines the current iterator * state for sequenced access to all QPs relative @@ -913,7 +915,7 @@ struct rvt_qp_iter { /** * ib_cq_tail - Return tail index of cq buffer - * @send_cq - The cq for send + * @send_cq: The cq for send * * This is called in qp_iter_print to get tail * of cq buffer. 
@@ -929,7 +931,7 @@ static inline u32 ib_cq_tail(struct ib_cq *send_cq) /** * ib_cq_head - Return head index of cq buffer - * @send_cq - The cq for send + * @send_cq: The cq for send * * This is called in qp_iter_print to get head * of cq buffer. @@ -945,7 +947,7 @@ static inline u32 ib_cq_head(struct ib_cq *send_cq) /** * rvt_free_rq - free memory allocated for rvt_rq struct - * @rvt_rq: request queue data structure + * @rq: request queue data structure * * This function should only be called if the rvt_mmap_info() * has not succeeded. diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 0d69ded73bf2..451f99e3717d 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -57,6 +57,10 @@ enum rdma_restrack_type { */ RDMA_RESTRACK_SRQ, /** + * @RDMA_RESTRACK_DMAH: DMA handle + */ + RDMA_RESTRACK_DMAH, + /** * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations */ RDMA_RESTRACK_MAX @@ -83,11 +87,11 @@ struct rdma_restrack_entry { * query stage. */ u8 no_track : 1; - /* + /** * @kref: Protect destroy of the resource */ struct kref kref; - /* + /** * @comp: Signal that all consumers of resource are completed their work */ struct completion comp; diff --git a/include/rdma/rw.h b/include/rdma/rw.h index d606cac48233..6a1d08614e09 100644 --- a/include/rdma/rw.h +++ b/include/rdma/rw.h @@ -5,6 +5,7 @@ #ifndef _RDMA_RW_H #define _RDMA_RW_H +#include <linux/bvec.h> #include <linux/dma-mapping.h> #include <linux/scatterlist.h> #include <rdma/ib_verbs.h> @@ -31,6 +32,14 @@ struct rdma_rw_ctx { struct ib_rdma_wr *wrs; } map; + /* for IOVA-based mapping of bvecs into contiguous DMA range: */ + struct { + struct dma_iova_state state; + struct ib_sge sge; + struct ib_rdma_wr wr; + size_t mapped_len; + } iova; + /* for registering multiple WRs: */ struct rdma_rw_reg_ctx { struct ib_sge sge; @@ -38,6 +47,7 @@ struct rdma_rw_ctx { struct ib_reg_wr reg_wr; struct ib_send_wr inv_wr; struct ib_mr *mr; + struct sg_table sgt; } *reg; }; }; @@ -49,6 
+59,16 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir); +struct bio_vec; + +int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, + struct bvec_iter iter, u64 remote_addr, u32 rkey, + enum dma_data_direction dir); +void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, + enum dma_data_direction dir); + int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, struct scatterlist *prot_sg, u32 prot_sg_cnt, @@ -66,6 +86,8 @@ int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num, unsigned int maxpages); +unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num, + unsigned int max_rdma_ctxs, u32 create_flags); void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr); int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr); void rdma_rw_cleanup_mrs(struct ib_qp *qp); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index e6c0de227fad..e2af17da3e32 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -667,6 +667,8 @@ rdma_udata_to_uverbs_attr_bundle(struct ib_udata *udata) (udata ? 
container_of(rdma_udata_to_uverbs_attr_bundle(udata)->context, \ drv_dev_struct, member) : (drv_dev_struct *)NULL) +struct ib_device *rdma_udata_to_dev(struct ib_udata *udata); + #define IS_UVERBS_COPY_ERR(_ret) ((_ret) && (_ret) != -ENOENT) static inline const struct uverbs_attr *uverbs_attr_get(const struct uverbs_attr_bundle *attrs_bundle, @@ -895,6 +897,10 @@ int _uverbs_get_const_unsigned(u64 *to, size_t idx, u64 upper_bound, u64 *def_val); int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, size_t idx, const void *from, size_t size); + +int _ib_copy_validate_udata_in(struct ib_udata *udata, void *req, + size_t kernel_size, size_t minimum_size); +int _ib_respond_udata(struct ib_udata *udata, const void *src, size_t len); #else static inline int uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle, @@ -951,6 +957,19 @@ _uverbs_get_const_unsigned(u64 *to, { return -EINVAL; } + +static inline int _ib_copy_validate_udata_in(struct ib_udata *udata, void *req, + size_t kernel_size, + size_t minimum_size) +{ + return -EINVAL; +} + +static inline int _ib_respond_udata(struct ib_udata *udata, const void *src, + size_t len) +{ + return -EINVAL; +} #endif #define uverbs_get_const_signed(_to, _attrs_bundle, _idx) \ @@ -1016,4 +1035,86 @@ uverbs_get_raw_fd(int *to, const struct uverbs_attr_bundle *attrs_bundle, return uverbs_get_const_signed(to, attrs_bundle, idx); } +/** + * ib_copy_validate_udata_in - Copy and validate that the request structure is + * compatible with this kernel + * @_udata: The system calls ib_udata struct + * @_req: The name of an on-stack structure that holds the driver data + * @_end_member: The member in the struct that is the original end of struct + * from the first kernel to introduce it. + * + * Check that the udata input request struct is properly formed for this kernel. 
+ * Then copy it into req + */ +#define ib_copy_validate_udata_in(_udata, _req, _end_member) \ + _ib_copy_validate_udata_in(_udata, &(_req), sizeof(_req), \ + offsetofend(typeof(_req), _end_member)) + +int _ib_copy_validate_udata_cm_fail(struct ib_udata *udata, u64 req_cm, + u64 valid_cm); + +/** + * ib_copy_validate_udata_in_cm - Copy the req structure and check the comp_mask + * @_udata: The system calls ib_udata struct + * @_req: The name of an on-stack structure that holds the driver data + * @_end_member: The member in the struct that is the original end of struct + * from the first kernel to introduce it. + * @_valid_cm: A bitmask of bits permitted in the comp_mask_field. + * + * Check that the udata input request struct is properly formed for this kernel. + * Then copy it into req + */ +#define ib_copy_validate_udata_in_cm(_udata, _req, _end_member, _valid_cm) \ + ({ \ + typeof((_req).comp_mask) __valid_cm = _valid_cm; \ + int ret = \ + ib_copy_validate_udata_in(_udata, _req, _end_member); \ + if (!ret && ((_req).comp_mask & ~__valid_cm)) \ + ret = _ib_copy_validate_udata_cm_fail( \ + _udata, (_req).comp_mask, __valid_cm); \ + ret; \ + }) + +/** + * ib_is_udata_in_empty - Check if the udata input buffer is all zeros + * @udata: The system calls ib_udata struct + * + * This should be used if the driver does not currently define a driver data + * struct. Returns 0 if the buffer is empty or all zeros, -EOPNOTSUPP if + * non-zero data is present, or a negative error code on failure. + */ +static inline int ib_is_udata_in_empty(struct ib_udata *udata) +{ + if (!udata || udata->inlen == 0) + return 0; + return _ib_copy_validate_udata_in(udata, NULL, 0, 0); +} + +/** + * ib_respond_udata - Copy a driver data response to userspace + * @_udata: The system calls ib_udata struct + * @_rep: Kernel buffer containing the response driver data on the stack + * + * Copy driver data response structures back to userspace in a way that + * is forwards and backwards compatible. 
Longer kernel structs are truncated; + * userspace has made some kind of error if it needed the truncated information. + * Shorter structs are zero padded. + */ +#define ib_respond_udata(_udata, _rep) \ + _ib_respond_udata(_udata, &(_rep), sizeof(_rep)) + +/** + * ib_respond_empty_udata - Zero fill the response buffer to userspace + * @udata: The system calls ib_udata struct + * + * Used when there is no driver response data to return. Provides forward + * compatibility by zeroing any buffer the user may have provided. + */ +static inline int ib_respond_empty_udata(struct ib_udata *udata) +{ + if (udata && udata->outlen && clear_user(udata->outbuf, udata->outlen)) + return -EFAULT; + return 0; +} + #endif diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index fe0512116958..555ea3d142a4 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -34,7 +34,7 @@ static inline void *_uobj_get_obj_read(struct ib_uobject *uobj) { if (IS_ERR(uobj)) - return NULL; + return ERR_CAST(uobj); return uobj->object; } #define uobj_get_obj_read(_object, _type, _id, _attrs) \ diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h index 26ba919ac245..6a253b7dc5ea 100644 --- a/include/rdma/uverbs_types.h +++ b/include/rdma/uverbs_types.h @@ -186,6 +186,7 @@ struct ib_uverbs_file { extern const struct uverbs_obj_type_class uverbs_idr_class; extern const struct uverbs_obj_type_class uverbs_fd_class; int uverbs_uobject_fd_release(struct inode *inode, struct file *filp); +int uverbs_uobject_release(struct ib_uobject *uobj); #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \ sizeof(char)) |
