From 0953fffec9ba022f63bfe01e86427530d8320d5c Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 28 Aug 2018 14:18:50 +0300 Subject: RDMA/uverbs: Add UVERBS_ATTR_CONST_IN to the specs language This makes it clear and safe to access constants passed in from user space. We define a consistent ABI of u64 for all constants, and verify that the data passed in can be represented by the type the user supplies. The expectation is this will always be used with an enum declaring the constant values, and the user will use the enum type as input to the accessor. To retrieve the attribute value we introduce two helper calls - one standard which may fail if attribute is not valid and one where caller can provide a default value which will be used in case the attribute is not valid (useful when attribute is optional). Signed-off-by: Jason Gunthorpe Signed-off-by: Ariel Levkovich Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky --- include/rdma/uverbs_ioctl.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include') diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 9e997c3c2f04..fc2e52234a2a 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -365,6 +365,15 @@ struct uverbs_object_tree_def { __VA_ARGS__ }, \ }) +/* An input value that is a member in the enum _enum_type. */ +#define UVERBS_ATTR_CONST_IN(_attr_id, _enum_type, ...) \ + UVERBS_ATTR_PTR_IN( \ + _attr_id, \ + UVERBS_ATTR_SIZE( \ + sizeof(u64) + BUILD_BUG_ON_ZERO(!sizeof(_enum_type)), \ + sizeof(u64)), \ + __VA_ARGS__) + /* * An input value that is a bitwise combination of values of _enum_type. * This permits the flag value to be passed as either a u32 or u64, it must @@ -603,6 +612,9 @@ static inline __malloc void *uverbs_zalloc(struct uverbs_attr_bundle *bundle, { return _uverbs_alloc(bundle, size, GFP_KERNEL | __GFP_ZERO); } +int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val); #else static inline int uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle, @@ -631,6 +643,34 @@ static inline __malloc void *uverbs_zalloc(struct uverbs_attr_bundle *bundle, { return ERR_PTR(-EINVAL); } +static inline int +_uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val) +{ + return -EINVAL; +} #endif +#define uverbs_get_const(_to, _attrs_bundle, _idx) \ + ({ \ + s64 _val; \ + int _ret = _uverbs_get_const(&_val, _attrs_bundle, _idx, \ + type_min(typeof(*_to)), \ + type_max(typeof(*_to)), NULL); \ + (*_to) = _val; \ + _ret; \ + }) + +#define uverbs_get_const_default(_to, _attrs_bundle, _idx, _default) \ + ({ \ + s64 _val; \ + s64 _def_val = _default; \ + int _ret = \ + _uverbs_get_const(&_val, _attrs_bundle, _idx, \ + type_min(typeof(*_to)), \ + type_max(typeof(*_to)), &_def_val); \ + (*_to) = _val; \ + _ret; \ + }) #endif -- cgit v1.2.3 From b4749bf25652689d8e33827460266b78bb2ec42c Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 28 Aug 2018 14:18:51 +0300 Subject: RDMA/mlx5: Add a new flow action verb - modify header Expose the ability to create a flow action which changes packet headers. The data passed from userspace should be modify header actions as defined by HW specification. 
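The method added below declares its flow table type with the UVERBS_ATTR_CONST_IN macro introduced above and reads it back with uverbs_get_const(). A stripped-down sketch of that pattern (the method id, attribute id and handler here are hypothetical; the real spec and handler are in the diff that follows):

static int UVERBS_HANDLER(EXAMPLE_METHOD)(struct ib_uverbs_file *file,
					   struct uverbs_attr_bundle *attrs)
{
	enum mlx5_ib_uapi_flow_table_type ft_type;
	int ret;

	/* Fails if the u64 passed from user space cannot be represented
	 * by the enum type used for ft_type.
	 */
	ret = uverbs_get_const(&ft_type, attrs, EXAMPLE_ATTR_FT_TYPE);
	if (ret)
		return ret;

	return 0;
}

DECLARE_UVERBS_NAMED_METHOD(
	EXAMPLE_METHOD,
	UVERBS_ATTR_CONST_IN(EXAMPLE_ATTR_FT_TYPE,
			     enum mlx5_ib_uapi_flow_table_type,
			     UA_MANDATORY));

On the wire the constant is always a u64; the range check against the destination enum happens in _uverbs_get_const().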
Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 134 ++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/main.c | 3 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 17 +++- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 10 +++ include/uapi/rdma/mlx5_user_ioctl_verbs.h | 5 ++ 5 files changed, 168 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 1a29f47f836e..02103a4b372c 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,24 @@ #define UVERBS_MODULE_NAME mlx5_ib #include +static int +mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type, + enum mlx5_flow_namespace_type *namespace) +{ + switch (table_type) { + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX: + *namespace = MLX5_FLOW_NAMESPACE_BYPASS; + break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX: + *namespace = MLX5_FLOW_NAMESPACE_EGRESS; + break; + default: + return -EINVAL; + } + + return 0; +} + static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { [MLX5_IB_FLOW_TYPE_NORMAL] = { .type = UVERBS_ATTR_TYPE_PTR_IN, @@ -175,6 +194,100 @@ end: return err; } +void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction) +{ + switch (maction->flow_action_raw.sub_type) { + case MLX5_IB_FLOW_ACTION_MODIFY_HEADER: + mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev, + maction->flow_action_raw.action_id); + break; + default: + break; + } +} + +static struct ib_flow_action * +mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev, + enum mlx5_ib_uapi_flow_table_type ft_type, + u8 num_actions, void *in) +{ + enum mlx5_flow_namespace_type namespace; + struct mlx5_ib_flow_action *maction; + int ret; + + ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace); + if (ret) + return ERR_PTR(-EINVAL); + + maction = kzalloc(sizeof(*maction), GFP_KERNEL); + if (!maction) + return ERR_PTR(-ENOMEM); + + ret = mlx5_modify_header_alloc(dev->mdev, namespace, num_actions, in, + &maction->flow_action_raw.action_id); + + if (ret) { + kfree(maction); + return ERR_PTR(ret); + } + maction->flow_action_raw.sub_type = + MLX5_IB_FLOW_ACTION_MODIFY_HEADER; + maction->flow_action_raw.dev = dev; + + return &maction->ib_action; +} + +static bool mlx5_ib_modify_header_supported(struct mlx5_ib_dev *dev) +{ + return MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + max_modify_header_actions) || + MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, max_modify_header_actions); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)( + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE); + struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device); + enum mlx5_ib_uapi_flow_table_type ft_type; + struct ib_flow_action *action; + size_t num_actions; + void *in; + int len; + int ret; + + if (!mlx5_ib_modify_header_supported(mdev)) + return -EOPNOTSUPP; + + in = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM); + len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM); + + if (len % MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)) + return -EINVAL; + + ret = uverbs_get_const(&ft_type, attrs, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE); + if (ret) + return ret; + + num_actions 
= len / MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto), + action = mlx5_ib_create_modify_header(mdev, ft_type, num_actions, in); + if (IS_ERR(action)) + return PTR_ERR(action); + + atomic_set(&action->usecnt, 0); + action->device = uobj->context->device; + action->type = IB_FLOW_ACTION_UNSPECIFIED; + action->uobject = uobj; + uobj->object = action; + + return 0; +} + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_CREATE_FLOW, UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE, @@ -209,6 +322,26 @@ ADD_UVERBS_METHODS(mlx5_ib_fs, &UVERBS_METHOD(MLX5_IB_METHOD_CREATE_FLOW), &UVERBS_METHOD(MLX5_IB_METHOD_DESTROY_FLOW)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, + UVERBS_ATTR_MIN_SIZE(MLX5_UN_SZ_BYTES( + set_action_in_add_action_in_auto)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE, + enum mlx5_ib_uapi_flow_table_type, + UA_MANDATORY)); + +ADD_UVERBS_METHODS( + mlx5_ib_flow_actions, + UVERBS_OBJECT_FLOW_ACTION, + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)); + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_FLOW_MATCHER_CREATE, UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE, @@ -247,6 +380,7 @@ int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root) root[i++] = &flow_objects; root[i++] = &mlx5_ib_fs; + root[i++] = &mlx5_ib_flow_actions; return i; } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c414f3809e5c..d41419fb6b3e 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3995,6 +3995,9 @@ static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action) */ mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx); break; + case IB_FLOW_ACTION_UNSPECIFIED: + mlx5_ib_destroy_flow_action_raw(maction); + break; default: WARN_ON(true); break; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 320d4dfe8c2f..c26ea868b4f1 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -151,6 +151,10 @@ struct mlx5_ib_pd { u32 pdn; }; +enum { + MLX5_IB_FLOW_ACTION_MODIFY_HEADER, +}; + #define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) #define MLX5_IB_FLOW_LAST_PRIO (MLX5_BY_PASS_NUM_REGULAR_PRIOS - 1) #if (MLX5_IB_FLOW_LAST_PRIO <= 0) @@ -814,6 +818,11 @@ struct mlx5_ib_flow_action { u64 ib_flags; struct mlx5_accel_esp_xfrm *ctx; } esp_aes_gcm; + struct { + struct mlx5_ib_dev *dev; + u32 sub_type; + u32 action_id; + } flow_action_raw; }; }; @@ -860,7 +869,7 @@ to_mcounters(struct ib_counters *ibcntrs) struct mlx5_ib_dev { struct ib_device ib_dev; - const struct uverbs_object_tree_def *driver_trees[6]; + const struct uverbs_object_tree_def *driver_trees[7]; struct mlx5_core_dev *mdev; struct mlx5_roce roce[MLX5_MAX_PORTS]; int num_ports; @@ -1238,6 +1247,7 @@ struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add( void *cmd_in, int inlen, int dest_id, int dest_type); bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type); int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root); +void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction); #else static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, @@ -1256,6 +1266,11 @@ mlx5_ib_get_flow_trees(const struct 
uverbs_object_tree_def **root) { return 0; } +static inline void +mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction) +{ + return; +}; #endif static inline void init_query_mad(struct ib_smp *mad) { diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 9c51801b9e64..9c83e13c0e89 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -166,4 +166,14 @@ enum mlx5_ib_flow_methods { MLX5_IB_METHOD_DESTROY_FLOW, }; +enum mlx5_ib_flow_action_methods { + MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_create_flow_action_create_modify_header_attrs { + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE, +}; + #endif diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index 8a2fb33f3ed4..ceb6d0d8529a 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -39,5 +39,10 @@ enum mlx5_ib_uapi_flow_action_flags { MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA = 1 << 0, }; +enum mlx5_ib_uapi_flow_table_type { + MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX = 0x0, + MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX = 0x1, +}; + #endif -- cgit v1.2.3 From 841eefc5cb57030ad05a0c4bc285f93ffa668ad9 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 28 Aug 2018 14:18:52 +0300 Subject: RDMA/uverbs: Add generic function to fill in flow action object Refactor the initialization of a flow action object to a common function. Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_std_types_flow_action.c | 7 ++----- drivers/infiniband/hw/mlx5/flow.c | 8 +++----- include/rdma/uverbs_std_types.h | 12 ++++++++++++ 3 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c index d8cfafe23bd9..cb9486ad5c67 100644 --- a/drivers/infiniband/core/uverbs_std_types_flow_action.c +++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -326,11 +326,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)( if (IS_ERR(action)) return PTR_ERR(action); - atomic_set(&action->usecnt, 0); - action->device = ib_dev; - action->type = IB_FLOW_ACTION_ESP; - action->uobject = uobj; - uobj->object = action; + uverbs_flow_action_fill_action(action, uobj, ib_dev, + IB_FLOW_ACTION_ESP); return 0; } diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 02103a4b372c..0c89d5431c7e 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -279,11 +280,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)( if (IS_ERR(action)) return PTR_ERR(action); - atomic_set(&action->usecnt, 0); - action->device = uobj->context->device; - action->type = IB_FLOW_ACTION_UNSPECIFIED; - action->uobject = uobj; - uobj->object = action; + uverbs_flow_action_fill_action(action, uobj, uobj->context->device, + IB_FLOW_ACTION_UNSPECIFIED); return 0; } diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 3b00231cc084..526d918fcd5a 100644 --- a/include/rdma/uverbs_std_types.h +++ 
b/include/rdma/uverbs_std_types.h @@ -140,5 +140,17 @@ __uobj_alloc(const struct uverbs_api_object *obj, struct ib_uverbs_file *ufile, #define uobj_alloc(_type, _ufile, _ib_dev) \ __uobj_alloc(uobj_get_type(_ufile, _type), _ufile, _ib_dev) +static inline void uverbs_flow_action_fill_action(struct ib_flow_action *action, + struct ib_uobject *uobj, + struct ib_device *ib_dev, + enum ib_flow_action_type type) +{ + atomic_set(&action->usecnt, 0); + action->device = ib_dev; + action->type = type; + action->uobject = uobj; + uobj->object = action; +} + #endif -- cgit v1.2.3 From 08aeb97cb82483192bd8ad8e60d1b73ce1b75923 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 28 Aug 2018 14:18:53 +0300 Subject: RDMA/mlx5: Add new flow action verb - packet reformat For now, only add L2_TUNNEL_TO_L2 option. This will allow to perform generic decap operation if the encapsulating protocol is L2 based, and the inner packet is also L2 based. For example this can be used to decap VXLAN packets. Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 76 ++++++++++++++++++++++++++++++- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + include/uapi/rdma/mlx5_user_ioctl_cmds.h | 7 +++ include/uapi/rdma/mlx5_user_ioctl_verbs.h | 4 ++ 4 files changed, 87 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 0c89d5431c7e..888f79d6a125 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -202,6 +202,8 @@ void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction) mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev, maction->flow_action_raw.action_id); break; + case MLX5_IB_FLOW_ACTION_DECAP: + break; default: break; } @@ -286,6 +288,64 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)( return 0; } +static bool mlx5_ib_flow_action_packet_reformat_valid(struct mlx5_ib_dev *ibdev, + u8 packet_reformat_type, + u8 ft_type) +{ + switch (packet_reformat_type) { + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX) + return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, decap); + break; + default: + break; + } + + return false; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)( + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE); + struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device); + enum mlx5_ib_uapi_flow_action_packet_reformat_type dv_prt; + enum mlx5_ib_uapi_flow_table_type ft_type; + struct mlx5_ib_flow_action *maction; + int ret; + + ret = uverbs_get_const(&ft_type, attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE); + if (ret) + return ret; + + ret = uverbs_get_const(&dv_prt, attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE); + if (ret) + return ret; + + if (!mlx5_ib_flow_action_packet_reformat_valid(mdev, dv_prt, ft_type)) + return -EOPNOTSUPP; + + maction = kzalloc(sizeof(*maction), GFP_KERNEL); + if (!maction) + return -ENOMEM; + + if (dv_prt == + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2) { + maction->flow_action_raw.sub_type = + MLX5_IB_FLOW_ACTION_DECAP; + maction->flow_action_raw.dev = mdev; + } + + uverbs_flow_action_fill_action(&maction->ib_action, uobj, + uobj->context->device, + IB_FLOW_ACTION_UNSPECIFIED); + return 0; +} + 
DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_CREATE_FLOW, UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE, @@ -335,10 +395,24 @@ DECLARE_UVERBS_NAMED_METHOD( enum mlx5_ib_uapi_flow_table_type, UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE, + enum mlx5_ib_uapi_flow_action_packet_reformat_type, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE, + enum mlx5_ib_uapi_flow_table_type, + UA_MANDATORY)); + ADD_UVERBS_METHODS( mlx5_ib_flow_actions, UVERBS_OBJECT_FLOW_ACTION, - &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)); + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER), + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)); DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_FLOW_MATCHER_CREATE, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index c26ea868b4f1..8ac84cc00fd5 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -153,6 +153,7 @@ struct mlx5_ib_pd { enum { MLX5_IB_FLOW_ACTION_MODIFY_HEADER, + MLX5_IB_FLOW_ACTION_DECAP, }; #define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 9c83e13c0e89..40db7fca3d0b 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -168,6 +168,7 @@ enum mlx5_ib_flow_methods { enum mlx5_ib_flow_action_methods { MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT, }; enum mlx5_ib_create_flow_action_create_modify_header_attrs { @@ -176,4 +177,10 @@ enum mlx5_ib_create_flow_action_create_modify_header_attrs { MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE, }; +enum mlx5_ib_create_flow_action_create_packet_reformat_attrs { + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE, +}; + #endif diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index ceb6d0d8529a..b5fda0fcd484 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -44,5 +44,9 @@ enum mlx5_ib_uapi_flow_table_type { MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX = 0x1, }; +enum mlx5_ib_uapi_flow_action_packet_reformat_type { + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2 = 0x0, +}; + #endif -- cgit v1.2.3 From a090d0d859ff88dd4c34614d01cee9b0603f4313 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 28 Aug 2018 14:18:54 +0300 Subject: RDMA/mlx5: Extend packet reformat verbs We expose new actions: L2_TO_L2_TUNNEL - A generic encap from L2 to L2, the data passed should be the encapsulating headers. L3_TUNNEL_TO_L2 - Will do decap where the inner packet starts from L3, the data should be mac or mac + vlan (14 or 18 bytes). L2_TO_L3_TUNNEL - Will do encap where is L2 of the original packet will not be included, the data should be the encapsulating header. 
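As an illustration only (not part of the patch), a plausible data buffer for a VXLAN encap performed with L2_TO_L2_TUNNEL on the NIC TX table is simply the outer headers to prepend, laid out back to back:

#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <net/vxlan.h>

/* Illustrative layout for an IPv4 VXLAN encapsulation header blob. */
struct example_vxlan_encap {
	struct ethhdr	eth;	/* outer MAC header */
	struct iphdr	ip;	/* outer IPv4 header */
	struct udphdr	udp;	/* outer UDP header, dport 4789 */
	struct vxlanhdr	vxlan;	/* VXLAN header carrying the VNI */
} __packed;

The capability checks added below encode the direction of each type: the two encap variants are valid on the NIC TX table, the two decap variants on the NIC RX table.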
Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 95 +++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + include/uapi/rdma/mlx5_user_ioctl_verbs.h | 3 + 4 files changed, 100 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 888f79d6a125..5750a650884e 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -202,6 +202,10 @@ void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction) mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev, maction->flow_action_raw.action_id); break; + case MLX5_IB_FLOW_ACTION_PACKET_REFORMAT: + mlx5_packet_reformat_dealloc(maction->flow_action_raw.dev->mdev, + maction->flow_action_raw.action_id); + break; case MLX5_IB_FLOW_ACTION_DECAP: break; default: @@ -293,6 +297,21 @@ static bool mlx5_ib_flow_action_packet_reformat_valid(struct mlx5_ib_dev *ibdev, u8 ft_type) { switch (packet_reformat_type) { + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX) + return MLX5_CAP_FLOWTABLE(ibdev->mdev, + encap_general_header); + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX) + return MLX5_CAP_FLOWTABLE_NIC_TX(ibdev->mdev, + reformat_l2_to_l3_tunnel); + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX) + return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, + reformat_l3_tunnel_to_l2); + break; case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2: if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX) return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, decap); @@ -304,6 +323,56 @@ static bool mlx5_ib_flow_action_packet_reformat_valid(struct mlx5_ib_dev *ibdev, return false; } +static int mlx5_ib_dv_to_prm_packet_reforamt_type(u8 dv_prt, u8 *prm_prt) +{ + switch (dv_prt) { + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + *prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL; + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2: + *prm_prt = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2; + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + *prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_ib_flow_action_create_packet_reformat_ctx( + struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_action *maction, + u8 ft_type, u8 dv_prt, + void *in, size_t len) +{ + enum mlx5_flow_namespace_type namespace; + u8 prm_prt; + int ret; + + ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace); + if (ret) + return ret; + + ret = mlx5_ib_dv_to_prm_packet_reforamt_type(dv_prt, &prm_prt); + if (ret) + return ret; + + ret = mlx5_packet_reformat_alloc(dev->mdev, prm_prt, len, + in, namespace, + &maction->flow_action_raw.action_id); + if (ret) + return ret; + + maction->flow_action_raw.sub_type = + MLX5_IB_FLOW_ACTION_PACKET_REFORMAT; + maction->flow_action_raw.dev = dev; + + return 0; +} + static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)( struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) @@ -338,12 +407,34 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)( maction->flow_action_raw.sub_type = 
MLX5_IB_FLOW_ACTION_DECAP; maction->flow_action_raw.dev = mdev; + } else { + void *in; + int len; + + in = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF); + if (IS_ERR(in)) { + ret = PTR_ERR(in); + goto free_maction; + } + + len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF); + + ret = mlx5_ib_flow_action_create_packet_reformat_ctx(mdev, + maction, ft_type, dv_prt, in, len); + if (ret) + goto free_maction; } uverbs_flow_action_fill_action(&maction->ib_action, uobj, uobj->context->device, IB_FLOW_ACTION_UNSPECIFIED); return 0; + +free_maction: + kfree(maction); + return ret; } DECLARE_UVERBS_NAMED_METHOD( @@ -401,6 +492,10 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_OBJECT_FLOW_ACTION, UVERBS_ACCESS_NEW, UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF, + UVERBS_ATTR_MIN_SIZE(1), + UA_ALLOC_AND_COPY, + UA_OPTIONAL), UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE, enum mlx5_ib_uapi_flow_action_packet_reformat_type, UA_MANDATORY), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 8ac84cc00fd5..eb6a0ca0247f 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -153,6 +153,7 @@ struct mlx5_ib_pd { enum { MLX5_IB_FLOW_ACTION_MODIFY_HEADER, + MLX5_IB_FLOW_ACTION_PACKET_REFORMAT, MLX5_IB_FLOW_ACTION_DECAP, }; diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 40db7fca3d0b..75c7093fd95b 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -181,6 +181,7 @@ enum mlx5_ib_create_flow_action_create_packet_reformat_attrs { MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE, MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF, }; #endif diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index b5fda0fcd484..4ef62c0e8452 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -46,6 +46,9 @@ enum mlx5_ib_uapi_flow_table_type { enum mlx5_ib_uapi_flow_action_packet_reformat_type { MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2 = 0x0, + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x1, + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x2, + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x3, }; #endif -- cgit v1.2.3 From f794809a7259dfaa3d47d90ef5a86007cf48b1ce Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 27 Aug 2018 08:35:55 +0300 Subject: IB/core: Add an unbound WQ type to the new CQ API The upstream kernel commit cited below modified the workqueue in the new CQ API to be bound to a specific CPU (instead of being unbound). This caused ALL users of the new CQ API to use the same bound WQ. Specifically, MAD handling was severely delayed when the CPU bound to the WQ was busy handling (higher priority) interrupts. This caused a delay in the MAD "heartbeat" response handling, which resulted in ports being incorrectly classified as "down". To fix this, add a new "unbound" WQ type to the new CQ API, so that users have the option to choose either a bound WQ or an unbound WQ. For MADs, choose the new "unbound" WQ. 
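A minimal sketch of how a consumer opts in to the new poll context (the wrapper name is made up; the MAD change in the diff below is the real user):

static struct ib_cq *example_alloc_unbound_cq(struct ib_device *device,
					      void *priv, int nr_cqe)
{
	/* Completions are processed on the shared unbound workqueue
	 * instead of a workqueue bound to the interrupted CPU.
	 */
	return ib_alloc_cq(device, priv, nr_cqe, 0 /* comp_vector */,
			   IB_POLL_UNBOUND_WORKQUEUE);
}

IB_POLL_WORKQUEUE keeps the previous CPU-bound behaviour, so existing users are unaffected.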
Fixes: b7363e67b23e ("IB/device: Convert ib-comp-wq to be CPU-bound") Signed-off-by: Jack Morgenstein Signed-off-by: Leon Romanovsky Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cq.c | 8 ++++++-- drivers/infiniband/core/device.c | 15 ++++++++++++++- drivers/infiniband/core/mad.c | 2 +- include/rdma/ib_verbs.h | 9 ++++++--- 4 files changed, 27 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index af5ad6a56ae4..9271f7290005 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -112,12 +112,12 @@ static void ib_cq_poll_work(struct work_struct *work) IB_POLL_BATCH); if (completed >= IB_POLL_BUDGET_WORKQUEUE || ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) - queue_work(ib_comp_wq, &cq->work); + queue_work(cq->comp_wq, &cq->work); } static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) { - queue_work(ib_comp_wq, &cq->work); + queue_work(cq->comp_wq, &cq->work); } /** @@ -175,9 +175,12 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); break; case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: cq->comp_handler = ib_cq_completion_workqueue; INIT_WORK(&cq->work, ib_cq_poll_work); ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ? + ib_comp_wq : ib_comp_unbound_wq; break; default: ret = -EINVAL; @@ -213,6 +216,7 @@ void ib_free_cq(struct ib_cq *cq) irq_poll_disable(&cq->iop); break; case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: cancel_work_sync(&cq->work); break; default: diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index db3b6271f09d..6d8ac51a39cc 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -61,6 +61,7 @@ struct ib_client_data { }; struct workqueue_struct *ib_comp_wq; +struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); @@ -1166,10 +1167,19 @@ static int __init ib_core_init(void) goto err; } + ib_comp_unbound_wq = + alloc_workqueue("ib-comp-unb-wq", + WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | + WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); + if (!ib_comp_unbound_wq) { + ret = -ENOMEM; + goto err_comp; + } + ret = class_register(&ib_class); if (ret) { pr_warn("Couldn't create InfiniBand device class\n"); - goto err_comp; + goto err_comp_unbound; } ret = rdma_nl_init(); @@ -1218,6 +1228,8 @@ err_ibnl: rdma_nl_exit(); err_sysfs: class_unregister(&ib_class); +err_comp_unbound: + destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); err: @@ -1236,6 +1248,7 @@ static void __exit ib_core_cleanup(void) addr_cleanup(); rdma_nl_exit(); class_unregister(&ib_class); + destroy_workqueue(ib_comp_unbound_wq); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
*/ destroy_workqueue(ib_wq); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index ef459f2f2eeb..b8977c3db5f3 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -3183,7 +3183,7 @@ static int ib_mad_port_open(struct ib_device *device, cq_size *= 2; port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, - IB_POLL_WORKQUEUE); + IB_POLL_UNBOUND_WORKQUEUE); if (IS_ERR(port_priv->cq)) { dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e950c2a68f06..df8d234a2b56 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -71,6 +71,7 @@ extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; +extern struct workqueue_struct *ib_comp_unbound_wq; union ib_gid { u8 raw[16]; @@ -1570,9 +1571,10 @@ struct ib_ah { typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); enum ib_poll_context { - IB_POLL_DIRECT, /* caller context, no hw completions */ - IB_POLL_SOFTIRQ, /* poll from softirq context */ - IB_POLL_WORKQUEUE, /* poll from workqueue */ + IB_POLL_DIRECT, /* caller context, no hw completions */ + IB_POLL_SOFTIRQ, /* poll from softirq context */ + IB_POLL_WORKQUEUE, /* poll from workqueue */ + IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */ }; struct ib_cq { @@ -1589,6 +1591,7 @@ struct ib_cq { struct irq_poll iop; struct work_struct work; }; + struct workqueue_struct *comp_wq; /* * Implementation details of the RDMA core, don't use in drivers: */ -- cgit v1.2.3 From 6ceb6331b3291694fb6ceba625219f51447c3fa2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 3 Sep 2018 20:18:03 +0300 Subject: RDMA/uverbs: Declare closing variable as boolean The "closing" variable is used as boolean and set to "true" in one place, update the declaration of that variable and their other assignment to proper type. Fixes: e951747a087a ("IB/uverbs: Rework the locking for cleaning up the ucontext") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 2 +- include/rdma/ib_verbs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index a21d5214afc3..4b72851ade24 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -120,7 +120,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, rcu_read_lock(); ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); rcu_read_unlock(); - ucontext->closing = 0; + ucontext->closing = false; ucontext->cleanup_retryable = false; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index df8d234a2b56..a4c3a09a91bc 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1486,7 +1486,7 @@ struct ib_ucontext { * it is set when we are closing the file descriptor and indicates * that mm_sem may be locked. */ - int closing; + bool closing; bool cleanup_retryable; -- cgit v1.2.3 From adee9f3f3bbb317c5469f84deba01eef4b86515b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 09:47:58 +0300 Subject: RDMA/core: Depend on device_add() to add device attributes Instead of adding/removing device attribute files, depend on device_add() which considers adding these device files based on NULL terminated attributes group array. 
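For reference, a minimal sketch of the driver-core idiom being adopted, with a made-up attribute name ("example"): attributes are collected in a NULL-terminated array wrapped in an attribute_group, and device_add()/device_del() create and remove the sysfs files automatically.

#include <linux/device.h>

static ssize_t example_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "example\n");
}
static DEVICE_ATTR_RO(example);

static struct attribute *example_attrs[] = {
	&dev_attr_example.attr,
	NULL,
};

static const struct attribute_group example_group = {
	.attrs = example_attrs,
};

/* The group array is assigned to the device's ->groups pointer before
 * device_add(), which is exactly what ib_device_register_sysfs() does
 * in the diff below.
 */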
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/sysfs.c | 61 ++++++++++++++++++----------------------- include/rdma/ib_verbs.h | 3 ++ 2 files changed, 30 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7fd14ead7b37..185075af3ad6 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1183,7 +1183,7 @@ err_put: return ret; } -static ssize_t show_node_type(struct device *device, +static ssize_t node_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1198,8 +1198,9 @@ static ssize_t show_node_type(struct device *device, default: return sprintf(buf, "%d: \n", dev->node_type); } } +static DEVICE_ATTR_RO(node_type); -static ssize_t show_sys_image_guid(struct device *device, +static ssize_t sys_image_guid_show(struct device *device, struct device_attribute *dev_attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1210,8 +1211,9 @@ static ssize_t show_sys_image_guid(struct device *device, be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); } +static DEVICE_ATTR_RO(sys_image_guid); -static ssize_t show_node_guid(struct device *device, +static ssize_t node_guid_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1222,8 +1224,9 @@ static ssize_t show_node_guid(struct device *device, be16_to_cpu(((__be16 *) &dev->node_guid)[2]), be16_to_cpu(((__be16 *) &dev->node_guid)[3])); } +static DEVICE_ATTR_RO(node_guid); -static ssize_t show_node_desc(struct device *device, +static ssize_t node_desc_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1231,9 +1234,9 @@ static ssize_t show_node_desc(struct device *device, return sprintf(buf, "%.64s\n", dev->node_desc); } -static ssize_t set_node_desc(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t node_desc_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) { struct ib_device *dev = container_of(device, struct ib_device, dev); struct ib_device_modify desc = {}; @@ -1249,8 +1252,9 @@ static ssize_t set_node_desc(struct device *device, return count; } +static DEVICE_ATTR_RW(node_desc); -static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, +static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1259,19 +1263,19 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); return strlen(buf); } +static DEVICE_ATTR_RO(fw_ver); + +static struct attribute *ib_dev_attrs[] = { + &dev_attr_node_type.attr, + &dev_attr_node_guid.attr, + &dev_attr_sys_image_guid.attr, + &dev_attr_fw_ver.attr, + &dev_attr_node_desc.attr, + NULL, +}; -static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); -static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); -static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); -static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, 
show_node_desc, set_node_desc); -static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); - -static struct device_attribute *ib_class_attributes[] = { - &dev_attr_node_type, - &dev_attr_sys_image_guid, - &dev_attr_node_guid, - &dev_attr_node_desc, - &dev_attr_fw_ver, +static const struct attribute_group dev_attr_group = { + .attrs = ib_dev_attrs, }; static void free_port_list_attributes(struct ib_device *device) @@ -1311,16 +1315,13 @@ int ib_device_register_sysfs(struct ib_device *device, if (ret) return ret; + device->groups[0] = &dev_attr_group; + class_dev->groups = device->groups; + ret = device_add(class_dev); if (ret) goto err; - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { - ret = device_create_file(class_dev, ib_class_attributes[i]); - if (ret) - goto err_unregister; - } - device->ports_parent = kobject_create_and_add("ports", &class_dev->kobj); if (!device->ports_parent) { @@ -1347,18 +1348,13 @@ int ib_device_register_sysfs(struct ib_device *device, err_put: free_port_list_attributes(device); - -err_unregister: device_del(class_dev); - err: return ret; } void ib_device_unregister_sysfs(struct ib_device *device) { - int i; - /* Hold kobject until ib_dealloc_device() */ kobject_get(&device->dev.kobj); @@ -1369,8 +1365,5 @@ void ib_device_unregister_sysfs(struct ib_device *device) free_hsag(&device->dev.kobj, device->hw_stats_ag); } - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) - device_remove_file(&device->dev, ib_class_attributes[i]); - device_unregister(&device->dev); } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e950c2a68f06..cd0f935f0bc1 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2536,6 +2536,9 @@ struct ib_device { struct module *owner; struct device dev; + /* First group for device attributes, NULL terminated array */ + const struct attribute_group *groups[2]; + struct kobject *ports_parent; struct list_head port_list; -- cgit v1.2.3 From e1f540c3ed0e9634d0f8c4600f3c85df8aff4ae2 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 15:08:45 +0300 Subject: RDMA/core: Define client_data_lock as rwlock instead of spinlock Even though device registration/unregistration and client registration/unregistration is not a performance path, define the client_data_lock as rwlock for code clarity. 
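A small sketch of the resulting reader/writer split, with illustrative names: lookups take the lock shared, list updates take it exclusively.

#include <linux/spinlock.h>
#include <linux/list.h>

static DEFINE_RWLOCK(example_lock);
static LIST_HEAD(example_list);

static void example_read_path(void)
{
	unsigned long flags;

	read_lock_irqsave(&example_lock, flags);
	/* walk example_list, e.g. to find a client's data */
	read_unlock_irqrestore(&example_lock, flags);
}

static void example_write_path(struct list_head *entry)
{
	write_lock_irq(&example_lock);
	list_add(entry, &example_list);
	write_unlock_irq(&example_lock);
}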
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 30 +++++++++++++++--------------- include/rdma/ib_verbs.h | 5 +++-- 2 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a51d16ab1329..a0939140ed3a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -270,7 +270,7 @@ struct ib_device *ib_alloc_device(size_t size) INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); - spin_lock_init(&device->client_data_lock); + rwlock_init(&device->client_data_lock); INIT_LIST_HEAD(&device->client_data_list); INIT_LIST_HEAD(&device->port_list); @@ -307,9 +307,9 @@ static int add_client_context(struct ib_device *device, struct ib_client *client context->going_down = false; down_write(&lists_rwsem); - spin_lock_irq(&device->client_data_lock); + write_lock_irq(&device->client_data_lock); list_add(&context->list, &device->client_data_list); - spin_unlock_irq(&device->client_data_lock); + write_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); return 0; @@ -586,10 +586,10 @@ void ib_unregister_device(struct ib_device *device) down_write(&lists_rwsem); list_del(&device->core_list); - spin_lock_irq(&device->client_data_lock); + write_lock_irq(&device->client_data_lock); list_for_each_entry(context, &device->client_data_list, list) context->going_down = true; - spin_unlock_irq(&device->client_data_lock); + write_unlock_irq(&device->client_data_lock); downgrade_write(&lists_rwsem); list_for_each_entry(context, &device->client_data_list, list) { @@ -609,13 +609,13 @@ void ib_unregister_device(struct ib_device *device) kfree(device->port_pkey_list); down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); + write_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) { list_del(&context->list); kfree(context); } - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); device->reg_state = IB_DEV_UNREGISTERED; @@ -678,14 +678,14 @@ void ib_unregister_client(struct ib_client *client) struct ib_client_data *found_context = NULL; down_write(&lists_rwsem); - spin_lock_irq(&device->client_data_lock); + write_lock_irq(&device->client_data_lock); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->going_down = true; found_context = context; break; } - spin_unlock_irq(&device->client_data_lock); + write_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); if (client->remove) @@ -699,9 +699,9 @@ void ib_unregister_client(struct ib_client *client) } down_write(&lists_rwsem); - spin_lock_irq(&device->client_data_lock); + write_lock_irq(&device->client_data_lock); list_del(&found_context->list); - spin_unlock_irq(&device->client_data_lock); + write_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); kfree(found_context); } @@ -724,13 +724,13 @@ void *ib_get_client_data(struct ib_device *device, struct ib_client *client) void *ret = NULL; unsigned long flags; - spin_lock_irqsave(&device->client_data_lock, flags); + read_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { ret = context->data; break; } - 
spin_unlock_irqrestore(&device->client_data_lock, flags); + read_unlock_irqrestore(&device->client_data_lock, flags); return ret; } @@ -751,7 +751,7 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client, struct ib_client_data *context; unsigned long flags; - spin_lock_irqsave(&device->client_data_lock, flags); + write_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->data = data; @@ -762,7 +762,7 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client, device->name, client->name); out: - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irqrestore(&device->client_data_lock, flags); } EXPORT_SYMBOL(ib_set_client_data); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ddc7c317e136..995f176d4782 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2256,10 +2256,11 @@ struct ib_device { struct list_head event_handler_list; spinlock_t event_handler_lock; - spinlock_t client_data_lock; + rwlock_t client_data_lock; struct list_head core_list; /* Access to the client_data_list is protected by the client_data_lock - * spinlock and the lists_rwsem read-write semaphore */ + * rwlock and the lists_rwsem read-write semaphore + */ struct list_head client_data_list; struct ib_cache cache; -- cgit v1.2.3 From 4269024639f6ff9a1967c4bfa5a2ba7d9853384a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 4 Sep 2018 11:45:14 -0400 Subject: RDMA/core: Document CM @event_handler function Code audit suggests that the RDMA CM event handler callback function is _always_ invoked in a context that is safe to block. That's important for consumer implementers to know, so document that in the comment before rdma_create_id (where the handler function is set up by the consumer). Signed-off-by: Chuck Lever Signed-off-by: Jason Gunthorpe --- include/rdma/rdma_cm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 5d71a7f51a9f..53d93c7d8e01 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -152,7 +152,11 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, * @ps: RDMA port space. * @qp_type: type of queue pair associated with the id. * - * The id holds a reference on the network namespace until it is destroyed. + * Returns a new rdma_cm_id. The id holds a reference on the network + * namespace until it is destroyed. + * + * The event handler callback serializes on the id's mutex and is + * allowed to sleep. */ #define rdma_create_id(net, event_handler, context, ps, qp_type) \ __rdma_create_id((net), (event_handler), (context), (ps), (qp_type), \ -- cgit v1.2.3 From eb93c82ed8c77f00955f2891483170194c3be92c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 4 Sep 2018 11:45:20 -0400 Subject: RDMA/core: Document QP @event_handler function Add helpful warning for RDMA consumer implementers. 
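A sketch of a conforming consumer, with a hypothetical context structure: the handler only records the event and defers any work that might sleep.

#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>

struct example_ctx {
	struct work_struct recovery_work;	/* hypothetical deferred work */
};

static void example_qp_event(struct ib_event *event, void *context)
{
	struct example_ctx *ctx = context;

	pr_debug("QP event %d on QP %u\n", event->event,
		 event->element.qp->qp_num);

	/* Must not block here; hand real work off to process context. */
	schedule_work(&ctx->recovery_work);
}

The function is installed through the event_handler and qp_context fields of struct ib_qp_init_attr, which is where the new comment below is placed.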
Signed-off-by: Chuck Lever Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 995f176d4782..f687faadf33b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1138,7 +1138,9 @@ enum ib_qp_create_flags { */ struct ib_qp_init_attr { + /* Consumer's event_handler callback must not block */ void (*event_handler)(struct ib_event *, void *); + void *qp_context; struct ib_cq *send_cq; struct ib_cq *recv_cq; -- cgit v1.2.3 From 70cd20aed00f719f3536154df02596106e431e45 Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Thu, 6 Sep 2018 17:27:01 +0300 Subject: IB/uverbs: Add IDRs array attribute type to ioctl() interface Methods sometimes need to get a flexible set of IDRs and not a strict set as can be achieved today by the conventional IDR attribute. Add a new IDRS_ARRAY attribute to the generic uverbs ioctl layer. IDRS_ARRAY points to array of idrs of the same object type and same access rights, only write and read are supported. Signed-off-by: Guy Levi Signed-off-by: Jason Gunthorpe `` Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 114 +++++++++++++++++++++++++++++++ drivers/infiniband/core/uverbs_uapi.c | 12 ++++ include/rdma/uverbs_ioctl.h | 71 ++++++++++++++++++- include/uapi/rdma/rdma_user_ioctl_cmds.h | 7 +- 4 files changed, 201 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 4bafd4671de2..0e95a5888274 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -57,6 +57,7 @@ struct bundle_priv { struct ib_uverbs_attr *uattrs; DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN); + DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN); /* * Must be last. bundle ends in a flex array which overlaps @@ -143,6 +144,86 @@ static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, 0, uattr->len - len); } +static int uverbs_process_idrs_array(struct bundle_priv *pbundle, + const struct uverbs_api_attr *attr_uapi, + struct uverbs_objs_arr_attr *attr, + struct ib_uverbs_attr *uattr, + u32 attr_bkey) +{ + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + size_t array_len; + u32 *idr_vals; + int ret = 0; + size_t i; + + if (uattr->attr_data.reserved) + return -EINVAL; + + if (uattr->len % sizeof(u32)) + return -EINVAL; + + array_len = uattr->len / sizeof(u32); + if (array_len < spec->u2.objs_arr.min_len || + array_len > spec->u2.objs_arr.max_len) + return -EINVAL; + + attr->uobjects = + uverbs_alloc(&pbundle->bundle, + array_size(array_len, sizeof(*attr->uobjects))); + if (IS_ERR(attr->uobjects)) + return PTR_ERR(attr->uobjects); + + /* + * Since idr is 4B and *uobjects is >= 4B, we can use attr->uobjects + * to store idrs array and avoid additional memory allocation. The + * idrs array is offset to the end of the uobjects array so we will be + * able to read idr and replace with a pointer. 
+ */ + idr_vals = (u32 *)(attr->uobjects + array_len) - array_len; + + if (uattr->len > sizeof(uattr->data)) { + ret = copy_from_user(idr_vals, u64_to_user_ptr(uattr->data), + uattr->len); + if (ret) + return -EFAULT; + } else { + memcpy(idr_vals, &uattr->data, uattr->len); + } + + for (i = 0; i != array_len; i++) { + attr->uobjects[i] = uverbs_get_uobject_from_file( + spec->u2.objs_arr.obj_type, pbundle->bundle.ufile, + spec->u2.objs_arr.access, idr_vals[i]); + if (IS_ERR(attr->uobjects[i])) { + ret = PTR_ERR(attr->uobjects[i]); + break; + } + } + + attr->len = i; + __set_bit(attr_bkey, pbundle->spec_finalize); + return ret; +} + +static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi, + struct uverbs_objs_arr_attr *attr, + bool commit) +{ + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + int current_ret; + int ret = 0; + size_t i; + + for (i = 0; i != attr->len; i++) { + current_ret = uverbs_finalize_object( + attr->uobjects[i], spec->u2.objs_arr.access, commit); + if (!ret) + ret = current_ret; + } + + return ret; +} + static int uverbs_process_attr(struct bundle_priv *pbundle, const struct uverbs_api_attr *attr_uapi, struct ib_uverbs_attr *uattr, u32 attr_bkey) @@ -246,6 +327,11 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, } break; + + case UVERBS_ATTR_TYPE_IDRS_ARRAY: + return uverbs_process_idrs_array(pbundle, attr_uapi, + &e->objs_arr_attr, uattr, + attr_bkey); default: return -EOPNOTSUPP; } @@ -384,6 +470,7 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) unsigned int i; int ret = 0; + /* fast path for simple uobjects */ i = -1; while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len, i + 1)) < key_bitmap_len) { @@ -397,6 +484,32 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) ret = current_ret; } + i = -1; + while ((i = find_next_bit(pbundle->spec_finalize, key_bitmap_len, + i + 1)) < key_bitmap_len) { + struct uverbs_attr *attr = &pbundle->bundle.attrs[i]; + const struct uverbs_api_attr *attr_uapi; + void __rcu **slot; + int current_ret; + + slot = uapi_get_attr_for_method( + pbundle, + pbundle->method_key | uapi_bkey_to_key_attr(i)); + if (WARN_ON(!slot)) + continue; + + attr_uapi = srcu_dereference( + *slot, + &pbundle->bundle.ufile->device->disassociate_srcu); + + if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { + current_ret = uverbs_free_idrs_array( + attr_uapi, &attr->objs_arr_attr, commit); + if (!ret) + ret = current_ret; + } + } + for (memblock = pbundle->allocated_mem; memblock;) { struct bundle_alloc_head *tmp = memblock; @@ -461,6 +574,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, memset(pbundle->bundle.attr_present, 0, sizeof(pbundle->bundle.attr_present)); memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize)); + memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize)); ret = ib_uverbs_run_method(pbundle, hdr->num_attrs); destroy_ret = bundle_destroy(pbundle, ret == 0); diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 73ea6f0db88f..cdf5ced2c84f 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -73,6 +73,18 @@ static int uapi_merge_method(struct uverbs_api *uapi, if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN) method_elm->driver_method |= is_driver; + /* + * Like other uobject based things we only support a single + * uobject being NEW'd or DESTROY'd + */ + if (attr->attr.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { + u8 access = 
attr->attr.u2.objs_arr.access; + + if (WARN_ON(access == UVERBS_ACCESS_NEW || + access == UVERBS_ACCESS_DESTROY)) + return -EINVAL; + } + attr_slot = uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id), sizeof(*attr_slot)); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index fc2e52234a2a..84d3d15f1f38 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -52,6 +52,7 @@ enum uverbs_attr_type { UVERBS_ATTR_TYPE_IDR, UVERBS_ATTR_TYPE_FD, UVERBS_ATTR_TYPE_ENUM_IN, + UVERBS_ATTR_TYPE_IDRS_ARRAY, }; enum uverbs_obj_access { @@ -101,7 +102,7 @@ struct uverbs_attr_spec { } enum_def; } u; - /* This weird split of the enum lets us remove some padding */ + /* This weird split lets us remove some padding */ union { struct { /* @@ -111,6 +112,17 @@ struct uverbs_attr_spec { */ const struct uverbs_attr_spec *ids; } enum_def; + + struct { + /* + * higher bits mean the namespace and lower bits mean + * the type id within the namespace. + */ + u16 obj_type; + u16 min_len; + u16 max_len; + u8 access; + } objs_arr; } u2; }; @@ -251,6 +263,11 @@ static inline __attribute_const__ u32 uapi_bkey_attr(u32 attr_key) return attr_key - 1; } +static inline __attribute_const__ u32 uapi_bkey_to_key_attr(u32 attr_bkey) +{ + return attr_bkey + 1; +} + /* * ======================================= * Verbs definitions @@ -323,6 +340,27 @@ struct uverbs_object_tree_def { #define UA_MANDATORY .mandatory = 1 #define UA_OPTIONAL .mandatory = 0 +/* + * min_len must be bigger than 0 and _max_len must be smaller than 4095. Only + * READ\WRITE accesses are supported. + */ +#define UVERBS_ATTR_IDRS_ARR(_attr_id, _idr_type, _access, _min_len, _max_len, \ + ...) \ + (&(const struct uverbs_attr_def){ \ + .id = (_attr_id) + \ + BUILD_BUG_ON_ZERO((_min_len) == 0 || \ + (_max_len) > \ + PAGE_SIZE / sizeof(void *) || \ + (_min_len) > (_max_len) || \ + (_access) == UVERBS_ACCESS_NEW || \ + (_access) == UVERBS_ACCESS_DESTROY), \ + .attr = { .type = UVERBS_ATTR_TYPE_IDRS_ARRAY, \ + .u2.objs_arr.obj_type = _idr_type, \ + .u2.objs_arr.access = _access, \ + .u2.objs_arr.min_len = _min_len, \ + .u2.objs_arr.max_len = _max_len, \ + __VA_ARGS__ } }) + #define UVERBS_ATTR_IDR(_attr_id, _idr_type, _access, ...) \ (&(const struct uverbs_attr_def){ \ .id = _attr_id, \ @@ -440,10 +478,16 @@ struct uverbs_obj_attr { const struct uverbs_api_attr *attr_elm; }; +struct uverbs_objs_arr_attr { + struct ib_uobject **uobjects; + u16 len; +}; + struct uverbs_attr { union { struct uverbs_ptr_attr ptr_attr; struct uverbs_obj_attr obj_attr; + struct uverbs_objs_arr_attr objs_arr_attr; }; }; @@ -516,6 +560,31 @@ uverbs_attr_get_len(const struct uverbs_attr_bundle *attrs_bundle, u16 idx) return attr->ptr_attr.len; } +/** + * uverbs_attr_get_uobjs_arr() - Provides array's properties for attribute for + * UVERBS_ATTR_TYPE_IDRS_ARRAY. + * @arr: Returned pointer to array of pointers for uobjects or NULL if + * the attribute isn't provided. + * + * Return: The array length or 0 if no attribute was provided. 
+ */ +static inline int uverbs_attr_get_uobjs_arr( + const struct uverbs_attr_bundle *attrs_bundle, u16 attr_idx, + struct ib_uobject ***arr) +{ + const struct uverbs_attr *attr = + uverbs_attr_get(attrs_bundle, attr_idx); + + if (IS_ERR(attr)) { + *arr = NULL; + return 0; + } + + *arr = attr->objs_arr_attr.uobjects; + + return attr->objs_arr_attr.len; +} + static inline bool uverbs_attr_ptr_is_inline(const struct uverbs_attr *attr) { return attr->ptr_attr.len <= sizeof(attr->ptr_attr.data); diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 24800c6c1f32..06c34d99be85 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -53,7 +53,7 @@ enum { struct ib_uverbs_attr { __u16 attr_id; /* command specific type attribute */ - __u16 len; /* only for pointers */ + __u16 len; /* only for pointers and IDRs array */ __u16 flags; /* combination of UVERBS_ATTR_F_XXXX */ union { struct { @@ -63,7 +63,10 @@ struct ib_uverbs_attr { __u16 reserved; } attr_data; union { - /* Used by PTR_IN/OUT, ENUM_IN and IDR */ + /* + * ptr to command, inline data, idr/fd or + * ptr to __u32 array of IDRs + */ __aligned_u64 data; /* Used by FD_IN and FD_OUT */ __s64 data_s64; -- cgit v1.2.3 From 86e1d464a8ccd627b6ea3e9a98a0389b0d27fd1f Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 6 Sep 2018 17:27:02 +0300 Subject: RDMA/uverbs: Move flow resources initialization Use ib_set_flow() when initializing flow related resources. Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 6 ------ drivers/infiniband/core/uverbs_cmd.c | 19 ++----------------- drivers/infiniband/hw/mlx5/flow.c | 2 +- include/rdma/ib_verbs.h | 14 -------------- include/rdma/uverbs_std_types.h | 33 +++++++++++++++++++++++++++++++++ 5 files changed, 36 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 7199c275ab79..717ab35b0af9 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -219,12 +219,6 @@ struct ib_ucq_object { u32 async_events_reported; }; -struct ib_uflow_resources; -struct ib_uflow_object { - struct ib_uobject uobject; - struct ib_uflow_resources *resources; -}; - extern const struct file_operations uverbs_event_fops; void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue); struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file, diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4b72851ade24..c054d65dec1b 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2747,15 +2747,6 @@ out_put: return ret ? 
ret : in_len; } -struct ib_uflow_resources { - size_t max; - size_t num; - size_t collection_num; - size_t counters_num; - struct ib_counters **counters; - struct ib_flow_action **collection; -}; - static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) { struct ib_uflow_resources *resources; @@ -3462,7 +3453,6 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; struct ib_uobject *uobj; - struct ib_uflow_object *uflow; struct ib_flow *flow_id; struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; @@ -3601,13 +3591,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, err = PTR_ERR(flow_id); goto err_free; } - atomic_inc(&qp->usecnt); - flow_id->qp = qp; - flow_id->device = qp->device; - flow_id->uobject = uobj; - uobj->object = flow_id; - uflow = container_of(uobj, typeof(*uflow), uobject); - uflow->resources = uflow_res; + + ib_set_flow(uobj, flow_id, qp, qp->device, uflow_res); memset(&resp, 0, sizeof(resp)); resp.flow_handle = uobj->id; diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 5750a650884e..12abbc02af99 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -128,7 +128,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( if (IS_ERR(flow_handler)) return PTR_ERR(flow_handler); - ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev); + ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, NULL); return 0; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index f687faadf33b..6076c9b72ab9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4162,20 +4162,6 @@ ib_get_vector_affinity(struct ib_device *device, int comp_vector) } -static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow, - struct ib_qp *qp, struct ib_device *device) -{ - uobj->object = ibflow; - ibflow->uobject = uobj; - - if (qp) { - atomic_inc(&qp->usecnt); - ibflow->qp = qp; - } - - ibflow->device = device; -} - /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. 
diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 526d918fcd5a..dfd6d35f1783 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -152,5 +152,38 @@ static inline void uverbs_flow_action_fill_action(struct ib_flow_action *action, uobj->object = action; } +struct ib_uflow_resources { + size_t max; + size_t num; + size_t collection_num; + size_t counters_num; + struct ib_counters **counters; + struct ib_flow_action **collection; +}; + +struct ib_uflow_object { + struct ib_uobject uobject; + struct ib_uflow_resources *resources; +}; + +static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow, + struct ib_qp *qp, struct ib_device *device, + struct ib_uflow_resources *uflow_res) +{ + struct ib_uflow_object *uflow; + + uobj->object = ibflow; + ibflow->uobject = uobj; + + if (qp) { + atomic_inc(&qp->usecnt); + ibflow->qp = qp; + } + + ibflow->device = device; + uflow = container_of(uobj, typeof(*uflow), uobject); + uflow->resources = uflow_res; +} + #endif -- cgit v1.2.3 From fa76d24ee0aa24fff3fa9ba71fc2179fb88fef6a Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 6 Sep 2018 17:27:06 +0300 Subject: RDMA/mlx5: Add flow actions support to raw create flow Support attaching flow actions to a flow rule via raw create flow. For now only NIC RX path is supported. This change requires to export flow resources management functions so we can maintain proper bookkeeping of flow actions. Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 11 +++++---- drivers/infiniband/hw/mlx5/flow.c | 40 ++++++++++++++++++++++++++++---- include/rdma/uverbs_std_types.h | 6 +++++ include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + 4 files changed, 50 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index c054d65dec1b..9c87c98a0f19 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2747,7 +2747,7 @@ out_put: return ret ? 
ret : in_len; } -static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) +struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) { struct ib_uflow_resources *resources; @@ -2777,6 +2777,7 @@ err: return NULL; } +EXPORT_SYMBOL(flow_resources_alloc); void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) { @@ -2795,10 +2796,11 @@ void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) kfree(uflow_res->counters); kfree(uflow_res); } +EXPORT_SYMBOL(ib_uverbs_flow_resources_free); -static void flow_resources_add(struct ib_uflow_resources *uflow_res, - enum ib_flow_spec_type type, - void *ibobj) +void flow_resources_add(struct ib_uflow_resources *uflow_res, + enum ib_flow_spec_type type, + void *ibobj) { WARN_ON(uflow_res->num >= uflow_res->max); @@ -2819,6 +2821,7 @@ static void flow_resources_add(struct ib_uflow_resources *uflow_res, uflow_res->num++; } +EXPORT_SYMBOL(flow_resources_add); static int kern_spec_to_ib_spec_action(struct ib_uverbs_file *ufile, struct ib_uverbs_flow_spec *kern_spec, diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 0e913491d139..ce9276a2aaa5 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -58,12 +58,15 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { }, }; +#define MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS 2 static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; struct mlx5_ib_flow_handler *flow_handler; struct mlx5_ib_flow_matcher *fs_matcher; + struct ib_uobject **arr_flow_actions; + struct ib_uflow_resources *uflow_res; void *devx_obj; int dest_id, dest_type; void *cmd_in; @@ -73,6 +76,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); struct mlx5_ib_dev *dev = to_mdev(uobj->context->device); + int len, ret, i; if (!capable(CAP_NET_RAW)) return -EPERM; @@ -124,15 +128,38 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); fs_matcher = uverbs_attr_get_obj(attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCHER); + + uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS); + if (!uflow_res) + return -ENOMEM; + + len = uverbs_attr_get_uobjs_arr(attrs, + MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions); + for (i = 0; i < len; i++) { + struct mlx5_ib_flow_action *maction = + to_mflow_act(arr_flow_actions[i]->object); + + ret = parse_flow_flow_action(maction, false, &flow_act); + if (ret) + goto err_out; + flow_resources_add(uflow_res, IB_FLOW_SPEC_ACTION_HANDLE, + arr_flow_actions[i]->object); + } + flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, &flow_act, cmd_in, inlen, dest_id, dest_type); - if (IS_ERR(flow_handler)) - return PTR_ERR(flow_handler); + if (IS_ERR(flow_handler)) { + ret = PTR_ERR(flow_handler); + goto err_out; + } - ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, NULL); + ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, uflow_res); return 0; +err_out: + ib_uverbs_flow_resources_free(uflow_res); + return ret; } static int flow_matcher_cleanup(struct ib_uobject *uobject, @@ -459,7 +486,12 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ACCESS_READ), UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, MLX5_IB_OBJECT_DEVX_OBJ, - UVERBS_ACCESS_READ)); + UVERBS_ACCESS_READ), + 
UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_READ, 1, + MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS, + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_DESTROY_FLOW, diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index dfd6d35f1783..3db2802fbc68 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -166,6 +166,12 @@ struct ib_uflow_object { struct ib_uflow_resources *resources; }; +struct ib_uflow_resources *flow_resources_alloc(size_t num_specs); +void flow_resources_add(struct ib_uflow_resources *uflow_res, + enum ib_flow_spec_type type, + void *ibobj); +void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res); + static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow, struct ib_qp *qp, struct ib_device *device, struct ib_uflow_resources *uflow_res) diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 75c7093fd95b..91c3d42ebd0f 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -155,6 +155,7 @@ enum mlx5_ib_create_flow_attrs { MLX5_IB_ATTR_CREATE_FLOW_DEST_QP, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, MLX5_IB_ATTR_CREATE_FLOW_MATCHER, + MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, }; enum mlx5_ib_destoy_flow_attrs { -- cgit v1.2.3 From a7ee18bdee837e4703f01588993504b72074ffc6 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 6 Sep 2018 17:27:08 +0300 Subject: RDMA/mlx5: Allow creating a matcher for a NIC TX flow table Currently a matcher can only be created and attached to a NIC RX flow table. Extend it to allow it on NIC TX flow tables as well. In order to achieve that, we: 1) Expose a new attribute: MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS. enum ib_flow_flags is used as valid flags. Only IB_FLOW_ATTR_FLAGS_EGRESS is supported. 2) Remove the requirement to have a DEVX or QP destination when creating a flow. A flow added to NIC TX flow table will forward the packet outside of the vport (Wire or E-Switch in the SR-iOV case). 
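As a rough sketch of how the new attribute is consumed (condensed from the handler change in the diff below, for illustration only): the matcher-create handler reads the optional flags attribute, where IB_FLOW_ATTR_FLAGS_EGRESS is the only bit uverbs will accept, and switches the matcher to the NIC TX namespace when it is set:

	u32 flags;

	err = uverbs_get_flags32(&flags, attrs,
				 MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
				 IB_FLOW_ATTR_FLAGS_EGRESS);
	if (err)
		goto end;

	if (flags) {
		/* only the egress bit can be set at this point */
		err = mlx5_ib_ft_type_to_namespace(
			MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX, &obj->ns_type);
		if (err)
			goto end;
	}

A matcher created this way then takes the egress branch in the create-flow handler, where no explicit QP or DEVX destination is required.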
Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 34 +++++++++++++++++++++++++++----- drivers/infiniband/hw/mlx5/main.c | 5 ++++- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + 3 files changed, 34 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 16e677c549e6..4ee4af450720 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -86,7 +86,14 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( dest_qp = uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); - if ((dest_devx && dest_qp) || (!dest_devx && !dest_qp)) + fs_matcher = uverbs_attr_get_obj(attrs, + MLX5_IB_ATTR_CREATE_FLOW_MATCHER); + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS && + ((dest_devx && dest_qp) || (!dest_devx && !dest_qp))) + return -EINVAL; + + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS && + (dest_devx || dest_qp)) return -EINVAL; if (dest_devx) { @@ -100,7 +107,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( */ if (!mlx5_ib_devx_is_flow_dest(devx_obj, &dest_id, &dest_type)) return -EINVAL; - } else { + } else if (dest_qp) { struct mlx5_ib_qp *mqp; qp = uverbs_attr_get_obj(attrs, @@ -117,6 +124,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( else dest_id = mqp->raw_packet_qp.rq.tirn; dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; + } else { + dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT; } if (dev->rep) @@ -126,8 +135,6 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); inlen = uverbs_attr_get_len(attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); - fs_matcher = uverbs_attr_get_obj(attrs, - MLX5_IB_ATTR_CREATE_FLOW_MATCHER); uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS); if (!uflow_res) @@ -183,6 +190,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( attrs, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE); struct mlx5_ib_dev *dev = to_mdev(uobj->context->device); struct mlx5_ib_flow_matcher *obj; + u32 flags; int err; obj = kzalloc(sizeof(struct mlx5_ib_flow_matcher), GFP_KERNEL); @@ -215,6 +223,19 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( if (err) goto end; + err = uverbs_get_flags32(&flags, attrs, + MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, + IB_FLOW_ATTR_FLAGS_EGRESS); + if (err) + goto end; + + if (flags) { + err = mlx5_ib_ft_type_to_namespace( + MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX, &obj->ns_type); + if (err) + goto end; + } + uobj->object = obj; obj->mdev = dev->mdev; atomic_set(&obj->usecnt, 0); @@ -559,7 +580,10 @@ DECLARE_UVERBS_NAMED_METHOD( UA_MANDATORY), UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA, UVERBS_ATTR_TYPE(u8), - UA_MANDATORY)); + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, + enum ib_flow_flags, + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e311b6f8e1ee..2be6a4377558 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3892,10 +3892,13 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, dst->type = dest_type; dst->tir_num = dest_id; flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - } else { + } else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) { dst->type = 
MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM; dst->ft_num = dest_id; flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + } else { + dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT; + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; } handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, flow_act, diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 91c3d42ebd0f..fb4a8b17cca8 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -125,6 +125,7 @@ enum mlx5_ib_flow_matcher_create_attrs { MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA, + MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, }; enum mlx5_ib_flow_matcher_destroy_attrs { -- cgit v1.2.3 From 0b79b27748cbec221e1ceabf63578198602bf01d Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Mon, 10 Sep 2018 09:49:27 -0700 Subject: IB/{hfi1, qib, rdmavt}: Schedule multi RC/UC packets instead of posting The post_send() path determines if it should post directly or, schedule the post for later. The current logic is: if the swqe ring is empty or (for hfi1) wqe->length <= piothreshold post the send else schedule This can allow large requests to call the send engine directly. Large requests can potentially produce a large number of packets prior to returning to the caller, blocking the caller from posting more requests, and allowing better parallel processing. Allow the driver(s) more say in this logic (pass call_send to the driver, rather than examining a return value). Update hfi1/qib logic to schedule the send engine if an RC or UC message is larger than the QP MTU size. Reviewed-by: Mike Marciniszyn Reviewed-by: Ira Weiny Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 12 +++++------- drivers/infiniband/hw/hfi1/verbs.h | 3 ++- drivers/infiniband/hw/qib/qib_qp.c | 17 +++++++---------- drivers/infiniband/hw/qib/qib_verbs.h | 3 ++- drivers/infiniband/sw/rdmavt/qp.c | 14 ++++++++------ include/rdma/rdma_vt.h | 10 ++++++++-- 6 files changed, 32 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 9b1e84a6b1cc..54d9ff171059 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -285,17 +285,13 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, * hfi1_check_send_wqe - validate wqe * @qp - The qp * @wqe - The built wqe - * - * validate wqe. This is called - * prior to inserting the wqe into - * the ring but after the wqe has been - * setup. + * @call_send - Determine if the send should be posted or scheduled. 
* * Returns 0 on success, -EINVAL on failure * */ int hfi1_check_send_wqe(struct rvt_qp *qp, - struct rvt_swqe *wqe) + struct rvt_swqe *wqe, bool *call_send) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct rvt_ah *ah; @@ -305,6 +301,8 @@ int hfi1_check_send_wqe(struct rvt_qp *qp, case IB_QPT_UC: if (wqe->length > 0x80000000U) return -EINVAL; + if (wqe->length > qp->pmtu) + *call_send = false; break; case IB_QPT_SMI: ah = ibah_to_rvtah(wqe->ud_wr.ah); @@ -321,7 +319,7 @@ int hfi1_check_send_wqe(struct rvt_qp *qp, default: break; } - return wqe->length <= piothreshold; + return 0; } /** diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index a4d06502f06d..269ec338581b 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -343,7 +343,8 @@ int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait); -int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); +int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); extern const u32 rc_only_opcode; extern const u32 uc_only_opcode; diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 344e401915f7..a81905df2d0f 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -378,25 +378,22 @@ void qib_flush_qp_waiters(struct rvt_qp *qp) * qib_check_send_wqe - validate wr/wqe * @qp - The qp * @wqe - The built wqe + * @call_send - Determine if the send should be posted or scheduled * - * validate wr/wqe. This is called - * prior to inserting the wqe into - * the ring but after the wqe has been - * setup. 
- * - * Returns 1 to force direct progress, 0 otherwise, -EINVAL on failure + * Returns 0 on success, -EINVAL on failure */ int qib_check_send_wqe(struct rvt_qp *qp, - struct rvt_swqe *wqe) + struct rvt_swqe *wqe, bool *call_send) { struct rvt_ah *ah; - int ret = 0; switch (qp->ibqp.qp_type) { case IB_QPT_RC: case IB_QPT_UC: if (wqe->length > 0x80000000U) return -EINVAL; + if (wqe->length > qp->pmtu) + *call_send = false; break; case IB_QPT_SMI: case IB_QPT_GSI: @@ -405,12 +402,12 @@ int qib_check_send_wqe(struct rvt_qp *qp, if (wqe->length > (1 << ah->log_pmtu)) return -EINVAL; /* progress hint */ - ret = 1; + *call_send = true; break; default: break; } - return ret; + return 0; } #ifdef CONFIG_DEBUG_FS diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 666613eef88f..3d7b744ae8fb 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -303,7 +303,8 @@ void qib_rc_rcv(struct qib_ctxtdata *rcd, struct ib_header *hdr, int qib_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); -int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); +int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 5ce403c6cddb..a9b7d7ff32ee 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1718,7 +1718,7 @@ static inline int rvt_qp_is_avail( */ static int rvt_post_one_wr(struct rvt_qp *qp, const struct ib_send_wr *wr, - int *call_send) + bool *call_send) { struct rvt_swqe *wqe; u32 next; @@ -1825,11 +1825,9 @@ static int rvt_post_one_wr(struct rvt_qp *qp, /* general part of wqe valid - allow for driver checks */ if (rdi->driver_f.check_send_wqe) { - ret = rdi->driver_f.check_send_wqe(qp, wqe); + ret = rdi->driver_f.check_send_wqe(qp, wqe, call_send); if (ret < 0) goto bail_inval_free; - if (ret) - *call_send = ret; } log_pmtu = qp->log_pmtu; @@ -1897,7 +1895,7 @@ int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); unsigned long flags = 0; - int call_send; + bool call_send; unsigned nreq = 0; int err = 0; @@ -1930,7 +1928,11 @@ int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, bail: spin_unlock_irqrestore(&qp->s_hlock, flags); if (nreq) { - if (call_send) + /* + * Only call do_send if there is exactly one packet, and the + * driver said it was ok. + */ + if (nreq == 1 && call_send) rdi->driver_f.do_send(qp); else rdi->driver_f.schedule_send_no_lock(qp); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index e79229a0cf01..e32facdd9fd3 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -214,8 +214,14 @@ struct rvt_driver_provided { void (*schedule_send)(struct rvt_qp *qp); void (*schedule_send_no_lock)(struct rvt_qp *qp); - /* Driver specific work request checking */ - int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe); + /* + * Validate the wqe. This needs to be done prior to inserting the + * wqe into the ring, but after the wqe has been set up. Allow for + * driver specific work request checking by providing a callback. + * call_send indicates if the wqe should be posted or scheduled. 
+ */ + int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); /* * Sometimes rdmavt needs to kick the driver's send progress. That is -- cgit v1.2.3 From 77addc524473ee9a85d2ef5747a32173c85768d4 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:20 +0300 Subject: RDMA/core: Rename rdma_copy_addr to rdma_copy_src_l2_addr Now that rdma_copy_addr() only copies the source addresses and all callers are interested in copying only source addresses, simplify it to drop the destination address argument. Given that it only copies source layer2 addresses, rename it to rdma_copy_src_l2_addr for better code readability. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 24 +++++++++++++++--------- drivers/infiniband/core/cma.c | 4 ++-- drivers/infiniband/core/core_priv.h | 2 ++ include/rdma/ib_addr.h | 4 ---- 4 files changed, 19 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 40f1c1563477..c9d14d6996b2 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -219,18 +219,24 @@ int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr) } EXPORT_SYMBOL(rdma_addr_size_kss); -void rdma_copy_addr(struct rdma_dev_addr *dev_addr, - const struct net_device *dev, - const unsigned char *dst_dev_addr) +/** + * rdma_copy_src_l2_addr - Copy netdevice source addresses + * @dev_addr: Destination address pointer where to copy the addresses + * @dev: Netdevice whose source addresses to copy + * + * rdma_copy_src_l2_addr() copies source addresses from the specified netdevice. + * This includes unicast address, broadcast address, device type and + * interface index. 
+ */ +void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev) { dev_addr->dev_type = dev->type; memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); - if (dst_dev_addr) - memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); dev_addr->bound_dev_if = dev->ifindex; } -EXPORT_SYMBOL(rdma_copy_addr); +EXPORT_SYMBOL(rdma_copy_src_l2_addr); static struct net_device * rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in) @@ -271,7 +277,7 @@ int rdma_translate_ip(const struct sockaddr *addr, dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!dev) return -ENODEV; - rdma_copy_addr(dev_addr, dev, NULL); + rdma_copy_src_l2_addr(dev_addr, dev); dev_put(dev); return 0; } @@ -279,7 +285,7 @@ int rdma_translate_ip(const struct sockaddr *addr, rcu_read_lock(); dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr); if (!IS_ERR(dev)) - rdma_copy_addr(dev_addr, dev, NULL); + rdma_copy_src_l2_addr(dev_addr, dev); rcu_read_unlock(); return PTR_ERR_OR_ZERO(dev); } @@ -484,7 +490,7 @@ static int rdma_set_src_addr(const struct dst_entry *dst, if (dst->dev->flags & IFF_LOOPBACK) ret = rdma_translate_ip(dst_in, dev_addr); else - rdma_copy_addr(dev_addr, dst->dev, NULL); + rdma_copy_src_l2_addr(dev_addr, dst->dev); return ret; } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 4ba77f4e7098..ace2a4c757f6 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1900,7 +1900,7 @@ cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { - rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL); + rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { @@ -1950,7 +1950,7 @@ cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, goto err; if (net_dev) { - rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL); + rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 77c7005c396c..c3d93350413c 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -340,5 +340,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, u8 *dmac, const struct net_device *ndev, int *hoplimit); +void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev); #endif /* _CORE_PRIV_H */ diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 77c7908b7d73..676514a930ab 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -105,10 +105,6 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, void rdma_addr_cancel(struct rdma_dev_addr *addr); -void rdma_copy_addr(struct rdma_dev_addr *dev_addr, - const struct net_device *dev, - const unsigned char *dst_dev_addr); - int rdma_addr_size(const struct sockaddr *addr); int rdma_addr_size_in6(struct sockaddr_in6 *addr); int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr); -- cgit v1.2.3 From 0e9d2c19bff1d351005afb2f990a913e395ba6d4 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:26 +0300 Subject: RDMA/core: Consider net ns of gid attribute for RoCE When resolving destination 
address or route, when net namespace is unavailable, refer to the net namespace of the netdevice of the SGID attribute. This is typically the case for requests arriving from the network for RoCE ports. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 75 +++++++++++++++++++++++++++++++------ drivers/infiniband/core/cma.c | 7 ++-- drivers/infiniband/core/core_priv.h | 2 +- drivers/infiniband/core/verbs.c | 2 +- include/rdma/ib_addr.h | 3 ++ 5 files changed, 73 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index c4c620334957..7a0356c78f60 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -62,6 +62,7 @@ struct addr_req { struct rdma_dev_addr *addr, void *context); unsigned long timeout; struct delayed_work work; + bool resolve_by_gid_attr; /* Consider gid attr in resolve phase */ int status; u32 seq; }; @@ -518,10 +519,37 @@ static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, return 0; } +static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr) +{ + struct net_device *ndev; + + ndev = rdma_read_gid_attr_ndev_rcu(addr->sgid_attr); + if (IS_ERR(ndev)) + return PTR_ERR(ndev); + + /* + * Since we are holding the rcu, reading net and ifindex + * are safe without any additional reference; because + * change_net_namespace() in net/core/dev.c does rcu sync + * after it changes the state to IFF_DOWN and before + * updating netdev fields {net, ifindex}. + */ + addr->net = dev_net(ndev); + addr->bound_dev_if = ndev->ifindex; + return 0; +} + +static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr) +{ + addr->net = &init_net; + addr->bound_dev_if = 0; +} + static int addr_resolve(struct sockaddr *src_in, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, bool resolve_neigh, + bool resolve_by_gid_attr, u32 seq) { struct dst_entry *dst = NULL; @@ -535,6 +563,23 @@ static int addr_resolve(struct sockaddr *src_in, } rcu_read_lock(); + if (resolve_by_gid_attr) { + if (!addr->sgid_attr) { + rcu_read_unlock(); + pr_warn_ratelimited("%s: missing gid_attr\n", __func__); + return -EINVAL; + } + /* + * If the request is for a specific gid attribute of the + * rdma_dev_addr, derive net from the netdevice of the + * GID attribute. + */ + ret = set_addr_netns_by_gid_rcu(addr); + if (ret) { + rcu_read_unlock(); + return ret; + } + } if (src_in->sa_family == AF_INET) { ret = addr4_resolve(src_in, dst_in, addr, &rt); dst = &rt->dst; @@ -543,7 +588,7 @@ static int addr_resolve(struct sockaddr *src_in, } if (ret) { rcu_read_unlock(); - return ret; + goto done; } ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst); rcu_read_unlock(); @@ -559,6 +604,13 @@ static int addr_resolve(struct sockaddr *src_in, ip_rt_put(rt); else dst_release(dst); +done: + /* + * Clear the addr net to go back to its original state, only if it was + * derived from GID attribute in this context. 
+ */ + if (resolve_by_gid_attr) + rdma_addr_set_net_defaults(addr); return ret; } @@ -573,7 +625,8 @@ static void process_one_req(struct work_struct *_work) src_in = (struct sockaddr *)&req->src_addr; dst_in = (struct sockaddr *)&req->dst_addr; req->status = addr_resolve(src_in, dst_in, req->addr, - true, req->seq); + true, req->resolve_by_gid_attr, + req->seq); if (req->status && time_after_eq(jiffies, req->timeout)) { req->status = -ETIMEDOUT; } else if (req->status == -ENODATA) { @@ -608,6 +661,7 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), + bool resolve_by_gid_attr, void *context) { struct sockaddr *src_in, *dst_in; @@ -636,10 +690,12 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, req->addr = addr; req->callback = callback; req->context = context; + req->resolve_by_gid_attr = resolve_by_gid_attr; INIT_DELAYED_WORK(&req->work, process_one_req); req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); - req->status = addr_resolve(src_in, dst_in, addr, true, req->seq); + req->status = addr_resolve(src_in, dst_in, addr, true, + req->resolve_by_gid_attr, req->seq); switch (req->status) { case 0: req->timeout = jiffies; @@ -683,14 +739,11 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec, if (!attr || !attr->ndev) return -EINVAL; - dev_addr.bound_dev_if = attr->ndev->ifindex; - /* TODO: Use net from the ib_gid_attr once it is added to it, - * until than, limit itself to init_net. - */ dev_addr.net = &init_net; + dev_addr.sgid_attr = attr; ret = addr_resolve(&sgid._sockaddr, &dgid._sockaddr, - &dev_addr, false, 0); + &dev_addr, false, true, 0); if (ret) return ret; @@ -755,7 +808,7 @@ static void resolve_cb(int status, struct sockaddr *src_addr, int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, const struct net_device *ndev, + u8 *dmac, const struct ib_gid_attr *sgid_attr, int *hoplimit) { struct rdma_dev_addr dev_addr; @@ -771,12 +824,12 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); - dev_addr.bound_dev_if = ndev->ifindex; dev_addr.net = &init_net; + dev_addr.sgid_attr = sgid_attr; init_completion(&ctx.comp); ret = rdma_resolve_ip(&sgid_addr._sockaddr, &dgid_addr._sockaddr, - &dev_addr, 1000, resolve_cb, &ctx); + &dev_addr, 1000, resolve_cb, true, &ctx); if (ret) return ret; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index ace2a4c757f6..a57c8b823302 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2987,9 +2987,10 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { - ret = rdma_resolve_ip(cma_src_addr(id_priv), - dst_addr, &id->route.addr.dev_addr, - timeout_ms, addr_handler, id_priv); + ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, + &id->route.addr.dev_addr, + timeout_ms, addr_handler, + false, id_priv); } } if (ret) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 33f50e1929e7..d7399d5b1cb6 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -338,7 +338,7 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, int rdma_addr_find_l2_eth_by_grh(const union ib_gid 
*sgid, const union ib_gid *dgid, - u8 *dmac, const struct net_device *ndev, + u8 *dmac, const struct ib_gid_attr *sgid_attr, int *hoplimit); void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, const struct net_device *dev); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 6ee03d6089eb..c36be384fe34 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -710,7 +710,7 @@ static int ib_resolve_unicast_gid_dmac(struct ib_device *device, ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid, ah_attr->roce.dmac, - sgid_attr->ndev, &hop_limit); + sgid_attr, &hop_limit); grh->hop_limit = hop_limit; return ret; diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 676514a930ab..2e33b1529015 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -95,12 +95,15 @@ int rdma_translate_ip(const struct sockaddr *addr, * @timeout_ms: Amount of time to wait for the address resolution to complete. * @callback: Call invoked once address resolution has completed, timed out, * or been canceled. A status of 0 indicates success. + * @resolve_by_gid_attr: Resolve the ip based on the GID attribute from + * rdma_dev_addr. * @context: User-specified context associated with the call. */ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), + bool resolve_by_gid_attr, void *context); void rdma_addr_cancel(struct rdma_dev_addr *addr); -- cgit v1.2.3 From cb816cd22618b1822667a4c2c80023ffd0261777 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 13 Sep 2018 21:47:52 +0800 Subject: RDMA: Remove duplicated include from ib_addr.h Remove duplicated include. Signed-off-by: YueHaibing Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_addr.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 2e33b1529015..e09eca91eb18 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -46,7 +46,6 @@ #include #include #include -#include #include /** -- cgit v1.2.3 From 9a59739bd01f77db6fbe2955a4fce165f0f43568 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 14 Aug 2018 15:33:02 -0700 Subject: IB/rxe: Revise the ib_wr_opcode enum This enum has become part of the uABI, as both RXE and the ib_uverbs_post_send() command expect userspace to supply values from this enum. So it should be properly placed in include/uapi/rdma. In userspace this enum is called 'enum ibv_wr_opcode' as part of libibverbs.h. That enum defines different values for IB_WR_LOCAL_INV, IB_WR_SEND_WITH_INV, and IB_WR_LSO. These were introduced (incorrectly, it turns out) into libiberbs in 2015. The kernel has changed its mind on the numbering for several of the IB_WC values over the years, but has remained stable on IB_WR_LOCAL_INV and below. Based on this we can conclude that there is no real user space user of the values beyond IB_WR_ATOMIC_FETCH_AND_ADD, as they have never worked via rdma-core. This is confirmed by inspection, only rxe uses the kernel enum and implements the latter operations. rxe has clearly never worked with these attributes from userspace. Other drivers that support these opcodes implement the functionality without calling out to the kernel. 
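For reference, the opcodes whose numbering diverged between the kernel enum and what userspace has been passing down are roughly as follows (values derived from the enums touched in the diff below; shown here for illustration only):

	/*
	 * opcode                      old kernel value   uABI / libibverbs value
	 * IB_WR_LSO (TSO)                     7                   10
	 * IB_WR_SEND_WITH_INV                 8                    9
	 * IB_WR_RDMA_READ_WITH_INV            9                   11
	 * IB_WR_LOCAL_INV                    10                    7
	 */

After the renumbering the shared values could even be asserted at build time, e.g. BUILD_BUG_ON(IB_WR_SEND_WITH_INV != IB_UVERBS_WR_SEND_WITH_INV), although no such check is added by this patch.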
To make IB_WR_SEND_WITH_INV and related work for RXE in userspace we choose to renumber the IB_WR enum in the kernel to match the uABI that userspace has bee using since before Soft RoCE was merged. This is an overall simpler configuration for the whole software stack, and obviously can't break anything existing. Reported-by: Seth Howell Tested-by: Seth Howell Fixes: 8700e3e7c485 ("Soft RoCE driver") Cc: Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 34 ++++++++++++++++++++-------------- include/uapi/rdma/ib_user_verbs.h | 20 +++++++++++++++++++- 2 files changed, 39 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6076c9b72ab9..e463d3007a35 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1281,21 +1281,27 @@ struct ib_qp_attr { }; enum ib_wr_opcode { - IB_WR_RDMA_WRITE, - IB_WR_RDMA_WRITE_WITH_IMM, - IB_WR_SEND, - IB_WR_SEND_WITH_IMM, - IB_WR_RDMA_READ, - IB_WR_ATOMIC_CMP_AND_SWP, - IB_WR_ATOMIC_FETCH_AND_ADD, - IB_WR_LSO, - IB_WR_SEND_WITH_INV, - IB_WR_RDMA_READ_WITH_INV, - IB_WR_LOCAL_INV, - IB_WR_REG_MR, - IB_WR_MASKED_ATOMIC_CMP_AND_SWP, - IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, + /* These are shared with userspace */ + IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE, + IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM, + IB_WR_SEND = IB_UVERBS_WR_SEND, + IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM, + IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ, + IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP, + IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD, + IB_WR_LSO = IB_UVERBS_WR_TSO, + IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV, + IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV, + IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV, + IB_WR_MASKED_ATOMIC_CMP_AND_SWP = + IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, + IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = + IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, + + /* These are kernel only and can not be issued by userspace */ + IB_WR_REG_MR = 0x20, IB_WR_REG_SIG_MR, + /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. */ diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 25a16760de2a..1254b51a551a 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -763,10 +763,28 @@ struct ib_uverbs_sge { __u32 lkey; }; +enum ib_uverbs_wr_opcode { + IB_UVERBS_WR_RDMA_WRITE = 0, + IB_UVERBS_WR_RDMA_WRITE_WITH_IMM = 1, + IB_UVERBS_WR_SEND = 2, + IB_UVERBS_WR_SEND_WITH_IMM = 3, + IB_UVERBS_WR_RDMA_READ = 4, + IB_UVERBS_WR_ATOMIC_CMP_AND_SWP = 5, + IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD = 6, + IB_UVERBS_WR_LOCAL_INV = 7, + IB_UVERBS_WR_BIND_MW = 8, + IB_UVERBS_WR_SEND_WITH_INV = 9, + IB_UVERBS_WR_TSO = 10, + IB_UVERBS_WR_RDMA_READ_WITH_INV = 11, + IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12, + IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13, + /* Review enum ib_wr_opcode before modifying this */ +}; + struct ib_uverbs_send_wr { __aligned_u64 wr_id; __u32 num_sge; - __u32 opcode; + __u32 opcode; /* see enum ib_uverbs_wr_opcode */ __u32 send_flags; union { __be32 imm_data; -- cgit v1.2.3 From 5f9794dc94f59ad1eb821724a8ae1f8e803ea188 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:43:08 +0300 Subject: RDMA/ucontext: Add a core API for mmaping driver IO memory To support disassociation and PCI hot unplug, we have to track all the VMAs that refer to the device IO memory. 
When disassociation occurs the VMAs have to be revised to point to the zero page, not the IO memory, to allow the physical HW to be unplugged. The three drivers supporting this implemented three different versions of this algorithm, all leaving something to be desired. This new common implementation has a few differences from the driver versions: - Track all VMAs, including splitting/truncating/etc. Tie the lifetime of the private data allocation to the lifetime of the vma. This avoids any tricks with setting vm_ops which Linus didn't like. (see link) - Support multiple mms, and support properly tracking mmaps triggered by processes other than the one first opening the uverbs fd. This makes fork behavior of disassociation enabled drivers the same as fork support in normal drivers. - Don't use crazy get_task stuff. - Simplify the approach for to racing between vm_ops close and disassociation, fixing the related bugs most of the driver implementations had. Since we are in core code the tracking list can be placed in struct ib_uverbs_ufile, which has a lifetime strictly longer than any VMAs created by mmap on the uverbs FD. Link: https://www.spinics.net/lists/stable/msg248747.html Link: https://lkml.kernel.org/r/CA+55aFxJTV_g46AQPoPXen-UPiqR1HGMZictt7VpC-SMFbm3Cw@mail.gmail.com Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/rdma_core.c | 4 +- drivers/infiniband/core/rdma_core.h | 1 + drivers/infiniband/core/uverbs.h | 3 + drivers/infiniband/core/uverbs_main.c | 223 ++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 22 ++++ 5 files changed, 252 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index c4118bcd5103..06d31fe56677 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -842,8 +842,10 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, struct ib_ucontext *ucontext = ufile->ucontext; int ret; - if (reason == RDMA_REMOVE_DRIVER_REMOVE) + if (reason == RDMA_REMOVE_DRIVER_REMOVE) { + uverbs_user_mmap_disassociate(ufile); ufile_disassociate_ucontext(ucontext); + } put_pid(ucontext->tgid); ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device, diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index f962f2a593ba..4886d2bba7c7 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -160,5 +160,6 @@ void uverbs_disassociate_api(struct uverbs_api *uapi); void uverbs_destroy_api(struct uverbs_api *uapi); void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, unsigned int num_attrs); +void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile); #endif /* RDMA_CORE_H */ diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 24369eb66c67..c97935a0c7c6 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -158,6 +158,9 @@ struct ib_uverbs_file { spinlock_t uobjects_lock; struct list_head uobjects; + struct mutex umap_lock; + struct list_head umaps; + u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index db6de9157668..8d56773aac56 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -45,6 +45,7 @@ #include #include #include +#include #include @@ -811,6 +812,226 @@ out: return ret; } 
+/* + * Each time we map IO memory into user space this keeps track of the mapping. + * When the device is hot-unplugged we 'zap' the mmaps in user space to point + * to the zero page and allow the hot unplug to proceed. + * + * This is necessary for cases like PCI physical hot unplug as the actual BAR + * memory may vanish after this and access to it from userspace could MCE. + * + * RDMA drivers supporting disassociation must have their user space designed + * to cope in some way with their IO pages going to the zero page. + */ +struct rdma_umap_priv { + struct vm_area_struct *vma; + struct list_head list; +}; + +static const struct vm_operations_struct rdma_umap_ops; + +static void rdma_umap_priv_init(struct rdma_umap_priv *priv, + struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + + priv->vma = vma; + vma->vm_private_data = priv; + vma->vm_ops = &rdma_umap_ops; + + mutex_lock(&ufile->umap_lock); + list_add(&priv->list, &ufile->umaps); + mutex_unlock(&ufile->umap_lock); +} + +/* + * The VMA has been dup'd, initialize the vm_private_data with a new tracking + * struct + */ +static void rdma_umap_open(struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + struct rdma_umap_priv *opriv = vma->vm_private_data; + struct rdma_umap_priv *priv; + + if (!opriv) + return; + + /* We are racing with disassociation */ + if (!down_read_trylock(&ufile->hw_destroy_rwsem)) + goto out_zap; + /* + * Disassociation already completed, the VMA should already be zapped. + */ + if (!ufile->ucontext) + goto out_unlock; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + goto out_unlock; + rdma_umap_priv_init(priv, vma); + + up_read(&ufile->hw_destroy_rwsem); + return; + +out_unlock: + up_read(&ufile->hw_destroy_rwsem); +out_zap: + /* + * We can't allow the VMA to be created with the actual IO pages, that + * would break our API contract, and it can't be stopped at this + * point, so zap it. + */ + vma->vm_private_data = NULL; + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); +} + +static void rdma_umap_close(struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + struct rdma_umap_priv *priv = vma->vm_private_data; + + if (!priv) + return; + + /* + * The vma holds a reference on the struct file that created it, which + * in turn means that the ib_uverbs_file is guaranteed to exist at + * this point. + */ + mutex_lock(&ufile->umap_lock); + list_del(&priv->list); + mutex_unlock(&ufile->umap_lock); + kfree(priv); +} + +static const struct vm_operations_struct rdma_umap_ops = { + .open = rdma_umap_open, + .close = rdma_umap_close, +}; + +static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, + unsigned long size) +{ + struct ib_uverbs_file *ufile = ucontext->ufile; + struct rdma_umap_priv *priv; + + if (vma->vm_end - vma->vm_start != size) + return ERR_PTR(-EINVAL); + + /* Driver is using this wrong, must be called by ib_uverbs_mmap */ + if (WARN_ON(!vma->vm_file || + vma->vm_file->private_data != ufile)) + return ERR_PTR(-EINVAL); + lockdep_assert_held(&ufile->device->disassociate_srcu); + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return ERR_PTR(-ENOMEM); + return priv; +} + +/* + * Map IO memory into a process. This is to be called by drivers as part of + * their mmap() functions if they wish to send something like PCI-E BAR memory + * to userspace. 
+ */ +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); + + if (IS_ERR(priv)) + return PTR_ERR(priv); + + vma->vm_page_prot = prot; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) { + kfree(priv); + return -EAGAIN; + } + + rdma_umap_priv_init(priv, vma); + return 0; +} +EXPORT_SYMBOL(rdma_user_mmap_io); + +/* + * The page case is here for a slightly different reason, the driver expects + * to be able to free the page it is sharing to user space when it destroys + * its ucontext, which means we need to zap the user space references. + * + * We could handle this differently by providing an API to allocate a shared + * page and then only freeing the shared page when the last ufile is + * destroyed. + */ +int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size) +{ + struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); + + if (IS_ERR(priv)) + return PTR_ERR(priv); + + if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size, + vma->vm_page_prot)) { + kfree(priv); + return -EAGAIN; + } + + rdma_umap_priv_init(priv, vma); + return 0; +} +EXPORT_SYMBOL(rdma_user_mmap_page); + +void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) +{ + struct rdma_umap_priv *priv, *next_priv; + + lockdep_assert_held(&ufile->hw_destroy_rwsem); + + while (1) { + struct mm_struct *mm = NULL; + + /* Get an arbitrary mm pointer that hasn't been cleaned yet */ + mutex_lock(&ufile->umap_lock); + if (!list_empty(&ufile->umaps)) { + mm = list_first_entry(&ufile->umaps, + struct rdma_umap_priv, list) + ->vma->vm_mm; + mmget(mm); + } + mutex_unlock(&ufile->umap_lock); + if (!mm) + return; + + /* + * The umap_lock is nested under mmap_sem since it used within + * the vma_ops callbacks, so we have to clean the list one mm + * at a time to get the lock ordering right. Typically there + * will only be one mm, so no big deal. 
+ */ + down_write(&mm->mmap_sem); + mutex_lock(&ufile->umap_lock); + list_for_each_entry_safe (priv, next_priv, &ufile->umaps, + list) { + struct vm_area_struct *vma = priv->vma; + + if (vma->vm_mm != mm) + continue; + list_del_init(&priv->list); + + zap_vma_ptes(vma, vma->vm_start, + vma->vm_end - vma->vm_start); + vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + } + mutex_unlock(&ufile->umap_lock); + up_write(&mm->mmap_sem); + mmput(mm); + } +} + /* * ib_uverbs_open() does not need the BKL: * @@ -872,6 +1093,8 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) spin_lock_init(&file->uobjects_lock); INIT_LIST_HEAD(&file->uobjects); init_rwsem(&file->hw_destroy_rwsem); + mutex_init(&file->umap_lock); + INIT_LIST_HEAD(&file->umaps); filp->private_data = file; list_add_tail(&file->list, &dev->uverbs_file_list); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e463d3007a35..a66238d8a2a3 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2646,6 +2646,28 @@ void *ib_get_client_data(struct ib_device *device, struct ib_client *client); void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, pgprot_t prot); +int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size); +#else +static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, + pgprot_t prot) +{ + return -EINVAL; +} +static inline int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size) +{ + return -EINVAL; +} +#endif + static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; -- cgit v1.2.3 From d4b4dd1b9706e48c370f88d3adfe713e43423cc9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:44:45 +0300 Subject: RDMA/umem: Do not use current->tgid to track the mm_struct This is just wrong, the process that calls into the reg_mr is the process associated with the umem, and that does not have to be the same process that created the context. When this code was first written mmgrab() didn't exist, however these days we can just directly hold the mm_struct pointer in the umem and have no ambiguity when it comes to releasing the umem as to which mm it was associated with. 
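In outline, the new lifetime handling looks like this (a condensed sketch of the change below, error handling omitted):

	/* at registration, in ib_umem_get() */
	umem->owning_mm = mm = current->mm;
	mmgrab(mm);	/* pin the mm_struct itself, not the address space */

	/* pinned-page accounting later works against umem->owning_mm ... */
	down_write(&umem->owning_mm->mmap_sem);
	umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem);
	up_write(&umem->owning_mm->mmap_sem);

	/* ... and the common release tail drops the reference */
	mmdrop(umem->owning_mm);
	kfree(umem);

Because mmgrab() only keeps the mm_struct allocation alive, the release path no longer has to look the mm up through the task's tgid or hold a full mmget()/mmput() reference.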
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 77 ++++++++++++++++++++---------------------- include/rdma/ib_umem.h | 3 +- 2 files changed, 37 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index a41792dbae1f..c32a3e27a896 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -86,6 +86,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct vm_area_struct **vma_list; unsigned long lock_limit; unsigned long cur_base; + struct mm_struct *mm; unsigned long npages; int ret; int i; @@ -124,6 +125,8 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, return umem; } + umem->owning_mm = mm = current->mm; + mmgrab(mm); umem->odp_data = NULL; /* We assume the memory is from hugetlb until proved otherwise */ @@ -132,7 +135,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { ret = -ENOMEM; - goto umem_kfree; + goto umem_kfree_drop; } /* @@ -147,14 +150,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - down_write(¤t->mm->mmap_sem); - current->mm->pinned_vm += npages; - if ((current->mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) { - up_write(¤t->mm->mmap_sem); + down_write(&mm->mmap_sem); + mm->pinned_vm += npages; + if ((mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) { + up_write(&mm->mmap_sem); ret = -ENOMEM; goto vma; } - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); cur_base = addr & PAGE_MASK; @@ -172,14 +175,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, sg_list_start = umem->sg_head.sgl; - down_read(¤t->mm->mmap_sem); + down_read(&mm->mmap_sem); while (npages) { ret = get_user_pages_longterm(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), gup_flags, page_list, vma_list); if (ret < 0) { - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); goto umem_release; } @@ -197,7 +200,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, /* preparing for next loop */ sg_list_start = sg; } - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, @@ -223,6 +226,9 @@ out: if (vma_list) free_page((unsigned long) vma_list); free_page((unsigned long) page_list); +umem_kfree_drop: + if (ret) + mmdrop(umem->owning_mm); umem_kfree: if (ret) kfree(umem); @@ -230,15 +236,21 @@ umem_kfree: } EXPORT_SYMBOL(ib_umem_get); -static void ib_umem_account(struct work_struct *work) +static void __ib_umem_release_tail(struct ib_umem *umem) +{ + mmdrop(umem->owning_mm); + kfree(umem); +} + +static void ib_umem_release_defer(struct work_struct *work) { struct ib_umem *umem = container_of(work, struct ib_umem, work); - down_write(&umem->mm->mmap_sem); - umem->mm->pinned_vm -= umem->diff; - up_write(&umem->mm->mmap_sem); - mmput(umem->mm); - kfree(umem); + down_write(&umem->owning_mm->mmap_sem); + umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + up_write(&umem->owning_mm->mmap_sem); + + __ib_umem_release_tail(umem); } /** @@ -248,9 +260,6 @@ static void ib_umem_account(struct work_struct *work) void ib_umem_release(struct ib_umem *umem) { struct ib_ucontext *context = umem->context; - struct mm_struct 
*mm; - struct task_struct *task; - unsigned long diff; if (umem->odp_data) { ib_umem_odp_release(umem); @@ -259,41 +268,27 @@ void ib_umem_release(struct ib_umem *umem) __ib_umem_release(umem->context->device, umem, 1); - task = get_pid_task(umem->context->tgid, PIDTYPE_PID); - if (!task) - goto out; - mm = get_task_mm(task); - put_task_struct(task); - if (!mm) - goto out; - - diff = ib_umem_num_pages(umem); - /* * We may be called with the mm's mmap_sem already held. This * can happen when a userspace munmap() is the call that drops * the last reference to our file and calls our release * method. If there are memory regions to destroy, we'll end * up here and not be able to take the mmap_sem. In that case - * we defer the vm_locked accounting to the system workqueue. + * we defer the vm_locked accounting a workqueue. */ if (context->closing) { - if (!down_write_trylock(&mm->mmap_sem)) { - INIT_WORK(&umem->work, ib_umem_account); - umem->mm = mm; - umem->diff = diff; - + if (!down_write_trylock(&umem->owning_mm->mmap_sem)) { + INIT_WORK(&umem->work, ib_umem_release_defer); queue_work(ib_wq, &umem->work); return; } - } else - down_write(&mm->mmap_sem); + } else { + down_write(&umem->owning_mm->mmap_sem); + } + umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + up_write(&umem->owning_mm->mmap_sem); - mm->pinned_vm -= diff; - up_write(&mm->mmap_sem); - mmput(mm); -out: - kfree(umem); + __ib_umem_release_tail(umem); } EXPORT_SYMBOL(ib_umem_release); diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index a1fd63871d17..e1c00b2ead19 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -42,14 +42,13 @@ struct ib_umem_odp; struct ib_umem { struct ib_ucontext *context; + struct mm_struct *owning_mm; size_t length; unsigned long address; int page_shift; int writable; int hugetlb; struct work_struct work; - struct mm_struct *mm; - unsigned long diff; struct ib_umem_odp *odp_data; struct sg_table sg_head; int nmap; -- cgit v1.2.3 From b5231b019d76521dd8c59a54c174770ec92c767c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:04 +0300 Subject: RDMA/umem: Use ib_umem_odp in all function signatures connected to ODP All of these functions already require the ODP version of the umem struct, make this very clear by having the signature require it. This paves the way to using the container_of() pattern to link umem_odp and umem together. 
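A minimal sketch of where this is heading (not part of this patch, and assuming struct ib_umem eventually becomes an embedded member named 'umem' inside struct ib_umem_odp):

	static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
	{
		return container_of(umem, struct ib_umem_odp, umem);
	}

For now the two structures still reference each other through pointers, which is why the converted functions below can simply do:

	struct ib_umem *umem = umem_odp->umem;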
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 2 +- drivers/infiniband/core/umem_odp.c | 139 ++++++++++++++++++----------------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/mr.c | 3 +- drivers/infiniband/hw/mlx5/odp.c | 54 +++++++------- include/rdma/ib_umem_odp.h | 39 +++++----- include/rdma/ib_verbs.h | 4 +- 7 files changed, 129 insertions(+), 114 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index c32a3e27a896..971d92ddea8f 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -262,7 +262,7 @@ void ib_umem_release(struct ib_umem *umem) struct ib_ucontext *context = umem->context; if (umem->odp_data) { - ib_umem_odp_release(umem); + ib_umem_odp_release(to_ib_umem_odp(umem)); return; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 29e34e6a6420..8405e9afd7dc 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -77,41 +77,41 @@ static u64 node_last(struct umem_odp_node *n) INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, node_start, node_last, static, rbt_ib_umem) -static void ib_umem_notifier_start_account(struct ib_umem *item) +static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) { - mutex_lock(&item->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); /* Only update private counters for this umem if it has them. * Otherwise skip it. All page faults will be delayed for this umem. */ - if (item->odp_data->mn_counters_active) { - int notifiers_count = item->odp_data->notifiers_count++; + if (umem_odp->mn_counters_active) { + int notifiers_count = umem_odp->notifiers_count++; if (notifiers_count == 0) /* Initialize the completion object for waiting on * notifiers. Since notifier_count is zero, no one * should be waiting right now. */ - reinit_completion(&item->odp_data->notifier_completion); + reinit_completion(&umem_odp->notifier_completion); } - mutex_unlock(&item->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); } -static void ib_umem_notifier_end_account(struct ib_umem *item) +static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) { - mutex_lock(&item->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); /* Only update private counters for this umem if it has them. * Otherwise skip it. All page faults will be delayed for this umem. */ - if (item->odp_data->mn_counters_active) { + if (umem_odp->mn_counters_active) { /* * This sequence increase will notify the QP page fault that * the page that is going to be mapped in the spte could have * been freed. */ - ++item->odp_data->notifiers_seq; - if (--item->odp_data->notifiers_count == 0) - complete_all(&item->odp_data->notifier_completion); + ++umem_odp->notifiers_seq; + if (--umem_odp->notifiers_count == 0) + complete_all(&umem_odp->notifier_completion); } - mutex_unlock(&item->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); } /* Account for a new mmu notifier in an ib_ucontext. 
*/ @@ -156,20 +156,23 @@ static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) } } -static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, - u64 end, void *cookie) { +static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, + u64 start, u64 end, void *cookie) +{ + struct ib_umem *umem = umem_odp->umem; + /* * Increase the number of notifiers running, to * prevent any further fault handling on this MR. */ - ib_umem_notifier_start_account(item); - item->odp_data->dying = 1; + ib_umem_notifier_start_account(umem_odp); + umem_odp->dying = 1; /* Make sure that the fact the umem is dying is out before we release * all pending page faults. */ smp_wmb(); - complete_all(&item->odp_data->notifier_completion); - item->context->invalidate_range(item, ib_umem_start(item), - ib_umem_end(item)); + complete_all(&umem_odp->notifier_completion); + umem->context->invalidate_range(umem_odp, ib_umem_start(umem), + ib_umem_end(umem)); return 0; } @@ -191,20 +194,20 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, up_read(&context->umem_rwsem); } -static int invalidate_page_trampoline(struct ib_umem *item, u64 start, +static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->context->invalidate_range(item, start, start + PAGE_SIZE); + item->umem->context->invalidate_range(item, start, start + PAGE_SIZE); ib_umem_notifier_end_account(item); return 0; } -static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, - u64 end, void *cookie) +static int invalidate_range_start_trampoline(struct ib_umem_odp *item, + u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->context->invalidate_range(item, start, end); + item->umem->context->invalidate_range(item, start, end); return 0; } @@ -235,7 +238,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, return ret; } -static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, +static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_end_account(item); @@ -271,9 +274,8 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { .invalidate_range_end = ib_umem_notifier_invalidate_range_end, }; -struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size) +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, + unsigned long addr, size_t size) { struct ib_umem *umem; struct ib_umem_odp *odp_data; @@ -326,7 +328,7 @@ struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, umem->odp_data = odp_data; - return umem; + return odp_data; out_page_list: vfree(odp_data->page_list); @@ -462,8 +464,9 @@ out_mm: return ret_val; } -void ib_umem_odp_release(struct ib_umem *umem) +void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { + struct ib_umem *umem = umem_odp->umem; struct ib_ucontext *context = umem->context; /* @@ -472,17 +475,17 @@ void ib_umem_odp_release(struct ib_umem *umem) * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. 
*/ - ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); down_write(&context->umem_rwsem); if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_remove(&umem->odp_data->interval_tree, + rbt_ib_umem_remove(&umem_odp->interval_tree, &context->umem_tree); context->odp_mrs_count--; - if (!umem->odp_data->mn_counters_active) { - list_del(&umem->odp_data->no_private_counters); - complete_all(&umem->odp_data->notifier_completion); + if (!umem_odp->mn_counters_active) { + list_del(&umem_odp->no_private_counters); + complete_all(&umem_odp->notifier_completion); } /* @@ -523,9 +526,9 @@ out_put_task: out: up_read(&context->umem_rwsem); - vfree(umem->odp_data->dma_list); - vfree(umem->odp_data->page_list); - kfree(umem->odp_data); + vfree(umem_odp->dma_list); + vfree(umem_odp->page_list); + kfree(umem_odp); kfree(umem); } @@ -538,7 +541,7 @@ out: * @access_mask: access permissions needed for this page. * @current_seq: sequence number for synchronization with invalidations. * the sequence number is taken from - * umem->odp_data->notifiers_seq. + * umem_odp->notifiers_seq. * * The function returns -EFAULT if the DMA mapping operation fails. It returns * -EAGAIN if a concurrent invalidation prevents us from updating the page. @@ -548,12 +551,13 @@ out: * umem. */ static int ib_umem_odp_map_dma_single_page( - struct ib_umem *umem, + struct ib_umem_odp *umem_odp, int page_index, struct page *page, u64 access_mask, unsigned long current_seq) { + struct ib_umem *umem = umem_odp->umem; struct ib_device *dev = umem->context->device; dma_addr_t dma_addr; int stored_page = 0; @@ -565,11 +569,11 @@ static int ib_umem_odp_map_dma_single_page( * handle case of a racing notifier. This check also allows us to bail * early if we have a notifier running in parallel with us. */ - if (ib_umem_mmu_notifier_retry(umem, current_seq)) { + if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) { ret = -EAGAIN; goto out; } - if (!(umem->odp_data->dma_list[page_index])) { + if (!(umem_odp->dma_list[page_index])) { dma_addr = ib_dma_map_page(dev, page, 0, BIT(umem->page_shift), @@ -578,15 +582,15 @@ static int ib_umem_odp_map_dma_single_page( ret = -EFAULT; goto out; } - umem->odp_data->dma_list[page_index] = dma_addr | access_mask; - umem->odp_data->page_list[page_index] = page; + umem_odp->dma_list[page_index] = dma_addr | access_mask; + umem_odp->page_list[page_index] = page; umem->npages++; stored_page = 1; - } else if (umem->odp_data->page_list[page_index] == page) { - umem->odp_data->dma_list[page_index] |= access_mask; + } else if (umem_odp->page_list[page_index] == page) { + umem_odp->dma_list[page_index] |= access_mask; } else { pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", - umem->odp_data->page_list[page_index], page); + umem_odp->page_list[page_index], page); /* Better remove the mapping now, to prevent any further * damage. */ remove_existing_mapping = 1; @@ -599,7 +603,7 @@ out: if (remove_existing_mapping && umem->context->invalidate_range) { invalidate_page_trampoline( - umem, + umem_odp, ib_umem_start(umem) + (page_index >> umem->page_shift), ib_umem_start(umem) + ((page_index + 1) >> umem->page_shift), @@ -615,7 +619,7 @@ out: * * Pins the range of pages passed in the argument, and maps them to * DMA addresses. The DMA addresses of the mapped pages is updated in - * umem->odp_data->dma_list. + * umem_odp->dma_list. 
* * Returns the number of pages mapped in success, negative error code * for failure. @@ -623,7 +627,7 @@ out: * the function from completing its task. * An -ENOENT error code indicates that userspace process is being terminated * and mm was already destroyed. - * @umem: the umem to map and pin + * @umem_odp: the umem to map and pin * @user_virt: the address from which we need to map. * @bcnt: the minimal number of bytes to pin and map. The mapping might be * bigger due to alignment, and may also be smaller in case of an error @@ -633,11 +637,13 @@ out: * range. * @current_seq: the MMU notifiers sequance value for synchronization with * invalidations. the sequance number is read from - * umem->odp_data->notifiers_seq before calling this function + * umem_odp->notifiers_seq before calling this function */ -int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, - u64 access_mask, unsigned long current_seq) +int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, + u64 bcnt, u64 access_mask, + unsigned long current_seq) { + struct ib_umem *umem = umem_odp->umem; struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = NULL; struct page **local_page_list = NULL; @@ -703,7 +709,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, break; bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); - mutex_lock(&umem->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) { if (user_virt & ~page_mask) { p += PAGE_SIZE; @@ -716,7 +722,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, } ret = ib_umem_odp_map_dma_single_page( - umem, k, local_page_list[j], + umem_odp, k, local_page_list[j], access_mask, current_seq); if (ret < 0) break; @@ -724,7 +730,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, p = page_to_phys(local_page_list[j]); k++; } - mutex_unlock(&umem->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); if (ret < 0) { /* Release left over pages when handling errors. */ @@ -750,9 +756,10 @@ out_no_task: } EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); -void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, +void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { + struct ib_umem *umem = umem_odp->umem; int idx; u64 addr; struct ib_device *dev = umem->context->device; @@ -764,12 +771,12 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, * faults from completion. We might be racing with other * invalidations, so we must make sure we free each page only * once. 
*/ - mutex_lock(&umem->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) { idx = (addr - ib_umem_start(umem)) >> umem->page_shift; - if (umem->odp_data->page_list[idx]) { - struct page *page = umem->odp_data->page_list[idx]; - dma_addr_t dma = umem->odp_data->dma_list[idx]; + if (umem_odp->page_list[idx]) { + struct page *page = umem_odp->page_list[idx]; + dma_addr_t dma = umem_odp->dma_list[idx]; dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; WARN_ON(!dma_addr); @@ -792,12 +799,12 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, /* on demand pinning support */ if (!umem->context->invalidate_range) put_page(page); - umem->odp_data->page_list[idx] = NULL; - umem->odp_data->dma_list[idx] = 0; + umem_odp->page_list[idx] = NULL; + umem_odp->dma_list[idx] = 0; umem->npages--; } } - mutex_unlock(&umem->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); } EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); @@ -824,7 +831,7 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, return -EAGAIN; next = rbt_ib_umem_iter_next(node, start, last - 1); umem = container_of(node, struct ib_umem_odp, interval_tree); - ret_val = cb(umem->umem, start, last, cookie) || ret_val; + ret_val = cb(umem, start, last, cookie) || ret_val; } return ret_val; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 81154b598266..dc34ffa4c8b3 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1150,7 +1150,7 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); -void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, +void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent); void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 9fb1d9cb9401..affbf2831ccd 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1631,7 +1631,8 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ if (umem->odp_data->page_list) - mlx5_ib_invalidate_range(umem, ib_umem_start(umem), + mlx5_ib_invalidate_range(to_ib_umem_odp(umem), + ib_umem_start(umem), ib_umem_end(umem)); else mlx5_ib_free_implicit_mr(mr); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index d216e0d2921d..8f4a4a8171eb 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -170,22 +170,24 @@ static void mr_leaf_free_action(struct work_struct *work) wake_up(&imr->q_leaf_free); } -void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, +void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end) { struct mlx5_ib_mr *mr; const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) - 1; u64 idx = 0, blk_start_idx = 0; + struct ib_umem *umem; int in_block = 0; u64 addr; - if (!umem || !umem->odp_data) { + if (!umem_odp) { pr_err("invalidation called on NULL umem or non-ODP umem\n"); return; } + umem = umem_odp->umem; - mr = umem->odp_data->private; + mr = umem_odp->private; if (!mr || !mr->ibmr.pd) 
return; @@ -208,7 +210,7 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, * estimate the cost of another UMR vs. the cost of bigger * UMR. */ - if (umem->odp_data->dma_list[idx] & + if (umem_odp->dma_list[idx] & (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { if (!in_block) { blk_start_idx = idx; @@ -237,13 +239,13 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, * needed. */ - ib_umem_odp_unmap_dma_pages(umem, start, end); + ib_umem_odp_unmap_dma_pages(umem_odp, start, end); if (unlikely(!umem->npages && mr->parent && - !umem->odp_data->dying)) { - WRITE_ONCE(umem->odp_data->dying, 1); + !umem_odp->dying)) { + WRITE_ONCE(umem_odp->dying, 1); atomic_inc(&mr->parent->num_leaf_free); - schedule_work(&umem->odp_data->work); + schedule_work(&umem_odp->work); } } @@ -372,7 +374,6 @@ static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, u64 addr = io_virt & MLX5_IMR_MTT_MASK; int nentries = 0, start_idx = 0, ret; struct mlx5_ib_mr *mtt; - struct ib_umem *umem; mutex_lock(&mr->umem->odp_data->umem_mutex); odp = odp_lookup(ctx, addr, 1, mr); @@ -385,22 +386,22 @@ next_mr: if (nentries) nentries++; } else { - umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); - if (IS_ERR(umem)) { + odp = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); + if (IS_ERR(odp)) { mutex_unlock(&mr->umem->odp_data->umem_mutex); - return ERR_CAST(umem); + return ERR_CAST(odp); } - mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags); + mtt = implicit_mr_alloc(mr->ibmr.pd, odp->umem, 0, + mr->access_flags); if (IS_ERR(mtt)) { mutex_unlock(&mr->umem->odp_data->umem_mutex); - ib_umem_release(umem); + ib_umem_release(odp->umem); return ERR_CAST(mtt); } - odp = umem->odp_data; odp->private = mtt; - mtt->umem = umem; + mtt->umem = odp->umem; mtt->mmkey.iova = addr; mtt->parent = mr; INIT_WORK(&odp->work, mr_leaf_free_action); @@ -460,24 +461,24 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, return imr; } -static int mr_leaf_free(struct ib_umem *umem, u64 start, - u64 end, void *cookie) +static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, + void *cookie) { - struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie; + struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; + struct ib_umem *umem = umem_odp->umem; if (mr->parent != imr) return 0; - ib_umem_odp_unmap_dma_pages(umem, - ib_umem_start(umem), + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); - if (umem->odp_data->dying) + if (umem_odp->dying) return 0; - WRITE_ONCE(umem->odp_data->dying, 1); + WRITE_ONCE(umem_odp->dying, 1); atomic_inc(&imr->num_leaf_free); - schedule_work(&umem->odp_data->work); + schedule_work(&umem_odp->work); return 0; } @@ -533,7 +534,7 @@ next_mr: */ smp_rmb(); - ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size, + ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size, access_mask, current_seq); if (ret < 0) @@ -542,7 +543,8 @@ next_mr: np = ret; mutex_lock(&odp->umem_mutex); - if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { + if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem), + current_seq)) { /* * No need to check whether the MTTs really belong to * this MR, since ib_umem_odp_map_dma_pages already diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 381cdf5a9bd1..3ef2975b5fb2 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -82,15 +82,18 @@ struct ib_umem_odp { struct work_struct work; 
}; +static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) +{ + return umem->odp_data; +} + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, int access); -struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size); - -void ib_umem_odp_release(struct ib_umem *umem); +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, + unsigned long addr, size_t size); +void ib_umem_odp_release(struct ib_umem_odp *umem_odp); /* * The lower 2 bits of the DMA address signal the R/W permissions for @@ -105,13 +108,14 @@ void ib_umem_odp_release(struct ib_umem *umem); #define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) -int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, - u64 access_mask, unsigned long current_seq); +int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, + u64 bcnt, u64 access_mask, + unsigned long current_seq); -void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, +void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, u64 bound); -typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, +typedef int (*umem_call_back)(struct ib_umem_odp *item, u64 start, u64 end, void *cookie); /* * Call the callback on each ib_umem in the range. Returns the logical or of @@ -129,25 +133,25 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length); -static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, +static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, unsigned long mmu_seq) { /* * This code is strongly based on the KVM code from * mmu_notifier_retry. Should be called with - * the relevant locks taken (item->odp_data->umem_mutex + * the relevant locks taken (umem_odp->umem_mutex * and the ucontext umem_mutex semaphore locked for read). */ /* Do not allow page faults while the new ib_umem hasn't seen a state * with zero notifiers yet, and doesn't have its own valid set of * private counters. */ - if (!item->odp_data->mn_counters_active) + if (!umem_odp->mn_counters_active) return 1; - if (unlikely(item->odp_data->notifiers_count)) + if (unlikely(umem_odp->notifiers_count)) return 1; - if (item->odp_data->notifiers_seq != mmu_seq) + if (umem_odp->notifiers_seq != mmu_seq) return 1; return 0; } @@ -161,14 +165,13 @@ static inline int ib_umem_odp_get(struct ib_ucontext *context, return -EINVAL; } -static inline struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size) +static inline struct ib_umem_odp * +ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size) { return ERR_PTR(-EINVAL); } -static inline void ib_umem_odp_release(struct ib_umem *umem) {} +static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a66238d8a2a3..d611ce9df7fb 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -69,6 +69,8 @@ #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN +struct ib_umem_odp; + extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; extern struct workqueue_struct *ib_comp_unbound_wq; @@ -1506,7 +1508,7 @@ struct ib_ucontext { * mmu notifiers registration. 
*/ struct rw_semaphore umem_rwsem; - void (*invalidate_range)(struct ib_umem *umem, + void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); struct mmu_notifier mn; -- cgit v1.2.3 From 41b4deeaa123e62e1037af7a0be547af2e0e05f1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:05 +0300 Subject: RDMA/umem: Make ib_umem_odp into a sub structure of ib_umem These two structures are linked together, use the container_of pattern instead of a double allocation to make the code simpler and easier to follow. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 36 ++++++++++------- drivers/infiniband/core/umem_odp.c | 79 +++++++++++++++----------------------- drivers/infiniband/hw/mlx5/odp.c | 26 ++++++------- include/rdma/ib_umem_odp.h | 11 ++---- 4 files changed, 69 insertions(+), 83 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 971d92ddea8f..88b9b88f90e1 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -108,34 +108,39 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (!can_do_mlock()) return ERR_PTR(-EPERM); - umem = kzalloc(sizeof *umem, GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); + if (access & IB_ACCESS_ON_DEMAND) { + umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + umem->odp_data = to_ib_umem_odp(umem); + } else { + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + } umem->context = context; umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = ib_access_writable(access); + umem->owning_mm = mm = current->mm; + mmgrab(mm); if (access & IB_ACCESS_ON_DEMAND) { - ret = ib_umem_odp_get(context, umem, access); + ret = ib_umem_odp_get(to_ib_umem_odp(umem), access); if (ret) goto umem_kfree; return umem; } - umem->owning_mm = mm = current->mm; - mmgrab(mm); - umem->odp_data = NULL; - /* We assume the memory is from hugetlb until proved otherwise */ umem->hugetlb = 1; page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { ret = -ENOMEM; - goto umem_kfree_drop; + goto umem_kfree; } /* @@ -226,12 +231,11 @@ out: if (vma_list) free_page((unsigned long) vma_list); free_page((unsigned long) page_list); -umem_kfree_drop: - if (ret) - mmdrop(umem->owning_mm); umem_kfree: - if (ret) + if (ret) { + mmdrop(umem->owning_mm); kfree(umem); + } return ret ? 
ERR_PTR(ret) : umem; } EXPORT_SYMBOL(ib_umem_get); @@ -239,7 +243,10 @@ EXPORT_SYMBOL(ib_umem_get); static void __ib_umem_release_tail(struct ib_umem *umem) { mmdrop(umem->owning_mm); - kfree(umem); + if (umem->odp_data) + kfree(to_ib_umem_odp(umem)); + else + kfree(umem); } static void ib_umem_release_defer(struct work_struct *work) @@ -263,6 +270,7 @@ void ib_umem_release(struct ib_umem *umem) if (umem->odp_data) { ib_umem_odp_release(to_ib_umem_odp(umem)); + __ib_umem_release_tail(umem); return; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 8405e9afd7dc..900fdedfe910 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -58,7 +58,7 @@ static u64 node_start(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_start(umem_odp->umem); + return ib_umem_start(&umem_odp->umem); } /* Note that the representation of the intervals in the interval tree @@ -71,7 +71,7 @@ static u64 node_last(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_end(umem_odp->umem) - 1; + return ib_umem_end(&umem_odp->umem) - 1; } INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, @@ -159,7 +159,7 @@ static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; /* * Increase the number of notifiers running, to @@ -198,7 +198,7 @@ static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->umem->context->invalidate_range(item, start, start + PAGE_SIZE); + item->umem.context->invalidate_range(item, start, start + PAGE_SIZE); ib_umem_notifier_end_account(item); return 0; } @@ -207,7 +207,7 @@ static int invalidate_range_start_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->umem->context->invalidate_range(item, start, end); + item->umem.context->invalidate_range(item, start, end); return 0; } @@ -277,28 +277,21 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size) { - struct ib_umem *umem; struct ib_umem_odp *odp_data; + struct ib_umem *umem; int pages = size >> PAGE_SHIFT; int ret; - umem = kzalloc(sizeof(*umem), GFP_KERNEL); - if (!umem) + odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); + if (!odp_data) return ERR_PTR(-ENOMEM); - + umem = &odp_data->umem; umem->context = context; umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = 1; - odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); - if (!odp_data) { - ret = -ENOMEM; - goto out_umem; - } - odp_data->umem = umem; - mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -334,15 +327,14 @@ out_page_list: vfree(odp_data->page_list); out_odp_data: kfree(odp_data); -out_umem: - kfree(umem); return ERR_PTR(ret); } EXPORT_SYMBOL(ib_alloc_odp_umem); -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, - int access) +int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { + struct ib_ucontext *context = umem_odp->umem.context; + struct ib_umem *umem = &umem_odp->umem; 
int ret_val; struct pid *our_pid; struct mm_struct *mm = get_task_mm(current); @@ -378,30 +370,23 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, goto out_mm; } - umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); - if (!umem->odp_data) { - ret_val = -ENOMEM; - goto out_mm; - } - umem->odp_data->umem = umem; - - mutex_init(&umem->odp_data->umem_mutex); + mutex_init(&umem_odp->umem_mutex); - init_completion(&umem->odp_data->notifier_completion); + init_completion(&umem_odp->notifier_completion); if (ib_umem_num_pages(umem)) { - umem->odp_data->page_list = - vzalloc(array_size(sizeof(*umem->odp_data->page_list), + umem_odp->page_list = + vzalloc(array_size(sizeof(*umem_odp->page_list), ib_umem_num_pages(umem))); - if (!umem->odp_data->page_list) { + if (!umem_odp->page_list) { ret_val = -ENOMEM; - goto out_odp_data; + goto out_mm; } - umem->odp_data->dma_list = - vzalloc(array_size(sizeof(*umem->odp_data->dma_list), + umem_odp->dma_list = + vzalloc(array_size(sizeof(*umem_odp->dma_list), ib_umem_num_pages(umem))); - if (!umem->odp_data->dma_list) { + if (!umem_odp->dma_list) { ret_val = -ENOMEM; goto out_page_list; } @@ -415,13 +400,13 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, down_write(&context->umem_rwsem); context->odp_mrs_count++; if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_insert(&umem->odp_data->interval_tree, + rbt_ib_umem_insert(&umem_odp->interval_tree, &context->umem_tree); if (likely(!atomic_read(&context->notifier_count)) || context->odp_mrs_count == 1) - umem->odp_data->mn_counters_active = true; + umem_odp->mn_counters_active = true; else - list_add(&umem->odp_data->no_private_counters, + list_add(&umem_odp->no_private_counters, &context->no_private_counters); downgrade_write(&context->umem_rwsem); @@ -454,11 +439,9 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, out_mutex: up_read(&context->umem_rwsem); - vfree(umem->odp_data->dma_list); + vfree(umem_odp->dma_list); out_page_list: - vfree(umem->odp_data->page_list); -out_odp_data: - kfree(umem->odp_data); + vfree(umem_odp->page_list); out_mm: mmput(mm); return ret_val; @@ -466,7 +449,7 @@ out_mm: void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; struct ib_ucontext *context = umem->context; /* @@ -528,8 +511,6 @@ out: vfree(umem_odp->dma_list); vfree(umem_odp->page_list); - kfree(umem_odp); - kfree(umem); } /* @@ -557,7 +538,7 @@ static int ib_umem_odp_map_dma_single_page( u64 access_mask, unsigned long current_seq) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; struct ib_device *dev = umem->context->device; dma_addr_t dma_addr; int stored_page = 0; @@ -643,7 +624,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, u64 bcnt, u64 access_mask, unsigned long current_seq) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = NULL; struct page **local_page_list = NULL; @@ -759,7 +740,7 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; int idx; u64 addr; struct ib_device *dev = umem->context->device; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 
8f4a4a8171eb..5b9fd56186bd 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -64,7 +64,7 @@ static int check_parent(struct ib_umem_odp *odp, static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) { struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; - struct ib_ucontext *ctx = odp->umem->context; + struct ib_ucontext *ctx = odp->umem.context; struct rb_node *rb; down_read(&ctx->umem_rwsem); @@ -102,7 +102,7 @@ static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, if (!rb) goto not_found; odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); - if (ib_umem_start(odp->umem) > start + length) + if (ib_umem_start(&odp->umem) > start + length) goto not_found; } not_found: @@ -137,7 +137,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, for (i = 0; i < nentries; i++, pklm++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); va = (offset + i) * MLX5_IMR_MTT_SIZE; - if (odp && odp->umem->address == va) { + if (odp && odp->umem.address == va) { struct mlx5_ib_mr *mtt = odp->private; pklm->key = cpu_to_be32(mtt->ibmr.lkey); @@ -153,13 +153,13 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, static void mr_leaf_free_action(struct work_struct *work) { struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); - int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT; + int idx = ib_umem_start(&odp->umem) >> MLX5_IMR_MTT_SHIFT; struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; mr->parent = NULL; synchronize_srcu(&mr->dev->mr_srcu); - ib_umem_release(odp->umem); + ib_umem_release(&odp->umem); if (imr->live) mlx5_ib_update_xlt(imr, idx, 1, 0, MLX5_IB_UPD_XLT_INDIRECT | @@ -185,7 +185,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, pr_err("invalidation called on NULL umem or non-ODP umem\n"); return; } - umem = umem_odp->umem; + umem = &umem_odp->umem; mr = umem_odp->private; @@ -392,16 +392,16 @@ next_mr: return ERR_CAST(odp); } - mtt = implicit_mr_alloc(mr->ibmr.pd, odp->umem, 0, + mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0, mr->access_flags); if (IS_ERR(mtt)) { mutex_unlock(&mr->umem->odp_data->umem_mutex); - ib_umem_release(odp->umem); + ib_umem_release(&odp->umem); return ERR_CAST(mtt); } odp->private = mtt; - mtt->umem = odp->umem; + mtt->umem = &odp->umem; mtt->mmkey.iova = addr; mtt->parent = mr; INIT_WORK(&odp->work, mr_leaf_free_action); @@ -418,7 +418,7 @@ next_mr: addr += MLX5_IMR_MTT_SIZE; if (unlikely(addr < io_virt + bcnt)) { odp = odp_next(odp); - if (odp && odp->umem->address != addr) + if (odp && odp->umem.address != addr) odp = NULL; goto next_mr; } @@ -465,7 +465,7 @@ static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; if (mr->parent != imr) return 0; @@ -518,7 +518,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, } next_mr: - size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt); + size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt); page_shift = mr->umem->page_shift; page_mask = ~(BIT(page_shift) - 1); @@ -577,7 +577,7 @@ next_mr: io_virt += size; next = odp_next(odp); - if (unlikely(!next || next->umem->address != io_virt)) { + if (unlikely(!next || next->umem.address != io_virt)) { mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. 
got %p\n", io_virt, next); return -EAGAIN; diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 3ef2975b5fb2..4519ea663df5 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -43,6 +43,7 @@ struct umem_odp_node { }; struct ib_umem_odp { + struct ib_umem umem; /* * An array of the pages included in the on-demand paging umem. * Indices of pages that are currently not mapped into the device will @@ -72,7 +73,6 @@ struct ib_umem_odp { /* A linked list of umems that don't have private mmu notifier * counters yet. */ struct list_head no_private_counters; - struct ib_umem *umem; /* Tree tracking */ struct umem_odp_node interval_tree; @@ -84,13 +84,12 @@ struct ib_umem_odp { static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) { - return umem->odp_data; + return container_of(umem, struct ib_umem_odp, umem); } #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, - int access); +int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size); void ib_umem_odp_release(struct ib_umem_odp *umem_odp); @@ -158,9 +157,7 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ -static inline int ib_umem_odp_get(struct ib_ucontext *context, - struct ib_umem *umem, - int access) +static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { return -EINVAL; } -- cgit v1.2.3 From 597ecc5a095406a668e53ab330495ddb65327f77 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:06 +0300 Subject: RDMA/umem: Get rid of struct ib_umem.odp_data This no longer has any use, we can use container_of to get to the umem_odp, and a simple flag to indicate if this is an odp MR. Remove the few remaining references to it. 
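To make the end state concrete, the following is a minimal userspace sketch of the pattern this commit arrives at; the type and field names are illustrative only, not the kernel ones. A single allocation is sized for the ODP variant when ODP is requested, an is_odp style flag replaces the old odp_data back-pointer, and the few places that need ODP state downcast with a container_of() based helper.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-ins; names intentionally do not match the kernel. */
struct umem {
	bool is_odp;		/* replaces the old back-pointer */
	size_t length;
};

struct umem_odp {
	struct umem umem;	/* embedded as the first member */
	int dying;		/* ODP-only state */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct umem_odp *to_umem_odp(struct umem *u)
{
	return container_of(u, struct umem_odp, umem);
}

/* One allocation, sized for the ODP variant only when it is requested. */
static struct umem *umem_alloc(bool want_odp)
{
	if (want_odp) {
		struct umem_odp *odp = calloc(1, sizeof(*odp));

		if (!odp)
			return NULL;
		odp->umem.is_odp = true;
		return &odp->umem;
	}
	return calloc(1, sizeof(struct umem));
}

static void umem_free(struct umem *u)
{
	if (!u)
		return;
	if (u->is_odp)
		free(to_umem_odp(u));	/* release the containing struct */
	else
		free(u);
}

int main(void)
{
	struct umem *u = umem_alloc(true);

	if (!u)
		return 1;
	printf("is_odp=%d dying=%d\n", u->is_odp, to_umem_odp(u)->dying);
	umem_free(u);
	return 0;
}

Because the embedded base is the first member, the base pointer and the containing pointer are numerically equal, but releasing through the containing type keeps ownership explicit; this mirrors the kfree(to_ib_umem_odp(umem)) call the patch adds to __ib_umem_release_tail().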
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 8 ++++---- drivers/infiniband/core/umem_odp.c | 3 +-- drivers/infiniband/hw/mlx5/mem.c | 9 ++++----- drivers/infiniband/hw/mlx5/mr.c | 13 +++++++------ drivers/infiniband/hw/mlx5/odp.c | 14 ++++++++------ include/rdma/ib_umem.h | 6 +++--- 6 files changed, 27 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 88b9b88f90e1..fec5d489e311 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -112,7 +112,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); - umem->odp_data = to_ib_umem_odp(umem); + umem->is_odp = 1; } else { umem = kzalloc(sizeof(*umem), GFP_KERNEL); if (!umem) @@ -243,7 +243,7 @@ EXPORT_SYMBOL(ib_umem_get); static void __ib_umem_release_tail(struct ib_umem *umem) { mmdrop(umem->owning_mm); - if (umem->odp_data) + if (umem->is_odp) kfree(to_ib_umem_odp(umem)); else kfree(umem); @@ -268,7 +268,7 @@ void ib_umem_release(struct ib_umem *umem) { struct ib_ucontext *context = umem->context; - if (umem->odp_data) { + if (umem->is_odp) { ib_umem_odp_release(to_ib_umem_odp(umem)); __ib_umem_release_tail(umem); return; @@ -306,7 +306,7 @@ int ib_umem_page_count(struct ib_umem *umem) int n; struct scatterlist *sg; - if (umem->odp_data) + if (umem->is_odp) return ib_umem_num_pages(umem); n = 0; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 900fdedfe910..42272b2bf595 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -291,6 +291,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = 1; + umem->is_odp = 1; mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -319,8 +320,6 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, &context->no_private_counters); up_write(&context->umem_rwsem); - umem->odp_data = odp_data; - return odp_data; out_page_list: diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index f3dbd75a0a96..549234988bb4 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -57,7 +57,7 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int entry; unsigned long page_shift = umem->page_shift; - if (umem->odp_data) { + if (umem->is_odp) { *ncont = ib_umem_page_count(umem); *count = *ncont << (page_shift - PAGE_SHIFT); *shift = page_shift; @@ -152,14 +152,13 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, struct scatterlist *sg; int entry; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - const bool odp = umem->odp_data != NULL; - - if (odp) { + if (umem->is_odp) { WARN_ON(shift != 0); WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); for (i = 0; i < num_pages; ++i) { - dma_addr_t pa = umem->odp_data->dma_list[offset + i]; + dma_addr_t pa = + to_ib_umem_odp(umem)->dma_list[offset + i]; pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); } diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index affbf2831ccd..6aac3a107330 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -98,7 +98,7 @@ static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 
start, u64 length) #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING static void update_odp_mr(struct mlx5_ib_mr *mr) { - if (mr->umem->odp_data) { + if (mr->umem->is_odp) { /* * This barrier prevents the compiler from moving the * setting of umem->odp_data->private to point to our @@ -107,7 +107,7 @@ static void update_odp_mr(struct mlx5_ib_mr *mr) * handle invalidations. */ smp_wmb(); - mr->umem->odp_data->private = mr; + to_ib_umem_odp(mr->umem)->private = mr; /* * Make sure we will see the new * umem->odp_data->private value in the invalidation @@ -1624,15 +1624,16 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) struct ib_umem *umem = mr->umem; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - if (umem && umem->odp_data) { + if (umem && umem->is_odp) { + struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem); + /* Prevent new page faults from succeeding */ mr->live = 0; /* Wait for all running page-fault handlers to finish. */ synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ - if (umem->odp_data->page_list) - mlx5_ib_invalidate_range(to_ib_umem_odp(umem), - ib_umem_start(umem), + if (umem_odp->page_list) + mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); else mlx5_ib_free_implicit_mr(mr); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 5b9fd56186bd..d4780bded74a 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -371,11 +371,12 @@ static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context; struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); struct ib_umem_odp *odp, *result = NULL; + struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); u64 addr = io_virt & MLX5_IMR_MTT_MASK; int nentries = 0, start_idx = 0, ret; struct mlx5_ib_mr *mtt; - mutex_lock(&mr->umem->odp_data->umem_mutex); + mutex_lock(&odp_mr->umem_mutex); odp = odp_lookup(ctx, addr, 1, mr); mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", @@ -388,14 +389,14 @@ next_mr: } else { odp = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); if (IS_ERR(odp)) { - mutex_unlock(&mr->umem->odp_data->umem_mutex); + mutex_unlock(&odp_mr->umem_mutex); return ERR_CAST(odp); } mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0, mr->access_flags); if (IS_ERR(mtt)) { - mutex_unlock(&mr->umem->odp_data->umem_mutex); + mutex_unlock(&odp_mr->umem_mutex); ib_umem_release(&odp->umem); return ERR_CAST(mtt); } @@ -433,7 +434,7 @@ next_mr: } } - mutex_unlock(&mr->umem->odp_data->umem_mutex); + mutex_unlock(&odp_mr->umem_mutex); return result; } @@ -498,6 +499,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, u32 *bytes_mapped) { + struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); u64 access_mask = ODP_READ_ALLOWED_BIT; int npages = 0, page_shift, np; u64 start_idx, page_mask; @@ -506,7 +508,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, size_t size; int ret; - if (!mr->umem->odp_data->page_list) { + if (!odp_mr->page_list) { odp = implicit_mr_get_data(mr, io_virt, bcnt); if (IS_ERR(odp)) @@ -514,7 +516,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, mr = odp->private; } else { - odp = mr->umem->odp_data; + odp = odp_mr; } next_mr: diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index e1c00b2ead19..5d3755ec5afa 100644 --- a/include/rdma/ib_umem.h +++ 
b/include/rdma/ib_umem.h @@ -46,10 +46,10 @@ struct ib_umem { size_t length; unsigned long address; int page_shift; - int writable; - int hugetlb; + u32 writable : 1; + u32 hugetlb : 1; + u32 is_odp : 1; struct work_struct work; - struct ib_umem_odp *odp_data; struct sg_table sg_head; int nmap; int npages; -- cgit v1.2.3 From c9990ab39b6e911003bab10a6da96e98ab1503a3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:07 +0300 Subject: RDMA/umem: Move all the ODP related stuff out of ucontext and into per_mm This is the first step to make ODP use the owning_mm that is now part of struct ib_umem. Each ODP umem is linked to a single per_mm structure, which in turn, is linked to a single mm, via the embedded mmu_notifier. This first patch introduces the structure and reworks eveything to use it. This also needs to introduce tgid into the ib_ucontext_per_mm, as get_user_pages_remote() requires the originating task for statistics tracking. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 127 +++++++++++++++++++---------------- drivers/infiniband/core/uverbs_cmd.c | 9 +-- drivers/infiniband/hw/mlx5/odp.c | 43 +++++++----- include/rdma/ib_umem_odp.h | 2 + include/rdma/ib_verbs.h | 32 +++++---- 5 files changed, 120 insertions(+), 93 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 42272b2bf595..6bf3fc0c12a1 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -115,34 +115,35 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) } /* Account for a new mmu notifier in an ib_ucontext. */ -static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) +static void +ib_ucontext_notifier_start_account(struct ib_ucontext_per_mm *per_mm) { - atomic_inc(&context->notifier_count); + atomic_inc(&per_mm->notifier_count); } /* Account for a terminating mmu notifier in an ib_ucontext. * * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since * the function takes the semaphore itself. */ -static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) +static void ib_ucontext_notifier_end_account(struct ib_ucontext_per_mm *per_mm) { - int zero_notifiers = atomic_dec_and_test(&context->notifier_count); + int zero_notifiers = atomic_dec_and_test(&per_mm->notifier_count); if (zero_notifiers && - !list_empty(&context->no_private_counters)) { + !list_empty(&per_mm->no_private_counters)) { /* No currently running mmu notifiers. Now is the chance to * add private accounting to all previously added umems. */ struct ib_umem_odp *odp_data, *next; /* Prevent concurrent mmu notifiers from working on the * no_private_counters list. */ - down_write(&context->umem_rwsem); + down_write(&per_mm->umem_rwsem); /* Read the notifier_count again, with the umem_rwsem * semaphore taken for write. 
*/ - if (!atomic_read(&context->notifier_count)) { + if (!atomic_read(&per_mm->notifier_count)) { list_for_each_entry_safe(odp_data, next, - &context->no_private_counters, + &per_mm->no_private_counters, no_private_counters) { mutex_lock(&odp_data->umem_mutex); odp_data->mn_counters_active = true; @@ -152,7 +153,7 @@ static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) } } - up_write(&context->umem_rwsem); + up_write(&per_mm->umem_rwsem); } } @@ -179,19 +180,20 @@ static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, static void ib_umem_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); - if (!context->invalidate_range) + if (!per_mm->context->invalidate_range) return; - ib_ucontext_notifier_start_account(context); - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, + ib_ucontext_notifier_start_account(per_mm); + down_read(&per_mm->umem_rwsem); + rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, ib_umem_notifier_release_trampoline, true, NULL); - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); } static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start, @@ -217,23 +219,24 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, unsigned long end, bool blockable) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); int ret; - if (!context->invalidate_range) + if (!per_mm->context->invalidate_range) return 0; if (blockable) - down_read(&context->umem_rwsem); - else if (!down_read_trylock(&context->umem_rwsem)) + down_read(&per_mm->umem_rwsem); + else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; - ib_ucontext_notifier_start_account(context); - ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + ib_ucontext_notifier_start_account(per_mm); + ret = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_start_trampoline, blockable, NULL); - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); return ret; } @@ -250,9 +253,10 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, unsigned long start, unsigned long end) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); - if (!context->invalidate_range) + if (!per_mm->context->invalidate_range) return; /* @@ -260,12 +264,12 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, * in ib_umem_notifier_invalidate_range_start so we shouldn't really block * here. But this is ugly and fragile. 
*/ - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + down_read(&per_mm->umem_rwsem); + rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_end_trampoline, true, NULL); - up_read(&context->umem_rwsem); - ib_ucontext_notifier_end_account(context); + up_read(&per_mm->umem_rwsem); + ib_ucontext_notifier_end_account(per_mm); } static const struct mmu_notifier_ops ib_umem_notifiers = { @@ -277,6 +281,7 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size) { + struct ib_ucontext_per_mm *per_mm; struct ib_umem_odp *odp_data; struct ib_umem *umem; int pages = size >> PAGE_SHIFT; @@ -292,6 +297,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, umem->page_shift = PAGE_SHIFT; umem->writable = 1; umem->is_odp = 1; + odp_data->per_mm = per_mm = &context->per_mm; mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -310,15 +316,15 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, goto out_page_list; } - down_write(&context->umem_rwsem); - context->odp_mrs_count++; - rbt_ib_umem_insert(&odp_data->interval_tree, &context->umem_tree); - if (likely(!atomic_read(&context->notifier_count))) + down_write(&per_mm->umem_rwsem); + per_mm->odp_mrs_count++; + rbt_ib_umem_insert(&odp_data->interval_tree, &per_mm->umem_tree); + if (likely(!atomic_read(&per_mm->notifier_count))) odp_data->mn_counters_active = true; else list_add(&odp_data->no_private_counters, - &context->no_private_counters); - up_write(&context->umem_rwsem); + &per_mm->no_private_counters); + up_write(&per_mm->umem_rwsem); return odp_data; @@ -334,6 +340,7 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { struct ib_ucontext *context = umem_odp->umem.context; struct ib_umem *umem = &umem_odp->umem; + struct ib_ucontext_per_mm *per_mm; int ret_val; struct pid *our_pid; struct mm_struct *mm = get_task_mm(current); @@ -396,28 +403,30 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) * notification before the "current" task (and MM) is * destroyed. We use the umem_rwsem semaphore to synchronize. */ - down_write(&context->umem_rwsem); - context->odp_mrs_count++; + umem_odp->per_mm = per_mm = &context->per_mm; + + down_write(&per_mm->umem_rwsem); + per_mm->odp_mrs_count++; if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_insert(&umem_odp->interval_tree, - &context->umem_tree); - if (likely(!atomic_read(&context->notifier_count)) || - context->odp_mrs_count == 1) + &per_mm->umem_tree); + if (likely(!atomic_read(&per_mm->notifier_count)) || + per_mm->odp_mrs_count == 1) umem_odp->mn_counters_active = true; else list_add(&umem_odp->no_private_counters, - &context->no_private_counters); - downgrade_write(&context->umem_rwsem); + &per_mm->no_private_counters); + downgrade_write(&per_mm->umem_rwsem); - if (context->odp_mrs_count == 1) { + if (per_mm->odp_mrs_count == 1) { /* * Note that at this point, no MMU notifier is running - * for this context! + * for this per_mm! 
*/ - atomic_set(&context->notifier_count, 0); - INIT_HLIST_NODE(&context->mn.hlist); - context->mn.ops = &ib_umem_notifiers; - ret_val = mmu_notifier_register(&context->mn, mm); + atomic_set(&per_mm->notifier_count, 0); + INIT_HLIST_NODE(&per_mm->mn.hlist); + per_mm->mn.ops = &ib_umem_notifiers; + ret_val = mmu_notifier_register(&per_mm->mn, mm); if (ret_val) { pr_err("Failed to register mmu_notifier %d\n", ret_val); ret_val = -EBUSY; @@ -425,7 +434,7 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) } } - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); /* * Note that doing an mmput can cause a notifier for the relevant mm. @@ -437,7 +446,7 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) return 0; out_mutex: - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); vfree(umem_odp->dma_list); out_page_list: vfree(umem_odp->page_list); @@ -449,7 +458,7 @@ out_mm: void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { struct ib_umem *umem = &umem_odp->umem; - struct ib_ucontext *context = umem->context; + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; /* * Ensure that no more pages are mapped in the umem. @@ -460,11 +469,11 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); - down_write(&context->umem_rwsem); + down_write(&per_mm->umem_rwsem); if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_remove(&umem_odp->interval_tree, - &context->umem_tree); - context->odp_mrs_count--; + &per_mm->umem_tree); + per_mm->odp_mrs_count--; if (!umem_odp->mn_counters_active) { list_del(&umem_odp->no_private_counters); complete_all(&umem_odp->notifier_completion); @@ -477,13 +486,13 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) * that since we are doing it atomically, no other user could register * and unregister while we do the check. */ - downgrade_write(&context->umem_rwsem); - if (!context->odp_mrs_count) { + downgrade_write(&per_mm->umem_rwsem); + if (!per_mm->odp_mrs_count) { struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = NULL; - owning_process = get_pid_task(context->tgid, - PIDTYPE_PID); + owning_process = + get_pid_task(umem_odp->umem.context->tgid, PIDTYPE_PID); if (owning_process == NULL) /* * The process is already dead, notifier were removed @@ -498,7 +507,7 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) * removed already. 
*/ goto out_put_task; - mmu_notifier_unregister(&context->mn, owning_mm); + mmu_notifier_unregister(&per_mm->mn, owning_mm); mmput(owning_mm); @@ -506,7 +515,7 @@ out_put_task: put_task_struct(owning_process); } out: - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); vfree(umem_odp->dma_list); vfree(umem_odp->page_list); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 9c87c98a0f19..ce678e1008a4 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -124,10 +124,11 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext->cleanup_retryable = false; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - ucontext->umem_tree = RB_ROOT_CACHED; - init_rwsem(&ucontext->umem_rwsem); - ucontext->odp_mrs_count = 0; - INIT_LIST_HEAD(&ucontext->no_private_counters); + ucontext->per_mm.umem_tree = RB_ROOT_CACHED; + init_rwsem(&ucontext->per_mm.umem_rwsem); + ucontext->per_mm.odp_mrs_count = 0; + INIT_LIST_HEAD(&ucontext->per_mm.no_private_counters); + ucontext->per_mm.context = ucontext; if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) ucontext->invalidate_range = NULL; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index d4780bded74a..9982b5f4e598 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -61,13 +61,21 @@ static int check_parent(struct ib_umem_odp *odp, return mr && mr->parent == parent && !odp->dying; } +struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr) +{ + if (WARN_ON(!mr || !mr->umem || !mr->umem->is_odp)) + return NULL; + + return to_ib_umem_odp(mr->umem)->per_mm; +} + static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) { struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; - struct ib_ucontext *ctx = odp->umem.context; + struct ib_ucontext_per_mm *per_mm = odp->per_mm; struct rb_node *rb; - down_read(&ctx->umem_rwsem); + down_read(&per_mm->umem_rwsem); while (1) { rb = rb_next(&odp->interval_tree.rb); if (!rb) @@ -79,19 +87,19 @@ static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) not_found: odp = NULL; end: - up_read(&ctx->umem_rwsem); + up_read(&per_mm->umem_rwsem); return odp; } -static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, - u64 start, u64 length, +static struct ib_umem_odp *odp_lookup(u64 start, u64 length, struct mlx5_ib_mr *parent) { + struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent); struct ib_umem_odp *odp; struct rb_node *rb; - down_read(&ctx->umem_rwsem); - odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length); + down_read(&per_mm->umem_rwsem); + odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length); if (!odp) goto end; @@ -108,7 +116,7 @@ static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, not_found: odp = NULL; end: - up_read(&ctx->umem_rwsem); + up_read(&per_mm->umem_rwsem); return odp; } @@ -116,7 +124,6 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, size_t nentries, struct mlx5_ib_mr *mr, int flags) { struct ib_pd *pd = mr->ibmr.pd; - struct ib_ucontext *ctx = pd->uobject->context; struct mlx5_ib_dev *dev = to_mdev(pd->device); struct ib_umem_odp *odp; unsigned long va; @@ -131,8 +138,8 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, return; } - odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE, - nentries * MLX5_IMR_MTT_SIZE, mr); + odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE, + nentries * MLX5_IMR_MTT_SIZE, mr); for (i = 0; i < nentries; i++, pklm++) { 
pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); @@ -368,7 +375,6 @@ fail: static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt) { - struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context; struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); struct ib_umem_odp *odp, *result = NULL; struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); @@ -377,7 +383,7 @@ static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, struct mlx5_ib_mr *mtt; mutex_lock(&odp_mr->umem_mutex); - odp = odp_lookup(ctx, addr, 1, mr); + odp = odp_lookup(addr, 1, mr); mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", io_virt, bcnt, addr, odp); @@ -387,7 +393,8 @@ next_mr: if (nentries) nentries++; } else { - odp = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); + odp = ib_alloc_odp_umem(odp_mr->umem.context, addr, + MLX5_IMR_MTT_SIZE); if (IS_ERR(odp)) { mutex_unlock(&odp_mr->umem_mutex); return ERR_CAST(odp); @@ -486,12 +493,12 @@ static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) { - struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context; + struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); - down_read(&ctx->umem_rwsem); - rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX, + down_read(&per_mm->umem_rwsem); + rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, mr_leaf_free, true, imr); - up_read(&ctx->umem_rwsem); + up_read(&per_mm->umem_rwsem); wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); } diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 4519ea663df5..394ea6b68db7 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -44,6 +44,8 @@ struct umem_odp_node { struct ib_umem_odp { struct ib_umem umem; + struct ib_ucontext_per_mm *per_mm; + /* * An array of the pages included in the on-demand paging umem. * Indices of pages that are currently not mapped into the device will diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index d611ce9df7fb..2cf2cee5a753 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1488,6 +1488,25 @@ struct ib_rdmacg_object { #endif }; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +struct ib_ucontext_per_mm { + struct ib_ucontext *context; + + struct rb_root_cached umem_tree; + /* + * Protects .umem_rbroot and tree, as well as odp_mrs_count and + * mmu notifiers registration. + */ + struct rw_semaphore umem_rwsem; + + struct mmu_notifier mn; + atomic_t notifier_count; + /* A list of umems that don't have private mmu notifier counters yet. */ + struct list_head no_private_counters; + unsigned int odp_mrs_count; +}; +#endif + struct ib_ucontext { struct ib_device *device; struct ib_uverbs_file *ufile; @@ -1502,20 +1521,9 @@ struct ib_ucontext { struct pid *tgid; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - struct rb_root_cached umem_tree; - /* - * Protects .umem_rbroot and tree, as well as odp_mrs_count and - * mmu notifiers registration. - */ - struct rw_semaphore umem_rwsem; void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); - - struct mmu_notifier mn; - atomic_t notifier_count; - /* A list of umems that don't have private mmu notifier counters yet. 
*/ - struct list_head no_private_counters; - int odp_mrs_count; + struct ib_ucontext_per_mm per_mm; #endif struct ib_rdmacg_object cg_obj; -- cgit v1.2.3 From f27a0d50a4bc2861b472c2e3740d63a29d1ac460 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:08 +0300 Subject: RDMA/umem: Use umem->owning_mm inside ODP Since ODP had a single struct mmu_notifier located in the ucontext it could only handle a single MM at a time, and this prevented it from using the new owning_mm system. With the prior rework it is now simple to let ODP track multiple MMs per ucontext, finish the job so that the per_mm is allocated on a mm by mm basis, and freed when the last umem is dropped from the ucontext. As a side effect the new saner locking removes the lockdep splat about nesting the umem_rwsem between mmu_notifier_unregister and ib_umem_odp_release. It also makes ODP work with multiple processes, across, fork, etc. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 301 +++++++++++++++++++---------------- drivers/infiniband/core/uverbs_cmd.c | 8 +- drivers/infiniband/hw/mlx5/main.c | 7 + drivers/infiniband/hw/mlx5/odp.c | 2 +- include/rdma/ib_umem_odp.h | 20 ++- include/rdma/ib_verbs.h | 22 +-- 6 files changed, 191 insertions(+), 169 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 6bf3fc0c12a1..0577f9ff600f 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -278,10 +278,135 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { .invalidate_range_end = ib_umem_notifier_invalidate_range_end, }; -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, size_t size) +static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_umem *umem = &umem_odp->umem; + + down_write(&per_mm->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_insert(&umem_odp->interval_tree, + &per_mm->umem_tree); + + if (likely(!atomic_read(&per_mm->notifier_count))) + umem_odp->mn_counters_active = true; + else + list_add(&umem_odp->no_private_counters, + &per_mm->no_private_counters); + up_write(&per_mm->umem_rwsem); +} + +static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_umem *umem = &umem_odp->umem; + + down_write(&per_mm->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_remove(&umem_odp->interval_tree, + &per_mm->umem_tree); + if (!umem_odp->mn_counters_active) { + list_del(&umem_odp->no_private_counters); + complete_all(&umem_odp->notifier_completion); + } + + up_write(&per_mm->umem_rwsem); +} + +static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, + struct mm_struct *mm) { struct ib_ucontext_per_mm *per_mm; + int ret; + + per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); + if (!per_mm) + return ERR_PTR(-ENOMEM); + + per_mm->context = ctx; + per_mm->mm = mm; + per_mm->umem_tree = RB_ROOT_CACHED; + init_rwsem(&per_mm->umem_rwsem); + INIT_LIST_HEAD(&per_mm->no_private_counters); + + rcu_read_lock(); + per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + rcu_read_unlock(); + + WARN_ON(mm != current->mm); + + per_mm->mn.ops = &ib_umem_notifiers; + ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); + if (ret) { + 
dev_err(&ctx->device->dev, + "Failed to register mmu_notifier %d\n", ret); + goto out_pid; + } + + list_add(&per_mm->ucontext_list, &ctx->per_mm_list); + return per_mm; + +out_pid: + put_pid(per_mm->tgid); + kfree(per_mm); + return ERR_PTR(ret); +} + +static int get_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext *ctx = umem_odp->umem.context; + struct ib_ucontext_per_mm *per_mm; + + /* + * Generally speaking we expect only one or two per_mm in this list, + * so no reason to optimize this search today. + */ + mutex_lock(&ctx->per_mm_list_lock); + list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { + if (per_mm->mm == umem_odp->umem.owning_mm) + goto found; + } + + per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm); + if (IS_ERR(per_mm)) { + mutex_unlock(&ctx->per_mm_list_lock); + return PTR_ERR(per_mm); + } + +found: + umem_odp->per_mm = per_mm; + per_mm->odp_mrs_count++; + mutex_unlock(&ctx->per_mm_list_lock); + + return 0; +} + +void put_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_ucontext *ctx = umem_odp->umem.context; + bool need_free; + + mutex_lock(&ctx->per_mm_list_lock); + umem_odp->per_mm = NULL; + per_mm->odp_mrs_count--; + need_free = per_mm->odp_mrs_count == 0; + if (need_free) + list_del(&per_mm->ucontext_list); + mutex_unlock(&ctx->per_mm_list_lock); + + if (!need_free) + return; + + mmu_notifier_unregister(&per_mm->mn, per_mm->mm); + put_pid(per_mm->tgid); + kfree(per_mm); +} + +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, + unsigned long addr, size_t size) +{ + struct ib_ucontext *ctx = per_mm->context; struct ib_umem_odp *odp_data; struct ib_umem *umem; int pages = size >> PAGE_SHIFT; @@ -291,13 +416,13 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, if (!odp_data) return ERR_PTR(-ENOMEM); umem = &odp_data->umem; - umem->context = context; + umem->context = ctx; umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = 1; umem->is_odp = 1; - odp_data->per_mm = per_mm = &context->per_mm; + odp_data->per_mm = per_mm; mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -316,15 +441,14 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, goto out_page_list; } - down_write(&per_mm->umem_rwsem); + /* + * Caller must ensure that the umem_odp that the per_mm came from + * cannot be freed during the call to ib_alloc_odp_umem. 
+ */ + mutex_lock(&ctx->per_mm_list_lock); per_mm->odp_mrs_count++; - rbt_ib_umem_insert(&odp_data->interval_tree, &per_mm->umem_tree); - if (likely(!atomic_read(&per_mm->notifier_count))) - odp_data->mn_counters_active = true; - else - list_add(&odp_data->no_private_counters, - &per_mm->no_private_counters); - up_write(&per_mm->umem_rwsem); + mutex_unlock(&ctx->per_mm_list_lock); + add_umem_to_per_mm(odp_data); return odp_data; @@ -338,15 +462,13 @@ EXPORT_SYMBOL(ib_alloc_odp_umem); int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { - struct ib_ucontext *context = umem_odp->umem.context; struct ib_umem *umem = &umem_odp->umem; - struct ib_ucontext_per_mm *per_mm; + /* + * NOTE: This must called in a process context where umem->owning_mm + * == current->mm + */ + struct mm_struct *mm = umem->owning_mm; int ret_val; - struct pid *our_pid; - struct mm_struct *mm = get_task_mm(current); - - if (!mm) - return -EINVAL; if (access & IB_ACCESS_HUGETLB) { struct vm_area_struct *vma; @@ -366,16 +488,6 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) umem->hugetlb = 0; } - /* Prevent creating ODP MRs in child processes */ - rcu_read_lock(); - our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); - rcu_read_unlock(); - put_pid(our_pid); - if (context->tgid != our_pid) { - ret_val = -EINVAL; - goto out_mm; - } - mutex_init(&umem_odp->umem_mutex); init_completion(&umem_odp->notifier_completion); @@ -384,10 +496,8 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) umem_odp->page_list = vzalloc(array_size(sizeof(*umem_odp->page_list), ib_umem_num_pages(umem))); - if (!umem_odp->page_list) { - ret_val = -ENOMEM; - goto out_mm; - } + if (!umem_odp->page_list) + return -ENOMEM; umem_odp->dma_list = vzalloc(array_size(sizeof(*umem_odp->dma_list), @@ -398,67 +508,23 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) } } - /* - * When using MMU notifiers, we will get a - * notification before the "current" task (and MM) is - * destroyed. We use the umem_rwsem semaphore to synchronize. - */ - umem_odp->per_mm = per_mm = &context->per_mm; - - down_write(&per_mm->umem_rwsem); - per_mm->odp_mrs_count++; - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_insert(&umem_odp->interval_tree, - &per_mm->umem_tree); - if (likely(!atomic_read(&per_mm->notifier_count)) || - per_mm->odp_mrs_count == 1) - umem_odp->mn_counters_active = true; - else - list_add(&umem_odp->no_private_counters, - &per_mm->no_private_counters); - downgrade_write(&per_mm->umem_rwsem); + ret_val = get_per_mm(umem_odp); + if (ret_val) + goto out_dma_list; + add_umem_to_per_mm(umem_odp); - if (per_mm->odp_mrs_count == 1) { - /* - * Note that at this point, no MMU notifier is running - * for this per_mm! - */ - atomic_set(&per_mm->notifier_count, 0); - INIT_HLIST_NODE(&per_mm->mn.hlist); - per_mm->mn.ops = &ib_umem_notifiers; - ret_val = mmu_notifier_register(&per_mm->mn, mm); - if (ret_val) { - pr_err("Failed to register mmu_notifier %d\n", ret_val); - ret_val = -EBUSY; - goto out_mutex; - } - } - - up_read(&per_mm->umem_rwsem); - - /* - * Note that doing an mmput can cause a notifier for the relevant mm. - * If the notifier is called while we hold the umem_rwsem, this will - * cause a deadlock. Therefore, we release the reference only after we - * released the semaphore. 
- */ - mmput(mm); return 0; -out_mutex: - up_read(&per_mm->umem_rwsem); +out_dma_list: vfree(umem_odp->dma_list); out_page_list: vfree(umem_odp->page_list); -out_mm: - mmput(mm); return ret_val; } void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { struct ib_umem *umem = &umem_odp->umem; - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; /* * Ensure that no more pages are mapped in the umem. @@ -469,54 +535,8 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); - down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_remove(&umem_odp->interval_tree, - &per_mm->umem_tree); - per_mm->odp_mrs_count--; - if (!umem_odp->mn_counters_active) { - list_del(&umem_odp->no_private_counters); - complete_all(&umem_odp->notifier_completion); - } - - /* - * Downgrade the lock to a read lock. This ensures that the notifiers - * (who lock the mutex for reading) will be able to finish, and we - * will be able to enventually obtain the mmu notifiers SRCU. Note - * that since we are doing it atomically, no other user could register - * and unregister while we do the check. - */ - downgrade_write(&per_mm->umem_rwsem); - if (!per_mm->odp_mrs_count) { - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - - owning_process = - get_pid_task(umem_odp->umem.context->tgid, PIDTYPE_PID); - if (owning_process == NULL) - /* - * The process is already dead, notifier were removed - * already. - */ - goto out; - - owning_mm = get_task_mm(owning_process); - if (owning_mm == NULL) - /* - * The process' mm is already dead, notifier were - * removed already. - */ - goto out_put_task; - mmu_notifier_unregister(&per_mm->mn, owning_mm); - - mmput(owning_mm); - -out_put_task: - put_task_struct(owning_process); - } -out: - up_read(&per_mm->umem_rwsem); - + remove_umem_from_per_mm(umem_odp); + put_per_mm(umem_odp); vfree(umem_odp->dma_list); vfree(umem_odp->page_list); } @@ -634,7 +654,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, { struct ib_umem *umem = &umem_odp->umem; struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; + struct mm_struct *owning_mm = umem_odp->umem.owning_mm; struct page **local_page_list = NULL; u64 page_mask, off; int j, k, ret = 0, start_idx, npages = 0, page_shift; @@ -658,15 +678,14 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, user_virt = user_virt & page_mask; bcnt += off; /* Charge for the first page offset as well. */ - owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); - if (owning_process == NULL) { + /* + * owning_process is allowed to be NULL, this means somehow the mm is + * existing beyond the lifetime of the originating process.. Presumably + * mmget_not_zero will fail in this case. 
+ */ + owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID); + if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) { ret = -EINVAL; - goto out_no_task; - } - - owning_mm = get_task_mm(owning_process); - if (owning_mm == NULL) { - ret = -ENOENT; goto out_put_task; } @@ -738,8 +757,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, mmput(owning_mm); out_put_task: - put_task_struct(owning_process); -out_no_task: + if (owning_process) + put_task_struct(owning_process); free_page((unsigned long)local_page_list); return ret; } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index ce678e1008a4..d77b0b9793c7 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -124,12 +124,8 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext->cleanup_retryable = false; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - ucontext->per_mm.umem_tree = RB_ROOT_CACHED; - init_rwsem(&ucontext->per_mm.umem_rwsem); - ucontext->per_mm.odp_mrs_count = 0; - INIT_LIST_HEAD(&ucontext->per_mm.no_private_counters); - ucontext->per_mm.context = ucontext; - + mutex_init(&ucontext->per_mm_list_lock); + INIT_LIST_HEAD(&ucontext->per_mm_list); if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) ucontext->invalidate_range = NULL; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index aeb328100986..1348a08261a9 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1861,6 +1861,13 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_bfreg_info *bfregi; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* All umem's must be destroyed before destroying the ucontext. */ + mutex_lock(&ibcontext->per_mm_list_lock); + WARN_ON(!list_empty(&ibcontext->per_mm_list)); + mutex_unlock(&ibcontext->per_mm_list_lock); +#endif + if (context->devx_uid) mlx5_ib_devx_destroy(dev, context); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 9982b5f4e598..b04eb6775326 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -393,7 +393,7 @@ next_mr: if (nentries) nentries++; } else { - odp = ib_alloc_odp_umem(odp_mr->umem.context, addr, + odp = ib_alloc_odp_umem(odp_mr->per_mm, addr, MLX5_IMR_MTT_SIZE); if (IS_ERR(odp)) { mutex_unlock(&odp_mr->umem_mutex); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 394ea6b68db7..259eb08dfc9e 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -91,8 +91,26 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +struct ib_ucontext_per_mm { + struct ib_ucontext *context; + struct mm_struct *mm; + struct pid *tgid; + + struct rb_root_cached umem_tree; + /* Protects umem_tree */ + struct rw_semaphore umem_rwsem; + atomic_t notifier_count; + + struct mmu_notifier mn; + /* A list of umems that don't have private mmu notifier counters yet. 
*/ + struct list_head no_private_counters; + unsigned int odp_mrs_count; + + struct list_head ucontext_list; +}; + int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, unsigned long addr, size_t size); void ib_umem_odp_release(struct ib_umem_odp *umem_odp); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 2cf2cee5a753..6437e6af758d 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1488,25 +1488,6 @@ struct ib_rdmacg_object { #endif }; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -struct ib_ucontext_per_mm { - struct ib_ucontext *context; - - struct rb_root_cached umem_tree; - /* - * Protects .umem_rbroot and tree, as well as odp_mrs_count and - * mmu notifiers registration. - */ - struct rw_semaphore umem_rwsem; - - struct mmu_notifier mn; - atomic_t notifier_count; - /* A list of umems that don't have private mmu notifier counters yet. */ - struct list_head no_private_counters; - unsigned int odp_mrs_count; -}; -#endif - struct ib_ucontext { struct ib_device *device; struct ib_uverbs_file *ufile; @@ -1523,7 +1504,8 @@ struct ib_ucontext { #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); - struct ib_ucontext_per_mm per_mm; + struct mutex per_mm_list_lock; + struct list_head per_mm_list; #endif struct ib_rdmacg_object cg_obj; -- cgit v1.2.3 From ca748c39ea3f3c755295d64d69ba0b4375e34b5d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:09 +0300 Subject: RDMA/umem: Get rid of per_mm->notifier_count This is intrinsically racy and the scheme is simply unnecessary. New MR registration can wait for any on going invalidation to fully complete. CPU0 CPU1 if (atomic_read()) if (atomic_dec_and_test() && !list_empty()) { /* not taken */ } list_add() Putting the new UMEM into some kind of purgatory until another invalidate rolls through.. Instead hold the read side of the umem_rwsem across the pair'd start/end and get rid of the racy 'deferred add' approach. Since all umem's in the rbt are always ready to go, also get rid of the mn_counters_active stuff. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 113 ++++++------------------------------- include/rdma/ib_umem_odp.h | 15 ----- 2 files changed, 18 insertions(+), 110 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 0577f9ff600f..1c0c4a431218 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -80,83 +80,29 @@ INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) { mutex_lock(&umem_odp->umem_mutex); - - /* Only update private counters for this umem if it has them. - * Otherwise skip it. All page faults will be delayed for this umem. */ - if (umem_odp->mn_counters_active) { - int notifiers_count = umem_odp->notifiers_count++; - - if (notifiers_count == 0) - /* Initialize the completion object for waiting on - * notifiers. Since notifier_count is zero, no one - * should be waiting right now. */ - reinit_completion(&umem_odp->notifier_completion); - } + if (umem_odp->notifiers_count++ == 0) + /* + * Initialize the completion object for waiting on + * notifiers. 
Since notifier_count is zero, no one should be + * waiting right now. + */ + reinit_completion(&umem_odp->notifier_completion); mutex_unlock(&umem_odp->umem_mutex); } static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) { mutex_lock(&umem_odp->umem_mutex); - - /* Only update private counters for this umem if it has them. - * Otherwise skip it. All page faults will be delayed for this umem. */ - if (umem_odp->mn_counters_active) { - /* - * This sequence increase will notify the QP page fault that - * the page that is going to be mapped in the spte could have - * been freed. - */ - ++umem_odp->notifiers_seq; - if (--umem_odp->notifiers_count == 0) - complete_all(&umem_odp->notifier_completion); - } + /* + * This sequence increase will notify the QP page fault that the page + * that is going to be mapped in the spte could have been freed. + */ + ++umem_odp->notifiers_seq; + if (--umem_odp->notifiers_count == 0) + complete_all(&umem_odp->notifier_completion); mutex_unlock(&umem_odp->umem_mutex); } -/* Account for a new mmu notifier in an ib_ucontext. */ -static void -ib_ucontext_notifier_start_account(struct ib_ucontext_per_mm *per_mm) -{ - atomic_inc(&per_mm->notifier_count); -} - -/* Account for a terminating mmu notifier in an ib_ucontext. - * - * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since - * the function takes the semaphore itself. */ -static void ib_ucontext_notifier_end_account(struct ib_ucontext_per_mm *per_mm) -{ - int zero_notifiers = atomic_dec_and_test(&per_mm->notifier_count); - - if (zero_notifiers && - !list_empty(&per_mm->no_private_counters)) { - /* No currently running mmu notifiers. Now is the chance to - * add private accounting to all previously added umems. */ - struct ib_umem_odp *odp_data, *next; - - /* Prevent concurrent mmu notifiers from working on the - * no_private_counters list. */ - down_write(&per_mm->umem_rwsem); - - /* Read the notifier_count again, with the umem_rwsem - * semaphore taken for write. 
*/ - if (!atomic_read(&per_mm->notifier_count)) { - list_for_each_entry_safe(odp_data, next, - &per_mm->no_private_counters, - no_private_counters) { - mutex_lock(&odp_data->umem_mutex); - odp_data->mn_counters_active = true; - list_del(&odp_data->no_private_counters); - complete_all(&odp_data->notifier_completion); - mutex_unlock(&odp_data->umem_mutex); - } - } - - up_write(&per_mm->umem_rwsem); - } -} - static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { @@ -186,7 +132,6 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, if (!per_mm->context->invalidate_range) return; - ib_ucontext_notifier_start_account(per_mm); down_read(&per_mm->umem_rwsem); rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, @@ -231,14 +176,9 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; - ib_ucontext_notifier_start_account(per_mm); - ret = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, - end, - invalidate_range_start_trampoline, - blockable, NULL); - up_read(&per_mm->umem_rwsem); - - return ret; + return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, + invalidate_range_start_trampoline, + blockable, NULL); } static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, @@ -259,17 +199,10 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, if (!per_mm->context->invalidate_range) return; - /* - * TODO: we currently bail out if there is any sleepable work to be done - * in ib_umem_notifier_invalidate_range_start so we shouldn't really block - * here. But this is ugly and fragile. - */ - down_read(&per_mm->umem_rwsem); rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_end_trampoline, true, NULL); up_read(&per_mm->umem_rwsem); - ib_ucontext_notifier_end_account(per_mm); } static const struct mmu_notifier_ops ib_umem_notifiers = { @@ -287,12 +220,6 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_insert(&umem_odp->interval_tree, &per_mm->umem_tree); - - if (likely(!atomic_read(&per_mm->notifier_count))) - umem_odp->mn_counters_active = true; - else - list_add(&umem_odp->no_private_counters, - &per_mm->no_private_counters); up_write(&per_mm->umem_rwsem); } @@ -305,10 +232,7 @@ static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_remove(&umem_odp->interval_tree, &per_mm->umem_tree); - if (!umem_odp->mn_counters_active) { - list_del(&umem_odp->no_private_counters); - complete_all(&umem_odp->notifier_completion); - } + complete_all(&umem_odp->notifier_completion); up_write(&per_mm->umem_rwsem); } @@ -327,7 +251,6 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, per_mm->mm = mm; per_mm->umem_tree = RB_ROOT_CACHED; init_rwsem(&per_mm->umem_rwsem); - INIT_LIST_HEAD(&per_mm->no_private_counters); rcu_read_lock(); per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 259eb08dfc9e..ce9502545903 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -67,15 +67,9 @@ struct ib_umem_odp { struct mutex umem_mutex; void *private; /* for the HW driver to use. */ - /* When false, use the notifier counter in the ucontext struct. 
*/ - bool mn_counters_active; int notifiers_seq; int notifiers_count; - /* A linked list of umems that don't have private mmu notifier - * counters yet. */ - struct list_head no_private_counters; - /* Tree tracking */ struct umem_odp_node interval_tree; @@ -99,11 +93,8 @@ struct ib_ucontext_per_mm { struct rb_root_cached umem_tree; /* Protects umem_tree */ struct rw_semaphore umem_rwsem; - atomic_t notifier_count; struct mmu_notifier mn; - /* A list of umems that don't have private mmu notifier counters yet. */ - struct list_head no_private_counters; unsigned int odp_mrs_count; struct list_head ucontext_list; @@ -162,12 +153,6 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, * and the ucontext umem_mutex semaphore locked for read). */ - /* Do not allow page faults while the new ib_umem hasn't seen a state - * with zero notifiers yet, and doesn't have its own valid set of - * private counters. */ - if (!umem_odp->mn_counters_active) - return 1; - if (unlikely(umem_odp->notifiers_count)) return 1; if (umem_odp->notifiers_seq != mmu_seq) -- cgit v1.2.3 From be7a57b41ad824dbc59d1ffa91160ee73f2999ee Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:10 +0300 Subject: RDMA/umem: Handle a half-complete start/end sequence mmu_notifier_unregister() can race between a invalidate_start/end and cause the invalidate_end to be skipped. This causes an imbalance in the locking, which lockdep complains about. This is not actually a bug, as we immediately kfree the memory holding the lock, but it simple enough to fix. Mark when the notifier is being destroyed and abort the start callback. This can be done under the lock we already obtained, and can re-purpose the invalidate_range test we already have. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 39 +++++++++++++++++++++++++------------- include/rdma/ib_umem_odp.h | 1 + 2 files changed, 27 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 1c0c4a431218..d7b6422b9611 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -129,15 +129,11 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); - if (!per_mm->context->invalidate_range) - return; - down_read(&per_mm->umem_rwsem); - rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, - ULLONG_MAX, - ib_umem_notifier_release_trampoline, - true, - NULL); + if (per_mm->active) + rbt_ib_umem_for_each_in_range( + &per_mm->umem_tree, 0, ULLONG_MAX, + ib_umem_notifier_release_trampoline, true, NULL); up_read(&per_mm->umem_rwsem); } @@ -166,16 +162,22 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, { struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); - int ret; - - if (!per_mm->context->invalidate_range) - return 0; if (blockable) down_read(&per_mm->umem_rwsem); else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; + if (!per_mm->active) { + up_read(&per_mm->umem_rwsem); + /* + * At this point active is permanently set and visible to this + * CPU without a lock, that fact is relied on to skip the unlock + * in range_end. 
+ */ + return 0; + } + return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_start_trampoline, blockable, NULL); @@ -196,7 +198,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); - if (!per_mm->context->invalidate_range) + if (unlikely(!per_mm->active)) return; rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, @@ -251,6 +253,7 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, per_mm->mm = mm; per_mm->umem_tree = RB_ROOT_CACHED; init_rwsem(&per_mm->umem_rwsem); + per_mm->active = ctx->invalidate_range; rcu_read_lock(); per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); @@ -321,6 +324,16 @@ void put_per_mm(struct ib_umem_odp *umem_odp) if (!need_free) return; + /* + * NOTE! mmu_notifier_unregister() can happen between a start/end + * callback, resulting in an start/end, and thus an unbalanced + * lock. This doesn't really matter to us since we are about to kfree + * the memory that holds the lock, however LOCKDEP doesn't like this. + */ + down_write(&per_mm->umem_rwsem); + per_mm->active = false; + up_write(&per_mm->umem_rwsem); + mmu_notifier_unregister(&per_mm->mn, per_mm->mm); put_pid(per_mm->tgid); kfree(per_mm); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index ce9502545903..ec05c82ead7a 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -89,6 +89,7 @@ struct ib_ucontext_per_mm { struct ib_ucontext *context; struct mm_struct *mm; struct pid *tgid; + bool active; struct rb_root_cached umem_tree; /* Protects umem_tree */ -- cgit v1.2.3 From 56ac9dd9177ce451ac8176311915b29e8b5f0ac2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:11 +0300 Subject: RDMA/umem: Avoid synchronize_srcu in the ODP MR destruction path synchronize_rcu is slow enough that it should be avoided on the syscall path when user space is destroying MRs. After all the rework we can now trivially do this by having call_srcu kfree the per_mm. 
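In outline, the destroy path goes from a blocking unregister to a deferred kfree
through an SRCU callback. A minimal sketch of that pattern (abbreviated; the real
code is in the put_per_mm()/free_per_mm() hunks below):

    static void free_per_mm(struct rcu_head *rcu)
    {
    	kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu));
    }

    	/* in put_per_mm(), on the final put, instead of a synchronous kfree(per_mm): */
    	mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm);
    	mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm);
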
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 10 ++++++++-- include/rdma/ib_umem_odp.h | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index d7b6422b9611..2b4c5e7dd5a1 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -307,6 +307,11 @@ found: return 0; } +static void free_per_mm(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); +} + void put_per_mm(struct ib_umem_odp *umem_odp) { struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; @@ -334,9 +339,10 @@ void put_per_mm(struct ib_umem_odp *umem_odp) per_mm->active = false; up_write(&per_mm->umem_rwsem); - mmu_notifier_unregister(&per_mm->mn, per_mm->mm); + WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); + mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); put_pid(per_mm->tgid); - kfree(per_mm); + mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); } struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index ec05c82ead7a..0b1446fe2fab 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -99,6 +99,7 @@ struct ib_ucontext_per_mm { unsigned int odp_mrs_count; struct list_head ucontext_list; + struct rcu_head rcu; }; int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); -- cgit v1.2.3 From 2a3ccfdbeb6a5f832d7203e230799f1ffa46e0fc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:12 +0300 Subject: RDMA/uverbs: Get rid of ucontext->tgid Nothing uses this now, just delete it. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/rdma_core.c | 1 - drivers/infiniband/core/uverbs_cmd.c | 4 ---- include/rdma/ib_verbs.h | 1 - 3 files changed, 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 6a3acf4bf78a..752a55c6bdce 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -816,7 +816,6 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, ib_dev->disassociate_ucontext(ucontext); } - put_pid(ucontext->tgid); ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index d77b0b9793c7..91d3e4029cd5 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -117,9 +117,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, /* ufile is required when some objects are released */ ucontext->ufile = file; - rcu_read_lock(); - ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); - rcu_read_unlock(); ucontext->closing = false; ucontext->cleanup_retryable = false; @@ -169,7 +166,6 @@ err_fd: put_unused_fd(resp.async_fd); err_free: - put_pid(ucontext->tgid); ib_dev->dealloc_ucontext(ucontext); err_alloc: diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6437e6af758d..0d822a9db300 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1500,7 +1500,6 @@ struct ib_ucontext { bool cleanup_retryable; - struct pid *tgid; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long 
end); -- cgit v1.2.3 From 175edba85634a8be0ddab5ee96d0b23d9f17627e Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 17 Sep 2018 13:30:48 +0300 Subject: RDMA/mlx5: Allow creating RAW ethernet QP with loopback support Expose two new flags: MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC Those flags can be used at creation time in order to allow a QP to be able to receive loopback traffic (unicast and multicast). We store the state in the QP to be used on the destroy path to indicate with which flags the QP was created with. Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/qp.c | 62 ++++++++++++++++++++++++++++-------- include/uapi/rdma/mlx5-abi.h | 2 ++ 3 files changed, 52 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index fde5a867a7d3..ca435654c30b 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -428,7 +428,7 @@ struct mlx5_ib_qp { struct list_head cq_send_list; struct mlx5_rate_limit rl; u32 underlay_qpn; - bool tunnel_offload_en; + u32 flags_en; /* storage for qp sub type when core qp type is IB_QPT_DRIVER */ enum ib_qp_type qp_sub_type; }; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index daf1eb84cd31..f29ae401b232 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1258,8 +1258,9 @@ static bool tunnel_offload_supported(struct mlx5_core_dev *dev) static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, u32 tdn, - bool tunnel_offload_en) + u32 *qp_flags_en) { + u8 lb_flag = 0; u32 *in; void *tirc; int inlen; @@ -1274,12 +1275,21 @@ static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn); MLX5_SET(tirc, tirc, transport_domain, tdn); - if (tunnel_offload_en) + if (*qp_flags_en & MLX5_QP_FLAG_TUNNEL_OFFLOADS) MLX5_SET(tirc, tirc, tunneled_offload_en, 1); - if (dev->rep) - MLX5_SET(tirc, tirc, self_lb_block, - MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST); + if (*qp_flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC) + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + + if (*qp_flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST; + + if (dev->rep) { + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + *qp_flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC; + } + + MLX5_SET(tirc, tirc, self_lb_block, lb_flag); err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn); @@ -1332,8 +1342,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, goto err_destroy_sq; - err = create_raw_packet_qp_tir(dev, rq, tdn, - qp->tunnel_offload_en); + err = create_raw_packet_qp_tir(dev, rq, tdn, &qp->flags_en); if (err) goto err_destroy_rq; } @@ -1410,6 +1419,7 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, u32 tdn = mucontext->tdn; struct mlx5_ib_create_qp_rss ucmd = {}; size_t required_cmd_sz; + u8 lb_flag = 0; if (init_attr->qp_type != IB_QPT_RAW_PACKET) return -EOPNOTSUPP; @@ -1444,7 +1454,9 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, return -EOPNOTSUPP; } - if (ucmd.flags & ~MLX5_QP_FLAG_TUNNEL_OFFLOADS) { + if (ucmd.flags & ~(MLX5_QP_FLAG_TUNNEL_OFFLOADS | 
+ MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)) { mlx5_ib_dbg(dev, "invalid flags\n"); return -EOPNOTSUPP; } @@ -1461,6 +1473,16 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, return -EOPNOTSUPP; } + if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC || dev->rep) { + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC; + } + + if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) { + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST; + qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC; + } + err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); if (err) { mlx5_ib_dbg(dev, "copy failed\n"); @@ -1484,6 +1506,8 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) MLX5_SET(tirc, tirc, tunneled_offload_en, 1); + MLX5_SET(tirc, tirc, self_lb_block, lb_flag); + if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_INNER) hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner); else @@ -1580,10 +1604,6 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields); create_tir: - if (dev->rep) - MLX5_SET(tirc, tirc, self_lb_block, - MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST); - err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn); if (err) @@ -1710,7 +1730,23 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, mlx5_ib_dbg(dev, "Tunnel offload isn't supported\n"); return -EOPNOTSUPP; } - qp->tunnel_offload_en = true; + qp->flags_en |= MLX5_QP_FLAG_TUNNEL_OFFLOADS; + } + + if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC) { + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + mlx5_ib_dbg(dev, "Self-LB UC isn't supported\n"); + return -EOPNOTSUPP; + } + qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC; + } + + if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) { + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + mlx5_ib_dbg(dev, "Self-LB UM isn't supported\n"); + return -EOPNOTSUPP; + } + qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC; } if (init_attr->create_flags & IB_QP_CREATE_SOURCE_QPN) { diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index addbb9c4529e..e584ba40208e 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -45,6 +45,8 @@ enum { MLX5_QP_FLAG_BFREG_INDEX = 1 << 3, MLX5_QP_FLAG_TYPE_DCT = 1 << 4, MLX5_QP_FLAG_TYPE_DCI = 1 << 5, + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC = 1 << 6, + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC = 1 << 7, }; enum { -- cgit v1.2.3 From e349f858d29f300ad9ad327fd57735a1d15e147f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Sep 2018 16:58:09 -0600 Subject: RDMA: Fully setup the device name in ib_register_device The current code has two copies of the device name, ibdev->dev and dev_name(&ibdev->dev), and they are setup at different times, which is very confusing. Set them both up at the same time and make dev_name() the lead name, which is the proper use of the driver core APIs. To make it very clear that the name is not valid until registration pass it in to the ib_register_device() call rather than messing with ibdev->name directly. Also the reorganization now checks that dev_name is unique even if it does not contain a %. 
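For a driver this reduces to a single call. A minimal sketch of the new calling
convention follows; the "foo%d" template is only a placeholder, the real
per-driver conversions are in the diff below:

    /*
     * New signature: the name (optionally containing a "%d" pattern) is
     * passed in and resolved before the device becomes visible.
     */
    int ib_register_device(struct ib_device *device, const char *name,
    		       int (*port_callback)(struct ib_device *, u8,
    					    struct kobject *));

    	/* typical driver usage: */
    	ret = ib_register_device(&dev->ib_dev, "foo%d", NULL);
    	if (ret)
    		return ret;
    	/* dev_name(&dev->ib_dev.dev) is now stable, e.g. "foo0" */
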
Signed-off-by: Jason Gunthorpe Acked-by: Adit Ranadive Reviewed-by: Steve Wise Acked-by: Devesh Sharma Reviewed-by: Shiraz Saleem Reviewed-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Reviewed-by: Michael J. Ruhl --- drivers/infiniband/core/device.c | 35 +++++++++++++++----------- drivers/infiniband/core/sysfs.c | 4 --- drivers/infiniband/hw/bnxt_re/main.c | 3 +-- drivers/infiniband/hw/cxgb3/iwch_provider.c | 3 +-- drivers/infiniband/hw/cxgb4/provider.c | 3 +-- drivers/infiniband/hw/hns/hns_roce_main.c | 3 +-- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 3 +-- drivers/infiniband/hw/mlx4/main.c | 3 +-- drivers/infiniband/hw/mlx5/main.c | 15 ++++++----- drivers/infiniband/hw/mthca/mthca_provider.c | 3 +-- drivers/infiniband/hw/nes/nes_verbs.c | 3 +-- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 3 +-- drivers/infiniband/hw/qedr/main.c | 4 +-- drivers/infiniband/hw/usnic/usnic_ib_main.c | 3 +-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 3 +-- drivers/infiniband/sw/rdmavt/vt.c | 3 ++- drivers/infiniband/sw/rxe/rxe_verbs.c | 3 +-- include/rdma/ib_verbs.h | 6 ++--- include/rdma/rdma_vt.h | 9 ++++++- 19 files changed, 53 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 5a680a88aa87..faacf95699d7 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -170,10 +170,9 @@ static struct ib_device *__ib_device_get_by_name(const char *name) return NULL; } -static int alloc_name(char *name) +static int alloc_name(struct ib_device *ibdev, const char *name) { unsigned long *inuse; - char buf[IB_DEVICE_NAME_MAX]; struct ib_device *device; int i; @@ -182,24 +181,21 @@ static int alloc_name(char *name) return -ENOMEM; list_for_each_entry(device, &device_list, core_list) { - if (!sscanf(device->name, name, &i)) + char buf[IB_DEVICE_NAME_MAX]; + + if (sscanf(device->name, name, &i) != 1) continue; if (i < 0 || i >= PAGE_SIZE * 8) continue; snprintf(buf, sizeof buf, name, i); - if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) + if (!strcmp(buf, dev_name(&device->dev))) set_bit(i, inuse); } i = find_first_zero_bit(inuse, PAGE_SIZE * 8); free_page((unsigned long) inuse); - snprintf(buf, sizeof buf, name, i); - - if (__ib_device_get_by_name(buf)) - return -ENFILE; - strlcpy(name, buf, IB_DEVICE_NAME_MAX); - return 0; + return dev_set_name(&ibdev->dev, name, i); } static void ib_device_release(struct device *device) @@ -454,9 +450,9 @@ static u32 __dev_new_index(void) * callback for each device that is added. @device must be allocated * with ib_alloc_device(). 
*/ -int ib_register_device(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +int ib_register_device(struct ib_device *device, const char *name, + int (*port_callback)(struct ib_device *, u8, + struct kobject *)) { int ret; struct ib_client *client; @@ -495,11 +491,20 @@ int ib_register_device(struct ib_device *device, mutex_lock(&device_mutex); - if (strchr(device->name, '%')) { - ret = alloc_name(device->name); + if (strchr(name, '%')) { + ret = alloc_name(device, name); + if (ret) + goto out; + } else { + ret = dev_set_name(&device->dev, name); if (ret) goto out; } + if (__ib_device_get_by_name(dev_name(&device->dev))) { + ret = -ENFILE; + goto out; + } + strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); if (ib_device_check_mandatory(device)) { ret = -EINVAL; diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 0b04dbff884f..bc947a863b34 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1311,10 +1311,6 @@ int ib_device_register_sysfs(struct ib_device *device, int ret; int i; - ret = dev_set_name(class_dev, "%s", device->name); - if (ret) - return ret; - device->groups[0] = &dev_attr_group; class_dev->groups = device->groups; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 20b9f31052bf..73632e5b819f 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -579,7 +579,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) /* ib device init */ ibdev->owner = THIS_MODULE; ibdev->node_type = RDMA_NODE_IB_CA; - strlcpy(ibdev->name, "bnxt_re%d", IB_DEVICE_NAME_MAX); strlcpy(ibdev->node_desc, BNXT_RE_DESC " HCA", strlen(BNXT_RE_DESC) + 5); ibdev->phys_port_cnt = 1; @@ -672,7 +671,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->alloc_hw_stats = bnxt_re_ib_alloc_hw_stats; ibdev->driver_id = RDMA_DRIVER_BNXT_RE; - return ib_register_device(ibdev, NULL); + return ib_register_device(ibdev, "bnxt_re%d", NULL); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 1b9ff21aa1d5..39530cc15f95 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1319,7 +1319,6 @@ int iwch_register_device(struct iwch_dev *dev) int i; pr_debug("%s iwch_dev %p\n", __func__, dev); - strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); dev->ibdev.owner = THIS_MODULE; @@ -1402,7 +1401,7 @@ int iwch_register_device(struct iwch_dev *dev) sizeof(dev->ibdev.iwcm->ifname)); dev->ibdev.driver_id = RDMA_DRIVER_CXGB3; - ret = ib_register_device(&dev->ibdev, NULL); + ret = ib_register_device(&dev->ibdev, "cxgb3_%d", NULL); if (ret) goto bail1; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 4eda6872e617..416f8d1af610 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -535,7 +535,6 @@ void c4iw_register_device(struct work_struct *work) struct c4iw_dev *dev = ctx->dev; pr_debug("c4iw_dev %p\n", dev); - strlcpy(dev->ibdev.name, "cxgb4_%d", IB_DEVICE_NAME_MAX); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); memcpy(&dev->ibdev.node_guid, 
dev->rdev.lldi.ports[0]->dev_addr, 6); dev->ibdev.owner = THIS_MODULE; @@ -627,7 +626,7 @@ void c4iw_register_device(struct work_struct *work) sizeof(dev->ibdev.iwcm->ifname)); dev->ibdev.driver_id = RDMA_DRIVER_CXGB4; - ret = ib_register_device(&dev->ibdev, NULL); + ret = ib_register_device(&dev->ibdev, "cxgb4_%d", NULL); if (ret) goto err_kfree_iwcm; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 6edb547baee8..5a86a48cba13 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -449,7 +449,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) spin_lock_init(&iboe->lock); ib_dev = &hr_dev->ib_dev; - strlcpy(ib_dev->name, "hns_%d", IB_DEVICE_NAME_MAX); ib_dev->owner = THIS_MODULE; ib_dev->node_type = RDMA_NODE_IB_CA; @@ -530,7 +529,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_dev->disassociate_ucontext = hns_roce_disassociate_ucontext; ib_dev->driver_id = RDMA_DRIVER_HNS; - ret = ib_register_device(ib_dev, NULL); + ret = ib_register_device(ib_dev, "hns_%d", NULL); if (ret) { dev_err(dev, "ib_register_device failed!\n"); return ret; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index e2e6c74a7452..cb2aef874ca8 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2752,7 +2752,6 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev i40iw_pr_err("iwdev == NULL\n"); return NULL; } - strlcpy(iwibdev->ibdev.name, "i40iw%d", IB_DEVICE_NAME_MAX); iwibdev->ibdev.owner = THIS_MODULE; iwdev->iwibdev = iwibdev; iwibdev->iwdev = iwdev; @@ -2897,7 +2896,7 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev) iwibdev = iwdev->iwibdev; iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW; - ret = ib_register_device(&iwibdev->ibdev, NULL); + ret = ib_register_device(&iwibdev->ibdev, "i40iw%d", NULL); if (ret) goto error; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index bf3cdb88aaf5..fa5d20eccc21 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2540,7 +2540,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->dev = dev; ibdev->bond_next_port = 0; - strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; @@ -2803,7 +2802,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_steer_free_bitmap; ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4; - if (ib_register_device(&ibdev->ib_dev, NULL)) + if (ib_register_device(&ibdev->ib_dev, "mlx4_%d", NULL)) goto err_diag_counters; if (mlx4_ib_mad_init(ibdev)) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index fb1e3c546826..597cd3c171c9 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5671,7 +5671,6 @@ void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; - const char *name; int err; int i; @@ -5704,12 +5703,6 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); - if (!mlx5_lag_is_active(mdev)) - name = "mlx5_%d"; - else - name = "mlx5_bond_%d"; - - strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; 
dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; @@ -6122,7 +6115,13 @@ static int mlx5_ib_stage_populate_specs(struct mlx5_ib_dev *dev) int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) { - return ib_register_device(&dev->ib_dev, NULL); + const char *name; + + if (!mlx5_lag_is_active(dev->mdev)) + name = "mlx5_%d"; + else + name = "mlx5_bond_%d"; + return ib_register_device(&dev->ib_dev, name, NULL); } void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 0d3473b4596e..7bd7e2ad17e4 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1198,7 +1198,6 @@ int mthca_register_device(struct mthca_dev *dev) if (ret) return ret; - strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION; @@ -1297,7 +1296,7 @@ int mthca_register_device(struct mthca_dev *dev) mutex_init(&dev->cap_mask_mutex); dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA; - ret = ib_register_device(&dev->ib_dev, NULL); + ret = ib_register_device(&dev->ib_dev, "mthca%d", NULL); if (ret) return ret; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 6940c7215961..2127cd2f4bec 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3640,7 +3640,6 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) if (nesibdev == NULL) { return NULL; } - strlcpy(nesibdev->ibdev.name, "nes%d", IB_DEVICE_NAME_MAX); nesibdev->ibdev.owner = THIS_MODULE; nesibdev->ibdev.node_type = RDMA_NODE_RNIC; @@ -3798,7 +3797,7 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) int i, ret; nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES; - ret = ib_register_device(&nesvnic->nesibdev->ibdev, NULL); + ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d", NULL); if (ret) { return ret; } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 7832ee3e0c84..4d3c27613351 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -116,7 +116,6 @@ static void get_dev_fw_str(struct ib_device *device, char *str) static int ocrdma_register_device(struct ocrdma_dev *dev) { - strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX); ocrdma_get_guid(dev, (u8 *)&dev->ibdev.node_guid); BUILD_BUG_ON(sizeof(OCRDMA_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX); memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC, @@ -214,7 +213,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.post_srq_recv = ocrdma_post_srq_recv; } dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA; - return ib_register_device(&dev->ibdev, NULL); + return ib_register_device(&dev->ibdev, "ocrdma%d", NULL); } static int ocrdma_alloc_resources(struct ocrdma_dev *dev) diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index a0af6d424aed..cd7b8b39a129 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -170,8 +170,6 @@ static int qedr_register_device(struct qedr_dev *dev) { int rc; - strlcpy(dev->ibdev.name, "qedr%d", IB_DEVICE_NAME_MAX); - dev->ibdev.node_guid = dev->attr.node_guid; memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC)); dev->ibdev.owner = THIS_MODULE; @@ -264,7 +262,7 @@ 
static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.get_dev_fw_str = qedr_get_dev_fw_str; dev->ibdev.driver_id = RDMA_DRIVER_QEDR; - return ib_register_device(&dev->ibdev, NULL); + return ib_register_device(&dev->ibdev, "qedr%d", NULL); } /* This function allocates fast-path status block memory */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index f0538a460328..3b9f12928314 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -364,7 +364,6 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; us_ibdev->ib_dev.dev.parent = &dev->dev; us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION; - strlcpy(us_ibdev->ib_dev.name, "usnic_%d", IB_DEVICE_NAME_MAX); us_ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | @@ -416,7 +415,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC; - if (ib_register_device(&us_ibdev->ib_dev, NULL)) + if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d", NULL)) goto err_fwd_dealloc; usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index a5719899f49a..6878107fc637 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -162,7 +162,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) int ret = -1; int i = 0; - strlcpy(dev->ib_dev.name, "vmw_pvrdma%d", IB_DEVICE_NAME_MAX); dev->ib_dev.node_guid = dev->dsr->caps.node_guid; dev->sys_image_guid = dev->dsr->caps.sys_image_guid; dev->flags = 0; @@ -267,7 +266,7 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) dev->ib_dev.driver_id = RDMA_DRIVER_VMW_PVRDMA; spin_lock_init(&dev->srq_tbl_lock); - ret = ib_register_device(&dev->ib_dev, NULL); + ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d", NULL); if (ret) goto err_srq_free; diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 17e4abc067af..e3249d46bcef 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -828,7 +828,8 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rdi->ibdev.driver_id = driver_id; /* We are now good to announce we exist */ - ret = ib_register_device(&rdi->ibdev, rdi->driver_f.port_callback); + ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev), + rdi->driver_f.port_callback); if (ret) { rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); goto bail_mr; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f5b1e0ad6142..e4da5b671e4a 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1159,7 +1159,6 @@ int rxe_register_device(struct rxe_dev *rxe) struct ib_device *dev = &rxe->ib_dev; struct crypto_shash *tfm; - strlcpy(dev->name, "rxe%d", IB_DEVICE_NAME_MAX); strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); dev->owner = THIS_MODULE; @@ -1261,7 +1260,7 @@ int rxe_register_device(struct rxe_dev *rxe) rxe->tfm = tfm; dev->driver_id = RDMA_DRIVER_RXE; - err = ib_register_device(dev, NULL); + err = ib_register_device(dev, "rxe%d", NULL); if (err) { pr_warn("%s failed with error %d\n", __func__, err); goto err1; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 
0d822a9db300..9897d2329f2c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2625,9 +2625,9 @@ void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str); -int ib_register_device(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)); +int ib_register_device(struct ib_device *device, const char *name, + int (*port_callback)(struct ib_device *, u8, + struct kobject *)); void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index e32facdd9fd3..065c9fbe6589 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -429,7 +429,14 @@ static inline void rvt_set_ibdev_name(struct rvt_dev_info *rdi, const char *fmt, const char *name, const int unit) { - snprintf(rdi->ibdev.name, sizeof(rdi->ibdev.name), fmt, name, unit); + /* + * FIXME: rvt and its users want to touch the ibdev before + * registration and have things like the name work. We don't have the + * infrastructure in the core to support this directly today, hack it + * to work by setting the name manually here. + */ + dev_set_name(&rdi->ibdev.dev, fmt, name, unit); + strlcpy(rdi->ibdev.name, dev_name(&rdi->ibdev.dev), IB_DEVICE_NAME_MAX); } /** -- cgit v1.2.3 From 6c8541118bd53bc90b6c2473e289e5541de80376 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 20 Sep 2018 16:42:27 -0600 Subject: RDMA/ulp: Use dev_name instead of ibdev->name These return the same thing but dev_name is a more conventional use of the kernel API. Signed-off-by: Jason Gunthorpe Reviewed-by: Steve Wise Reviewed-by: Sagi Grimberg Reviewed-by: Dennis Dalessandro --- drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 2 +- drivers/infiniband/ulp/iser/iser_verbs.c | 9 +++++---- drivers/infiniband/ulp/isert/ib_isert.c | 2 +- drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c | 3 ++- drivers/infiniband/ulp/srp/ib_srp.c | 10 ++++++---- drivers/infiniband/ulp/srpt/ib_srpt.c | 26 +++++++++++++------------ include/rdma/rdma_vt.h | 2 +- 7 files changed, 30 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 9f36ca786df8..1e88213459f2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -277,7 +277,7 @@ void ipoib_event(struct ib_event_handler *handler, return; ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event, - record->device->name, record->element.port_num); + dev_name(&record->device->dev), record->element.port_num); if (record->event == IB_EVENT_SM_CHANGE || record->event == IB_EVENT_CLIENT_REREGISTER) { diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index b686a4aaffe8..946b623ba5eb 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -55,7 +55,7 @@ static void iser_event_handler(struct ib_event_handler *handler, { iser_err("async event %s (%d) on device %s port %d\n", ib_event_msg(event->event), event->event, - event->device->name, event->element.port_num); + dev_name(&event->device->dev), event->element.port_num); } /** @@ -85,7 +85,7 @@ static int iser_create_device_ib_res(struct iser_device *device) max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe); iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n", - device->comps_used, ib_dev->name, + 
device->comps_used, dev_name(&ib_dev->dev), ib_dev->num_comp_vectors, max_cqe); device->pd = ib_alloc_pd(ib_dev, @@ -468,7 +468,8 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) iser_conn->max_cmds = ISER_GET_MAX_XMIT_CMDS(ib_dev->attrs.max_qp_wr); iser_dbg("device %s supports max_send_wr %d\n", - device->ib_device->name, ib_dev->attrs.max_qp_wr); + dev_name(&device->ib_device->dev), + ib_dev->attrs.max_qp_wr); } } @@ -764,7 +765,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) IB_DEVICE_SIGNATURE_HANDOVER)) { iser_warn("T10-PI requested but not supported on %s, " "continue without T10-PI\n", - ib_conn->device->ib_device->name); + dev_name(&ib_conn->device->ib_device->dev)); ib_conn->pi_support = false; } else { ib_conn->pi_support = true; diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index f39670c5c25c..e3dd13798d79 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -262,7 +262,7 @@ isert_alloc_comps(struct isert_device *device) isert_info("Using %d CQs, %s supports %d vectors support " "pi_capable %d\n", - device->comps_used, device->ib_device->name, + device->comps_used, dev_name(&device->ib_device->dev), device->ib_device->num_comp_vectors, device->pi_capable); diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c index 15711dcc6f58..d119d9afa845 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c @@ -888,7 +888,8 @@ static void opa_vnic_event(struct ib_event_handler *handler, return; c_dbg("OPA_VNIC received event %d on device %s port %d\n", - record->event, record->device->name, record->element.port_num); + record->event, dev_name(&record->device->dev), + record->element.port_num); if (record->event == IB_EVENT_PORT_ERR) idr_for_each(&port->vport_idr, vema_disable_vport, NULL); diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 444d16520506..e2ad7c5ea296 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3124,7 +3124,8 @@ static ssize_t show_local_ib_device(struct device *dev, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name); + return sprintf(buf, "%s\n", + dev_name(&target->srp_host->srp_dev->dev->dev)); } static ssize_t show_ch_count(struct device *dev, struct device_attribute *attr, @@ -3987,7 +3988,7 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, { struct srp_host *host = container_of(dev, struct srp_host, dev); - return sprintf(buf, "%s\n", host->srp_dev->dev->name); + return sprintf(buf, "%s\n", dev_name(&host->srp_dev->dev->dev)); } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); @@ -4019,7 +4020,8 @@ static struct srp_host *srp_add_port(struct srp_device *device, u8 port) host->dev.class = &srp_class; host->dev.parent = device->dev->dev.parent; - dev_set_name(&host->dev, "srp-%s-%d", device->dev->name, port); + dev_set_name(&host->dev, "srp-%s-%d", dev_name(&device->dev->dev), + port); if (device_register(&host->dev)) goto free_host; @@ -4095,7 +4097,7 @@ static void srp_add_one(struct ib_device *device) srp_dev->mr_max_size = srp_dev->mr_page_size * srp_dev->max_pages_per_mr; pr_debug("%s: mr_page_shift = %d, device->max_mr_size = %#llx, device->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = 
%#x\n", - device->name, mr_page_shift, attr->max_mr_size, + dev_name(&device->dev), mr_page_shift, attr->max_mr_size, attr->max_fast_reg_page_list_len, srp_dev->max_pages_per_mr, srp_dev->mr_max_size); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 447d21ea479a..2357aa727dcf 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -148,7 +148,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, return; pr_debug("ASYNC event= %d on device= %s\n", event->event, - sdev->device->name); + dev_name(&sdev->device->dev)); switch (event->event) { case IB_EVENT_PORT_ERR: @@ -1941,7 +1941,8 @@ static void __srpt_close_all_ch(struct srpt_port *sport) if (srpt_disconnect_ch(ch) >= 0) pr_info("Closing channel %s because target %s_%d has been disabled\n", ch->sess_name, - sport->sdev->device->name, sport->port); + dev_name(&sport->sdev->device->dev), + sport->port); srpt_close_ch(ch); } } @@ -2127,7 +2128,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev, if (!sport->enabled) { rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); pr_info("rejected SRP_LOGIN_REQ because target port %s_%d has not yet been enabled\n", - sport->sdev->device->name, port_num); + dev_name(&sport->sdev->device->dev), port_num); goto reject; } @@ -2267,7 +2268,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev, rej->reason = cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); pr_info("rejected SRP_LOGIN_REQ because target %s_%d is not enabled\n", - sdev->device->name, port_num); + dev_name(&sdev->device->dev), port_num); mutex_unlock(&sport->mutex); goto reject; } @@ -2842,7 +2843,7 @@ static int srpt_release_sport(struct srpt_port *sport) while (wait_event_timeout(sport->ch_releaseQ, srpt_ch_list_empty(sport), 5 * HZ) <= 0) { pr_info("%s_%d: waiting for session unregistration ...\n", - sport->sdev->device->name, sport->port); + dev_name(&sport->sdev->device->dev), sport->port); rcu_read_lock(); list_for_each_entry(nexus, &sport->nexus_list, entry) { list_for_each_entry(ch, &nexus->ch_list, list) { @@ -2932,7 +2933,7 @@ static int srpt_alloc_srq(struct srpt_device *sdev) } pr_debug("create SRQ #wr= %d max_allow=%d dev= %s\n", sdev->srq_size, - sdev->device->attrs.max_srq_wr, device->name); + sdev->device->attrs.max_srq_wr, dev_name(&device->dev)); sdev->ioctx_ring = (struct srpt_recv_ioctx **) srpt_alloc_ioctx_ring(sdev, sdev->srq_size, @@ -2965,8 +2966,8 @@ static int srpt_use_srq(struct srpt_device *sdev, bool use_srq) } else if (use_srq && !sdev->srq) { ret = srpt_alloc_srq(sdev); } - pr_debug("%s(%s): use_srq = %d; ret = %d\n", __func__, device->name, - sdev->use_srq, ret); + pr_debug("%s(%s): use_srq = %d; ret = %d\n", __func__, + dev_name(&device->dev), sdev->use_srq, ret); return ret; } @@ -3052,7 +3053,7 @@ static void srpt_add_one(struct ib_device *device) if (srpt_refresh_port(sport)) { pr_err("MAD registration failed for %s-%d.\n", - sdev->device->name, i); + dev_name(&sdev->device->dev), i); goto err_event; } } @@ -3063,7 +3064,7 @@ static void srpt_add_one(struct ib_device *device) out: ib_set_client_data(device, &srpt_client, sdev); - pr_debug("added %s.\n", device->name); + pr_debug("added %s.\n", dev_name(&device->dev)); return; err_event: @@ -3078,7 +3079,7 @@ free_dev: kfree(sdev); err: sdev = NULL; - pr_info("%s(%s) failed.\n", __func__, device->name); + pr_info("%s(%s) failed.\n", __func__, dev_name(&device->dev)); goto out; } @@ -3093,7 +3094,8 @@ static void 
srpt_remove_one(struct ib_device *device, void *client_data) int i; if (!sdev) { - pr_info("%s(%s): nothing to do.\n", __func__, device->name); + pr_info("%s(%s): nothing to do.\n", __func__, + dev_name(&device->dev)); return; } diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 065c9fbe6589..cc08de3570b4 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -447,7 +447,7 @@ static inline void rvt_set_ibdev_name(struct rvt_dev_info *rdi, */ static inline const char *rvt_get_ibdev_name(const struct rvt_dev_info *rdi) { - return rdi->ibdev.name; + return dev_name(&rdi->ibdev.dev); } static inline struct rvt_pd *ibpd_to_rvtpd(struct ib_pd *ibpd) -- cgit v1.2.3 From 7f72052cb48efb5637ed99d2f45cb33a0bf60719 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:45:18 +0300 Subject: IB/mlx5: Expose RAW QP device handles to user space Expose RAW QP device handles to user space by extending the UHW part of mlx5_ib_create_qp_resp. This data is returned only when DEVX context is used where it may be applicable. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 38 ++++++++++++++++++++++++++++++++++++-- include/uapi/rdma/mlx5-abi.h | 13 +++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 3455b50705cd..c49a0815a12b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1325,7 +1325,9 @@ static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, u32 *in, size_t inlen, - struct ib_pd *pd) + struct ib_pd *pd, + struct ib_udata *udata, + struct mlx5_ib_create_qp_resp *resp) { struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; struct mlx5_ib_sq *sq = &raw_packet_qp->sq; @@ -1335,6 +1337,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); int err; u32 tdn = mucontext->tdn; + u16 uid = to_mpd(pd)->uid; if (qp->sq.wqe_cnt) { err = create_raw_packet_qp_tis(dev, qp, sq, tdn, pd); @@ -1345,6 +1348,13 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (err) goto err_destroy_tis; + if (uid) { + resp->tisn = sq->tisn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TISN; + resp->sqn = sq->base.mqp.qpn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_SQN; + } + sq->base.container_mibqp = qp; sq->base.mqp.event = mlx5_ib_qp_event; } @@ -1363,13 +1373,25 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, err = create_raw_packet_qp_tir(dev, rq, tdn, &qp->flags_en, pd); if (err) goto err_destroy_rq; + + if (uid) { + resp->rqn = rq->base.mqp.qpn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_RQN; + resp->tirn = rq->tirn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN; + } } qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? 
sq->base.mqp.qpn : rq->base.mqp.qpn; + err = ib_copy_to_udata(udata, resp, min(udata->outlen, sizeof(*resp))); + if (err) + goto err_destroy_tir; return 0; +err_destroy_tir: + destroy_raw_packet_qp_tir(dev, rq, qp->flags_en, pd); err_destroy_rq: destroy_raw_packet_qp_rq(dev, rq); err_destroy_sq: @@ -1640,12 +1662,23 @@ create_tir: if (err) goto err; + if (mucontext->devx_uid) { + resp.comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN; + resp.tirn = qp->rss_qp.tirn; + } + + err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); + if (err) + goto err_copy; + kvfree(in); /* qpn is reserved for that QP */ qp->trans_qp.base.mqp.qpn = 0; qp->flags |= MLX5_IB_QP_RSS; return 0; +err_copy: + mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn, mucontext->devx_uid); err: kvfree(in); return err; @@ -1978,7 +2011,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->flags & MLX5_IB_QP_UNDERLAY) { qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); - err = create_raw_packet_qp(dev, qp, in, inlen, pd); + err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata, + &resp); } else { err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen); } diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index e584ba40208e..6056625237cf 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -351,9 +351,22 @@ struct mlx5_ib_create_qp_rss { __u32 flags; }; +enum mlx5_ib_create_qp_resp_mask { + MLX5_IB_CREATE_QP_RESP_MASK_TIRN = 1UL << 0, + MLX5_IB_CREATE_QP_RESP_MASK_TISN = 1UL << 1, + MLX5_IB_CREATE_QP_RESP_MASK_RQN = 1UL << 2, + MLX5_IB_CREATE_QP_RESP_MASK_SQN = 1UL << 3, +}; + struct mlx5_ib_create_qp_resp { __u32 bfreg_index; __u32 reserved; + __u32 comp_mask; + __u32 tirn; + __u32 tisn; + __u32 rqn; + __u32 sqn; + __u32 reserved1; }; struct mlx5_ib_alloc_mw { -- cgit v1.2.3 From aef716fa5e6da3919cca22ac2097a90d73d8177f Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 27 Sep 2018 13:55:58 -0700 Subject: RDMA/qedr: Remove enumerated type qed_roce_ll2_tx_dest Clang warns when one enumerated type is explicitly converted to another. drivers/infiniband/hw/qedr/qedr_roce_cm.c:198:28: warning: implicit conversion from enumeration type 'enum qed_roce_ll2_tx_dest' to different enumeration type 'enum qed_ll2_tx_dest' [-Wenum-conversion] ll2_tx_pkt.tx_dest = pkt->tx_dest; ~ ~~~~~^~~~~~~ 1 warning generated. Turns out that QED_ROCE_LL2_TX_DEST_NW and QED_ROCE_LL2_TX_DEST_LB are only used once in the whole tree and QED_ROCE_LL2_TX_DEST_MAX is used nowhere. Remove them and use the equivalent values from qed_ll2_tx_dest in their place. 
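In rough terms (with the qed structures trimmed down to the two tx_dest members involved, so the LL2-side struct below is only a stand-in and not the real qed definition), the warning and its fix look like this:

enum qed_ll2_tx_dest { QED_LL2_TX_DEST_NW, QED_LL2_TX_DEST_LB };	/* trimmed */
enum qed_roce_ll2_tx_dest { QED_ROCE_LL2_TX_DEST_NW, QED_ROCE_LL2_TX_DEST_LB };

struct qed_roce_ll2_packet { enum qed_roce_ll2_tx_dest tx_dest; };	/* before this patch */
struct ll2_tx_info { enum qed_ll2_tx_dest tx_dest; };			/* stand-in for the LL2 tx descriptor */

static void build_ll2_tx(struct ll2_tx_info *ll2_tx_pkt,
			 const struct qed_roce_ll2_packet *pkt)
{
	/* clang -Wenum-conversion: a value of one enum type assigned to a field of another */
	ll2_tx_pkt->tx_dest = pkt->tx_dest;
}

Once qed_roce_ll2_packet::tx_dest is declared as enum qed_ll2_tx_dest and the RoCE-only enum is dropped, the assignment stays within a single enumerated type and the warning disappears.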
Reported-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Acked-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/qedr_roce_cm.c | 4 ++-- include/linux/qed/qed_rdma_if.h | 11 +---------- 2 files changed, 3 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/qedr/qedr_roce_cm.c b/drivers/infiniband/hw/qedr/qedr_roce_cm.c index 85578887421b..e1ac2fd60bb1 100644 --- a/drivers/infiniband/hw/qedr/qedr_roce_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_roce_cm.c @@ -519,9 +519,9 @@ static inline int qedr_gsi_build_packet(struct qedr_dev *dev, } if (ether_addr_equal(udh.eth.smac_h, udh.eth.dmac_h)) - packet->tx_dest = QED_ROCE_LL2_TX_DEST_LB; + packet->tx_dest = QED_LL2_TX_DEST_LB; else - packet->tx_dest = QED_ROCE_LL2_TX_DEST_NW; + packet->tx_dest = QED_LL2_TX_DEST_NW; packet->roce_mode = roce_mode; memcpy(packet->header.vaddr, ud_header_buffer, header_size); diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h index df4d13f7e191..d15f8e4815e3 100644 --- a/include/linux/qed/qed_rdma_if.h +++ b/include/linux/qed/qed_rdma_if.h @@ -39,15 +39,6 @@ #include #include -enum qed_roce_ll2_tx_dest { - /* Light L2 TX Destination to the Network */ - QED_ROCE_LL2_TX_DEST_NW, - - /* Light L2 TX Destination to the Loopback */ - QED_ROCE_LL2_TX_DEST_LB, - QED_ROCE_LL2_TX_DEST_MAX -}; - #define QED_RDMA_MAX_CNQ_SIZE (0xFFFF) /* rdma interface */ @@ -581,7 +572,7 @@ struct qed_roce_ll2_packet { int n_seg; struct qed_roce_ll2_buffer payload[RDMA_MAX_SGE_PER_SQ_WQE]; int roce_mode; - enum qed_roce_ll2_tx_dest tx_dest; + enum qed_ll2_tx_dest tx_dest; }; enum qed_rdma_type { -- cgit v1.2.3 From d205a06a14796a24b3447bc5d27b7dedff4479d5 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 26 Sep 2018 10:26:44 -0700 Subject: IB/rdmavt: Rename check_send_wqe as setup_wqe The driver-provided function check_send_wqe allows the hardware driver to check and set up the incoming send wqe before it is inserted into the swqe ring. This patch will rename it as setup_wqe to better reflect its usage. In addition, this function is only called when all setup is complete in rdmavt. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 11 ++++++++--- drivers/infiniband/hw/hfi1/verbs.c | 2 +- drivers/infiniband/hw/hfi1/verbs.h | 4 ++-- drivers/infiniband/hw/qib/qib_verbs.c | 2 +- drivers/infiniband/sw/rdmavt/qp.c | 28 ++++++++++++++++++++-------- include/rdma/rdma_vt.h | 13 +++++++------ 6 files changed, 39 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 54d9ff171059..b1044a205ab6 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -282,16 +282,21 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, } /** - * hfi1_check_send_wqe - validate wqe + * hfi1_setup_wqe - set up the wqe * @qp - The qp * @wqe - The built wqe * @call_send - Determine if the send should be posted or scheduled. * + * Perform setup of the wqe. This is called + * prior to inserting the wqe into the ring but after + * the wqe has been setup by RDMAVT. This function + * allows the driver the opportunity to perform + * validation and additional setup of the wqe. 
+ * * Returns 0 on success, -EINVAL on failure * */ -int hfi1_check_send_wqe(struct rvt_qp *qp, - struct rvt_swqe *wqe, bool *call_send) +int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct rvt_ah *ah; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 13374c727b14..bbee0cb77ff8 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1937,7 +1937,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp; dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc; - dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; + dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe; dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup = hfi1_comp_vect_mappings_lookup; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 269ec338581b..bc77ffec51ce 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -343,8 +343,8 @@ int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait); -int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, - bool *call_send); +int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); extern const u32 rc_only_opcode; extern const u32 uc_only_opcode; diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 41babbc0db58..ad9093d33cb2 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1588,7 +1588,7 @@ int qib_register_ib_device(struct qib_devdata *dd) dd->verbs_dev.rdi.driver_f.port_callback = qib_create_port_files; dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev; dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah; - dd->verbs_dev.rdi.driver_f.check_send_wqe = qib_check_send_wqe; + dd->verbs_dev.rdi.driver_f.setup_wqe = qib_check_send_wqe; dd->verbs_dev.rdi.driver_f.notify_new_ah = qib_notify_new_ah; dd->verbs_dev.rdi.driver_f.alloc_qpn = qib_alloc_qpn; dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qib_qp_priv_alloc; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index a9b7d7ff32ee..2db71e956d02 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1823,13 +1823,11 @@ static int rvt_post_one_wr(struct rvt_qp *qp, wqe->wr.num_sge = j; } - /* general part of wqe valid - allow for driver checks */ - if (rdi->driver_f.check_send_wqe) { - ret = rdi->driver_f.check_send_wqe(qp, wqe, call_send); - if (ret < 0) - goto bail_inval_free; - } - + /* + * Calculate and set SWQE PSN values prior to handing it off + * to the driver's check routine. This give the driver the + * opportunity to adjust PSN values based on internal checks. + */ log_pmtu = qp->log_pmtu; if (qp->ibqp.qp_type != IB_QPT_UC && qp->ibqp.qp_type != IB_QPT_RC) { @@ -1854,8 +1852,18 @@ static int rvt_post_one_wr(struct rvt_qp *qp, (wqe->length ? 
((wqe->length - 1) >> log_pmtu) : 0); - qp->s_next_psn = wqe->lpsn + 1; } + + /* general part of wqe valid - allow for driver checks */ + if (rdi->driver_f.setup_wqe) { + ret = rdi->driver_f.setup_wqe(qp, wqe, call_send); + if (ret < 0) + goto bail_inval_free_ref; + } + + if (!(rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) + qp->s_next_psn = wqe->lpsn + 1; + if (unlikely(reserved_op)) { wqe->wr.send_flags |= RVT_SEND_RESERVE_USED; rvt_qp_wqe_reserve(qp, wqe); @@ -1869,6 +1877,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp, return 0; +bail_inval_free_ref: + if (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) + atomic_dec(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); bail_inval_free: /* release mr holds */ while (j) { diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index cc08de3570b4..52907204afcd 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -215,13 +215,14 @@ struct rvt_driver_provided { void (*schedule_send_no_lock)(struct rvt_qp *qp); /* - * Validate the wqe. This needs to be done prior to inserting the - * wqe into the ring, but after the wqe has been set up. Allow for - * driver specific work request checking by providing a callback. - * call_send indicates if the wqe should be posted or scheduled. + * Driver specific work request setup and checking. + * This function is allowed to perform any setup, checks, or + * adjustments required to the SWQE in order to be usable by + * underlying protocols. This includes private data structure + * allocations. */ - int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe, - bool *call_send); + int (*setup_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); /* * Sometimes rdmavt needs to kick the driver's send progress. That is -- cgit v1.2.3 From 5da0fc9dbf891a9c9e01a634f2126b5952afb3a6 Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Fri, 28 Sep 2018 07:17:09 -0700 Subject: IB/hfi1: Prepare resource waits for dual leg Current implementation allows each qp to have only one send engine. As such, each qp has only one list to queue prebuilt packets when send engine resources are not available. To improve performance, it is desired to support multiple send engines for each qp. This patch creates the framework to support two send engines (two legs) for each qp for the TID RDMA protocol, which can be easily extended to support more send engines. It achieves the goal by creating a leg specific struct, iowait_work in the iowait struct, to hold the work_struct and the tx_list as well as a pointer to the parent iowait struct. The hfi1_pkt_state now has an additional field to record the current legs work structure and that is now passed to all egress waiters to determine the leg that needs to wait via a new iowait helper. The APIs are adjusted to use the new leg specific struct as required. Many new and modified helpers are added to support this change. 
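In rough outline (driver-specific fields omitted, so this is a simplified sketch rather than the full definitions), the relationship between the shared iowait and its per-leg iowait_work entries is:

#define IOWAIT_SES	2
#define IOWAIT_IB_SE	0	/* default verbs send engine, the first leg */
#define IOWAIT_TID_SE	1	/* TID RDMA send engine, the second leg */

struct iowait;

struct iowait_work {
	struct work_struct iowork;	/* per-leg work item */
	struct list_head tx_head;	/* per-leg queue of prebuilt packets */
	struct iowait *iow;		/* back-pointer to the shared parent */
};

struct iowait {
	/* ... shared wait/wakeup and throttling state, one instance per QP or PQ ... */
	unsigned long flags;			/* IOWAIT_PENDING_IB / IOWAIT_PENDING_TID */
	struct iowait_work wait[IOWAIT_SES];	/* one entry per send engine */
};

Waiters now hand a specific iowait_work (a single leg) to the sdma layer, and iowait_ioww_to_iow() recovers the shared iowait whenever the common busy counters and wakeup state are needed.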
Reviewed-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/Makefile | 1 + drivers/infiniband/hw/hfi1/iowait.c | 91 +++++++++++++++ drivers/infiniband/hw/hfi1/iowait.h | 192 +++++++++++++++++++++---------- drivers/infiniband/hw/hfi1/qp.c | 67 ++++++++--- drivers/infiniband/hw/hfi1/qp.h | 31 ++--- drivers/infiniband/hw/hfi1/ruc.c | 10 +- drivers/infiniband/hw/hfi1/sdma.c | 52 ++++----- drivers/infiniband/hw/hfi1/sdma.h | 8 +- drivers/infiniband/hw/hfi1/user_sdma.c | 14 ++- drivers/infiniband/hw/hfi1/verbs.c | 11 +- drivers/infiniband/hw/hfi1/verbs.h | 4 +- drivers/infiniband/hw/hfi1/verbs_txreq.h | 11 +- drivers/infiniband/hw/hfi1/vnic_sdma.c | 21 ++-- drivers/infiniband/hw/qib/qib_verbs.c | 9 +- drivers/infiniband/hw/qib/qib_verbs.h | 6 +- include/rdma/rdma_vt.h | 4 +- 16 files changed, 366 insertions(+), 166 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/iowait.c (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index a8dcf82ab7cb..ff790390c91a 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -20,6 +20,7 @@ hfi1-y := \ firmware.o \ init.o \ intr.o \ + iowait.o \ mad.o \ mmu_rb.o \ msix.o \ diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c new file mode 100644 index 000000000000..59dc955f1880 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/iowait.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#include "iowait.h" + +void iowait_set_flag(struct iowait *wait, u32 flag) +{ + set_bit(flag, &wait->flags); +} + +bool iowait_flag_set(struct iowait *wait, u32 flag) +{ + return test_bit(flag, &wait->flags); +} + +inline void iowait_clear_flag(struct iowait *wait, u32 flag) +{ + clear_bit(flag, &wait->flags); +} + +/** + * iowait_init() - initialize wait structure + * @wait: wait struct to initialize + * @tx_limit: limit for overflow queuing + * @func: restart function for workqueue + * @sleep: sleep function for no space + * @resume: wakeup function for no space + * + * This function initializes the iowait + * structure embedded in the QP or PQ. 
+ * + */ +void iowait_init(struct iowait *wait, u32 tx_limit, + void (*func)(struct work_struct *work), + void (*tidfunc)(struct work_struct *work), + int (*sleep)(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + uint seq, + bool pkts_sent), + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait)) +{ + int i; + + wait->count = 0; + INIT_LIST_HEAD(&wait->list); + init_waitqueue_head(&wait->wait_dma); + init_waitqueue_head(&wait->wait_pio); + atomic_set(&wait->sdma_busy, 0); + atomic_set(&wait->pio_busy, 0); + wait->tx_limit = tx_limit; + wait->sleep = sleep; + wait->wakeup = wakeup; + wait->sdma_drained = sdma_drained; + wait->flags = 0; + for (i = 0; i < IOWAIT_SES; i++) { + wait->wait[i].iow = wait; + INIT_LIST_HEAD(&wait->wait[i].tx_head); + if (i == IOWAIT_IB_SE) + INIT_WORK(&wait->wait[i].iowork, func); + else + INIT_WORK(&wait->wait[i].iowork, tidfunc); + } +} + +/** + * iowait_cancel_work - cancel all work in iowait + * @w: the iowait struct + */ +void iowait_cancel_work(struct iowait *w) +{ + cancel_work_sync(&iowait_get_ib_work(w)->iowork); + cancel_work_sync(&iowait_get_tid_work(w)->iowork); +} + +/** + * iowait_set_work_flag - set work flag based on leg + * @w - the iowait work struct + */ +int iowait_set_work_flag(struct iowait_work *w) +{ + if (w == &w->iow->wait[IOWAIT_IB_SE]) { + iowait_set_flag(w->iow, IOWAIT_PENDING_IB); + return IOWAIT_IB_SE; + } + iowait_set_flag(w->iow, IOWAIT_PENDING_TID); + return IOWAIT_TID_SE; +} diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index 3d9c32c7c340..23a58ac0d47c 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -1,7 +1,7 @@ #ifndef _HFI1_IOWAIT_H #define _HFI1_IOWAIT_H /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -49,6 +49,7 @@ #include #include +#include #include #include "sdma_txreq.h" @@ -59,16 +60,47 @@ */ typedef void (*restart_t)(struct work_struct *work); +#define IOWAIT_PENDING_IB 0x0 +#define IOWAIT_PENDING_TID 0x1 + +/* + * A QP can have multiple Send Engines (SEs). + * + * The current use case is for supporting a TID RDMA + * packet build/xmit mechanism independent from verbs. + */ +#define IOWAIT_SES 2 +#define IOWAIT_IB_SE 0 +#define IOWAIT_TID_SE 1 + struct sdma_txreq; struct sdma_engine; /** - * struct iowait - linkage for delayed progress/waiting + * @iowork: the work struct + * @tx_head: list of prebuilt packets + * @iow: the parent iowait structure + * + * This structure is the work item (process) specific + * details associated with the each of the two SEs of the + * QP. + * + * The workstruct and the queued TXs are unique to each + * SE. 
+ */ +struct iowait; +struct iowait_work { + struct work_struct iowork; + struct list_head tx_head; + struct iowait *iow; +}; + +/** * @list: used to add/insert into QP/PQ wait lists - * @lock: uses to record the list head lock * @tx_head: overflow list of sdma_txreq's * @sleep: no space callback * @wakeup: space callback wakeup * @sdma_drained: sdma count drained + * @lock: lock protected head of wait queue * @iowork: workqueue overhead * @wait_dma: wait for sdma_busy == 0 * @wait_pio: wait for pio_busy == 0 @@ -76,6 +108,8 @@ struct sdma_engine; * @count: total number of descriptors in tx_head'ed list * @tx_limit: limit for overflow queuing * @tx_count: number of tx entry's in tx_head'ed list + * @flags: wait flags (one per QP) + * @wait: SE array * * This is to be embedded in user's state structure * (QP or PQ). @@ -98,13 +132,11 @@ struct sdma_engine; * Waiters explicity know that, but the destroy * code that unwaits QPs does not. */ - struct iowait { struct list_head list; - struct list_head tx_head; int (*sleep)( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, uint seq, bool pkts_sent @@ -112,7 +144,6 @@ struct iowait { void (*wakeup)(struct iowait *wait, int reason); void (*sdma_drained)(struct iowait *wait); seqlock_t *lock; - struct work_struct iowork; wait_queue_head_t wait_dma; wait_queue_head_t wait_pio; atomic_t sdma_busy; @@ -121,63 +152,37 @@ struct iowait { u32 tx_limit; u32 tx_count; u8 starved_cnt; + unsigned long flags; + struct iowait_work wait[IOWAIT_SES]; }; #define SDMA_AVAIL_REASON 0 -/** - * iowait_init() - initialize wait structure - * @wait: wait struct to initialize - * @tx_limit: limit for overflow queuing - * @func: restart function for workqueue - * @sleep: sleep function for no space - * @resume: wakeup function for no space - * - * This function initializes the iowait - * structure embedded in the QP or PQ. 
- * - */ +void iowait_set_flag(struct iowait *wait, u32 flag); +bool iowait_flag_set(struct iowait *wait, u32 flag); +void iowait_clear_flag(struct iowait *wait, u32 flag); -static inline void iowait_init( - struct iowait *wait, - u32 tx_limit, - void (*func)(struct work_struct *work), - int (*sleep)( - struct sdma_engine *sde, - struct iowait *wait, - struct sdma_txreq *tx, - uint seq, - bool pkts_sent), - void (*wakeup)(struct iowait *wait, int reason), - void (*sdma_drained)(struct iowait *wait)) -{ - wait->count = 0; - wait->lock = NULL; - INIT_LIST_HEAD(&wait->list); - INIT_LIST_HEAD(&wait->tx_head); - INIT_WORK(&wait->iowork, func); - init_waitqueue_head(&wait->wait_dma); - init_waitqueue_head(&wait->wait_pio); - atomic_set(&wait->sdma_busy, 0); - atomic_set(&wait->pio_busy, 0); - wait->tx_limit = tx_limit; - wait->sleep = sleep; - wait->wakeup = wakeup; - wait->sdma_drained = sdma_drained; -} +void iowait_init(struct iowait *wait, u32 tx_limit, + void (*func)(struct work_struct *work), + void (*tidfunc)(struct work_struct *work), + int (*sleep)(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + uint seq, + bool pkts_sent), + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait)); /** - * iowait_schedule() - initialize wait structure + * iowait_schedule() - schedule the default send engine work * @wait: wait struct to schedule * @wq: workqueue for schedule * @cpu: cpu */ -static inline void iowait_schedule( - struct iowait *wait, - struct workqueue_struct *wq, - int cpu) +static inline bool iowait_schedule(struct iowait *wait, + struct workqueue_struct *wq, int cpu) { - queue_work_on(cpu, wq, &wait->iowork); + return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork); } /** @@ -228,6 +233,8 @@ static inline void iowait_sdma_add(struct iowait *wait, int count) */ static inline int iowait_sdma_dec(struct iowait *wait) { + if (!wait) + return 0; return atomic_dec_and_test(&wait->sdma_busy); } @@ -267,11 +274,13 @@ static inline void iowait_pio_inc(struct iowait *wait) } /** - * iowait_sdma_dec - note pio complete + * iowait_pio_dec - note pio complete * @wait: iowait structure */ static inline int iowait_pio_dec(struct iowait *wait) { + if (!wait) + return 0; return atomic_dec_and_test(&wait->pio_busy); } @@ -293,9 +302,9 @@ static inline void iowait_drain_wakeup(struct iowait *wait) /** * iowait_get_txhead() - get packet off of iowait list * - * @wait wait struture + * @wait iowait_work struture */ -static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait) +static inline struct sdma_txreq *iowait_get_txhead(struct iowait_work *wait) { struct sdma_txreq *tx = NULL; @@ -309,6 +318,28 @@ static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait) return tx; } +static inline u16 iowait_get_desc(struct iowait_work *w) +{ + u16 num_desc = 0; + struct sdma_txreq *tx = NULL; + + if (!list_empty(&w->tx_head)) { + tx = list_first_entry(&w->tx_head, struct sdma_txreq, + list); + num_desc = tx->num_desc; + } + return num_desc; +} + +static inline u32 iowait_get_all_desc(struct iowait *w) +{ + u32 num_desc = 0; + + num_desc = iowait_get_desc(&w->wait[IOWAIT_IB_SE]); + num_desc += iowait_get_desc(&w->wait[IOWAIT_TID_SE]); + return num_desc; +} + /** * iowait_queue - Put the iowait on a wait queue * @pkts_sent: have some packets been sent before queuing? 
@@ -372,12 +403,57 @@ static inline void iowait_starve_find_max(struct iowait *w, u8 *max, } /** - * iowait_packet_queued() - determine if a packet is already built - * @wait: the wait structure + * iowait_packet_queued() - determine if a packet is queued + * @wait: the iowait_work structure */ -static inline bool iowait_packet_queued(struct iowait *wait) +static inline bool iowait_packet_queued(struct iowait_work *wait) { return !list_empty(&wait->tx_head); } +/** + * inc_wait_count - increment wait counts + * @w: the log work struct + * @n: the count + */ +static inline void iowait_inc_wait_count(struct iowait_work *w, u16 n) +{ + if (!w) + return; + w->iow->tx_count++; + w->iow->count += n; +} + +/** + * iowait_get_tid_work - return iowait_work for tid SE + * @w: the iowait struct + */ +static inline struct iowait_work *iowait_get_tid_work(struct iowait *w) +{ + return &w->wait[IOWAIT_TID_SE]; +} + +/** + * iowait_get_ib_work - return iowait_work for ib SE + * @w: the iowait struct + */ +static inline struct iowait_work *iowait_get_ib_work(struct iowait *w) +{ + return &w->wait[IOWAIT_IB_SE]; +} + +/** + * iowait_ioww_to_iow - return iowait given iowait_work + * @w: the iowait_work struct + */ +static inline struct iowait *iowait_ioww_to_iow(struct iowait_work *w) +{ + if (likely(w)) + return w->iow; + return NULL; +} + +void iowait_cancel_work(struct iowait *w); +int iowait_set_work_flag(struct iowait_work *w); + #endif diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index b1044a205ab6..126e9739e44f 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -66,7 +66,7 @@ MODULE_PARM_DESC(qp_table_size, "QP table size"); static void flush_tx_list(struct rvt_qp *qp); static int iowait_sleep( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *stx, unsigned int seq, bool pkts_sent); @@ -134,15 +134,13 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { }; -static void flush_tx_list(struct rvt_qp *qp) +static void flush_list_head(struct list_head *l) { - struct hfi1_qp_priv *priv = qp->priv; - - while (!list_empty(&priv->s_iowait.tx_head)) { + while (!list_empty(l)) { struct sdma_txreq *tx; tx = list_first_entry( - &priv->s_iowait.tx_head, + l, struct sdma_txreq, list); list_del_init(&tx->list); @@ -151,6 +149,14 @@ static void flush_tx_list(struct rvt_qp *qp) } } +static void flush_tx_list(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + flush_list_head(&iowait_get_ib_work(&priv->s_iowait)->tx_head); + flush_list_head(&iowait_get_tid_work(&priv->s_iowait)->tx_head); +} + static void flush_iowait(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; @@ -336,7 +342,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) * It is only used in the post send, which doesn't hold * the s_lock. */ -void _hfi1_schedule_send(struct rvt_qp *qp) +bool _hfi1_schedule_send(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibport *ibp = @@ -344,10 +350,10 @@ void _hfi1_schedule_send(struct rvt_qp *qp) struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); - iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, - priv->s_sde ? - priv->s_sde->cpu : - cpumask_first(cpumask_of_node(dd->node))); + return iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, + priv->s_sde ? 
+ priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node))); } static void qp_pio_drain(struct rvt_qp *qp) @@ -375,12 +381,32 @@ static void qp_pio_drain(struct rvt_qp *qp) * * This schedules qp progress and caller should hold * the s_lock. + * @return true if the first leg is scheduled; + * false if the first leg is not scheduled. */ -void hfi1_schedule_send(struct rvt_qp *qp) +bool hfi1_schedule_send(struct rvt_qp *qp) { lockdep_assert_held(&qp->s_lock); - if (hfi1_send_ok(qp)) + if (hfi1_send_ok(qp)) { _hfi1_schedule_send(qp); + return true; + } + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait, + IOWAIT_PENDING_IB); + return false; +} + +static void hfi1_qp_schedule(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + bool ret; + + if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_IB)) { + ret = hfi1_schedule_send(qp); + if (ret) + iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB); + } } void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) @@ -391,16 +417,22 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) if (qp->s_flags & flag) { qp->s_flags &= ~flag; trace_hfi1_qpwakeup(qp, flag); - hfi1_schedule_send(qp); + hfi1_qp_schedule(qp); } spin_unlock_irqrestore(&qp->s_lock, flags); /* Notify hfi1_destroy_qp() if it is waiting. */ rvt_put_qp(qp); } +void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait) +{ + if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) + qp->s_flags &= ~RVT_S_BUSY; +} + static int iowait_sleep( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *stx, uint seq, bool pkts_sent) @@ -441,7 +473,7 @@ static int iowait_sleep( rvt_get_qp(qp); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + hfi1_qp_unbusy(qp, wait); spin_unlock_irqrestore(&qp->s_lock, flags); ret = -EBUSY; } else { @@ -640,6 +672,7 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) &priv->s_iowait, 1, _hfi1_do_send, + NULL, iowait_sleep, iowait_wakeup, iowait_sdma_drained); @@ -689,7 +722,7 @@ void stop_send_queue(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; - cancel_work_sync(&priv->s_iowait.iowork); + iowait_cancel_work(&priv->s_iowait); } void quiesce_qp(struct rvt_qp *qp) diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index 078cff7560b6..7adb6dff6813 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -57,18 +57,6 @@ extern unsigned int hfi1_qp_table_size; extern const struct rvt_operation_params hfi1_post_parms[]; -/* - * Send if not busy or waiting for I/O and either - * a RC response is pending or we can process send work requests. - */ -static inline int hfi1_send_ok(struct rvt_qp *qp) -{ - return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) && - (verbs_txreq_queued(qp) || - (qp->s_flags & RVT_S_RESP_PENDING) || - !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); -} - /* * Driver specific s_flags starting at bit 31 down to HFI1_S_MIN_BIT_MASK * @@ -89,6 +77,20 @@ static inline int hfi1_send_ok(struct rvt_qp *qp) #define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN) #define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND) +/* + * Send if not busy or waiting for I/O and either + * a RC response is pending or we can process send work requests. 
+ */ +static inline int hfi1_send_ok(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + return !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)) && + (verbs_txreq_queued(iowait_get_ib_work(&priv->s_iowait)) || + (qp->s_flags & RVT_S_RESP_PENDING) || + !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); +} + /* * free_ahg - clear ahg from QP */ @@ -129,8 +131,8 @@ struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5); void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter); -void _hfi1_schedule_send(struct rvt_qp *qp); -void hfi1_schedule_send(struct rvt_qp *qp); +bool _hfi1_schedule_send(struct rvt_qp *qp); +bool hfi1_schedule_send(struct rvt_qp *qp); void hfi1_migrate_qp(struct rvt_qp *qp); @@ -150,4 +152,5 @@ void quiesce_qp(struct rvt_qp *qp); u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu); int mtu_to_path_mtu(u32 mtu); void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl); +void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait); #endif /* _QP_H */ diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 5f56f3c1b4c4..17b49b4309a3 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -825,8 +825,8 @@ void hfi1_do_send_from_rvt(struct rvt_qp *qp) void _hfi1_do_send(struct work_struct *work) { - struct iowait *wait = container_of(work, struct iowait, iowork); - struct rvt_qp *qp = iowait_to_qp(wait); + struct iowait_work *w = container_of(work, struct iowait_work, iowork); + struct rvt_qp *qp = iowait_to_qp(w->iow); hfi1_do_send(qp, true); } @@ -850,6 +850,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) ps.ibp = to_iport(qp->ibqp.device, qp->port_num); ps.ppd = ppd_from_ibp(ps.ibp); ps.in_thread = in_thread; + ps.wait = iowait_get_ib_work(&priv->s_iowait); trace_hfi1_rc_do_send(qp, in_thread); @@ -883,6 +884,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) /* Return if we are already busy processing a work request. */ if (!hfi1_send_ok(qp)) { + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); spin_unlock_irqrestore(&qp->s_lock, ps.flags); return; } @@ -896,7 +899,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) ps.pkts_sent = false; /* insure a pre-built packet is handled */ - ps.s_txreq = get_waiting_verbs_txreq(qp); + ps.s_txreq = get_waiting_verbs_txreq(ps.wait); do { /* Check for a constructed packet to be sent. 
*/ if (ps.s_txreq) { @@ -907,6 +910,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) */ if (hfi1_verbs_send(qp, &ps)) return; + /* allow other tasks to run */ if (schedule_send_yield(qp, &ps)) return; diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 7a9b67e82a96..891d2386d1ca 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -378,7 +378,7 @@ static inline void complete_tx(struct sdma_engine *sde, __sdma_txclean(sde->dd, tx); if (complete) (*complete)(tx, res); - if (wait && iowait_sdma_dec(wait)) + if (iowait_sdma_dec(wait)) iowait_drain_wakeup(wait); } @@ -1758,7 +1758,6 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) struct iowait *wait, *nw; struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; uint i, n = 0, seq, max_idx = 0; - struct sdma_txreq *stx; struct hfi1_ibdev *dev = &sde->dd->verbs_dev; u8 max_starved_cnt = 0; @@ -1779,19 +1778,13 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) nw, &sde->dmawait, list) { - u16 num_desc = 0; + u32 num_desc; if (!wait->wakeup) continue; if (n == ARRAY_SIZE(waits)) break; - if (!list_empty(&wait->tx_head)) { - stx = list_first_entry( - &wait->tx_head, - struct sdma_txreq, - list); - num_desc = stx->num_desc; - } + num_desc = iowait_get_all_desc(wait); if (num_desc > avail) break; avail -= num_desc; @@ -2346,7 +2339,7 @@ static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx) */ static int sdma_check_progress( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, bool pkts_sent) { @@ -2356,12 +2349,12 @@ static int sdma_check_progress( if (tx->num_desc <= sde->desc_avail) return -EAGAIN; /* pulse the head_lock */ - if (wait && wait->sleep) { + if (wait && iowait_ioww_to_iow(wait)->sleep) { unsigned seq; seq = raw_seqcount_begin( (const seqcount_t *)&sde->head_lock.seqcount); - ret = wait->sleep(sde, wait, tx, seq, pkts_sent); + ret = wait->iow->sleep(sde, wait, tx, seq, pkts_sent); if (ret == -EAGAIN) sde->desc_avail = sdma_descq_freecnt(sde); } else { @@ -2373,7 +2366,7 @@ static int sdma_check_progress( /** * sdma_send_txreq() - submit a tx req to ring * @sde: sdma engine to use - * @wait: wait structure to use when full (may be NULL) + * @wait: SE wait structure to use when full (may be NULL) * @tx: sdma_txreq to submit * @pkts_sent: has any packet been sent yet? 
* @@ -2386,7 +2379,7 @@ static int sdma_check_progress( * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state */ int sdma_send_txreq(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, bool pkts_sent) { @@ -2397,7 +2390,7 @@ int sdma_send_txreq(struct sdma_engine *sde, /* user should have supplied entire packet */ if (unlikely(tx->tlen)) return -EINVAL; - tx->wait = wait; + tx->wait = iowait_ioww_to_iow(wait); spin_lock_irqsave(&sde->tail_lock, flags); retry: if (unlikely(!__sdma_running(sde))) @@ -2406,14 +2399,14 @@ retry: goto nodesc; tail = submit_tx(sde, tx); if (wait) - iowait_sdma_inc(wait); + iowait_sdma_inc(iowait_ioww_to_iow(wait)); sdma_update_tail(sde, tail); unlock: spin_unlock_irqrestore(&sde->tail_lock, flags); return ret; unlock_noconn: if (wait) - iowait_sdma_inc(wait); + iowait_sdma_inc(iowait_ioww_to_iow(wait)); tx->next_descq_idx = 0; #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER tx->sn = sde->tail_sn++; @@ -2422,10 +2415,7 @@ unlock_noconn: spin_lock(&sde->flushlist_lock); list_add_tail(&tx->list, &sde->flushlist); spin_unlock(&sde->flushlist_lock); - if (wait) { - wait->tx_count++; - wait->count += tx->num_desc; - } + iowait_inc_wait_count(wait, tx->num_desc); schedule_work(&sde->flush_worker); ret = -ECOMM; goto unlock; @@ -2442,7 +2432,7 @@ nodesc: /** * sdma_send_txlist() - submit a list of tx req to ring * @sde: sdma engine to use - * @wait: wait structure to use when full (may be NULL) + * @wait: SE wait structure to use when full (may be NULL) * @tx_list: list of sdma_txreqs to submit * @count: pointer to a u16 which, after return will contain the total number of * sdma_txreqs removed from the tx_list. This will include sdma_txreqs @@ -2467,7 +2457,7 @@ nodesc: * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL) * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state */ -int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, +int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait, struct list_head *tx_list, u16 *count_out) { struct sdma_txreq *tx, *tx_next; @@ -2479,7 +2469,7 @@ int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, spin_lock_irqsave(&sde->tail_lock, flags); retry: list_for_each_entry_safe(tx, tx_next, tx_list, list) { - tx->wait = wait; + tx->wait = iowait_ioww_to_iow(wait); if (unlikely(!__sdma_running(sde))) goto unlock_noconn; if (unlikely(tx->num_desc > sde->desc_avail)) @@ -2500,8 +2490,9 @@ retry: update_tail: total_count = submit_count + flush_count; if (wait) { - iowait_sdma_add(wait, total_count); - iowait_starve_clear(submit_count > 0, wait); + iowait_sdma_add(iowait_ioww_to_iow(wait), total_count); + iowait_starve_clear(submit_count > 0, + iowait_ioww_to_iow(wait)); } if (tail != INVALID_TAIL) sdma_update_tail(sde, tail); @@ -2511,7 +2502,7 @@ update_tail: unlock_noconn: spin_lock(&sde->flushlist_lock); list_for_each_entry_safe(tx, tx_next, tx_list, list) { - tx->wait = wait; + tx->wait = iowait_ioww_to_iow(wait); list_del_init(&tx->list); tx->next_descq_idx = 0; #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER @@ -2520,10 +2511,7 @@ unlock_noconn: #endif list_add_tail(&tx->list, &sde->flushlist); flush_count++; - if (wait) { - wait->tx_count++; - wait->count += tx->num_desc; - } + iowait_inc_wait_count(wait, tx->num_desc); } spin_unlock(&sde->flushlist_lock); schedule_work(&sde->flush_worker); diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index c076eef081e8..6dc63d7c5685 100644 --- 
a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -1,7 +1,7 @@ #ifndef _HFI1_SDMA_H #define _HFI1_SDMA_H /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -840,14 +840,14 @@ static inline int sdma_txadd_kvaddr( dd, SDMA_MAP_SINGLE, tx, addr, len); } -struct iowait; +struct iowait_work; int sdma_send_txreq(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, bool pkts_sent); int sdma_send_txlist(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct list_head *tx_list, u16 *count_out); diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 825e475dc9fe..6e2aa4480c58 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -100,7 +100,7 @@ static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); static int defer_packet_queue( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *txreq, uint seq, bool pkts_sent); @@ -123,13 +123,13 @@ static struct mmu_rb_ops sdma_rb_ops = { static int defer_packet_queue( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *txreq, uint seq, bool pkts_sent) { struct hfi1_user_sdma_pkt_q *pq = - container_of(wait, struct hfi1_user_sdma_pkt_q, busy); + container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy); struct hfi1_ibdev *dev = &pq->dd->verbs_dev; struct user_sdma_txreq *tx = container_of(txreq, struct user_sdma_txreq, txreq); @@ -191,7 +191,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, atomic_set(&pq->n_locked, 0); pq->mm = fd->mm; - iowait_init(&pq->busy, 0, NULL, defer_packet_queue, + iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue, activate_packet_queue, NULL); pq->reqidx = 0; @@ -912,7 +912,9 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts) npkts++; } dosend: - ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count); + ret = sdma_send_txlist(req->sde, + iowait_get_ib_work(&pq->busy), + &req->txps, &count); req->seqsubmitted += count; if (req->seqsubmitted == req->info.npkts) { /* diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index bbee0cb77ff8..16b88948383b 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -737,7 +737,7 @@ static int wait_kmem(struct hfi1_ibdev *dev, if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { write_seqlock(&dev->iowait_lock); list_add_tail(&ps->s_txreq->txreq.list, - &priv->s_iowait.tx_head); + &ps->wait->tx_head); if (list_empty(&priv->s_iowait.list)) { if (list_empty(&dev->memwait)) mod_timer(&dev->mem_timer, jiffies + 1); @@ -748,7 +748,7 @@ static int wait_kmem(struct hfi1_ibdev *dev, rvt_get_qp(qp); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + hfi1_qp_unbusy(qp, ps->wait); ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -950,8 +950,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (unlikely(ret)) 
goto bail_build; } - ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq, - ps->pkts_sent); + ret = sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent); if (unlikely(ret < 0)) { if (ret == -ECOMM) goto bail_ecomm; @@ -1001,7 +1000,7 @@ static int pio_wait(struct rvt_qp *qp, if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { write_seqlock(&dev->iowait_lock); list_add_tail(&ps->s_txreq->txreq.list, - &priv->s_iowait.tx_head); + &ps->wait->tx_head); if (list_empty(&priv->s_iowait.list)) { struct hfi1_ibdev *dev = &dd->verbs_dev; int was_empty; @@ -1020,7 +1019,7 @@ static int pio_wait(struct rvt_qp *qp, hfi1_sc_wantpiobuf_intr(sc, 1); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + hfi1_qp_unbusy(qp, ps->wait); ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index bc77ffec51ce..d4164114396e 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -166,11 +166,13 @@ struct hfi1_qp_priv { * This structure is used to hold commonly lookedup and computed values during * the send engine progress. */ +struct iowait_work; struct hfi1_pkt_state { struct hfi1_ibdev *dev; struct hfi1_ibport *ibp; struct hfi1_pportdata *ppd; struct verbs_txreq *s_txreq; + struct iowait_work *wait; unsigned long flags; unsigned long timeout; unsigned long timeout_int; @@ -247,7 +249,7 @@ static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev) return container_of(rdi, struct hfi1_ibdev, rdi); } -static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait) +static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait) { struct hfi1_qp_priv *priv; diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h index 1c19bbc764b2..2a77af26a231 100644 --- a/drivers/infiniband/hw/hfi1/verbs_txreq.h +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -102,22 +102,19 @@ static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx) return &tx->txreq; } -static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp) +static inline struct verbs_txreq *get_waiting_verbs_txreq(struct iowait_work *w) { struct sdma_txreq *stx; - struct hfi1_qp_priv *priv = qp->priv; - stx = iowait_get_txhead(&priv->s_iowait); + stx = iowait_get_txhead(w); if (stx) return container_of(stx, struct verbs_txreq, txreq); return NULL; } -static inline bool verbs_txreq_queued(struct rvt_qp *qp) +static inline bool verbs_txreq_queued(struct iowait_work *w) { - struct hfi1_qp_priv *priv = qp->priv; - - return iowait_packet_queued(&priv->s_iowait); + return iowait_packet_queued(w); } void hfi1_put_txreq(struct verbs_txreq *tx); diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c index c3c96c5869ed..97bd940a056a 100644 --- a/drivers/infiniband/hw/hfi1/vnic_sdma.c +++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2017 Intel Corporation. + * Copyright(c) 2017 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -198,8 +198,8 @@ int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx, goto free_desc; tx->retry_count = 0; - ret = sdma_send_txreq(sde, &vnic_sdma->wait, &tx->txreq, - vnic_sdma->pkts_sent); + ret = sdma_send_txreq(sde, iowait_get_ib_work(&vnic_sdma->wait), + &tx->txreq, vnic_sdma->pkts_sent); /* When -ECOMM, sdma callback will be called with ABORT status */ if (unlikely(ret && unlikely(ret != -ECOMM))) goto free_desc; @@ -230,13 +230,13 @@ tx_err: * become available. */ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *txreq, uint seq, bool pkts_sent) { struct hfi1_vnic_sdma *vnic_sdma = - container_of(wait, struct hfi1_vnic_sdma, wait); + container_of(wait->iow, struct hfi1_vnic_sdma, wait); struct hfi1_ibdev *dev = &vnic_sdma->dd->verbs_dev; struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq); @@ -247,7 +247,7 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED; write_seqlock(&dev->iowait_lock); if (list_empty(&vnic_sdma->wait.list)) - iowait_queue(pkts_sent, wait, &sde->dmawait); + iowait_queue(pkts_sent, wait->iow, &sde->dmawait); write_sequnlock(&dev->iowait_lock); return -EBUSY; } @@ -285,7 +285,8 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) for (i = 0; i < vinfo->num_tx_q; i++) { struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[i]; - iowait_init(&vnic_sdma->wait, 0, NULL, hfi1_vnic_sdma_sleep, + iowait_init(&vnic_sdma->wait, 0, NULL, NULL, + hfi1_vnic_sdma_sleep, hfi1_vnic_sdma_wakeup, NULL); vnic_sdma->sde = &vinfo->dd->per_sdma[i]; vnic_sdma->dd = vinfo->dd; @@ -295,10 +296,12 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) /* Add a free descriptor watermark for wakeups */ if (vnic_sdma->sde->descq_cnt > HFI1_VNIC_SDMA_DESC_WTRMRK) { + struct iowait_work *work; + INIT_LIST_HEAD(&vnic_sdma->stx.list); vnic_sdma->stx.num_desc = HFI1_VNIC_SDMA_DESC_WTRMRK; - list_add_tail(&vnic_sdma->stx.list, - &vnic_sdma->wait.tx_head); + work = iowait_get_ib_work(&vnic_sdma->wait); + list_add_tail(&vnic_sdma->stx.list, &work->tx_head); } } } diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index ad9093d33cb2..26ab78e5aaa7 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1716,14 +1716,14 @@ void qib_unregister_ib_device(struct qib_devdata *dd) * It is only used in post send, which doesn't hold * the s_lock. */ -void _qib_schedule_send(struct rvt_qp *qp) +bool _qib_schedule_send(struct rvt_qp *qp) { struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct qib_pportdata *ppd = ppd_from_ibp(ibp); struct qib_qp_priv *priv = qp->priv; - queue_work(ppd->qib_wq, &priv->s_work); + return queue_work(ppd->qib_wq, &priv->s_work); } /** @@ -1733,8 +1733,9 @@ void _qib_schedule_send(struct rvt_qp *qp) * This schedules qp progress. The s_lock * should be held. */ -void qib_schedule_send(struct rvt_qp *qp) +bool qib_schedule_send(struct rvt_qp *qp) { if (qib_send_ok(qp)) - _qib_schedule_send(qp); + return _qib_schedule_send(qp); + return false; } diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 3d7b744ae8fb..df90a7a41534 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 - 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2012 - 2018 Intel Corporation. All rights reserved. 
* Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -223,8 +223,8 @@ static inline int qib_send_ok(struct rvt_qp *qp) !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); } -void _qib_schedule_send(struct rvt_qp *qp); -void qib_schedule_send(struct rvt_qp *qp); +bool _qib_schedule_send(struct rvt_qp *qp); +bool qib_schedule_send(struct rvt_qp *qp); static inline int qib_pkey_ok(u16 pkey1, u16 pkey2) { diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 52907204afcd..0a888a9ecc96 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -211,8 +211,8 @@ struct rvt_driver_provided { * version requires the s_lock not to be held. The other assumes the * s_lock is held. */ - void (*schedule_send)(struct rvt_qp *qp); - void (*schedule_send_no_lock)(struct rvt_qp *qp); + bool (*schedule_send)(struct rvt_qp *qp); + bool (*schedule_send_no_lock)(struct rvt_qp *qp); /* * Driver specific work request setup and checking. -- cgit v1.2.3 From d31131bba5a1630304c55ea775c48cc84912ab59 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Tue, 2 Oct 2018 16:11:21 +0300 Subject: RDMA: Remove unused parameter from ib_modify_qp_is_ok() The ll parameter is not used in ib_modify_qp_is_ok(), so remove it. Signed-off-by: Kamal Heib Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 3 +-- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 3 +-- drivers/infiniband/hw/hns/hns_roce_qp.c | 4 ++-- drivers/infiniband/hw/mlx4/qp.c | 8 +------- drivers/infiniband/hw/mlx5/qp.c | 5 ++--- drivers/infiniband/hw/mthca/mthca_qp.c | 4 ++-- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 3 +-- drivers/infiniband/hw/qedr/verbs.c | 3 +-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 2 +- drivers/infiniband/sw/rdmavt/qp.c | 5 +---- drivers/infiniband/sw/rxe/rxe_qp.c | 3 +-- include/rdma/ib_verbs.h | 4 +--- 12 files changed, 15 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index ee5fc8408add..1e7ad5e0a46e 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1509,8 +1509,7 @@ static const struct { }; bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll) + enum ib_qp_type type, enum ib_qp_attr_mask mask) { enum ib_qp_attr_mask req_param, opt_param; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index bc2b9e038439..9d7c48466f10 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1598,8 +1598,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, curr_qp_state = __to_ib_qp_state(qp->qplib_qp.cur_qp_state); new_qp_state = qp_attr->qp_state; if (!ib_modify_qp_is_ok(curr_qp_state, new_qp_state, - ib_qp->qp_type, qp_attr_mask, - IB_LINK_LAYER_ETHERNET)) { + ib_qp->qp_type, qp_attr_mask)) { dev_err(rdev_to_dev(rdev), "Invalid attribute mask: %#x specified ", qp_attr_mask); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index efb7e961ca65..0378fc41fcfa 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -952,8 +952,8 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } } - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - 
IB_LINK_LAYER_ETHERNET)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask)) { dev_err(dev, "ib_modify_qp_is_ok failed\n"); goto out; } diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 6dd3cd2c2f80..0711ca1dfb8f 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -2629,7 +2629,6 @@ enum { static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { - enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; @@ -2639,13 +2638,8 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (cur_state != new_state || cur_state != IB_QPS_RESET) { - int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; - ll = rdma_port_get_link_layer(&dev->ib_dev, port); - } - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask, ll)) { + attr_mask)) { pr_debug("qpn 0x%x: invalid attribute mask specified " "for transition %d to %d. qp_type %d," " attr_mask 0x%x\n", diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index c49a0815a12b..fa8e5dc65cb4 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3509,7 +3509,6 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, size_t required_cmd_sz; int err = -EINVAL; int port; - enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; if (ibqp->rwq_ind_tbl) return -ENOSYS; @@ -3555,7 +3554,6 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) { port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; - ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); } if (qp->flags & MLX5_IB_QP_UNDERLAY) { @@ -3566,7 +3564,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } } else if (qp_type != MLX5_IB_QPT_REG_UMR && qp_type != MLX5_IB_QPT_DCI && - !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { + !ib_modify_qp_is_ok(cur_state, new_state, qp_type, + attr_mask)) { mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", cur_state, new_state, ibqp->qp_type, attr_mask); goto out; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 3d37f2373d63..9d178ee3c96a 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -872,8 +872,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, new_state = attr_mask & IB_QP_STATE ? 
attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_UNSPECIFIED)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask)) { mthca_dbg(dev, "Bad QP transition (transport %d) " "%d->%d with attr 0x%08x\n", qp->transport, cur_state, new_state, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index c158ca9fde6d..06d2a7f3304c 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -1480,8 +1480,7 @@ int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_qps = old_qps; spin_unlock_irqrestore(&qp->q_lock, flags); - if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_ETHERNET)) { + if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask)) { pr_err("%s(%d) invalid attribute mask=0x%x specified for\n" "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n", __func__, dev->id, attr_mask, qp->id, ibqp->qp_type, diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 9d4d165014d9..82ee4b4a7084 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -2238,8 +2238,7 @@ int qedr_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (rdma_protocol_roce(&dev->ibdev, 1)) { if (!ib_modify_qp_is_ok(old_qp_state, new_qp_state, - ibqp->qp_type, attr_mask, - IB_LINK_LAYER_ETHERNET)) { + ibqp->qp_type, attr_mask)) { DP_ERR(dev, "modify qp: invalid attribute mask=0x%x specified for\n" "qpn=0x%x of type=0x%x old_qp_state=0x%x, new_qp_state=0x%x\n", diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index 60083c0363a5..cf22f57a9f0d 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -499,7 +499,7 @@ int pvrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, next_state = (attr_mask & IB_QP_STATE) ? attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, next_state, ibqp->qp_type, - attr_mask, IB_LINK_LAYER_ETHERNET)) { + attr_mask)) { ret = -EINVAL; goto out; } diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 2db71e956d02..a036a5368103 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1164,11 +1164,8 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int lastwqe = 0; int mig = 0; int pmtu = 0; /* for gcc warning only */ - enum rdma_link_layer link; int opa_ah; - link = rdma_port_get_link_layer(ibqp->device, qp->port_num); - spin_lock_irq(&qp->r_lock); spin_lock(&qp->s_hlock); spin_lock(&qp->s_lock); @@ -1179,7 +1176,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num); if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask, link)) + attr_mask)) goto inval; if (rdi->driver_f.check_modify_qp && diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 45b392b7342f..b9710907dac2 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -419,8 +419,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, enum ib_qp_state new_state = (mask & IB_QP_STATE) ? 
attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask, - IB_LINK_LAYER_ETHERNET)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) { pr_warn("invalid mask or state for qp\n"); goto err1; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9897d2329f2c..f88c1071413a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2742,7 +2742,6 @@ static inline int ib_destroy_usecnt(atomic_t *usecnt, * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes - * @ll : link layer of port * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. It @@ -2751,8 +2750,7 @@ static inline int ib_destroy_usecnt(atomic_t *usecnt, * and that the attribute mask supplied is allowed for the transition. */ bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll); + enum ib_qp_type type, enum ib_qp_attr_mask mask); void ib_register_event_handler(struct ib_event_handler *event_handler); void ib_unregister_event_handler(struct ib_event_handler *event_handler); -- cgit v1.2.3 From 38716732f161c3d107c4cc406a287f1201bed752 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Oct 2018 11:49:24 +0300 Subject: RDMA/netlink: Simplify netlink listener existence check All users of rdma_nl_chk_listeners() are interested to get boolean answer if netlink socket has listeners, so update all places to boolean function. Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/netlink.c | 4 ++-- drivers/infiniband/core/sa_query.c | 2 +- include/rdma/rdma_netlink.h | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index c2ca9e4b5160..1400a9d0d56d 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -315,7 +315,7 @@ static void queue_req(struct addr_req *req) static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { - if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) + if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) return -EADDRNOTAVAIL; return ib_nl_ip_send_msg(dev_addr, daddr, seq, family); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 3ccaae18ad75..724f5a62e82f 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -47,9 +47,9 @@ static struct { const struct rdma_nl_cbs *cb_table; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; -int rdma_nl_chk_listeners(unsigned int group) +bool rdma_nl_chk_listeners(unsigned int group) { - return (netlink_has_listeners(nls, group)) ? 
0 : -1; + return netlink_has_listeners(nls, group); } EXPORT_SYMBOL(rdma_nl_chk_listeners); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index a5e76d432d3f..f28f6fdb78cb 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1384,7 +1384,7 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && (!(query->flags & IB_SA_QUERY_OPA))) { - if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { + if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { if (!ib_nl_make_request(query, gfp_mask)) return id; } diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index c369703fcd69..70218e6b5187 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -96,7 +96,7 @@ int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); /** * Check if there are any listeners to the netlink group * @group: the netlink group ID - * Returns 0 on success or a negative for no listeners. + * Returns true on success or false if no listeners. */ -int rdma_nl_chk_listeners(unsigned int group); +bool rdma_nl_chk_listeners(unsigned int group); #endif /* _RDMA_NETLINK_H */ -- cgit v1.2.3 From b56511c15713ba6c7572e77a41f7ddba9c1053ec Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 24 Sep 2018 12:57:16 -0700 Subject: IB/mlx4: Avoid implicit enumerated type conversion Clang warns when one enumerated type is implicitly converted to another. drivers/infiniband/hw/mlx4/mad.c:1811:41: warning: implicit conversion from enumeration type 'enum mlx4_ib_qp_flags' to different enumeration type 'enum ib_qp_create_flags' [-Wenum-conversion] qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_TUNNEL_QP; ~ ^~~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/hw/mlx4/mad.c:1819:41: warning: implicit conversion from enumeration type 'enum mlx4_ib_qp_flags' to different enumeration type 'enum ib_qp_create_flags' [-Wenum-conversion] qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_SQP; ~ ^~~~~~~~~~~~~~~~~ The type mlx4_ib_qp_flags explicitly provides supplemental values to the type ib_qp_create_flags. Make that clear to Clang by changing the create_flags type to u32. Reported-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index f88c1071413a..7ce617d77f8f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1151,7 +1151,7 @@ struct ib_qp_init_attr { struct ib_qp_cap cap; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; - enum ib_qp_create_flags create_flags; + u32 create_flags; /* * Only needed for special QP types, or when using the RW API. -- cgit v1.2.3 From 019f118b94c895294debfaa394b267638fe2f648 Mon Sep 17 00:00:00 2001 From: Brian Welty Date: Wed, 26 Sep 2018 10:44:33 -0700 Subject: IB/{hfi1, qib, rdmavt}: Move copy SGE logic into rdmavt This patch moves hfi1_copy_sge() into rdmavt for sharing with qib. This patch also moves all the wss_*() functions into rdmavt as several wss_*() functions are called from hfi1_copy_sge() When SGE copy mode is adaptive, cacheless copy may be done in some cases for performance reasons. In those cases, X86 cacheless copy function is called since the drivers that use rdmavt and may set SGE copy mode to adaptive are X86 only. 
For this reason, this patch adds "depends on X86_64" to rdmavt/Kconfig. Reviewed-by: Ashutosh Dixit Reviewed-by: Michael J. Ruhl Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Brian Welty Signed-off-by: Harish Chegondi Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/init.c | 6 - drivers/infiniband/hw/hfi1/rc.c | 10 +- drivers/infiniband/hw/hfi1/ruc.c | 3 +- drivers/infiniband/hw/hfi1/uc.c | 10 +- drivers/infiniband/hw/hfi1/ud.c | 18 +-- drivers/infiniband/hw/hfi1/verbs.c | 226 +---------------------------- drivers/infiniband/hw/hfi1/verbs.h | 25 ---- drivers/infiniband/hw/qib/qib_rc.c | 10 +- drivers/infiniband/hw/qib/qib_ruc.c | 2 +- drivers/infiniband/hw/qib/qib_uc.c | 10 +- drivers/infiniband/hw/qib/qib_ud.c | 13 +- drivers/infiniband/hw/qib/qib_verbs.c | 22 +-- drivers/infiniband/hw/qib/qib_verbs.h | 3 - drivers/infiniband/sw/rdmavt/Kconfig | 2 +- drivers/infiniband/sw/rdmavt/qp.c | 258 ++++++++++++++++++++++++++++++++++ drivers/infiniband/sw/rdmavt/qp.h | 2 + drivers/infiniband/sw/rdmavt/vt.c | 12 +- include/rdma/rdma_vt.h | 22 +++ include/rdma/rdmavt_qp.h | 4 + 19 files changed, 344 insertions(+), 314 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 1e770a133779..09044905284f 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1504,9 +1504,6 @@ static int __init hfi1_mod_init(void) idr_init(&hfi1_unit_table); hfi1_dbg_init(); - ret = hfi1_wss_init(); - if (ret < 0) - goto bail_wss; ret = pci_register_driver(&hfi1_pci_driver); if (ret < 0) { pr_err("Unable to register driver: error %d\n", -ret); @@ -1515,8 +1512,6 @@ static int __init hfi1_mod_init(void) goto bail; /* all OK */ bail_dev: - hfi1_wss_exit(); -bail_wss: hfi1_dbg_exit(); idr_destroy(&hfi1_unit_table); dev_cleanup(); @@ -1533,7 +1528,6 @@ static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); node_affinity_destroy_all(); - hfi1_wss_exit(); hfi1_dbg_exit(); idr_destroy(&hfi1_unit_table); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 9bd63abb2dfe..673b31ebf0ac 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -1644,7 +1644,8 @@ read_middle: qp->s_rdma_read_len -= pmtu; update_last_psn(qp, psn); spin_unlock_irqrestore(&qp->s_lock, flags); - hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, pmtu, false, false); goto bail; case OP(RDMA_READ_RESPONSE_ONLY): @@ -1684,7 +1685,8 @@ read_last: if (unlikely(tlen != qp->s_rdma_read_len)) goto ack_len_err; aeth = be32_to_cpu(ohdr->u.aeth); - hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, tlen, false, false); WARN_ON(qp->s_rdma_read_sge.num_sge); (void)do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST), 0, rcd); @@ -2144,7 +2146,7 @@ send_middle: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto nack_inv; - hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -2200,7 +2202,7 @@ send_last: wc.byte_len = tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) goto nack_inv; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last); rvt_put_ss(&qp->r_sge); qp->r_msn++; if 
(!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 17b49b4309a3..223eaf184934 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -361,7 +361,8 @@ do_write: if (len > sge->sge_length) len = sge->sge_length; WARN_ON_ONCE(len == 0); - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, + len, release, copy_last); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index e254dcec6f64..48a320c01552 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -426,7 +426,7 @@ send_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto rewind; - hfi1_copy_sge(&qp->r_sge, data, pmtu, false, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false); break; case OP(SEND_LAST_WITH_IMMEDIATE): @@ -449,7 +449,7 @@ send_last: if (unlikely(wc.byte_len > qp->r_len)) goto rewind; wc.opcode = IB_WC_RECV; - hfi1_copy_sge(&qp->r_sge, data, tlen, false, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false); rvt_put_ss(&qp->s_rdma_read_sge); last_imm: wc.wr_id = qp->r_wr_id; @@ -523,7 +523,7 @@ rdma_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto drop; - hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -550,7 +550,7 @@ rdma_last_imm: } wc.byte_len = qp->r_len; wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); goto last_imm; @@ -564,7 +564,7 @@ rdma_last: tlen -= (hdrsize + extra_bytes); if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); break; diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 70d39fc450a1..e55bc4280d58 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -210,8 +210,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) } hfi1_make_grh(ibp, &grh, &grd, 0, 0); - hfi1_copy_sge(&qp->r_sge, &grh, - sizeof(grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(grh), true, false); wc.wc_flags |= IB_WC_GRH; } else { rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); @@ -228,7 +228,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (len > sge->sge_length) len = sge->sge_length; WARN_ON_ONCE(len == 0); - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, true, false); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; @@ -1019,8 +1019,8 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) goto drop; } if (packet->grh) { - hfi1_copy_sge(&qp->r_sge, packet->grh, - sizeof(struct ib_grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, packet->grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else if (packet->etype == RHF_RCV_TYPE_BYPASS) { struct ib_grh grh; @@ -1030,14 +1030,14 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) * out when creating 16B, add back the GRH here. 
*/ hfi1_make_ext_grh(packet, &grh, slid, dlid); - hfi1_copy_sge(&qp->r_sge, &grh, - sizeof(struct ib_grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else { rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); } - hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), - true, false); + rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), + true, false); rvt_put_ss(&qp->r_sge); if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) return; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 16b88948383b..0a47b46f979e 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -129,8 +129,6 @@ unsigned short piothreshold = 256; module_param(piothreshold, ushort, S_IRUGO); MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio"); -#define COPY_CACHELESS 1 -#define COPY_ADAPTIVE 2 static unsigned int sge_copy_mode; module_param(sge_copy_mode, uint, S_IRUGO); MODULE_PARM_DESC(sge_copy_mode, @@ -151,159 +149,13 @@ static int pio_wait(struct rvt_qp *qp, /* 16B trailing buffer */ static const u8 trail_buf[MAX_16B_PADDING]; -static uint wss_threshold; +static uint wss_threshold = 80; module_param(wss_threshold, uint, S_IRUGO); MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy"); static uint wss_clean_period = 256; module_param(wss_clean_period, uint, S_IRUGO); MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned"); -/* memory working set size */ -struct hfi1_wss { - unsigned long *entries; - atomic_t total_count; - atomic_t clean_counter; - atomic_t clean_entry; - - int threshold; - int num_entries; - long pages_mask; -}; - -static struct hfi1_wss wss; - -int hfi1_wss_init(void) -{ - long llc_size; - long llc_bits; - long table_size; - long table_bits; - - /* check for a valid percent range - default to 80 if none or invalid */ - if (wss_threshold < 1 || wss_threshold > 100) - wss_threshold = 80; - /* reject a wildly large period */ - if (wss_clean_period > 1000000) - wss_clean_period = 256; - /* reject a zero period */ - if (wss_clean_period == 0) - wss_clean_period = 1; - - /* - * Calculate the table size - the next power of 2 larger than the - * LLC size. LLC size is in KiB. - */ - llc_size = wss_llc_size() * 1024; - table_size = roundup_pow_of_two(llc_size); - - /* one bit per page in rounded up table */ - llc_bits = llc_size / PAGE_SIZE; - table_bits = table_size / PAGE_SIZE; - wss.pages_mask = table_bits - 1; - wss.num_entries = table_bits / BITS_PER_LONG; - - wss.threshold = (llc_bits * wss_threshold) / 100; - if (wss.threshold == 0) - wss.threshold = 1; - - atomic_set(&wss.clean_counter, wss_clean_period); - - wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries), - GFP_KERNEL); - if (!wss.entries) { - hfi1_wss_exit(); - return -ENOMEM; - } - - return 0; -} - -void hfi1_wss_exit(void) -{ - /* coded to handle partially initialized and repeat callers */ - kfree(wss.entries); - wss.entries = NULL; -} - -/* - * Advance the clean counter. When the clean period has expired, - * clean an entry. - * - * This is implemented in atomics to avoid locking. Because multiple - * variables are involved, it can be racy which can lead to slightly - * inaccurate information. Since this is only a heuristic, this is - * OK. Any innaccuracies will clean themselves out as the counter - * advances. 
That said, it is unlikely the entry clean operation will - * race - the next possible racer will not start until the next clean - * period. - * - * The clean counter is implemented as a decrement to zero. When zero - * is reached an entry is cleaned. - */ -static void wss_advance_clean_counter(void) -{ - int entry; - int weight; - unsigned long bits; - - /* become the cleaner if we decrement the counter to zero */ - if (atomic_dec_and_test(&wss.clean_counter)) { - /* - * Set, not add, the clean period. This avoids an issue - * where the counter could decrement below the clean period. - * Doing a set can result in lost decrements, slowing the - * clean advance. Since this a heuristic, this possible - * slowdown is OK. - * - * An alternative is to loop, advancing the counter by a - * clean period until the result is > 0. However, this could - * lead to several threads keeping another in the clean loop. - * This could be mitigated by limiting the number of times - * we stay in the loop. - */ - atomic_set(&wss.clean_counter, wss_clean_period); - - /* - * Uniquely grab the entry to clean and move to next. - * The current entry is always the lower bits of - * wss.clean_entry. The table size, wss.num_entries, - * is always a power-of-2. - */ - entry = (atomic_inc_return(&wss.clean_entry) - 1) - & (wss.num_entries - 1); - - /* clear the entry and count the bits */ - bits = xchg(&wss.entries[entry], 0); - weight = hweight64((u64)bits); - /* only adjust the contended total count if needed */ - if (weight) - atomic_sub(weight, &wss.total_count); - } -} - -/* - * Insert the given address into the working set array. - */ -static void wss_insert(void *address) -{ - u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask; - u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ - u32 nr = page & (BITS_PER_LONG - 1); - - if (!test_and_set_bit(nr, &wss.entries[entry])) - atomic_inc(&wss.total_count); - - wss_advance_clean_counter(); -} - -/* - * Is the working set larger than the threshold? - */ -static inline bool wss_exceeds_threshold(void) -{ - return atomic_read(&wss.total_count) >= wss.threshold; -} - /* * Translate ib_wr_opcode into ib_wc_opcode. */ @@ -438,79 +290,6 @@ static const u32 pio_opmask[BIT(3)] = { */ __be64 ib_hfi1_sys_image_guid; -/** - * hfi1_copy_sge - copy data to SGE memory - * @ss: the SGE state - * @data: the data to copy - * @length: the length of the data - * @release: boolean to release MR - * @copy_last: do a separate copy of the last 8 bytes - */ -void hfi1_copy_sge( - struct rvt_sge_state *ss, - void *data, u32 length, - bool release, - bool copy_last) -{ - struct rvt_sge *sge = &ss->sge; - int i; - bool in_last = false; - bool cacheless_copy = false; - - if (sge_copy_mode == COPY_CACHELESS) { - cacheless_copy = length >= PAGE_SIZE; - } else if (sge_copy_mode == COPY_ADAPTIVE) { - if (length >= PAGE_SIZE) { - /* - * NOTE: this *assumes*: - * o The first vaddr is the dest. - * o If multiple pages, then vaddr is sequential. 
- */ - wss_insert(sge->vaddr); - if (length >= (2 * PAGE_SIZE)) - wss_insert(sge->vaddr + PAGE_SIZE); - - cacheless_copy = wss_exceeds_threshold(); - } else { - wss_advance_clean_counter(); - } - } - if (copy_last) { - if (length > 8) { - length -= 8; - } else { - copy_last = false; - in_last = true; - } - } - -again: - while (length) { - u32 len = rvt_get_sge_length(sge, length); - - WARN_ON_ONCE(len == 0); - if (unlikely(in_last)) { - /* enforce byte transfer ordering */ - for (i = 0; i < len; i++) - ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; - } else if (cacheless_copy) { - cacheless_memcpy(sge->vaddr, data, len); - } else { - memcpy(sge->vaddr, data, len); - } - rvt_update_sge(ss, len, release); - data += len; - length -= len; - } - - if (copy_last) { - copy_last = false; - in_last = true; - length = 8; - goto again; - } -} - /* * Make sure the QP is ready and able to accept the given opcode. */ @@ -1949,6 +1728,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size; dd->verbs_dev.rdi.dparms.nports = dd->num_pports; dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); + dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; + dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; + dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index d4164114396e..eb99e8df6251 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -315,9 +315,6 @@ void hfi1_put_txreq(struct verbs_txreq *tx); int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps); -void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, - bool release, bool copy_last); - void hfi1_cnp_rcv(struct hfi1_packet *packet); void hfi1_uc_rcv(struct hfi1_packet *packet); @@ -393,28 +390,6 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc); -int hfi1_wss_init(void); -void hfi1_wss_exit(void); - -/* platform specific: return the lowest level cache (llc) size, in KiB */ -static inline int wss_llc_size(void) -{ - /* assume that the boot CPU value is universal for all CPUs */ - return boot_cpu_data.x86_cache_size; -} - -/* platform specific: cacheless copy */ -static inline void cacheless_memcpy(void *dst, void *src, size_t n) -{ - /* - * Use the only available X64 cacheless copy. Add a __user cast - * to quiet sparse. The src agument is already in the kernel so - * there are no security issues. The extra fault recovery machinery - * is not invoked. 
- */ - __copy_user_nocache(dst, (void __user *)src, n, 0); -} - static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr) { return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ); diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index f35fdeb14347..034b9729f991 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -1425,7 +1425,8 @@ read_middle: qp->s_rdma_read_len -= pmtu; update_last_psn(qp, psn); spin_unlock_irqrestore(&qp->s_lock, flags); - qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, pmtu, false, false); goto bail; case OP(RDMA_READ_RESPONSE_ONLY): @@ -1471,7 +1472,8 @@ read_last: if (unlikely(tlen != qp->s_rdma_read_len)) goto ack_len_err; aeth = be32_to_cpu(ohdr->u.aeth); - qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, tlen, false, false); WARN_ON(qp->s_rdma_read_sge.num_sge); (void) do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST), 0, rcd); @@ -1844,7 +1846,7 @@ send_middle: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto nack_inv; - qib_copy_sge(&qp->r_sge, data, pmtu, 1); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -1890,7 +1892,7 @@ send_last: wc.byte_len = tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) goto nack_inv; - qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); qp->r_msn++; if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index f8a7de795beb..bc2a9e208d18 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -354,7 +354,7 @@ again: if (len > sge->sge_length) len = sge->sge_length; BUG_ON(len == 0); - qib_copy_sge(&qp->r_sge, sge->vaddr, len, release); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, release, false); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 3e54bc11e0ae..0a090569148c 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -359,7 +359,7 @@ send_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto rewind; - qib_copy_sge(&qp->r_sge, data, pmtu, 0); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false); break; case OP(SEND_LAST_WITH_IMMEDIATE): @@ -385,7 +385,7 @@ send_last: if (unlikely(wc.byte_len > qp->r_len)) goto rewind; wc.opcode = IB_WC_RECV; - qib_copy_sge(&qp->r_sge, data, tlen, 0); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false); rvt_put_ss(&qp->s_rdma_read_sge); last_imm: wc.wr_id = qp->r_wr_id; @@ -449,7 +449,7 @@ rdma_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto drop; - qib_copy_sge(&qp->r_sge, data, pmtu, 1); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -479,7 +479,7 @@ rdma_last_imm: } wc.byte_len = qp->r_len; wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); goto last_imm; @@ -495,7 +495,7 @@ rdma_last: tlen -= (hdrsize + pad + 4); if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; - qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_copy_sge(qp, &qp->r_sge, data, 
tlen, true, false); rvt_put_ss(&qp->r_sge); break; diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index f8d029a2390f..b12b9c3a6b5c 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -162,8 +162,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) const struct ib_global_route *grd = rdma_ah_read_grh(ah_attr); qib_make_grh(ibp, &grh, grd, 0, 0); - qib_copy_sge(&qp->r_sge, &grh, - sizeof(grh), 1); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(grh), true, false); wc.wc_flags |= IB_WC_GRH; } else rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); @@ -179,7 +179,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (len > sge->sge_length) len = sge->sge_length; BUG_ON(len == 0); - qib_copy_sge(&qp->r_sge, sge->vaddr, len, 1); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; @@ -551,12 +551,13 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, goto drop; } if (has_grh) { - qib_copy_sge(&qp->r_sge, &hdr->u.l.grh, - sizeof(struct ib_grh), 1); + rvt_copy_sge(qp, &qp->r_sge, &hdr->u.l.grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); - qib_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1); + rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), + true, false); rvt_put_ss(&qp->r_sge); if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) return; diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 26ab78e5aaa7..ae6d42cc9651 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -131,27 +131,6 @@ const enum ib_wc_opcode ib_qib_wc_opcode[] = { */ __be64 ib_qib_sys_image_guid; -/** - * qib_copy_sge - copy data to SGE memory - * @ss: the SGE state - * @data: the data to copy - * @length: the length of the data - */ -void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, int release) -{ - struct rvt_sge *sge = &ss->sge; - - while (length) { - u32 len = rvt_get_sge_length(sge, length); - - WARN_ON_ONCE(len == 0); - memcpy(sge->vaddr, data, len); - rvt_update_sge(ss, len, release); - data += len; - length -= len; - } -} - /* * Count the number of DMA descriptors needed to send length bytes of data. * Don't modify the qib_sge_state to get the count. 
@@ -1631,6 +1610,7 @@ int qib_register_ib_device(struct qib_devdata *dd) dd->verbs_dev.rdi.dparms.node = dd->assigned_node_id; dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_IBA_IB; dd->verbs_dev.rdi.dparms.max_mad_size = IB_MGMT_MAD_SIZE; + dd->verbs_dev.rdi.dparms.sge_copy_mode = RVT_SGE_COPY_MEMCPY; qib_fill_device_attr(dd); diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index df90a7a41534..0c5e623ec70c 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -292,9 +292,6 @@ void qib_put_txreq(struct qib_verbs_txreq *tx); int qib_verbs_send(struct rvt_qp *qp, struct ib_header *hdr, u32 hdrwords, struct rvt_sge_state *ss, u32 len); -void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, - int release); - void qib_uc_rcv(struct qib_ibport *ibp, struct ib_header *hdr, int has_grh, void *data, u32 tlen, struct rvt_qp *qp); diff --git a/drivers/infiniband/sw/rdmavt/Kconfig b/drivers/infiniband/sw/rdmavt/Kconfig index 98e798007f75..7df896a18d38 100644 --- a/drivers/infiniband/sw/rdmavt/Kconfig +++ b/drivers/infiniband/sw/rdmavt/Kconfig @@ -1,6 +1,6 @@ config INFINIBAND_RDMAVT tristate "RDMA verbs transport library" - depends on 64BIT && ARCH_DMA_ADDR_T_64BIT + depends on X86_64 && ARCH_DMA_ADDR_T_64BIT depends on PCI select DMA_VIRT_OPS ---help--- diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index a036a5368103..d969b0803e6f 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -118,6 +118,187 @@ const int ib_rvt_state_ops[IB_QPS_ERR + 1] = { }; EXPORT_SYMBOL(ib_rvt_state_ops); +/* platform specific: return the last level cache (llc) size, in KiB */ +static int rvt_wss_llc_size(void) +{ + /* assume that the boot CPU value is universal for all CPUs */ + return boot_cpu_data.x86_cache_size; +} + +/* platform specific: cacheless copy */ +static void cacheless_memcpy(void *dst, void *src, size_t n) +{ + /* + * Use the only available X64 cacheless copy. Add a __user cast + * to quiet sparse. The src agument is already in the kernel so + * there are no security issues. The extra fault recovery machinery + * is not invoked. 
+ */ + __copy_user_nocache(dst, (void __user *)src, n, 0); +} + +void rvt_wss_exit(struct rvt_dev_info *rdi) +{ + struct rvt_wss *wss = rdi->wss; + + if (!wss) + return; + + /* coded to handle partially initialized and repeat callers */ + kfree(wss->entries); + wss->entries = NULL; + kfree(rdi->wss); + rdi->wss = NULL; +} + +/** + * rvt_wss_init - Init wss data structures + * + * Return: 0 on success + */ +int rvt_wss_init(struct rvt_dev_info *rdi) +{ + unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode; + unsigned int wss_threshold = rdi->dparms.wss_threshold; + unsigned int wss_clean_period = rdi->dparms.wss_clean_period; + long llc_size; + long llc_bits; + long table_size; + long table_bits; + struct rvt_wss *wss; + int node = rdi->dparms.node; + + if (sge_copy_mode != RVT_SGE_COPY_ADAPTIVE) { + rdi->wss = NULL; + return 0; + } + + rdi->wss = kzalloc_node(sizeof(*rdi->wss), GFP_KERNEL, node); + if (!rdi->wss) + return -ENOMEM; + wss = rdi->wss; + + /* check for a valid percent range - default to 80 if none or invalid */ + if (wss_threshold < 1 || wss_threshold > 100) + wss_threshold = 80; + + /* reject a wildly large period */ + if (wss_clean_period > 1000000) + wss_clean_period = 256; + + /* reject a zero period */ + if (wss_clean_period == 0) + wss_clean_period = 1; + + /* + * Calculate the table size - the next power of 2 larger than the + * LLC size. LLC size is in KiB. + */ + llc_size = rvt_wss_llc_size() * 1024; + table_size = roundup_pow_of_two(llc_size); + + /* one bit per page in rounded up table */ + llc_bits = llc_size / PAGE_SIZE; + table_bits = table_size / PAGE_SIZE; + wss->pages_mask = table_bits - 1; + wss->num_entries = table_bits / BITS_PER_LONG; + + wss->threshold = (llc_bits * wss_threshold) / 100; + if (wss->threshold == 0) + wss->threshold = 1; + + wss->clean_period = wss_clean_period; + atomic_set(&wss->clean_counter, wss_clean_period); + + wss->entries = kcalloc_node(wss->num_entries, sizeof(*wss->entries), + GFP_KERNEL, node); + if (!wss->entries) { + rvt_wss_exit(rdi); + return -ENOMEM; + } + + return 0; +} + +/* + * Advance the clean counter. When the clean period has expired, + * clean an entry. + * + * This is implemented in atomics to avoid locking. Because multiple + * variables are involved, it can be racy which can lead to slightly + * inaccurate information. Since this is only a heuristic, this is + * OK. Any innaccuracies will clean themselves out as the counter + * advances. That said, it is unlikely the entry clean operation will + * race - the next possible racer will not start until the next clean + * period. + * + * The clean counter is implemented as a decrement to zero. When zero + * is reached an entry is cleaned. + */ +static void wss_advance_clean_counter(struct rvt_wss *wss) +{ + int entry; + int weight; + unsigned long bits; + + /* become the cleaner if we decrement the counter to zero */ + if (atomic_dec_and_test(&wss->clean_counter)) { + /* + * Set, not add, the clean period. This avoids an issue + * where the counter could decrement below the clean period. + * Doing a set can result in lost decrements, slowing the + * clean advance. Since this a heuristic, this possible + * slowdown is OK. + * + * An alternative is to loop, advancing the counter by a + * clean period until the result is > 0. However, this could + * lead to several threads keeping another in the clean loop. + * This could be mitigated by limiting the number of times + * we stay in the loop. 
+ */ + atomic_set(&wss->clean_counter, wss->clean_period); + + /* + * Uniquely grab the entry to clean and move to next. + * The current entry is always the lower bits of + * wss.clean_entry. The table size, wss.num_entries, + * is always a power-of-2. + */ + entry = (atomic_inc_return(&wss->clean_entry) - 1) + & (wss->num_entries - 1); + + /* clear the entry and count the bits */ + bits = xchg(&wss->entries[entry], 0); + weight = hweight64((u64)bits); + /* only adjust the contended total count if needed */ + if (weight) + atomic_sub(weight, &wss->total_count); + } +} + +/* + * Insert the given address into the working set array. + */ +static void wss_insert(struct rvt_wss *wss, void *address) +{ + u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss->pages_mask; + u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ + u32 nr = page & (BITS_PER_LONG - 1); + + if (!test_and_set_bit(nr, &wss->entries[entry])) + atomic_inc(&wss->total_count); + + wss_advance_clean_counter(wss); +} + +/* + * Is the working set larger than the threshold? + */ +static inline bool wss_exceeds_threshold(struct rvt_wss *wss) +{ + return atomic_read(&wss->total_count) >= wss->threshold; +} + static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map) { @@ -2476,3 +2657,80 @@ void rvt_qp_iter(struct rvt_dev_info *rdi, rcu_read_unlock(); } EXPORT_SYMBOL(rvt_qp_iter); + +/** + * rvt_copy_sge - copy data to SGE memory + * @qp: associated QP + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + * @release: boolean to release MR + * @copy_last: do a separate copy of the last 8 bytes + */ +void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, + void *data, u32 length, + bool release, bool copy_last) +{ + struct rvt_sge *sge = &ss->sge; + int i; + bool in_last = false; + bool cacheless_copy = false; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + struct rvt_wss *wss = rdi->wss; + unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode; + + if (sge_copy_mode == RVT_SGE_COPY_CACHELESS) { + cacheless_copy = length >= PAGE_SIZE; + } else if (sge_copy_mode == RVT_SGE_COPY_ADAPTIVE) { + if (length >= PAGE_SIZE) { + /* + * NOTE: this *assumes*: + * o The first vaddr is the dest. + * o If multiple pages, then vaddr is sequential. 
+ */ + wss_insert(wss, sge->vaddr); + if (length >= (2 * PAGE_SIZE)) + wss_insert(wss, (sge->vaddr + PAGE_SIZE)); + + cacheless_copy = wss_exceeds_threshold(wss); + } else { + wss_advance_clean_counter(wss); + } + } + + if (copy_last) { + if (length > 8) { + length -= 8; + } else { + copy_last = false; + in_last = true; + } + } + +again: + while (length) { + u32 len = rvt_get_sge_length(sge, length); + + WARN_ON_ONCE(len == 0); + if (unlikely(in_last)) { + /* enforce byte transfer ordering */ + for (i = 0; i < len; i++) + ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; + } else if (cacheless_copy) { + cacheless_memcpy(sge->vaddr, data, len); + } else { + memcpy(sge->vaddr, data, len); + } + rvt_update_sge(ss, len, release); + data += len; + length -= len; + } + + if (copy_last) { + copy_last = false; + in_last = true; + length = 8; + goto again; + } +} +EXPORT_SYMBOL(rvt_copy_sge); diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h index 264811fdc530..6d883972e0b8 100644 --- a/drivers/infiniband/sw/rdmavt/qp.h +++ b/drivers/infiniband/sw/rdmavt/qp.h @@ -66,4 +66,6 @@ int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); +int rvt_wss_init(struct rvt_dev_info *rdi); +void rvt_wss_exit(struct rvt_dev_info *rdi); #endif /* DEF_RVTQP_H */ diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index e3249d46bcef..723d3daf2eba 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -774,6 +774,13 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) goto bail_no_mr; } + /* Memory Working Set Size */ + ret = rvt_wss_init(rdi); + if (ret) { + rvt_pr_err(rdi, "Error in WSS init.\n"); + goto bail_mr; + } + /* Completion queues */ spin_lock_init(&rdi->n_cqs_lock); @@ -832,7 +839,7 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rdi->driver_f.port_callback); if (ret) { rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); - goto bail_mr; + goto bail_wss; } rvt_create_mad_agents(rdi); @@ -840,6 +847,8 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rvt_pr_info(rdi, "Registration with rdmavt done.\n"); return ret; +bail_wss: + rvt_wss_exit(rdi); bail_mr: rvt_mr_exit(rdi); @@ -863,6 +872,7 @@ void rvt_unregister_device(struct rvt_dev_info *rdi) rvt_free_mad_agents(rdi); ib_unregister_device(&rdi->ibdev); + rvt_wss_exit(rdi); rvt_mr_exit(rdi); rvt_qp_exit(rdi); } diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 0a888a9ecc96..7fa2f2d46a3c 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -149,6 +149,10 @@ struct rvt_ibport { #define RVT_CQN_MAX 16 /* maximum length of cq name */ +#define RVT_SGE_COPY_MEMCPY 0 +#define RVT_SGE_COPY_CACHELESS 1 +#define RVT_SGE_COPY_ADAPTIVE 2 + /* * Things that are driver specific, module parameters in hfi1 and qib */ @@ -161,6 +165,9 @@ struct rvt_driver_params { */ unsigned int lkey_table_size; unsigned int qp_table_size; + unsigned int sge_copy_mode; + unsigned int wss_threshold; + unsigned int wss_clean_period; int qpn_start; int qpn_inc; int qpn_res_start; @@ -193,6 +200,19 @@ struct rvt_ah { u8 log_pmtu; }; +/* memory working set size */ +struct rvt_wss { + unsigned long *entries; + atomic_t total_count; + atomic_t clean_counter; + atomic_t clean_entry; + + int threshold; + int num_entries; + long pages_mask; + unsigned int 
clean_period; +}; + struct rvt_dev_info; struct rvt_swqe; struct rvt_driver_provided { @@ -418,6 +438,8 @@ struct rvt_dev_info { u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ spinlock_t n_mcast_grps_lock; + /* Memory Working Set Size */ + struct rvt_wss *wss; }; /** diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 927f6d5b6d0f..eaf2593ca822 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -678,6 +678,10 @@ void rvt_del_timers_sync(struct rvt_qp *qp); void rvt_stop_rc_timers(struct rvt_qp *qp); void rvt_add_retry_timer(struct rvt_qp *qp); +void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, + void *data, u32 length, + bool release, bool copy_last); + /** * struct rvt_qp_iter - the iterator for QPs * @qp - the current QP -- cgit v1.2.3 From 116aa0330ec71b9872dfb3a3cc5202a72b5a1666 Mon Sep 17 00:00:00 2001 From: Venkata Sandeep Dhanalakota Date: Wed, 26 Sep 2018 10:44:42 -0700 Subject: IB/{hfi1, qib, rdmavt}: Move send completion logic to rdmavt Moving send completion code into rdmavt in order to have shared logic between qib and hfi1 drivers. Reviewed-by: Mike Marciniszyn Reviewed-by: Brian Welty Signed-off-by: Venkata Sandeep Dhanalakota Signed-off-by: Harish Chegondi Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/rc.c | 14 +++++----- drivers/infiniband/hw/hfi1/ruc.c | 45 ++------------------------------- drivers/infiniband/hw/hfi1/uc.c | 4 +-- drivers/infiniband/hw/hfi1/ud.c | 4 +-- drivers/infiniband/hw/hfi1/verbs.c | 9 ++++--- drivers/infiniband/hw/hfi1/verbs.h | 3 --- drivers/infiniband/hw/qib/qib_rc.c | 8 +++--- drivers/infiniband/hw/qib/qib_ruc.c | 43 ++----------------------------- drivers/infiniband/hw/qib/qib_sdma.c | 2 +- drivers/infiniband/hw/qib/qib_uc.c | 2 +- drivers/infiniband/hw/qib/qib_ud.c | 4 +-- drivers/infiniband/hw/qib/qib_verbs.c | 7 +++-- drivers/infiniband/hw/qib/qib_verbs.h | 3 --- drivers/infiniband/sw/rdmavt/qp.c | 43 +++++++++++++++++++++++++++++++ drivers/infiniband/sw/rdmavt/trace_tx.h | 42 ++++++++++++++++++++++++++++++ include/rdma/rdma_vt.h | 3 +++ include/rdma/rdmavt_qp.h | 2 ++ 17 files changed, 124 insertions(+), 114 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 673b31ebf0ac..188aa4f686a0 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -309,7 +309,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } clear_ahg(qp); wqe = rvt_get_swqe_ptr(qp, qp->s_last); - hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ? IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); /* will get called again */ goto done_free_tx; @@ -378,9 +378,9 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) wqe->wr.ex.invalidate_rkey); local_ops = 1; } - hfi1_send_complete(qp, wqe, - err ? IB_WC_LOC_PROT_ERR - : IB_WC_SUCCESS); + rvt_send_complete(qp, wqe, + err ? 
IB_WC_LOC_PROT_ERR + : IB_WC_SUCCESS); if (local_ops) atomic_dec(&qp->local_ops_pending); goto done_free_tx; @@ -1043,7 +1043,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) hfi1_migrate_qp(qp); qp->s_retry = qp->s_retry_cnt; } else if (qp->s_last == qp->s_acked) { - hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); return; } else { /* need to handle delayed completion */ @@ -1468,7 +1468,7 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, ibp->rvp.n_other_naks++; class_b: if (qp->s_last == qp->s_acked) { - hfi1_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } break; @@ -1706,7 +1706,7 @@ ack_len_err: status = IB_WC_LOC_LEN_ERR; ack_err: if (qp->s_last == qp->s_acked) { - hfi1_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } ack_done: diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 223eaf184934..db1d0d8a04a5 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -411,7 +411,7 @@ send_comp: ibp->rvp.n_loop_pkts++; flush_send: sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - hfi1_send_complete(sqp, wqe, send_status); + rvt_send_complete(sqp, wqe, send_status); if (local_ops) { atomic_dec(&sqp->local_ops_pending); local_ops = 0; @@ -459,7 +459,7 @@ err: serr: spin_lock_irqsave(&sqp->s_lock, flags); - hfi1_send_complete(sqp, wqe, send_status); + rvt_send_complete(sqp, wqe, send_status); if (sqp->ibqp.qp_type == IB_QPT_RC) { int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); @@ -922,44 +922,3 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); spin_unlock_irqrestore(&qp->s_lock, ps.flags); } - -/* - * This should be called with s_lock held. - */ -void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status) -{ - u32 old_last, last; - - if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - return; - - last = qp->s_last; - old_last = last; - trace_hfi1_qp_send_completion(qp, wqe, last); - if (++last >= qp->s_size) - last = 0; - trace_hfi1_qp_send_completion(qp, wqe, last); - qp->s_last = last; - /* See post_send() */ - barrier(); - rvt_put_swqe(wqe); - if (qp->ibqp.qp_type == IB_QPT_UD || - qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI) - atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); - - rvt_qp_swqe_complete(qp, - wqe, - ib_hfi1_wc_opcode[wqe->wr.opcode], - status); - - if (qp->s_acked == old_last) - qp->s_acked = last; - if (qp->s_cur == old_last) - qp->s_cur = last; - if (qp->s_tail == old_last) - qp->s_tail = last; - if (qp->state == IB_QPS_SQD && last == qp->s_cur) - qp->s_draining = 0; -} diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 48a320c01552..6aca0c5a7f97 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -88,7 +88,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } clear_ahg(qp); wqe = rvt_get_swqe_ptr(qp, qp->s_last); - hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done_free_tx; } @@ -140,7 +140,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp, wqe->wr.ex.invalidate_rkey); local_ops = 1; } - hfi1_send_complete(qp, wqe, err ? 
IB_WC_LOC_PROT_ERR + rvt_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR : IB_WC_SUCCESS); if (local_ops) atomic_dec(&qp->local_ops_pending); diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index e55bc4280d58..4baa8f4d49de 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -518,7 +518,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; } wqe = rvt_get_swqe_ptr(qp, qp->s_last); - hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done_free_tx; } @@ -560,7 +560,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ud_loopback(qp, wqe); spin_lock_irqsave(&qp->s_lock, tflags); ps->flags = tflags; - hfi1_send_complete(qp, wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, wqe, IB_WC_SUCCESS); goto done_free_tx; } } diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 0a47b46f979e..bc7f00ba1988 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -492,7 +492,7 @@ static void verbs_sdma_complete( spin_lock(&qp->s_lock); if (tx->wqe) { - hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS); } else if (qp->ibqp.qp_type == IB_QPT_RC) { struct hfi1_opa_header *hdr; @@ -938,7 +938,7 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, pio_bail: if (qp->s_wqe) { spin_lock_irqsave(&qp->s_lock, flags); - hfi1_send_complete(qp, qp->s_wqe, wc_status); + rvt_send_complete(qp, qp->s_wqe, wc_status); spin_unlock_irqrestore(&qp->s_lock, flags); } else if (qp->ibqp.qp_type == IB_QPT_RC) { spin_lock_irqsave(&qp->s_lock, flags); @@ -1145,7 +1145,7 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) hfi1_cdbg(PIO, "%s() Failed. Completing with err", __func__); spin_lock_irqsave(&qp->s_lock, flags); - hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); spin_unlock_irqrestore(&qp->s_lock, flags); } return -EINVAL; @@ -1735,6 +1735,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; + /* opcode translation table */ + dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode; + ppd = dd->pport; for (i = 0; i < dd->num_pports; i++, ppd++) rvt_init_port(&dd->verbs_dev.rdi, diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index eb99e8df6251..64c9054db5f3 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -363,9 +363,6 @@ void hfi1_do_send_from_rvt(struct rvt_qp *qp); void hfi1_do_send(struct rvt_qp *qp, bool in_thread); -void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status); - void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn); int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 034b9729f991..6fa002940451 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -254,7 +254,7 @@ int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags) goto bail; } wqe = rvt_get_swqe_ptr(qp, qp->s_last); - qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ? 
IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); /* will get called again */ goto done; @@ -838,7 +838,7 @@ void qib_restart_rc(struct rvt_qp *qp, u32 psn, int wait) qib_migrate_qp(qp); qp->s_retry = qp->s_retry_cnt; } else if (qp->s_last == qp->s_acked) { - qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); return; } else /* XXX need to handle delayed completion */ @@ -1221,7 +1221,7 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, ibp->rvp.n_other_naks++; class_b: if (qp->s_last == qp->s_acked) { - qib_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } break; @@ -1492,7 +1492,7 @@ ack_len_err: status = IB_WC_LOC_LEN_ERR; ack_err: if (qp->s_last == qp->s_acked) { - qib_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } ack_done: diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index bc2a9e208d18..c5627baf5dbf 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -403,7 +403,7 @@ send_comp: ibp->rvp.n_loop_pkts++; flush_send: sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - qib_send_complete(sqp, wqe, send_status); + rvt_send_complete(sqp, wqe, send_status); goto again; rnr_nak: @@ -447,7 +447,7 @@ err: serr: spin_lock_irqsave(&sqp->s_lock, flags); - qib_send_complete(sqp, wqe, send_status); + rvt_send_complete(sqp, wqe, send_status); if (sqp->ibqp.qp_type == IB_QPT_RC) { int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); @@ -613,42 +613,3 @@ void qib_do_send(struct rvt_qp *qp) spin_unlock_irqrestore(&qp->s_lock, flags); } - -/* - * This should be called with s_lock held. 
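The qib_send_complete() being removed here, the hfi1 copy removed above, and the rvt_send_complete() that replaces them all share the same ring bookkeeping: advance s_last one slot modulo s_size, then drag every index that still pointed at the completed slot along with it. A minimal userspace model of just that bookkeeping, with locking, tracing, reference dropping, and CQ entry generation omitted:

#include <stdio.h>

struct sq_model {
	unsigned int s_size, s_last, s_acked, s_cur, s_tail;
};

/* advance s_last one slot; indices parked on the old slot follow it */
static void complete_one(struct sq_model *qp)
{
	unsigned int old_last = qp->s_last;
	unsigned int last = old_last + 1;

	if (last >= qp->s_size)
		last = 0;
	qp->s_last = last;
	if (qp->s_acked == old_last)
		qp->s_acked = last;
	if (qp->s_cur == old_last)
		qp->s_cur = last;
	if (qp->s_tail == old_last)
		qp->s_tail = last;
}

int main(void)
{
	struct sq_model qp = { .s_size = 4, .s_last = 3, .s_acked = 3,
			       .s_cur = 3, .s_tail = 3 };

	complete_one(&qp);	/* wraps: all four indices land on slot 0 */
	printf("s_last=%u s_acked=%u s_cur=%u s_tail=%u\n",
	       qp.s_last, qp.s_acked, qp.s_cur, qp.s_tail);
	return 0;
}

The main hfi1/qib divergence was the opcode translation table, which is why the shared version indexes the new per-driver rdi->wc_opcode instead of a hardcoded array.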
- */ -void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status) -{ - u32 old_last, last; - - if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - return; - - last = qp->s_last; - old_last = last; - if (++last >= qp->s_size) - last = 0; - qp->s_last = last; - /* See post_send() */ - barrier(); - rvt_put_swqe(wqe); - if (qp->ibqp.qp_type == IB_QPT_UD || - qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI) - atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); - - rvt_qp_swqe_complete(qp, - wqe, - ib_qib_wc_opcode[wqe->wr.opcode], - status); - - if (qp->s_acked == old_last) - qp->s_acked = last; - if (qp->s_cur == old_last) - qp->s_cur = last; - if (qp->s_tail == old_last) - qp->s_tail = last; - if (qp->state == IB_QPS_SQD && last == qp->s_cur) - qp->s_draining = 0; -} diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c index d0723d4aef5c..757d4c9d713d 100644 --- a/drivers/infiniband/hw/qib/qib_sdma.c +++ b/drivers/infiniband/hw/qib/qib_sdma.c @@ -651,7 +651,7 @@ unmap: if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) rvt_error_qp(qp, IB_WC_GENERAL_ERR); } else if (qp->s_wqe) - qib_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); spin_unlock(&qp->s_lock); spin_unlock(&qp->r_lock); /* return zero to process the next send work request */ diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 0a090569148c..30c70ad0f4bf 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -68,7 +68,7 @@ int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags) goto bail; } wqe = rvt_get_swqe_ptr(qp, qp->s_last); - qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done; } diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index b12b9c3a6b5c..4d4c31ea4e2d 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -260,7 +260,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) goto bail; } wqe = rvt_get_swqe_ptr(qp, qp->s_last); - qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done; } @@ -304,7 +304,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) qib_ud_loopback(qp, wqe); spin_lock_irqsave(&qp->s_lock, tflags); *flags = tflags; - qib_send_complete(qp, wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, wqe, IB_WC_SUCCESS); goto done; } } diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index ae6d42cc9651..8a45964c4700 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -731,7 +731,7 @@ static void sdma_complete(struct qib_sdma_txreq *cookie, int status) spin_lock(&qp->s_lock); if (tx->wqe) - qib_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS); else if (qp->ibqp.qp_type == IB_QPT_RC) { struct ib_header *hdr; @@ -1004,7 +1004,7 @@ done: } if (qp->s_wqe) { spin_lock_irqsave(&qp->s_lock, flags); - qib_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); spin_unlock_irqrestore(&qp->s_lock, flags); } else if (qp->ibqp.qp_type == IB_QPT_RC) { spin_lock_irqsave(&qp->s_lock, flags); @@ -1491,6 +1491,9 @@ static void qib_fill_device_attr(struct qib_devdata *dd) rdi->dparms.props.max_mcast_grp; /* post send table */ 
dd->verbs_dev.rdi.post_parms = qib_post_parms; + + /* opcode translation table */ + dd->verbs_dev.rdi.wc_opcode = ib_qib_wc_opcode; } /** diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 0c5e623ec70c..a4426c24b0d1 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -331,9 +331,6 @@ void _qib_do_send(struct work_struct *work); void qib_do_send(struct rvt_qp *qp); -void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status); - void qib_send_rc_ack(struct rvt_qp *qp); int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index d969b0803e6f..7e3ec6674cf7 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2658,6 +2658,49 @@ void rvt_qp_iter(struct rvt_dev_info *rdi, } EXPORT_SYMBOL(rvt_qp_iter); +/* + * This should be called with s_lock held. + */ +void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + return; + + last = qp->s_last; + old_last = last; + trace_rvt_qp_send_completion(qp, wqe, last); + if (++last >= qp->s_size) + last = 0; + trace_rvt_qp_send_completion(qp, wqe, last); + qp->s_last = last; + /* See post_send() */ + barrier(); + rvt_put_swqe(wqe); + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); + + rvt_qp_swqe_complete(qp, + wqe, + rdi->wc_opcode[wqe->wr.opcode], + status); + + if (qp->s_acked == old_last) + qp->s_acked = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; +} +EXPORT_SYMBOL(rvt_send_complete); + /** * rvt_copy_sge - copy data to SGE memory * @qp: associated QP diff --git a/drivers/infiniband/sw/rdmavt/trace_tx.h b/drivers/infiniband/sw/rdmavt/trace_tx.h index 0ef25fc49f25..d5df352eadb1 100644 --- a/drivers/infiniband/sw/rdmavt/trace_tx.h +++ b/drivers/infiniband/sw/rdmavt/trace_tx.h @@ -153,6 +153,48 @@ TRACE_EVENT( ) ); +TRACE_EVENT( + rvt_qp_send_completion, + TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe, u32 idx), + TP_ARGS(qp, wqe, idx), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) + __field(struct rvt_swqe *, wqe) + __field(u64, wr_id) + __field(u32, qpn) + __field(u32, qpt) + __field(u32, length) + __field(u32, idx) + __field(u32, ssn) + __field(enum ib_wr_opcode, opcode) + __field(int, send_flags) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + __entry->wqe = wqe; + __entry->wr_id = wqe->wr.wr_id; + __entry->qpn = qp->ibqp.qp_num; + __entry->qpt = qp->ibqp.qp_type; + __entry->length = wqe->length; + __entry->idx = idx; + __entry->ssn = wqe->ssn; + __entry->opcode = wqe->wr.opcode; + __entry->send_flags = wqe->wr.send_flags; + ), + TP_printk( + "[%s] qpn 0x%x qpt %u wqe %p idx %u wr_id %llx length %u ssn %u opcode %x send_flags %x", + __get_str(dev), + __entry->qpn, + __entry->qpt, + __entry->wqe, + __entry->idx, + __entry->wr_id, + __entry->length, + __entry->ssn, + __entry->opcode, + __entry->send_flags + ) +); #endif /* __RVT_TRACE_TX_H */ #undef TRACE_INCLUDE_PATH diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 
7fa2f2d46a3c..3584d0816fcd 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -398,6 +398,9 @@ struct rvt_dev_info { /* post send table */ const struct rvt_operation_params *post_parms; + /* opcode translation table */ + const enum ib_wc_opcode *wc_opcode; + /* Driver specific helper functions */ struct rvt_driver_provided driver_f; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index eaf2593ca822..6fd6f2ad9c0f 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -681,6 +681,8 @@ void rvt_add_retry_timer(struct rvt_qp *qp); void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, void *data, u32 length, bool release, bool copy_last); +void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status); /** * struct rvt_qp_iter - the iterator for QPs -- cgit v1.2.3 From 15703461533a5ffd775722390431625daaae7618 Mon Sep 17 00:00:00 2001 From: Venkata Sandeep Dhanalakota Date: Wed, 26 Sep 2018 10:44:52 -0700 Subject: IB/{hfi1, qib, rdmavt}: Move ruc_loopback to rdmavt This patch moves ruc_loopback() from hfi1 into rdmavt for code sharing with the qib driver. Reviewed-by: Brian Welty Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Venkata Sandeep Dhanalakota Signed-off-by: Harish Chegondi Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ruc.c | 332 +----------------------------------- drivers/infiniband/hw/qib/qib_ruc.c | 303 +------------------------------- drivers/infiniband/sw/rdmavt/qp.c | 331 +++++++++++++++++++++++++++++++++++ include/rdma/rdmavt_qp.h | 1 + 4 files changed, 335 insertions(+), 632 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index db1d0d8a04a5..7fb317c711df 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -155,334 +155,6 @@ int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_packet *packet) return 0; } -/** - * ruc_loopback - handle UC and RC loopback requests - * @sqp: the sending QP - * - * This is called from hfi1_do_send() to - * forward a WQE addressed to the same HFI. - * Note that although we are single threaded due to the send engine, we still - * have to protect against post_send(). We don't have to worry about - * receive interrupts since this is a connected protocol and all packets - * will pass through here. - */ -static void ruc_loopback(struct rvt_qp *sqp) -{ - struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); - struct rvt_qp *qp; - struct rvt_swqe *wqe; - struct rvt_sge *sge; - unsigned long flags; - struct ib_wc wc; - u64 sdata; - atomic64_t *maddr; - enum ib_wc_status send_status; - bool release; - int ret; - bool copy_last = false; - int local_ops = 0; - - rcu_read_lock(); - - /* - * Note that we check the responder QP state after - * checking the requester's state. - */ - qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, - sqp->remote_qpn); - - spin_lock_irqsave(&sqp->s_lock, flags); - - /* Return if we are already busy processing a work request. */ - if ((sqp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT)) || - !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - goto unlock; - - sqp->s_flags |= RVT_S_BUSY; - -again: - if (sqp->s_last == READ_ONCE(sqp->s_head)) - goto clr_busy; - wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); - - /* Return if it is not OK to start a new work request. 
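The ib_rvt_state_ops[] tests that follow gate each step on a per-state capability bitmask indexed by the QP state. A simplified model of how such a table answers "may I start a new WQE, or only flush it?" (flag values illustrative, not rdmavt's):

#include <stdio.h>

#define PROCESS_NEXT_SEND_OK	0x1
#define FLUSH_SEND		0x2

/* indexed by QP state; in this model only RTS starts work, ERR only flushes */
static const unsigned int state_ops[] = {
	[3] = PROCESS_NEXT_SEND_OK | FLUSH_SEND,	/* IB_QPS_RTS */
	[6] = FLUSH_SEND,				/* IB_QPS_ERR */
};

int main(void)
{
	unsigned int state = 6;		/* hypothetical QP in the error state */

	if (!(state_ops[state] & PROCESS_NEXT_SEND_OK)) {
		if (state_ops[state] & FLUSH_SEND)
			printf("flush the work request\n");	/* taken here */
		else
			printf("leave it queued\n");
	}
	return 0;
}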
*/ - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { - if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) - goto clr_busy; - /* We are in the error state, flush the work request. */ - send_status = IB_WC_WR_FLUSH_ERR; - goto flush_send; - } - - /* - * We can rely on the entry not changing without the s_lock - * being held until we update s_last. - * We increment s_cur to indicate s_last is in progress. - */ - if (sqp->s_last == sqp->s_cur) { - if (++sqp->s_cur >= sqp->s_size) - sqp->s_cur = 0; - } - spin_unlock_irqrestore(&sqp->s_lock, flags); - - if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || - qp->ibqp.qp_type != sqp->ibqp.qp_type) { - ibp->rvp.n_pkt_drops++; - /* - * For RC, the requester would timeout and retry so - * shortcut the timeouts and just signal too many retries. - */ - if (sqp->ibqp.qp_type == IB_QPT_RC) - send_status = IB_WC_RETRY_EXC_ERR; - else - send_status = IB_WC_SUCCESS; - goto serr; - } - - memset(&wc, 0, sizeof(wc)); - send_status = IB_WC_SUCCESS; - - release = true; - sqp->s_sge.sge = wqe->sg_list[0]; - sqp->s_sge.sg_list = wqe->sg_list + 1; - sqp->s_sge.num_sge = wqe->wr.num_sge; - sqp->s_len = wqe->length; - switch (wqe->wr.opcode) { - case IB_WR_REG_MR: - goto send_comp; - - case IB_WR_LOCAL_INV: - if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { - if (rvt_invalidate_rkey(sqp, - wqe->wr.ex.invalidate_rkey)) - send_status = IB_WC_LOC_PROT_ERR; - local_ops = 1; - } - goto send_comp; - - case IB_WR_SEND_WITH_INV: - if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) { - wc.wc_flags = IB_WC_WITH_INVALIDATE; - wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey; - } - goto send; - - case IB_WR_SEND_WITH_IMM: - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - /* FALLTHROUGH */ - case IB_WR_SEND: -send: - ret = rvt_get_rwqe(qp, false); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - break; - - case IB_WR_RDMA_WRITE_WITH_IMM: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - ret = rvt_get_rwqe(qp, true); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - /* skip copy_last set and qp_access_flags recheck */ - goto do_write; - case IB_WR_RDMA_WRITE: - copy_last = rvt_is_user_qp(qp); - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; -do_write: - if (wqe->length == 0) - break; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_WRITE))) - goto acc_err; - qp->r_sge.sg_list = NULL; - qp->r_sge.num_sge = 1; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_RDMA_READ: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) - goto inv_err; - if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_READ))) - goto acc_err; - release = false; - sqp->s_sge.sg_list = NULL; - sqp->s_sge.num_sge = 1; - qp->r_sge.sge = wqe->sg_list[0]; - qp->r_sge.sg_list = wqe->sg_list + 1; - qp->r_sge.num_sge = wqe->wr.num_sge; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_ATOMIC_CMP_AND_SWP: - case IB_WR_ATOMIC_FETCH_AND_ADD: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) - goto inv_err; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), - wqe->atomic_wr.remote_addr, - wqe->atomic_wr.rkey, - IB_ACCESS_REMOTE_ATOMIC))) - goto acc_err; - /* Perform atomic OP and save 
result. */ - maddr = (atomic64_t *)qp->r_sge.sge.vaddr; - sdata = wqe->atomic_wr.compare_add; - *(u64 *)sqp->s_sge.sge.vaddr = - (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? - (u64)atomic64_add_return(sdata, maddr) - sdata : - (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, - sdata, wqe->atomic_wr.swap); - rvt_put_mr(qp->r_sge.sge.mr); - qp->r_sge.num_sge = 0; - goto send_comp; - - default: - send_status = IB_WC_LOC_QP_OP_ERR; - goto serr; - } - - sge = &sqp->s_sge.sge; - while (sqp->s_len) { - u32 len = sqp->s_len; - - if (len > sge->length) - len = sge->length; - if (len > sge->sge_length) - len = sge->sge_length; - WARN_ON_ONCE(len == 0); - rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, - len, release, copy_last); - sge->vaddr += len; - sge->length -= len; - sge->sge_length -= len; - if (sge->sge_length == 0) { - if (!release) - rvt_put_mr(sge->mr); - if (--sqp->s_sge.num_sge) - *sge = *sqp->s_sge.sg_list++; - } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= RVT_SEGSZ) { - if (++sge->m >= sge->mr->mapsz) - break; - sge->n = 0; - } - sge->vaddr = - sge->mr->map[sge->m]->segs[sge->n].vaddr; - sge->length = - sge->mr->map[sge->m]->segs[sge->n].length; - } - sqp->s_len -= len; - } - if (release) - rvt_put_ss(&qp->r_sge); - - if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) - goto send_comp; - - if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) - wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - else - wc.opcode = IB_WC_RECV; - wc.wr_id = qp->r_wr_id; - wc.status = IB_WC_SUCCESS; - wc.byte_len = wqe->length; - wc.qp = &qp->ibqp; - wc.src_qp = qp->remote_qpn; - wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; - wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); - wc.port_num = 1; - /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - wqe->wr.send_flags & IB_SEND_SOLICITED); - -send_comp: - spin_lock_irqsave(&sqp->s_lock, flags); - ibp->rvp.n_loop_pkts++; -flush_send: - sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - rvt_send_complete(sqp, wqe, send_status); - if (local_ops) { - atomic_dec(&sqp->local_ops_pending); - local_ops = 0; - } - goto again; - -rnr_nak: - /* Handle RNR NAK */ - if (qp->ibqp.qp_type == IB_QPT_UC) - goto send_comp; - ibp->rvp.n_rnr_naks++; - /* - * Note: we don't need the s_lock held since the BUSY flag - * makes this single threaded. 
- */ - if (sqp->s_rnr_retry == 0) { - send_status = IB_WC_RNR_RETRY_EXC_ERR; - goto serr; - } - if (sqp->s_rnr_retry_cnt < 7) - sqp->s_rnr_retry--; - spin_lock_irqsave(&sqp->s_lock, flags); - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) - goto clr_busy; - rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer << - IB_AETH_CREDIT_SHIFT); - goto clr_busy; - -op_err: - send_status = IB_WC_REM_OP_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -inv_err: - send_status = IB_WC_REM_INV_REQ_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -acc_err: - send_status = IB_WC_REM_ACCESS_ERR; - wc.status = IB_WC_LOC_PROT_ERR; -err: - /* responder goes to error state */ - rvt_rc_error(qp, wc.status); - -serr: - spin_lock_irqsave(&sqp->s_lock, flags); - rvt_send_complete(sqp, wqe, send_status); - if (sqp->ibqp.qp_type == IB_QPT_RC) { - int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); - - sqp->s_flags &= ~RVT_S_BUSY; - spin_unlock_irqrestore(&sqp->s_lock, flags); - if (lastwqe) { - struct ib_event ev; - - ev.device = sqp->ibqp.device; - ev.element.qp = &sqp->ibqp; - ev.event = IB_EVENT_QP_LAST_WQE_REACHED; - sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); - } - goto done; - } -clr_busy: - sqp->s_flags &= ~RVT_S_BUSY; -unlock: - spin_unlock_irqrestore(&sqp->s_lock, flags); -done: - rcu_read_unlock(); -} - /** * hfi1_make_grh - construct a GRH header * @ibp: a pointer to the IB port @@ -860,7 +532,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ps.ppd->lmc) - 1)) == ps.ppd->lid)) { - ruc_loopback(qp); + rvt_ruc_loopback(qp); return; } make_req = hfi1_make_rc_req; @@ -870,7 +542,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ps.ppd->lmc) - 1)) == ps.ppd->lid)) { - ruc_loopback(qp); + rvt_ruc_loopback(qp); return; } make_req = hfi1_make_uc_req; diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index c5627baf5dbf..1fa21938f310 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -170,307 +170,6 @@ err: return 1; } -/** - * qib_ruc_loopback - handle UC and RC lookback requests - * @sqp: the sending QP - * - * This is called from qib_do_send() to - * forward a WQE addressed to the same HCA. - * Note that although we are single threaded due to the tasklet, we still - * have to protect against post_send(). We don't have to worry about - * receive interrupts since this is a connected protocol and all packets - * will pass through here. - */ -static void qib_ruc_loopback(struct rvt_qp *sqp) -{ - struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); - struct qib_pportdata *ppd = ppd_from_ibp(ibp); - struct qib_devdata *dd = ppd->dd; - struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; - struct rvt_qp *qp; - struct rvt_swqe *wqe; - struct rvt_sge *sge; - unsigned long flags; - struct ib_wc wc; - u64 sdata; - atomic64_t *maddr; - enum ib_wc_status send_status; - int release; - int ret; - - rcu_read_lock(); - /* - * Note that we check the responder QP state after - * checking the requester's state. - */ - qp = rvt_lookup_qpn(rdi, &ibp->rvp, sqp->remote_qpn); - if (!qp) - goto done; - - spin_lock_irqsave(&sqp->s_lock, flags); - - /* Return if we are already busy processing a work request. 
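The dispatch change in hfi1_do_send() above selects loopback purely by LID arithmetic: a port answers for 2^lmc consecutive LIDs starting at its base LID, so masking the low lmc bits off the DLID and comparing against the base LID detects a self-addressed packet. A small standalone illustration, with hypothetical LID and LMC values:

#include <stdbool.h>
#include <stdio.h>

/* does dlid fall inside this port's window of 2^lmc LIDs? */
static bool is_self_addressed(unsigned int dlid, unsigned int lid,
			      unsigned int lmc)
{
	return (dlid & ~((1u << lmc) - 1)) == lid;
}

int main(void)
{
	/* hypothetical port: base LID 0x40, LMC 2 -> owns LIDs 0x40..0x43 */
	printf("%d\n", is_self_addressed(0x42, 0x40, 2));	/* 1: loopback */
	printf("%d\n", is_self_addressed(0x44, 0x40, 2));	/* 0: goes on the wire */
	return 0;
}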
*/ - if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) || - !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - goto unlock; - - sqp->s_flags |= RVT_S_BUSY; - -again: - if (sqp->s_last == READ_ONCE(sqp->s_head)) - goto clr_busy; - wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); - - /* Return if it is not OK to start a new work reqeust. */ - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { - if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) - goto clr_busy; - /* We are in the error state, flush the work request. */ - send_status = IB_WC_WR_FLUSH_ERR; - goto flush_send; - } - - /* - * We can rely on the entry not changing without the s_lock - * being held until we update s_last. - * We increment s_cur to indicate s_last is in progress. - */ - if (sqp->s_last == sqp->s_cur) { - if (++sqp->s_cur >= sqp->s_size) - sqp->s_cur = 0; - } - spin_unlock_irqrestore(&sqp->s_lock, flags); - - if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || - qp->ibqp.qp_type != sqp->ibqp.qp_type) { - ibp->rvp.n_pkt_drops++; - /* - * For RC, the requester would timeout and retry so - * shortcut the timeouts and just signal too many retries. - */ - if (sqp->ibqp.qp_type == IB_QPT_RC) - send_status = IB_WC_RETRY_EXC_ERR; - else - send_status = IB_WC_SUCCESS; - goto serr; - } - - memset(&wc, 0, sizeof(wc)); - send_status = IB_WC_SUCCESS; - - release = 1; - sqp->s_sge.sge = wqe->sg_list[0]; - sqp->s_sge.sg_list = wqe->sg_list + 1; - sqp->s_sge.num_sge = wqe->wr.num_sge; - sqp->s_len = wqe->length; - switch (wqe->wr.opcode) { - case IB_WR_SEND_WITH_IMM: - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - /* FALLTHROUGH */ - case IB_WR_SEND: - ret = rvt_get_rwqe(qp, false); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - break; - - case IB_WR_RDMA_WRITE_WITH_IMM: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - ret = rvt_get_rwqe(qp, true); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - /* FALLTHROUGH */ - case IB_WR_RDMA_WRITE: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; - if (wqe->length == 0) - break; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_WRITE))) - goto acc_err; - qp->r_sge.sg_list = NULL; - qp->r_sge.num_sge = 1; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_RDMA_READ: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) - goto inv_err; - if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_READ))) - goto acc_err; - release = 0; - sqp->s_sge.sg_list = NULL; - sqp->s_sge.num_sge = 1; - qp->r_sge.sge = wqe->sg_list[0]; - qp->r_sge.sg_list = wqe->sg_list + 1; - qp->r_sge.num_sge = wqe->wr.num_sge; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_ATOMIC_CMP_AND_SWP: - case IB_WR_ATOMIC_FETCH_AND_ADD: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) - goto inv_err; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), - wqe->atomic_wr.remote_addr, - wqe->atomic_wr.rkey, - IB_ACCESS_REMOTE_ATOMIC))) - goto acc_err; - /* Perform atomic OP and save result. */ - maddr = (atomic64_t *) qp->r_sge.sge.vaddr; - sdata = wqe->atomic_wr.compare_add; - *(u64 *) sqp->s_sge.sge.vaddr = - (wqe->atomic_wr.wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? 
- (u64) atomic64_add_return(sdata, maddr) - sdata : - (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, - sdata, wqe->atomic_wr.swap); - rvt_put_mr(qp->r_sge.sge.mr); - qp->r_sge.num_sge = 0; - goto send_comp; - - default: - send_status = IB_WC_LOC_QP_OP_ERR; - goto serr; - } - - sge = &sqp->s_sge.sge; - while (sqp->s_len) { - u32 len = sqp->s_len; - - if (len > sge->length) - len = sge->length; - if (len > sge->sge_length) - len = sge->sge_length; - BUG_ON(len == 0); - rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, release, false); - sge->vaddr += len; - sge->length -= len; - sge->sge_length -= len; - if (sge->sge_length == 0) { - if (!release) - rvt_put_mr(sge->mr); - if (--sqp->s_sge.num_sge) - *sge = *sqp->s_sge.sg_list++; - } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= RVT_SEGSZ) { - if (++sge->m >= sge->mr->mapsz) - break; - sge->n = 0; - } - sge->vaddr = - sge->mr->map[sge->m]->segs[sge->n].vaddr; - sge->length = - sge->mr->map[sge->m]->segs[sge->n].length; - } - sqp->s_len -= len; - } - if (release) - rvt_put_ss(&qp->r_sge); - - if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) - goto send_comp; - - if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) - wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - else - wc.opcode = IB_WC_RECV; - wc.wr_id = qp->r_wr_id; - wc.status = IB_WC_SUCCESS; - wc.byte_len = wqe->length; - wc.qp = &qp->ibqp; - wc.src_qp = qp->remote_qpn; - wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr); - wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); - wc.port_num = 1; - /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - wqe->wr.send_flags & IB_SEND_SOLICITED); - -send_comp: - spin_lock_irqsave(&sqp->s_lock, flags); - ibp->rvp.n_loop_pkts++; -flush_send: - sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - rvt_send_complete(sqp, wqe, send_status); - goto again; - -rnr_nak: - /* Handle RNR NAK */ - if (qp->ibqp.qp_type == IB_QPT_UC) - goto send_comp; - ibp->rvp.n_rnr_naks++; - /* - * Note: we don't need the s_lock held since the BUSY flag - * makes this single threaded. 
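Two details of the rnr_nak handling deserve a note. First, a retry count of 7 means "retry forever" in InfiniBand, which is why s_rnr_retry is only decremented when s_rnr_retry_cnt < 7. Second, rvt_add_rnr_timer() takes an AETH-formatted word, so the 5-bit minimum RNR timer code is shifted into the syndrome position first; assuming the usual AETH layout (syndrome in bits 31:24), the shift amounts to:

#include <stdio.h>

#define IB_AETH_CREDIT_SHIFT 24	/* syndrome field position in the AETH */

int main(void)
{
	unsigned int r_min_rnr_timer = 16;	/* hypothetical 5-bit code, ~2.56 ms */
	unsigned int aeth = r_min_rnr_timer << IB_AETH_CREDIT_SHIFT;

	printf("synthetic aeth = 0x%08x\n", aeth);	/* 0x10000000 */
	return 0;
}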
- */ - if (sqp->s_rnr_retry == 0) { - send_status = IB_WC_RNR_RETRY_EXC_ERR; - goto serr; - } - if (sqp->s_rnr_retry_cnt < 7) - sqp->s_rnr_retry--; - spin_lock_irqsave(&sqp->s_lock, flags); - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) - goto clr_busy; - rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer << - IB_AETH_CREDIT_SHIFT); - goto clr_busy; - -op_err: - send_status = IB_WC_REM_OP_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -inv_err: - send_status = IB_WC_REM_INV_REQ_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -acc_err: - send_status = IB_WC_REM_ACCESS_ERR; - wc.status = IB_WC_LOC_PROT_ERR; -err: - /* responder goes to error state */ - rvt_rc_error(qp, wc.status); - -serr: - spin_lock_irqsave(&sqp->s_lock, flags); - rvt_send_complete(sqp, wqe, send_status); - if (sqp->ibqp.qp_type == IB_QPT_RC) { - int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); - - sqp->s_flags &= ~RVT_S_BUSY; - spin_unlock_irqrestore(&sqp->s_lock, flags); - if (lastwqe) { - struct ib_event ev; - - ev.device = sqp->ibqp.device; - ev.element.qp = &sqp->ibqp; - ev.event = IB_EVENT_QP_LAST_WQE_REACHED; - sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); - } - goto done; - } -clr_busy: - sqp->s_flags &= ~RVT_S_BUSY; -unlock: - spin_unlock_irqrestore(&sqp->s_lock, flags); -done: - rcu_read_unlock(); -} - /** * qib_make_grh - construct a GRH header * @ibp: a pointer to the IB port @@ -573,7 +272,7 @@ void qib_do_send(struct rvt_qp *qp) qp->ibqp.qp_type == IB_QPT_UC) && (rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) == ppd->lid) { - qib_ruc_loopback(qp); + rvt_ruc_loopback(qp); return; } diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 7e3ec6674cf7..1735deb1a9d4 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2777,3 +2777,334 @@ again: } } EXPORT_SYMBOL(rvt_copy_sge); + +/** + * ruc_loopback - handle UC and RC loopback requests + * @sqp: the sending QP + * + * This is called from rvt_do_send() to forward a WQE addressed to the same HFI + * Note that although we are single threaded due to the send engine, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +void rvt_ruc_loopback(struct rvt_qp *sqp) +{ + struct rvt_ibport *rvp = NULL; + struct rvt_dev_info *rdi = ib_to_rvt(sqp->ibqp.device); + struct rvt_qp *qp; + struct rvt_swqe *wqe; + struct rvt_sge *sge; + unsigned long flags; + struct ib_wc wc; + u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + bool release; + int ret; + bool copy_last = false; + int local_ops = 0; + + rcu_read_lock(); + rvp = rdi->ports[sqp->port_num - 1]; + + /* + * Note that we check the responder QP state after + * checking the requester's state. + */ + + qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), rvp, + sqp->remote_qpn); + + spin_lock_irqsave(&sqp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) || + !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + goto unlock; + + sqp->s_flags |= RVT_S_BUSY; + +again: + if (sqp->s_last == READ_ONCE(sqp->s_head)) + goto clr_busy; + wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work request. 
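The RVT_S_BUSY handling above is a classic try-lock-by-flag pattern: take s_lock, bail if another context already owns the send side, otherwise mark the QP busy and drop the lock so the potentially long copy loop can run unlocked. A toy pthread model of just that hand-off (the real code also folds in the state and wait-flag checks):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct sender {
	pthread_mutex_t lock;
	bool busy;
};

/* return true if we won the right to be the single active sender */
static bool try_become_sender(struct sender *s)
{
	bool won;

	pthread_mutex_lock(&s->lock);
	won = !s->busy;
	if (won)
		s->busy = true;	/* cleared under the lock again at clr_busy */
	pthread_mutex_unlock(&s->lock);
	return won;
}

int main(void)
{
	struct sender s = { PTHREAD_MUTEX_INITIALIZER, false };

	printf("%d\n", try_become_sender(&s));	/* 1: first caller wins */
	printf("%d\n", try_become_sender(&s));	/* 0: already busy, bail out */
	return 0;
}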
*/ + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. + */ + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } + spin_unlock_irqrestore(&sqp->s_lock, flags); + + if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || + qp->ibqp.qp_type != sqp->ibqp.qp_type) { + rvp->n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. + */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof(wc)); + send_status = IB_WC_SUCCESS; + + release = true; + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_REG_MR: + goto send_comp; + + case IB_WR_LOCAL_INV: + if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { + if (rvt_invalidate_rkey(sqp, + wqe->wr.ex.invalidate_rkey)) + send_status = IB_WC_LOC_PROT_ERR; + local_ops = 1; + } + goto send_comp; + + case IB_WR_SEND_WITH_INV: + if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) { + wc.wc_flags = IB_WC_WITH_INVALIDATE; + wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey; + } + goto send; + + case IB_WR_SEND_WITH_IMM: + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: +send: + ret = rvt_get_rwqe(qp, false); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + ret = rvt_get_rwqe(qp, true); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + /* skip copy_last set and qp_access_flags recheck */ + goto do_write; + case IB_WR_RDMA_WRITE: + copy_last = rvt_is_user_qp(qp); + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; +do_write: + if (wqe->length == 0) + break; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; + qp->r_sge.sg_list = NULL; + qp->r_sge.num_sge = 1; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + release = false; + sqp->s_sge.sg_list = NULL; + sqp->s_sge.num_sge = 1; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + wqe->atomic_wr.remote_addr, + wqe->atomic_wr.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save 
result. */ + maddr = (atomic64_t *)qp->r_sge.sge.vaddr; + sdata = wqe->atomic_wr.compare_add; + *(u64 *)sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? + (u64)atomic64_add_return(sdata, maddr) - sdata : + (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, + sdata, wqe->atomic_wr.swap); + rvt_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + goto send_comp; + + default: + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, + len, release, copy_last); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (!release) + rvt_put_mr(sge->mr); + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + if (release) + rvt_put_ss(&qp->r_sge); + + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; + wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); + rvp->n_loop_pkts++; +flush_send: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + rvt_send_complete(sqp, wqe, send_status); + if (local_ops) { + atomic_dec(&sqp->local_ops_pending); + local_ops = 0; + } + goto again; + +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + rvp->n_rnr_naks++; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. 
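The atomic branch earlier in this function preserves the verbs contract that the responder hands back the value remote memory held before the operation; that is why the code subtracts sdata from atomic64_add_return(), and why it relies on cmpxchg() returning the old value whether or not the swap happened. A userspace model in C11 atomics, with made-up numbers:

#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	_Atomic uint64_t mem = 10;	/* remote memory word */
	uint64_t sdata = 5;		/* the WQE's compare_add field */
	uint64_t swap = 99;		/* the WQE's swap field */

	/* IB_WR_ATOMIC_FETCH_AND_ADD: add sdata, return the prior value */
	uint64_t old = atomic_fetch_add(&mem, sdata);
	printf("fetch_add: old=%" PRIu64 " mem=%" PRIu64 "\n",
	       old, (uint64_t)atomic_load(&mem));	/* old=10 mem=15 */

	/* IB_WR_ATOMIC_CMP_AND_SWP: store swap iff mem == sdata, return prior */
	uint64_t expected = sdata;
	atomic_compare_exchange_strong(&mem, &expected, swap);
	printf("cmp_swp:   old=%" PRIu64 " mem=%" PRIu64 "\n",
	       expected, (uint64_t)atomic_load(&mem));	/* old=15, no swap */
	return 0;
}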
+ */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) + goto clr_busy; + rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer << + IB_AETH_CREDIT_SHIFT); + goto clr_busy; + +op_err: + send_status = IB_WC_REM_OP_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + rvt_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + rvt_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~RVT_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~RVT_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); +done: + rcu_read_unlock(); +} +EXPORT_SYMBOL(rvt_ruc_loopback); diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 6fd6f2ad9c0f..cbafb1878669 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -683,6 +683,7 @@ void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, bool release, bool copy_last); void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, enum ib_wc_status status); +void rvt_ruc_loopback(struct rvt_qp *qp); /** * struct rvt_qp_iter - the iterator for QPs -- cgit v1.2.3 From 363ad35577de3a73cf97006ec5f00fccaee73172 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Oct 2018 11:48:01 +0300 Subject: RDMA/restrack: Un-inline set task implementation Prepare rdma_restrack_set_task() call to accommodate more code by moving its implementation from *.h to *.c. 
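Beyond making room for the logic the next patch adds, moving the body out of the header means modular callers now bind to an exported symbol instead of inlining it. What the helper does is a reference hand-off: unpin the previously recorded task, pin the new one. A toy refcount model of that hand-off (safe in the kernel because callers pass current, which always holds a reference of its own):

#include <stdio.h>

struct task {
	int refcount;
};

static void get_task(struct task *t) { t->refcount++; }
static void put_task(struct task *t) { t->refcount--; }

/* mirror of the hand-off: release the old task, pin the new one */
static void set_task(struct task **slot, struct task *new_task)
{
	if (*slot)
		put_task(*slot);
	get_task(new_task);
	*slot = new_task;
}

int main(void)
{
	struct task a = { .refcount = 1 }, b = { .refcount = 1 };
	struct task *slot = NULL;

	set_task(&slot, &a);	/* a: 2 */
	set_task(&slot, &b);	/* a back to 1, b: 2 */
	printf("a=%d b=%d\n", a.refcount, b.refcount);
	return 0;
}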
Reviewed-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/restrack.c | 10 ++++++++++ include/rdma/restrack.h | 10 ++-------- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index bcc693fffd4c..b02d43988e16 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -155,6 +155,16 @@ static bool res_is_user(struct rdma_restrack_entry *res) } } +void rdma_restrack_set_task(struct rdma_restrack_entry *res, + struct task_struct *task) +{ + if (res->task) + put_task_struct(res->task); + get_task_struct(task); + res->task = task; +} +EXPORT_SYMBOL(rdma_restrack_set_task); + void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 9654d33edd98..0bddbbdbaf7c 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -175,14 +175,8 @@ int rdma_restrack_put(struct rdma_restrack_entry *res); * @res: resource entry * @task: task struct */ -static inline void rdma_restrack_set_task(struct rdma_restrack_entry *res, - struct task_struct *task) -{ - if (res->task) - put_task_struct(res->task); - get_task_struct(task); - res->task = task; -} +void rdma_restrack_set_task(struct rdma_restrack_entry *res, + struct task_struct *task); /* * Helper functions for rdma drivers when filling out -- cgit v1.2.3 From 2165fc264079ecb7fbfa5e8b330a92eb3f0fcbe1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Oct 2018 11:48:02 +0300 Subject: RDMA/restrack: Consolidate task name updates in one place Unify task update and kernel name set in one place. 
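The effect is easiest to see at a call site. This is the cma.c pattern the diff below collapses (quoted from the patch; a fragment, not a standalone program):

/* before: every call site open-coded the kernel-vs-user choice */
if (caller)
	id_priv->res.kern_name = caller;
else
	rdma_restrack_set_task(&id_priv->res, current);

/* after: one call, where a NULL caller means "bind the current task" */
rdma_restrack_set_task(&id_priv->res, caller);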
Reviewed-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 10 ++-------- drivers/infiniband/core/cq.c | 2 +- drivers/infiniband/core/restrack.c | 13 +++++++++---- drivers/infiniband/core/verbs.c | 4 ++-- include/rdma/restrack.h | 4 ++-- 5 files changed, 16 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 897aac68158b..f117b755c4c2 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -875,10 +875,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, if (!id_priv) return ERR_PTR(-ENOMEM); - if (caller) - id_priv->res.kern_name = caller; - else - rdma_restrack_set_task(&id_priv->res, current); + rdma_restrack_set_task(&id_priv->res, caller); id_priv->res.type = RDMA_RESTRACK_CM_ID; id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; @@ -3945,10 +3942,7 @@ int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, id_priv = container_of(id, struct rdma_id_private, id); - if (caller) - id_priv->res.kern_name = caller; - else - rdma_restrack_set_task(&id_priv->res, current); + rdma_restrack_set_task(&id_priv->res, caller); if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index 9271f7290005..b1e5365ddafa 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -161,7 +161,7 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, goto out_destroy_cq; cq->res.type = RDMA_RESTRACK_CQ; - cq->res.kern_name = caller; + rdma_restrack_set_task(&cq->res, caller); rdma_restrack_add(&cq->res); switch (cq->poll_ctx) { diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index b02d43988e16..035af568ba64 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -156,12 +156,17 @@ static bool res_is_user(struct rdma_restrack_entry *res) } void rdma_restrack_set_task(struct rdma_restrack_entry *res, - struct task_struct *task) + const char *caller) { + if (caller) { + res->kern_name = caller; + return; + } + if (res->task) put_task_struct(res->task); - get_task_struct(task); - res->task = task; + get_task_struct(current); + res->task = current; } EXPORT_SYMBOL(rdma_restrack_set_task); @@ -177,7 +182,7 @@ void rdma_restrack_add(struct rdma_restrack_entry *res) if (res_is_user(res)) { if (!res->task) - rdma_restrack_set_task(res, current); + rdma_restrack_set_task(res, NULL); res->kern_name = NULL; } else { set_kern_name(res); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 1e7ad5e0a46e..65a7e0b44ad7 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -264,7 +264,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, } pd->res.type = RDMA_RESTRACK_PD; - pd->res.kern_name = caller; + rdma_restrack_set_task(&pd->res, caller); rdma_restrack_add(&pd->res); if (mr_access_flags) { @@ -1889,7 +1889,7 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0); cq->res.type = RDMA_RESTRACK_CQ; - cq->res.kern_name = caller; + rdma_restrack_set_task(&cq->res, caller); rdma_restrack_add(&cq->res); } diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 0bddbbdbaf7c..2638fa7cd702 100644 --- a/include/rdma/restrack.h +++ 
b/include/rdma/restrack.h @@ -173,10 +173,10 @@ int rdma_restrack_put(struct rdma_restrack_entry *res); /** * rdma_restrack_set_task() - set the task for this resource * @res: resource entry - * @task: task struct + * @caller: kernel name, the current task will be used if the caller is NULL. */ void rdma_restrack_set_task(struct rdma_restrack_entry *res, - struct task_struct *task); + const char *caller); /* * Helper functions for rdma drivers when filling out -- cgit v1.2.3 From ba4a41198324be2e6fbb06c270fdc8500c0e38de Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 10 Oct 2018 09:55:10 +0300 Subject: RDMA/mlx5: Add support for flow tag to raw create flow A user can provide a hint which will be attached to the packet and written to the CQE on receive. This can be used as a way to offload operations into the HW, for example parsing a packet which is a tunneled packet, and if so, pass 0x1 as the hint. The software can use that hint to decapsulate the packet and parse only the inner headers thus saving CPU cycles. Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 15 ++++++++++++++- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 4ee4af450720..e57435cb6d96 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -153,6 +153,16 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( arr_flow_actions[i]->object); } + ret = uverbs_copy_from(&flow_act.flow_tag, attrs, + MLX5_IB_ATTR_CREATE_FLOW_TAG); + if (!ret) { + if (flow_act.flow_tag >= BIT(24)) { + ret = -EINVAL; + goto err_out; + } + flow_act.has_flow_tag = true; + } + flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, &flow_act, cmd_in, inlen, dest_id, dest_type); @@ -513,7 +523,10 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_OBJECT_FLOW_ACTION, UVERBS_ACCESS_READ, 1, MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS, - UA_OPTIONAL)); + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_TAG, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_DESTROY_FLOW, diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index fb4a8b17cca8..408e220034de 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -157,6 +157,7 @@ enum mlx5_ib_create_flow_attrs { MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, MLX5_IB_ATTR_CREATE_FLOW_MATCHER, MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, + MLX5_IB_ATTR_CREATE_FLOW_TAG, }; enum mlx5_ib_destoy_flow_attrs { -- cgit v1.2.3 From 9549c2bd094f0f54b8827d64886f5b1de370dff3 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 11 Oct 2018 17:30:04 +0300 Subject: RDMA/core: Align multiple functions to kernel coding style This patch changes the small number of functions to be aligned to kernel coding style. It is needed to minimize the diffstat of the following patch. It doesn't change any functionality. 
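Referring back to the flow-tag patch above: a minimal, purely hypothetical consumer-side sketch of the idea that the 24-bit tag programmed on a flow comes back in the receive CQE, so software can branch on it instead of re-parsing the outer headers. The tag value 0x1 and the helper name example_needs_decap() are assumptions for illustration only; they are not part of the ABI.

#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_TAG_TUNNELED	0x1u		/* assumed convention, not defined by the ABI */
#define EXAMPLE_TAG_LIMIT	(1u << 24)	/* larger tags are rejected at flow-create time */

static bool example_needs_decap(uint32_t cqe_flow_tag)
{
	/* a valid flow can only have been created with a tag below 2^24 */
	if (cqe_flow_tag >= EXAMPLE_TAG_LIMIT)
		return false;

	return cqe_flow_tag == EXAMPLE_TAG_TUNNELED;
}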
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/addr.c | 3 +-- drivers/infiniband/core/sa.h | 10 ++++------ include/rdma/ib_addr.h | 3 +-- include/rdma/ib_sa.h | 36 +++++++++++++++--------------------- 4 files changed, 21 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 07e0ffe74a8a..b6f7cde36c2d 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -662,8 +662,7 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), - bool resolve_by_gid_attr, - void *context) + bool resolve_by_gid_attr, void *context) { struct sockaddr *src_in, *dst_in; struct addr_req *req; diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h index b1d4bbf4ce5c..57d4496f6720 100644 --- a/drivers/infiniband/core/sa.h +++ b/drivers/infiniband/core/sa.h @@ -49,16 +49,14 @@ static inline void ib_sa_client_put(struct ib_sa_client *client) } int ib_sa_mcmember_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - u8 method, + struct ib_device *device, u8 port_num, u8 method, struct ib_sa_mcmember_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + ib_sa_comp_mask comp_mask, int timeout_ms, + gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), - void *context, - struct ib_sa_query **sa_query); + void *context, struct ib_sa_query **sa_query); int mcast_init(void); void mcast_cleanup(void); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index e09eca91eb18..eebbe63b530c 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -102,8 +102,7 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), - bool resolve_by_gid_attr, - void *context); + bool resolve_by_gid_attr, void *context); void rdma_addr_cancel(struct rdma_dev_addr *addr); diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index b6ddf2a1b9d8..95ce625a49e3 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -449,28 +449,23 @@ struct ib_sa_query; void ib_sa_cancel_query(int id, struct ib_sa_query *query); -int ib_sa_path_rec_get(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct sa_path_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct sa_path_rec *resp, +int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, + u8 port_num, struct sa_path_rec *rec, + ib_sa_comp_mask comp_mask, int timeout_ms, + gfp_t gfp_mask, + void (*callback)(int status, struct sa_path_rec *resp, void *context), - void *context, - struct ib_sa_query **query); + void *context, struct ib_sa_query **query); int ib_sa_service_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - u8 method, - struct ib_sa_service_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_service_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query); + struct ib_device *device, u8 port_num, u8 method, + struct ib_sa_service_rec *rec, + ib_sa_comp_mask 
comp_mask, int timeout_ms, + gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_service_rec *resp, + void *context), + void *context, struct ib_sa_query **sa_query); struct ib_sa_multicast { struct ib_sa_mcmember_rec rec; @@ -577,8 +572,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), - void *context, - struct ib_sa_query **sa_query); + void *context, struct ib_sa_query **sa_query); bool ib_sa_sendonly_fullmem_support(struct ib_sa_client *client, struct ib_device *device, -- cgit v1.2.3 From dbace111e5b320682eee63d7173959a2b2bd9ccb Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 11 Oct 2018 17:30:05 +0300 Subject: RDMA/core: Annotate timeout as unsigned long The ucma users supply timeout in u32 format, it means that any number with most significant bit set will be converted to negative value by various rdma_*, cma_* and sa_query functions, which treat timeout as int. In the lowest level, the timeout is converted back to be unsigned long. Remove this ambiguous conversion by updating all function signatures to receive unsigned long. Reported-by: Noa Osherovich Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/cma.c | 11 ++++++----- drivers/infiniband/core/mad.c | 2 +- drivers/infiniband/core/mad_priv.h | 2 +- drivers/infiniband/core/sa.h | 4 ++-- drivers/infiniband/core/sa_query.c | 13 +++++++------ include/rdma/ib_addr.h | 2 +- include/rdma/ib_cm.h | 2 +- include/rdma/ib_sa.h | 6 +++--- include/rdma/rdma_cm.h | 5 +++-- 10 files changed, 26 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index b6f7cde36c2d..0dce94e3c495 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -659,7 +659,7 @@ static void process_one_req(struct work_struct *_work) } int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr, int timeout_ms, + struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), bool resolve_by_gid_attr, void *context) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 1156cb911a5c..15d5bb7bf6bb 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2510,8 +2510,8 @@ static void cma_query_handler(int status, struct sa_path_rec *path_rec, queue_work(cma_wq, &work->work); } -static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, - struct cma_work *work) +static int cma_query_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sa_path_rec path_rec; @@ -2629,7 +2629,8 @@ static void cma_init_resolve_addr_work(struct cma_work *work, work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; } -static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) +static int cma_resolve_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; @@ -2852,7 +2853,7 @@ err1: return ret; } -int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) +int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; @@ -3072,7 +3073,7 
@@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, int timeout_ms) + const struct sockaddr *dst_addr, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index c355379e7534..d7025cd5be28 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2414,7 +2414,7 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) } void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, - int timeout_ms) + unsigned long timeout_ms) { mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); wait_for_response(mad_send_wr); diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index d84ae1671898..216509036aa8 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -221,6 +221,6 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr); void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, - int timeout_ms); + unsigned long timeout_ms); #endif /* __IB_MAD_PRIV_H__ */ diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h index 57d4496f6720..cbaaaa92fff3 100644 --- a/drivers/infiniband/core/sa.h +++ b/drivers/infiniband/core/sa.h @@ -51,8 +51,8 @@ static inline void ib_sa_client_put(struct ib_sa_client *client) int ib_sa_mcmember_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_mcmember_rec *rec, - ib_sa_comp_mask comp_mask, int timeout_ms, - gfp_t gfp_mask, + ib_sa_comp_mask comp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index f28f6fdb78cb..be5ba5e15496 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1360,7 +1360,8 @@ static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent) spin_unlock_irqrestore(&tid_lock, flags); } -static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) +static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms, + gfp_t gfp_mask) { bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; @@ -1550,7 +1551,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct sa_path_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, void *context), @@ -1704,7 +1705,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_service_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_service_rec *resp, void *context), @@ -1801,7 +1802,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), @@ -1892,7 +1893,7 @@ int ib_sa_guid_info_rec_query(struct 
ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), @@ -2059,7 +2060,7 @@ static void ib_sa_classport_info_rec_release(struct ib_sa_query *sa_query) } static int ib_sa_classport_info_rec_query(struct ib_sa_port *port, - int timeout_ms, + unsigned long timeout_ms, void (*callback)(void *context), void *context, struct ib_sa_query **sa_query) diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index eebbe63b530c..2734c895c1bf 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -99,7 +99,7 @@ int rdma_translate_ip(const struct sockaddr *addr, * @context: User-specified context associated with the call. */ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr, int timeout_ms, + struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), bool resolve_by_gid_attr, void *context); diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index c10f4b5ea8ab..49f4f75499b3 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -583,7 +583,7 @@ struct ib_cm_sidr_req_param { struct sa_path_rec *path; const struct ib_gid_attr *sgid_attr; __be64 service_id; - int timeout_ms; + unsigned long timeout_ms; const void *private_data; u8 private_data_len; u8 max_cm_retries; diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 95ce625a49e3..19520979b84c 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -451,7 +451,7 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query); int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct sa_path_rec *rec, - ib_sa_comp_mask comp_mask, int timeout_ms, + ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, void *context), @@ -460,7 +460,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, int ib_sa_service_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_service_rec *rec, - ib_sa_comp_mask comp_mask, int timeout_ms, + ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_service_rec *resp, @@ -568,7 +568,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 53d93c7d8e01..60987a5903b7 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -196,7 +196,8 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr); * @timeout_ms: Time to wait for resolution to complete. 
*/ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, int timeout_ms); + const struct sockaddr *dst_addr, + unsigned long timeout_ms); /** * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier @@ -206,7 +207,7 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, * Users must have first called rdma_resolve_addr to resolve a dst_addr * into an RDMA address before calling this routine. */ -int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); +int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms); /** * rdma_create_qp - Allocate a QP and associate it with the specified RDMA -- cgit v1.2.3 From 05d940d3a3ec4e6d5d6a726aae4d73c5c64603c6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 10 Oct 2018 09:19:12 +0300 Subject: RDMA/nldev: Allow IB device rename through RDMA netlink Provide an option to rename IB device name through RDMA netlink and limit it to users with ADMIN capability only. Signed-off-by: Leon Romanovsky Reviewed-by: Parav Pandit Signed-off-by: Doug Ledford --- drivers/infiniband/core/nldev.c | 34 ++++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 3 ++- 2 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index ba5403fbcd88..573399e3ccc1 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -646,6 +646,36 @@ err: return err; } +static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) { + char name[IB_DEVICE_NAME_MAX] = {}; + + nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + IB_DEVICE_NAME_MAX); + err = ib_device_rename(device, name); + } + + put_device(&device->dev); + return err; +} + static int _nldev_get_dumpit(struct ib_device *device, struct sk_buff *skb, struct netlink_callback *cb, @@ -1078,6 +1108,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_get_doit, .dump = nldev_get_dumpit, }, + [RDMA_NLDEV_CMD_SET] = { + .doit = nldev_set_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NLDEV_CMD_PORT_GET] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index edba6351ac13..f9c41bf59efc 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -227,8 +227,9 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_UNSPEC, RDMA_NLDEV_CMD_GET, /* can dump */ + RDMA_NLDEV_CMD_SET, - /* 2 - 4 are free to use */ + /* 3 - 4 are free to use */ RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */ -- cgit v1.2.3 From 1ae4cfa03902c83d1d77123e5ac8f0812c61b90e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 7 Oct 2018 12:12:41 +0300 Subject: RDMA/core: Rename ports_parent to ports_kobj Normally kobj objects have kobj suffix to reflect it. Rename ports_parent to ports_kobj. 
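For the RDMA_NLDEV_CMD_SET rename command introduced above, a minimal userspace sketch could look like the following. This assumes libnl-3 and installed kernel uapi headers, trims error handling, and the helper name example_rename_ib_device() is hypothetical. The operation requires CAP_NET_ADMIN, matching the RDMA_NL_ADMIN_PERM flag on the kernel side; sufficiently new iproute2 releases expose the equivalent as "rdma dev set <dev> name <newname>".

#include <errno.h>
#include <stdint.h>
#include <linux/netlink.h>
#include <netlink/netlink.h>
#include <netlink/socket.h>
#include <netlink/msg.h>
#include <netlink/attr.h>
#include <rdma/rdma_netlink.h>

static int example_rename_ib_device(uint32_t dev_index, const char *new_name)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg = nlmsg_alloc();
	int err;

	if (!sk || !msg)
		return -ENOMEM;

	err = nl_connect(sk, NETLINK_RDMA);
	if (err)
		goto out;

	/* RDMA_NLDEV_CMD_SET carrying the device index and the new name */
	nlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
		  RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_SET),
		  0, NLM_F_REQUEST | NLM_F_ACK);
	nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, dev_index);
	nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, new_name);

	err = nl_send_auto(sk, msg);
	if (err >= 0)
		err = nl_wait_for_ack(sk);
out:
	nlmsg_free(msg);
	nl_socket_free(sk);
	return err < 0 ? err : 0;
}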
Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/sysfs.c | 9 ++++----- include/rdma/ib_verbs.h | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 107c8ba2046c..f54f107ef668 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1036,7 +1036,7 @@ static int add_port(struct ib_device *device, int port_num, p->port_num = port_num; ret = kobject_init_and_add(&p->kobj, &port_type, - device->ports_parent, + device->ports_kobj, "%d", port_num); if (ret) { kfree(p); @@ -1305,7 +1305,7 @@ static void free_port_list_attributes(struct ib_device *device) kobject_put(p); } - kobject_put(device->ports_parent); + kobject_put(device->ports_kobj); } int ib_device_register_sysfs(struct ib_device *device, @@ -1323,9 +1323,8 @@ int ib_device_register_sysfs(struct ib_device *device, if (ret) goto err; - device->ports_parent = kobject_create_and_add("ports", - &class_dev->kobj); - if (!device->ports_parent) { + device->ports_kobj = kobject_create_and_add("ports", &class_dev->kobj); + if (!device->ports_kobj) { ret = -ENOMEM; goto err_put; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 7ce617d77f8f..7d732cf87886 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2542,7 +2542,7 @@ struct ib_device { /* First group for device attributes, NULL terminated array */ const struct attribute_group *groups[2]; - struct kobject *ports_parent; + struct kobject *ports_kobj; struct list_head port_list; enum { -- cgit v1.2.3 From d4122f5abef844112799d2056fdc7bbedbc913f3 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 11 Oct 2018 22:31:53 +0300 Subject: RDMA/core: Allow existing drivers to set one sysfs group per device Currently many rdma drivers are creating device attribute files using device_create_file() with device specific attributes. Device specific attributes should be exposed via well defined netlink device attributes in future. Introduce an API rdma_set_device_sysfs_group() for existing drivers to set a group for sysfs attributes for legacy. This API is only for exposing legacy attributes which existed for sometime now. New drivers should not be using this API and rather follow netlink path. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 7d732cf87886..b17eea0373cb 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2539,8 +2539,11 @@ struct ib_device { struct module *owner; struct device dev; - /* First group for device attributes, NULL terminated array */ - const struct attribute_group *groups[2]; + /* First group for device attributes, + * Second group for driver provided attributes (optional). + * It is NULL terminated array. + */ + const struct attribute_group *groups[3]; struct kobject *ports_kobj; struct list_head port_list; @@ -4191,4 +4194,27 @@ struct ib_ucontext *ib_uverbs_get_ucontext(struct ib_uverbs_file *ufile); int uverbs_destroy_def_handler(struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs); + +/** + * rdma_set_device_sysfs_group - Set device attributes group to have + * driver specific sysfs entries at + * for infiniband class. 
+ * + * @device: device pointer for which attributes to be created + * @group: Pointer to group which should be added when device + * is registered with sysfs. + * rdma_set_device_sysfs_group() allows existing drivers to expose one + * group per device to have sysfs attributes. + * + * NOTE: New drivers should not make use of this API; instead new device + * parameter should be exposed via netlink command. This API and mechanism + * exist only for existing drivers. + */ +static inline void +rdma_set_device_sysfs_group(struct ib_device *dev, + const struct attribute_group *group) +{ + dev->groups[1] = group; +} + #endif /* IB_VERBS_H */ -- cgit v1.2.3 From 6f4bc0ea682b59d7013cbc5ced2d4dd73067a33f Mon Sep 17 00:00:00 2001 From: Yonatan Cohen Date: Tue, 9 Oct 2018 12:05:15 +0300 Subject: IB/mlx5: Allow scatter to CQE without global signaled WRs Requester scatter to CQE is restricted to QPs configured to signal all WRs. This patch adds ability to enable scatter to cqe (force enable) in the requester without sig_all, for users who do not want all WRs signaled but rather just the ones whose data found in the CQE. Signed-off-by: Yonatan Cohen Reviewed-by: Guy Levi Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/qp.c | 14 +++++++++++--- include/uapi/rdma/mlx5-abi.h | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 5b1811be6677..368728e6f980 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1706,15 +1706,20 @@ static void configure_responder_scat_cqe(struct ib_qp_init_attr *init_attr, static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *init_attr, + struct mlx5_ib_create_qp *ucmd, void *qpc) { enum ib_qp_type qpt = init_attr->qp_type; int scqe_sz; + bool allow_scat_cqe = 0; if (qpt == IB_QPT_UC || qpt == IB_QPT_UD) return; - if (init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) + if (ucmd) + allow_scat_cqe = ucmd->flags & MLX5_QP_FLAG_ALLOW_SCATTER_CQE; + + if (!allow_scat_cqe && init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) return; scqe_sz = mlx5_ib_get_cqe_size(init_attr->send_cq); @@ -1836,7 +1841,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_QP_FLAG_TUNNEL_OFFLOADS | MLX5_QP_FLAG_BFREG_INDEX | MLX5_QP_FLAG_TYPE_DCT | - MLX5_QP_FLAG_TYPE_DCI)) + MLX5_QP_FLAG_TYPE_DCI | + MLX5_QP_FLAG_ALLOW_SCATTER_CQE)) return -EINVAL; err = get_qp_user_index(to_mucontext(pd->uobject->context), @@ -1971,7 +1977,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (qp->scat_cqe && is_connected(init_attr->qp_type)) { configure_responder_scat_cqe(init_attr, qpc); - configure_requester_scat_cqe(dev, init_attr, qpc); + configure_requester_scat_cqe(dev, init_attr, + (pd && pd->uobject) ? 
&ucmd : NULL, + qpc); } if (qp->rq.wqe_cnt) { diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 6056625237cf..8fa9f90e2bb1 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -47,6 +47,7 @@ enum { MLX5_QP_FLAG_TYPE_DCI = 1 << 5, MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC = 1 << 6, MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC = 1 << 7, + MLX5_QP_FLAG_ALLOW_SCATTER_CQE = 1 << 8, }; enum { -- cgit v1.2.3 From a60109dc9a954ef9eddba6577e2d2e9e7952e487 Mon Sep 17 00:00:00 2001 From: Yonatan Cohen Date: Wed, 10 Oct 2018 09:25:16 +0300 Subject: IB/mlx5: Add support for extended atomic operations The extended atomic operations cmp&swp and fetch&add are a Mellanox feature extending the standard atomic operations to use varied operand sizes, as opposed to normal atomic operations that use an 8-byte operand only. Extended atomics also allow masking the results and arguments. This patch configures the QP to support extended atomic operations with the maximum size possible, as exposed by the HCA capabilities. Signed-off-by: Yonatan Cohen Reviewed-by: Guy Levi Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/qp.c | 96 +++++++++++++++++++++++++++++++++++------ include/linux/mlx5/driver.h | 23 +++++----- 2 files changed, 95 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 368728e6f980..6841c0f9237f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1733,6 +1733,53 @@ static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev, MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE); } +static int atomic_size_to_mode(int size_mask) +{ + /* driver does not support atomic_size > 256B + * and does not know how to translate bigger sizes + */ + int supported_size_mask = size_mask & 0x1ff; + int log_max_size; + + if (!supported_size_mask) + return -EOPNOTSUPP; + + log_max_size = __fls(supported_size_mask); + + if (log_max_size > 3) + return log_max_size; + + return MLX5_ATOMIC_MODE_8B; +} + +static int get_atomic_mode(struct mlx5_ib_dev *dev, + enum ib_qp_type qp_type) +{ + u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); + u8 atomic = MLX5_CAP_GEN(dev->mdev, atomic); + int atomic_mode = -EOPNOTSUPP; + int atomic_size_mask; + + if (!atomic) + return -EOPNOTSUPP; + + if (qp_type == MLX5_IB_QPT_DCT) + atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + else + atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + + if ((atomic_operations & MLX5_ATOMIC_OPS_EXTENDED_CMP_SWAP) || + (atomic_operations & MLX5_ATOMIC_OPS_EXTENDED_FETCH_ADD)) + atomic_mode = atomic_size_to_mode(atomic_size_mask); + + if (atomic_mode <= 0 && + (atomic_operations & MLX5_ATOMIC_OPS_CMP_SWAP && + atomic_operations & MLX5_ATOMIC_OPS_FETCH_ADD)) + atomic_mode = MLX5_ATOMIC_MODE_IB_COMP; + + return atomic_mode; +} + static inline bool check_flags_mask(uint64_t input, uint64_t supported) { return (input & ~supported) == 0; @@ -2562,13 +2609,15 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp) return 0; } -static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr, - int attr_mask) +static int to_mlx5_access_flags(struct mlx5_ib_qp *qp, + const struct ib_qp_attr *attr, + int attr_mask, __be32 *hw_access_flags) { - u32 hw_access_flags = 0; u8 dest_rd_atomic; u32 access_flags; + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) dest_rd_atomic = 
attr->max_dest_rd_atomic; else @@ -2583,13 +2632,25 @@ static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_att access_flags &= IB_ACCESS_REMOTE_WRITE; if (access_flags & IB_ACCESS_REMOTE_READ) - hw_access_flags |= MLX5_QP_BIT_RRE; - if (access_flags & IB_ACCESS_REMOTE_ATOMIC) - hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX); + *hw_access_flags |= MLX5_QP_BIT_RRE; + if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) && + qp->ibqp.qp_type == IB_QPT_RC) { + int atomic_mode; + + atomic_mode = get_atomic_mode(dev, qp->ibqp.qp_type); + if (atomic_mode < 0) + return -EOPNOTSUPP; + + *hw_access_flags |= MLX5_QP_BIT_RAE; + *hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFFSET; + } + if (access_flags & IB_ACCESS_REMOTE_WRITE) - hw_access_flags |= MLX5_QP_BIT_RWE; + *hw_access_flags |= MLX5_QP_BIT_RWE; + + *hw_access_flags = cpu_to_be32(*hw_access_flags); - return cpu_to_be32(hw_access_flags); + return 0; } enum { @@ -3287,8 +3348,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); } - if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) - context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask); + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { + __be32 access_flags = 0; + + err = to_mlx5_access_flags(qp, attr, attr_mask, &access_flags); + if (err) + goto out; + + context->params2 |= access_flags; + } if (attr_mask & IB_QP_MIN_RNR_TIMER) context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); @@ -3504,10 +3572,14 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) MLX5_SET(dctc, dctc, rwe, 1); if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) { - if (!mlx5_ib_dc_atomic_is_supported(dev)) + int atomic_mode; + + atomic_mode = get_atomic_mode(dev, MLX5_IB_QPT_DCT); + if (atomic_mode < 0) return -EOPNOTSUPP; + + MLX5_SET(dctc, dctc, atomic_mode, atomic_mode); MLX5_SET(dctc, dctc, rae, 1); - MLX5_SET(dctc, dctc, atomic_mode, MLX5_ATOMIC_MODE_DCT_CX); } MLX5_SET(dctc, dctc, pkey_index, attr->pkey_index); MLX5_SET(dctc, dctc, port, attr->port_num); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8fb072aa8671..a73c701edd16 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -97,14 +97,15 @@ enum { }; enum { - MLX5_ATOMIC_MODE_IB_COMP = 1 << 16, - MLX5_ATOMIC_MODE_CX = 2 << 16, - MLX5_ATOMIC_MODE_8B = 3 << 16, - MLX5_ATOMIC_MODE_16B = 4 << 16, - MLX5_ATOMIC_MODE_32B = 5 << 16, - MLX5_ATOMIC_MODE_64B = 6 << 16, - MLX5_ATOMIC_MODE_128B = 7 << 16, - MLX5_ATOMIC_MODE_256B = 8 << 16, + MLX5_ATOMIC_MODE_OFFSET = 16, + MLX5_ATOMIC_MODE_IB_COMP = 1, + MLX5_ATOMIC_MODE_CX = 2, + MLX5_ATOMIC_MODE_8B = 3, + MLX5_ATOMIC_MODE_16B = 4, + MLX5_ATOMIC_MODE_32B = 5, + MLX5_ATOMIC_MODE_64B = 6, + MLX5_ATOMIC_MODE_128B = 7, + MLX5_ATOMIC_MODE_256B = 8, }; enum { @@ -162,13 +163,11 @@ enum mlx5_dcbx_oper_mode { MLX5E_DCBX_PARAM_VER_OPER_AUTO = 0x3, }; -enum mlx5_dct_atomic_mode { - MLX5_ATOMIC_MODE_DCT_CX = 2, -}; - enum { MLX5_ATOMIC_OPS_CMP_SWAP = 1 << 0, MLX5_ATOMIC_OPS_FETCH_ADD = 1 << 1, + MLX5_ATOMIC_OPS_EXTENDED_CMP_SWAP = 1 << 2, + MLX5_ATOMIC_OPS_EXTENDED_FETCH_ADD = 1 << 3, }; enum mlx5_page_fault_resume_flags { -- cgit v1.2.3
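To make the size-mask mapping above concrete, here is a small, illustrative userspace reimplementation of the atomic_size_to_mode() logic; example_atomic_size_to_mode() and the use of __builtin_clz() in place of the kernel's __fls() are assumptions for illustration. Bit n of the firmware's atomic_size_qp/atomic_size_dc mask is read here as "a 2^n byte operand is supported", and the QP is programmed with the largest supported mode, capped at 256B.

#include <stdio.h>

static int example_atomic_size_to_mode(int size_mask)
{
	int supported = size_mask & 0x1ff;	/* cap at 256B, as in the patch */
	int log_max_size;

	if (!supported)
		return -1;			/* stands in for -EOPNOTSUPP */

	/* userspace stand-in for the kernel's __fls() */
	log_max_size = 31 - __builtin_clz((unsigned int)supported);

	/* modes 3..8 correspond to MLX5_ATOMIC_MODE_8B..MLX5_ATOMIC_MODE_256B */
	return log_max_size > 3 ? log_max_size : 3;
}

int main(void)
{
	/* bits 3 and 8 set: 8B and 256B supported -> 8 (MLX5_ATOMIC_MODE_256B) */
	printf("mask 0x108 -> mode %d\n", example_atomic_size_to_mode(0x108));
	/* only bit 3 set: 8B supported -> 3 (MLX5_ATOMIC_MODE_8B) */
	printf("mask 0x008 -> mode %d\n", example_atomic_size_to_mode(0x008));
	return 0;
}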