From 4b9d07a44015a0e940448fa3885b894349e8b162 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 9 Jan 2017 16:55:12 +0100 Subject: net: introduce keepalive function in struct proto Direct call of tcp_set_keepalive() function from protocol-agnostic sock_setsockopt() function in net/core/sock.c violates network layering. And newly introduced protocol (SMC-R) will need its own keepalive function. Therefore, add "keepalive" function pointer to "struct proto", and call it from sock_setsockopt() via this pointer. Signed-off-by: Ursula Braun Reviewed-by: Utz Bacher Signed-off-by: David S. Miller --- include/net/sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index f0e867f58722..99deda67eba0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1024,6 +1024,7 @@ struct proto { int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option); + void (*keepalive)(struct sock *sk, int valbool); #ifdef CONFIG_COMPAT int (*compat_setsockopt)(struct sock *sk, int level, -- cgit v1.2.3 From f16a7dd5cf27eeda187425c9c7d96802a549f9c4 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 9 Jan 2017 16:55:26 +0100 Subject: smc: netlink interface for SMC sockets Support for SMC socket monitoring via netlink sockets of protocol NETLINK_SOCK_DIAG. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- include/net/smc.h | 20 ++++ include/net/sock.h | 3 + include/uapi/linux/netlink.h | 1 + include/uapi/linux/smc_diag.h | 85 +++++++++++++++++ net/smc/Kconfig | 9 ++ net/smc/Makefile | 1 + net/smc/af_smc.c | 43 ++++++++- net/smc/smc.h | 2 + net/smc/smc_close.c | 1 + net/smc/smc_diag.c | 215 ++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 379 insertions(+), 1 deletion(-) create mode 100644 include/net/smc.h create mode 100644 include/uapi/linux/smc_diag.h create mode 100644 net/smc/smc_diag.c (limited to 'include/net/sock.h') diff --git a/include/net/smc.h b/include/net/smc.h new file mode 100644 index 000000000000..12d26358ad9f --- /dev/null +++ b/include/net/smc.h @@ -0,0 +1,20 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for the SMC module (socket related) + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun + */ +#ifndef _SMC_H +#define _SMC_H + +struct smc_hashinfo { + rwlock_t lock; + struct hlist_head ht; +}; + +int smc_hash_sk(struct sock *sk); +void smc_unhash_sk(struct sock *sk); +#endif /* _SMC_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 99deda67eba0..389a0a619b45 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -70,6 +70,7 @@ #include #include #include +#include /* * This structure really needs to be cleaned up. @@ -986,6 +987,7 @@ struct request_sock_ops; struct timewait_sock_ops; struct inet_hashinfo; struct raw_hashinfo; +struct smc_hashinfo; struct module; /* @@ -1094,6 +1096,7 @@ struct proto { struct inet_hashinfo *hashinfo; struct udp_table *udp_table; struct raw_hashinfo *raw_hash; + struct smc_hashinfo *smc_hash; } h; struct module *owner; diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 0dba4e4ed2be..f3946a27bd07 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -27,6 +27,7 @@ #define NETLINK_ECRYPTFS 19 #define NETLINK_RDMA 20 #define NETLINK_CRYPTO 21 /* Crypto layer */ +#define NETLINK_SMC 22 /* SMC monitoring */ #define NETLINK_INET_DIAG NETLINK_SOCK_DIAG diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h new file mode 100644 index 000000000000..0063919fea34 --- /dev/null +++ b/include/uapi/linux/smc_diag.h @@ -0,0 +1,85 @@ +#ifndef _UAPI_SMC_DIAG_H_ +#define _UAPI_SMC_DIAG_H_ + +#include +#include +#include + +/* Request structure */ +struct smc_diag_req { + __u8 diag_family; + __u8 pad[2]; + __u8 diag_ext; /* Query extended information */ + struct inet_diag_sockid id; +}; + +/* Base info structure. It contains socket identity (addrs/ports/cookie) based + * on the internal clcsock, and more SMC-related socket data + */ +struct smc_diag_msg { + __u8 diag_family; + __u8 diag_state; + __u8 diag_fallback; + __u8 diag_shutdown; + struct inet_diag_sockid id; + + __u32 diag_uid; + __u64 diag_inode; +}; + +/* Extensions */ + +enum { + SMC_DIAG_NONE, + SMC_DIAG_CONNINFO, + SMC_DIAG_LGRINFO, + SMC_DIAG_SHUTDOWN, + __SMC_DIAG_MAX, +}; + +#define SMC_DIAG_MAX (__SMC_DIAG_MAX - 1) + +/* SMC_DIAG_CONNINFO */ + +struct smc_diag_cursor { + __u16 reserved; + __u16 wrap; + __u32 count; +}; + +struct smc_diag_conninfo { + __u32 token; /* unique connection id */ + __u32 sndbuf_size; /* size of send buffer */ + __u32 rmbe_size; /* size of RMB element */ + __u32 peer_rmbe_size; /* size of peer RMB element */ + /* local RMB element cursors */ + struct smc_diag_cursor rx_prod; /* received producer cursor */ + struct smc_diag_cursor rx_cons; /* received consumer cursor */ + /* peer RMB element cursors */ + struct smc_diag_cursor tx_prod; /* sent producer cursor */ + struct smc_diag_cursor tx_cons; /* sent consumer cursor */ + __u8 rx_prod_flags; /* received producer flags */ + __u8 rx_conn_state_flags; /* recvd connection flags*/ + __u8 tx_prod_flags; /* sent producer flags */ + __u8 tx_conn_state_flags; /* sent connection flags*/ + /* send buffer cursors */ + struct smc_diag_cursor tx_prep; /* prepared to be sent cursor */ + struct smc_diag_cursor tx_sent; /* sent cursor */ + struct smc_diag_cursor tx_fin; /* confirmed sent cursor */ +}; + +/* SMC_DIAG_LINKINFO */ + +struct smc_diag_linkinfo { + __u8 link_id; /* link identifier */ + __u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */ + __u8 ibport; /* RDMA device port number */ + __u8 gid[40]; /* local GID */ + __u8 peer_gid[40]; /* peer GID */ +}; + +struct smc_diag_lgrinfo { + struct smc_diag_linkinfo lnk[1]; + __u8 role; +}; +#endif /* _UAPI_SMC_DIAG_H_ */ diff --git a/net/smc/Kconfig b/net/smc/Kconfig index bc029803e728..c717ef0896aa 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -9,3 +9,12 @@ config SMC a separate socket family SMC. Select this option if you want to run SMC socket applications + +config SMC_DIAG + tristate "SMC: socket monitoring interface" + depends on SMC + ---help--- + Support for SMC socket monitoring interface used by tools such as + smcss. + + if unsure, say Y. diff --git a/net/smc/Makefile b/net/smc/Makefile index 5cf0cafaa208..188104654b54 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_SMC) += smc.o +obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3f543d58bc5c..5d4208ad029e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "smc.h" #include "smc_clc.h" @@ -59,13 +60,48 @@ static void smc_set_keepalive(struct sock *sk, int val) smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); } -static struct proto smc_proto = { +static struct smc_hashinfo smc_v4_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), +}; + +int smc_hash_sk(struct sock *sk) +{ + struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; + struct hlist_head *head; + + head = &h->ht; + + write_lock_bh(&h->lock); + sk_add_node(sk, head); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + write_unlock_bh(&h->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(smc_hash_sk); + +void smc_unhash_sk(struct sock *sk) +{ + struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; + + write_lock_bh(&h->lock); + if (sk_del_node_init(sk)) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + write_unlock_bh(&h->lock); +} +EXPORT_SYMBOL_GPL(smc_unhash_sk); + +struct proto smc_proto = { .name = "SMC", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_DESTROY_BY_RCU, }; +EXPORT_SYMBOL_GPL(smc_proto); static int smc_release(struct socket *sock) { @@ -109,6 +145,7 @@ static int smc_release(struct socket *sock) schedule_delayed_work(&smc->sock_put_work, SMC_CLOSE_SOCK_PUT_DELAY); } + sk->sk_prot->unhash(sk); release_sock(sk); sock_put(sk); @@ -144,6 +181,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work); + sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); return sk; @@ -536,6 +574,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) lsmc->sk.sk_err = -rc; new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); + sk->sk_prot->unhash(new_sk); sock_put(new_sk); *new_smc = NULL; goto out; @@ -545,6 +584,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); + sk->sk_prot->unhash(new_sk); sock_put(new_sk); *new_smc = NULL; goto out; @@ -1320,6 +1360,7 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto; } + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { diff --git a/net/smc/smc.h b/net/smc/smc.h index 959a5d2014ab..ee5fbea24549 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,6 +21,8 @@ #define SMC_MAX_PORTS 2 /* Max # of ports */ +extern struct proto smc_proto; + #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index d70c05b57021..03dfcc6b7661 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -384,6 +384,7 @@ void smc_close_sock_put_work(struct work_struct *work) struct smc_sock, sock_put_work); + smc->sk.sk_prot->unhash(&smc->sk); sock_put(&smc->sk); } diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c new file mode 100644 index 000000000000..d2d01cf70224 --- /dev/null +++ b/net/smc/smc_diag.c @@ -0,0 +1,215 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Monitoring SMC transport protocol sockets + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "smc.h" +#include "smc_core.h" + +static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) +{ + sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", + be16_to_cpu(((__be16 *)gid_raw)[0]), + be16_to_cpu(((__be16 *)gid_raw)[1]), + be16_to_cpu(((__be16 *)gid_raw)[2]), + be16_to_cpu(((__be16 *)gid_raw)[3]), + be16_to_cpu(((__be16 *)gid_raw)[4]), + be16_to_cpu(((__be16 *)gid_raw)[5]), + be16_to_cpu(((__be16 *)gid_raw)[6]), + be16_to_cpu(((__be16 *)gid_raw)[7])); +} + +static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + r->diag_family = sk->sk_family; + if (!smc->clcsock) + return; + r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); + r->id.idiag_dport = smc->clcsock->sk->sk_dport; + r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; + sock_diag_save_cookie(sk, r->id.idiag_cookie); + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; + r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; +} + +static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, + struct smc_diag_msg *r, + struct user_namespace *user_ns) +{ + if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown)) + return 1; + + r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); + r->diag_inode = sock_i_ino(sk); + return 0; +} + +static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct smc_diag_req *req, + struct nlattr *bc) +{ + struct smc_sock *smc = smc_sk(sk); + struct user_namespace *user_ns; + struct smc_diag_msg *r; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + smc_diag_msg_common_fill(r, sk); + r->diag_state = sk->sk_state; + r->diag_fallback = smc->use_fallback; + user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk); + if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) + goto errout; + + if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) { + struct smc_connection *conn = &smc->conn; + struct smc_diag_conninfo cinfo = { + .token = conn->alert_token_local, + .sndbuf_size = conn->sndbuf_size, + .rmbe_size = conn->rmbe_size, + .peer_rmbe_size = conn->peer_rmbe_size, + + .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, + .rx_prod.count = conn->local_rx_ctrl.prod.count, + .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap, + .rx_cons.count = conn->local_rx_ctrl.cons.count, + + .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap, + .tx_prod.count = conn->local_tx_ctrl.prod.count, + .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap, + .tx_cons.count = conn->local_tx_ctrl.cons.count, + + .tx_prod_flags = + *(u8 *)&conn->local_tx_ctrl.prod_flags, + .tx_conn_state_flags = + *(u8 *)&conn->local_tx_ctrl.conn_state_flags, + .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags, + .rx_conn_state_flags = + *(u8 *)&conn->local_rx_ctrl.conn_state_flags, + + .tx_prep.wrap = conn->tx_curs_prep.wrap, + .tx_prep.count = conn->tx_curs_prep.count, + .tx_sent.wrap = conn->tx_curs_sent.wrap, + .tx_sent.count = conn->tx_curs_sent.count, + .tx_fin.wrap = conn->tx_curs_fin.wrap, + .tx_fin.count = conn->tx_curs_fin.count, + }; + + if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) + goto errout; + } + + if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) { + struct smc_diag_lgrinfo linfo = { + .role = smc->conn.lgr->role, + .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport, + .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id, + }; + + memcpy(linfo.lnk[0].ibname, + smc->conn.lgr->lnk[0].smcibdev->ibdev->name, + sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name)); + smc_gid_be16_convert(linfo.lnk[0].gid, + smc->conn.lgr->lnk[0].gid.raw); + smc_gid_be16_convert(linfo.lnk[0].peer_gid, + smc->conn.lgr->lnk[0].peer_gid); + + if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) + goto errout; + } + + nlmsg_end(skb, nlh); + return 0; + +errout: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *bc = NULL; + struct hlist_head *head; + struct sock *sk; + int rc = 0; + + read_lock(&smc_proto.h.smc_hash->lock); + head = &smc_proto.h.smc_hash->ht; + if (hlist_empty(head)) + goto out; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc) + break; + } + +out: + read_unlock(&smc_proto.h.smc_hash->lock); + return rc; +} + +static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ + struct net *net = sock_net(skb->sk); + + if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && + h->nlmsg_flags & NLM_F_DUMP) { + { + struct netlink_dump_control c = { + .dump = smc_diag_dump, + .min_dump_alloc = SKB_WITH_OVERHEAD(32768), + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); + } + } + return 0; +} + +static const struct sock_diag_handler smc_diag_handler = { + .family = AF_SMC, + .dump = smc_diag_handler_dump, +}; + +static int __init smc_diag_init(void) +{ + return sock_diag_register(&smc_diag_handler); +} + +static void __exit smc_diag_exit(void) +{ + sock_diag_unregister(&smc_diag_handler); +} + +module_init(smc_diag_init); +module_exit(smc_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); -- cgit v1.2.3 From 6c59ebd356ff2ca64cdf1f61c5fe17f6fa8fc045 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 20 Jan 2017 22:27:04 +0800 Subject: sock: use hlist_entry_safe Use hlist_entry_safe() instead of open-coding it. Signed-off-by: Geliang Tang Signed-off-by: David S. Miller --- include/net/sock.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 389a0a619b45..7144750d14e5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -544,8 +544,7 @@ static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) static inline struct sock *sk_next(const struct sock *sk) { - return sk->sk_node.next ? - hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL; + return hlist_entry_safe(sk->sk_node.next, struct sock, sk_node); } static inline struct sock *sk_nulls_next(const struct sock *sk) -- cgit v1.2.3 From 158f323b9868b59967ad96957c4ca388161be321 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Jan 2017 07:11:27 -0800 Subject: net: adjust skb->truesize in pskb_expand_head() Slava Shwartsman reported a warning in skb_try_coalesce(), when we detect skb->truesize is completely wrong. In his case, issue came from IPv6 reassembly coping with malicious datagrams, that forced various pskb_may_pull() to reallocate a bigger skb->head than the one allocated by NIC driver before entering GRO layer. Current code does not change skb->truesize, leaving this burden to callers if they care enough. Blindly changing skb->truesize in pskb_expand_head() is not easy, as some producers might track skb->truesize, for example in xmit path for back pressure feedback (sk->sk_wmem_alloc) We can detect the cases where it should be safe to change skb->truesize : 1) skb is not attached to a socket. 2) If it is attached to a socket, destructor is sock_edemux() My audit gave only two callers doing their own skb->truesize manipulation. I had to remove skb parameter in sock_edemux macro when CONFIG_INET is not set to avoid a compile error. Signed-off-by: Eric Dumazet Reported-by: Slava Shwartsman Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- net/core/skbuff.c | 14 +++++++++++--- net/netlink/af_netlink.c | 8 +++----- net/wireless/util.c | 2 -- 4 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 7144750d14e5..94e65fd70354 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1534,7 +1534,7 @@ void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); #else -#define sock_edemux(skb) sock_efree(skb) +#define sock_edemux sock_efree #endif int sock_setsockopt(struct socket *sock, int level, int op, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f8dbe4a7ab46..26c1344cc23e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1192,10 +1192,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone); int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask) { - int i; - u8 *data; - int size = nhead + skb_end_offset(skb) + ntail; + int i, osize = skb_end_offset(skb); + int size = osize + nhead + ntail; long off; + u8 *data; BUG_ON(nhead < 0); @@ -1257,6 +1257,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->hdr_len = 0; skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); + + /* It is not generally safe to change skb->truesize. + * For the moment, we really care of rx path, or + * when skb is orphaned (not attached to a socket). + */ + if (!skb->sk || skb->destructor == sock_edemux) + skb->truesize += size - osize; + return 0; nofrags: diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index edcc1e19ad53..7b73c7c161a9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1210,11 +1210,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) skb = nskb; } - if (!pskb_expand_head(skb, 0, -delta, - (allocation & ~__GFP_DIRECT_RECLAIM) | - __GFP_NOWARN | __GFP_NORETRY)) - skb->truesize -= delta; - + pskb_expand_head(skb, 0, -delta, + (allocation & ~__GFP_DIRECT_RECLAIM) | + __GFP_NOWARN | __GFP_NORETRY); return skb; } diff --git a/net/wireless/util.c b/net/wireless/util.c index 1b9296882dcd..68e5f2ecee1a 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -618,8 +618,6 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC)) return -ENOMEM; - - skb->truesize += head_need; } if (encaps_data) { -- cgit v1.2.3 From 9b8805a325591cf5b6b9df71200de25a2bd721fd Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 6 Feb 2017 23:14:11 +0200 Subject: sock: add sk_dst_pending_confirm flag Add new sock flag to allow sockets to confirm neighbour. When same struct dst_entry can be used for many different neighbours we can not use it for pending confirmations. As not all call paths lock the socket use full word for the flag. Add sk_dst_confirm as replacement for dst_confirm when called for received packets. Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 12 ++++++++++++ net/core/sock.c | 2 ++ 2 files changed, 14 insertions(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 94e65fd70354..85d856b94b4b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -240,6 +240,7 @@ struct sock_common { * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux * @sk_dst_cache: destination cache + * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed @@ -393,6 +394,8 @@ struct sock { struct sk_buff_head sk_write_queue; __s32 sk_peek_off; int sk_write_pending; + __u32 sk_dst_pending_confirm; + /* Note: 32bit hole on 64bit arches */ long sk_sndtimeo; struct timer_list sk_timer; __u32 sk_priority; @@ -1764,6 +1767,7 @@ static inline void dst_negative_advice(struct sock *sk) if (ndst != dst) { rcu_assign_pointer(sk->sk_dst_cache, ndst); sk_tx_queue_clear(sk); + sk->sk_dst_pending_confirm = 0; } } } @@ -1774,6 +1778,7 @@ __sk_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old_dst; sk_tx_queue_clear(sk); + sk->sk_dst_pending_confirm = 0; /* * This can be called while sk is owned by the caller only, * with no state that can be checked in a rcu_dereference_check() cond @@ -1789,6 +1794,7 @@ sk_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old_dst; sk_tx_queue_clear(sk); + sk->sk_dst_pending_confirm = 0; old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst); dst_release(old_dst); } @@ -1809,6 +1815,12 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); +static inline void sk_dst_confirm(struct sock *sk) +{ + if (!sk->sk_dst_pending_confirm) + sk->sk_dst_pending_confirm = 1; +} + bool sk_mc_loop(struct sock *sk); static inline bool sk_can_gso(const struct sock *sk) diff --git a/net/core/sock.c b/net/core/sock.c index 8b35debfe454..b74356535559 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -502,6 +502,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { sk_tx_queue_clear(sk); + sk->sk_dst_pending_confirm = 0; RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; @@ -1519,6 +1520,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) af_family_clock_key_strings[newsk->sk_family]); newsk->sk_dst_cache = NULL; + newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; atomic_set(&newsk->sk_drops, 0); -- cgit v1.2.3 From 4ff0620354f2b39b9fe2a91c22c4de9d1fba0c8e Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 6 Feb 2017 23:14:12 +0200 Subject: net: add dst_pending_confirm flag to skbuff Add new skbuff flag to allow protocols to confirm neighbour. When same struct dst_entry can be used for many different neighbours we can not use it for pending confirmations. Add sock_confirm_neigh() helper to confirm the neighbour and use it for IPv4, IPv6 and VRF before dst_neigh_output. Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/vrf.c | 5 ++++- include/linux/skbuff.h | 12 ++++++++++++ include/net/sock.h | 14 ++++++++++++++ net/ipv4/ip_output.c | 5 ++++- net/ipv6/ip6_output.c | 1 + 5 files changed, 35 insertions(+), 2 deletions(-) (limited to 'include/net/sock.h') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 264fc1585b3c..630eafdb79e8 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -378,6 +378,7 @@ static int vrf_finish_output6(struct net *net, struct sock *sk, if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { + sock_confirm_neigh(skb, neigh); ret = dst_neigh_output(dst, neigh, skb); rcu_read_unlock_bh(); return ret; @@ -574,8 +575,10 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); - if (!IS_ERR(neigh)) + if (!IS_ERR(neigh)) { + sock_confirm_neigh(skb, neigh); ret = dst_neigh_output(dst, neigh, skb); + } rcu_read_unlock_bh(); err: diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c6a78e1892b6..f1adddc1c5ac 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -612,6 +612,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1, * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS + * @dst_pending_confirm: need to confirm neighbour * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking * @mark: Generic packet mark @@ -741,6 +742,7 @@ struct sk_buff { __u8 csum_level:2; __u8 csum_bad:1; + __u8 dst_pending_confirm:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif @@ -3698,6 +3700,16 @@ static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) return skb->queue_mapping != 0; } +static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val) +{ + skb->dst_pending_confirm = val; +} + +static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb) +{ + return skb->dst_pending_confirm != 0; +} + static inline struct sec_path *skb_sec_path(struct sk_buff *skb) { #ifdef CONFIG_XFRM diff --git a/include/net/sock.h b/include/net/sock.h index 85d856b94b4b..6f83e78eaa5a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1821,6 +1821,20 @@ static inline void sk_dst_confirm(struct sock *sk) sk->sk_dst_pending_confirm = 1; } +static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) +{ + if (skb_get_dst_pending_confirm(skb)) { + struct sock *sk = skb->sk; + unsigned long now = jiffies; + + /* avoid dirtying neighbour */ + if (n->confirmed != now) + n->confirmed = now; + if (sk && sk->sk_dst_pending_confirm) + sk->sk_dst_pending_confirm = 0; + } +} + bool sk_mc_loop(struct sock *sk); static inline bool sk_can_gso(const struct sock *sk) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b67719f45953..c9fc32fa3272 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -222,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); if (!IS_ERR(neigh)) { - int res = dst_neigh_output(dst, neigh, skb); + int res; + + sock_confirm_neigh(skb, neigh); + res = dst_neigh_output(dst, neigh, skb); rcu_read_unlock_bh(); return res; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b6a94ff0bbd0..14d99fbf102e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -119,6 +119,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { + sock_confirm_neigh(skb, neigh); ret = dst_neigh_output(dst, neigh, skb); rcu_read_unlock_bh(); return ret; -- cgit v1.2.3