From 071c37983d99da07797294ea78e9da1a6e287144 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sun, 14 Jul 2019 23:36:11 +0200 Subject: net: neigh: fix multiple neigh timer scheduling Neigh timer can be scheduled multiple times from userspace adding multiple neigh entries and forcing the neigh timer scheduling passing NTF_USE in the netlink requests. This will result in a refcount leak and in the following dump stack: [ 32.465295] NEIGH: BUG, double timer add, state is 8 [ 32.465308] CPU: 0 PID: 416 Comm: double_timer_ad Not tainted 5.2.0+ #65 [ 32.465311] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-2.fc30 04/01/2014 [ 32.465313] Call Trace: [ 32.465318] dump_stack+0x7c/0xc0 [ 32.465323] __neigh_event_send+0x20c/0x880 [ 32.465326] ? ___neigh_create+0x846/0xfb0 [ 32.465329] ? neigh_lookup+0x2a9/0x410 [ 32.465332] ? neightbl_fill_info.constprop.0+0x800/0x800 [ 32.465334] neigh_add+0x4f8/0x5e0 [ 32.465337] ? neigh_xmit+0x620/0x620 [ 32.465341] ? find_held_lock+0x85/0xa0 [ 32.465345] rtnetlink_rcv_msg+0x204/0x570 [ 32.465348] ? rtnl_dellink+0x450/0x450 [ 32.465351] ? mark_held_locks+0x90/0x90 [ 32.465354] ? match_held_lock+0x1b/0x230 [ 32.465357] netlink_rcv_skb+0xc4/0x1d0 [ 32.465360] ? rtnl_dellink+0x450/0x450 [ 32.465363] ? netlink_ack+0x420/0x420 [ 32.465366] ? netlink_deliver_tap+0x115/0x560 [ 32.465369] ? __alloc_skb+0xc9/0x2f0 [ 32.465372] netlink_unicast+0x270/0x330 [ 32.465375] ? netlink_attachskb+0x2f0/0x2f0 [ 32.465378] netlink_sendmsg+0x34f/0x5a0 [ 32.465381] ? netlink_unicast+0x330/0x330 [ 32.465385] ? move_addr_to_kernel.part.0+0x20/0x20 [ 32.465388] ? netlink_unicast+0x330/0x330 [ 32.465391] sock_sendmsg+0x91/0xa0 [ 32.465394] ___sys_sendmsg+0x407/0x480 [ 32.465397] ? copy_msghdr_from_user+0x200/0x200 [ 32.465401] ? _raw_spin_unlock_irqrestore+0x37/0x40 [ 32.465404] ? lockdep_hardirqs_on+0x17d/0x250 [ 32.465407] ? __wake_up_common_lock+0xcb/0x110 [ 32.465410] ? __wake_up_common+0x230/0x230 [ 32.465413] ? netlink_bind+0x3e1/0x490 [ 32.465416] ? netlink_setsockopt+0x540/0x540 [ 32.465420] ? __fget_light+0x9c/0xf0 [ 32.465423] ? sockfd_lookup_light+0x8c/0xb0 [ 32.465426] __sys_sendmsg+0xa5/0x110 [ 32.465429] ? __ia32_sys_shutdown+0x30/0x30 [ 32.465432] ? __fd_install+0xe1/0x2c0 [ 32.465435] ? lockdep_hardirqs_off+0xb5/0x100 [ 32.465438] ? mark_held_locks+0x24/0x90 [ 32.465441] ? do_syscall_64+0xf/0x270 [ 32.465444] do_syscall_64+0x63/0x270 [ 32.465448] entry_SYSCALL_64_after_hwframe+0x49/0xbe Fix the issue unscheduling neigh_timer if selected entry is in 'IN_TIMER' receiving a netlink request with NTF_USE flag set Reported-by: Marek Majkowski Fixes: 0c5c2d308906 ("neigh: Allow for user space users of the neighbour table") Signed-off-by: Lorenzo Bianconi Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 742cea4ce72e..0dfc97bc8760 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1124,6 +1124,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); + neigh_del_timer(neigh); neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), @@ -1140,6 +1141,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) } } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is delayed\n", neigh); + neigh_del_timer(neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + -- cgit v1.2.3 From b43995469e5804636a55372e9bbb17ccb22441c5 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 15 Jul 2019 09:39:52 -0700 Subject: bpf: rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok Rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok to indicate that it can be used for both loads and stores. Cc: Yonghong Song Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 2 +- net/core/filter.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/include/linux/filter.h b/include/linux/filter.h index 6d944369ca87..ff65d22cf336 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -747,7 +747,7 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) return size <= size_default && (size & (size - 1)) == 0; } -#define bpf_ctx_wide_store_ok(off, size, type, field) \ +#define bpf_ctx_wide_access_ok(off, size, type, field) \ (size == sizeof(__u64) && \ off >= offsetof(type, field) && \ off + sizeof(__u64) <= offsetofend(type, field) && \ diff --git a/net/core/filter.c b/net/core/filter.c index 47f6386fb17a..c5983ddb1a9f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6890,14 +6890,14 @@ static bool sock_addr_is_valid_access(int off, int size, if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { - if (bpf_ctx_wide_store_ok(off, size, - struct bpf_sock_addr, - user_ip6)) + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + user_ip6)) return true; - if (bpf_ctx_wide_store_ok(off, size, - struct bpf_sock_addr, - msg_src_ip6)) + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) return true; if (size != size_default) -- cgit v1.2.3 From d4ecfeb15494ec261fef2d25d96eecba66f0b182 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 15 Jul 2019 09:39:53 -0700 Subject: bpf: allow wide aligned loads for bpf_sock_addr user_ip6 and msg_src_ip6 Add explicit check for u64 loads of user_ip6 and msg_src_ip6 and update the comment. Cc: Yonghong Song Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 4 ++-- net/core/filter.c | 12 +++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6f68438aa4ed..81be929b89fc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3248,7 +3248,7 @@ struct bpf_sock_addr { __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __u32 user_port; /* Allows 4-byte read and write. @@ -3260,7 +3260,7 @@ struct bpf_sock_addr { __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __bpf_md_ptr(struct bpf_sock *, sk); diff --git a/net/core/filter.c b/net/core/filter.c index c5983ddb1a9f..0f6854ccf894 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6884,9 +6884,19 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): - /* Only narrow read access allowed for now. */ if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + user_ip6)) + return true; + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) + return true; + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { -- cgit v1.2.3 From db8051f30fbab7f579d691137f1e23f3bb1ac2eb Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 16 Jul 2019 11:43:05 -0400 Subject: skbuff: fix compilation warnings in skb_dump() The commit 6413139dfc64 ("skbuff: increase verbosity when dumping skb data") introduced a few compilation warnings. net/core/skbuff.c:766:32: warning: format specifies type 'unsigned short' but the argument has type 'unsigned int' [-Wformat] level, sk->sk_family, sk->sk_type, sk->sk_protocol); ^~~~~~~~~~~ net/core/skbuff.c:766:45: warning: format specifies type 'unsigned short' but the argument has type 'unsigned int' [-Wformat] level, sk->sk_family, sk->sk_type, sk->sk_protocol); ^~~~~~~~~~~~~~~ Fix them by using the proper types. Fixes: 6413139dfc64 ("skbuff: increase verbosity when dumping skb data") Signed-off-by: Qian Cai Reviewed-by: Nathan Chancellor Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6f1e31f674a3..0338820ee0ec 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -762,7 +762,7 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) printk("%sdev name=%s feat=0x%pNF\n", level, dev->name, &dev->features); if (sk) - printk("%ssk family=%hu type=%hu proto=%hu\n", + printk("%ssk family=%hu type=%u proto=%u\n", level, sk->sk_family, sk->sk_type, sk->sk_protocol); if (full_pkt && headroom) -- cgit v1.2.3 From 8d650cdedaabb33e85e9b7c517c0c71fcecc1de9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Jul 2019 19:28:14 -0700 Subject: tcp: fix tcp_set_congestion_control() use from bpf hook Neal reported incorrect use of ns_capable() from bpf hook. bpf_setsockopt(...TCP_CONGESTION...) -> tcp_set_congestion_control() -> ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) -> ns_capable_common() -> current_cred() -> rcu_dereference_protected(current->cred, 1) Accessing 'current' in bpf context makes no sense, since packets are processed from softirq context. As Neal stated : The capability check in tcp_set_congestion_control() was written assuming a system call context, and then was reused from a BPF call site. The fix is to add a new parameter to tcp_set_congestion_control(), so that the ns_capable() call is only performed under the right context. Fixes: 91b5b21c7c16 ("bpf: Add support for changing congestion control") Signed-off-by: Eric Dumazet Cc: Lawrence Brakmo Reported-by: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Lawrence Brakmo Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- net/core/filter.c | 2 +- net/ipv4/tcp.c | 4 +++- net/ipv4/tcp_cong.c | 6 +++--- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/include/net/tcp.h b/include/net/tcp.h index cca3c59b98bf..f42d300f0cfa 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1064,7 +1064,8 @@ void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit); +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, + bool reinit, bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); diff --git a/net/core/filter.c b/net/core/filter.c index 0f6854ccf894..4e2a79b2fd77 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4335,7 +4335,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; ret = tcp_set_congestion_control(sk, name, false, - reinit); + reinit, true); } else { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7846afacdf0b..776905899ac0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2785,7 +2785,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, true); + err = tcp_set_congestion_control(sk, name, true, true, + ns_capable(sock_net(sk)->user_ns, + CAP_NET_ADMIN)); release_sock(sk); return err; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index e1862b64a90f..c445a81d144e 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -333,7 +333,8 @@ out: * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit) +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, + bool reinit, bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -369,8 +370,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo } else { err = -EBUSY; } - } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { + } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { err = -EPERM; } else if (!try_module_get(ca->owner)) { err = -EBUSY; -- cgit v1.2.3