From 071c37983d99da07797294ea78e9da1a6e287144 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Sun, 14 Jul 2019 23:36:11 +0200
Subject: net: neigh: fix multiple neigh timer scheduling

Neigh timer can be scheduled multiple times from userspace adding
multiple neigh entries and forcing the neigh timer scheduling passing
NTF_USE in the netlink requests.
This will result in a refcount leak and in the following dump stack:

[   32.465295] NEIGH: BUG, double timer add, state is 8
[   32.465308] CPU: 0 PID: 416 Comm: double_timer_ad Not tainted 5.2.0+ #65
[   32.465311] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-2.fc30 04/01/2014
[   32.465313] Call Trace:
[   32.465318]  dump_stack+0x7c/0xc0
[   32.465323]  __neigh_event_send+0x20c/0x880
[   32.465326]  ? ___neigh_create+0x846/0xfb0
[   32.465329]  ? neigh_lookup+0x2a9/0x410
[   32.465332]  ? neightbl_fill_info.constprop.0+0x800/0x800
[   32.465334]  neigh_add+0x4f8/0x5e0
[   32.465337]  ? neigh_xmit+0x620/0x620
[   32.465341]  ? find_held_lock+0x85/0xa0
[   32.465345]  rtnetlink_rcv_msg+0x204/0x570
[   32.465348]  ? rtnl_dellink+0x450/0x450
[   32.465351]  ? mark_held_locks+0x90/0x90
[   32.465354]  ? match_held_lock+0x1b/0x230
[   32.465357]  netlink_rcv_skb+0xc4/0x1d0
[   32.465360]  ? rtnl_dellink+0x450/0x450
[   32.465363]  ? netlink_ack+0x420/0x420
[   32.465366]  ? netlink_deliver_tap+0x115/0x560
[   32.465369]  ? __alloc_skb+0xc9/0x2f0
[   32.465372]  netlink_unicast+0x270/0x330
[   32.465375]  ? netlink_attachskb+0x2f0/0x2f0
[   32.465378]  netlink_sendmsg+0x34f/0x5a0
[   32.465381]  ? netlink_unicast+0x330/0x330
[   32.465385]  ? move_addr_to_kernel.part.0+0x20/0x20
[   32.465388]  ? netlink_unicast+0x330/0x330
[   32.465391]  sock_sendmsg+0x91/0xa0
[   32.465394]  ___sys_sendmsg+0x407/0x480
[   32.465397]  ? copy_msghdr_from_user+0x200/0x200
[   32.465401]  ? _raw_spin_unlock_irqrestore+0x37/0x40
[   32.465404]  ? lockdep_hardirqs_on+0x17d/0x250
[   32.465407]  ? __wake_up_common_lock+0xcb/0x110
[   32.465410]  ? __wake_up_common+0x230/0x230
[   32.465413]  ? netlink_bind+0x3e1/0x490
[   32.465416]  ? netlink_setsockopt+0x540/0x540
[   32.465420]  ? __fget_light+0x9c/0xf0
[   32.465423]  ? sockfd_lookup_light+0x8c/0xb0
[   32.465426]  __sys_sendmsg+0xa5/0x110
[   32.465429]  ? __ia32_sys_shutdown+0x30/0x30
[   32.465432]  ? __fd_install+0xe1/0x2c0
[   32.465435]  ? lockdep_hardirqs_off+0xb5/0x100
[   32.465438]  ? mark_held_locks+0x24/0x90
[   32.465441]  ? do_syscall_64+0xf/0x270
[   32.465444]  do_syscall_64+0x63/0x270
[   32.465448]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Fix the issue unscheduling neigh_timer if selected entry is in 'IN_TIMER'
receiving a netlink request with NTF_USE flag set

Reported-by: Marek Majkowski <marek@cloudflare.com>
Fixes: 0c5c2d308906 ("neigh: Allow for user space users of the neighbour table")
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 742cea4ce72e..0dfc97bc8760 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1124,6 +1124,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 
 			atomic_set(&neigh->probes,
 				   NEIGH_VAR(neigh->parms, UCAST_PROBES));
+			neigh_del_timer(neigh);
 			neigh->nud_state     = NUD_INCOMPLETE;
 			neigh->updated = now;
 			next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
@@ -1140,6 +1141,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 		}
 	} else if (neigh->nud_state & NUD_STALE) {
 		neigh_dbg(2, "neigh %p is delayed\n", neigh);
+		neigh_del_timer(neigh);
 		neigh->nud_state = NUD_DELAY;
 		neigh->updated = jiffies;
 		neigh_add_timer(neigh, jiffies +
-- 
cgit v1.2.3


From b43995469e5804636a55372e9bbb17ccb22441c5 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 15 Jul 2019 09:39:52 -0700
Subject: bpf: rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok

Rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok to indicate
that it can be used for both loads and stores.

Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h |  2 +-
 net/core/filter.c      | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'net/core')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6d944369ca87..ff65d22cf336 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -747,7 +747,7 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
 	return size <= size_default && (size & (size - 1)) == 0;
 }
 
-#define bpf_ctx_wide_store_ok(off, size, type, field)			\
+#define bpf_ctx_wide_access_ok(off, size, type, field)			\
 	(size == sizeof(__u64) &&					\
 	off >= offsetof(type, field) &&					\
 	off + sizeof(__u64) <= offsetofend(type, field) &&		\
diff --git a/net/core/filter.c b/net/core/filter.c
index 47f6386fb17a..c5983ddb1a9f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6890,14 +6890,14 @@ static bool sock_addr_is_valid_access(int off, int size,
 			if (!bpf_ctx_narrow_access_ok(off, size, size_default))
 				return false;
 		} else {
-			if (bpf_ctx_wide_store_ok(off, size,
-						  struct bpf_sock_addr,
-						  user_ip6))
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   user_ip6))
 				return true;
 
-			if (bpf_ctx_wide_store_ok(off, size,
-						  struct bpf_sock_addr,
-						  msg_src_ip6))
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   msg_src_ip6))
 				return true;
 
 			if (size != size_default)
-- 
cgit v1.2.3


From d4ecfeb15494ec261fef2d25d96eecba66f0b182 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 15 Jul 2019 09:39:53 -0700
Subject: bpf: allow wide aligned loads for bpf_sock_addr user_ip6 and
 msg_src_ip6

Add explicit check for u64 loads of user_ip6 and msg_src_ip6 and
update the comment.

Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  4 ++--
 net/core/filter.c        | 12 +++++++++++-
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6f68438aa4ed..81be929b89fc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3248,7 +3248,7 @@ struct bpf_sock_addr {
 	__u32 user_ip4;		/* Allows 1,2,4-byte read and 4-byte write.
 				 * Stored in network byte order.
 				 */
-	__u32 user_ip6[4];	/* Allows 1,2,4-byte read and 4,8-byte write.
+	__u32 user_ip6[4];	/* Allows 1,2,4,8-byte read and 4,8-byte write.
 				 * Stored in network byte order.
 				 */
 	__u32 user_port;	/* Allows 4-byte read and write.
@@ -3260,7 +3260,7 @@ struct bpf_sock_addr {
 	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read and 4-byte write.
 				 * Stored in network byte order.
 				 */
-	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read and 4,8-byte write.
+	__u32 msg_src_ip6[4];	/* Allows 1,2,4,8-byte read and 4,8-byte write.
 				 * Stored in network byte order.
 				 */
 	__bpf_md_ptr(struct bpf_sock *, sk);
diff --git a/net/core/filter.c b/net/core/filter.c
index c5983ddb1a9f..0f6854ccf894 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6884,9 +6884,19 @@ static bool sock_addr_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
 	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
 				msg_src_ip6[3]):
-		/* Only narrow read access allowed for now. */
 		if (type == BPF_READ) {
 			bpf_ctx_record_field_size(info, size_default);
+
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   user_ip6))
+				return true;
+
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   msg_src_ip6))
+				return true;
+
 			if (!bpf_ctx_narrow_access_ok(off, size, size_default))
 				return false;
 		} else {
-- 
cgit v1.2.3


From db8051f30fbab7f579d691137f1e23f3bb1ac2eb Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Tue, 16 Jul 2019 11:43:05 -0400
Subject: skbuff: fix compilation warnings in skb_dump()

The commit 6413139dfc64 ("skbuff: increase verbosity when dumping skb
data") introduced a few compilation warnings.

net/core/skbuff.c:766:32: warning: format specifies type 'unsigned
short' but the argument has type 'unsigned int' [-Wformat]
                       level, sk->sk_family, sk->sk_type,
sk->sk_protocol);
                                             ^~~~~~~~~~~
net/core/skbuff.c:766:45: warning: format specifies type 'unsigned
short' but the argument has type 'unsigned int' [-Wformat]
                       level, sk->sk_family, sk->sk_type,
sk->sk_protocol);
^~~~~~~~~~~~~~~

Fix them by using the proper types.

Fixes: 6413139dfc64 ("skbuff: increase verbosity when dumping skb data")
Signed-off-by: Qian Cai <cai@lca.pw>
Reviewed-by: Nathan Chancellor <natechancellor@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6f1e31f674a3..0338820ee0ec 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -762,7 +762,7 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
 		printk("%sdev name=%s feat=0x%pNF\n",
 		       level, dev->name, &dev->features);
 	if (sk)
-		printk("%ssk family=%hu type=%hu proto=%hu\n",
+		printk("%ssk family=%hu type=%u proto=%u\n",
 		       level, sk->sk_family, sk->sk_type, sk->sk_protocol);
 
 	if (full_pkt && headroom)
-- 
cgit v1.2.3


From 8d650cdedaabb33e85e9b7c517c0c71fcecc1de9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 18 Jul 2019 19:28:14 -0700
Subject: tcp: fix tcp_set_congestion_control() use from bpf hook

Neal reported incorrect use of ns_capable() from bpf hook.

bpf_setsockopt(...TCP_CONGESTION...)
  -> tcp_set_congestion_control()
   -> ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)
    -> ns_capable_common()
     -> current_cred()
      -> rcu_dereference_protected(current->cred, 1)

Accessing 'current' in bpf context makes no sense, since packets
are processed from softirq context.

As Neal stated : The capability check in tcp_set_congestion_control()
was written assuming a system call context, and then was reused from
a BPF call site.

The fix is to add a new parameter to tcp_set_congestion_control(),
so that the ns_capable() call is only performed under the right
context.

Fixes: 91b5b21c7c16 ("bpf: Add support for changing congestion control")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Lawrence Brakmo <brakmo@fb.com>
Reported-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   | 3 ++-
 net/core/filter.c   | 2 +-
 net/ipv4/tcp.c      | 4 +++-
 net/ipv4/tcp_cong.c | 6 +++---
 4 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index cca3c59b98bf..f42d300f0cfa 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1064,7 +1064,8 @@ void tcp_get_default_congestion_control(struct net *net, char *name);
 void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
-int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit);
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
+			       bool reinit, bool cap_net_admin);
 u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 0f6854ccf894..4e2a79b2fd77 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4335,7 +4335,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 						    TCP_CA_NAME_MAX-1));
 			name[TCP_CA_NAME_MAX-1] = 0;
 			ret = tcp_set_congestion_control(sk, name, false,
-							 reinit);
+							 reinit, true);
 		} else {
 			struct tcp_sock *tp = tcp_sk(sk);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7846afacdf0b..776905899ac0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2785,7 +2785,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		name[val] = 0;
 
 		lock_sock(sk);
-		err = tcp_set_congestion_control(sk, name, true, true);
+		err = tcp_set_congestion_control(sk, name, true, true,
+						 ns_capable(sock_net(sk)->user_ns,
+							    CAP_NET_ADMIN));
 		release_sock(sk);
 		return err;
 	}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index e1862b64a90f..c445a81d144e 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -333,7 +333,8 @@ out:
  * tcp_reinit_congestion_control (if the current congestion control was
  * already initialized.
  */
-int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit)
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
+			       bool reinit, bool cap_net_admin)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_congestion_ops *ca;
@@ -369,8 +370,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo
 		} else {
 			err = -EBUSY;
 		}
-	} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
-		     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) {
+	} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) {
 		err = -EPERM;
 	} else if (!try_module_get(ca->owner)) {
 		err = -EBUSY;
-- 
cgit v1.2.3