From 6bdacdb48e94ff26c03c6eeeef48c03c5e2f7dd4 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 12 May 2021 20:57:14 +0200
Subject: bpf: Fix BPF_JIT kconfig symbol dependency

Randy reported a randconfig build error recently on i386:

  ld: arch/x86/net/bpf_jit_comp32.o: in function `do_jit':
  bpf_jit_comp32.c:(.text+0x28c9): undefined reference to `__bpf_call_base'
  ld: arch/x86/net/bpf_jit_comp32.o: in function `bpf_int_jit_compile':
  bpf_jit_comp32.c:(.text+0x3694): undefined reference to `bpf_jit_blind_constants'
  ld: bpf_jit_comp32.c:(.text+0x3719): undefined reference to `bpf_jit_binary_free'
  ld: bpf_jit_comp32.c:(.text+0x3745): undefined reference to `bpf_jit_binary_alloc'
  ld: bpf_jit_comp32.c:(.text+0x37d3): undefined reference to `bpf_jit_prog_release_other'
  [...]

The cause was that b24abcff918a ("bpf, kconfig: Add consolidated menu entry for
bpf with core options") moved BPF_JIT from net/Kconfig into kernel/bpf/Kconfig
and previously BPF_JIT was guarded by an 'if NET'. However, there is no actual
dependency on NET, it's just that menuconfig NET selects BPF. And the latter in
turn causes kernel/bpf/core.o to be built which contains the above symbols.
Randy's randconfig didn't have NET set, and BPF wasn't either, but BPF_JIT, on
the other hand, was. Detangle this by making BPF_JIT depend on BPF instead.
arm64 was the only arch that pulled in its JIT in net/ via obj-$(CONFIG_NET),
all others unconditionally pull this dir in via obj-y. Do the same since the
CONFIG_NET guard there is really useless as we compiled the JIT via
obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o anyway.

Fixes: b24abcff918a ("bpf, kconfig: Add consolidated menu entry for bpf with core options")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
---
 arch/arm64/Kbuild  | 3 +--
 kernel/bpf/Kconfig | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/Kbuild b/arch/arm64/Kbuild
index d6465823b281..7b393cfec071 100644
--- a/arch/arm64/Kbuild
+++ b/arch/arm64/Kbuild
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-y			+= kernel/ mm/
-obj-$(CONFIG_NET)	+= net/
+obj-y			+= kernel/ mm/ net/
 obj-$(CONFIG_KVM)	+= kvm/
 obj-$(CONFIG_XEN)	+= xen/
 obj-$(CONFIG_CRYPTO)	+= crypto/
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 26b591e23f16..bd04f4a44c01 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -37,6 +37,7 @@ config BPF_SYSCALL
 
 config BPF_JIT
 	bool "Enable BPF Just In Time compiler"
+	depends on BPF
 	depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
 	depends on MODULES
 	help
-- 
cgit v1.2.3


From 8f1634b82189e715b0f82f16ce54fab43cfedd8a Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Fri, 14 May 2021 10:05:28 -0700
Subject: selftests/bpf: Convert static to global in tc_redirect progs

Both IFINDEX_SRC and IFINDEX_DST are set from userspace, and that won't work
while they are declared static once bpf merges with bpf-next.

Fixes: 096eccdef0b3 ("selftests/bpf: Rewrite test_tc_redirect.sh as prog_tests/tc_redirect.c")
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210514170528.3750250-1-sdf@google.com
---
 tools/testing/selftests/bpf/progs/test_tc_neigh.c | 4 ++--
 tools/testing/selftests/bpf/progs/test_tc_peer.c  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/test_tc_neigh.c b/tools/testing/selftests/bpf/progs/test_tc_neigh.c
index 90f64a85998f..0c93d326a663 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_neigh.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_neigh.c
@@ -33,8 +33,8 @@
 				 a.s6_addr32[3] == b.s6_addr32[3])
 #endif
 
-static volatile const __u32 IFINDEX_SRC;
-static volatile const __u32 IFINDEX_DST;
+volatile const __u32 IFINDEX_SRC;
+volatile const __u32 IFINDEX_DST;
 
 static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb,
 					    __be32 addr)
diff --git a/tools/testing/selftests/bpf/progs/test_tc_peer.c b/tools/testing/selftests/bpf/progs/test_tc_peer.c
index 72c72950c3bb..ef264bced0e6 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_peer.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_peer.c
@@ -8,8 +8,8 @@
 
 #include <bpf/bpf_helpers.h>
 
-static volatile const __u32 IFINDEX_SRC;
-static volatile const __u32 IFINDEX_DST;
+volatile const __u32 IFINDEX_SRC;
+volatile const __u32 IFINDEX_DST;
 
 SEC("classifier/chk_egress")
 int tc_chk(struct __sk_buff *skb)
-- 
cgit v1.2.3


From 704e2beba23c45eaa056b1c03b5e1fb221e03f80 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 14 May 2021 11:07:26 -0700
Subject: selftests/bpf: Test ringbuf mmap read-only and read-write
 restrictions

Extend ringbuf selftest to validate read/write and read-only restrictions on
memory mapping consumer/producer/data pages. Ensure that no "escalation" from
PROT_READ to PROT_WRITE/PROT_EXEC is allowed. And test that mremap() fails to
expand the mmap()'ed area.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210514180726.843157-1-andrii@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/ringbuf.c | 49 +++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
index de78617f6550..f9a8ae331963 100644
--- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c
+++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
@@ -86,8 +86,9 @@ void test_ringbuf(void)
 	const size_t rec_sz = BPF_RINGBUF_HDR_SZ + sizeof(struct sample);
 	pthread_t thread;
 	long bg_ret = -1;
-	int err, cnt;
+	int err, cnt, rb_fd;
 	int page_size = getpagesize();
+	void *mmap_ptr, *tmp_ptr;
 
 	skel = test_ringbuf__open();
 	if (CHECK(!skel, "skel_open", "skeleton open failed\n"))
@@ -101,6 +102,52 @@ void test_ringbuf(void)
 	if (CHECK(err != 0, "skel_load", "skeleton load failed\n"))
 		goto cleanup;
 
+	rb_fd = bpf_map__fd(skel->maps.ringbuf);
+	/* good read/write cons_pos */
+	mmap_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, rb_fd, 0);
+	ASSERT_OK_PTR(mmap_ptr, "rw_cons_pos");
+	tmp_ptr = mremap(mmap_ptr, page_size, 2 * page_size, MREMAP_MAYMOVE);
+	if (!ASSERT_ERR_PTR(tmp_ptr, "rw_extend"))
+		goto cleanup;
+	ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_EXEC), "exec_cons_pos_protect");
+	ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_rw");
+
+	/* bad writeable prod_pos */
+	mmap_ptr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, rb_fd, page_size);
+	err = -errno;
+	ASSERT_ERR_PTR(mmap_ptr, "wr_prod_pos");
+	ASSERT_EQ(err, -EPERM, "wr_prod_pos_err");
+
+	/* bad writeable data pages */
+	mmap_ptr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, rb_fd, 2 * page_size);
+	err = -errno;
+	ASSERT_ERR_PTR(mmap_ptr, "wr_data_page_one");
+	ASSERT_EQ(err, -EPERM, "wr_data_page_one_err");
+	mmap_ptr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, rb_fd, 3 * page_size);
+	ASSERT_ERR_PTR(mmap_ptr, "wr_data_page_two");
+	mmap_ptr = mmap(NULL, 2 * page_size, PROT_WRITE, MAP_SHARED, rb_fd, 2 * page_size);
+	ASSERT_ERR_PTR(mmap_ptr, "wr_data_page_all");
+
+	/* good read-only pages */
+	mmap_ptr = mmap(NULL, 4 * page_size, PROT_READ, MAP_SHARED, rb_fd, 0);
+	if (!ASSERT_OK_PTR(mmap_ptr, "ro_prod_pos"))
+		goto cleanup;
+
+	ASSERT_ERR(mprotect(mmap_ptr, 4 * page_size, PROT_WRITE), "write_protect");
+	ASSERT_ERR(mprotect(mmap_ptr, 4 * page_size, PROT_EXEC), "exec_protect");
+	ASSERT_ERR_PTR(mremap(mmap_ptr, 0, 4 * page_size, MREMAP_MAYMOVE), "ro_remap");
+	ASSERT_OK(munmap(mmap_ptr, 4 * page_size), "unmap_ro");
+
+	/* good read-only pages with initial offset */
+	mmap_ptr = mmap(NULL, page_size, PROT_READ, MAP_SHARED, rb_fd, page_size);
+	if (!ASSERT_OK_PTR(mmap_ptr, "ro_prod_pos"))
+		goto cleanup;
+
+	ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_WRITE), "write_protect");
+	ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_EXEC), "exec_protect");
+	ASSERT_ERR_PTR(mremap(mmap_ptr, 0, 3 * page_size, MREMAP_MAYMOVE), "ro_remap");
+	ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_ro");
+
 	/* only trigger BPF program for current process */
 	skel->bss->pid = getpid();
 
-- 
cgit v1.2.3


From 8afcc19fbf083a8459284d9a29b4b5ac1cb2396c Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Mon, 17 May 2021 11:28:29 +0200
Subject: bpf: Clarify a bpf_bprintf_prepare macro

The per-cpu buffers contain bprintf data rather than printf arguments.
The macro name and comment were a bit confusing; this rewords them in a
clearer way.

Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/bpf/20210517092830.1026418-1-revest@chromium.org
---
 kernel/bpf/helpers.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ef658a9ea5c9..3a5ab614cbb0 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -692,13 +692,14 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
 	return -EINVAL;
 }
 
-/* Per-cpu temp buffers which can be used by printf-like helpers for %s or %p
+/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
+ * arguments representation.
  */
-#define MAX_PRINTF_BUF_LEN	512
+#define MAX_BPRINTF_BUF_LEN	512
 
 /* Support executing three nested bprintf helper calls on a given CPU */
 struct bpf_bprintf_buffers {
-	char tmp_bufs[3][MAX_PRINTF_BUF_LEN];
+	char tmp_bufs[3][MAX_BPRINTF_BUF_LEN];
 };
 static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs);
 static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
@@ -761,7 +762,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
 		if (num_args && try_get_fmt_tmp_buf(&tmp_buf))
 			return -EBUSY;
 
-		tmp_buf_end = tmp_buf + MAX_PRINTF_BUF_LEN;
+		tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN;
 		*bin_args = (u32 *)tmp_buf;
 	}
 
-- 
cgit v1.2.3


From 0af02eb2a7d76ca85a1ecaf4b3775e2c86408fab Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Mon, 17 May 2021 11:28:30 +0200
Subject: bpf: Avoid using ARRAY_SIZE on an uninitialized pointer

The cppcheck static code analysis reported the following error:

    if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bufs->tmp_bufs))) {
                                             ^
ARRAY_SIZE is a macro that expands to sizeofs, so bufs is not actually
dereferenced at runtime, and the code is actually safe. But to keep
things tidy, this patch removes the need for a call to ARRAY_SIZE by
extracting the size of the array into a macro. Cppcheck should no longer
be confused and the code ends up being a bit cleaner.

Fixes: e2d5b2bb769f ("bpf: Fix nested bpf_bprintf_prepare with more per-cpu buffers")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/bpf/20210517092830.1026418-2-revest@chromium.org
---
 kernel/bpf/helpers.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3a5ab614cbb0..73443498d88f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -698,8 +698,9 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
 #define MAX_BPRINTF_BUF_LEN	512
 
 /* Support executing three nested bprintf helper calls on a given CPU */
+#define MAX_BPRINTF_NEST_LEVEL	3
 struct bpf_bprintf_buffers {
-	char tmp_bufs[3][MAX_BPRINTF_BUF_LEN];
+	char tmp_bufs[MAX_BPRINTF_NEST_LEVEL][MAX_BPRINTF_BUF_LEN];
 };
 static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs);
 static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
@@ -711,7 +712,7 @@ static int try_get_fmt_tmp_buf(char **tmp_buf)
 
 	preempt_disable();
 	nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
-	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bufs->tmp_bufs))) {
+	if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
 		this_cpu_dec(bpf_bprintf_nest_level);
 		preempt_enable();
 		return -EBUSY;
-- 
cgit v1.2.3


From ceb11679d9fcf3fdb358a310a38760fcbe9b63ed Mon Sep 17 00:00:00 2001
From: Yinjun Zhang <yinjun.zhang@corigine.com>
Date: Thu, 20 May 2021 10:58:34 +0200
Subject: bpf, offload: Reorder offload callback 'prepare' in verifier

Commit 4976b718c355 ("bpf: Introduce pseudo_btf_id") switched the
order of resolve_pseudo_ldimm(), in which some pseudo instructions
are rewritten. Thus those rewritten instructions cannot be passed
to the driver via the 'prepare' offload callback.

Reorder the 'prepare' offload callback to fix it.

Fixes: 4976b718c355 ("bpf: Introduce pseudo_btf_id")
Signed-off-by: Yinjun Zhang <yinjun.zhang@corigine.com>
Signed-off-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210520085834.15023-1-simon.horman@netronome.com
---
 kernel/bpf/verifier.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c58598ef4b5b..09849e43f035 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13368,12 +13368,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	if (is_priv)
 		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
 
-	if (bpf_prog_is_dev_bound(env->prog->aux)) {
-		ret = bpf_prog_offload_verifier_prep(env->prog);
-		if (ret)
-			goto skip_full_check;
-	}
-
 	env->explored_states = kvcalloc(state_htab_size(env),
 				       sizeof(struct bpf_verifier_state_list *),
 				       GFP_USER);
@@ -13401,6 +13395,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	if (ret < 0)
 		goto skip_full_check;
 
+	if (bpf_prog_is_dev_bound(env->prog->aux)) {
+		ret = bpf_prog_offload_verifier_prep(env->prog);
+		if (ret)
+			goto skip_full_check;
+	}
+
 	ret = check_cfg(env);
 	if (ret < 0)
 		goto skip_full_check;
-- 
cgit v1.2.3


From 84316ca4e100d8cbfccd9f774e23817cb2059868 Mon Sep 17 00:00:00 2001
From: Jussi Maki <joamaki@gmail.com>
Date: Wed, 19 May 2021 15:47:42 +0000
Subject: bpf: Set mac_len in bpf_skb_change_head

The skb_change_head() helper did not set "skb->mac_len", which is
problematic when it's used in combination with skb_redirect_peer().
Without it, redirecting a packet from an L3 device such as wireguard to
the veth peer device will cause skb->data to point to the middle of the
IP header on entry to tcp_v4_rcv() since the L2 header is not pulled
correctly due to mac_len=0.

Fixes: 3a0af8fd61f9 ("bpf: BPF for lightweight tunnel infrastructure")
Signed-off-by: Jussi Maki <joamaki@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210519154743.2554771-2-joamaki@gmail.com
---
 net/core/filter.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index cae56d08a670..65ab4e21c087 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3784,6 +3784,7 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
 		__skb_push(skb, head_room);
 		memset(skb->data, 0, head_room);
 		skb_reset_mac_header(skb);
+		skb_reset_mac_len(skb);
 	}
 
 	return ret;
-- 
cgit v1.2.3


From a8deba8547e39f26440101164a3bbc2899c5b305 Mon Sep 17 00:00:00 2001
From: Liu Jian <liujian56@huawei.com>
Date: Tue, 25 May 2021 09:41:39 +0800
Subject: bpftool: Add sock_release help info for cgroup attach/prog load
 command

The help information was not added at the time when the function got added.
Fix this and add the missing information to its cli, documentation and bash
completion.

Fixes: db94cc0b4805 ("bpftool: Add support for BPF_CGROUP_INET_SOCK_RELEASE")
Signed-off-by: Liu Jian <liujian56@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20210525014139.323859-1-liujian56@huawei.com
---
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 4 +++-
 tools/bpf/bpftool/Documentation/bpftool-prog.rst   | 2 +-
 tools/bpf/bpftool/bash-completion/bpftool          | 6 +++---
 tools/bpf/bpftool/cgroup.c                         | 3 ++-
 tools/bpf/bpftool/prog.c                           | 2 +-
 5 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index 790944c35602..baee8591ac76 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -30,7 +30,8 @@ CGROUP COMMANDS
 |	*ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** |
 |		**bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** |
 |               **getpeername4** | **getpeername6** | **getsockname4** | **getsockname6** | **sendmsg4** |
-|               **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** }
+|               **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** |
+|               **sock_release** }
 |	*ATTACH_FLAGS* := { **multi** | **override** }
 
 DESCRIPTION
@@ -106,6 +107,7 @@ DESCRIPTION
 		  **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8);
 		  **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8);
 		  **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8).
+		  **sock_release** closing a userspace inet socket (since 5.9).
 
 	**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
 		  Detach *PROG* from the cgroup *CGROUP* and attach type
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 358c7309d419..fe1b38e7e887 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -44,7 +44,7 @@ PROG COMMANDS
 |		**cgroup/connect4** | **cgroup/connect6** | **cgroup/getpeername4** | **cgroup/getpeername6** |
 |               **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** |
 |		**cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** |
-|		**cgroup/getsockopt** | **cgroup/setsockopt** |
+|		**cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** |
 |		**struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup**
 |	}
 |       *ATTACH_TYPE* := {
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index d67518bcbd44..cc33c5824a2f 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -478,7 +478,7 @@ _bpftool()
                                 cgroup/recvmsg4 cgroup/recvmsg6 \
                                 cgroup/post_bind4 cgroup/post_bind6 \
                                 cgroup/sysctl cgroup/getsockopt \
-                                cgroup/setsockopt struct_ops \
+                                cgroup/setsockopt cgroup/sock_release struct_ops \
                                 fentry fexit freplace sk_lookup" -- \
                                                    "$cur" ) )
                             return 0
@@ -1021,7 +1021,7 @@ _bpftool()
                         device bind4 bind6 post_bind4 post_bind6 connect4 connect6 \
                         getpeername4 getpeername6 getsockname4 getsockname6 \
                         sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl getsockopt \
-                        setsockopt'
+                        setsockopt sock_release'
                     local ATTACH_FLAGS='multi override'
                     local PROG_TYPE='id pinned tag name'
                     case $prev in
@@ -1032,7 +1032,7 @@ _bpftool()
                         ingress|egress|sock_create|sock_ops|device|bind4|bind6|\
                         post_bind4|post_bind6|connect4|connect6|getpeername4|\
                         getpeername6|getsockname4|getsockname6|sendmsg4|sendmsg6|\
-                        recvmsg4|recvmsg6|sysctl|getsockopt|setsockopt)
+                        recvmsg4|recvmsg6|sysctl|getsockopt|setsockopt|sock_release)
                             COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
                                 "$cur" ) )
                             return 0
diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index d901cc1b904a..6e53b1d393f4 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -28,7 +28,8 @@
 	"                        connect6 | getpeername4 | getpeername6 |\n"   \
 	"                        getsockname4 | getsockname6 | sendmsg4 |\n"   \
 	"                        sendmsg6 | recvmsg4 | recvmsg6 |\n"           \
-	"                        sysctl | getsockopt | setsockopt }"
+	"                        sysctl | getsockopt | setsockopt |\n"	       \
+	"                        sock_release }"
 
 static unsigned int query_flags;
 
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 3f067d2d7584..da4846c9856a 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -2138,7 +2138,7 @@ static int do_help(int argc, char **argv)
 		"                 cgroup/getpeername4 | cgroup/getpeername6 |\n"
 		"                 cgroup/getsockname4 | cgroup/getsockname6 | cgroup/sendmsg4 |\n"
 		"                 cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n"
-		"                 cgroup/getsockopt | cgroup/setsockopt |\n"
+		"                 cgroup/getsockopt | cgroup/setsockopt | cgroup/sock_release |\n"
 		"                 struct_ops | fentry | fexit | freplace | sk_lookup }\n"
 		"       ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n"
 		"                        flow_dissector }\n"
-- 
cgit v1.2.3


From 6fd5fb63820a9a1146aba0bba2fdbc1db4b903e7 Mon Sep 17 00:00:00 2001
From: Jussi Maki <joamaki@gmail.com>
Date: Tue, 25 May 2021 10:29:55 +0000
Subject: selftests/bpf: Add test for l3 use of bpf_redirect_peer

Add a test case for using bpf_skb_change_head() in combination with
bpf_redirect_peer() to redirect a packet from an L3 device to veth and back.

The test uses a BPF program that adds L2 headers to the packet coming
from an L3 device and then calls bpf_redirect_peer() to redirect the packet
to a veth device. The test fails as skb->mac_len is not set properly and
thus the ethernet headers are not properly skb_pull'd in cls_bpf_classify(),
causing tcp_v4_rcv() to point the TCP header into the middle of the IP header.

Signed-off-by: Jussi Maki <joamaki@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210525102955.2811090-1-joamaki@gmail.com
---
 .../testing/selftests/bpf/prog_tests/tc_redirect.c | 552 ++++++++++++++-------
 tools/testing/selftests/bpf/progs/test_tc_peer.c   |  31 ++
 2 files changed, 405 insertions(+), 178 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
index 95ef9fcd31d8..5703c918812b 100644
--- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
@@ -11,14 +11,17 @@
  */
 
 #define _GNU_SOURCE
-#include <fcntl.h>
+
+#include <arpa/inet.h>
 #include <linux/limits.h>
 #include <linux/sysctl.h>
+#include <linux/if_tun.h>
+#include <linux/if.h>
 #include <sched.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <sys/stat.h>
-#include <sys/types.h>
+#include <sys/mount.h>
 
 #include "test_progs.h"
 #include "network_helpers.h"
@@ -32,18 +35,25 @@
 
 #define IP4_SRC "172.16.1.100"
 #define IP4_DST "172.16.2.100"
+#define IP4_TUN_SRC "172.17.1.100"
+#define IP4_TUN_FWD "172.17.1.200"
 #define IP4_PORT 9004
 
-#define IP6_SRC "::1:dead:beef:cafe"
-#define IP6_DST "::2:dead:beef:cafe"
+#define IP6_SRC "0::1:dead:beef:cafe"
+#define IP6_DST "0::2:dead:beef:cafe"
+#define IP6_TUN_SRC "1::1:dead:beef:cafe"
+#define IP6_TUN_FWD "1::2:dead:beef:cafe"
 #define IP6_PORT 9006
 
 #define IP4_SLL "169.254.0.1"
 #define IP4_DLL "169.254.0.2"
 #define IP4_NET "169.254.0.0"
 
+#define MAC_DST_FWD "00:11:22:33:44:55"
+#define MAC_DST "00:22:33:44:55:66"
+
 #define IFADDR_STR_LEN 18
-#define PING_ARGS "-c 3 -w 10 -q"
+#define PING_ARGS "-i 0.2 -c 3 -w 10 -q"
 
 #define SRC_PROG_PIN_FILE "/sys/fs/bpf/test_tc_src"
 #define DST_PROG_PIN_FILE "/sys/fs/bpf/test_tc_dst"
@@ -51,120 +61,104 @@
 
 #define TIMEOUT_MILLIS 10000
 
-#define MAX_PROC_MODS 128
-#define MAX_PROC_VALUE_LEN 16
-
 #define log_err(MSG, ...) \
 	fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
 		__FILE__, __LINE__, strerror(errno), ##__VA_ARGS__)
 
-struct proc_mod {
-	char path[PATH_MAX];
-	char oldval[MAX_PROC_VALUE_LEN];
-	int oldlen;
-};
-
 static const char * const namespaces[] = {NS_SRC, NS_FWD, NS_DST, NULL};
-static int root_netns_fd = -1;
-static int num_proc_mods;
-static struct proc_mod proc_mods[MAX_PROC_MODS];
 
-/**
- * modify_proc() - Modify entry in /proc
- *
- * Modifies an entry in /proc and saves the original value for later
- * restoration with restore_proc().
- */
-static int modify_proc(const char *path, const char *newval)
+static int write_file(const char *path, const char *newval)
 {
-	struct proc_mod *mod;
 	FILE *f;
 
-	if (num_proc_mods + 1 > MAX_PROC_MODS)
-		return -1;
-
 	f = fopen(path, "r+");
 	if (!f)
 		return -1;
-
-	mod = &proc_mods[num_proc_mods];
-	num_proc_mods++;
-
-	strncpy(mod->path, path, PATH_MAX);
-
-	if (!fread(mod->oldval, 1, MAX_PROC_VALUE_LEN, f)) {
-		log_err("reading from %s failed", path);
-		goto fail;
-	}
-	rewind(f);
 	if (fwrite(newval, strlen(newval), 1, f) != 1) {
 		log_err("writing to %s failed", path);
-		goto fail;
+		fclose(f);
+		return -1;
 	}
-
 	fclose(f);
 	return 0;
-
-fail:
-	fclose(f);
-	num_proc_mods--;
-	return -1;
 }
 
-/**
- * restore_proc() - Restore all /proc modifications
- */
-static void restore_proc(void)
+struct nstoken {
+	int orig_netns_fd;
+};
+
+static int setns_by_fd(int nsfd)
 {
-	int i;
+	int err;
 
-	for (i = 0; i < num_proc_mods; i++) {
-		struct proc_mod *mod = &proc_mods[i];
-		FILE *f;
+	err = setns(nsfd, CLONE_NEWNET);
+	close(nsfd);
 
-		f = fopen(mod->path, "w");
-		if (!f) {
-			log_err("fopen of %s failed", mod->path);
-			continue;
-		}
+	if (!ASSERT_OK(err, "setns"))
+		return err;
 
-		if (fwrite(mod->oldval, mod->oldlen, 1, f) != 1)
-			log_err("fwrite to %s failed", mod->path);
+	/* Switch /sys to the new namespace so that e.g. /sys/class/net
+	 * reflects the devices in the new namespace.
+	 */
+	err = unshare(CLONE_NEWNS);
+	if (!ASSERT_OK(err, "unshare"))
+		return err;
 
-		fclose(f);
-	}
-	num_proc_mods = 0;
+	err = umount2("/sys", MNT_DETACH);
+	if (!ASSERT_OK(err, "umount2 /sys"))
+		return err;
+
+	err = mount("sysfs", "/sys", "sysfs", 0, NULL);
+	if (!ASSERT_OK(err, "mount /sys"))
+		return err;
+
+	err = mount("bpffs", "/sys/fs/bpf", "bpf", 0, NULL);
+	if (!ASSERT_OK(err, "mount /sys/fs/bpf"))
+		return err;
+
+	return 0;
 }
 
 /**
- * setns_by_name() - Set networks namespace by name
+ * open_netns() - Switch to specified network namespace by name.
+ *
+ * Returns token with which to restore the original namespace
+ * using close_netns().
  */
-static int setns_by_name(const char *name)
+static struct nstoken *open_netns(const char *name)
 {
 	int nsfd;
 	char nspath[PATH_MAX];
 	int err;
+	struct nstoken *token;
+
+	token = malloc(sizeof(struct nstoken));
+	if (!ASSERT_OK_PTR(token, "malloc token"))
+		return NULL;
+
+	token->orig_netns_fd = open("/proc/self/ns/net", O_RDONLY);
+	if (!ASSERT_GE(token->orig_netns_fd, 0, "open /proc/self/ns/net"))
+		goto fail;
 
 	snprintf(nspath, sizeof(nspath), "%s/%s", "/var/run/netns", name);
 	nsfd = open(nspath, O_RDONLY | O_CLOEXEC);
-	if (nsfd < 0)
-		return nsfd;
+	if (!ASSERT_GE(nsfd, 0, "open netns fd"))
+		goto fail;
 
-	err = setns(nsfd, CLONE_NEWNET);
-	close(nsfd);
+	err = setns_by_fd(nsfd);
+	if (!ASSERT_OK(err, "setns_by_fd"))
+		goto fail;
 
-	return err;
+	return token;
+fail:
+	free(token);
+	return NULL;
 }
 
-/**
- * setns_root() - Set network namespace to original (root) namespace
- *
- * Not expected to ever fail, so error not returned, but failure logged
- * and test marked as failed.
- */
-static void setns_root(void)
+static void close_netns(struct nstoken *token)
 {
-	ASSERT_OK(setns(root_netns_fd, CLONE_NEWNET), "setns root");
+	ASSERT_OK(setns_by_fd(token->orig_netns_fd), "setns_by_fd");
+	free(token);
 }
 
 static int netns_setup_namespaces(const char *verb)
@@ -237,15 +231,17 @@ static int get_ifindex(const char *name)
 
 static int netns_setup_links_and_routes(struct netns_setup_result *result)
 {
+	struct nstoken *nstoken = NULL;
 	char veth_src_fwd_addr[IFADDR_STR_LEN+1] = {};
-	char veth_dst_fwd_addr[IFADDR_STR_LEN+1] = {};
 
 	SYS("ip link add veth_src type veth peer name veth_src_fwd");
 	SYS("ip link add veth_dst type veth peer name veth_dst_fwd");
+
+	SYS("ip link set veth_dst_fwd address " MAC_DST_FWD);
+	SYS("ip link set veth_dst address " MAC_DST);
+
 	if (get_ifaddr("veth_src_fwd", veth_src_fwd_addr))
 		goto fail;
-	if (get_ifaddr("veth_dst_fwd", veth_dst_fwd_addr))
-		goto fail;
 
 	result->ifindex_veth_src_fwd = get_ifindex("veth_src_fwd");
 	if (result->ifindex_veth_src_fwd < 0)
@@ -260,7 +256,8 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result)
 	SYS("ip link set veth_dst netns " NS_DST);
 
 	/** setup in 'src' namespace */
-	if (!ASSERT_OK(setns_by_name(NS_SRC), "setns src"))
+	nstoken = open_netns(NS_SRC);
+	if (!ASSERT_OK_PTR(nstoken, "setns src"))
 		goto fail;
 
 	SYS("ip addr add " IP4_SRC "/32 dev veth_src");
@@ -276,8 +273,11 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result)
 	SYS("ip neigh add " IP6_DST " dev veth_src lladdr %s",
 	    veth_src_fwd_addr);
 
+	close_netns(nstoken);
+
 	/** setup in 'fwd' namespace */
-	if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd"))
+	nstoken = open_netns(NS_FWD);
+	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
 		goto fail;
 
 	/* The fwd netns automatically gets a v6 LL address / routes, but also
@@ -294,8 +294,11 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result)
 	SYS("ip route add " IP4_DST "/32 dev veth_dst_fwd scope global");
 	SYS("ip route add " IP6_DST "/128 dev veth_dst_fwd scope global");
 
+	close_netns(nstoken);
+
 	/** setup in 'dst' namespace */
-	if (!ASSERT_OK(setns_by_name(NS_DST), "setns dst"))
+	nstoken = open_netns(NS_DST);
+	if (!ASSERT_OK_PTR(nstoken, "setns dst"))
 		goto fail;
 
 	SYS("ip addr add " IP4_DST "/32 dev veth_dst");
@@ -306,23 +309,20 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result)
 	SYS("ip route add " IP4_NET "/16 dev veth_dst scope global");
 	SYS("ip route add " IP6_SRC "/128 dev veth_dst scope global");
 
-	SYS("ip neigh add " IP4_SRC " dev veth_dst lladdr %s",
-	    veth_dst_fwd_addr);
-	SYS("ip neigh add " IP6_SRC " dev veth_dst lladdr %s",
-	    veth_dst_fwd_addr);
+	SYS("ip neigh add " IP4_SRC " dev veth_dst lladdr " MAC_DST_FWD);
+	SYS("ip neigh add " IP6_SRC " dev veth_dst lladdr " MAC_DST_FWD);
+
+	close_netns(nstoken);
 
-	setns_root();
 	return 0;
 fail:
-	setns_root();
+	if (nstoken)
+		close_netns(nstoken);
 	return -1;
 }
 
 static int netns_load_bpf(void)
 {
-	if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd"))
-		return -1;
-
 	SYS("tc qdisc add dev veth_src_fwd clsact");
 	SYS("tc filter add dev veth_src_fwd ingress bpf da object-pinned "
 	    SRC_PROG_PIN_FILE);
@@ -335,42 +335,29 @@ static int netns_load_bpf(void)
 	SYS("tc filter add dev veth_dst_fwd egress bpf da object-pinned "
 	    CHK_PROG_PIN_FILE);
 
-	setns_root();
-	return -1;
-fail:
-	setns_root();
-	return -1;
-}
-
-static int netns_unload_bpf(void)
-{
-	if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd"))
-		goto fail;
-	SYS("tc qdisc delete dev veth_src_fwd clsact");
-	SYS("tc qdisc delete dev veth_dst_fwd clsact");
-
-	setns_root();
 	return 0;
 fail:
-	setns_root();
 	return -1;
 }
 
-
 static void test_tcp(int family, const char *addr, __u16 port)
 {
 	int listen_fd = -1, accept_fd = -1, client_fd = -1;
 	char buf[] = "testing testing";
 	int n;
+	struct nstoken *nstoken;
 
-	if (!ASSERT_OK(setns_by_name(NS_DST), "setns dst"))
+	nstoken = open_netns(NS_DST);
+	if (!ASSERT_OK_PTR(nstoken, "setns dst"))
 		return;
 
 	listen_fd = start_server(family, SOCK_STREAM, addr, port, 0);
 	if (!ASSERT_GE(listen_fd, 0, "listen"))
 		goto done;
 
-	if (!ASSERT_OK(setns_by_name(NS_SRC), "setns src"))
+	close_netns(nstoken);
+	nstoken = open_netns(NS_SRC);
+	if (!ASSERT_OK_PTR(nstoken, "setns src"))
 		goto done;
 
 	client_fd = connect_to_fd(listen_fd, TIMEOUT_MILLIS);
@@ -392,7 +379,8 @@ static void test_tcp(int family, const char *addr, __u16 port)
 	ASSERT_EQ(n, sizeof(buf), "recv from server");
 
 done:
-	setns_root();
+	if (nstoken)
+		close_netns(nstoken);
 	if (listen_fd >= 0)
 		close(listen_fd);
 	if (accept_fd >= 0)
@@ -405,7 +393,7 @@ static int test_ping(int family, const char *addr)
 {
 	const char *ping = family == AF_INET6 ? "ping6" : "ping";
 
-	SYS("ip netns exec " NS_SRC " %s " PING_ARGS " %s", ping, addr);
+	SYS("ip netns exec " NS_SRC " %s " PING_ARGS " %s > /dev/null", ping, addr);
 	return 0;
 fail:
 	return -1;
@@ -419,19 +407,37 @@ static void test_connectivity(void)
 	test_ping(AF_INET6, IP6_DST);
 }
 
+static int set_forwarding(bool enable)
+{
+	int err;
+
+	err = write_file("/proc/sys/net/ipv4/ip_forward", enable ? "1" : "0");
+	if (!ASSERT_OK(err, "set ipv4.ip_forward=0"))
+		return err;
+
+	err = write_file("/proc/sys/net/ipv6/conf/all/forwarding", enable ? "1" : "0");
+	if (!ASSERT_OK(err, "set ipv6.forwarding=0"))
+		return err;
+
+	return 0;
+}
+
 static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result)
 {
-	struct test_tc_neigh_fib *skel;
+	struct nstoken *nstoken = NULL;
+	struct test_tc_neigh_fib *skel = NULL;
 	int err;
 
+	nstoken = open_netns(NS_FWD);
+	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
+		return;
+
 	skel = test_tc_neigh_fib__open();
 	if (!ASSERT_OK_PTR(skel, "test_tc_neigh_fib__open"))
-		return;
+		goto done;
 
-	if (!ASSERT_OK(test_tc_neigh_fib__load(skel), "test_tc_neigh_fib__load")) {
-		test_tc_neigh_fib__destroy(skel);
-		return;
-	}
+	if (!ASSERT_OK(test_tc_neigh_fib__load(skel), "test_tc_neigh_fib__load"))
+		goto done;
 
 	err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
 	if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
@@ -449,46 +455,37 @@ static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result)
 		goto done;
 
 	/* bpf_fib_lookup() checks if forwarding is enabled */
-	if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd"))
+	if (!ASSERT_OK(set_forwarding(true), "enable forwarding"))
 		goto done;
 
-	err = modify_proc("/proc/sys/net/ipv4/ip_forward", "1");
-	if (!ASSERT_OK(err, "set ipv4.ip_forward"))
-		goto done;
-
-	err = modify_proc("/proc/sys/net/ipv6/conf/all/forwarding", "1");
-	if (!ASSERT_OK(err, "set ipv6.forwarding"))
-		goto done;
-	setns_root();
-
 	test_connectivity();
+
 done:
-	bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
-	bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
-	bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
-	test_tc_neigh_fib__destroy(skel);
-	netns_unload_bpf();
-	setns_root();
-	restore_proc();
+	if (skel)
+		test_tc_neigh_fib__destroy(skel);
+	close_netns(nstoken);
 }
 
 static void test_tc_redirect_neigh(struct netns_setup_result *setup_result)
 {
-	struct test_tc_neigh *skel;
+	struct nstoken *nstoken = NULL;
+	struct test_tc_neigh *skel = NULL;
 	int err;
 
+	nstoken = open_netns(NS_FWD);
+	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
+		return;
+
 	skel = test_tc_neigh__open();
 	if (!ASSERT_OK_PTR(skel, "test_tc_neigh__open"))
-		return;
+		goto done;
 
 	skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd;
 	skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd;
 
 	err = test_tc_neigh__load(skel);
-	if (!ASSERT_OK(err, "test_tc_neigh__load")) {
-		test_tc_neigh__destroy(skel);
-		return;
-	}
+	if (!ASSERT_OK(err, "test_tc_neigh__load"))
+		goto done;
 
 	err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
 	if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
@@ -505,34 +502,37 @@ static void test_tc_redirect_neigh(struct netns_setup_result *setup_result)
 	if (netns_load_bpf())
 		goto done;
 
+	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
+		goto done;
+
 	test_connectivity();
 
 done:
-	bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
-	bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
-	bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
-	test_tc_neigh__destroy(skel);
-	netns_unload_bpf();
-	setns_root();
+	if (skel)
+		test_tc_neigh__destroy(skel);
+	close_netns(nstoken);
 }
 
 static void test_tc_redirect_peer(struct netns_setup_result *setup_result)
 {
+	struct nstoken *nstoken;
 	struct test_tc_peer *skel;
 	int err;
 
+	nstoken = open_netns(NS_FWD);
+	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
+		return;
+
 	skel = test_tc_peer__open();
 	if (!ASSERT_OK_PTR(skel, "test_tc_peer__open"))
-		return;
+		goto done;
 
 	skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd;
 	skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd;
 
 	err = test_tc_peer__load(skel);
-	if (!ASSERT_OK(err, "test_tc_peer__load")) {
-		test_tc_peer__destroy(skel);
-		return;
-	}
+	if (!ASSERT_OK(err, "test_tc_peer__load"))
+		goto done;
 
 	err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
 	if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
@@ -549,41 +549,237 @@ static void test_tc_redirect_peer(struct netns_setup_result *setup_result)
 	if (netns_load_bpf())
 		goto done;
 
+	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
+		goto done;
+
 	test_connectivity();
 
 done:
-	bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
-	bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
-	bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
-	test_tc_peer__destroy(skel);
-	netns_unload_bpf();
-	setns_root();
+	if (skel)
+		test_tc_peer__destroy(skel);
+	close_netns(nstoken);
 }
 
-void test_tc_redirect(void)
+static int tun_open(char *name)
+{
+	struct ifreq ifr;
+	int fd, err;
+
+	fd = open("/dev/net/tun", O_RDWR);
+	if (!ASSERT_GE(fd, 0, "open /dev/net/tun"))
+		return -1;
+
+	memset(&ifr, 0, sizeof(ifr));
+
+	ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
+	if (*name)
+		strncpy(ifr.ifr_name, name, IFNAMSIZ);
+
+	err = ioctl(fd, TUNSETIFF, &ifr);
+	if (!ASSERT_OK(err, "ioctl TUNSETIFF"))
+		goto fail;
+
+	SYS("ip link set dev %s up", name);
+
+	return fd;
+fail:
+	close(fd);
+	return -1;
+}
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+enum {
+	SRC_TO_TARGET = 0,
+	TARGET_TO_SRC = 1,
+};
+
+static int tun_relay_loop(int src_fd, int target_fd)
 {
-	struct netns_setup_result setup_result;
+	fd_set rfds, wfds;
 
-	root_netns_fd = open("/proc/self/ns/net", O_RDONLY);
-	if (!ASSERT_GE(root_netns_fd, 0, "open /proc/self/ns/net"))
+	FD_ZERO(&rfds);
+	FD_ZERO(&wfds);
+
+	for (;;) {
+		char buf[1500];
+		int direction, nread, nwrite;
+
+		FD_SET(src_fd, &rfds);
+		FD_SET(target_fd, &rfds);
+
+		if (select(1 + MAX(src_fd, target_fd), &rfds, NULL, NULL, NULL) < 0) {
+			log_err("select failed");
+			return 1;
+		}
+
+		direction = FD_ISSET(src_fd, &rfds) ? SRC_TO_TARGET : TARGET_TO_SRC;
+
+		nread = read(direction == SRC_TO_TARGET ? src_fd : target_fd, buf, sizeof(buf));
+		if (nread < 0) {
+			log_err("read failed");
+			return 1;
+		}
+
+		nwrite = write(direction == SRC_TO_TARGET ? target_fd : src_fd, buf, nread);
+		if (nwrite != nread) {
+			log_err("write failed");
+			return 1;
+		}
+	}
+}
+
+static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result)
+{
+	struct test_tc_peer *skel = NULL;
+	struct nstoken *nstoken = NULL;
+	int err;
+	int tunnel_pid = -1;
+	int src_fd, target_fd;
+	int ifindex;
+
+	/* Start a L3 TUN/TAP tunnel between the src and dst namespaces.
+	 * This test is using TUN/TAP instead of e.g. IPIP or GRE tunnel as those
+	 * expose the L2 headers encapsulating the IP packet to BPF and hence
+	 * don't have skb in suitable state for this test. Alternative to TUN/TAP
+	 * would be e.g. Wireguard which would appear as a pure L3 device to BPF,
+	 * but that requires much more complicated setup.
+	 */
+	nstoken = open_netns(NS_SRC);
+	if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC))
 		return;
 
-	if (netns_setup_namespaces("add"))
-		goto done;
+	src_fd = tun_open("tun_src");
+	if (!ASSERT_GE(src_fd, 0, "tun_open tun_src"))
+		goto fail;
 
-	if (netns_setup_links_and_routes(&setup_result))
-		goto done;
+	close_netns(nstoken);
+
+	nstoken = open_netns(NS_FWD);
+	if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD))
+		goto fail;
 
-	if (test__start_subtest("tc_redirect_peer"))
-		test_tc_redirect_peer(&setup_result);
+	target_fd = tun_open("tun_fwd");
+	if (!ASSERT_GE(target_fd, 0, "tun_open tun_fwd"))
+		goto fail;
 
-	if (test__start_subtest("tc_redirect_neigh"))
-		test_tc_redirect_neigh(&setup_result);
+	tunnel_pid = fork();
+	if (!ASSERT_GE(tunnel_pid, 0, "fork tun_relay_loop"))
+		goto fail;
 
-	if (test__start_subtest("tc_redirect_neigh_fib"))
-		test_tc_redirect_neigh_fib(&setup_result);
+	if (tunnel_pid == 0)
+		exit(tun_relay_loop(src_fd, target_fd));
 
-done:
-	close(root_netns_fd);
-	netns_setup_namespaces("delete");
+	skel = test_tc_peer__open();
+	if (!ASSERT_OK_PTR(skel, "test_tc_peer__open"))
+		goto fail;
+
+	ifindex = get_ifindex("tun_fwd");
+	if (!ASSERT_GE(ifindex, 0, "get_ifindex tun_fwd"))
+		goto fail;
+
+	skel->rodata->IFINDEX_SRC = ifindex;
+	skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd;
+
+	err = test_tc_peer__load(skel);
+	if (!ASSERT_OK(err, "test_tc_peer__load"))
+		goto fail;
+
+	err = bpf_program__pin(skel->progs.tc_src_l3, SRC_PROG_PIN_FILE);
+	if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
+		goto fail;
+
+	err = bpf_program__pin(skel->progs.tc_dst_l3, DST_PROG_PIN_FILE);
+	if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE))
+		goto fail;
+
+	err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
+	if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE))
+		goto fail;
+
+	/* Load "tc_src_l3" on the tun_fwd interface to redirect packets
+	 * towards dst, "tc_dst_l3" on veth_dst_fwd to redirect packets back
+	 * to tun_fwd, and "tc_chk" on veth_dst_fwd to drop non-redirected packets.
+	 */
+	SYS("tc qdisc add dev tun_fwd clsact");
+	SYS("tc filter add dev tun_fwd ingress bpf da object-pinned "
+	    SRC_PROG_PIN_FILE);
+
+	SYS("tc qdisc add dev veth_dst_fwd clsact");
+	SYS("tc filter add dev veth_dst_fwd ingress bpf da object-pinned "
+	    DST_PROG_PIN_FILE);
+	SYS("tc filter add dev veth_dst_fwd egress bpf da object-pinned "
+	    CHK_PROG_PIN_FILE);
+
+	/* Setup route and neigh tables */
+	SYS("ip -netns " NS_SRC " addr add dev tun_src " IP4_TUN_SRC "/24");
+	SYS("ip -netns " NS_FWD " addr add dev tun_fwd " IP4_TUN_FWD "/24");
+
+	SYS("ip -netns " NS_SRC " addr add dev tun_src " IP6_TUN_SRC "/64 nodad");
+	SYS("ip -netns " NS_FWD " addr add dev tun_fwd " IP6_TUN_FWD "/64 nodad");
+
+	SYS("ip -netns " NS_SRC " route del " IP4_DST "/32 dev veth_src scope global");
+	SYS("ip -netns " NS_SRC " route add " IP4_DST "/32 via " IP4_TUN_FWD
+	    " dev tun_src scope global");
+	SYS("ip -netns " NS_DST " route add " IP4_TUN_SRC "/32 dev veth_dst scope global");
+	SYS("ip -netns " NS_SRC " route del " IP6_DST "/128 dev veth_src scope global");
+	SYS("ip -netns " NS_SRC " route add " IP6_DST "/128 via " IP6_TUN_FWD
+	    " dev tun_src scope global");
+	SYS("ip -netns " NS_DST " route add " IP6_TUN_SRC "/128 dev veth_dst scope global");
+
+	SYS("ip -netns " NS_DST " neigh add " IP4_TUN_SRC " dev veth_dst lladdr " MAC_DST_FWD);
+	SYS("ip -netns " NS_DST " neigh add " IP6_TUN_SRC " dev veth_dst lladdr " MAC_DST_FWD);
+
+	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
+		goto fail;
+
+	test_connectivity();
+
+fail:
+	if (tunnel_pid > 0) {
+		kill(tunnel_pid, SIGTERM);
+		waitpid(tunnel_pid, NULL, 0);
+	}
+	if (src_fd >= 0)
+		close(src_fd);
+	if (target_fd >= 0)
+		close(target_fd);
+	if (skel)
+		test_tc_peer__destroy(skel);
+	if (nstoken)
+		close_netns(nstoken);
+}
+
+#define RUN_TEST(name)                                                                      \
+	({                                                                                  \
+		struct netns_setup_result setup_result;                                     \
+		if (test__start_subtest(#name))                                             \
+			if (ASSERT_OK(netns_setup_namespaces("add"), "setup namespaces")) { \
+				if (ASSERT_OK(netns_setup_links_and_routes(&setup_result),  \
+					      "setup links and routes"))                    \
+					test_ ## name(&setup_result);                       \
+				netns_setup_namespaces("delete");                           \
+			}                                                                   \
+	})
+
+static void *test_tc_redirect_run_tests(void *arg)
+{
+	RUN_TEST(tc_redirect_peer);
+	RUN_TEST(tc_redirect_peer_l3);
+	RUN_TEST(tc_redirect_neigh);
+	RUN_TEST(tc_redirect_neigh_fib);
+	return NULL;
+}
+
+void test_tc_redirect(void)
+{
+	pthread_t test_thread;
+	int err;
+
+	/* Run the tests in their own thread to isolate the namespace changes
+	 * so they do not affect the environment of other tests.
+	 * (specifically needed because of unshare(CLONE_NEWNS) in open_netns())
+	 */
+	err = pthread_create(&test_thread, NULL, &test_tc_redirect_run_tests, NULL);
+	if (ASSERT_OK(err, "pthread_create"))
+		ASSERT_OK(pthread_join(test_thread, NULL), "pthread_join");
 }
diff --git a/tools/testing/selftests/bpf/progs/test_tc_peer.c b/tools/testing/selftests/bpf/progs/test_tc_peer.c
index ef264bced0e6..fe818cd5f010 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_peer.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_peer.c
@@ -5,12 +5,17 @@
 #include <linux/bpf.h>
 #include <linux/stddef.h>
 #include <linux/pkt_cls.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
 
 #include <bpf/bpf_helpers.h>
 
 volatile const __u32 IFINDEX_SRC;
 volatile const __u32 IFINDEX_DST;
 
+static const __u8 src_mac[] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55};
+static const __u8 dst_mac[] = {0x00, 0x22, 0x33, 0x44, 0x55, 0x66};
+
 SEC("classifier/chk_egress")
 int tc_chk(struct __sk_buff *skb)
 {
@@ -29,4 +34,30 @@ int tc_src(struct __sk_buff *skb)
 	return bpf_redirect_peer(IFINDEX_DST, 0);
 }
 
+SEC("classifier/dst_ingress_l3")
+int tc_dst_l3(struct __sk_buff *skb)
+{
+	return bpf_redirect(IFINDEX_SRC, 0);
+}
+
+SEC("classifier/src_ingress_l3")
+int tc_src_l3(struct __sk_buff *skb)
+{
+	__u16 proto = skb->protocol;
+
+	if (bpf_skb_change_head(skb, ETH_HLEN, 0) != 0)
+		return TC_ACT_SHOT;
+
+	if (bpf_skb_store_bytes(skb, 0, &src_mac, ETH_ALEN, 0) != 0)
+		return TC_ACT_SHOT;
+
+	if (bpf_skb_store_bytes(skb, ETH_ALEN, &dst_mac, ETH_ALEN, 0) != 0)
+		return TC_ACT_SHOT;
+
+	if (bpf_skb_store_bytes(skb, ETH_ALEN + ETH_ALEN, &proto, sizeof(__u16), 0) != 0)
+		return TC_ACT_SHOT;
+
+	return bpf_redirect_peer(IFINDEX_DST, 0);
+}
+
 char __license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 5c9d706f61336d9f7f285df64c734af778c70f39 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 25 May 2021 20:35:29 +0200
Subject: bpf: Fix BPF_LSM kconfig symbol dependency

Similar to 6bdacdb48e94 ("bpf: Fix BPF_JIT kconfig symbol dependency"), we
need to detangle the hard BPF_LSM dependency on NET. This was previously
implied by its dependency on BPF_JIT, which itself was dependent on NET (but
without any actual hard dependency code-wise). Given the latter was lifted,
so should the former be, as BPF LSM programs could well exist on net-less
systems. This therefore also fixes a randconfig build error recently reported
by Randy:

  ld: kernel/bpf/bpf_lsm.o: in function `bpf_lsm_func_proto':
  bpf_lsm.c:(.text+0x1a0): undefined reference to `bpf_sk_storage_get_proto'
  ld: bpf_lsm.c:(.text+0x1b8): undefined reference to `bpf_sk_storage_delete_proto'
  [...]

Fixes: b24abcff918a ("bpf, kconfig: Add consolidated menu entry for bpf with core options")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
---
 kernel/bpf/bpf_lsm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 5efb2b24012c..da471bf01b97 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -107,10 +107,12 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_inode_storage_get_proto;
 	case BPF_FUNC_inode_storage_delete:
 		return &bpf_inode_storage_delete_proto;
+#ifdef CONFIG_NET
 	case BPF_FUNC_sk_storage_get:
 		return &bpf_sk_storage_get_proto;
 	case BPF_FUNC_sk_storage_delete:
 		return &bpf_sk_storage_delete_proto;
+#endif /* CONFIG_NET */
 	case BPF_FUNC_spin_lock:
 		return &bpf_spin_lock_proto;
 	case BPF_FUNC_spin_unlock:
-- 
cgit v1.2.3


From 3d0220f6861d713213b015b582e9f21e5b28d2e0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 21 May 2021 10:17:36 +0000
Subject: bpf: Wrap aux data inside bpf_sanitize_info container

Add a container structure struct bpf_sanitize_info which holds
the current aux info, and update call-sites to sanitize_ptr_alu()
to pass it in. This is needed for passing in additional state
later on.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Piotr Krysiuk <piotras@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 09849e43f035..98690f5367f9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6486,15 +6486,19 @@ static bool sanitize_needed(u8 opcode)
 	return opcode == BPF_ADD || opcode == BPF_SUB;
 }
 
+struct bpf_sanitize_info {
+	struct bpf_insn_aux_data aux;
+};
+
 static int sanitize_ptr_alu(struct bpf_verifier_env *env,
 			    struct bpf_insn *insn,
 			    const struct bpf_reg_state *ptr_reg,
 			    const struct bpf_reg_state *off_reg,
 			    struct bpf_reg_state *dst_reg,
-			    struct bpf_insn_aux_data *tmp_aux,
+			    struct bpf_sanitize_info *info,
 			    const bool commit_window)
 {
-	struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux;
+	struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux;
 	struct bpf_verifier_state *vstate = env->cur_state;
 	bool off_is_imm = tnum_is_const(off_reg->var_off);
 	bool off_is_neg = off_reg->smin_value < 0;
@@ -6523,8 +6527,8 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
 		/* In commit phase we narrow the masking window based on
 		 * the observed pointer move after the simulated operation.
 		 */
-		alu_state = tmp_aux->alu_state;
-		alu_limit = abs(tmp_aux->alu_limit - alu_limit);
+		alu_state = info->aux.alu_state;
+		alu_limit = abs(info->aux.alu_limit - alu_limit);
 	} else {
 		alu_state  = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
 		alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
@@ -6685,7 +6689,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	    smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
 	u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
 	    umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
-	struct bpf_insn_aux_data tmp_aux = {};
+	struct bpf_sanitize_info info = {};
 	u8 opcode = BPF_OP(insn->code);
 	u32 dst = insn->dst_reg;
 	int ret;
@@ -6754,7 +6758,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 
 	if (sanitize_needed(opcode)) {
 		ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
-				       &tmp_aux, false);
+				       &info, false);
 		if (ret < 0)
 			return sanitize_err(env, insn, ret, off_reg, dst_reg);
 	}
@@ -6895,7 +6899,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		return -EACCES;
 	if (sanitize_needed(opcode)) {
 		ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
-				       &tmp_aux, true);
+				       &info, true);
 		if (ret < 0)
 			return sanitize_err(env, insn, ret, off_reg, dst_reg);
 	}
-- 
cgit v1.2.3


From bb01a1bba579b4b1c5566af24d95f1767859771e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 21 May 2021 10:19:22 +0000
Subject: bpf: Fix mask direction swap upon off reg sign change

The masking direction as indicated via mask_to_left is supposed to be
calculated once and then used to derive pointer limits. Thus, it needs
to be placed into bpf_sanitize_info so that we can pass it to the
sanitize_ptr_alu() call after the pointer move. Piotr noticed a corner
case where the off reg causes a masking direction change, which then
results in an incorrect final aux->alu_limit.

Fixes: 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask")
Reported-by: Piotr Krysiuk <piotras@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Piotr Krysiuk <piotras@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 98690f5367f9..8574cb60915a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6409,18 +6409,10 @@ enum {
 };
 
 static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
-			      const struct bpf_reg_state *off_reg,
-			      u32 *alu_limit, u8 opcode)
+			      u32 *alu_limit, bool mask_to_left)
 {
-	bool off_is_neg = off_reg->smin_value < 0;
-	bool mask_to_left = (opcode == BPF_ADD &&  off_is_neg) ||
-			    (opcode == BPF_SUB && !off_is_neg);
 	u32 max = 0, ptr_limit = 0;
 
-	if (!tnum_is_const(off_reg->var_off) &&
-	    (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
-		return REASON_BOUNDS;
-
 	switch (ptr_reg->type) {
 	case PTR_TO_STACK:
 		/* Offset 0 is out-of-bounds, but acceptable start for the
@@ -6488,6 +6480,7 @@ static bool sanitize_needed(u8 opcode)
 
 struct bpf_sanitize_info {
 	struct bpf_insn_aux_data aux;
+	bool mask_to_left;
 };
 
 static int sanitize_ptr_alu(struct bpf_verifier_env *env,
@@ -6519,7 +6512,16 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
 	if (vstate->speculative)
 		goto do_sim;
 
-	err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode);
+	if (!commit_window) {
+		if (!tnum_is_const(off_reg->var_off) &&
+		    (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
+			return REASON_BOUNDS;
+
+		info->mask_to_left = (opcode == BPF_ADD &&  off_is_neg) ||
+				     (opcode == BPF_SUB && !off_is_neg);
+	}
+
+	err = retrieve_ptr_limit(ptr_reg, &alu_limit, info->mask_to_left);
 	if (err < 0)
 		return err;
 
-- 
cgit v1.2.3


From a7036191277f9fa68d92f2071ddc38c09b1e5ee5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 4 May 2021 08:58:25 +0000
Subject: bpf: No need to simulate speculative domain for immediates

In 801c6058d14a ("bpf: Fix leakage of uninitialized bpf stack under
speculation") we replaced masking logic with direct loads of immediates
if the register is a known constant. Given that in this case we do not
apply any masking, there is also no reason for the operation to be
truncated under the speculative domain.

Therefore, there is also zero reason for the verifier to branch off and
simulate this case; it only needs to do so for unknown but bounded scalars.
As a side effect, this also enables a few test cases that were previously
rejected due to simulation under zero truncation.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Piotr Krysiuk <piotras@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8574cb60915a..94ba5163d4c5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6545,8 +6545,12 @@ do_sim:
 	/* If we're in commit phase, we're done here given we already
 	 * pushed the truncated dst_reg into the speculative verification
 	 * stack.
+	 *
+	 * Also, when register is a known constant, we rewrite register-based
+	 * operation to immediate-based, and thus do not need masking (and as
+	 * a consequence, do not need to simulate the zero-truncation either).
 	 */
-	if (commit_window)
+	if (commit_window || off_is_imm)
 		return 0;
 
 	/* Simulate and find potential out-of-bounds access under
-- 
cgit v1.2.3


From 1bad6fd52be4ce12d207e2820ceb0f29ab31fc53 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 4 May 2021 08:58:25 +0000
Subject: bpf, selftests: Adjust few selftest result_unpriv outcomes

Given we don't need to simulate the speculative domain for registers with
immediates anymore since the verifier uses direct imm-based rewrites instead
of having to mask, we can also lift a few cases that were previously rejected.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/verifier/stack_ptr.c       | 2 --
 tools/testing/selftests/bpf/verifier/value_ptr_arith.c | 8 --------
 2 files changed, 10 deletions(-)

diff --git a/tools/testing/selftests/bpf/verifier/stack_ptr.c b/tools/testing/selftests/bpf/verifier/stack_ptr.c
index 07eaa04412ae..8ab94d65f3d5 100644
--- a/tools/testing/selftests/bpf/verifier/stack_ptr.c
+++ b/tools/testing/selftests/bpf/verifier/stack_ptr.c
@@ -295,8 +295,6 @@
 	BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
 	BPF_EXIT_INSN(),
 	},
-	.result_unpriv = REJECT,
-	.errstr_unpriv = "invalid write to stack R1 off=0 size=1",
 	.result = ACCEPT,
 	.retval = 42,
 },
diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
index e5913fd3b903..7ae2859d495c 100644
--- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
+++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
@@ -300,8 +300,6 @@
 	},
 	.fixup_map_array_48b = { 3 },
 	.result = ACCEPT,
-	.result_unpriv = REJECT,
-	.errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
 	.retval = 1,
 },
 {
@@ -371,8 +369,6 @@
 	},
 	.fixup_map_array_48b = { 3 },
 	.result = ACCEPT,
-	.result_unpriv = REJECT,
-	.errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
 	.retval = 1,
 },
 {
@@ -472,8 +468,6 @@
 	},
 	.fixup_map_array_48b = { 3 },
 	.result = ACCEPT,
-	.result_unpriv = REJECT,
-	.errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
 	.retval = 1,
 },
 {
@@ -766,8 +760,6 @@
 	},
 	.fixup_map_array_48b = { 3 },
 	.result = ACCEPT,
-	.result_unpriv = REJECT,
-	.errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
 	.retval = 1,
 },
 {
-- 
cgit v1.2.3