From c9883ee9d110703ccb3dfe2ca13e0b7a01351077 Mon Sep 17 00:00:00 2001
From: Xin Liu
Date: Sat, 10 Dec 2022 16:20:45 +0800
Subject: libbpf: Optimized return value in libbpf_strerror when errno is libbpf errno

This is a small improvement in libbpf_strerror. When libbpf_strerror
is used to obtain the system error description and the length of buf
is insufficient, libbpf_strerror returns ERANGE and sets errno to
ERANGE. However, this processing is not performed when the error code
customized by libbpf is being looked up. Make some minor improvements
here: return -ERANGE and set errno to ERANGE when buf is too small to
hold the custom description as well.

Signed-off-by: Xin Liu
Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20221210082045.233697-1-liuxin350@huawei.com
---
 tools/lib/bpf/libbpf_errno.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/tools/lib/bpf/libbpf_errno.c b/tools/lib/bpf/libbpf_errno.c
index 96f67a772a1b..6b180172ec6b 100644
--- a/tools/lib/bpf/libbpf_errno.c
+++ b/tools/lib/bpf/libbpf_errno.c
@@ -39,14 +39,14 @@ static const char *libbpf_strerror_table[NR_ERRNO] = {

 int libbpf_strerror(int err, char *buf, size_t size)
 {
+	int ret;
+
 	if (!buf || !size)
 		return libbpf_err(-EINVAL);

 	err = err > 0 ? err : -err;

 	if (err < __LIBBPF_ERRNO__START) {
-		int ret;
-
 		ret = strerror_r(err, buf, size);
 		buf[size - 1] = '\0';
 		return libbpf_err_errno(ret);
@@ -56,12 +56,20 @@ int libbpf_strerror(int err, char *buf, size_t size)
 		const char *msg;

 		msg = libbpf_strerror_table[ERRNO_OFFSET(err)];
-		snprintf(buf, size, "%s", msg);
+		ret = snprintf(buf, size, "%s", msg);
 		buf[size - 1] = '\0';
+		/* The lengths of buf and msg are positive;
+		 * a negative value may be returned only when
+		 * size exceeds INT_MAX, which is unlikely.
+		 */
+		if (ret >= size)
+			return libbpf_err(-ERANGE);
 		return 0;
 	}

-	snprintf(buf, size, "Unknown libbpf error %d", err);
+	ret = snprintf(buf, size, "Unknown libbpf error %d", err);
 	buf[size - 1] = '\0';
+	if (ret >= size)
+		return libbpf_err(-ERANGE);
 	return libbpf_err(-ENOENT);
 }
--
cgit v1.2.3

From 872aec4b5f635d94111d48ec3c57fbe078d64e7d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 12 Dec 2022 13:15:00 -0800
Subject: libbpf: Fix single-line struct definition output in btf_dump

btf_dump APIs emit unnecessary tabs when emitting a struct/union
definition that fits on a single line. Before this patch we'd get:

struct blah {	};

(note the stray tab emitted before the closing brace). This patch
fixes this and makes sure that we get the more natural:

struct blah {};

Fixes: 44a726c3f23c ("bpftool: Print newline before '}' for struct with padding only fields")
Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20221212211505.558851-2-andrii@kernel.org
---
 tools/lib/bpf/btf_dump.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index deb2bc9a0a7b..69e80ee5f70e 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -959,9 +959,12 @@ static void btf_dump_emit_struct_def(struct btf_dump *d,
 	 * Keep `struct empty {}` on a single line,
 	 * only print newline when there are regular or padding fields.
	 */
-	if (vlen || t->size)
+	if (vlen || t->size) {
 		btf_dump_printf(d, "\n");
-	btf_dump_printf(d, "%s}", pfx(lvl));
+		btf_dump_printf(d, "%s}", pfx(lvl));
+	} else {
+		btf_dump_printf(d, "}");
+	}
 	if (packed)
 		btf_dump_printf(d, " __attribute__((packed))");
 }
--
cgit v1.2.3

From 21a9a1bcccaa4f0337a24d666fe55944abcb171e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 12 Dec 2022 13:15:01 -0800
Subject: libbpf: Handle non-standardly sized enums better in BTF-to-C dumper

Turns out C allows forcing an enum to be 1-byte or 8-byte explicitly
using the mode(byte) or mode(word) attribute, respectively. Linux
sources are using this in some cases. This is important to handle
correctly, as enum size determines the size of corresponding fields
in a struct that use that enum type. And if the enum size is
incorrect, this will lead to an invalid struct layout.

So add mode(byte) and mode(word) attribute support to btf_dump APIs.

Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20221212211505.558851-3-andrii@kernel.org
---
 tools/lib/bpf/btf_dump.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 69e80ee5f70e..234e82334d56 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1076,6 +1077,43 @@ static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id,
 	else
 		btf_dump_emit_enum64_val(d, t, lvl, vlen);
 	btf_dump_printf(d, "\n%s}", pfx(lvl));
+
+	/* special case enums with special sizes */
+	if (t->size == 1) {
+		/* one-byte enums can be forced with mode(byte) attribute */
+		btf_dump_printf(d, " __attribute__((mode(byte)))");
+	} else if (t->size == 8 && d->ptr_sz == 8) {
+		/* enum can be 8-byte sized if one of the enumerator values
+		 * doesn't fit in 32-bit integer, or by adding mode(word)
+		 * attribute (but probably only on 64-bit architectures); do
+		 * our best here to try to satisfy the contract without adding
+		 * unnecessary attributes
+		 */
+		bool needs_word_mode;
+
+		if (btf_is_enum(t)) {
+			/* enum can't represent 64-bit values, so we need word mode */
+			needs_word_mode = true;
+		} else {
+			/* enum64 needs mode(word) if none of its values has
+			 * non-zero upper 32 bits (which means that all values
+			 * fit in 32-bit integers and won't cause the compiler
+			 * to bump the enum to be 64-bit naturally)
+			 */
+			int i;
+
+			needs_word_mode = true;
+			for (i = 0; i < vlen; i++) {
+				if (btf_enum64(t)[i].val_hi32 != 0) {
+					needs_word_mode = false;
+					break;
+				}
+			}
+		}
+		if (needs_word_mode)
+			btf_dump_printf(d, " __attribute__((mode(word)))");
+	}
+
 }

 static void btf_dump_emit_fwd_def(struct btf_dump *d, __u32 id,
--
cgit v1.2.3

From 9d2349740e430b20660457b7c88fa06467457272 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 12 Dec 2022 13:15:02 -0800
Subject: selftests/bpf: Add non-standardly sized enum tests for btf_dump

Add a few custom enum definitions testing the mode(byte) and
mode(word) attributes.
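For a quick sanity check of what these attributes mean on the C side,
here is a minimal standalone sketch (not part of the patch; the enum
and constant names are invented, and the sizes assume a 64-bit Linux
target built with GCC):

    #include <assert.h>

    enum e_small { E_SMALL_1, E_SMALL_2 } __attribute__((mode(byte)));
    enum e_wide  { E_WIDE_1, E_WIDE_2 } __attribute__((mode(word)));

    /* mode(byte) forces 1-byte backing storage; mode(word) forces
     * register-sized (8-byte on 64-bit) backing storage, which is
     * exactly what btf_dump has to reproduce for struct layouts that
     * embed such enums to stay correct.
     */
    static_assert(sizeof(enum e_small) == 1, "1-byte enum");
    static_assert(sizeof(enum e_wide) == 8, "8-byte enum");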
Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20221212211505.558851-4-andrii@kernel.org
---
 .../bpf/progs/btf_dump_test_case_syntax.c | 36 ++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
index 4ee4748133fe..26fffb02ed10 100644
--- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
@@ -25,6 +25,39 @@ typedef enum {
 	H = 2,
 } e3_t;

+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *enum e_byte {
+ *	EBYTE_1 = 0,
+ *	EBYTE_2 = 1,
+ *} __attribute__((mode(byte)));
+ *
+ */
+/* ----- END-EXPECTED-OUTPUT ----- */
+enum e_byte {
+	EBYTE_1,
+	EBYTE_2,
+} __attribute__((mode(byte)));
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *enum e_word {
+ *	EWORD_1 = 0LL,
+ *	EWORD_2 = 1LL,
+ *} __attribute__((mode(word)));
+ *
+ */
+/* ----- END-EXPECTED-OUTPUT ----- */
+enum e_word {
+	EWORD_1,
+	EWORD_2,
+} __attribute__((mode(word))); /* force to use 8-byte backing for this enum */
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+enum e_big {
+	EBIG_1 = 1000000000000ULL,
+};
+
 typedef int int_t;

 typedef volatile const int * volatile const crazy_ptr_t;
@@ -224,6 +257,9 @@ struct root_struct {
 	enum e2 _2;
 	e2_t _2_1;
 	e3_t _2_2;
+	enum e_byte _100;
+	enum e_word _101;
+	enum e_big _102;
 	struct struct_w_typedefs _3;
 	anon_struct_t _7;
 	struct struct_fwd *_8;
--
cgit v1.2.3

From 25a4481b4136af7794e1df2d6c90ed2f354d60ce Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 12 Dec 2022 13:15:03 -0800
Subject: libbpf: Fix btf__align_of() by taking into account field offsets

btf__align_of() is supposed to return the alignment requirement of a
requested BTF type. For STRUCT/UNION it doesn't always return the
correct value, because it calculates alignment based only on field
types. But for packed structs this is not enough: we need to also
check field offsets and struct size. If a field offset isn't aligned
according to the field type's natural alignment, then the struct must
be packed. Similarly, if the struct size is not a multiple of the
struct's natural alignment, then the struct must be packed as well.

This patch fixes this issue precisely by additionally checking these
conditions.
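To see why field offsets matter, consider a minimal sketch (not from
the patch; the struct name is invented, layout as produced by GCC):

    #include <stddef.h>

    struct misaligned {
        char c;
        int x;	/* offset 1 is not a multiple of _Alignof(int) */
    } __attribute__((packed));

    /* Looking at field types alone suggests 4-byte alignment (because
     * of 'int x'), but the layout is really 1-byte aligned. The offset
     * and size checks added by this patch detect exactly that:
     */
    _Static_assert(offsetof(struct misaligned, x) == 1, "packed offset");
    _Static_assert(sizeof(struct misaligned) == 5, "packed size");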
Fixes: 3d208f4ca111 ("libbpf: Expose btf__align_of() API")
Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20221212211505.558851-5-andrii@kernel.org
---
 tools/lib/bpf/btf.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 71e165b09ed5..8cbcef959456 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -688,8 +688,21 @@ int btf__align_of(const struct btf *btf, __u32 id)
 			if (align <= 0)
 				return libbpf_err(align);
 			max_align = max(max_align, align);
+
+			/* if field offset isn't aligned according to field
+			 * type's alignment, then struct must be packed
+			 */
+			if (btf_member_bitfield_size(t, i) == 0 &&
+			    (m->offset % (8 * align)) != 0)
+				return 1;
 		}
+
+		/* if struct/union size isn't a multiple of its alignment,
+		 * then struct must be packed
+		 */
+		if ((t->size % max_align) != 0)
+			return 1;
+
 		return max_align;
 	}
 	default:
--
cgit v1.2.3

From ea2ce1ba99aa6a60c8d8a706e3abadf3de372163 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 12 Dec 2022 13:15:04 -0800
Subject: libbpf: Fix BTF-to-C converter's padding logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Turns out that btf_dump API doesn't handle a bunch of tricky corner
cases, as reported by Per, and further discovered using his testing
Python script ([0]).

This patch revamps btf_dump's padding logic significantly, making it
more correct and also avoiding unnecessary explicit padding where the
compiler would pad naturally. This overall topic turned out to be
very tricky and subtle; there are lots of corner cases. The comments
in the code try to give some clues, but they are meant to be paired
with a good understanding of C alignment and padding rules. Some
experimentation was also needed to figure out subtle things, like
whether `long :0;` means that a struct is now forced to be
long-aligned (it turns out it's not).

Anyways, Per's script, while not completely correct in some known
situations, doesn't show any obvious cases where this logic breaks,
so this is a nice improvement over the previous state of this logic.

Some selftests had to be adjusted to accommodate better use of
natural alignment rules, eliminating some unnecessary padding or
changing it to `type: 0;` alignment markers. Note also that when we
are in between bitfields, we emit an explicit bit size, while
otherwise we use `: 0`; this feels much more natural in practice.

The next patch will add a few more test cases, found through
randomized runs of Per's script.
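As a standalone illustration of the `long: 0;` subtlety mentioned
above (a sketch, not part of the patch; behavior as observed with GCC
on x86-64, other ABIs may differ):

    #include <stddef.h>

    struct padded {
        char a;
        long: 0;	/* pushes 'b' to the next long-aligned boundary... */
        char b;
    };

    /* ...but, as the commit message notes, it does NOT force the
     * struct itself to become long-aligned: its alignment stays 1.
     */
    _Static_assert(offsetof(struct padded, b) == 8, "b at offset 8");
    _Static_assert(_Alignof(struct padded) == 1, "alignment unchanged");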
[0] https://lore.kernel.org/bpf/85f83c333f5355c8ac026f835b18d15060725fcb.camel@ericsson.com/ Reported-by: Per Sundström XP Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20221212211505.558851-6-andrii@kernel.org --- tools/lib/bpf/btf_dump.c | 169 +++++++++++++++------ .../bpf/progs/btf_dump_test_case_bitfields.c | 2 +- .../bpf/progs/btf_dump_test_case_padding.c | 58 ++++--- 3 files changed, 164 insertions(+), 65 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 234e82334d56..d6fd93a57f11 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -830,6 +830,25 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) } } +static int btf_natural_align_of(const struct btf *btf, __u32 id) +{ + const struct btf_type *t = btf__type_by_id(btf, id); + int i, align, vlen; + const struct btf_member *m; + + if (!btf_is_composite(t)) + return btf__align_of(btf, id); + + align = 1; + m = btf_members(t); + vlen = btf_vlen(t); + for (i = 0; i < vlen; i++, m++) { + align = max(align, btf__align_of(btf, m->type)); + } + + return align; +} + static bool btf_is_struct_packed(const struct btf *btf, __u32 id, const struct btf_type *t) { @@ -837,16 +856,16 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id, int align, i, bit_sz; __u16 vlen; - align = btf__align_of(btf, id); - /* size of a non-packed struct has to be a multiple of its alignment*/ - if (align && t->size % align) + align = btf_natural_align_of(btf, id); + /* size of a non-packed struct has to be a multiple of its alignment */ + if (align && (t->size % align) != 0) return true; m = btf_members(t); vlen = btf_vlen(t); /* all non-bitfield fields have to be naturally aligned */ for (i = 0; i < vlen; i++, m++) { - align = btf__align_of(btf, m->type); + align = btf_natural_align_of(btf, m->type); bit_sz = btf_member_bitfield_size(t, i); if (align && bit_sz == 0 && m->offset % (8 * align) != 0) return true; @@ -859,44 +878,97 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id, return false; } -static int chip_away_bits(int total, int at_most) -{ - return total % at_most ? : at_most; -} - static void btf_dump_emit_bit_padding(const struct btf_dump *d, - int cur_off, int m_off, int m_bit_sz, - int align, int lvl) + int cur_off, int next_off, int next_align, + bool in_bitfield, int lvl) { - int off_diff = m_off - cur_off; - int ptr_bits = d->ptr_sz * 8; + const struct { + const char *name; + int bits; + } pads[] = { + {"long", d->ptr_sz * 8}, {"int", 32}, {"short", 16}, {"char", 8} + }; + int new_off, pad_bits, bits, i; + const char *pad_type; + + if (cur_off >= next_off) + return; /* no gap */ + + /* For filling out padding we want to take advantage of + * natural alignment rules to minimize unnecessary explicit + * padding. First, we find the largest type (among long, int, + * short, or char) that can be used to force naturally aligned + * boundary. Once determined, we'll use such type to fill in + * the remaining padding gap. In some cases we can rely on + * compiler filling some gaps, but sometimes we need to force + * alignment to close natural alignment with markers like + * `long: 0` (this is always the case for bitfields). 
Note
+	 * that even if struct itself has, let's say 4-byte alignment
+	 * (i.e., it only uses up to int-aligned types), using `long:
+	 * X;` explicit padding doesn't actually change struct's
+	 * overall alignment requirements, but compiler does take into
+	 * account that type's (long, in this example) natural
+	 * alignment requirements when adding implicit padding. We use
+	 * this fact heavily and don't worry about ruining correct
+	 * struct alignment requirement.
+	 */
+	for (i = 0; i < ARRAY_SIZE(pads); i++) {
+		pad_bits = pads[i].bits;
+		pad_type = pads[i].name;

-	if (off_diff <= 0)
-		/* no gap */
-		return;
-	if (m_bit_sz == 0 && off_diff < align * 8)
-		/* natural padding will take care of a gap */
-		return;
+		new_off = roundup(cur_off, pad_bits);
+		if (new_off <= next_off)
+			break;
+	}

-	while (off_diff > 0) {
-		const char *pad_type;
-		int pad_bits;
-
-		if (ptr_bits > 32 && off_diff > 32) {
-			pad_type = "long";
-			pad_bits = chip_away_bits(off_diff, ptr_bits);
-		} else if (off_diff > 16) {
-			pad_type = "int";
-			pad_bits = chip_away_bits(off_diff, 32);
-		} else if (off_diff > 8) {
-			pad_type = "short";
-			pad_bits = chip_away_bits(off_diff, 16);
-		} else {
-			pad_type = "char";
-			pad_bits = chip_away_bits(off_diff, 8);
+	if (new_off > cur_off && new_off <= next_off) {
+		/* We need explicit `<type>: 0` aligning mark if next
+		 * field is right on alignment offset and its
+		 * alignment requirement is less strict than <type>'s
+		 * alignment (so compiler won't naturally align to the
+		 * offset we expect), or if subsequent `<type>: X`,
+		 * where X > 0, will actually completely fit in the
+		 * remaining hole, making compiler basically ignore
+		 * `<type>: X` completely.
+		 */
+		if (in_bitfield ||
+		    (new_off == next_off && roundup(cur_off, next_align * 8) != new_off) ||
+		    (new_off != next_off && next_off - new_off <= new_off - cur_off))
+			/* but for bitfields we'll emit explicit bit count */
+			btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type,
+					in_bitfield ? new_off - cur_off : 0);
+		cur_off = new_off;
+	}
+
+	/* Now we know we start at naturally aligned offset for a chosen
+	 * padding type (long, int, short, or char), and so the rest is just
+	 * a straightforward filling of remaining padding gap with full
+	 * `<type>: sizeof(<type>);` markers, except for the last one, which
+	 * might need smaller than sizeof(<type>) padding.
+	 */
+	while (cur_off != next_off) {
+		bits = min(next_off - cur_off, pad_bits);
+		if (bits == pad_bits) {
+			btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, pad_bits);
+			cur_off += bits;
+			continue;
+		}
+		/* For the remainder padding that doesn't cover entire
+		 * pad_type bit length, we pick the smallest necessary type.
+		 * This is pure aesthetics, we could have just used `long`,
+		 * but having smallest necessary one communicates better the
+		 * scale of the padding gap.
+		 */
+		for (i = ARRAY_SIZE(pads) - 1; i >= 0; i--) {
+			pad_type = pads[i].name;
+			pad_bits = pads[i].bits;
+			if (pad_bits < bits)
+				continue;
+
+			btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, bits);
+			cur_off += bits;
+			break;
 		}
-		btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, pad_bits);
-		off_diff -= pad_bits;
 	}
 }

@@ -916,9 +988,11 @@ static void btf_dump_emit_struct_def(struct btf_dump *d,
 {
 	const struct btf_member *m = btf_members(t);
 	bool is_struct = btf_is_struct(t);
-	int align, i, packed, off = 0;
+	bool packed, prev_bitfield = false;
+	int align, i, off = 0;
 	__u16 vlen = btf_vlen(t);

+	align = btf__align_of(d->btf, id);
 	packed = is_struct ?
btf_is_struct_packed(d->btf, id, t) : 0; btf_dump_printf(d, "%s%s%s {", @@ -928,33 +1002,36 @@ static void btf_dump_emit_struct_def(struct btf_dump *d, for (i = 0; i < vlen; i++, m++) { const char *fname; - int m_off, m_sz; + int m_off, m_sz, m_align; + bool in_bitfield; fname = btf_name_of(d, m->name_off); m_sz = btf_member_bitfield_size(t, i); m_off = btf_member_bit_offset(t, i); - align = packed ? 1 : btf__align_of(d->btf, m->type); + m_align = packed ? 1 : btf__align_of(d->btf, m->type); + + in_bitfield = prev_bitfield && m_sz != 0; - btf_dump_emit_bit_padding(d, off, m_off, m_sz, align, lvl + 1); + btf_dump_emit_bit_padding(d, off, m_off, m_align, in_bitfield, lvl + 1); btf_dump_printf(d, "\n%s", pfx(lvl + 1)); btf_dump_emit_type_decl(d, m->type, fname, lvl + 1); if (m_sz) { btf_dump_printf(d, ": %d", m_sz); off = m_off + m_sz; + prev_bitfield = true; } else { m_sz = max((__s64)0, btf__resolve_size(d->btf, m->type)); off = m_off + m_sz * 8; + prev_bitfield = false; } + btf_dump_printf(d, ";"); } /* pad at the end, if necessary */ - if (is_struct) { - align = packed ? 1 : btf__align_of(d->btf, id); - btf_dump_emit_bit_padding(d, off, t->size * 8, 0, align, - lvl + 1); - } + if (is_struct) + btf_dump_emit_bit_padding(d, off, t->size * 8, align, false, lvl + 1); /* * Keep `struct empty {}` on a single line, diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c index e5560a656030..e01690618e1e 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c @@ -53,7 +53,7 @@ struct bitfields_only_mixed_types { */ /* ------ END-EXPECTED-OUTPUT ------ */ struct bitfield_mixed_with_others { - long: 4; /* char is enough as a backing field */ + char: 4; /* char is enough as a backing field */ int a: 4; /* 8-bit implicit padding */ short b; /* combined with previous bitfield */ diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c index 7cb522d22a66..6f963d34c45b 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c @@ -19,7 +19,7 @@ struct padded_implicitly { /* *struct padded_explicitly { * int a; - * int: 32; + * long: 0; * int b; *}; * @@ -28,41 +28,28 @@ struct padded_implicitly { struct padded_explicitly { int a; - int: 1; /* algo will explicitly pad with full 32 bits here */ + int: 1; /* algo will emit aligning `long: 0;` here */ int b; }; /* ----- START-EXPECTED-OUTPUT ----- */ -/* - *struct padded_a_lot { - * int a; - * long: 32; - * long: 64; - * long: 64; - * int b; - *}; - * - */ -/* ------ END-EXPECTED-OUTPUT ------ */ - struct padded_a_lot { int a; - /* 32 bit of implicit padding here, which algo will make explicit */ long: 64; long: 64; int b; }; +/* ------ END-EXPECTED-OUTPUT ------ */ + /* ----- START-EXPECTED-OUTPUT ----- */ /* *struct padded_cache_line { * int a; - * long: 32; * long: 64; * long: 64; * long: 64; * int b; - * long: 32; * long: 64; * long: 64; * long: 64; @@ -85,7 +72,7 @@ struct padded_cache_line { *struct zone { * int a; * short b; - * short: 16; + * long: 0; * struct zone_padding __pad__; *}; * @@ -108,6 +95,39 @@ struct padding_wo_named_members { long: 64; }; +struct padding_weird_1 { + int a; + long: 64; + short: 16; + short b; +}; + +/* ------ END-EXPECTED-OUTPUT ------ */ + +/* ----- START-EXPECTED-OUTPUT 
----- */ +/* + *struct padding_weird_2 { + * long: 56; + * char a; + * long: 56; + * char b; + * char: 8; + *}; + * + */ +/* ------ END-EXPECTED-OUTPUT ------ */ +struct padding_weird_2 { + int: 32; /* these paddings will be collapsed into `long: 56;` */ + short: 16; + char: 8; + char a; + int: 32; /* these paddings will be collapsed into `long: 56;` */ + short: 16; + char: 8; + char b; + char: 8; +}; + /* ------ END-EXPECTED-OUTPUT ------ */ int f(struct { @@ -117,6 +137,8 @@ int f(struct { struct padded_cache_line _4; struct zone _5; struct padding_wo_named_members _6; + struct padding_weird_1 _7; + struct padding_weird_2 _8; } *_) { return 0; -- cgit v1.2.3 From b148c8b9b926e257a59c8eb2cd6fa3adfd443254 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Dec 2022 13:15:05 -0800 Subject: selftests/bpf: Add few corner cases to test padding handling of btf_dump Add few hand-crafted cases and few randomized cases found using script from [0] that tests btf_dump's padding logic. [0] https://lore.kernel.org/bpf/85f83c333f5355c8ac026f835b18d15060725fcb.camel@ericsson.com/ Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20221212211505.558851-7-andrii@kernel.org --- .../bpf/progs/btf_dump_test_case_packing.c | 61 +++++++++++- .../bpf/progs/btf_dump_test_case_padding.c | 104 +++++++++++++++++++++ 2 files changed, 164 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c index e304b6204bd9..5c6c62f7ed32 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c @@ -58,7 +58,64 @@ union jump_code_union { } __attribute__((packed)); }; -/*------ END-EXPECTED-OUTPUT ------ */ +/* ----- START-EXPECTED-OUTPUT ----- */ +/* + *struct nested_packed_but_aligned_struct { + * int x1; + * int x2; + *}; + * + *struct outer_implicitly_packed_struct { + * char y1; + * struct nested_packed_but_aligned_struct y2; + *} __attribute__((packed)); + * + */ +/* ------ END-EXPECTED-OUTPUT ------ */ + +struct nested_packed_but_aligned_struct { + int x1; + int x2; +} __attribute__((packed)); + +struct outer_implicitly_packed_struct { + char y1; + struct nested_packed_but_aligned_struct y2; +}; +/* ----- START-EXPECTED-OUTPUT ----- */ +/* + *struct usb_ss_ep_comp_descriptor { + * char: 8; + * char bDescriptorType; + * char bMaxBurst; + * short wBytesPerInterval; + *}; + * + *struct usb_host_endpoint { + * long: 64; + * char: 8; + * struct usb_ss_ep_comp_descriptor ss_ep_comp; + * long: 0; + *} __attribute__((packed)); + * + */ +/* ------ END-EXPECTED-OUTPUT ------ */ + +struct usb_ss_ep_comp_descriptor { + char: 8; + char bDescriptorType; + char bMaxBurst; + int: 0; + short wBytesPerInterval; +} __attribute__((packed)); + +struct usb_host_endpoint { + long: 64; + char: 8; + struct usb_ss_ep_comp_descriptor ss_ep_comp; + long: 0; +}; + int f(struct { struct packed_trailing_space _1; @@ -69,6 +126,8 @@ int f(struct { union union_is_never_packed _6; union union_does_not_need_packing _7; union jump_code_union _8; + struct outer_implicitly_packed_struct _9; + struct usb_host_endpoint _10; } *_) { return 0; diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c index 6f963d34c45b..79276fbe454a 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c +++ 
b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c
@@ -128,6 +128,98 @@ struct padding_weird_2 {
 	char: 8;
 };

+/* ----- START-EXPECTED-OUTPUT ----- */
+struct exact_1byte {
+	char x;
+};
+
+struct padded_1byte {
+	char: 8;
+};
+
+struct exact_2bytes {
+	short x;
+};
+
+struct padded_2bytes {
+	short: 16;
+};
+
+struct exact_4bytes {
+	int x;
+};
+
+struct padded_4bytes {
+	int: 32;
+};
+
+struct exact_8bytes {
+	long x;
+};
+
+struct padded_8bytes {
+	long: 64;
+};
+
+struct ff_periodic_effect {
+	int: 32;
+	short magnitude;
+	long: 0;
+	short phase;
+	long: 0;
+	int: 32;
+	int custom_len;
+	short *custom_data;
+};
+
+struct ib_wc {
+	long: 64;
+	long: 64;
+	int: 32;
+	int byte_len;
+	void *qp;
+	union {} ex;
+	long: 64;
+	int slid;
+	int wc_flags;
+	long: 64;
+	char smac[6];
+	long: 0;
+	char network_hdr_type;
+};
+
+struct acpi_object_method {
+	long: 64;
+	char: 8;
+	char type;
+	short reference_count;
+	char flags;
+	short: 0;
+	char: 8;
+	char sync_level;
+	long: 64;
+	void *node;
+	void *aml_start;
+	union {} dispatch;
+	long: 64;
+	int aml_length;
+};
+
+struct nested_unpacked {
+	int x;
+};
+
+struct nested_packed {
+	struct nested_unpacked a;
+	char c;
+} __attribute__((packed));
+
+struct outer_mixed_but_unpacked {
+	struct nested_packed b1;
+	short a1;
+	struct nested_packed b2;
+};
+
 /* ------ END-EXPECTED-OUTPUT ------ */

 int f(struct {
@@ -139,6 +231,18 @@
 	struct padding_wo_named_members _6;
 	struct padding_weird_1 _7;
 	struct padding_weird_2 _8;
+	struct exact_1byte _100;
+	struct padded_1byte _101;
+	struct exact_2bytes _102;
+	struct padded_2bytes _103;
+	struct exact_4bytes _104;
+	struct padded_4bytes _105;
+	struct exact_8bytes _106;
+	struct padded_8bytes _107;
+	struct ff_periodic_effect _200;
+	struct ib_wc _201;
+	struct acpi_object_method _202;
+	struct outer_mixed_but_unpacked _203;
 } *_)
 {
 	return 0;
--
cgit v1.2.3

From 4fb877aaa179dcdb1676d55216482febaada457e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 15 Dec 2022 10:36:05 -0800
Subject: libbpf: Fix btf_dump's packed struct determination

Fix a bug in btf_dump's logic for determining whether a given struct
type is packed or not. The notion of "natural alignment" is not
needed and is even harmful in this case, so drop it altogether.

The biggest difference in btf_is_struct_packed() compared to its
original implementation is that we don't really use btf__align_of()
to determine the overall alignment of a struct type (because it could
be 1 for both packed and non-packed structs, depending on specific
field definitions), and just use each field's actual alignment to
calculate whether any field requires packing or whether the struct's
overall size necessitates packing.

Add two simple test cases that demonstrate the difference this change
would make.
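For intuition, here is a standalone sketch of the kind of case that
alignment-based detection gets wrong (not from the patch, but modeled
on the test cases this patch adds; layout as produced by GCC on
x86-64):

    #include <stddef.h>

    struct nested_packed {
        int a;
        char b;
    } __attribute__((packed));	/* alignment 1, size 5 */

    struct outer_nonpacked {
        short a;
        struct nested_packed b;	/* offset 2 is fine for alignment 1 */
    };

    /* Every field offset is a multiple of that field's actual
     * alignment, and sizeof is a multiple of the max field alignment
     * (2, thanks to 1 byte of tail padding), so the outer struct must
     * NOT be emitted as packed, even though btf__align_of() reports 1
     * for the nested struct.
     */
    _Static_assert(offsetof(struct outer_nonpacked, b) == 2, "offset");
    _Static_assert(sizeof(struct outer_nonpacked) == 8, "tail-padded size");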
Fixes: ea2ce1ba99aa ("libbpf: Fix BTF-to-C converter's padding logic") Reported-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20221215183605.4149488-1-andrii@kernel.org --- tools/lib/bpf/btf_dump.c | 33 ++++------------------ .../bpf/progs/btf_dump_test_case_packing.c | 19 +++++++++++++ 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index d6fd93a57f11..580985ee5545 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -830,47 +830,26 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) } } -static int btf_natural_align_of(const struct btf *btf, __u32 id) -{ - const struct btf_type *t = btf__type_by_id(btf, id); - int i, align, vlen; - const struct btf_member *m; - - if (!btf_is_composite(t)) - return btf__align_of(btf, id); - - align = 1; - m = btf_members(t); - vlen = btf_vlen(t); - for (i = 0; i < vlen; i++, m++) { - align = max(align, btf__align_of(btf, m->type)); - } - - return align; -} - static bool btf_is_struct_packed(const struct btf *btf, __u32 id, const struct btf_type *t) { const struct btf_member *m; - int align, i, bit_sz; + int max_align = 1, align, i, bit_sz; __u16 vlen; - align = btf_natural_align_of(btf, id); - /* size of a non-packed struct has to be a multiple of its alignment */ - if (align && (t->size % align) != 0) - return true; - m = btf_members(t); vlen = btf_vlen(t); /* all non-bitfield fields have to be naturally aligned */ for (i = 0; i < vlen; i++, m++) { - align = btf_natural_align_of(btf, m->type); + align = btf__align_of(btf, m->type); bit_sz = btf_member_bitfield_size(t, i); if (align && bit_sz == 0 && m->offset % (8 * align) != 0) return true; + max_align = max(align, max_align); } - + /* size of a non-packed struct has to be a multiple of its alignment */ + if (t->size % max_align != 0) + return true; /* * if original struct was marked as packed, but its layout is * naturally aligned, we'll detect that it's not packed diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c index 5c6c62f7ed32..7998f27df7dd 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c @@ -116,6 +116,23 @@ struct usb_host_endpoint { long: 0; }; +/* ----- START-EXPECTED-OUTPUT ----- */ +struct nested_packed_struct { + int a; + char b; +} __attribute__((packed)); + +struct outer_nonpacked_struct { + short a; + struct nested_packed_struct b; +}; + +struct outer_packed_struct { + short a; + struct nested_packed_struct b; +} __attribute__((packed)); + +/* ------ END-EXPECTED-OUTPUT ------ */ int f(struct { struct packed_trailing_space _1; @@ -128,6 +145,8 @@ int f(struct { union jump_code_union _8; struct outer_implicitly_packed_struct _9; struct usb_host_endpoint _10; + struct outer_nonpacked_struct _11; + struct outer_packed_struct _12; } *_) { return 0; -- cgit v1.2.3 From 0e43662e61f2569500ab83b8188c065603530785 Mon Sep 17 00:00:00 2001 From: Shen Jiamin Date: Thu, 15 Dec 2022 12:47:03 +0800 Subject: tools/resolve_btfids: Use pkg-config to locate libelf When libelf was not installed in the standard location, it cannot be located by the current building config. Use pkg-config to help locate libelf in such cases. 
Signed-off-by: Shen Jiamin Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20221215044703.400139-1-shen_jiamin@comp.nus.edu.sg --- tools/bpf/resolve_btfids/Makefile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index 19a3112e271a..f7375a119f54 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -56,13 +56,17 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OU DESTDIR=$(LIBBPF_DESTDIR) prefix= EXTRA_CFLAGS="$(CFLAGS)" \ $(abspath $@) install_headers +LIBELF_FLAGS := $(shell $(HOSTPKG_CONFIG) libelf --cflags 2>/dev/null) +LIBELF_LIBS := $(shell $(HOSTPKG_CONFIG) libelf --libs 2>/dev/null || echo -lelf) + CFLAGS += -g \ -I$(srctree)/tools/include \ -I$(srctree)/tools/include/uapi \ -I$(LIBBPF_INCLUDE) \ - -I$(SUBCMD_SRC) + -I$(SUBCMD_SRC) \ + $(LIBELF_FLAGS) -LIBS = -lelf -lz +LIBS = $(LIBELF_LIBS) -lz export srctree OUTPUT CFLAGS Q include $(srctree)/tools/build/Makefile.include -- cgit v1.2.3 From 78aa1cc9404399a15d2a1205329c6a06236f5378 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 15 Dec 2022 22:44:28 +0100 Subject: bpf: Add struct for bin_args arg in bpf_bprintf_prepare Adding struct bpf_bprintf_data to hold bin_args argument for bpf_bprintf_prepare function. We will add another return argument to bpf_bprintf_prepare and pass the struct to bpf_bprintf_cleanup for proper cleanup in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221215214430.1336195-2-jolsa@kernel.org --- include/linux/bpf.h | 7 ++++++- kernel/bpf/helpers.c | 24 +++++++++++++----------- kernel/bpf/verifier.c | 3 ++- kernel/trace/bpf_trace.c | 34 ++++++++++++++++++++-------------- 4 files changed, 41 insertions(+), 27 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3de24cfb7a3d..cc390ba32e70 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2796,8 +2796,13 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id); #define MAX_BPRINTF_VARARGS 12 +struct bpf_bprintf_data { + u32 *bin_args; + bool get_bin_args; +}; + int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, - u32 **bin_buf, u32 num_args); + u32 num_args, struct bpf_bprintf_data *data); void bpf_bprintf_cleanup(void); /* the implementation of the opaque uapi struct bpf_dynptr */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index af30c6cbd65d..7dbf6bb72cad 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -798,16 +798,16 @@ void bpf_bprintf_cleanup(void) * Returns a negative value if fmt is an invalid format string or 0 otherwise. * * This can be used in two ways: - * - Format string verification only: when bin_args is NULL + * - Format string verification only: when data->get_bin_args is false * - Arguments preparation: in addition to the above verification, it writes in - * bin_args a binary representation of arguments usable by bstr_printf where - * pointers from BPF have been sanitized. + * data->bin_args a binary representation of arguments usable by bstr_printf + * where pointers from BPF have been sanitized. * * In argument preparation mode, if 0 is returned, safe temporary buffers are * allocated and bpf_bprintf_cleanup should be called to free them after use. 
*/ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, - u32 **bin_args, u32 num_args) + u32 num_args, struct bpf_bprintf_data *data) { char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end; size_t sizeof_cur_arg, sizeof_cur_ip; @@ -820,12 +820,12 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, return -EINVAL; fmt_size = fmt_end - fmt; - if (bin_args) { + if (data->get_bin_args) { if (num_args && try_get_fmt_tmp_buf(&tmp_buf)) return -EBUSY; tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN; - *bin_args = (u32 *)tmp_buf; + data->bin_args = (u32 *)tmp_buf; } for (i = 0; i < fmt_size; i++) { @@ -1026,24 +1026,26 @@ out: } BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, - const void *, data, u32, data_len) + const void *, args, u32, data_len) { + struct bpf_bprintf_data data = { + .get_bin_args = true, + }; int err, num_args; - u32 *bin_args; if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 || - (data_len && !data)) + (data_len && !args)) return -EINVAL; num_args = data_len / 8; /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we * can safely give an unbounded size. */ - err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args); + err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data); if (err < 0) return err; - err = bstr_printf(str, str_size, fmt, bin_args); + err = bstr_printf(str, str_size, fmt, data.bin_args); bpf_bprintf_cleanup(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a5255a0dcbb6..faa358b3d5d7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7612,6 +7612,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, struct bpf_reg_state *fmt_reg = ®s[BPF_REG_3]; struct bpf_reg_state *data_len_reg = ®s[BPF_REG_5]; struct bpf_map *fmt_map = fmt_reg->map_ptr; + struct bpf_bprintf_data data = {}; int err, fmt_map_off, num_args; u64 fmt_addr; char *fmt; @@ -7636,7 +7637,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we * can focus on validating the format specifiers. 
*/ - err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args); + err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, num_args, &data); if (err < 0) verbose(env, "Invalid format string\n"); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3bbd3f0c810c..3e849c3a7cc8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -378,18 +378,20 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) { u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; - u32 *bin_args; + struct bpf_bprintf_data data = { + .get_bin_args = true, + }; static char buf[BPF_TRACE_PRINTK_SIZE]; unsigned long flags; int ret; - ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args, - MAX_TRACE_PRINTK_VARARGS); + ret = bpf_bprintf_prepare(fmt, fmt_size, args, + MAX_TRACE_PRINTK_VARARGS, &data); if (ret < 0) return ret; raw_spin_lock_irqsave(&trace_printk_lock, flags); - ret = bstr_printf(buf, sizeof(buf), fmt, bin_args); + ret = bstr_printf(buf, sizeof(buf), fmt, data.bin_args); trace_bpf_trace_printk(buf); raw_spin_unlock_irqrestore(&trace_printk_lock, flags); @@ -427,25 +429,27 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, data, +BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, args, u32, data_len) { + struct bpf_bprintf_data data = { + .get_bin_args = true, + }; static char buf[BPF_TRACE_PRINTK_SIZE]; unsigned long flags; int ret, num_args; - u32 *bin_args; if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || - (data_len && !data)) + (data_len && !args)) return -EINVAL; num_args = data_len / 8; - ret = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args); + ret = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data); if (ret < 0) return ret; raw_spin_lock_irqsave(&trace_printk_lock, flags); - ret = bstr_printf(buf, sizeof(buf), fmt, bin_args); + ret = bstr_printf(buf, sizeof(buf), fmt, data.bin_args); trace_bpf_trace_printk(buf); raw_spin_unlock_irqrestore(&trace_printk_lock, flags); @@ -472,21 +476,23 @@ const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) } BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, - const void *, data, u32, data_len) + const void *, args, u32, data_len) { + struct bpf_bprintf_data data = { + .get_bin_args = true, + }; int err, num_args; - u32 *bin_args; if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || - (data_len && !data)) + (data_len && !args)) return -EINVAL; num_args = data_len / 8; - err = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args); + err = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data); if (err < 0) return err; - seq_bprintf(m, fmt, bin_args); + seq_bprintf(m, fmt, data.bin_args); bpf_bprintf_cleanup(); -- cgit v1.2.3 From f19a4050455aad847fb93f18dc1fe502eb60f989 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 15 Dec 2022 22:44:29 +0100 Subject: bpf: Do cleanup in bpf_bprintf_cleanup only when needed Currently we always cleanup/decrement bpf_bprintf_nest_level variable in bpf_bprintf_cleanup if it's > 0. 
There's possible scenario where this could cause a problem, when bpf_bprintf_prepare does not get bin_args buffer (because num_args is 0) and following bpf_bprintf_cleanup call decrements bpf_bprintf_nest_level variable, like: in task context: bpf_bprintf_prepare(num_args != 0) increments 'bpf_bprintf_nest_level = 1' -> first irq : bpf_bprintf_prepare(num_args == 0) bpf_bprintf_cleanup decrements 'bpf_bprintf_nest_level = 0' -> second irq: bpf_bprintf_prepare(num_args != 0) bpf_bprintf_nest_level = 1 gets same buffer as task context above Adding check to bpf_bprintf_cleanup and doing the real cleanup only if we got bin_args data in the first place. Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221215214430.1336195-3-jolsa@kernel.org --- include/linux/bpf.h | 2 +- kernel/bpf/helpers.c | 16 +++++++++------- kernel/trace/bpf_trace.c | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc390ba32e70..656879385fbf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2803,7 +2803,7 @@ struct bpf_bprintf_data { int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data); -void bpf_bprintf_cleanup(void); +void bpf_bprintf_cleanup(struct bpf_bprintf_data *data); /* the implementation of the opaque uapi struct bpf_dynptr */ struct bpf_dynptr_kern { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 7dbf6bb72cad..9cca02e13f2e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -784,12 +784,14 @@ static int try_get_fmt_tmp_buf(char **tmp_buf) return 0; } -void bpf_bprintf_cleanup(void) +void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) { - if (this_cpu_read(bpf_bprintf_nest_level)) { - this_cpu_dec(bpf_bprintf_nest_level); - preempt_enable(); - } + if (!data->bin_args) + return; + if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) + return; + this_cpu_dec(bpf_bprintf_nest_level); + preempt_enable(); } /* @@ -1021,7 +1023,7 @@ nocopy_fmt: err = 0; out: if (err) - bpf_bprintf_cleanup(); + bpf_bprintf_cleanup(data); return err; } @@ -1047,7 +1049,7 @@ BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, err = bstr_printf(str, str_size, fmt, data.bin_args); - bpf_bprintf_cleanup(); + bpf_bprintf_cleanup(&data); return err + 1; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3e849c3a7cc8..2129f7c68bb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -396,7 +396,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, trace_bpf_trace_printk(buf); raw_spin_unlock_irqrestore(&trace_printk_lock, flags); - bpf_bprintf_cleanup(); + bpf_bprintf_cleanup(&data); return ret; } @@ -454,7 +454,7 @@ BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, args, trace_bpf_trace_printk(buf); raw_spin_unlock_irqrestore(&trace_printk_lock, flags); - bpf_bprintf_cleanup(); + bpf_bprintf_cleanup(&data); return ret; } @@ -494,7 +494,7 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, seq_bprintf(m, fmt, data.bin_args); - bpf_bprintf_cleanup(); + bpf_bprintf_cleanup(&data); return seq_has_overflowed(m) ? 
-EOVERFLOW : 0; } -- cgit v1.2.3 From e2bb9e01d589f7fa82573aedd2765ff9b277816a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 15 Dec 2022 22:44:30 +0100 Subject: bpf: Remove trace_printk_lock Both bpf_trace_printk and bpf_trace_vprintk helpers use static buffer guarded with trace_printk_lock spin lock. The spin lock contention causes issues with bpf programs attached to contention_begin tracepoint [1][2]. Andrii suggested we could get rid of the contention by using trylock, but we could actually get rid of the spinlock completely by using percpu buffers the same way as for bin_args in bpf_bprintf_prepare function. Adding new return 'buf' argument to struct bpf_bprintf_data and making bpf_bprintf_prepare to return also the buffer for printk helpers. [1] https://lore.kernel.org/bpf/CACkBjsakT_yWxnSWr4r-0TpPvbKm9-OBmVUhJb7hV3hY8fdCkw@mail.gmail.com/ [2] https://lore.kernel.org/bpf/CACkBjsaCsTovQHFfkqJKto6S4Z8d02ud1D7MPESrHa1cVNNTrw@mail.gmail.com/ Reported-by: Hao Sun Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221215214430.1336195-4-jolsa@kernel.org --- include/linux/bpf.h | 3 +++ kernel/bpf/helpers.c | 31 +++++++++++++++++++------------ kernel/trace/bpf_trace.c | 20 ++++++-------------- 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 656879385fbf..5fec2d1be6d7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2795,10 +2795,13 @@ struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); #define MAX_BPRINTF_VARARGS 12 +#define MAX_BPRINTF_BUF 1024 struct bpf_bprintf_data { u32 *bin_args; + char *buf; bool get_bin_args; + bool get_buf; }; int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9cca02e13f2e..23aa8cf8fd1a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -756,19 +756,20 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, /* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary * arguments representation. 
*/ -#define MAX_BPRINTF_BUF_LEN 512 +#define MAX_BPRINTF_BIN_ARGS 512 /* Support executing three nested bprintf helper calls on a given CPU */ #define MAX_BPRINTF_NEST_LEVEL 3 struct bpf_bprintf_buffers { - char tmp_bufs[MAX_BPRINTF_NEST_LEVEL][MAX_BPRINTF_BUF_LEN]; + char bin_args[MAX_BPRINTF_BIN_ARGS]; + char buf[MAX_BPRINTF_BUF]; }; -static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs); + +static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); -static int try_get_fmt_tmp_buf(char **tmp_buf) +static int try_get_buffers(struct bpf_bprintf_buffers **bufs) { - struct bpf_bprintf_buffers *bufs; int nest_level; preempt_disable(); @@ -778,15 +779,14 @@ static int try_get_fmt_tmp_buf(char **tmp_buf) preempt_enable(); return -EBUSY; } - bufs = this_cpu_ptr(&bpf_bprintf_bufs); - *tmp_buf = bufs->tmp_bufs[nest_level - 1]; + *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]); return 0; } void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) { - if (!data->bin_args) + if (!data->bin_args && !data->buf) return; if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; @@ -811,7 +811,9 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data) { + bool get_buffers = (data->get_bin_args && num_args) || data->get_buf; char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end; + struct bpf_bprintf_buffers *buffers = NULL; size_t sizeof_cur_arg, sizeof_cur_ip; int err, i, num_spec = 0; u64 cur_arg; @@ -822,14 +824,19 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, return -EINVAL; fmt_size = fmt_end - fmt; - if (data->get_bin_args) { - if (num_args && try_get_fmt_tmp_buf(&tmp_buf)) - return -EBUSY; + if (get_buffers && try_get_buffers(&buffers)) + return -EBUSY; - tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN; + if (data->get_bin_args) { + if (num_args) + tmp_buf = buffers->bin_args; + tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS; data->bin_args = (u32 *)tmp_buf; } + if (data->get_buf) + data->buf = buffers->buf; + for (i = 0; i < fmt_size; i++) { if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { err = -EINVAL; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2129f7c68bb5..23ce498bca97 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -369,8 +369,6 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) return &bpf_probe_write_user_proto; } -static DEFINE_RAW_SPINLOCK(trace_printk_lock); - #define MAX_TRACE_PRINTK_VARARGS 3 #define BPF_TRACE_PRINTK_SIZE 1024 @@ -380,9 +378,8 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; struct bpf_bprintf_data data = { .get_bin_args = true, + .get_buf = true, }; - static char buf[BPF_TRACE_PRINTK_SIZE]; - unsigned long flags; int ret; ret = bpf_bprintf_prepare(fmt, fmt_size, args, @@ -390,11 +387,9 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, if (ret < 0) return ret; - raw_spin_lock_irqsave(&trace_printk_lock, flags); - ret = bstr_printf(buf, sizeof(buf), fmt, data.bin_args); + ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args); - trace_bpf_trace_printk(buf); - raw_spin_unlock_irqrestore(&trace_printk_lock, flags); + trace_bpf_trace_printk(data.buf); bpf_bprintf_cleanup(&data); @@ -434,9 +429,8 @@ 
BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, args, { struct bpf_bprintf_data data = { .get_bin_args = true, + .get_buf = true, }; - static char buf[BPF_TRACE_PRINTK_SIZE]; - unsigned long flags; int ret, num_args; if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || @@ -448,11 +442,9 @@ BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, args, if (ret < 0) return ret; - raw_spin_lock_irqsave(&trace_printk_lock, flags); - ret = bstr_printf(buf, sizeof(buf), fmt, data.bin_args); + ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args); - trace_bpf_trace_printk(buf); - raw_spin_unlock_irqrestore(&trace_printk_lock, flags); + trace_bpf_trace_printk(data.buf); bpf_bprintf_cleanup(&data); -- cgit v1.2.3 From 13aa2a92840dd49a64362ddd1e4d16084a4b9e0f Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sun, 18 Dec 2022 15:14:51 +0900 Subject: samples/bpf: remove unused function with test_lru_dist Currently, compiling samples/bpf with LLVM warns about the unused function with test_lru_dist. ./samples/bpf/test_lru_dist.c:45:19: warning: unused function 'list_empty' [-Wunused-function] static inline int list_empty(const struct list_head *head) ^ 1 warning generated. This commit resolve this compiler warning by removing the abandoned function. Signed-off-by: Daniel T. Lee Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20221218061453.6287-2-danieltimlee@gmail.com Signed-off-by: Martin KaFai Lau --- samples/bpf/test_lru_dist.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c index 5efb91763d65..1c161276d57b 100644 --- a/samples/bpf/test_lru_dist.c +++ b/samples/bpf/test_lru_dist.c @@ -42,11 +42,6 @@ static inline void INIT_LIST_HEAD(struct list_head *list) list->prev = list; } -static inline int list_empty(const struct list_head *head) -{ - return head->next == head; -} - static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) -- cgit v1.2.3 From 71135b77aac75769f6dae81ac4725405f1015d22 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sun, 18 Dec 2022 15:14:52 +0900 Subject: samples/bpf: replace meaningless counter with tracex4 Currently, compiling samples/bpf with LLVM warns about the unused but set variable with tracex4_user. ./samples/bpf/tracex4_user.c:54:14: warning: variable 'i' set but not used [-Wunused-but-set-variable] int map_fd, i, j = 0; ^ 1 warning generated. This commit resolve this compiler warning by replacing the meaningless counter. Signed-off-by: Daniel T. Lee Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20221218061453.6287-3-danieltimlee@gmail.com Signed-off-by: Martin KaFai Lau --- samples/bpf/tracex4_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c index 227b05a0bc88..dee8f0a091ba 100644 --- a/samples/bpf/tracex4_user.c +++ b/samples/bpf/tracex4_user.c @@ -51,7 +51,7 @@ int main(int ac, char **argv) struct bpf_program *prog; struct bpf_object *obj; char filename[256]; - int map_fd, i, j = 0; + int map_fd, j = 0; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); obj = bpf_object__open_file(filename, NULL); @@ -82,7 +82,7 @@ int main(int ac, char **argv) j++; } - for (i = 0; ; i++) { + while (1) { print_old_objects(map_fd); sleep(1); } -- cgit v1.2.3 From 68be98e0f4191abc64c871e3e461f275715eaf67 Mon Sep 17 00:00:00 2001 From: "Daniel T. 
Lee" Date: Sun, 18 Dec 2022 15:14:53 +0900 Subject: samples/bpf: fix uninitialized warning with test_current_task_under_cgroup Currently, compiling samples/bpf with LLVM warns about the uninitialized use of variable with test_current_task_under_cgroup. ./samples/bpf/test_current_task_under_cgroup_user.c:57:6: warning: variable 'cg2' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized] if (setup_cgroup_environment()) ^~~~~~~~~~~~~~~~~~~~~~~~~~ ./samples/bpf/test_current_task_under_cgroup_user.c:106:8: note: uninitialized use occurs here close(cg2); ^~~ ./samples/bpf/test_current_task_under_cgroup_user.c:57:2: note: remove the 'if' if its condition is always false if (setup_cgroup_environment()) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./samples/bpf/test_current_task_under_cgroup_user.c:19:9: note: initialize the variable 'cg2' to silence this warning int cg2, idx = 0, rc = 1; ^ = 0 1 warning generated. This commit resolve this compiler warning by pre-initialize the variable with error for safeguard. Signed-off-by: Daniel T. Lee Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20221218061453.6287-4-danieltimlee@gmail.com Signed-off-by: Martin KaFai Lau --- samples/bpf/test_current_task_under_cgroup_user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c index ac251a417f45..6fb25906835e 100644 --- a/samples/bpf/test_current_task_under_cgroup_user.c +++ b/samples/bpf/test_current_task_under_cgroup_user.c @@ -14,9 +14,9 @@ int main(int argc, char **argv) { pid_t remote_pid, local_pid = getpid(); + int cg2 = -1, idx = 0, rc = 1; struct bpf_link *link = NULL; struct bpf_program *prog; - int cg2, idx = 0, rc = 1; struct bpf_object *obj; char filename[256]; int map_fd[2]; @@ -103,7 +103,9 @@ int main(int argc, char **argv) rc = 0; err: - close(cg2); + if (cg2 != -1) + close(cg2); + cleanup_cgroup_environment(); cleanup: -- cgit v1.2.3 From e26aa600ba6a62fe84659f1df497a381bab6d07e Mon Sep 17 00:00:00 2001 From: Christian Ehrig Date: Sun, 18 Dec 2022 06:17:31 +0100 Subject: bpf: Add flag BPF_F_NO_TUNNEL_KEY to bpf_skb_set_tunnel_key() This patch allows to remove TUNNEL_KEY from the tunnel flags bitmap when using bpf_skb_set_tunnel_key by providing a BPF_F_NO_TUNNEL_KEY flag. On egress, the resulting tunnel header will not contain a tunnel key if the protocol and implementation supports it. At the moment bpf_tunnel_key wants a user to specify a numeric tunnel key. This will wrap the inner packet into a tunnel header with the key bit and value set accordingly. This is problematic when using a tunnel protocol that supports optional tunnel keys and a receiving tunnel device that is not expecting packets with the key bit set. The receiver won't decapsulate and drop the packet. RFC 2890 and RFC 2784 GRE tunnels are examples where this flag is useful. It allows for generating packets, that can be decapsulated by a GRE tunnel device not operating in collect metadata mode or not expecting the key bit set. 
Signed-off-by: Christian Ehrig Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20221218051734.31411-1-cehrig@cloudflare.com --- include/uapi/linux/bpf.h | 4 ++++ net/core/filter.c | 5 ++++- tools/include/uapi/linux/bpf.h | 4 ++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 464ca3f01fe7..bc1a3d232ae4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2001,6 +2001,9 @@ union bpf_attr { * sending the packet. This flag was added for GRE * encapsulation, but might be used with other protocols * as well in the future. + * **BPF_F_NO_TUNNEL_KEY** + * Add a flag to tunnel metadata indicating that no tunnel + * key should be set in the resulting tunnel header. * * Here is a typical usage on the transmit path: * @@ -5764,6 +5767,7 @@ enum { BPF_F_ZERO_CSUM_TX = (1ULL << 1), BPF_F_DONT_FRAGMENT = (1ULL << 2), BPF_F_SEQ_NUMBER = (1ULL << 3), + BPF_F_NO_TUNNEL_KEY = (1ULL << 4), }; /* BPF_FUNC_skb_get_tunnel_key flags. */ diff --git a/net/core/filter.c b/net/core/filter.c index 929358677183..c746e4d77214 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4615,7 +4615,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | - BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER))) + BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER | + BPF_F_NO_TUNNEL_KEY))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { @@ -4653,6 +4654,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags &= ~TUNNEL_CSUM; if (flags & BPF_F_SEQ_NUMBER) info->key.tun_flags |= TUNNEL_SEQ; + if (flags & BPF_F_NO_TUNNEL_KEY) + info->key.tun_flags &= ~TUNNEL_KEY; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 464ca3f01fe7..bc1a3d232ae4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2001,6 +2001,9 @@ union bpf_attr { * sending the packet. This flag was added for GRE * encapsulation, but might be used with other protocols * as well in the future. + * **BPF_F_NO_TUNNEL_KEY** + * Add a flag to tunnel metadata indicating that no tunnel + * key should be set in the resulting tunnel header. * * Here is a typical usage on the transmit path: * @@ -5764,6 +5767,7 @@ enum { BPF_F_ZERO_CSUM_TX = (1ULL << 1), BPF_F_DONT_FRAGMENT = (1ULL << 2), BPF_F_SEQ_NUMBER = (1ULL << 3), + BPF_F_NO_TUNNEL_KEY = (1ULL << 4), }; /* BPF_FUNC_skb_get_tunnel_key flags. */ -- cgit v1.2.3 From ac6e45e05857464a1e347c50da9917141f1fbb80 Mon Sep 17 00:00:00 2001 From: Christian Ehrig Date: Sun, 18 Dec 2022 06:17:32 +0100 Subject: selftests/bpf: Add BPF_F_NO_TUNNEL_KEY test This patch adds a selftest simulating a GRE sender and receiver using tunnel headers without tunnel keys. It validates if packets encapsulated using BPF_F_NO_TUNNEL_KEY are decapsulated by a GRE receiver not configured with tunnel keys. 
Signed-off-by: Christian Ehrig Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20221218051734.31411-2-cehrig@cloudflare.com --- .../testing/selftests/bpf/progs/test_tunnel_kern.c | 21 ++++++++++++ tools/testing/selftests/bpf/test_tunnel.sh | 40 ++++++++++++++++++++-- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 98af55f0bcd3..508da4a23c4f 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -81,6 +81,27 @@ int gre_set_tunnel(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("tc") +int gre_set_tunnel_no_key(struct __sk_buff *skb) +{ + int ret; + struct bpf_tunnel_key key; + + __builtin_memset(&key, 0x0, sizeof(key)); + key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ + key.tunnel_ttl = 64; + + ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), + BPF_F_ZERO_CSUM_TX | BPF_F_SEQ_NUMBER | + BPF_F_NO_TUNNEL_KEY); + if (ret < 0) { + log_err(ret); + return TC_ACT_SHOT; + } + + return TC_ACT_OK; +} + SEC("tc") int gre_get_tunnel(struct __sk_buff *skb) { diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index 2eaedc1d9ed3..06857b689c11 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -66,15 +66,20 @@ config_device() add_gre_tunnel() { + tun_key= + if [ -n "$1" ]; then + tun_key="key $1" + fi + # at_ns0 namespace ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq key 2 \ + ip link add dev $DEV_NS type $TYPE seq $tun_key \ local 172.16.1.100 remote 172.16.1.200 ip netns exec at_ns0 ip link set dev $DEV_NS up ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 # root namespace - ip link add dev $DEV type $TYPE key 2 external + ip link add dev $DEV type $TYPE $tun_key external ip link set dev $DEV up ip addr add dev $DEV 10.1.1.200/24 } @@ -238,7 +243,7 @@ test_gre() check $TYPE config_device - add_gre_tunnel + add_gre_tunnel 2 attach_bpf $DEV gre_set_tunnel gre_get_tunnel ping $PING_ARG 10.1.1.100 check_err $? @@ -253,6 +258,30 @@ test_gre() echo -e ${GREEN}"PASS: $TYPE"${NC} } +test_gre_no_tunnel_key() +{ + TYPE=gre + DEV_NS=gre00 + DEV=gre11 + ret=0 + + check $TYPE + config_device + add_gre_tunnel + attach_bpf $DEV gre_set_tunnel_no_key gre_get_tunnel + ping $PING_ARG 10.1.1.100 + check_err $? + ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 + check_err $? + cleanup + + if [ $ret -ne 0 ]; then + echo -e ${RED}"FAIL: $TYPE"${NC} + return 1 + fi + echo -e ${GREEN}"PASS: $TYPE"${NC} +} + test_ip6gre() { TYPE=ip6gre @@ -589,6 +618,7 @@ cleanup() ip link del ipip6tnl11 2> /dev/null ip link del ip6ip6tnl11 2> /dev/null ip link del gretap11 2> /dev/null + ip link del gre11 2> /dev/null ip link del ip6gre11 2> /dev/null ip link del ip6gretap11 2> /dev/null ip link del geneve11 2> /dev/null @@ -641,6 +671,10 @@ bpf_tunnel_test() test_gre errors=$(( $errors + $? )) + echo "Testing GRE tunnel (without tunnel keys)..." + test_gre_no_tunnel_key + errors=$(( $errors + $? )) + echo "Testing IP6GRE tunnel..." test_ip6gre errors=$(( $errors + $? 
)) -- cgit v1.2.3 From cafb92d719e8d8d8491b4d493fad331c2bc80f94 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 19 Dec 2022 09:55:12 +0000 Subject: docs: BPF_MAP_TYPE_SOCK[MAP|HASH] Add documentation for BPF_MAP_TYPE_SOCK[MAP|HASH] including kernel versions introduced, usage and examples. Signed-off-by: Maryam Tahhan Signed-off-by: Andrii Nakryiko Reviewed-by: Bagas Sanjaya Acked-by: John Fastabend Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20221219095512.26534-1-mtahhan@redhat.com --- Documentation/bpf/map_sockmap.rst | 498 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 498 insertions(+) create mode 100644 Documentation/bpf/map_sockmap.rst diff --git a/Documentation/bpf/map_sockmap.rst b/Documentation/bpf/map_sockmap.rst new file mode 100644 index 000000000000..cc92047c6630 --- /dev/null +++ b/Documentation/bpf/map_sockmap.rst @@ -0,0 +1,498 @@ +.. SPDX-License-Identifier: GPL-2.0-only +.. Copyright Red Hat + +============================================== +BPF_MAP_TYPE_SOCKMAP and BPF_MAP_TYPE_SOCKHASH +============================================== + +.. note:: + - ``BPF_MAP_TYPE_SOCKMAP`` was introduced in kernel version 4.14 + - ``BPF_MAP_TYPE_SOCKHASH`` was introduced in kernel version 4.18 + +``BPF_MAP_TYPE_SOCKMAP`` and ``BPF_MAP_TYPE_SOCKHASH`` maps can be used to +redirect skbs between sockets or to apply policy at the socket level based on +the result of a BPF (verdict) program with the help of the BPF helpers +``bpf_sk_redirect_map()``, ``bpf_sk_redirect_hash()``, +``bpf_msg_redirect_map()`` and ``bpf_msg_redirect_hash()``. + +``BPF_MAP_TYPE_SOCKMAP`` is backed by an array that uses an integer key as the +index to look up a reference to a ``struct sock``. The map values are socket +descriptors. Similarly, ``BPF_MAP_TYPE_SOCKHASH`` is a hash backed BPF map that +holds references to sockets via their socket descriptors. + +.. note:: + The value type is either __u32 or __u64; the latter (__u64) is to support + returning socket cookies to userspace. Returning the ``struct sock *`` that + the map holds to user-space is neither safe nor useful. + +These maps may have BPF programs attached to them, specifically a parser program +and a verdict program. The parser program determines how much data has been +parsed and therefore how much data needs to be queued to come to a verdict. The +verdict program is essentially the redirect program and can return a verdict +of ``__SK_DROP``, ``__SK_PASS``, or ``__SK_REDIRECT``. + +When a socket is inserted into one of these maps, its socket callbacks are +replaced and a ``struct sk_psock`` is attached to it. Additionally, this +``sk_psock`` inherits the programs that are attached to the map. + +A sock object may be in multiple maps, but can only inherit a single +parse or verdict program. If adding a sock object to a map would result +in having multiple parser programs the update will return an EBUSY error. + +The supported programs to attach to these maps are: + +.. code-block:: c + + struct sk_psock_progs { + struct bpf_prog *msg_parser; + struct bpf_prog *stream_parser; + struct bpf_prog *stream_verdict; + struct bpf_prog *skb_verdict; + }; + +.. note:: + Users are not allowed to attach ``stream_verdict`` and ``skb_verdict`` + programs to the same map. + +The attach types for the map programs are: + +- ``msg_parser`` program - ``BPF_SK_MSG_VERDICT``. +- ``stream_parser`` program - ``BPF_SK_SKB_STREAM_PARSER``. +- ``stream_verdict`` program - ``BPF_SK_SKB_STREAM_VERDICT``. 
+- ``skb_verdict`` program - ``BPF_SK_SKB_VERDICT``. + +There are additional helpers available to use with the parser and verdict +programs: ``bpf_msg_apply_bytes()`` and ``bpf_msg_cork_bytes()``. With +``bpf_msg_apply_bytes()`` BPF programs can tell the infrastructure how many +bytes the given verdict should apply to. The helper ``bpf_msg_cork_bytes()`` +handles a different case where a BPF program cannot reach a verdict on a msg +until it receives more bytes AND the program doesn't want to forward the packet +until it is known to be good. + +Finally, the helpers ``bpf_msg_pull_data()`` and ``bpf_msg_push_data()`` are +available to ``BPF_PROG_TYPE_SK_MSG`` BPF programs to pull in data and set the +start and end pointers to given values or to add metadata to the ``struct +sk_msg_buff *msg``. + +All these helpers will be described in more detail below. + +Usage +===== +Kernel BPF +---------- +bpf_msg_redirect_map() +^^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + +This helper is used in programs implementing policies at the socket level. If +the message ``msg`` is allowed to pass (i.e., if the verdict BPF program +returns ``SK_PASS``), redirect it to the socket referenced by ``map`` (of type +``BPF_MAP_TYPE_SOCKMAP``) at index ``key``. Both ingress and egress interfaces +can be used for redirection. The ``BPF_F_INGRESS`` value in ``flags`` is used +to select the ingress path otherwise the egress path is selected. This is the +only flag supported for now. + +Returns ``SK_PASS`` on success, or ``SK_DROP`` on error. + +bpf_sk_redirect_map() +^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key u64 flags) + +Redirect the packet to the socket referenced by ``map`` (of type +``BPF_MAP_TYPE_SOCKMAP``) at index ``key``. Both ingress and egress interfaces +can be used for redirection. The ``BPF_F_INGRESS`` value in ``flags`` is used +to select the ingress path otherwise the egress path is selected. This is the +only flag supported for now. + +Returns ``SK_PASS`` on success, or ``SK_DROP`` on error. + +bpf_map_lookup_elem() +^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) + +socket entries of type ``struct sock *`` can be retrieved using the +``bpf_map_lookup_elem()`` helper. + +bpf_sock_map_update() +^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + +Add an entry to, or update a ``map`` referencing sockets. The ``skops`` is used +as a new value for the entry associated to ``key``. The ``flags`` argument can +be one of the following: + +- ``BPF_ANY``: Create a new element or update an existing element. +- ``BPF_NOEXIST``: Create a new element only if it did not exist. +- ``BPF_EXIST``: Update an existing element. + +If the ``map`` has BPF programs (parser and verdict), those will be inherited +by the socket being added. If the socket is already attached to BPF programs, +this results in an error. + +Returns 0 on success, or a negative error in case of failure. + +bpf_sock_hash_update() +^^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + +Add an entry to, or update a sockhash ``map`` referencing sockets. The ``skops`` +is used as a new value for the entry associated to ``key``. 
+ +The ``flags`` argument can be one of the following: + +- ``BPF_ANY``: Create a new element or update an existing element. +- ``BPF_NOEXIST``: Create a new element only if it did not exist. +- ``BPF_EXIST``: Update an existing element. + +If the ``map`` has BPF programs (parser and verdict), those will be inherited +by the socket being added. If the socket is already attached to BPF programs, +this results in an error. + +Returns 0 on success, or a negative error in case of failure. + +bpf_msg_redirect_hash() +^^^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + +This helper is used in programs implementing policies at the socket level. If +the message ``msg`` is allowed to pass (i.e., if the verdict BPF program returns +``SK_PASS``), redirect it to the socket referenced by ``map`` (of type +``BPF_MAP_TYPE_SOCKHASH``) using hash ``key``. Both ingress and egress +interfaces can be used for redirection. The ``BPF_F_INGRESS`` value in +``flags`` is used to select the ingress path otherwise the egress path is +selected. This is the only flag supported for now. + +Returns ``SK_PASS`` on success, or ``SK_DROP`` on error. + +bpf_sk_redirect_hash() +^^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + +This helper is used in programs implementing policies at the skb socket level. +If the sk_buff ``skb`` is allowed to pass (i.e., if the verdict BPF program +returns ``SK_PASS``), redirect it to the socket referenced by ``map`` (of type +``BPF_MAP_TYPE_SOCKHASH``) using hash ``key``. Both ingress and egress +interfaces can be used for redirection. The ``BPF_F_INGRESS`` value in +``flags`` is used to select the ingress path otherwise the egress path is +selected. This is the only flag supported for now. + +Returns ``SK_PASS`` on success, or ``SK_DROP`` on error. + +bpf_msg_apply_bytes() +^^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + +For socket policies, apply the verdict of the BPF program to the next (number +of ``bytes``) of message ``msg``. For example, this helper can be used in the +following cases: + +- A single ``sendmsg()`` or ``sendfile()`` system call contains multiple + logical messages that the BPF program is supposed to read and for which it + should apply a verdict. +- A BPF program only cares to read the first ``bytes`` of a ``msg``. If the + message has a large payload, then setting up and calling the BPF program + repeatedly for all bytes, even though the verdict is already known, would + create unnecessary overhead. + +Returns 0 + +bpf_msg_cork_bytes() +^^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + +For socket policies, prevent the execution of the verdict BPF program for +message ``msg`` until the number of ``bytes`` have been accumulated. + +This can be used when one needs a specific number of bytes before a verdict can +be assigned, even if the data spans multiple ``sendmsg()`` or ``sendfile()`` +calls. + +Returns 0 + +bpf_msg_pull_data() +^^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + +For socket policies, pull in non-linear data from user space for ``msg`` and set +pointers ``msg->data`` and ``msg->data_end`` to ``start`` and ``end`` bytes +offsets into ``msg``, respectively. 
+ +If a program of type ``BPF_PROG_TYPE_SK_MSG`` is run on a ``msg`` it can only +parse data that the (``data``, ``data_end``) pointers have already consumed. +For ``sendmsg()`` hooks this is likely the first scatterlist element. But for +calls relying on the ``sendpage`` handler (e.g., ``sendfile()``) this will be +the range (**0**, **0**) because the data is shared with user space and by +default the objective is to avoid allowing user space to modify data while (or +after) BPF verdict is being decided. This helper can be used to pull in data +and to set the start and end pointers to given values. Data will be copied if +necessary (i.e., if data was not linear and if start and end pointers do not +point to the same chunk). + +A call to this helper is susceptible to change the underlying packet buffer. +Therefore, at load time, all checks on pointers previously done by the verifier +are invalidated and must be performed again, if the helper is used in +combination with direct packet access. + +All values for ``flags`` are reserved for future usage, and must be left at +zero. + +Returns 0 on success, or a negative error in case of failure. + +bpf_map_lookup_elem() +^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: c + + void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) + +Look up a socket entry in the sockmap or sockhash map. + +Returns the socket entry associated to ``key``, or NULL if no entry was found. + +bpf_map_update_elem() +^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + +Add or update a socket entry in a sockmap or sockhash. + +The flags argument can be one of the following: + +- BPF_ANY: Create a new element or update an existing element. +- BPF_NOEXIST: Create a new element only if it did not exist. +- BPF_EXIST: Update an existing element. + +Returns 0 on success, or a negative error in case of failure. + +bpf_map_delete_elem() +^^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + long bpf_map_delete_elem(struct bpf_map *map, const void *key) + +Delete a socket entry from a sockmap or a sockhash. + +Returns 0 on success, or a negative error in case of failure. + +User space +---------- +bpf_map_update_elem() +^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags) + +Sockmap entries can be added or updated using the ``bpf_map_update_elem()`` +function. The ``key`` parameter is the index value of the sockmap array. And the +``value`` parameter is the FD value of that socket. + +Under the hood, the sockmap update function uses the socket FD value to +retrieve the associated socket and its attached psock. + +The flags argument can be one of the following: + +- BPF_ANY: Create a new element or update an existing element. +- BPF_NOEXIST: Create a new element only if it did not exist. +- BPF_EXIST: Update an existing element. + +bpf_map_lookup_elem() +^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + int bpf_map_lookup_elem(int fd, const void *key, void *value) + +Sockmap entries can be retrieved using the ``bpf_map_lookup_elem()`` function. + +.. note:: + The entry returned is a socket cookie rather than a socket itself. + +bpf_map_delete_elem() +^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: c + + int bpf_map_delete_elem(int fd, const void *key) + +Sockmap entries can be deleted using the ``bpf_map_delete_elem()`` +function. + +Returns 0 on success, or negative error in case of failure. 
+ +Examples +======== + +Kernel BPF +---------- +Several examples of the use of sockmap APIs can be found in: + +- `tools/testing/selftests/bpf/progs/test_sockmap_kern.h`_ +- `tools/testing/selftests/bpf/progs/sockmap_parse_prog.c`_ +- `tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c`_ +- `tools/testing/selftests/bpf/progs/test_sockmap_listen.c`_ +- `tools/testing/selftests/bpf/progs/test_sockmap_update.c`_ + +The following code snippet shows how to declare a sockmap. + +.. code-block:: c + + struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); + } sock_map_rx SEC(".maps"); + +The following code snippet shows a sample parser program. + +.. code-block:: c + + SEC("sk_skb/stream_parser") + int bpf_prog_parser(struct __sk_buff *skb) + { + return skb->len; + } + +The following code snippet shows a simple verdict program that interacts with a +sockmap to redirect traffic to another socket based on the local port. + +.. code-block:: c + + SEC("sk_skb/stream_verdict") + int bpf_prog_verdict(struct __sk_buff *skb) + { + __u32 lport = skb->local_port; + __u32 idx = 0; + + if (lport == 10000) + return bpf_sk_redirect_map(skb, &sock_map_rx, idx, 0); + + return SK_PASS; + } + +The following code snippet shows how to declare a sockhash map. + +.. code-block:: c + + struct socket_key { + __u32 src_ip; + __u32 dst_ip; + __u32 src_port; + __u32 dst_port; + }; + + struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(max_entries, 1); + __type(key, struct socket_key); + __type(value, __u64); + } sock_hash_rx SEC(".maps"); + +The following code snippet shows a simple verdict program that interacts with a +sockhash to redirect traffic to another socket based on a hash of some of the +skb parameters. + +.. code-block:: c + + static inline + void extract_socket_key(struct __sk_buff *skb, struct socket_key *key) + { + key->src_ip = skb->remote_ip4; + key->dst_ip = skb->local_ip4; + key->src_port = skb->remote_port >> 16; + key->dst_port = (bpf_htonl(skb->local_port)) >> 16; + } + + SEC("sk_skb/stream_verdict") + int bpf_prog_verdict(struct __sk_buff *skb) + { + struct socket_key key; + + extract_socket_key(skb, &key); + + return bpf_sk_redirect_hash(skb, &sock_hash_rx, &key, 0); + } + +User space +---------- +Several examples of the use of sockmap APIs can be found in: + +- `tools/testing/selftests/bpf/prog_tests/sockmap_basic.c`_ +- `tools/testing/selftests/bpf/test_sockmap.c`_ +- `tools/testing/selftests/bpf/test_maps.c`_ + +The following code sample shows how to create a sockmap, attach a parser and +verdict program, as well as add a socket entry. + +.. 
code-block:: c
+
+   int create_sample_sockmap(int sock, int parse_prog_fd, int verdict_prog_fd)
+   {
+	int index = 0;
+	int map, err;
+
+	map = bpf_map_create(BPF_MAP_TYPE_SOCKMAP, NULL, sizeof(int), sizeof(int), 1, NULL);
+	if (map < 0) {
+		fprintf(stderr, "Failed to create sockmap: %s\n", strerror(errno));
+		return -1;
+	}
+
+	err = bpf_prog_attach(parse_prog_fd, map, BPF_SK_SKB_STREAM_PARSER, 0);
+	if (err) {
+		fprintf(stderr, "Failed to attach_parser_prog_to_map: %s\n", strerror(errno));
+		goto out;
+	}
+
+	err = bpf_prog_attach(verdict_prog_fd, map, BPF_SK_SKB_STREAM_VERDICT, 0);
+	if (err) {
+		fprintf(stderr, "Failed to attach_verdict_prog_to_map: %s\n", strerror(errno));
+		goto out;
+	}
+
+	err = bpf_map_update_elem(map, &index, &sock, BPF_NOEXIST);
+	if (err) {
+		fprintf(stderr, "Failed to update sockmap: %s\n", strerror(errno));
+		goto out;
+	}
+
+   out:
+	close(map);
+	return err;
+   }
+
+References
+===========
+
+- https://github.com/jrfastab/linux-kernel-xdp/commit/c89fd73cb9d2d7f3c716c3e00836f07b1aeb261f
+- https://lwn.net/Articles/731133/
+- http://vger.kernel.org/lpc_net2018_talks/ktls_bpf_paper.pdf
+- https://lwn.net/Articles/748628/
+- https://lore.kernel.org/bpf/20200218171023.844439-7-jakub@cloudflare.com/
+
+.. _`tools/testing/selftests/bpf/progs/test_sockmap_kern.h`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/progs/test_sockmap_kern.h
+.. _`tools/testing/selftests/bpf/progs/sockmap_parse_prog.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c
+.. _`tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c
+.. _`tools/testing/selftests/bpf/prog_tests/sockmap_basic.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+.. _`tools/testing/selftests/bpf/test_sockmap.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/test_sockmap.c
+.. _`tools/testing/selftests/bpf/test_maps.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/test_maps.c
+.. _`tools/testing/selftests/bpf/progs/test_sockmap_listen.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
+.. _`tools/testing/selftests/bpf/progs/test_sockmap_update.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/progs/test_sockmap_update.c
--
cgit v1.2.3


From 1520e8466d683b6c5e1aa53aa65165ebd5da46cf Mon Sep 17 00:00:00 2001
From: Khem Raj
Date: Mon, 19 Dec 2022 11:15:26 -0800
Subject: libbpf: Fix build warning on ref_ctr_off for 32-bit architectures

Clang warns on 32-bit ARM on this comparison:

libbpf.c:10497:18: error: result of comparison of constant 4294967296 with expression of type 'size_t' (aka 'unsigned int') is always false [-Werror,-Wtautological-constant-out-of-range-compare]
   if (ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS))
       ~~~~~~~~~~~ ^  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Typecast ref_ctr_off to __u64 in the check conditional; the comparison
remains false on 32-bit anyway.
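The warning can be reproduced in isolation (a minimal sketch assuming an
ILP32 target, where size_t is 32 bits wide, and using the 32-bit constant
behind PERF_UPROBE_REF_CTR_OFFSET_BITS directly; illustrative only):

	#include <stdint.h>
	#include <stddef.h>

	int check(size_t ref_ctr_off)
	{
		/* size_t holds at most 2^32 - 1 here, so this comparison can
		 * never be true and clang flags it as tautological:
		 */
		if (ref_ctr_off >= (1ULL << 32))
			return -1;

		/* After the cast the compared expression is 64 bits wide, so
		 * the constant is in range of its type and the warning goes
		 * away; the branch is still never taken on 32-bit:
		 */
		if ((uint64_t)ref_ctr_off >= (1ULL << 32))
			return -1;

		return 0;
	}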
Signed-off-by: Khem Raj
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20221219191526.296264-1-raj.khem@gmail.com
---
 tools/lib/bpf/libbpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 2a82f49ce16f..a5c67a3c93c5 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -9903,7 +9903,7 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name,
 	char errmsg[STRERR_BUFSIZE];
 	int type, pfd;

-	if (ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS))
+	if ((__u64)ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS))
 		return -EINVAL;

 	memset(&attr, 0, attr_sz);
--
cgit v1.2.3


From e6b4e1d759d3bfb7cb84c87cc8f1858da7db8dea Mon Sep 17 00:00:00 2001
From: Changbin Du
Date: Sun, 18 Dec 2022 06:35:08 +0800
Subject: libbpf: Show error info about missing ".BTF" section

Show the real problem instead of just saying "No such file or directory".
It now prints the info below:

libbpf: failed to find '.BTF' ELF section in /home/changbin/work/linux/vmlinux
Error: failed to load BTF from /home/changbin/work/linux/vmlinux: No such file or directory

Signed-off-by: Changbin Du
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20221217223509.88254-2-changbin.du@gmail.com
---
 tools/lib/bpf/btf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 8cbcef959456..b03250007018 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -1003,6 +1003,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf,

 	err = 0;

 	if (!btf_data) {
+		pr_warn("failed to find '%s' ELF section in %s\n", BTF_ELF_SEC, path);
 		err = -ENOENT;
 		goto done;
 	}
--
cgit v1.2.3


From e7f0d5cdd023d8fa53d9ca541b9a55f0eb45618c Mon Sep 17 00:00:00 2001
From: Changbin Du
Date: Sun, 18 Dec 2022 06:35:09 +0800
Subject: bpf: makefiles: Do not generate empty vmlinux.h

Remove the empty vmlinux.h if bpftool fails to dump BTF info. An empty
vmlinux.h can hide the real error when reading the output of make. This
is done by adding the .DELETE_ON_ERROR special target in the related
makefiles.
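.DELETE_ON_ERROR is a standard GNU make special target; its effect on a
vmlinux.h-style rule can be sketched as follows (an illustrative rule, not
the exact one from these Makefiles):

	# Without the special target, a failing bpftool invocation leaves an
	# empty vmlinux.h behind, which later rules happily consume.
	vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL)
		$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@

	# With it, make deletes the partially written target whenever its
	# recipe exits with a non-zero status:
	.DELETE_ON_ERROR: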
Signed-off-by: Changbin Du Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20221217223509.88254-3-changbin.du@gmail.com --- tools/bpf/bpftool/Makefile | 3 +++ tools/testing/selftests/bpf/Makefile | 3 +++ 2 files changed, 6 insertions(+) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 787b857d3fb5..313fd1b09189 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -289,3 +289,6 @@ FORCE: .PHONY: all FORCE bootstrap clean install-bin install uninstall .PHONY: doc doc-clean doc-install doc-uninstall .DEFAULT_GOAL := all + +# Delete partially updated (corrupted) files on error +.DELETE_ON_ERROR: diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c22c43bbee19..205e8c3c346a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -626,3 +626,6 @@ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ liburandom_read.so) .PHONY: docs docs-clean + +# Delete partially updated (corrupted) files on error +.DELETE_ON_ERROR: -- cgit v1.2.3 From 552d42a356ebf78df9d2f4b73e077d2459966fac Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 20 Dec 2022 17:30:36 -0800 Subject: bpf: Reduce smap->elem_size 'struct bpf_local_storage_elem' has an unused 56 byte padding at the end due to struct's cache-line alignment requirement. This padding space is overlapped by storage value contents, so if we use sizeof() to calculate the total size, we overinflate it by 56 bytes. Use offsetof() instead to calculate more exact memory use. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221221013036.3427431-1-martin.lau@linux.dev --- kernel/bpf/bpf_local_storage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b39a46e8fb08..373c3c2c75bc 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -580,8 +580,8 @@ static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_att raw_spin_lock_init(&smap->buckets[i].lock); } - smap->elem_size = - sizeof(struct bpf_local_storage_elem) + attr->value_size; + smap->elem_size = offsetof(struct bpf_local_storage_elem, + sdata.data[attr->value_size]); return smap; } -- cgit v1.2.3 From 4ec38eda85b9d07942a1cc7a29bba8aa7c810780 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 21 Dec 2022 10:00:49 -0800 Subject: libbpf: start v1.2 development cycle Bump current version for new development cycle to v1.2. 
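Applications can confirm at runtime which libbpf they are linked against;
a small sketch using the existing version APIs (these predate this patch):

	#include <stdio.h>
	#include <bpf/libbpf.h>

	int main(void)
	{
		/* prints "libbpf 1.2" when linked against this cycle */
		printf("libbpf %u.%u\n",
		       libbpf_major_version(), libbpf_minor_version());
		return 0;
	}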
Signed-off-by: Andrii Nakryiko Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20221221180049.853365-1-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/libbpf.map | 3 +++ tools/lib/bpf/libbpf_version.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 71bf5691a689..11c36a3c1a9f 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -382,3 +382,6 @@ LIBBPF_1.1.0 { user_ring_buffer__reserve_blocking; user_ring_buffer__submit; } LIBBPF_1.0.0; + +LIBBPF_1.2.0 { +} LIBBPF_1.1.0; diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h index e944f5bce728..1fd2eeac5cfc 100644 --- a/tools/lib/bpf/libbpf_version.h +++ b/tools/lib/bpf/libbpf_version.h @@ -4,6 +4,6 @@ #define __LIBBPF_VERSION_H #define LIBBPF_MAJOR_VERSION 1 -#define LIBBPF_MINOR_VERSION 1 +#define LIBBPF_MINOR_VERSION 2 #endif /* __LIBBPF_VERSION_H */ -- cgit v1.2.3 From 90156f4bfa21ada9943b538e27385152407605b1 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 16 Dec 2022 13:43:18 -0800 Subject: bpf, x86: Improve PROBE_MEM runtime load check This patch rewrites the runtime PROBE_MEM check insns emitted by the BPF JIT in order to ensure load safety. The changes in the patch fix two issues with the previous logic and more generally improve size of emitted code. Paragraphs between this one and "FIX 1" below explain the purpose of the runtime check and examine the current implementation. When a load is marked PROBE_MEM - e.g. due to PTR_UNTRUSTED access - the address being loaded from is not necessarily valid. The BPF jit sets up exception handlers for each such load which catch page faults and 0 out the destination register. Arbitrary register-relative loads can escape this exception handling mechanism. Specifically, a load like dst_reg = *(src_reg + off) will not trigger BPF exception handling if (src_reg + off) is outside of kernel address space, resulting in an uncaught page fault. A concrete example of such behavior is a program like: struct result { char space[40]; long a; }; /* if err, returns ERR_PTR(-EINVAL) */ struct result *ptr = get_ptr_maybe_err(); long x = ptr->a; If get_ptr_maybe_err returns ERR_PTR(-EINVAL) and the result isn't checked for err, 'result' will be (u64)-EINVAL, a number close to U64_MAX. The ptr->a load will be > U64_MAX and will wrap over to a small positive u64, which will be in userspace and thus not covered by BPF exception handling mechanism. In order to prevent such loads from occurring, the BPF jit emits some instructions which do runtime checking of (src_reg + off) and skip the actual load if it's out of range. As an example, here are instructions emitted for a %rdi = *(%rdi + 0x10) PROBE_MEM load: 72: movabs $0x800000000010,%r11 --| 7c: cmp %r11,%rdi |- 72 - 7f: Check 1 7f: jb 0x000000000000008d --| 81: mov %rdi,%r11 -----| 84: add $0x0000000000000010,%r11 |- 81-8b: Check 2 8b: jnc 0x0000000000000091 -----| 8d: xor %edi,%edi ---- 0 out dest 8f: jmp 0x0000000000000095 91: mov 0x10(%rdi),%rdi ---- Actual load 95: The JIT considers kernel address space to start at MAX_TASK_SIZE + PAGE_SIZE. 
Determining whether a load will be outside of kernel address space should
be a simple check:

  (src_reg + off) >= MAX_TASK_SIZE + PAGE_SIZE

But because there is only one spare register when the checking logic is
emitted, this logic is split into two checks:

  Check 1: src_reg >= (MAX_TASK_SIZE + PAGE_SIZE - off)
  Check 2: src_reg + off doesn't wrap over U64_MAX and result in small pos u64

Emitted insns implementing Checks 1 and 2 are annotated in the above
example. Check 1 can be done with a single spare register since the
source reg by definition is the left-hand side of the inequality. Since
adding 'off' to both sides of Check 1's inequality results in the
original inequality we want, it's equivalent to testing that inequality.
Except in the case where src_reg + off wraps past U64_MAX, which is why
Check 2 needs to actually add src_reg + off if Check 1 passes - again
using the single spare reg.

FIX 1: The Check 1 inequality listed above is not what current code is
doing. Current code is a bit more pessimistic, instead checking:

  src_reg >= (MAX_TASK_SIZE + PAGE_SIZE + abs(off))

The 0x800000000010 in the above example is from this current check. If
Check 1 were corrected to use the correct right-hand side, the value
would be 0x7ffffffffff0. This patch changes the checking logic more
broadly (FIX 2 below will elaborate), fixing this issue as a side effect
of the rewrite. Regardless, it's important to understand why Check 1
should've been doing MAX_TASK_SIZE + PAGE_SIZE - off before proceeding.

FIX 2: Current code relies on a 'jnc' to determine whether the
src_reg + off addition wrapped over. For negative offsets this logic is
incorrect. Consider Check 2 insns emitted when off = -0x10:

  81: mov    %rdi,%r11
  84: add    0xfffffffffffffff0,%r11
  8b: jnc    0x0000000000000091

2's complement representation of -0x10 is a large positive u64. Any value
of src_reg that passes Check 1 will result in the carry flag being set
after the (src_reg + off) addition. So a load with any negative offset
will always fail Check 2 at runtime and never do the actual load.

This patch fixes the negative offset issue by rewriting both checks in
order to not rely on the carry flag. The rewrite takes advantage of the
fact that, while we only have one scratch reg to hold arbitrary values,
we know the offset at JIT time. Thus we can use src_reg as a temporary
scratch reg to hold src_reg + offset, since we can return it to its
original value by later subtracting offset. As a result we can directly
check the original inequality we care about:

  (src_reg + off) >= MAX_TASK_SIZE + PAGE_SIZE

For a load like %rdi = *(%rsi + -0x10), this results in emitted code:

  43: movabs $0x800000000000,%r11
  4d: add    $0xfffffffffffffff0,%rsi --- src_reg += off
  54: cmp    %r11,%rsi                --- Check original inequality
  57: jae    0x000000000000005d
  59: xor    %edi,%edi
  5b: jmp    0x0000000000000061
  5d: mov    0x0(%rsi),%rdi           --- Actual load
  61: sub    $0xfffffffffffffff0,%rsi --- src_reg -= off

Note that the actual load is always done with offset 0, since previous
insns have already done src_reg += off. Regardless of whether the new
check succeeds or fails, insn 61 is always executed, returning src_reg to
its original value.

Because the goal of these checks is to ensure that the loaded-from
address will be protected by the BPF exception handler, the new check can
safely ignore any wrapover from insn 4d. If such a wrapped-over address
passes insn 54 + 57's cmp-and-jmp, it will have such protection, so the
load can proceed.

IMPROVEMENTS: The above improved logic is 8 insns vs the original logic's
9, and has 1 fewer jmp.
The number of checking insns can be further improved in common scenarios: If src_reg == dst_reg, the actual load insn will clobber src_reg, so there's no original src_reg state for the sub insn immediately following the load to restore, so it can be omitted. In fact, it must be omitted since it would incorrectly subtract from the result of the load if it wasn't. So for src_reg == dst_reg, JIT emits these insns: 3c: movabs $0x800000000000,%r11 46: add $0xfffffffffffffff0,%rdi 4d: cmp %r11,%rdi 50: jae 0x0000000000000056 52: xor %edi,%edi 54: jmp 0x000000000000005a 56: mov 0x0(%rdi),%rdi 5a: The only difference from larger example being the omitted sub, which would've been insn 5a in this example. If offset == 0, we can similarly omit the sub as in previous case, since there's nothing added to subtract. For the same reason we can omit the addition as well, resulting in JIT emitting these insns: 46: movabs $0x800000000000,%r11 4d: cmp %r11,%rdi 50: jae 0x0000000000000056 52: xor %edi,%edi 54: jmp 0x000000000000005a 56: mov 0x0(%rdi),%rdi 5a: Although the above example also has src_reg == dst_reg, the same offset == 0 optimization is valid to apply if src_reg != dst_reg. To summarize the improvements in emitted insn count for the check-and-load: BEFORE: 8 check insns, 3 jmps AFTER (general case): 7 check insns, 2 jmps (12.5% fewer insn, 33% jmp) AFTER (src == dst): 6 check insns, 2 jmps (25% fewer insn) AFTER (offset == 0): 5 check insns, 2 jmps (37.5% fewer insn) (Above counts don't include the 1 load insn, just checking around it) Based on BPF bytecode + JITted x86 insn I saw while experimenting with these improvements, I expect the src_reg == dst_reg case to occur most often, followed by offset == 0, then the general case. Signed-off-by: Dave Marchevsky Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221216214319.3408356-1-davemarchevsky@fb.com --- arch/x86/net/bpf_jit_comp.c | 70 +++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 36ffe67ad6e5..e3e2b57e4e13 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -992,6 +992,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image u8 b2 = 0, b3 = 0; u8 *start_of_ldx; s64 jmp_offset; + s16 insn_off; u8 jmp_cond; u8 *func; int nops; @@ -1358,57 +1359,52 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_DW: case BPF_LDX | BPF_PROBE_MEM | BPF_DW: + insn_off = insn->off; + if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { - /* Though the verifier prevents negative insn->off in BPF_PROBE_MEM - * add abs(insn->off) to the limit to make sure that negative - * offset won't be an issue. - * insn->off is s16, so it won't affect valid pointers. + /* Conservatively check that src_reg + insn->off is a kernel address: + * src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE + * src_reg is used as scratch for src_reg += insn->off and restored + * after emit_ldx if necessary */ - u64 limit = TASK_SIZE_MAX + PAGE_SIZE + abs(insn->off); - u8 *end_of_jmp1, *end_of_jmp2; - /* Conservatively check that src_reg + insn->off is a kernel address: - * 1. src_reg + insn->off >= limit - * 2. src_reg + insn->off doesn't become small positive. - * Cannot do src_reg + insn->off >= limit in one branch, - * since it needs two spare registers, but JIT has only one. 
+ u64 limit = TASK_SIZE_MAX + PAGE_SIZE; + u8 *end_of_jmp; + + /* At end of these emitted checks, insn->off will have been added + * to src_reg, so no need to do relative load with insn->off offset */ + insn_off = 0; /* movabsq r11, limit */ EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG)); EMIT((u32)limit, 4); EMIT(limit >> 32, 4); + + if (insn->off) { + /* add src_reg, insn->off */ + maybe_emit_1mod(&prog, src_reg, true); + EMIT2_off32(0x81, add_1reg(0xC0, src_reg), insn->off); + } + /* cmp src_reg, r11 */ maybe_emit_mod(&prog, src_reg, AUX_REG, true); EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG)); - /* if unsigned '<' goto end_of_jmp2 */ - EMIT2(X86_JB, 0); - end_of_jmp1 = prog; - - /* mov r11, src_reg */ - emit_mov_reg(&prog, true, AUX_REG, src_reg); - /* add r11, insn->off */ - maybe_emit_1mod(&prog, AUX_REG, true); - EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off); - /* jmp if not carry to start_of_ldx - * Otherwise ERR_PTR(-EINVAL) + 128 will be the user addr - * that has to be rejected. - */ - EMIT2(0x73 /* JNC */, 0); - end_of_jmp2 = prog; + + /* if unsigned '>=', goto load */ + EMIT2(X86_JAE, 0); + end_of_jmp = prog; /* xor dst_reg, dst_reg */ emit_mov_imm32(&prog, false, dst_reg, 0); /* jmp byte_after_ldx */ EMIT2(0xEB, 0); - /* populate jmp_offset for JB above to jump to xor dst_reg */ - end_of_jmp1[-1] = end_of_jmp2 - end_of_jmp1; - /* populate jmp_offset for JNC above to jump to start_of_ldx */ + /* populate jmp_offset for JAE above to jump to start_of_ldx */ start_of_ldx = prog; - end_of_jmp2[-1] = start_of_ldx - end_of_jmp2; + end_of_jmp[-1] = start_of_ldx - end_of_jmp; } - emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off); if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { struct exception_table_entry *ex; u8 *_insn = image + proglen + (start_of_ldx - temp); @@ -1417,6 +1413,18 @@ st: if (is_imm8(insn->off)) /* populate jmp_offset for JMP above */ start_of_ldx[-1] = prog - start_of_ldx; + if (insn->off && src_reg != dst_reg) { + /* sub src_reg, insn->off + * Restore src_reg after "add src_reg, insn->off" in prev + * if statement. But if src_reg == dst_reg, emit_ldx + * above already clobbered src_reg, so no need to restore. + * If add src_reg, insn->off was unnecessary, no need to + * restore either. + */ + maybe_emit_1mod(&prog, src_reg, true); + EMIT2_off32(0x81, add_1reg(0xE8, src_reg), insn->off); + } + if (!bpf_prog->aux->extable) break; -- cgit v1.2.3 From 59fe41b5255f7b8a19a3347ec4b03fd830d6f4aa Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 16 Dec 2022 13:43:19 -0800 Subject: selftests/bpf: Add verifier test exercising jit PROBE_MEM logic This patch adds a test exercising logic that was fixed / improved in the previous patch in the series, as well as general sanity checking for jit's PROBE_MEM logic which should've been unaffected by the previous patch. 
The added verifier test does the following:

  * Acquire a referenced kptr to struct prog_test_ref_kfunc using an
    existing net/bpf/test_run.c kfunc
  * Helper returns ptr to a specific prog_test_ref_kfunc whose first two
    fields - both ints - have been prepopulated w/ vals 42 and 108,
    respectively
  * kptr_xchg the acquired ptr into an arraymap
  * Do a direct map_value load of the just-added ptr
  * Goal of all this setup is to get an unreferenced kptr pointing to
    struct with ints of known value, which is the result of this step
  * Using unreferenced kptr obtained in previous step, do loads of
    prog_test_ref_kfunc.a (offset 0) and .b (offset 4)
  * Then incr the kptr by 8 and load prog_test_ref_kfunc.a again (this
    time at offset -8)
  * Add all the loaded ints together and return

Before the PROBE_MEM fixes in the previous patch, the loads at offset 0
and 4 would succeed, while the load at offset -8 would incorrectly fail
the runtime check emitted by the JIT and 0 out the dst reg as a result.
This is confirmed by a retval of 150 for this test before the previous
patch - since the second .a read is 0'd out - and a retval of 192 with
the fixed logic.

The test exercises the two optimizations to the fixed logic added in the
last patch as well:

  * First load, with insn "r8 = *(u32 *)(r9 + 0)" exercises "insn->off is
    0, no need to add / sub from src_reg" optimization
  * Third load, with insn "r9 = *(u32 *)(r9 - 8)" exercises "src_reg ==
    dst_reg, no need to restore src_reg after load" optimization

Signed-off-by: Dave Marchevsky
Signed-off-by: Daniel Borkmann
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/20221216214319.3408356-2-davemarchevsky@fb.com
---
 .../selftests/bpf/prog_tests/jit_probe_mem.c       | 28 ++++++++++
 tools/testing/selftests/bpf/progs/jit_probe_mem.c  | 61 ++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/jit_probe_mem.c
 create mode 100644 tools/testing/selftests/bpf/progs/jit_probe_mem.c

diff --git a/tools/testing/selftests/bpf/prog_tests/jit_probe_mem.c b/tools/testing/selftests/bpf/prog_tests/jit_probe_mem.c
new file mode 100644
index 000000000000..5639428607e6
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/jit_probe_mem.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include <network_helpers.h>
+
+#include "jit_probe_mem.skel.h"
+
+void test_jit_probe_mem(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts,
+		    .data_in = &pkt_v4,
+		    .data_size_in = sizeof(pkt_v4),
+		    .repeat = 1,
+	);
+	struct jit_probe_mem *skel;
+	int ret;
+
+	skel = jit_probe_mem__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "jit_probe_mem__open_and_load"))
+		return;
+
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_jit_probe_mem), &opts);
+	ASSERT_OK(ret, "jit_probe_mem ret");
+	ASSERT_OK(opts.retval, "jit_probe_mem opts.retval");
+	ASSERT_EQ(skel->data->total_sum, 192, "jit_probe_mem total_sum");
+
+	jit_probe_mem__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/jit_probe_mem.c b/tools/testing/selftests/bpf/progs/jit_probe_mem.c
new file mode 100644
index 000000000000..2d2e61470794
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/jit_probe_mem.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+
+static struct prog_test_ref_kfunc __kptr_ref *v;
+long total_sum = -1;
+
+extern struct prog_test_ref_kfunc *bpf_kfunc_call_test_acquire(unsigned long *sp) __ksym;
+extern void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym;
+
+SEC("tc")
+int test_jit_probe_mem(struct __sk_buff *ctx)
+{
+	struct prog_test_ref_kfunc *p;
+	unsigned long zero = 0, sum;
+
+	p = bpf_kfunc_call_test_acquire(&zero);
+	if (!p)
+		return 1;
+
+	p = bpf_kptr_xchg(&v, p);
+	if (p)
+		goto release_out;
+
+	/* Direct map value access of kptr, should be PTR_UNTRUSTED */
+	p = v;
+	if (!p)
+		return 1;
+
+	asm volatile (
+		"r9 = %[p];"
+		"%[sum] = 0;"
+
+		/* r8 = p->a */
+		"r8 = *(u32 *)(r9 + 0);"
+		"%[sum] += r8;"
+
+		/* r8 = p->b */
+		"r8 = *(u32 *)(r9 + 4);"
+		"%[sum] += r8;"
+
+		"r9 += 8;"
+		/* r9 = p->a */
+		"r9 = *(u32 *)(r9 - 8);"
+		"%[sum] += r9;"
+
+		: [sum] "=r"(sum)
+		: [p] "r"(p)
+		: "r8", "r9"
+	);
+
+	total_sum = sum;
+	return 0;
+release_out:
+	bpf_kfunc_call_test_release(p);
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
--
cgit v1.2.3


From 5fbf8c24b66d8d63fd6a452fc60e7e11d98ac5f6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Thu, 22 Dec 2022 15:30:58 +0100
Subject: selftests/bpf: Add jit probe_mem corner case tests to s390x denylist

BPF CI fails for s390x with the following result:

  [...]
  All error logs:

  libbpf: prog 'test_jit_probe_mem': BPF program load failed: ERROR: strerror_r(524)=22
  libbpf: prog 'test_jit_probe_mem': -- BEGIN PROG LOAD LOG --
  JIT does not support calling kernel function
  processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
  -- END PROG LOAD LOG --
  libbpf: prog 'test_jit_probe_mem': failed to load: -524
  libbpf: failed to load object 'jit_probe_mem'
  libbpf: failed to load BPF skeleton 'jit_probe_mem': -524
  test_jit_probe_mem:FAIL:jit_probe_mem__open_and_load unexpected error: -524
  #89      jit_probe_mem:FAIL
  [...]

Add the test to the deny list.

Fixes: 59fe41b5255f ("selftests/bpf: Add verifier test exercising jit PROBE_MEM logic")
Signed-off-by: Daniel Borkmann
---
 tools/testing/selftests/bpf/DENYLIST.s390x | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index 585fcf73c731..3efe091255bf 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -26,6 +26,7 @@ get_func_args_test	 # trampoline
 get_func_ip_test	 # get_func_ip_test__attach unexpected error: -524 (trampoline)
 get_stack_raw_tp	 # user_stack corrupted user stack (no backchain userspace)
 htab_update	 # failed to attach: ERROR: strerror_r(-524)=22 (trampoline)
+jit_probe_mem	 # jit_probe_mem__open_and_load unexpected error: -524 (kfunc)
 kfree_skb	 # attach fentry unexpected error: -524 (trampoline)
 kfunc_call	 # 'bpf_prog_active': not found in kernel BTF (?)
 kfunc_dynptr_param	 # JIT does not support calling kernel function (kfunc)
--
cgit v1.2.3


From cfca00767febba5f4f5e300fab10e0974491dd4b Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda
Date: Wed, 21 Dec 2022 20:55:29 +0100
Subject: bpf: Remove unused field initialization in bpf's ctl_table

Maxlen is used by standard proc_handlers such as proc_dointvec(), but in
this case we have our own proc_handler via bpf_stats_handler(). Therefore,
remove the initialization.
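For context, the usual shape of such a custom handler (a sketch of the
generic proc_handler signature; the real bpf_stats_handler() lives in
kernel/bpf/syscall.c):

	/* A custom proc_handler receives the raw user buffer and its length
	 * directly, so it never consults table->maxlen; only generic helpers
	 * like proc_dointvec() derive their element count from that field.
	 */
	int bpf_stats_handler(struct ctl_table *table, int write,
			      void *buffer, size_t *lenp, loff_t *ppos);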
Signed-off-by: Ricardo Ribalda
Signed-off-by: Daniel Borkmann
Acked-by: Stanislav Fomichev
Link: https://lore.kernel.org/bpf/20221221-bpf-syscall-v1-0-9550f5f2c3fc@chromium.org
---
 kernel/bpf/syscall.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35972afb6850..8e55456bd648 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5319,7 +5319,6 @@ static struct ctl_table bpf_syscall_table[] = {
 	{
 		.procname	= "bpf_stats_enabled",
 		.data		= &bpf_stats_enabled_key.key,
-		.maxlen		= sizeof(bpf_stats_enabled_key),
 		.mode		= 0644,
 		.proc_handler	= bpf_stats_handler,
 	},
--
cgit v1.2.3


From e8f55fcf77794c9867a5edbcb84baf21609465a7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 22 Dec 2022 21:49:15 -0800
Subject: bpf: teach refsafe() to take into account ID remapping

The states_equal() check performs ID mapping between old and new states
to establish a 1-to-1 correspondence between IDs, even if their absolute
numeric values across two equivalent states differ. This is important
both for correctness and to avoid unnecessary work when two states are
equivalent.

With recent changes we partially fixed this logic by maintaining the ID
map across all function frames. This patch also makes the refsafe() check
take into account (and maintain) the ID map, making states_equal()
behavior more optimal and correct.

Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/r/20221223054921.958283-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index faa358b3d5d7..ab8337f6a576 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13223,12 +13223,20 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 	return true;
 }

-static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
+static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
+		    struct bpf_id_pair *idmap)
 {
+	int i;
+
 	if (old->acquired_refs != cur->acquired_refs)
 		return false;
-	return !memcmp(old->refs, cur->refs,
-		       sizeof(*old->refs) * old->acquired_refs);
+
+	for (i = 0; i < old->acquired_refs; i++) {
+		if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap))
+			return false;
+	}
+
+	return true;
 }

 /* compare two verifier states
@@ -13270,7 +13278,7 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
 	if (!stacksafe(env, old, cur, env->idmap_scratch))
 		return false;

-	if (!refsafe(old, cur))
+	if (!refsafe(old, cur, env->idmap_scratch))
 		return false;

 	return true;
--
cgit v1.2.3


From a73bf9f2d969cbb04d5ca778f2a224060cda1027 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 22 Dec 2022 21:49:16 -0800
Subject: bpf: reorganize struct bpf_reg_state fields

Move id and ref_obj_id fields after the scalar data section (var_off and
ranges). This is necessary to simplify the next patch, which will change
regsafe()'s logic to be safer, as it makes the contents that have to be
an exact match (type-specific parts, off, type, and var_off+ranges) a
single sequential block of memory, while id and ref_obj_id should always
be remapped and thus can't be memcmp()'ed.

There are a few places that assume that var_off is after id/ref_obj_id to
clear out id/ref_obj_id with a single memset(0). These are changed to
explicitly zero out the id/ref_obj_id fields. Other places are adjusted
to preserve exact byte-by-byte comparison behavior.

No functional changes.
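Schematically, the resulting split looks like this (a simplified sketch,
not the real struct; the actual layout follows in the diff below):

	struct reg_state_sketch {
		u8 type;             /* start of the window regsafe() can  */
		s32 off;             /* compare byte-for-byte with memcmp() */
		struct tnum var_off; /* scalar knowledge, checked with      */
		s64 smin, smax;      /* tnum_in()/range_within()            */
		u32 id;              /* kept outside the window: remapped   */
		u32 ref_obj_id;      /* via check_ids(), never memcmp()'ed  */
	};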
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221223054921.958283-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 40 ++++++++++++++++++++-------------------- kernel/bpf/verifier.c | 17 ++++++++--------- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 53d175cbaa02..127058cfec47 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -92,6 +92,26 @@ struct bpf_reg_state { u32 subprogno; /* for PTR_TO_FUNC */ }; + /* For scalar types (SCALAR_VALUE), this represents our knowledge of + * the actual value. + * For pointer types, this represents the variable part of the offset + * from the pointed-to object, and is shared with all bpf_reg_states + * with the same id as us. + */ + struct tnum var_off; + /* Used to determine if any memory access using this register will + * result in a bad access. + * These refer to the same value as var_off, not necessarily the actual + * contents of the register. + */ + s64 smin_value; /* minimum possible (s64)value */ + s64 smax_value; /* maximum possible (s64)value */ + u64 umin_value; /* minimum possible (u64)value */ + u64 umax_value; /* maximum possible (u64)value */ + s32 s32_min_value; /* minimum possible (s32)value */ + s32 s32_max_value; /* maximum possible (s32)value */ + u32 u32_min_value; /* minimum possible (u32)value */ + u32 u32_max_value; /* maximum possible (u32)value */ /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we @@ -144,26 +164,6 @@ struct bpf_reg_state { * allowed and has the same effect as bpf_sk_release(sk). */ u32 ref_obj_id; - /* For scalar types (SCALAR_VALUE), this represents our knowledge of - * the actual value. - * For pointer types, this represents the variable part of the offset - * from the pointed-to object, and is shared with all bpf_reg_states - * with the same id as us. - */ - struct tnum var_off; - /* Used to determine if any memory access using this register will - * result in a bad access. - * These refer to the same value as var_off, not necessarily the actual - * contents of the register. 
- */ - s64 smin_value; /* minimum possible (s64)value */ - s64 smax_value; /* maximum possible (s64)value */ - u64 umin_value; /* minimum possible (u64)value */ - u64 umax_value; /* maximum possible (u64)value */ - s32 s32_min_value; /* minimum possible (s32)value */ - s32 s32_max_value; /* maximum possible (s32)value */ - u32 u32_min_value; /* minimum possible (u32)value */ - u32 u32_max_value; /* maximum possible (u32)value */ /* parentage chain for liveness checking */ struct bpf_reg_state *parent; /* Inside the callee two registers can be both PTR_TO_STACK like diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ab8337f6a576..e419e6024251 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1402,9 +1402,11 @@ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) */ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) { - /* Clear id, off, and union(map_ptr, range) */ + /* Clear off and union(map_ptr, range) */ memset(((u8 *)reg) + sizeof(reg->type), 0, offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); + reg->id = 0; + reg->ref_obj_id = 0; ___mark_reg_known(reg, imm); } @@ -1750,11 +1752,13 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { /* - * Clear type, id, off, and union(map_ptr, range) and + * Clear type, off, and union(map_ptr, range) and * padding between 'type' and union */ memset(reg, 0, offsetof(struct bpf_reg_state, var_off)); reg->type = SCALAR_VALUE; + reg->id = 0; + reg->ref_obj_id = 0; reg->var_off = tnum_unknown; reg->frameno = 0; reg->precise = !env->bpf_capable; @@ -13104,7 +13108,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, if (type_may_be_null(rold->type)) { if (!type_may_be_null(rcur->type)) return false; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) + if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off))) return false; /* Check our ids match any regs they're supposed to */ return check_ids(rold->id, rcur->id, idmap); @@ -13112,13 +13116,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. - * 'id' is not compared, since it's only used for maps with - * bpf_spin_lock inside map element and in such cases if - * the rest of the prog is valid for one map element then - * it's valid for all map elements regardless of the key - * used in bpf_map_lookup() */ - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off) && check_ids(rold->id, rcur->id, idmap); -- cgit v1.2.3 From 7f4ce97cd5edf723c7f2e32668481b6aa86c9ec6 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 22 Dec 2022 21:49:17 -0800 Subject: bpf: generalize MAYBE_NULL vs non-MAYBE_NULL rule Make generic check to prevent XXX_OR_NULL and XXX register types to be intermixed. While technically in some situations it could be safe, it's impossible to enforce due to the loss of an ID when converting XXX_OR_NULL to its non-NULL variant. So prevent this in general, not just for PTR_TO_MAP_KEY and PTR_TO_MAP_VALUE. PTR_TO_MAP_KEY_OR_NULL and PTR_TO_MAP_VALUE_OR_NULL checks, which were previously special-cased, are simplified to generic check that takes into account range_within() and tnum_in(). 
Relying on range_within() and tnum_in() here is correct, as the BPF verifier doesn't allow arithmetic on XXX_OR_NULL register types, so var_off and ranges should stay zero. But even if this restriction is lifted in the future, it's even more important to enforce that var_off and ranges are compatible; otherwise it's possible to construct a case where this can be exploited to bypass the verifier's memory range safety checks. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221223054921.958283-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e419e6024251..218a7ace4210 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13074,6 +13074,21 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return true; if (rcur->type == NOT_INIT) return false; + + /* Register types that are *not* MAYBE_NULL could technically be safe + * to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE is + * safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point to + * the same map). + * However, if the old MAYBE_NULL register then got NULL checked, + * doing so could have affected others with the same id, and we can't + * check for that because we lost the id when we converted to + * a non-MAYBE_NULL variant. + * So, as a general rule we don't allow mixing MAYBE_NULL and + * non-MAYBE_NULL registers. + */ + if (type_may_be_null(rold->type) != type_may_be_null(rcur->type)) + return false; + switch (base_type(rold->type)) { case SCALAR_VALUE: if (equal) return true; @@ -13098,22 +13113,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: - /* a PTR_TO_MAP_VALUE could be safe to use as a - * PTR_TO_MAP_VALUE_OR_NULL into the same map. - * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- - checked, doing so could have affected others with the same - * id, and we can't check for that because we lost the id when - * we converted to a PTR_TO_MAP_VALUE. - */ - if (type_may_be_null(rold->type)) { - if (!type_may_be_null(rcur->type)) - return false; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off))) - return false; - /* Check our ids match any regs they're supposed to */ - return check_ids(rold->id, rcur->id, idmap); - } - /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. */ -- cgit v1.2.3 From 910f69996674bfc4a273a335c1fb2ecb45062bf6 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 22 Dec 2022 21:49:18 -0800 Subject: bpf: reject non-exact register type matches in regsafe() Generalize the (somewhat implicit) rule of regsafe(), which states that if register types in old and current states do not match *exactly*, they can't be safely considered equivalent.
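For illustration (example register types, not taken from the patch), even a single modifier bit difference now fails the match:

	rold->type = PTR_TO_MAP_VALUE;
	rcur->type = PTR_TO_MAP_VALUE | MEM_RDONLY;
	/* rold->type != rcur->type, so regsafe() rejects the pair even
	 * though the base type is identical
	 */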
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221223054921.958283-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 218a7ace4210..5133d0a5b0cb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13075,18 +13075,28 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, if (rcur->type == NOT_INIT) return false; - /* Register types that are *not* MAYBE_NULL could technically be safe - * to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE is - * safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point to - * the same map). + /* Enforce that register types have to match exactly, including their + * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general + * rule. + * + * One can make a point that using a pointer register as unbounded + * SCALAR would be technically acceptable, but this could lead to + * pointer leaks because scalars are allowed to leak while pointers + * are not. We could make this safe in special cases if root is + * calling us, but it's probably not worth the hassle. + * + * Also, register types that are *not* MAYBE_NULL could technically be + * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE + * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point + * to the same map). * However, if the old MAYBE_NULL register then got NULL checked, * doing so could have affected others with the same id, and we can't * check for that because we lost the id when we converted to * a non-MAYBE_NULL variant. * So, as a general rule we don't allow mixing MAYBE_NULL and - * non-MAYBE_NULL registers. + * non-MAYBE_NULL registers as well. */ - if (type_may_be_null(rold->type) != type_may_be_null(rcur->type)) + if (rold->type != rcur->type) return false; switch (base_type(rold->type)) { @@ -13095,22 +13105,11 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return true; if (env->explore_alu_limits) return false; - if (rcur->type == SCALAR_VALUE) { - if (!rold->precise) - return true; - /* new val must satisfy old val knowledge */ - return range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); - } else { - /* We're trying to use a pointer in place of a scalar. - * Even if the scalar was unbounded, this could lead to - * pointer leaks because scalars are allowed to leak - * while pointers are not. We could make this safe in - * special cases if root is calling us, but it's - * probably not worth the hassle. - */ - return false; - } + if (!rold->precise) + return true; + /* new val must satisfy old val knowledge */ + return range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and @@ -13122,8 +13121,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, check_ids(rold->id, rcur->id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: - if (rcur->type != rold->type) - return false; /* We must have at least as much range as the old ptr * did, so that any accesses which were safe before are * still safe. 
This is true even if old range < old off, -- cgit v1.2.3 From 4a95c85c994801c9ae12d9cb7216da7b484564b3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 22 Dec 2022 21:49:19 -0800 Subject: bpf: perform byte-by-byte comparison only when necessary in regsafe() Extract byte-by-byte comparison of bpf_reg_state in regsafe() into a helper function, which makes it more convenient to use it "on demand" only for registers that benefit from such checks, instead of doing it all the time, even if the result of such a comparison is ignored. Also, remove WARN_ON_ONCE(1)+return false dead code. There is no risk of missing a case, as the compiler will warn about a non-void function not returning a value in some branches (assuming the default case is removed in the future). Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221223054921.958283-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5133d0a5b0cb..6431b994b3f6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13057,18 +13057,19 @@ next: } } +static bool regs_exact(const struct bpf_reg_state *rold, + const struct bpf_reg_state *rcur) +{ + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; +} + /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct bpf_id_pair *idmap) { - bool equal; - if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ return true; - - equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; - if (rold->type == NOT_INIT) /* explored state can't have used this */ return true; @@ -13101,7 +13102,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, switch (base_type(rold->type)) { case SCALAR_VALUE: - if (equal) + if (regs_exact(rold, rcur)) return true; if (env->explore_alu_limits) return false; @@ -13144,15 +13145,11 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, /* two stack pointers are equal only if they're pointing to * the same stack frame, since fp-8 in foo != fp-8 in bar */ - return equal && rold->frameno == rcur->frameno; + return regs_exact(rold, rcur) && rold->frameno == rcur->frameno; default: /* Only valid matches are exact, which memcmp() */ - return equal; + return regs_exact(rold, rcur); } - - /* Shouldn't get here; if we do, say it's not safe */ - WARN_ON_ONCE(1); - return false; } static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, -- cgit v1.2.3 From 4633a00682589931e8415c166979d8e5dd174282 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 22 Dec 2022 21:49:20 -0800 Subject: bpf: fix regs_exact() logic in regsafe() to remap IDs correctly Comparing IDs exactly between two separate states is not just suboptimal, but also incorrect in some cases. So update the regs_exact() check to do byte-by-byte memcmp() only up to id/ref_obj_id. For id and ref_obj_id, perform proper check_ids() checks, taking into account idmap.
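The intended semantics can be sketched with a hypothetical call sequence (illustrative only, not part of the patch): IDs may differ numerically between the old and current states, as long as the idmap pairing stays consistent:

	struct bpf_id_pair idmap[BPF_ID_MAP_SIZE] = {};

	check_ids(1, 7, idmap);	/* true: records the 1 -> 7 pairing */
	check_ids(1, 7, idmap);	/* true: matches the recorded pairing */
	check_ids(1, 8, idmap);	/* false: old id 1 is already paired with 7 */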
This change makes more states equivalent improving insns and states stats across a bunch of selftest BPF programs: File Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) ------------------------------------------- -------------------------------- --------- --------- -------------- ---------- ---------- ------------- cgrp_kfunc_success.bpf.linked1.o test_cgrp_get_release 141 137 -4 (-2.84%) 13 13 +0 (+0.00%) cgrp_kfunc_success.bpf.linked1.o test_cgrp_xchg_release 142 139 -3 (-2.11%) 14 13 -1 (-7.14%) connect6_prog.bpf.linked1.o connect_v6_prog 139 102 -37 (-26.62%) 9 6 -3 (-33.33%) ima.bpf.linked1.o bprm_creds_for_exec 68 61 -7 (-10.29%) 6 5 -1 (-16.67%) linked_list.bpf.linked1.o global_list_in_list 569 499 -70 (-12.30%) 60 52 -8 (-13.33%) linked_list.bpf.linked1.o global_list_push_pop 167 150 -17 (-10.18%) 18 16 -2 (-11.11%) linked_list.bpf.linked1.o global_list_push_pop_multiple 881 815 -66 (-7.49%) 74 63 -11 (-14.86%) linked_list.bpf.linked1.o inner_map_list_in_list 579 534 -45 (-7.77%) 61 55 -6 (-9.84%) linked_list.bpf.linked1.o inner_map_list_push_pop 190 181 -9 (-4.74%) 19 18 -1 (-5.26%) linked_list.bpf.linked1.o inner_map_list_push_pop_multiple 916 850 -66 (-7.21%) 75 64 -11 (-14.67%) linked_list.bpf.linked1.o map_list_in_list 588 525 -63 (-10.71%) 62 55 -7 (-11.29%) linked_list.bpf.linked1.o map_list_push_pop 183 174 -9 (-4.92%) 18 17 -1 (-5.56%) linked_list.bpf.linked1.o map_list_push_pop_multiple 909 843 -66 (-7.26%) 75 64 -11 (-14.67%) map_kptr.bpf.linked1.o test_map_kptr 264 256 -8 (-3.03%) 26 26 +0 (+0.00%) map_kptr.bpf.linked1.o test_map_kptr_ref 95 91 -4 (-4.21%) 9 8 -1 (-11.11%) task_kfunc_success.bpf.linked1.o test_task_xchg_release 139 136 -3 (-2.16%) 14 13 -1 (-7.14%) test_bpf_nf.bpf.linked1.o nf_skb_ct_test 815 509 -306 (-37.55%) 57 30 -27 (-47.37%) test_bpf_nf.bpf.linked1.o nf_xdp_ct_test 815 509 -306 (-37.55%) 57 30 -27 (-47.37%) test_cls_redirect.bpf.linked1.o cls_redirect 78925 78390 -535 (-0.68%) 4782 4704 -78 (-1.63%) test_cls_redirect_subprogs.bpf.linked1.o cls_redirect 64901 63897 -1004 (-1.55%) 4612 4470 -142 (-3.08%) test_sk_lookup.bpf.linked1.o access_ctx_sk 181 95 -86 (-47.51%) 19 10 -9 (-47.37%) test_sk_lookup.bpf.linked1.o ctx_narrow_access 447 437 -10 (-2.24%) 38 37 -1 (-2.63%) test_sk_lookup_kern.bpf.linked1.o sk_lookup_success 148 133 -15 (-10.14%) 14 12 -2 (-14.29%) test_tcp_check_syncookie_kern.bpf.linked1.o check_syncookie_clsact 304 300 -4 (-1.32%) 23 22 -1 (-4.35%) test_tcp_check_syncookie_kern.bpf.linked1.o check_syncookie_xdp 304 300 -4 (-1.32%) 23 22 -1 (-4.35%) test_verify_pkcs7_sig.bpf.linked1.o bpf 87 76 -11 (-12.64%) 7 6 -1 (-14.29%) ------------------------------------------- -------------------------------- --------- --------- -------------- ---------- ---------- ------------- Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221223054921.958283-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6431b994b3f6..b23812d2bb49 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12946,6 +12946,13 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) { unsigned int i; + /* either both IDs should be set or both should be zero */ + if (!!old_id != !!cur_id) + return false; + + if (old_id == 0) /* cur_id == 0 as well */ + return true; + for (i = 0; i < BPF_ID_MAP_SIZE; i++) { if (!idmap[i].old) { /* 
Reached an empty slot; haven't seen this id before */ @@ -13058,9 +13065,12 @@ next: } } static bool regs_exact(const struct bpf_reg_state *rold, - const struct bpf_reg_state *rcur) + const struct bpf_reg_state *rcur, + struct bpf_id_pair *idmap) { - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && + check_ids(rold->id, rcur->id, idmap) && + check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); } /* Returns true if (rold safe implies rcur safe) */ @@ -13102,7 +13112,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, switch (base_type(rold->type)) { case SCALAR_VALUE: - if (regs_exact(rold, rcur)) + if (regs_exact(rold, rcur, idmap)) return true; if (env->explore_alu_limits) return false; @@ -13136,7 +13146,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, if (rold->off != rcur->off) return false; /* id relations must be preserved */ - if (rold->id && !check_ids(rold->id, rcur->id, idmap)) + if (!check_ids(rold->id, rcur->id, idmap)) return false; /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && @@ -13145,10 +13155,9 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, /* two stack pointers are equal only if they're pointing to * the same stack frame, since fp-8 in foo != fp-8 in bar */ - return regs_exact(rold, rcur) && rold->frameno == rcur->frameno; + return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; default: - /* Only valid matches are exact, which memcmp() */ - return regs_exact(rold, rcur); + return regs_exact(rold, rcur, idmap); } } -- cgit v1.2.3 From 07453245620c075779abefa2a9f469fa336e7510 Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Fri, 23 Dec 2022 21:36:18 +0800 Subject: libbpf: fix errno being overwritten after fd is closed. In the ensure_good_fd function, if the fcntl call succeeds but the close call fails, ensure_good_fd returns a valid fd while errno is left set by the failed close, which may mislead users. The close failure is not a serious problem, and the correct FD has been handed over to the upper-layer application. Let's restore errno here. Signed-off-by: Xin Liu Link: https://lore.kernel.org/r/20221223133618.10323-1-liuxin350@huawei.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf_internal.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 377642ff51fc..98333a6c38e9 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -543,10 +543,9 @@ static inline int ensure_good_fd(int fd) fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); saved_errno = errno; close(old_fd); - if (fd < 0) { + errno = saved_errno; + if (fd < 0) pr_warn("failed to dup FD %d to FD > 2: %d\n", old_fd, -saved_errno); - errno = saved_errno; - } } return fd; } -- cgit v1.2.3 From 30465003ad776a922c32b2dac58db14f120f037e Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 17 Dec 2022 00:24:57 -0800 Subject: bpf: rename list_head -> graph_root in field info types Many of the structs recently added to track field info for linked-list head are useful as-is for rbtree root.
So let's do a mechanical renaming of list_head-related types and fields: include/linux/bpf.h: struct btf_field_list_head -> struct btf_field_graph_root list_head -> graph_root in struct btf_field union kernel/bpf/btf.c: list_head -> graph_root in struct btf_field_info This is a nonfunctional change; functionality to actually use these fields for rbtree will be added in further patches. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20221217082506.1570898-5-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- kernel/bpf/btf.c | 21 +++++++++++---------- kernel/bpf/helpers.c | 4 ++-- kernel/bpf/verifier.c | 21 +++++++++++---------- 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5fec2d1be6d7..1697bd87fc06 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -189,7 +189,7 @@ struct btf_field_kptr { u32 btf_id; }; -struct btf_field_list_head { +struct btf_field_graph_root { struct btf *btf; u32 value_btf_id; u32 node_offset; @@ -201,7 +201,7 @@ struct btf_field { enum btf_field_type type; union { struct btf_field_kptr kptr; - struct btf_field_list_head list_head; + struct btf_field_graph_root graph_root; }; }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f7dd8af06413..578cee398550 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3228,7 +3228,7 @@ struct btf_field_info { struct { const char *node_name; u32 value_btf_id; - } list_head; + } graph_root; }; }; @@ -3335,8 +3335,8 @@ static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt, return -EINVAL; info->type = BPF_LIST_HEAD; info->off = off; - info->list_head.value_btf_id = id; - info->list_head.node_name = list_node; + info->graph_root.value_btf_id = id; + info->graph_root.node_name = list_node; return BTF_FIELD_FOUND; } @@ -3604,13 +3604,14 @@ static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, u32 offset; int i; - t = btf_type_by_id(btf, info->list_head.value_btf_id); + t = btf_type_by_id(btf, info->graph_root.value_btf_id); /* We've already checked that value_btf_id is a struct type. We * just need to figure out the offset of the list_node, and * verify its type.
*/ for_each_member(i, t, member) { - if (strcmp(info->list_head.node_name, __btf_name_by_offset(btf, member->name_off))) + if (strcmp(info->graph_root.node_name, + __btf_name_by_offset(btf, member->name_off))) continue; /* Invalid BTF, two members with same name */ if (n) @@ -3627,9 +3628,9 @@ static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, if (offset % __alignof__(struct bpf_list_node)) return -EINVAL; - field->list_head.btf = (struct btf *)btf; - field->list_head.value_btf_id = info->list_head.value_btf_id; - field->list_head.node_offset = offset; + field->graph_root.btf = (struct btf *)btf; + field->graph_root.value_btf_id = info->graph_root.value_btf_id; + field->graph_root.node_offset = offset; } if (!n) return -ENOENT; @@ -3736,11 +3737,11 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) if (!(rec->fields[i].type & BPF_LIST_HEAD)) continue; - btf_id = rec->fields[i].list_head.value_btf_id; + btf_id = rec->fields[i].graph_root.value_btf_id; meta = btf_find_struct_meta(btf, btf_id); if (!meta) return -EFAULT; - rec->fields[i].list_head.value_rec = meta->record; + rec->fields[i].graph_root.value_rec = meta->record; if (!(rec->field_mask & BPF_LIST_NODE)) continue; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 23aa8cf8fd1a..458db2db2f81 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1756,12 +1756,12 @@ unlock: while (head != orig_head) { void *obj = head; - obj -= field->list_head.node_offset; + obj -= field->graph_root.node_offset; head = head->next; /* The contained type can also have resources, including a * bpf_list_head which needs to be freed. */ - bpf_obj_free_fields(field->list_head.value_rec, obj); + bpf_obj_free_fields(field->graph_root.value_rec, obj); /* bpf_mem_free requires migrate_disable(), since we can be * called from map free path as well apart from BPF program (as * part of map ops doing bpf_obj_free_fields). 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b23812d2bb49..4a25375ebb0d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8776,21 +8776,22 @@ static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, field = meta->arg_list_head.field; - et = btf_type_by_id(field->list_head.btf, field->list_head.value_btf_id); + et = btf_type_by_id(field->graph_root.btf, field->graph_root.value_btf_id); t = btf_type_by_id(reg->btf, reg->btf_id); - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->list_head.btf, - field->list_head.value_btf_id, true)) { + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->graph_root.btf, + field->graph_root.value_btf_id, true)) { verbose(env, "operation on bpf_list_head expects arg#1 bpf_list_node at offset=%d " "in struct %s, but arg is at offset=%d in struct %s\n", - field->list_head.node_offset, btf_name_by_offset(field->list_head.btf, et->name_off), + field->graph_root.node_offset, + btf_name_by_offset(field->graph_root.btf, et->name_off), list_node_off, btf_name_by_offset(reg->btf, t->name_off)); return -EINVAL; } - if (list_node_off != field->list_head.node_offset) { + if (list_node_off != field->graph_root.node_offset) { verbose(env, "arg#1 offset=%d, but expected bpf_list_node at offset=%d in struct %s\n", - list_node_off, field->list_head.node_offset, - btf_name_by_offset(field->list_head.btf, et->name_off)); + list_node_off, field->graph_root.node_offset, + btf_name_by_offset(field->graph_root.btf, et->name_off)); return -EINVAL; } /* Set arg#1 for expiration after unlock */ @@ -9232,9 +9233,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = field->list_head.btf; - regs[BPF_REG_0].btf_id = field->list_head.value_btf_id; - regs[BPF_REG_0].off = field->list_head.node_offset; + regs[BPF_REG_0].btf = field->graph_root.btf; + regs[BPF_REG_0].btf_id = field->graph_root.value_btf_id; + regs[BPF_REG_0].off = field->graph_root.node_offset; } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; -- cgit v1.2.3 From 1d0c5f6f3d1387ec9c3a379fb232c078f5838d55 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sat, 24 Dec 2022 16:15:22 +0900 Subject: samples/bpf: Use ksyscall instead of kprobe in syscall tracing program Syscall tracing using kprobe is quite unstable. Since it uses the exact name of the kernel function, the program might break if a function is renamed. The problem can also be caused by changes in the arguments of the function to which the kprobe attaches. In this commit, ksyscall is used instead of kprobe. By using ksyscall, libbpf will detect the appropriate kernel function name (e.g. sys_write -> __s390_sys_write). This eliminates the need to worry about which wrapper function to attach to in order to parse arguments. In addition, since ksyscall provides a finer-grained method of attaching to system calls, the coarse SYSCALL helper in trace_common.h can be removed. Signed-off-by: Daniel T.
Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221224071527.2292-2-danieltimlee@gmail.com --- samples/bpf/map_perf_test_kern.c | 17 ++++++++--------- samples/bpf/test_current_task_under_cgroup_kern.c | 3 +-- samples/bpf/test_map_in_map_kern.c | 1 - samples/bpf/test_probe_write_user_kern.c | 3 +-- samples/bpf/trace_common.h | 13 ------------- samples/bpf/trace_output_kern.c | 3 +-- samples/bpf/tracex2_kern.c | 3 +-- 7 files changed, 12 insertions(+), 31 deletions(-) delete mode 100644 samples/bpf/trace_common.h diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index 7342c5b2f278..874e2f7e3d5d 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -11,7 +11,6 @@ #include #include #include -#include "trace_common.h" #define MAX_ENTRIES 1000 #define MAX_NR_CPUS 1024 @@ -102,7 +101,7 @@ struct { __uint(max_entries, MAX_ENTRIES); } lru_hash_lookup_map SEC(".maps"); -SEC("kprobe/" SYSCALL(sys_getuid)) +SEC("ksyscall/getuid") int stress_hmap(struct pt_regs *ctx) { u32 key = bpf_get_current_pid_tgid(); @@ -120,7 +119,7 @@ int stress_hmap(struct pt_regs *ctx) return 0; } -SEC("kprobe/" SYSCALL(sys_geteuid)) +SEC("ksyscall/geteuid") int stress_percpu_hmap(struct pt_regs *ctx) { u32 key = bpf_get_current_pid_tgid(); @@ -137,7 +136,7 @@ int stress_percpu_hmap(struct pt_regs *ctx) return 0; } -SEC("kprobe/" SYSCALL(sys_getgid)) +SEC("ksyscall/getgid") int stress_hmap_alloc(struct pt_regs *ctx) { u32 key = bpf_get_current_pid_tgid(); @@ -154,7 +153,7 @@ int stress_hmap_alloc(struct pt_regs *ctx) return 0; } -SEC("kprobe/" SYSCALL(sys_getegid)) +SEC("ksyscall/getegid") int stress_percpu_hmap_alloc(struct pt_regs *ctx) { u32 key = bpf_get_current_pid_tgid(); @@ -171,7 +170,7 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx) return 0; } -SEC("kprobe/" SYSCALL(sys_connect)) +SEC("ksyscall/connect") int stress_lru_hmap_alloc(struct pt_regs *ctx) { struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx); @@ -251,7 +250,7 @@ done: return 0; } -SEC("kprobe/" SYSCALL(sys_gettid)) +SEC("ksyscall/gettid") int stress_lpm_trie_map_alloc(struct pt_regs *ctx) { union { @@ -273,7 +272,7 @@ int stress_lpm_trie_map_alloc(struct pt_regs *ctx) return 0; } -SEC("kprobe/" SYSCALL(sys_getpgid)) +SEC("ksyscall/getpgid") int stress_hash_map_lookup(struct pt_regs *ctx) { u32 key = 1, i; @@ -286,7 +285,7 @@ int stress_hash_map_lookup(struct pt_regs *ctx) return 0; } -SEC("kprobe/" SYSCALL(sys_getppid)) +SEC("ksyscall/getppid") int stress_array_map_lookup(struct pt_regs *ctx) { u32 key = 1, i; diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c index fbd43e2bb4d3..541fc861b984 100644 --- a/samples/bpf/test_current_task_under_cgroup_kern.c +++ b/samples/bpf/test_current_task_under_cgroup_kern.c @@ -10,7 +10,6 @@ #include #include #include -#include "trace_common.h" struct { __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); @@ -27,7 +26,7 @@ struct { } perf_map SEC(".maps"); /* Writes the last PID that called sync to a map at index 0 */ -SEC("kprobe/" SYSCALL(sys_sync)) +SEC("ksyscall/sync") int bpf_prog1(struct pt_regs *ctx) { u64 pid = bpf_get_current_pid_tgid(); diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c index b0200c8eac09..0e17f9ade5c5 100644 --- a/samples/bpf/test_map_in_map_kern.c +++ b/samples/bpf/test_map_in_map_kern.c @@ -13,7 +13,6 @@ #include #include #include -#include "trace_common.h" #define MAX_NR_PORTS 65536 diff --git 
a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c index 220a96438d75..d60cabaaf753 100644 --- a/samples/bpf/test_probe_write_user_kern.c +++ b/samples/bpf/test_probe_write_user_kern.c @@ -11,7 +11,6 @@ #include #include #include -#include "trace_common.h" struct { __uint(type, BPF_MAP_TYPE_HASH); @@ -28,7 +27,7 @@ struct { * This example sits on a syscall, and the syscall ABI is relatively stable * of course, across platforms, and over time, the ABI may change. */ -SEC("kprobe/" SYSCALL(sys_connect)) +SEC("ksyscall/connect") int bpf_prog1(struct pt_regs *ctx) { struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx); diff --git a/samples/bpf/trace_common.h b/samples/bpf/trace_common.h deleted file mode 100644 index 8cb5400aed1f..000000000000 --- a/samples/bpf/trace_common.h +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#ifndef __TRACE_COMMON_H -#define __TRACE_COMMON_H - -#ifdef __x86_64__ -#define SYSCALL(SYS) "__x64_" __stringify(SYS) -#elif defined(__s390x__) -#define SYSCALL(SYS) "__s390x_" __stringify(SYS) -#else -#define SYSCALL(SYS) __stringify(SYS) -#endif - -#endif diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c index b64815af0943..a481abf8c4c5 100644 --- a/samples/bpf/trace_output_kern.c +++ b/samples/bpf/trace_output_kern.c @@ -2,7 +2,6 @@ #include #include #include -#include "trace_common.h" struct { __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); @@ -11,7 +10,7 @@ struct { __uint(max_entries, 2); } my_map SEC(".maps"); -SEC("kprobe/" SYSCALL(sys_write)) +SEC("ksyscall/write") int bpf_prog1(struct pt_regs *ctx) { struct S { diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c index 93e0b7680b4f..82091facb83c 100644 --- a/samples/bpf/tracex2_kern.c +++ b/samples/bpf/tracex2_kern.c @@ -10,7 +10,6 @@ #include #include #include -#include "trace_common.h" struct { __uint(type, BPF_MAP_TYPE_HASH); @@ -78,7 +77,7 @@ struct { __uint(max_entries, 1024); } my_hist_map SEC(".maps"); -SEC("kprobe/" SYSCALL(sys_write)) +SEC("ksyscall/write") int bpf_prog3(struct pt_regs *ctx) { long write_size = PT_REGS_PARM3(ctx); -- cgit v1.2.3 From 8a4dd0bcbdfd5bdaa5d1a5b390f44a45b60e8aa9 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sat, 24 Dec 2022 16:15:23 +0900 Subject: samples/bpf: Use vmlinux.h instead of implicit headers in syscall tracing program This commit applies vmlinux.h to the syscall tracing programs. This change allows the bpf programs to refer to kernel-internal structures through a single "vmlinux.h" instead of including each header they reference. Signed-off-by: Daniel T. Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221224071527.2292-3-danieltimlee@gmail.com --- samples/bpf/map_perf_test_kern.c | 5 ++--- samples/bpf/test_current_task_under_cgroup_kern.c | 4 +--- samples/bpf/test_probe_write_user_kern.c | 5 ++--- samples/bpf/trace_output_kern.c | 3 +-- samples/bpf/tracex2_kern.c | 4 +--- 5 files changed, 7 insertions(+), 14 deletions(-) diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index 874e2f7e3d5d..0c7885057ffe 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -4,10 +4,9 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation.
*/ -#include -#include +#include "vmlinux.h" +#include #include -#include #include #include #include diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c index 541fc861b984..0b059cee3cba 100644 --- a/samples/bpf/test_current_task_under_cgroup_kern.c +++ b/samples/bpf/test_current_task_under_cgroup_kern.c @@ -5,11 +5,9 @@ * License as published by the Free Software Foundation. */ -#include -#include +#include "vmlinux.h" #include #include -#include struct { __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c index d60cabaaf753..a0f10c5ca273 100644 --- a/samples/bpf/test_probe_write_user_kern.c +++ b/samples/bpf/test_probe_write_user_kern.c @@ -4,9 +4,8 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ -#include -#include -#include +#include "vmlinux.h" +#include #include #include #include diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c index a481abf8c4c5..565a73b51b04 100644 --- a/samples/bpf/trace_output_kern.c +++ b/samples/bpf/trace_output_kern.c @@ -1,6 +1,5 @@ -#include +#include "vmlinux.h" #include -#include #include struct { diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c index 82091facb83c..a712eefc742e 100644 --- a/samples/bpf/tracex2_kern.c +++ b/samples/bpf/tracex2_kern.c @@ -4,10 +4,8 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ -#include -#include +#include "vmlinux.h" #include -#include #include #include -- cgit v1.2.3 From d4fffba4d04b8d605ff07f1ed987399f6af0ad5b Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sat, 24 Dec 2022 16:15:24 +0900 Subject: samples/bpf: Change _kern suffix to .bpf with syscall tracing program Currently, the old compile rule (CLANG-bpf) doesn't contain the VMLINUX_H define flag, which is essential for bpf programs that include "vmlinux.h". Also, the old compile rule doesn't directly specify the compile target as bpf; instead it uses a bunch of extra options with clang followed by a long chain of commands (e.g. clang | opt | llvm-dis | llc). In the Makefile, there is already a new compile rule which is simpler and neater, and it also has the -D__VMLINUX_H__ option. Just changing the _kern suffix to .bpf inherits the benefit of the new CLANG-BPF compile target. Also, this commit adds a dummy gnu/stubs.h to the samples/bpf directory. As commit 1c2dd16add7e ("selftests/bpf: get rid of -D__x86_64__") noted, compiling with 'clang -target bpf' will raise an error with stubs.h unless a workaround (-D__x86_64__) is used. This commit solves this problem by adding a dummy stubs.h to make /usr/include/features.h follow the expected path, the same way selftests/bpf dealt with it. Signed-off-by: Daniel T.
Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221224071527.2292-4-danieltimlee@gmail.com --- samples/bpf/Makefile | 10 +- samples/bpf/gnu/stubs.h | 1 + samples/bpf/map_perf_test.bpf.c | 301 ++++++++++++++++++++++ samples/bpf/map_perf_test_kern.c | 301 ---------------------- samples/bpf/map_perf_test_user.c | 2 +- samples/bpf/test_current_task_under_cgroup.bpf.c | 41 +++ samples/bpf/test_current_task_under_cgroup_kern.c | 41 --- samples/bpf/test_current_task_under_cgroup_user.c | 2 +- samples/bpf/test_probe_write_user.bpf.c | 54 ++++ samples/bpf/test_probe_write_user_kern.c | 54 ---- samples/bpf/test_probe_write_user_user.c | 2 +- samples/bpf/trace_output.bpf.c | 29 +++ samples/bpf/trace_output_kern.c | 29 --- samples/bpf/trace_output_user.c | 2 +- samples/bpf/tracex2.bpf.c | 99 +++++++ samples/bpf/tracex2_kern.c | 99 ------- samples/bpf/tracex2_user.c | 2 +- 17 files changed, 535 insertions(+), 534 deletions(-) create mode 100644 samples/bpf/gnu/stubs.h create mode 100644 samples/bpf/map_perf_test.bpf.c delete mode 100644 samples/bpf/map_perf_test_kern.c create mode 100644 samples/bpf/test_current_task_under_cgroup.bpf.c delete mode 100644 samples/bpf/test_current_task_under_cgroup_kern.c create mode 100644 samples/bpf/test_probe_write_user.bpf.c delete mode 100644 samples/bpf/test_probe_write_user_kern.c create mode 100644 samples/bpf/trace_output.bpf.c delete mode 100644 samples/bpf/trace_output_kern.c create mode 100644 samples/bpf/tracex2.bpf.c delete mode 100644 samples/bpf/tracex2_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 727da3c5879b..22039a0a5b35 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -125,21 +125,21 @@ always-y += sockex1_kern.o always-y += sockex2_kern.o always-y += sockex3_kern.o always-y += tracex1_kern.o -always-y += tracex2_kern.o +always-y += tracex2.bpf.o always-y += tracex3_kern.o always-y += tracex4_kern.o always-y += tracex5_kern.o always-y += tracex6_kern.o always-y += tracex7_kern.o always-y += sock_flags_kern.o -always-y += test_probe_write_user_kern.o -always-y += trace_output_kern.o +always-y += test_probe_write_user.bpf.o +always-y += trace_output.bpf.o always-y += tcbpf1_kern.o always-y += tc_l2_redirect_kern.o always-y += lathist_kern.o always-y += offwaketime_kern.o always-y += spintest_kern.o -always-y += map_perf_test_kern.o +always-y += map_perf_test.bpf.o always-y += test_overhead_tp_kern.o always-y += test_overhead_raw_tp_kern.o always-y += test_overhead_kprobe_kern.o @@ -147,7 +147,7 @@ always-y += parse_varlen.o parse_simple.o parse_ldabs.o always-y += test_cgrp2_tc_kern.o always-y += xdp1_kern.o always-y += xdp2_kern.o -always-y += test_current_task_under_cgroup_kern.o +always-y += test_current_task_under_cgroup.bpf.o always-y += trace_event_kern.o always-y += sampleip_kern.o always-y += lwt_len_hist_kern.o diff --git a/samples/bpf/gnu/stubs.h b/samples/bpf/gnu/stubs.h new file mode 100644 index 000000000000..719225b16626 --- /dev/null +++ b/samples/bpf/gnu/stubs.h @@ -0,0 +1 @@ +/* dummy .h to trick /usr/include/features.h to work with 'clang -target bpf' */ diff --git a/samples/bpf/map_perf_test.bpf.c b/samples/bpf/map_perf_test.bpf.c new file mode 100644 index 000000000000..0c7885057ffe --- /dev/null +++ b/samples/bpf/map_perf_test.bpf.c @@ -0,0 +1,301 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software 
Foundation. + */ +#include "vmlinux.h" +#include +#include +#include +#include +#include + +#define MAX_ENTRIES 1000 +#define MAX_NR_CPUS 1024 + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); +} hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, 10000); +} lru_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, 10000); + __uint(map_flags, BPF_F_NO_COMMON_LRU); +} nocommon_lru_hash_map SEC(".maps"); + +struct inner_lru { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NUMA_NODE); + __uint(numa_node, 0); +} inner_lru_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, MAX_NR_CPUS); + __uint(key_size, sizeof(u32)); + __array(values, struct inner_lru); /* use inner_lru as inner map */ +} array_of_lru_hashs SEC(".maps") = { + /* statically initialize the first element */ + .values = { &inner_lru_hash_map }, +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(long)); + __uint(max_entries, MAX_ENTRIES); +} percpu_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NO_PREALLOC); +} hash_map_alloc SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(long)); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NO_PREALLOC); +} percpu_hash_map_alloc SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(key_size, 8); + __uint(value_size, sizeof(long)); + __uint(max_entries, 10000); + __uint(map_flags, BPF_F_NO_PREALLOC); +} lpm_trie_map_alloc SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); +} array_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); +} lru_hash_lookup_map SEC(".maps"); + +SEC("ksyscall/getuid") +int stress_hmap(struct pt_regs *ctx) +{ + u32 key = bpf_get_current_pid_tgid(); + long init_val = 1; + long *value; + int i; + + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&hash_map, &key); + if (value) + bpf_map_delete_elem(&hash_map, &key); + } + + return 0; +} + +SEC("ksyscall/geteuid") +int stress_percpu_hmap(struct pt_regs *ctx) +{ + u32 key = bpf_get_current_pid_tgid(); + long init_val = 1; + long *value; + int i; + + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&percpu_hash_map, &key); + if (value) + bpf_map_delete_elem(&percpu_hash_map, &key); + } + return 0; +} + +SEC("ksyscall/getgid") +int stress_hmap_alloc(struct pt_regs *ctx) +{ + u32 key = bpf_get_current_pid_tgid(); + long init_val = 1; + long *value; + int i; + + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&hash_map_alloc, &key); + if (value) + bpf_map_delete_elem(&hash_map_alloc, &key); + } + return 0; +} + +SEC("ksyscall/getegid") 
+int stress_percpu_hmap_alloc(struct pt_regs *ctx) +{ + u32 key = bpf_get_current_pid_tgid(); + long init_val = 1; + long *value; + int i; + + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key); + if (value) + bpf_map_delete_elem(&percpu_hash_map_alloc, &key); + } + return 0; +} + +SEC("ksyscall/connect") +int stress_lru_hmap_alloc(struct pt_regs *ctx) +{ + struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx); + char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%dn"; + union { + u16 dst6[8]; + struct { + u16 magic0; + u16 magic1; + u16 tcase; + u16 unused16; + u32 unused32; + u32 key; + }; + } test_params; + struct sockaddr_in6 *in6; + u16 test_case; + int addrlen, ret; + long val = 1; + u32 key = 0; + + in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(real_regs); + addrlen = (int)PT_REGS_PARM3_CORE(real_regs); + + if (addrlen != sizeof(*in6)) + return 0; + + ret = bpf_probe_read_user(test_params.dst6, sizeof(test_params.dst6), + &in6->sin6_addr); + if (ret) + goto done; + + if (test_params.magic0 != 0xdead || + test_params.magic1 != 0xbeef) + return 0; + + test_case = test_params.tcase; + if (test_case != 3) + key = bpf_get_prandom_u32(); + + if (test_case == 0) { + ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY); + } else if (test_case == 1) { + ret = bpf_map_update_elem(&nocommon_lru_hash_map, &key, &val, + BPF_ANY); + } else if (test_case == 2) { + void *nolocal_lru_map; + int cpu = bpf_get_smp_processor_id(); + + nolocal_lru_map = bpf_map_lookup_elem(&array_of_lru_hashs, + &cpu); + if (!nolocal_lru_map) { + ret = -ENOENT; + goto done; + } + + ret = bpf_map_update_elem(nolocal_lru_map, &key, &val, + BPF_ANY); + } else if (test_case == 3) { + u32 i; + + key = test_params.key; + +#pragma clang loop unroll(full) + for (i = 0; i < 32; i++) { + bpf_map_lookup_elem(&lru_hash_lookup_map, &key); + key++; + } + } else { + ret = -EINVAL; + } + +done: + if (ret) + bpf_trace_printk(fmt, sizeof(fmt), ret); + + return 0; +} + +SEC("ksyscall/gettid") +int stress_lpm_trie_map_alloc(struct pt_regs *ctx) +{ + union { + u32 b32[2]; + u8 b8[8]; + } key; + unsigned int i; + + key.b32[0] = 32; + key.b8[4] = 192; + key.b8[5] = 168; + key.b8[6] = 0; + key.b8[7] = 1; + +#pragma clang loop unroll(full) + for (i = 0; i < 32; ++i) + bpf_map_lookup_elem(&lpm_trie_map_alloc, &key); + + return 0; +} + +SEC("ksyscall/getpgid") +int stress_hash_map_lookup(struct pt_regs *ctx) +{ + u32 key = 1, i; + long *value; + +#pragma clang loop unroll(full) + for (i = 0; i < 64; ++i) + value = bpf_map_lookup_elem(&hash_map, &key); + + return 0; +} + +SEC("ksyscall/getppid") +int stress_array_map_lookup(struct pt_regs *ctx) +{ + u32 key = 1, i; + long *value; + +#pragma clang loop unroll(full) + for (i = 0; i < 64; ++i) + value = bpf_map_lookup_elem(&array_map, &key); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c deleted file mode 100644 index 0c7885057ffe..000000000000 --- a/samples/bpf/map_perf_test_kern.c +++ /dev/null @@ -1,301 +0,0 @@ -/* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- */ -#include "vmlinux.h" -#include -#include -#include -#include -#include - -#define MAX_ENTRIES 1000 -#define MAX_NR_CPUS 1024 - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, u32); - __type(value, long); - __uint(max_entries, MAX_ENTRIES); -} hash_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_LRU_HASH); - __type(key, u32); - __type(value, long); - __uint(max_entries, 10000); -} lru_hash_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_LRU_HASH); - __type(key, u32); - __type(value, long); - __uint(max_entries, 10000); - __uint(map_flags, BPF_F_NO_COMMON_LRU); -} nocommon_lru_hash_map SEC(".maps"); - -struct inner_lru { - __uint(type, BPF_MAP_TYPE_LRU_HASH); - __type(key, u32); - __type(value, long); - __uint(max_entries, MAX_ENTRIES); - __uint(map_flags, BPF_F_NUMA_NODE); - __uint(numa_node, 0); -} inner_lru_hash_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); - __uint(max_entries, MAX_NR_CPUS); - __uint(key_size, sizeof(u32)); - __array(values, struct inner_lru); /* use inner_lru as inner map */ -} array_of_lru_hashs SEC(".maps") = { - /* statically initialize the first element */ - .values = { &inner_lru_hash_map }, -}; - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_HASH); - __uint(key_size, sizeof(u32)); - __uint(value_size, sizeof(long)); - __uint(max_entries, MAX_ENTRIES); -} percpu_hash_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, u32); - __type(value, long); - __uint(max_entries, MAX_ENTRIES); - __uint(map_flags, BPF_F_NO_PREALLOC); -} hash_map_alloc SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_HASH); - __uint(key_size, sizeof(u32)); - __uint(value_size, sizeof(long)); - __uint(max_entries, MAX_ENTRIES); - __uint(map_flags, BPF_F_NO_PREALLOC); -} percpu_hash_map_alloc SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_LPM_TRIE); - __uint(key_size, 8); - __uint(value_size, sizeof(long)); - __uint(max_entries, 10000); - __uint(map_flags, BPF_F_NO_PREALLOC); -} lpm_trie_map_alloc SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, long); - __uint(max_entries, MAX_ENTRIES); -} array_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_LRU_HASH); - __type(key, u32); - __type(value, long); - __uint(max_entries, MAX_ENTRIES); -} lru_hash_lookup_map SEC(".maps"); - -SEC("ksyscall/getuid") -int stress_hmap(struct pt_regs *ctx) -{ - u32 key = bpf_get_current_pid_tgid(); - long init_val = 1; - long *value; - int i; - - for (i = 0; i < 10; i++) { - bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&hash_map, &key); - if (value) - bpf_map_delete_elem(&hash_map, &key); - } - - return 0; -} - -SEC("ksyscall/geteuid") -int stress_percpu_hmap(struct pt_regs *ctx) -{ - u32 key = bpf_get_current_pid_tgid(); - long init_val = 1; - long *value; - int i; - - for (i = 0; i < 10; i++) { - bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&percpu_hash_map, &key); - if (value) - bpf_map_delete_elem(&percpu_hash_map, &key); - } - return 0; -} - -SEC("ksyscall/getgid") -int stress_hmap_alloc(struct pt_regs *ctx) -{ - u32 key = bpf_get_current_pid_tgid(); - long init_val = 1; - long *value; - int i; - - for (i = 0; i < 10; i++) { - bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&hash_map_alloc, &key); - if (value) - bpf_map_delete_elem(&hash_map_alloc, &key); - } - return 0; -} - -SEC("ksyscall/getegid") -int 
stress_percpu_hmap_alloc(struct pt_regs *ctx) -{ - u32 key = bpf_get_current_pid_tgid(); - long init_val = 1; - long *value; - int i; - - for (i = 0; i < 10; i++) { - bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key); - if (value) - bpf_map_delete_elem(&percpu_hash_map_alloc, &key); - } - return 0; -} - -SEC("ksyscall/connect") -int stress_lru_hmap_alloc(struct pt_regs *ctx) -{ - struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx); - char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%dn"; - union { - u16 dst6[8]; - struct { - u16 magic0; - u16 magic1; - u16 tcase; - u16 unused16; - u32 unused32; - u32 key; - }; - } test_params; - struct sockaddr_in6 *in6; - u16 test_case; - int addrlen, ret; - long val = 1; - u32 key = 0; - - in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(real_regs); - addrlen = (int)PT_REGS_PARM3_CORE(real_regs); - - if (addrlen != sizeof(*in6)) - return 0; - - ret = bpf_probe_read_user(test_params.dst6, sizeof(test_params.dst6), - &in6->sin6_addr); - if (ret) - goto done; - - if (test_params.magic0 != 0xdead || - test_params.magic1 != 0xbeef) - return 0; - - test_case = test_params.tcase; - if (test_case != 3) - key = bpf_get_prandom_u32(); - - if (test_case == 0) { - ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY); - } else if (test_case == 1) { - ret = bpf_map_update_elem(&nocommon_lru_hash_map, &key, &val, - BPF_ANY); - } else if (test_case == 2) { - void *nolocal_lru_map; - int cpu = bpf_get_smp_processor_id(); - - nolocal_lru_map = bpf_map_lookup_elem(&array_of_lru_hashs, - &cpu); - if (!nolocal_lru_map) { - ret = -ENOENT; - goto done; - } - - ret = bpf_map_update_elem(nolocal_lru_map, &key, &val, - BPF_ANY); - } else if (test_case == 3) { - u32 i; - - key = test_params.key; - -#pragma clang loop unroll(full) - for (i = 0; i < 32; i++) { - bpf_map_lookup_elem(&lru_hash_lookup_map, &key); - key++; - } - } else { - ret = -EINVAL; - } - -done: - if (ret) - bpf_trace_printk(fmt, sizeof(fmt), ret); - - return 0; -} - -SEC("ksyscall/gettid") -int stress_lpm_trie_map_alloc(struct pt_regs *ctx) -{ - union { - u32 b32[2]; - u8 b8[8]; - } key; - unsigned int i; - - key.b32[0] = 32; - key.b8[4] = 192; - key.b8[5] = 168; - key.b8[6] = 0; - key.b8[7] = 1; - -#pragma clang loop unroll(full) - for (i = 0; i < 32; ++i) - bpf_map_lookup_elem(&lpm_trie_map_alloc, &key); - - return 0; -} - -SEC("ksyscall/getpgid") -int stress_hash_map_lookup(struct pt_regs *ctx) -{ - u32 key = 1, i; - long *value; - -#pragma clang loop unroll(full) - for (i = 0; i < 64; ++i) - value = bpf_map_lookup_elem(&hash_map, &key); - - return 0; -} - -SEC("ksyscall/getppid") -int stress_array_map_lookup(struct pt_regs *ctx) -{ - u32 key = 1, i; - long *value; - -#pragma clang loop unroll(full) - for (i = 0; i < 64; ++i) - value = bpf_map_lookup_elem(&array_map, &key); - - return 0; -} - -char _license[] SEC("license") = "GPL"; -u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index 1bb53f4b29e1..d2fbcf963cdf 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -443,7 +443,7 @@ int main(int argc, char **argv) if (argc > 4) max_cnt = atoi(argv[4]); - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { fprintf(stderr, "ERROR: opening BPF object 
file failed\n"); diff --git a/samples/bpf/test_current_task_under_cgroup.bpf.c b/samples/bpf/test_current_task_under_cgroup.bpf.c new file mode 100644 index 000000000000..0b059cee3cba --- /dev/null +++ b/samples/bpf/test_current_task_under_cgroup.bpf.c @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 Sargun Dhillon + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ + +#include "vmlinux.h" +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 1); +} cgroup_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 1); +} perf_map SEC(".maps"); + +/* Writes the last PID that called sync to a map at index 0 */ +SEC("ksyscall/sync") +int bpf_prog1(struct pt_regs *ctx) +{ + u64 pid = bpf_get_current_pid_tgid(); + int idx = 0; + + if (!bpf_current_task_under_cgroup(&cgroup_map, 0)) + return 0; + + bpf_map_update_elem(&perf_map, &idx, &pid, BPF_ANY); + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c deleted file mode 100644 index 0b059cee3cba..000000000000 --- a/samples/bpf/test_current_task_under_cgroup_kern.c +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 Sargun Dhillon - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- */
-
-#include "vmlinux.h"
-#include <linux/version.h>
-#include <bpf/bpf_helpers.h>
-
-struct {
-	__uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
-	__uint(key_size, sizeof(u32));
-	__uint(value_size, sizeof(u32));
-	__uint(max_entries, 1);
-} cgroup_map SEC(".maps");
-
-struct {
-	__uint(type, BPF_MAP_TYPE_ARRAY);
-	__type(key, u32);
-	__type(value, u64);
-	__uint(max_entries, 1);
-} perf_map SEC(".maps");
-
-/* Writes the last PID that called sync to a map at index 0 */
-SEC("ksyscall/sync")
-int bpf_prog1(struct pt_regs *ctx)
-{
-	u64 pid = bpf_get_current_pid_tgid();
-	int idx = 0;
-
-	if (!bpf_current_task_under_cgroup(&cgroup_map, 0))
-		return 0;
-
-	bpf_map_update_elem(&perf_map, &idx, &pid, BPF_ANY);
-	return 0;
-}
-
-char _license[] SEC("license") = "GPL";
-u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c
index 6fb25906835e..9726ed2a8a8b 100644
--- a/samples/bpf/test_current_task_under_cgroup_user.c
+++ b/samples/bpf/test_current_task_under_cgroup_user.c
@@ -21,7 +21,7 @@ int main(int argc, char **argv)
 	char filename[256];
 	int map_fd[2];
 
-	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
 	obj = bpf_object__open_file(filename, NULL);
 	if (libbpf_get_error(obj)) {
 		fprintf(stderr, "ERROR: opening BPF object file failed\n");
diff --git a/samples/bpf/test_probe_write_user.bpf.c b/samples/bpf/test_probe_write_user.bpf.c
new file mode 100644
index 000000000000..a0f10c5ca273
--- /dev/null
+++ b/samples/bpf/test_probe_write_user.bpf.c
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 Sargun Dhillon
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include "vmlinux.h"
+#include <string.h>
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, struct sockaddr_in);
+	__type(value, struct sockaddr_in);
+	__uint(max_entries, 256);
+} dnat_map SEC(".maps");
+
+/* kprobe is NOT a stable ABI
+ * kernel functions can be removed, renamed or completely change semantics.
+ * Number of arguments and their positions can change, etc.
+ * In such case this bpf+kprobe example will no longer be meaningful
+ *
+ * This example sits on a syscall, and the syscall ABI is relatively stable
+ * of course, across platforms, and over time, the ABI may change.
+ */
+SEC("ksyscall/connect")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx);
+	void *sockaddr_arg = (void *)PT_REGS_PARM2_CORE(real_regs);
+	int sockaddr_len = (int)PT_REGS_PARM3_CORE(real_regs);
+	struct sockaddr_in new_addr, orig_addr = {};
+	struct sockaddr_in *mapped_addr;
+
+	if (sockaddr_len > sizeof(orig_addr))
+		return 0;
+
+	if (bpf_probe_read_user(&orig_addr, sizeof(orig_addr), sockaddr_arg) != 0)
+		return 0;
+
+	mapped_addr = bpf_map_lookup_elem(&dnat_map, &orig_addr);
+	if (mapped_addr != NULL) {
+		memcpy(&new_addr, mapped_addr, sizeof(new_addr));
+		bpf_probe_write_user(sockaddr_arg, &new_addr,
+				     sizeof(new_addr));
+	}
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c
deleted file mode 100644
index a0f10c5ca273..000000000000
--- a/samples/bpf/test_probe_write_user_kern.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 Sargun Dhillon
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- */
-#include "vmlinux.h"
-#include <string.h>
-#include <linux/version.h>
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_core_read.h>
-#include <bpf/bpf_tracing.h>
-
-struct {
-	__uint(type, BPF_MAP_TYPE_HASH);
-	__type(key, struct sockaddr_in);
-	__type(value, struct sockaddr_in);
-	__uint(max_entries, 256);
-} dnat_map SEC(".maps");
-
-/* kprobe is NOT a stable ABI
- * kernel functions can be removed, renamed or completely change semantics.
- * Number of arguments and their positions can change, etc.
- * In such case this bpf+kprobe example will no longer be meaningful
- *
- * This example sits on a syscall, and the syscall ABI is relatively stable
- * of course, across platforms, and over time, the ABI may change.
- */
-SEC("ksyscall/connect")
-int bpf_prog1(struct pt_regs *ctx)
-{
-	struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx);
-	void *sockaddr_arg = (void *)PT_REGS_PARM2_CORE(real_regs);
-	int sockaddr_len = (int)PT_REGS_PARM3_CORE(real_regs);
-	struct sockaddr_in new_addr, orig_addr = {};
-	struct sockaddr_in *mapped_addr;
-
-	if (sockaddr_len > sizeof(orig_addr))
-		return 0;
-
-	if (bpf_probe_read_user(&orig_addr, sizeof(orig_addr), sockaddr_arg) != 0)
-		return 0;
-
-	mapped_addr = bpf_map_lookup_elem(&dnat_map, &orig_addr);
-	if (mapped_addr != NULL) {
-		memcpy(&new_addr, mapped_addr, sizeof(new_addr));
-		bpf_probe_write_user(sockaddr_arg, &new_addr,
-				     sizeof(new_addr));
-	}
-	return 0;
-}
-
-char _license[] SEC("license") = "GPL";
-u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c
index 00ccfb834e45..2a539aec4116 100644
--- a/samples/bpf/test_probe_write_user_user.c
+++ b/samples/bpf/test_probe_write_user_user.c
@@ -24,7 +24,7 @@ int main(int ac, char **argv)
 	mapped_addr_in = (struct sockaddr_in *)&mapped_addr;
 	tmp_addr_in = (struct sockaddr_in *)&tmp_addr;
 
-	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
 	obj = bpf_object__open_file(filename, NULL);
 	if (libbpf_get_error(obj)) {
 		fprintf(stderr, "ERROR: opening BPF object file failed\n");
diff --git a/samples/bpf/trace_output.bpf.c b/samples/bpf/trace_output.bpf.c
new file mode 100644
index 000000000000..565a73b51b04
--- /dev/null
+++ b/samples/bpf/trace_output.bpf.c
@@ -0,0 +1,29 @@
+#include "vmlinux.h"
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(u32));
+	__uint(max_entries, 2);
+} my_map SEC(".maps");
+
+SEC("ksyscall/write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	struct S {
+		u64 pid;
+		u64 cookie;
+	} data;
+
+	data.pid = bpf_get_current_pid_tgid();
+	data.cookie = 0x12345678;
+
+	bpf_perf_event_output(ctx, &my_map, 0, &data, sizeof(data));
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c
deleted file mode 100644
index 565a73b51b04..000000000000
--- a/samples/bpf/trace_output_kern.c
+++ /dev/null
@@ -1,29 +0,0 @@
-#include "vmlinux.h"
-#include <linux/version.h>
-#include <bpf/bpf_helpers.h>
-
-struct {
-	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
-	__uint(key_size, sizeof(int));
-	__uint(value_size, sizeof(u32));
-	__uint(max_entries, 2);
-} my_map SEC(".maps");
-
-SEC("ksyscall/write")
-int bpf_prog1(struct pt_regs *ctx)
-{
-	struct S {
-		u64 pid;
-		u64 cookie;
-	} data;
-
-	data.pid = bpf_get_current_pid_tgid();
-	data.cookie = 0x12345678;
-
-	bpf_perf_event_output(ctx, &my_map, 0, &data, sizeof(data));
-
-	return 0;
-}
-
-char _license[] SEC("license") = "GPL";
-u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
index 371732f9cf8e..d316fd2c8e24 100644
--- a/samples/bpf/trace_output_user.c
+++ b/samples/bpf/trace_output_user.c
@@ -51,7 +51,7 @@ int main(int argc, char **argv)
 	char filename[256];
 	FILE *f;
 
-	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
 	obj = bpf_object__open_file(filename, NULL);
 	if (libbpf_get_error(obj)) {
 		fprintf(stderr, "ERROR: opening BPF object file failed\n");
diff --git a/samples/bpf/tracex2.bpf.c b/samples/bpf/tracex2.bpf.c
new file mode 100644
index 000000000000..a712eefc742e
--- /dev/null
+++ b/samples/bpf/tracex2.bpf.c
@@ -0,0 +1,99 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include "vmlinux.h"
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, long);
+	__type(value, long);
+	__uint(max_entries, 1024);
+} my_map SEC(".maps");
+
+/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
+ * example will no longer be meaningful
+ */
+SEC("kprobe/kfree_skb_reason")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long loc = 0;
+	long init_val = 1;
+	long *value;
+
+	/* read ip of kfree_skb_reason caller.
+	 * non-portable version of __builtin_return_address(0)
+	 */
+	BPF_KPROBE_READ_RET_IP(loc, ctx);
+
+	value = bpf_map_lookup_elem(&my_map, &loc);
+	if (value)
+		*value += 1;
+	else
+		bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
+	return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+	if (hi)
+		return log2(hi) + 32;
+	else
+		return log2(v);
+}
+
+struct hist_key {
+	char comm[16];
+	u64 pid_tgid;
+	u64 uid_gid;
+	u64 index;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+	__uint(key_size, sizeof(struct hist_key));
+	__uint(value_size, sizeof(long));
+	__uint(max_entries, 1024);
+} my_hist_map SEC(".maps");
+
+SEC("ksyscall/write")
+int bpf_prog3(struct pt_regs *ctx)
+{
+	long write_size = PT_REGS_PARM3(ctx);
+	long init_val = 1;
+	long *value;
+	struct hist_key key;
+
+	key.index = log2l(write_size);
+	key.pid_tgid = bpf_get_current_pid_tgid();
+	key.uid_gid = bpf_get_current_uid_gid();
+	bpf_get_current_comm(&key.comm, sizeof(key.comm));
+
+	value = bpf_map_lookup_elem(&my_hist_map, &key);
+	if (value)
+		__sync_fetch_and_add(value, 1);
+	else
+		bpf_map_update_elem(&my_hist_map, &key, &init_val, BPF_ANY);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
deleted file mode 100644
index a712eefc742e..000000000000
--- a/samples/bpf/tracex2_kern.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- */
-#include "vmlinux.h"
-#include <linux/version.h>
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
-
-struct {
-	__uint(type, BPF_MAP_TYPE_HASH);
-	__type(key, long);
-	__type(value, long);
-	__uint(max_entries, 1024);
-} my_map SEC(".maps");
-
-/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
- * example will no longer be meaningful
- */
-SEC("kprobe/kfree_skb_reason")
-int bpf_prog2(struct pt_regs *ctx)
-{
-	long loc = 0;
-	long init_val = 1;
-	long *value;
-
-	/* read ip of kfree_skb_reason caller.
-	 * non-portable version of __builtin_return_address(0)
-	 */
-	BPF_KPROBE_READ_RET_IP(loc, ctx);
-
-	value = bpf_map_lookup_elem(&my_map, &loc);
-	if (value)
-		*value += 1;
-	else
-		bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
-	return 0;
-}
-
-static unsigned int log2(unsigned int v)
-{
-	unsigned int r;
-	unsigned int shift;
-
-	r = (v > 0xFFFF) << 4; v >>= r;
-	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
-	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
-	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
-	r |= (v >> 1);
-	return r;
-}
-
-static unsigned int log2l(unsigned long v)
-{
-	unsigned int hi = v >> 32;
-	if (hi)
-		return log2(hi) + 32;
-	else
-		return log2(v);
-}
-
-struct hist_key {
-	char comm[16];
-	u64 pid_tgid;
-	u64 uid_gid;
-	u64 index;
-};
-
-struct {
-	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
-	__uint(key_size, sizeof(struct hist_key));
-	__uint(value_size, sizeof(long));
-	__uint(max_entries, 1024);
-} my_hist_map SEC(".maps");
-
-SEC("ksyscall/write")
-int bpf_prog3(struct pt_regs *ctx)
-{
-	long write_size = PT_REGS_PARM3(ctx);
-	long init_val = 1;
-	long *value;
-	struct hist_key key;
-
-	key.index = log2l(write_size);
-	key.pid_tgid = bpf_get_current_pid_tgid();
-	key.uid_gid = bpf_get_current_uid_gid();
-	bpf_get_current_comm(&key.comm, sizeof(key.comm));
-
-	value = bpf_map_lookup_elem(&my_hist_map, &key);
-	if (value)
-		__sync_fetch_and_add(value, 1);
-	else
-		bpf_map_update_elem(&my_hist_map, &key, &init_val, BPF_ANY);
-	return 0;
-}
-char _license[] SEC("license") = "GPL";
-u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
index 089e408abd7a..2131f1648cf1 100644
--- a/samples/bpf/tracex2_user.c
+++ b/samples/bpf/tracex2_user.c
@@ -123,7 +123,7 @@ int main(int ac, char **argv)
 	int i, j = 0;
 	FILE *f;
 
-	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
 	obj = bpf_object__open_file(filename, NULL);
 	if (libbpf_get_error(obj)) {
 		fprintf(stderr, "ERROR: opening BPF object file failed\n");
-- cgit v1.2.3
From 2e5c4dd7f81545a98e9f06317347e760749c020b Mon Sep 17 00:00:00 2001
From: "Daniel T. Lee"
Date: Sat, 24 Dec 2022 16:15:25 +0900
Subject: samples/bpf: Fix tracex2 by using BPF_KSYSCALL macro

Currently, there is a problem with tracex2: it doesn't print the
histogram properly, and the results are misleading (all results report
as 0).

The problem is caused by a change in the arguments of the function to
which the kprobe attaches. This tracex2 bpf program uses a kprobe
(attached to __x64_sys_write) to figure out the size of the write
system call. In order to achieve this, the third argument 'count' must
be intact.

The following is a prototype of the sys_write variants (checked with
pfunct):

~/git/linux$ pfunct -P fs/read_write.o | grep sys_write
ssize_t ksys_write(unsigned int fd, const char * buf, size_t count);
long int __x64_sys_write(const struct pt_regs * regs);
... cross compile with s390x ...
long int __s390_sys_write(struct pt_regs * regs);

Because a SYSCALL_WRAPPER function wraps the arguments once more, an
additional extraction step is required to properly parse the real
syscall arguments:

#define BPF_KSYSCALL(name, args...)
... snip ...
struct pt_regs *regs = LINUX_HAS_SYSCALL_WRAPPER		    \
		       ? (struct pt_regs *)PT_REGS_PARM1(ctx)	    \
		       : ctx;					    \

In order to fix this problem, the BPF_KSYSCALL macro has been used.
This reduces the hassle of parsing arguments from pt_regs. Since the
macro uses the CO-RE variant of argument extraction, it also brings
additional portability.

Signed-off-by: Daniel T. Lee
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20221224071527.2292-5-danieltimlee@gmail.com
---
 samples/bpf/tracex2.bpf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/samples/bpf/tracex2.bpf.c b/samples/bpf/tracex2.bpf.c
index a712eefc742e..0a5c75b367be 100644
--- a/samples/bpf/tracex2.bpf.c
+++ b/samples/bpf/tracex2.bpf.c
@@ -8,6 +8,7 @@
 #include <linux/version.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
 
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
@@ -76,14 +77,13 @@ struct {
 } my_hist_map SEC(".maps");
 
 SEC("ksyscall/write")
-int bpf_prog3(struct pt_regs *ctx)
+int BPF_KSYSCALL(bpf_prog3, unsigned int fd, const char *buf, size_t count)
 {
-	long write_size = PT_REGS_PARM3(ctx);
 	long init_val = 1;
 	long *value;
 	struct hist_key key;
 
-	key.index = log2l(write_size);
+	key.index = log2l(count);
 	key.pid_tgid = bpf_get_current_pid_tgid();
 	key.uid_gid = bpf_get_current_uid_gid();
 	bpf_get_current_comm(&key.comm, sizeof(key.comm));
-- cgit v1.2.3
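The conversion pattern shown here generalizes to any syscall hook. Below is a minimal sketch of BPF_KSYSCALL usage outside this series (the program name, map name, and per-process accounting are illustrative assumptions, not part of these patches; it presumes a generated vmlinux.h and a libbpf recent enough to provide BPF_KSYSCALL):

/* write_bytes.bpf.c - illustrative sketch only, not from this series */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, u32);	/* tgid */
	__type(value, u64);	/* bytes requested via write() */
	__uint(max_entries, 1024);
} bytes_map SEC(".maps");

SEC("ksyscall/write")
int BPF_KSYSCALL(count_write_bytes, unsigned int fd, const char *buf,
		 size_t count)
{
	u32 tgid = bpf_get_current_pid_tgid() >> 32;
	u64 *total, init = count;

	/* 'count' is the real third syscall argument: on SYSCALL_WRAPPER
	 * kernels the macro has already unwrapped the pt_regs indirection
	 * via CO-RE reads, so no PT_REGS_PARM*_CORE() calls are needed.
	 */
	total = bpf_map_lookup_elem(&bytes_map, &tgid);
	if (total)
		__sync_fetch_and_add(total, count);
	else
		bpf_map_update_elem(&bytes_map, &tgid, &init, BPF_ANY);
	return 0;
}

char _license[] SEC("license") = "GPL";

The same program written against PT_REGS_PARM3(ctx) directly would, as the message above explains, silently read a wrapped register on SYSCALL_WRAPPER kernels.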
From c5ffb26375ad309c858453f17e777b716723d527 Mon Sep 17 00:00:00 2001
From: "Daniel T. Lee"
Date: Sat, 24 Dec 2022 16:15:26 +0900
Subject: samples/bpf: Use BPF_KSYSCALL macro in syscall tracing programs

This commit enhances the syscall tracing programs by using the
BPF_KSYSCALL macro to reduce the inconvenience of parsing arguments
from pt_regs. By simplifying argument extraction, the bpf programs
become clearer and easier to understand.

Signed-off-by: Daniel T. Lee
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20221224071527.2292-6-danieltimlee@gmail.com
---
 samples/bpf/map_perf_test.bpf.c                  | 26 ++++++++++--------------
 samples/bpf/test_current_task_under_cgroup.bpf.c |  4 +++-
 samples/bpf/test_probe_write_user.bpf.c          | 12 +++++------
 3 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/samples/bpf/map_perf_test.bpf.c b/samples/bpf/map_perf_test.bpf.c
index 0c7885057ffe..3cdeba2afe12 100644
--- a/samples/bpf/map_perf_test.bpf.c
+++ b/samples/bpf/map_perf_test.bpf.c
@@ -101,7 +101,7 @@ struct {
 } lru_hash_lookup_map SEC(".maps");
 
 SEC("ksyscall/getuid")
-int stress_hmap(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_hmap)
 {
 	u32 key = bpf_get_current_pid_tgid();
 	long init_val = 1;
@@ -119,7 +119,7 @@ int stress_hmap(struct pt_regs *ctx)
 }
 
 SEC("ksyscall/geteuid")
-int stress_percpu_hmap(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_percpu_hmap)
 {
 	u32 key = bpf_get_current_pid_tgid();
 	long init_val = 1;
@@ -136,7 +136,7 @@ int stress_percpu_hmap(struct pt_regs *ctx)
 }
 
 SEC("ksyscall/getgid")
-int stress_hmap_alloc(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_hmap_alloc)
 {
 	u32 key = bpf_get_current_pid_tgid();
 	long init_val = 1;
@@ -153,7 +153,7 @@ int stress_hmap_alloc(struct pt_regs *ctx)
 }
 
 SEC("ksyscall/getegid")
-int stress_percpu_hmap_alloc(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_percpu_hmap_alloc)
 {
 	u32 key = bpf_get_current_pid_tgid();
 	long init_val = 1;
@@ -168,11 +168,10 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx)
 	}
 	return 0;
 }
-
 SEC("ksyscall/connect")
-int stress_lru_hmap_alloc(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_lru_hmap_alloc, int fd, struct sockaddr_in *uservaddr,
+		 int addrlen)
 {
-	struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx);
 	char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%d\n";
 	union {
 		u16 dst6[8];
@@ -185,14 +184,11 @@ int stress_lru_hmap_alloc(struct pt_regs *ctx)
 			u32 key;
 		};
 	} test_params;
-	struct sockaddr_in6 *in6;
+	struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)uservaddr;
 	u16 test_case;
-	int addrlen, ret;
 	long val = 1;
 	u32 key = 0;
-
-	in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(real_regs);
-	addrlen = (int)PT_REGS_PARM3_CORE(real_regs);
+	int ret;
 
 	if (addrlen != sizeof(*in6))
 		return 0;
@@ -250,7 +246,7 @@ done:
 }
 
 SEC("ksyscall/gettid")
-int stress_lpm_trie_map_alloc(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_lpm_trie_map_alloc)
 {
 	union {
 		u32 b32[2];
@@ -272,7 +268,7 @@ int stress_lpm_trie_map_alloc(struct pt_regs *ctx)
 }
 
 SEC("ksyscall/getpgid")
-int stress_hash_map_lookup(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_hash_map_lookup)
 {
 	u32 key = 1, i;
 	long *value;
@@ -285,7 +281,7 @@ int stress_hash_map_lookup(struct pt_regs *ctx)
 }
 
 SEC("ksyscall/getppid")
-int stress_array_map_lookup(struct pt_regs *ctx)
+int BPF_KSYSCALL(stress_array_map_lookup)
 {
 	u32 key = 1, i;
 	long *value;
diff --git a/samples/bpf/test_current_task_under_cgroup.bpf.c b/samples/bpf/test_current_task_under_cgroup.bpf.c
index 0b059cee3cba..58b9cf7ed659 100644
--- a/samples/bpf/test_current_task_under_cgroup.bpf.c
+++ b/samples/bpf/test_current_task_under_cgroup.bpf.c
@@ -8,6 +8,8 @@
 #include "vmlinux.h"
 #include <linux/version.h>
 #include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
 
 struct {
 	__uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
@@ -25,7 +27,7 @@ struct {
 
 /* Writes the last PID that called sync to a map at index 0 */
 SEC("ksyscall/sync")
-int bpf_prog1(struct pt_regs *ctx)
+int BPF_KSYSCALL(bpf_prog1)
 {
 	u64 pid = bpf_get_current_pid_tgid();
 	int idx = 0;
diff --git a/samples/bpf/test_probe_write_user.bpf.c b/samples/bpf/test_probe_write_user.bpf.c
index a0f10c5ca273..a4f3798b7fb0 100644
--- a/samples/bpf/test_probe_write_user.bpf.c
+++ b/samples/bpf/test_probe_write_user.bpf.c
@@ -27,24 +27,22 @@ struct {
  * of course, across platforms, and over time, the ABI may change.
  */
 SEC("ksyscall/connect")
-int bpf_prog1(struct pt_regs *ctx)
+int BPF_KSYSCALL(bpf_prog1, int fd, struct sockaddr_in *uservaddr,
+		 int addrlen)
 {
-	struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx);
-	void *sockaddr_arg = (void *)PT_REGS_PARM2_CORE(real_regs);
-	int sockaddr_len = (int)PT_REGS_PARM3_CORE(real_regs);
 	struct sockaddr_in new_addr, orig_addr = {};
 	struct sockaddr_in *mapped_addr;
 
-	if (sockaddr_len > sizeof(orig_addr))
+	if (addrlen > sizeof(orig_addr))
 		return 0;
 
-	if (bpf_probe_read_user(&orig_addr, sizeof(orig_addr), sockaddr_arg) != 0)
+	if (bpf_probe_read_user(&orig_addr, sizeof(orig_addr), uservaddr) != 0)
 		return 0;
 
 	mapped_addr = bpf_map_lookup_elem(&dnat_map, &orig_addr);
 	if (mapped_addr != NULL) {
 		memcpy(&new_addr, mapped_addr, sizeof(new_addr));
-		bpf_probe_write_user(sockaddr_arg, &new_addr,
+		bpf_probe_write_user(uservaddr, &new_addr,
 				     sizeof(new_addr));
 	}
 	return 0;
-- cgit v1.2.3
From 7244eb669397f309c3d014264823cdc9cb3f8e6b Mon Sep 17 00:00:00 2001
From: "Daniel T. Lee"
Date: Sat, 24 Dec 2022 16:15:27 +0900
Subject: libbpf: Fix invalid return address register in s390

There is currently an invalid register mapping for the s390 return
address register. As the s390x ABI manual [1] states, the return
address can be found at r14. In bpf_tracing.h, the s390 registers are
accessed through the gprs (general purpose registers) array. This
commit fixes the problem by correcting the mistyped field name
('grps' -> 'gprs').

[1]: https://uclibc.org/docs/psABI-s390x.pdf#page=14

Fixes: 3cc31d794097 ("libbpf: Normalize PT_REGS_xxx() macro definitions")
Signed-off-by: Daniel T. Lee
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20221224071527.2292-7-danieltimlee@gmail.com
---
 tools/lib/bpf/bpf_tracing.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 2972dc25ff72..9c1b1689068d 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -137,7 +137,7 @@ struct pt_regs___s390 {
 #define __PT_PARM3_REG gprs[4]
 #define __PT_PARM4_REG gprs[5]
 #define __PT_PARM5_REG gprs[6]
-#define __PT_RET_REG grps[14]
+#define __PT_RET_REG gprs[14]
 #define __PT_FP_REG gprs[11]	/* Works only with CONFIG_FRAME_POINTER */
 #define __PT_RC_REG gprs[2]
 #define __PT_SP_REG gprs[15]
-- cgit v1.2.3
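As context for why a one-character typo matters: __PT_RET_REG backs the PT_REGS_RET() accessor, which kprobe programs use to read the probed function's return address, e.g. via BPF_KPROBE_READ_RET_IP() as in the tracex2 sample above. A minimal sketch of that dependency (hypothetical program name, attached to a real kernel function purely for illustration):

/* caller_ip.bpf.c - illustrative sketch only */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("kprobe/kfree_skb_reason")
int report_caller(struct pt_regs *ctx)
{
	long caller_ip = 0;

	/* expands roughly to caller_ip = PT_REGS_RET(ctx), i.e. __PT_RET_REG:
	 * gprs[14] on s390, regs[30] (lr) on arm64, and so on; with the
	 * 'grps' typo this would not even compile for the s390 target */
	BPF_KPROBE_READ_RET_IP(caller_ip, ctx);

	bpf_printk("kfree_skb_reason called from %lx", caller_ip);
	return 0;
}

char _license[] SEC("license") = "GPL";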
From 678a1c036199011e65b203367900f002a28da004 Mon Sep 17 00:00:00 2001
From: Xin Liu
Date: Sat, 24 Dec 2022 19:20:58 +0800
Subject: libbpf: Added the description of some API functions

Currently, many API functions are not described in the documentation.
Add API descriptions for the following four functions:

  - libbpf_set_print;
  - bpf_object__open;
  - bpf_object__load;
  - bpf_object__close.

Signed-off-by: Xin Liu
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20221224112058.12038-1-liuxin350@huawei.com
---
 tools/lib/bpf/libbpf.h | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index eee883f007f9..898db26e42e9 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -96,6 +96,12 @@ enum libbpf_print_level {
 typedef int (*libbpf_print_fn_t)(enum libbpf_print_level level,
 				 const char *, va_list ap);
 
+/**
+ * @brief **libbpf_set_print()** sets user-provided log callback function to
+ * be used for libbpf warnings and informational messages.
+ * @param fn The log print function. If NULL, libbpf won't print anything.
+ * @return Pointer to old print function.
+ */
 LIBBPF_API libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn);
 
 /* Hide internal to user */
@@ -174,6 +180,14 @@ struct bpf_object_open_opts {
 };
 #define bpf_object_open_opts__last_field kernel_log_level
 
+/**
+ * @brief **bpf_object__open()** creates a bpf_object by opening
+ * the BPF ELF object file pointed to by the passed path and loading it
+ * into memory.
+ * @param path BPF object file path.
+ * @return pointer to the new bpf_object; or NULL is returned on error,
+ * error code is stored in errno
+ */
 LIBBPF_API struct bpf_object *bpf_object__open(const char *path);
 
 /**
@@ -203,10 +217,21 @@ LIBBPF_API struct bpf_object *
 bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz,
 		     const struct bpf_object_open_opts *opts);
 
-/* Load/unload object into/from kernel */
+/**
+ * @brief **bpf_object__load()** loads BPF object into kernel.
+ * @param obj Pointer to a valid BPF object instance returned by
+ * **bpf_object__open*()** APIs
+ * @return 0, on success; negative error code, otherwise, error code is
+ * stored in errno
+ */
 LIBBPF_API int bpf_object__load(struct bpf_object *obj);
 
-LIBBPF_API void bpf_object__close(struct bpf_object *object);
+/**
+ * @brief **bpf_object__close()** closes a BPF object and releases all
+ * resources.
+ * @param obj Pointer to a valid BPF object
+ */
+LIBBPF_API void bpf_object__close(struct bpf_object *obj);
 
 /* pin_maps and unpin_maps can both be called with a NULL path, in which case
  * they will use the pin_path attribute of each map (and ignore all maps that
-- cgit v1.2.3
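Taken together, the four functions documented above form the minimal load path. A short sketch of a loader built only from them (the object path, log callback, and error handling are illustrative, not from the patch):

/* loader.c - illustrative sketch of the documented contract */
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <bpf/libbpf.h>

static int log_fn(enum libbpf_print_level level, const char *fmt, va_list args)
{
	return vfprintf(stderr, fmt, args);	/* forward libbpf logs to stderr */
}

int main(void)
{
	struct bpf_object *obj;

	libbpf_set_print(log_fn);		/* returns the previous callback */

	obj = bpf_object__open("example.bpf.o");
	if (!obj) {
		fprintf(stderr, "open failed: %d\n", -errno);
		return 1;
	}

	if (bpf_object__load(obj)) {		/* negative error code, also in errno */
		fprintf(stderr, "load failed: %d\n", -errno);
		bpf_object__close(obj);
		return 1;
	}

	/* ... attach programs, look up maps ... */

	bpf_object__close(obj);			/* releases all object resources */
	return 0;
}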
From bb5747cfbc4b7fe29621ca6cd4a695d2723bf2e8 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Thu, 29 Dec 2022 19:12:17 -0800
Subject: libbpf: Restore errno after pr_warn.

pr_warn calls into a user-provided callback, which can clobber errno,
so `errno = saved_errno` should happen after pr_warn.

Fixes: 07453245620c ("libbpf: fix errno is overwritten after being closed.")
Signed-off-by: Alexei Starovoitov
---
 tools/lib/bpf/libbpf_internal.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 98333a6c38e9..e4d05662a96c 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -544,8 +544,10 @@ static inline int ensure_good_fd(int fd)
 		saved_errno = errno;
 		close(old_fd);
 		errno = saved_errno;
-		if (fd < 0)
+		if (fd < 0) {
 			pr_warn("failed to dup FD %d to FD > 2: %d\n", old_fd, -saved_errno);
+			errno = saved_errno;
+		}
 	}
 	return fd;
 }
-- cgit v1.2.3

From 00883922ab404c0fc921709d8c2cec86f49c32f2 Mon Sep 17 00:00:00 2001
From: Hengqi Chen
Date: Sat, 31 Dec 2022 18:07:57 +0800
Subject: libbpf: Add LoongArch support to bpf_tracing.h

Add PT_REGS macros for LoongArch ([0]).

[0]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html

Signed-off-by: Hengqi Chen
Signed-off-by: Daniel Borkmann
Acked-by: Huacai Chen
Link: https://lore.kernel.org/bpf/20221231100757.3177034-1-hengqi.chen@gmail.com
---
 tools/lib/bpf/bpf_tracing.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 9c1b1689068d..bdb0f6b5be84 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -32,6 +32,9 @@
 #elif defined(__TARGET_ARCH_arc)
 	#define bpf_target_arc
 	#define bpf_target_defined
+#elif defined(__TARGET_ARCH_loongarch)
+	#define bpf_target_loongarch
+	#define bpf_target_defined
 #else
 
 /* Fall back to what the compiler says */
@@ -62,6 +65,9 @@
 #elif defined(__arc__)
 	#define bpf_target_arc
 	#define bpf_target_defined
+#elif defined(__loongarch__)
+	#define bpf_target_loongarch
+	#define bpf_target_defined
 #endif /* no compiler target */
 
 #endif
@@ -258,6 +264,23 @@ struct pt_regs___arm64 {
 /* arc does not select ARCH_HAS_SYSCALL_WRAPPER. */
 #define PT_REGS_SYSCALL_REGS(ctx) ctx
 
+#elif defined(bpf_target_loongarch)
+
+/* https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html */
+
+#define __PT_PARM1_REG regs[4]
+#define __PT_PARM2_REG regs[5]
+#define __PT_PARM3_REG regs[6]
+#define __PT_PARM4_REG regs[7]
+#define __PT_PARM5_REG regs[8]
+#define __PT_RET_REG regs[1]
+#define __PT_FP_REG regs[22]
+#define __PT_RC_REG regs[4]
+#define __PT_SP_REG regs[3]
+#define __PT_IP_REG csr_era
+/* loongarch does not select ARCH_HAS_SYSCALL_WRAPPER. */
+#define PT_REGS_SYSCALL_REGS(ctx) ctx
+
 #endif
 
 #if defined(bpf_target_defined)
-- cgit v1.2.3

From acd3b7768048fe338248cdf43ccfbf8c084a6bc1 Mon Sep 17 00:00:00 2001
From: Changbin Du
Date: Sat, 31 Dec 2022 23:14:36 +0800
Subject: libbpf: Return -ENODATA for missing btf section

As discussed before, returning -ENODATA (No data available) is more
meaningful than returning -ENOENT (No such file or directory) when the
ELF file simply lacks a BTF section.

Suggested-by: Leo Yan
Signed-off-by: Changbin Du
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20221231151436.6541-1-changbin.du@gmail.com
---
 tools/lib/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index b03250007018..64841117fbb2 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -1004,7 +1004,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf,
 	if (!btf_data) {
 		pr_warn("failed to find '%s' ELF section in %s\n",
 			BTF_ELF_SEC, path);
-		err = -ENOENT;
+		err = -ENODATA;
 		goto done;
 	}
 	btf = btf_new(btf_data->d_buf, btf_data->d_size, base_btf);
-- cgit v1.2.3
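A caller that wants to react to the new error code can check errno after a failed parse. A minimal sketch (assumes libbpf >= 1.0 semantics, where btf__parse_elf() returns NULL and sets errno on failure; the tool name is illustrative):

/* btfcheck.c - illustrative sketch only */
#include <errno.h>
#include <stdio.h>
#include <bpf/btf.h>

int main(int argc, char **argv)
{
	struct btf *btf;

	if (argc != 2)
		return 1;

	btf = btf__parse_elf(argv[1], NULL);
	if (!btf) {
		if (errno == ENODATA)	/* ELF is fine, it just has no .BTF */
			fprintf(stderr, "%s: no BTF section\n", argv[1]);
		else
			fprintf(stderr, "%s: parse failed: %d\n", argv[1], -errno);
		return 1;
	}

	printf("%s: %u BTF types\n", argv[1], btf__type_cnt(btf));
	btf__free(btf);
	return 0;
}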