summaryrefslogtreecommitdiff
path: root/include/linux/bpf_verifier.h
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-01-11 10:07:29 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-01-11 10:07:29 -0800
commit3e7aeb78ab01c2c2f0e1f784e5ddec88fcd3d106 (patch)
treebdbfd45f8d8e967b06ed2d9cb92f67f686d02659 /include/linux/bpf_verifier.h
parentde927f6c0b07d9e698416c5b287c521b07694cac (diff)
parenta7fe0881d9b78d402bbd9067dd4503a57c57a1d9 (diff)
downloadlwn-3e7aeb78ab01c2c2f0e1f784e5ddec88fcd3d106.tar.gz
lwn-3e7aeb78ab01c2c2f0e1f784e5ddec88fcd3d106.zip
Merge tag 'net-next-6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni: "The most interesting thing is probably the networking structs reorganization and a significant amount of changes is around self-tests. Core & protocols: - Analyze and reorganize core networking structs (socks, netdev, netns, mibs) to optimize cacheline consumption and set up build time warnings to safeguard against future header changes This improves TCP performances with many concurrent connections up to 40% - Add page-pool netlink-based introspection, exposing the memory usage and recycling stats. This helps indentify bad PP users and possible leaks - Refine TCP/DCCP source port selection to no longer favor even source port at connect() time when IP_LOCAL_PORT_RANGE is set. This lowers the time taken by connect() for hosts having many active connections to the same destination - Refactor the TCP bind conflict code, shrinking related socket structs - Refactor TCP SYN-Cookie handling, as a preparation step to allow arbitrary SYN-Cookie processing via eBPF - Tune optmem_max for 0-copy usage, increasing the default value to 128KB and namespecifying it - Allow coalescing for cloned skbs coming from page pools, improving RX performances with some common configurations - Reduce extension header parsing overhead at GRO time - Add bridge MDB bulk deletion support, allowing user-space to request the deletion of matching entries - Reorder nftables struct members, to keep data accessed by the datapath first - Introduce TC block ports tracking and use. This allows supporting multicast-like behavior at the TC layer - Remove UAPI support for retired TC qdiscs (dsmark, CBQ and ATM) and classifiers (RSVP and tcindex) - More data-race annotations - Extend the diag interface to dump TCP bound-only sockets - Conditional notification of events for TC qdisc class and actions - Support for WPAN dynamic associations with nearby devices, to form a sub-network using a specific PAN ID - Implement SMCv2.1 virtual ISM device support - Add support for Batman-avd mulicast packet type BPF: - Tons of verifier improvements: - BPF register bounds logic and range support along with a large test suite - log improvements - complete precision tracking support for register spills - track aligned STACK_ZERO cases as imprecise spilled registers. This improves the verifier "instructions processed" metric from single digit to 50-60% for some programs - support for user's global BPF subprogram arguments with few commonly requested annotations for a better developer experience - support tracking of BPF_JNE which helps cases when the compiler transforms (unsigned) "a > 0" into "if a == 0 goto xxx" and the like - several fixes - Add initial TX metadata implementation for AF_XDP with support in mlx5 and stmmac drivers. Two types of offloads are supported right now, that is, TX timestamp and TX checksum offload - Fix kCFI bugs in BPF all forms of indirect calls from BPF into kernel and from kernel into BPF work with CFI enabled. This allows BPF to work with CONFIG_FINEIBT=y - Change BPF verifier logic to validate global subprograms lazily instead of unconditionally before the main program, so they can be guarded using BPF CO-RE techniques - Support uid/gid options when mounting bpffs - Add a new kfunc which acquires the associated cgroup of a task within a specific cgroup v1 hierarchy where the latter is identified by its id - Extend verifier to allow bpf_refcount_acquire() of a map value field obtained via direct load which is a use-case needed in sched_ext - Add BPF link_info support for uprobe multi link along with bpftool integration for the latter - Support for VLAN tag in XDP hints - Remove deprecated bpfilter kernel leftovers given the project is developed in user-space (https://github.com/facebook/bpfilter) Misc: - Support for parellel TC self-tests execution - Increase MPTCP self-tests coverage - Updated the bridge documentation, including several so-far undocumented features - Convert all the net self-tests to run in unique netns, to avoid random failures due to conflict and allow concurrent runs - Add TCP-AO self-tests - Add kunit tests for both cfg80211 and mac80211 - Autogenerate Netlink families documentation from YAML spec - Add yml-gen support for fixed headers and recursive nests, the tool can now generate user-space code for all genetlink families for which we have specs - A bunch of additional module descriptions fixes - Catch incorrect freeing of pages belonging to a page pool Driver API: - Rust abstractions for network PHY drivers; do not cover yet the full C API, but already allow implementing functional PHY drivers in rust - Introduce queue and NAPI support in the netdev Netlink interface, allowing complete access to the device <> NAPIs <> queues relationship - Introduce notifications filtering for devlink to allow control application scale to thousands of instances - Improve PHY validation, requesting rate matching information for each ethtool link mode supported by both the PHY and host - Add support for ethtool symmetric-xor RSS hash - ACPI based Wifi band RFI (WBRF) mitigation feature for the AMD platform - Expose pin fractional frequency offset value over new DPLL generic netlink attribute - Convert older drivers to platform remove callback returning void - Add support for PHY package MMD read/write New hardware / drivers: - Ethernet: - Octeon CN10K devices - Broadcom 5760X P7 - Qualcomm SM8550 SoC - Texas Instrument DP83TG720S PHY - Bluetooth: - IMC Networks Bluetooth radio Removed: - WiFi: - libertas 16-bit PCMCIA support - Atmel at76c50x drivers - HostAP ISA/PCMCIA style 802.11b driver - zd1201 802.11b USB dongles - Orinoco ISA/PCMCIA 802.11b driver - Aviator/Raytheon driver - Planet WL3501 driver - RNDIS USB 802.11b driver Driver updates: - Ethernet high-speed NICs: - Intel (100G, ice, idpf): - allow one by one port representors creation and removal - add temperature and clock information reporting - add get/set for ethtool's header split ringparam - add again FW logging - adds support switchdev hardware packet mirroring - iavf: implement symmetric-xor RSS hash - igc: add support for concurrent physical and free-running timers - i40e: increase the allowable descriptors - nVidia/Mellanox: - Preparation for Socket-Direct multi-dev netdev. That will allow in future releases combining multiple PFs devices attached to different NUMA nodes under the same netdev - Broadcom (bnxt): - TX completion handling improvements - add basic ntuple filter support - reduce MSIX vectors usage for MQPRIO offload - add VXLAN support, USO offload and TX coalesce completion for P7 - Marvell Octeon EP: - xmit-more support - add PF-VF mailbox support and use it for FW notifications for VFs - Wangxun (ngbe/txgbe): - implement ethtool functions to operate pause param, ring param, coalesce channel number and msglevel - Netronome/Corigine (nfp): - add flow-steering support - support UDP segmentation offload - Ethernet NICs embedded, slower, virtual: - Xilinx AXI: remove duplicate DMA code adopting the dma engine driver - stmmac: add support for HW-accelerated VLAN stripping - TI AM654x sw: add mqprio, frame preemption & coalescing - gve: add support for non-4k page sizes. - virtio-net: support dynamic coalescing moderation - nVidia/Mellanox Ethernet datacenter switches: - allow firmware upgrade without a reboot - more flexible support for bridge flooding via the compressed FID flooding mode - Ethernet embedded switches: - Microchip: - fine-tune flow control and speed configurations in KSZ8xxx - KSZ88X3: enable setting rmii reference - Renesas: - add jumbo frames support - Marvell: - 88E6xxx: add "eth-mac" and "rmon" stats support - Ethernet PHYs: - aquantia: add firmware load support - at803x: refactor the driver to simplify adding support for more chip variants - NXP C45 TJA11xx: Add MACsec offload support - Wifi: - MediaTek (mt76): - NVMEM EEPROM improvements - mt7996 Extremely High Throughput (EHT) improvements - mt7996 Wireless Ethernet Dispatcher (WED) support - mt7996 36-bit DMA support - Qualcomm (ath12k): - support for a single MSI vector - WCN7850: support AP mode - Intel (iwlwifi): - new debugfs file fw_dbg_clear - allow concurrent P2P operation on DFS channels - Bluetooth: - QCA2066: support HFP offload - ISO: more broadcast-related improvements - NXP: better recovery in case receiver/transmitter get out of sync" * tag 'net-next-6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1714 commits) lan78xx: remove redundant statement in lan78xx_get_eee lan743x: remove redundant statement in lan743x_ethtool_get_eee bnxt_en: Fix RCU locking for ntuple filters in bnxt_rx_flow_steer() bnxt_en: Fix RCU locking for ntuple filters in bnxt_srxclsrldel() bnxt_en: Remove unneeded variable in bnxt_hwrm_clear_vnic_filter() tcp: Revert no longer abort SYN_SENT when receiving some ICMP Revert "mlx5 updates 2023-12-20" Revert "net: stmmac: Enable Per DMA Channel interrupt" ipvlan: Remove usage of the deprecated ida_simple_xx() API ipvlan: Fix a typo in a comment net/sched: Remove ipt action tests net: stmmac: Use interrupt mode INTM=1 for per channel irq net: stmmac: Add support for TX/RX channel interrupt net: stmmac: Make MSI interrupt routine generic dt-bindings: net: snps,dwmac: per channel irq net: phy: at803x: make read_status more generic net: phy: at803x: add support for cdt cross short test for qca808x net: phy: at803x: refactor qca808x cable test get status function net: phy: at803x: generalize cdt fault length function net: ethernet: cortina: Drop TSO support ...
Diffstat (limited to 'include/linux/bpf_verifier.h')
-rw-r--r--include/linux/bpf_verifier.h172
1 files changed, 152 insertions, 20 deletions
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index aa4d19d0bc94..d07d857ca67f 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -275,6 +275,11 @@ struct bpf_reference_state {
int callback_ref;
};
+struct bpf_retval_range {
+ s32 minval;
+ s32 maxval;
+};
+
/* state of the program:
* type of all registers and stack info
*/
@@ -297,8 +302,8 @@ struct bpf_func_state {
* void foo(void) { bpf_timer_set_callback(,foo); }
*/
u32 async_entry_cnt;
+ struct bpf_retval_range callback_ret_range;
bool in_callback_fn;
- struct tnum callback_ret_range;
bool in_async_callback_fn;
bool in_exception_callback_fn;
/* For callback calling functions that limit number of possible
@@ -316,16 +321,48 @@ struct bpf_func_state {
/* The following fields should be last. See copy_func_state() */
int acquired_refs;
struct bpf_reference_state *refs;
- int allocated_stack;
+ /* The state of the stack. Each element of the array describes BPF_REG_SIZE
+ * (i.e. 8) bytes worth of stack memory.
+ * stack[0] represents bytes [*(r10-8)..*(r10-1)]
+ * stack[1] represents bytes [*(r10-16)..*(r10-9)]
+ * ...
+ * stack[allocated_stack/8 - 1] represents [*(r10-allocated_stack)..*(r10-allocated_stack+7)]
+ */
struct bpf_stack_state *stack;
+ /* Size of the current stack, in bytes. The stack state is tracked below, in
+ * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE.
+ */
+ int allocated_stack;
+};
+
+#define MAX_CALL_FRAMES 8
+
+/* instruction history flags, used in bpf_jmp_history_entry.flags field */
+enum {
+ /* instruction references stack slot through PTR_TO_STACK register;
+ * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8)
+ * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512,
+ * 8 bytes per slot, so slot index (spi) is [0, 63])
+ */
+ INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */
+
+ INSN_F_SPI_MASK = 0x3f, /* 6 bits */
+ INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */
+
+ INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */
};
-struct bpf_idx_pair {
- u32 prev_idx;
+static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES);
+static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8);
+
+struct bpf_jmp_history_entry {
u32 idx;
+ /* insn idx can't be bigger than 1 million */
+ u32 prev_idx : 22;
+ /* special flags, e.g., whether insn is doing register stack spill/load */
+ u32 flags : 10;
};
-#define MAX_CALL_FRAMES 8
/* Maximum number of register states that can exist at once */
#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES)
struct bpf_verifier_state {
@@ -408,7 +445,7 @@ struct bpf_verifier_state {
* For most states jmp_history_cnt is [0-3].
* For loops can go up to ~40.
*/
- struct bpf_idx_pair *jmp_history;
+ struct bpf_jmp_history_entry *jmp_history;
u32 jmp_history_cnt;
u32 dfs_depth;
u32 callback_unroll_depth;
@@ -569,17 +606,28 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
#define BPF_MAX_SUBPROGS 256
+struct bpf_subprog_arg_info {
+ enum bpf_arg_type arg_type;
+ union {
+ u32 mem_size;
+ };
+};
+
struct bpf_subprog_info {
/* 'start' has to be the first field otherwise find_subprog() won't work */
u32 start; /* insn idx of function entry point */
u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
u16 stack_depth; /* max. stack depth used by this function */
- bool has_tail_call;
- bool tail_call_reachable;
- bool has_ld_abs;
- bool is_cb;
- bool is_async_cb;
- bool is_exception_cb;
+ bool has_tail_call: 1;
+ bool tail_call_reachable: 1;
+ bool has_ld_abs: 1;
+ bool is_cb: 1;
+ bool is_async_cb: 1;
+ bool is_exception_cb: 1;
+ bool args_cached: 1;
+
+ u8 arg_cnt;
+ struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
};
struct bpf_verifier_env;
@@ -618,6 +666,7 @@ struct bpf_verifier_env {
int stack_size; /* number of states to be processed */
bool strict_alignment; /* perform strict pointer alignment checks */
bool test_state_freq; /* test verifier with different pruning frequency */
+ bool test_reg_invariants; /* fail verification on register invariants violations */
struct bpf_verifier_state *cur_state; /* current verifier state */
struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
struct bpf_verifier_state_list *free_list;
@@ -630,6 +679,10 @@ struct bpf_verifier_env {
int exception_callback_subprog;
bool explore_alu_limits;
bool allow_ptr_leaks;
+ /* Allow access to uninitialized stack memory. Writes with fixed offset are
+ * always allowed, so this refers to reads (with fixed or variable offset),
+ * to writes with variable offset and to indirect (helper) accesses.
+ */
bool allow_uninit_stack;
bool bpf_capable;
bool bypass_spec_v1;
@@ -650,6 +703,7 @@ struct bpf_verifier_env {
int cur_stack;
} cfg;
struct backtrack_state bt;
+ struct bpf_jmp_history_entry *cur_hist_ent;
u32 pass_cnt; /* number of times do_check() was called */
u32 subprog_cnt;
/* number of instructions analyzed by the verifier */
@@ -684,6 +738,16 @@ struct bpf_verifier_env {
char tmp_str_buf[TMP_STR_BUF_LEN];
};
+static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
+{
+ return &env->prog->aux->func_info_aux[subprog];
+}
+
+static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog)
+{
+ return &env->subprog_info[subprog];
+}
+
__printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
const char *fmt, va_list args);
__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
@@ -695,6 +759,10 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos);
int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual);
+__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
+ u32 insn_off,
+ const char *prefix_fmt, ...);
+
static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
{
struct bpf_verifier_state *cur = env->cur_state;
@@ -717,14 +785,6 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off,
void
bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt);
-int check_ptr_off_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno);
-int check_func_arg_reg_off(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno,
- enum bpf_arg_type arg_type);
-int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- u32 regno, u32 mem_size);
-
/* this lives here instead of in bpf.h because it needs to dereference tgt_prog */
static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
struct btf *btf, u32 btf_id)
@@ -794,4 +854,76 @@ static inline bool bpf_type_has_unsafe_modifiers(u32 type)
return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS;
}
+static inline bool type_is_ptr_alloc_obj(u32 type)
+{
+ return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
+}
+
+static inline bool type_is_non_owning_ref(u32 type)
+{
+ return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
+}
+
+static inline bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+ type = base_type(type);
+ return type == PTR_TO_PACKET ||
+ type == PTR_TO_PACKET_META;
+}
+
+static inline bool type_is_sk_pointer(enum bpf_reg_type type)
+{
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_SOCK_COMMON ||
+ type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_XDP_SOCK;
+}
+
+static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
+{
+ env->scratched_regs |= 1U << regno;
+}
+
+static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
+{
+ env->scratched_stack_slots |= 1ULL << spi;
+}
+
+static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
+{
+ return (env->scratched_regs >> regno) & 1;
+}
+
+static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
+{
+ return (env->scratched_stack_slots >> regno) & 1;
+}
+
+static inline bool verifier_state_scratched(const struct bpf_verifier_env *env)
+{
+ return env->scratched_regs || env->scratched_stack_slots;
+}
+
+static inline void mark_verifier_state_clean(struct bpf_verifier_env *env)
+{
+ env->scratched_regs = 0U;
+ env->scratched_stack_slots = 0ULL;
+}
+
+/* Used for printing the entire verifier state. */
+static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env)
+{
+ env->scratched_regs = ~0U;
+ env->scratched_stack_slots = ~0ULL;
+}
+
+const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type);
+const char *dynptr_type_str(enum bpf_dynptr_type type);
+const char *iter_type_str(const struct btf *btf, u32 btf_id);
+const char *iter_state_str(enum bpf_iter_state state);
+
+void print_verifier_state(struct bpf_verifier_env *env,
+ const struct bpf_func_state *state, bool print_all);
+void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state);
+
#endif /* _LINUX_BPF_VERIFIER_H */